diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,180027 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6, + "eval_steps": 2000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4e-05, + "grad_norm": 456.0, + "learning_rate": 1.18e-05, + "loss": 85.4554, + "loss/crossentropy": 9.650346755981445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 8.066818237304688, + "step": 2 + }, + { + "epoch": 8e-05, + "grad_norm": 416.0, + "learning_rate": 1.3600000000000002e-05, + "loss": 84.3418, + "loss/crossentropy": 9.544375896453857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.628942489624023, + "step": 4 + }, + { + "epoch": 0.00012, + "grad_norm": 466.0, + "learning_rate": 1.54e-05, + "loss": 87.2187, + "loss/crossentropy": 9.569977283477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.7909746170043945, + "step": 6 + }, + { + "epoch": 0.00016, + "grad_norm": 247.0, + "learning_rate": 1.72e-05, + "loss": 82.5078, + "loss/crossentropy": 9.06786823272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 7.3673131465911865, + "step": 8 + }, + { + "epoch": 0.0002, + "grad_norm": 179.0, + "learning_rate": 1.9e-05, + "loss": 78.2757, + "loss/crossentropy": 8.918366432189941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.912693023681641, + "step": 10 + }, + { + "epoch": 0.00024, + "grad_norm": 148.0, + "learning_rate": 2.0800000000000004e-05, + "loss": 74.4248, + "loss/crossentropy": 8.443636417388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.567321538925171, + "step": 12 + }, + { + "epoch": 0.00028, + "grad_norm": 131.0, + "learning_rate": 2.2600000000000004e-05, + "loss": 73.0003, + "loss/crossentropy": 8.428278923034668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.706400156021118, + "step": 14 + }, + { + "epoch": 0.00032, + "grad_norm": 181.0, + "grad_norm_var": 16279.8625, + "learning_rate": 2.4400000000000004e-05, + "loss": 70.0047, + "loss/crossentropy": 8.216889381408691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.056080102920532, + "step": 16 + }, + { + "epoch": 0.00036, + "grad_norm": 90.5, + "grad_norm_var": 14154.148958333333, + "learning_rate": 2.6200000000000003e-05, + "loss": 69.9766, + "loss/crossentropy": 8.191599607467651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 6.429446697235107, + "step": 18 + }, + { + "epoch": 0.0004, + "grad_norm": 52.25, + "grad_norm_var": 12194.27890625, + "learning_rate": 2.8000000000000003e-05, + "loss": 64.3807, + "loss/crossentropy": 7.506032228469849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.794633388519287, + "step": 20 + }, + { + "epoch": 0.00044, + "grad_norm": 39.25, + "grad_norm_var": 6249.4875, + "learning_rate": 2.9800000000000006e-05, + "loss": 61.2802, + "loss/crossentropy": 7.152851343154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.261489152908325, + "step": 22 + }, + { + "epoch": 0.00048, + "grad_norm": 57.0, + "grad_norm_var": 4626.8875, + "learning_rate": 3.16e-05, + "loss": 58.3454, + "loss/crossentropy": 6.956738471984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 5.020895004272461, + "step": 24 + }, + { + "epoch": 0.00052, + "grad_norm": 86.0, + "grad_norm_var": 4244.565625, + "learning_rate": 3.3400000000000005e-05, + "loss": 54.2703, + "loss/crossentropy": 6.686542987823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.801911354064941, + "step": 26 + }, + { + "epoch": 0.00056, + "grad_norm": 110.5, + "grad_norm_var": 3868.5875, + "learning_rate": 3.520000000000001e-05, + "loss": 51.7343, + "loss/crossentropy": 6.4867262840271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.5746169090271, + "step": 28 + }, + { + "epoch": 0.0006, + "grad_norm": 50.0, + "grad_norm_var": 3953.82890625, + "learning_rate": 3.7e-05, + "loss": 49.6807, + "loss/crossentropy": 6.364065408706665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 4.230688810348511, + "step": 30 + }, + { + "epoch": 0.00064, + "grad_norm": 68.0, + "grad_norm_var": 3157.4958333333334, + "learning_rate": 3.88e-05, + "loss": 44.7112, + "loss/crossentropy": 5.731794834136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.6969637870788574, + "step": 32 + }, + { + "epoch": 0.00068, + "grad_norm": 50.75, + "grad_norm_var": 500.59973958333336, + "learning_rate": 4.0600000000000004e-05, + "loss": 42.2923, + "loss/crossentropy": 5.553718328475952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.7271878719329834, + "step": 34 + }, + { + "epoch": 0.00072, + "grad_norm": 55.5, + "grad_norm_var": 342.2122395833333, + "learning_rate": 4.240000000000001e-05, + "loss": 37.7465, + "loss/crossentropy": 5.023651361465454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 3.2670111656188965, + "step": 36 + }, + { + "epoch": 0.00076, + "grad_norm": 75.5, + "grad_norm_var": 283.37395833333335, + "learning_rate": 4.420000000000001e-05, + "loss": 35.1313, + "loss/crossentropy": 4.921839237213135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.950661063194275, + "step": 38 + }, + { + "epoch": 0.0008, + "grad_norm": 44.5, + "grad_norm_var": 299.1958333333333, + "learning_rate": 4.600000000000001e-05, + "loss": 32.3316, + "loss/crossentropy": 4.782621145248413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.7473502159118652, + "step": 40 + }, + { + "epoch": 0.00084, + "grad_norm": 36.5, + "grad_norm_var": 361.1372395833333, + "learning_rate": 4.78e-05, + "loss": 28.4104, + "loss/crossentropy": 3.8754972219467163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.410745143890381, + "step": 42 + }, + { + "epoch": 0.00088, + "grad_norm": 36.25, + "grad_norm_var": 225.80598958333334, + "learning_rate": 4.96e-05, + "loss": 26.1806, + "loss/crossentropy": 3.9885865449905396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.1414425373077393, + "step": 44 + }, + { + "epoch": 0.00092, + "grad_norm": 46.0, + "grad_norm_var": 249.475, + "learning_rate": 5.14e-05, + "loss": 24.4012, + "loss/crossentropy": 3.750515580177307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 2.063339352607727, + "step": 46 + }, + { + "epoch": 0.00096, + "grad_norm": 20.875, + "grad_norm_var": 315.8291015625, + "learning_rate": 5.3200000000000006e-05, + "loss": 22.822, + "loss/crossentropy": 3.7912577390670776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.9196046590805054, + "step": 48 + }, + { + "epoch": 0.001, + "grad_norm": 35.5, + "grad_norm_var": 279.284375, + "learning_rate": 5.500000000000001e-05, + "loss": 21.036, + "loss/crossentropy": 3.777758002281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.7479944229125977, + "step": 50 + }, + { + "epoch": 0.00104, + "grad_norm": 20.75, + "grad_norm_var": 306.15149739583336, + "learning_rate": 5.680000000000001e-05, + "loss": 20.3608, + "loss/crossentropy": 3.5903185606002808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.5533717274665833, + "step": 52 + }, + { + "epoch": 0.00108, + "grad_norm": 43.75, + "grad_norm_var": 213.07395833333334, + "learning_rate": 5.860000000000001e-05, + "loss": 18.813, + "loss/crossentropy": 3.691780686378479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.4755533933639526, + "step": 54 + }, + { + "epoch": 0.00112, + "grad_norm": 21.25, + "grad_norm_var": 70.690625, + "learning_rate": 6.040000000000001e-05, + "loss": 19.1421, + "loss/crossentropy": 3.557003617286682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.5198156833648682, + "step": 56 + }, + { + "epoch": 0.00116, + "grad_norm": 21.5, + "grad_norm_var": 76.30390625, + "learning_rate": 6.220000000000001e-05, + "loss": 17.2705, + "loss/crossentropy": 3.2730292081832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.4131997227668762, + "step": 58 + }, + { + "epoch": 0.0012, + "grad_norm": 19.875, + "grad_norm_var": 77.11399739583334, + "learning_rate": 6.400000000000001e-05, + "loss": 16.4712, + "loss/crossentropy": 3.419156074523926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.3277101516723633, + "step": 60 + }, + { + "epoch": 0.00124, + "grad_norm": 25.75, + "grad_norm_var": 48.60520833333333, + "learning_rate": 6.58e-05, + "loss": 16.6219, + "loss/crossentropy": 2.973878502845764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.3438007831573486, + "step": 62 + }, + { + "epoch": 0.00128, + "grad_norm": 34.5, + "grad_norm_var": 53.18020833333333, + "learning_rate": 6.76e-05, + "loss": 15.0929, + "loss/crossentropy": 2.892021059989929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1624282002449036, + "step": 64 + }, + { + "epoch": 0.00132, + "grad_norm": 15.4375, + "grad_norm_var": 51.195947265625, + "learning_rate": 6.94e-05, + "loss": 15.1967, + "loss/crossentropy": 2.954660177230835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.116301715373993, + "step": 66 + }, + { + "epoch": 0.00136, + "grad_norm": 32.0, + "grad_norm_var": 51.064306640625, + "learning_rate": 7.120000000000001e-05, + "loss": 14.9397, + "loss/crossentropy": 3.2686156034469604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1978037357330322, + "step": 68 + }, + { + "epoch": 0.0014, + "grad_norm": 29.25, + "grad_norm_var": 32.794270833333336, + "learning_rate": 7.3e-05, + "loss": 14.4846, + "loss/crossentropy": 2.7956581115722656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1973016262054443, + "step": 70 + }, + { + "epoch": 0.00144, + "grad_norm": 15.25, + "grad_norm_var": 37.80149739583333, + "learning_rate": 7.48e-05, + "loss": 14.1296, + "loss/crossentropy": 3.08966863155365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.151496708393097, + "step": 72 + }, + { + "epoch": 0.00148, + "grad_norm": 18.625, + "grad_norm_var": 41.88984375, + "learning_rate": 7.66e-05, + "loss": 13.6812, + "loss/crossentropy": 2.949987292289734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9787414371967316, + "step": 74 + }, + { + "epoch": 0.00152, + "grad_norm": 15.1875, + "grad_norm_var": 49.143489583333334, + "learning_rate": 7.840000000000001e-05, + "loss": 12.8901, + "loss/crossentropy": 3.1161292791366577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.0640113949775696, + "step": 76 + }, + { + "epoch": 0.00156, + "grad_norm": 22.125, + "grad_norm_var": 47.63170572916667, + "learning_rate": 8.020000000000001e-05, + "loss": 13.157, + "loss/crossentropy": 3.3661664724349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 1.1353825330734253, + "step": 78 + }, + { + "epoch": 0.0016, + "grad_norm": 16.25, + "grad_norm_var": 35.33274739583333, + "learning_rate": 8.200000000000001e-05, + "loss": 12.9372, + "loss/crossentropy": 2.927241563796997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9984134435653687, + "step": 80 + }, + { + "epoch": 0.00164, + "grad_norm": 13.625, + "grad_norm_var": 37.53951822916667, + "learning_rate": 8.38e-05, + "loss": 12.0477, + "loss/crossentropy": 3.1273285150527954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9241160154342651, + "step": 82 + }, + { + "epoch": 0.00168, + "grad_norm": 19.125, + "grad_norm_var": 19.602718098958334, + "learning_rate": 8.560000000000001e-05, + "loss": 11.9084, + "loss/crossentropy": 2.737278938293457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8822851181030273, + "step": 84 + }, + { + "epoch": 0.00172, + "grad_norm": 13.0625, + "grad_norm_var": 11.107014973958334, + "learning_rate": 8.740000000000001e-05, + "loss": 11.8594, + "loss/crossentropy": 2.4452388286590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8693483769893646, + "step": 86 + }, + { + "epoch": 0.00176, + "grad_norm": 18.375, + "grad_norm_var": 11.328369140625, + "learning_rate": 8.92e-05, + "loss": 11.5058, + "loss/crossentropy": 2.89771831035614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8749278783798218, + "step": 88 + }, + { + "epoch": 0.0018, + "grad_norm": 12.1875, + "grad_norm_var": 13.811572265625, + "learning_rate": 9.1e-05, + "loss": 11.9281, + "loss/crossentropy": 3.0173208713531494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8714744746685028, + "step": 90 + }, + { + "epoch": 0.00184, + "grad_norm": 20.375, + "grad_norm_var": 16.114306640625, + "learning_rate": 9.28e-05, + "loss": 11.4244, + "loss/crossentropy": 2.588515043258667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9651442170143127, + "step": 92 + }, + { + "epoch": 0.00188, + "grad_norm": 13.375, + "grad_norm_var": 12.784309895833333, + "learning_rate": 9.46e-05, + "loss": 11.6092, + "loss/crossentropy": 2.8116774559020996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.9143906235694885, + "step": 94 + }, + { + "epoch": 0.00192, + "grad_norm": 16.375, + "grad_norm_var": 13.225764973958333, + "learning_rate": 9.64e-05, + "loss": 10.6723, + "loss/crossentropy": 2.8734441995620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.814399927854538, + "step": 96 + }, + { + "epoch": 0.00196, + "grad_norm": 10.3125, + "grad_norm_var": 17.228238932291667, + "learning_rate": 9.82e-05, + "loss": 10.6577, + "loss/crossentropy": 2.664194703102112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7905783653259277, + "step": 98 + }, + { + "epoch": 0.002, + "grad_norm": 12.3125, + "grad_norm_var": 17.113785807291666, + "learning_rate": 0.0001, + "loss": 10.6847, + "loss/crossentropy": 2.4851003885269165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.769305944442749, + "step": 100 + }, + { + "epoch": 0.00204, + "grad_norm": 10.75, + "grad_norm_var": 13.9140625, + "learning_rate": 0.0001, + "loss": 10.8119, + "loss/crossentropy": 2.2757182121276855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8241511881351471, + "step": 102 + }, + { + "epoch": 0.00208, + "grad_norm": 11.375, + "grad_norm_var": 13.615738932291666, + "learning_rate": 0.0001, + "loss": 10.6244, + "loss/crossentropy": 2.7211785316467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.8518709540367126, + "step": 104 + }, + { + "epoch": 0.00212, + "grad_norm": 11.625, + "grad_norm_var": 14.163395182291667, + "learning_rate": 0.0001, + "loss": 10.5629, + "loss/crossentropy": 2.387019991874695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7568954229354858, + "step": 106 + }, + { + "epoch": 0.00216, + "grad_norm": 9.9375, + "grad_norm_var": 10.788525390625, + "learning_rate": 0.0001, + "loss": 10.1364, + "loss/crossentropy": 2.5363346338272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7648341059684753, + "step": 108 + }, + { + "epoch": 0.0022, + "grad_norm": 20.0, + "grad_norm_var": 14.898893229166667, + "learning_rate": 0.0001, + "loss": 10.8773, + "loss/crossentropy": 2.8450236320495605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.737193763256073, + "step": 110 + }, + { + "epoch": 0.00224, + "grad_norm": 10.25, + "grad_norm_var": 15.364518229166666, + "learning_rate": 0.0001, + "loss": 9.4554, + "loss/crossentropy": 2.4827451705932617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6924614012241364, + "step": 112 + }, + { + "epoch": 0.00228, + "grad_norm": 10.5625, + "grad_norm_var": 7.461832682291667, + "learning_rate": 0.0001, + "loss": 9.9651, + "loss/crossentropy": 2.093318462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7038464546203613, + "step": 114 + }, + { + "epoch": 0.00232, + "grad_norm": 9.6875, + "grad_norm_var": 7.597330729166667, + "learning_rate": 0.0001, + "loss": 10.0297, + "loss/crossentropy": 2.5149790048599243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6980189085006714, + "step": 116 + }, + { + "epoch": 0.00236, + "grad_norm": 11.625, + "grad_norm_var": 6.672509765625, + "learning_rate": 0.0001, + "loss": 9.8176, + "loss/crossentropy": 2.6022276878356934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7005251348018646, + "step": 118 + }, + { + "epoch": 0.0024, + "grad_norm": 7.625, + "grad_norm_var": 7.298160807291667, + "learning_rate": 0.0001, + "loss": 9.5658, + "loss/crossentropy": 2.6836462020874023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7237774729728699, + "step": 120 + }, + { + "epoch": 0.00244, + "grad_norm": 9.5625, + "grad_norm_var": 7.402018229166667, + "learning_rate": 0.0001, + "loss": 9.7376, + "loss/crossentropy": 2.6823805570602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7570162117481232, + "step": 122 + }, + { + "epoch": 0.00248, + "grad_norm": 11.25, + "grad_norm_var": 7.391259765625, + "learning_rate": 0.0001, + "loss": 9.4713, + "loss/crossentropy": 2.6233514547348022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7183247208595276, + "step": 124 + }, + { + "epoch": 0.00252, + "grad_norm": 9.9375, + "grad_norm_var": 1.0839680989583333, + "learning_rate": 0.0001, + "loss": 9.2243, + "loss/crossentropy": 2.331676959991455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6982125043869019, + "step": 126 + }, + { + "epoch": 0.00256, + "grad_norm": 9.1875, + "grad_norm_var": 0.9687337239583333, + "learning_rate": 0.0001, + "loss": 9.4777, + "loss/crossentropy": 2.429046392440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.696417510509491, + "step": 128 + }, + { + "epoch": 0.0026, + "grad_norm": 14.8125, + "grad_norm_var": 2.448291015625, + "learning_rate": 0.0001, + "loss": 9.9024, + "loss/crossentropy": 2.5262571573257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.716008871793747, + "step": 130 + }, + { + "epoch": 0.00264, + "grad_norm": 8.8125, + "grad_norm_var": 2.7044270833333335, + "learning_rate": 0.0001, + "loss": 9.2836, + "loss/crossentropy": 2.187526524066925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.666053056716919, + "step": 132 + }, + { + "epoch": 0.00268, + "grad_norm": 8.9375, + "grad_norm_var": 3.285270182291667, + "learning_rate": 0.0001, + "loss": 9.8338, + "loss/crossentropy": 2.4199057817459106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6709816455841064, + "step": 134 + }, + { + "epoch": 0.00272, + "grad_norm": 9.6875, + "grad_norm_var": 3.4072916666666666, + "learning_rate": 0.0001, + "loss": 9.4225, + "loss/crossentropy": 2.1963008642196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5963725447654724, + "step": 136 + }, + { + "epoch": 0.00276, + "grad_norm": 8.0625, + "grad_norm_var": 3.556884765625, + "learning_rate": 0.0001, + "loss": 9.44, + "loss/crossentropy": 2.5878132581710815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6362220048904419, + "step": 138 + }, + { + "epoch": 0.0028, + "grad_norm": 9.25, + "grad_norm_var": 3.4852701822916665, + "learning_rate": 0.0001, + "loss": 9.4314, + "loss/crossentropy": 2.7800480127334595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6819457113742828, + "step": 140 + }, + { + "epoch": 0.00284, + "grad_norm": 8.4375, + "grad_norm_var": 3.838997395833333, + "learning_rate": 0.0001, + "loss": 9.1047, + "loss/crossentropy": 2.5055110454559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.674308180809021, + "step": 142 + }, + { + "epoch": 0.00288, + "grad_norm": 8.25, + "grad_norm_var": 4.078499348958333, + "learning_rate": 0.0001, + "loss": 9.1578, + "loss/crossentropy": 2.8532944917678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.7083481848239899, + "step": 144 + }, + { + "epoch": 0.00292, + "grad_norm": 7.8125, + "grad_norm_var": 2.2759765625, + "learning_rate": 0.0001, + "loss": 8.8023, + "loss/crossentropy": 2.442527174949646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6510869562625885, + "step": 146 + }, + { + "epoch": 0.00296, + "grad_norm": 10.9375, + "grad_norm_var": 2.8544881184895834, + "learning_rate": 0.0001, + "loss": 8.7238, + "loss/crossentropy": 2.516597867012024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6362285614013672, + "step": 148 + }, + { + "epoch": 0.003, + "grad_norm": 7.40625, + "grad_norm_var": 1.8615885416666667, + "learning_rate": 0.0001, + "loss": 8.5543, + "loss/crossentropy": 2.8672900199890137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6459421515464783, + "step": 150 + }, + { + "epoch": 0.00304, + "grad_norm": 8.1875, + "grad_norm_var": 1.7195963541666666, + "learning_rate": 0.0001, + "loss": 8.6403, + "loss/crossentropy": 2.2042795419692993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5720505118370056, + "step": 152 + }, + { + "epoch": 0.00308, + "grad_norm": 7.8125, + "grad_norm_var": 1.876025390625, + "learning_rate": 0.0001, + "loss": 8.768, + "loss/crossentropy": 2.225563883781433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6328675150871277, + "step": 154 + }, + { + "epoch": 0.00312, + "grad_norm": 8.0625, + "grad_norm_var": 1.5925618489583333, + "learning_rate": 0.0001, + "loss": 8.5743, + "loss/crossentropy": 2.3541462421417236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5927431881427765, + "step": 156 + }, + { + "epoch": 0.00316, + "grad_norm": 7.34375, + "grad_norm_var": 1.834619140625, + "learning_rate": 0.0001, + "loss": 8.7329, + "loss/crossentropy": 2.4685616493225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6299368739128113, + "step": 158 + }, + { + "epoch": 0.0032, + "grad_norm": 9.25, + "grad_norm_var": 1.4429646809895833, + "learning_rate": 0.0001, + "loss": 8.4796, + "loss/crossentropy": 2.4637919664382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6084515154361725, + "step": 160 + }, + { + "epoch": 0.00324, + "grad_norm": 9.0625, + "grad_norm_var": 1.3692545572916666, + "learning_rate": 0.0001, + "loss": 9.0446, + "loss/crossentropy": 2.598397374153137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.574739396572113, + "step": 162 + }, + { + "epoch": 0.00328, + "grad_norm": 7.15625, + "grad_norm_var": 0.9171834309895833, + "learning_rate": 0.0001, + "loss": 8.1508, + "loss/crossentropy": 2.5183030366897583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5985860526561737, + "step": 164 + }, + { + "epoch": 0.00332, + "grad_norm": 9.125, + "grad_norm_var": 0.9571614583333333, + "learning_rate": 0.0001, + "loss": 8.4296, + "loss/crossentropy": 2.252183437347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5627379417419434, + "step": 166 + }, + { + "epoch": 0.00336, + "grad_norm": 7.875, + "grad_norm_var": 0.70533447265625, + "learning_rate": 0.0001, + "loss": 8.4549, + "loss/crossentropy": 2.5720516443252563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5945309698581696, + "step": 168 + }, + { + "epoch": 0.0034, + "grad_norm": 9.1875, + "grad_norm_var": 0.8844034830729167, + "learning_rate": 0.0001, + "loss": 8.6096, + "loss/crossentropy": 2.3004332184791565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5401738286018372, + "step": 170 + }, + { + "epoch": 0.00344, + "grad_norm": 7.71875, + "grad_norm_var": 0.948681640625, + "learning_rate": 0.0001, + "loss": 8.5484, + "loss/crossentropy": 2.689734935760498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6384358406066895, + "step": 172 + }, + { + "epoch": 0.00348, + "grad_norm": 6.84375, + "grad_norm_var": 0.9418253580729167, + "learning_rate": 0.0001, + "loss": 8.1888, + "loss/crossentropy": 1.944397747516632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5828913450241089, + "step": 174 + }, + { + "epoch": 0.00352, + "grad_norm": 6.53125, + "grad_norm_var": 0.8817342122395834, + "learning_rate": 0.0001, + "loss": 8.0577, + "loss/crossentropy": 2.8166507482528687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5916908979415894, + "step": 176 + }, + { + "epoch": 0.00356, + "grad_norm": 7.09375, + "grad_norm_var": 0.7456990559895833, + "learning_rate": 0.0001, + "loss": 8.7701, + "loss/crossentropy": 2.3208402395248413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5968939661979675, + "step": 178 + }, + { + "epoch": 0.0036, + "grad_norm": 7.25, + "grad_norm_var": 0.7425130208333334, + "learning_rate": 0.0001, + "loss": 8.2615, + "loss/crossentropy": 2.817763566970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5800873041152954, + "step": 180 + }, + { + "epoch": 0.00364, + "grad_norm": 6.78125, + "grad_norm_var": 0.6126912434895834, + "learning_rate": 0.0001, + "loss": 8.3053, + "loss/crossentropy": 2.250023365020752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5279016494750977, + "step": 182 + }, + { + "epoch": 0.00368, + "grad_norm": 6.09375, + "grad_norm_var": 0.6917805989583333, + "learning_rate": 0.0001, + "loss": 7.7974, + "loss/crossentropy": 2.1337096095085144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5393811166286469, + "step": 184 + }, + { + "epoch": 0.00372, + "grad_norm": 6.34375, + "grad_norm_var": 0.5962198893229167, + "learning_rate": 0.0001, + "loss": 7.7258, + "loss/crossentropy": 2.6338934898376465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6444519460201263, + "step": 186 + }, + { + "epoch": 0.00376, + "grad_norm": 10.4375, + "grad_norm_var": 1.2786458333333333, + "learning_rate": 0.0001, + "loss": 8.0762, + "loss/crossentropy": 2.66677463054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6012931764125824, + "step": 188 + }, + { + "epoch": 0.0038, + "grad_norm": 6.375, + "grad_norm_var": 1.6642537434895834, + "learning_rate": 0.0001, + "loss": 8.3177, + "loss/crossentropy": 2.3731196522712708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.6299596428871155, + "step": 190 + }, + { + "epoch": 0.00384, + "grad_norm": 7.4375, + "grad_norm_var": 1.7719889322916667, + "learning_rate": 0.0001, + "loss": 8.214, + "loss/crossentropy": 2.411492705345154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5328834354877472, + "step": 192 + }, + { + "epoch": 0.00388, + "grad_norm": 7.15625, + "grad_norm_var": 1.9489420572916667, + "learning_rate": 0.0001, + "loss": 7.9763, + "loss/crossentropy": 2.2402734756469727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.539243072271347, + "step": 194 + }, + { + "epoch": 0.00392, + "grad_norm": 7.53125, + "grad_norm_var": 1.895556640625, + "learning_rate": 0.0001, + "loss": 7.9292, + "loss/crossentropy": 2.3250681161880493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5374342203140259, + "step": 196 + }, + { + "epoch": 0.00396, + "grad_norm": 7.75, + "grad_norm_var": 1.8979451497395834, + "learning_rate": 0.0001, + "loss": 7.9201, + "loss/crossentropy": 2.42138135433197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5410921573638916, + "step": 198 + }, + { + "epoch": 0.004, + "grad_norm": 6.78125, + "grad_norm_var": 1.8954386393229166, + "learning_rate": 0.0001, + "loss": 7.7597, + "loss/crossentropy": 2.1954251527786255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49367184937000275, + "step": 200 + }, + { + "epoch": 0.00404, + "grad_norm": 6.75, + "grad_norm_var": 1.6848307291666667, + "learning_rate": 0.0001, + "loss": 7.9033, + "loss/crossentropy": 2.81479811668396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5567455887794495, + "step": 202 + }, + { + "epoch": 0.00408, + "grad_norm": 6.5, + "grad_norm_var": 1.08228759765625, + "learning_rate": 0.0001, + "loss": 7.9812, + "loss/crossentropy": 2.611761450767517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.544409453868866, + "step": 204 + }, + { + "epoch": 0.00412, + "grad_norm": 6.125, + "grad_norm_var": 0.650244140625, + "learning_rate": 0.0001, + "loss": 7.7921, + "loss/crossentropy": 2.1369245052337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5181446373462677, + "step": 206 + }, + { + "epoch": 0.00416, + "grad_norm": 7.1875, + "grad_norm_var": 0.30745035807291665, + "learning_rate": 0.0001, + "loss": 8.3375, + "loss/crossentropy": 2.435856580734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5506252646446228, + "step": 208 + }, + { + "epoch": 0.0042, + "grad_norm": 6.21875, + "grad_norm_var": 0.26848958333333334, + "learning_rate": 0.0001, + "loss": 7.7599, + "loss/crossentropy": 2.2404768466949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46975645422935486, + "step": 210 + }, + { + "epoch": 0.00424, + "grad_norm": 6.71875, + "grad_norm_var": 0.19713541666666667, + "learning_rate": 0.0001, + "loss": 7.7083, + "loss/crossentropy": 2.4866777658462524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5217231214046478, + "step": 212 + }, + { + "epoch": 0.00428, + "grad_norm": 6.15625, + "grad_norm_var": 0.121484375, + "learning_rate": 0.0001, + "loss": 7.6519, + "loss/crossentropy": 2.074867010116577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4683506190776825, + "step": 214 + }, + { + "epoch": 0.00432, + "grad_norm": 7.59375, + "grad_norm_var": 0.21679280598958334, + "learning_rate": 0.0001, + "loss": 7.6062, + "loss/crossentropy": 2.2040151357650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5235520601272583, + "step": 216 + }, + { + "epoch": 0.00436, + "grad_norm": 7.25, + "grad_norm_var": 0.25690104166666666, + "learning_rate": 0.0001, + "loss": 7.886, + "loss/crossentropy": 2.174479365348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5243187248706818, + "step": 218 + }, + { + "epoch": 0.0044, + "grad_norm": 6.28125, + "grad_norm_var": 0.26523030598958336, + "learning_rate": 0.0001, + "loss": 7.7535, + "loss/crossentropy": 2.5678584575653076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5818615555763245, + "step": 220 + }, + { + "epoch": 0.00444, + "grad_norm": 6.0625, + "grad_norm_var": 0.2814412434895833, + "learning_rate": 0.0001, + "loss": 7.859, + "loss/crossentropy": 2.4551891088485718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5400412082672119, + "step": 222 + }, + { + "epoch": 0.00448, + "grad_norm": 6.15625, + "grad_norm_var": 0.25310872395833334, + "learning_rate": 0.0001, + "loss": 7.7341, + "loss/crossentropy": 2.0638335943222046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49279752373695374, + "step": 224 + }, + { + "epoch": 0.00452, + "grad_norm": 6.6875, + "grad_norm_var": 0.38235677083333336, + "learning_rate": 0.0001, + "loss": 8.1064, + "loss/crossentropy": 2.553247332572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5566797852516174, + "step": 226 + }, + { + "epoch": 0.00456, + "grad_norm": 6.0625, + "grad_norm_var": 0.39451497395833335, + "learning_rate": 0.0001, + "loss": 7.7812, + "loss/crossentropy": 2.544332265853882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5118530094623566, + "step": 228 + }, + { + "epoch": 0.0046, + "grad_norm": 5.875, + "grad_norm_var": 0.4054036458333333, + "learning_rate": 0.0001, + "loss": 7.064, + "loss/crossentropy": 2.191234052181244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5157675743103027, + "step": 230 + }, + { + "epoch": 0.00464, + "grad_norm": 5.4375, + "grad_norm_var": 0.360791015625, + "learning_rate": 0.0001, + "loss": 7.6611, + "loss/crossentropy": 2.3671151399612427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5233491659164429, + "step": 232 + }, + { + "epoch": 0.00468, + "grad_norm": 7.125, + "grad_norm_var": 0.4180826822916667, + "learning_rate": 0.0001, + "loss": 7.4509, + "loss/crossentropy": 2.3003920316696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5037694871425629, + "step": 234 + }, + { + "epoch": 0.00472, + "grad_norm": 5.625, + "grad_norm_var": 0.4305338541666667, + "learning_rate": 0.0001, + "loss": 7.8236, + "loss/crossentropy": 2.4672670364379883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5187652707099915, + "step": 236 + }, + { + "epoch": 0.00476, + "grad_norm": 6.0625, + "grad_norm_var": 0.4493326822916667, + "learning_rate": 0.0001, + "loss": 7.3246, + "loss/crossentropy": 2.179289937019348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5111505687236786, + "step": 238 + }, + { + "epoch": 0.0048, + "grad_norm": 6.34375, + "grad_norm_var": 0.5123697916666666, + "learning_rate": 0.0001, + "loss": 7.6064, + "loss/crossentropy": 2.2424627542495728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5187103897333145, + "step": 240 + }, + { + "epoch": 0.00484, + "grad_norm": 5.96875, + "grad_norm_var": 0.2986979166666667, + "learning_rate": 0.0001, + "loss": 7.8108, + "loss/crossentropy": 2.8024520874023438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5700482130050659, + "step": 242 + }, + { + "epoch": 0.00488, + "grad_norm": 6.25, + "grad_norm_var": 0.4554524739583333, + "learning_rate": 0.0001, + "loss": 7.6644, + "loss/crossentropy": 2.3653491735458374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5232449471950531, + "step": 244 + }, + { + "epoch": 0.00492, + "grad_norm": 6.90625, + "grad_norm_var": 0.48513997395833336, + "learning_rate": 0.0001, + "loss": 7.5028, + "loss/crossentropy": 2.6201778650283813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5684538185596466, + "step": 246 + }, + { + "epoch": 0.00496, + "grad_norm": 7.3125, + "grad_norm_var": 0.49654541015625, + "learning_rate": 0.0001, + "loss": 7.6631, + "loss/crossentropy": 2.2811471819877625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5543638169765472, + "step": 248 + }, + { + "epoch": 0.005, + "grad_norm": 5.625, + "grad_norm_var": 0.48544514973958336, + "learning_rate": 0.0001, + "loss": 7.6914, + "loss/crossentropy": 2.4381459951400757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5594777166843414, + "step": 250 + }, + { + "epoch": 0.00504, + "grad_norm": 9.375, + "grad_norm_var": 1.04000244140625, + "learning_rate": 0.0001, + "loss": 7.7346, + "loss/crossentropy": 2.435782313346863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5159732699394226, + "step": 252 + }, + { + "epoch": 0.00508, + "grad_norm": 5.46875, + "grad_norm_var": 1.0892862955729166, + "learning_rate": 0.0001, + "loss": 8.0282, + "loss/crossentropy": 2.7867215871810913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.551640123128891, + "step": 254 + }, + { + "epoch": 0.00512, + "grad_norm": 5.46875, + "grad_norm_var": 1.2432902018229166, + "learning_rate": 0.0001, + "loss": 7.0744, + "loss/crossentropy": 1.9328945875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45153285562992096, + "step": 256 + }, + { + "epoch": 0.00516, + "grad_norm": 6.21875, + "grad_norm_var": 1.2460286458333334, + "learning_rate": 0.0001, + "loss": 7.292, + "loss/crossentropy": 2.552613139152527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5215992629528046, + "step": 258 + }, + { + "epoch": 0.0052, + "grad_norm": 5.40625, + "grad_norm_var": 1.1848958333333333, + "learning_rate": 0.0001, + "loss": 7.4992, + "loss/crossentropy": 2.3720492124557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5024219453334808, + "step": 260 + }, + { + "epoch": 0.00524, + "grad_norm": 8.1875, + "grad_norm_var": 1.4898274739583333, + "learning_rate": 0.0001, + "loss": 7.4676, + "loss/crossentropy": 2.465815782546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5383751839399338, + "step": 262 + }, + { + "epoch": 0.00528, + "grad_norm": 6.0625, + "grad_norm_var": 1.5113240559895833, + "learning_rate": 0.0001, + "loss": 7.3163, + "loss/crossentropy": 2.2791935205459595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.534159854054451, + "step": 264 + }, + { + "epoch": 0.00532, + "grad_norm": 6.28125, + "grad_norm_var": 1.3855305989583333, + "learning_rate": 0.0001, + "loss": 7.9279, + "loss/crossentropy": 2.48906409740448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5344790518283844, + "step": 266 + }, + { + "epoch": 0.00536, + "grad_norm": 4.90625, + "grad_norm_var": 1.0180826822916667, + "learning_rate": 0.0001, + "loss": 7.3178, + "loss/crossentropy": 2.0858306884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47646892070770264, + "step": 268 + }, + { + "epoch": 0.0054, + "grad_norm": 8.375, + "grad_norm_var": 1.18258056640625, + "learning_rate": 0.0001, + "loss": 7.383, + "loss/crossentropy": 2.159322738647461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5543113648891449, + "step": 270 + }, + { + "epoch": 0.00544, + "grad_norm": 4.9375, + "grad_norm_var": 1.204931640625, + "learning_rate": 0.0001, + "loss": 7.1635, + "loss/crossentropy": 2.249913454055786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4896702766418457, + "step": 272 + }, + { + "epoch": 0.00548, + "grad_norm": 8.125, + "grad_norm_var": 1.388916015625, + "learning_rate": 0.0001, + "loss": 7.3565, + "loss/crossentropy": 1.998712420463562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4770403504371643, + "step": 274 + }, + { + "epoch": 0.00552, + "grad_norm": 5.53125, + "grad_norm_var": 1.9266764322916667, + "learning_rate": 0.0001, + "loss": 7.6522, + "loss/crossentropy": 2.391260862350464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5259605348110199, + "step": 276 + }, + { + "epoch": 0.00556, + "grad_norm": 6.1875, + "grad_norm_var": 1.6884765625, + "learning_rate": 0.0001, + "loss": 7.2935, + "loss/crossentropy": 2.523361325263977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4724537879228592, + "step": 278 + }, + { + "epoch": 0.0056, + "grad_norm": 5.34375, + "grad_norm_var": 1.7386555989583334, + "learning_rate": 0.0001, + "loss": 7.3505, + "loss/crossentropy": 2.281963586807251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.49724647402763367, + "step": 280 + }, + { + "epoch": 0.00564, + "grad_norm": 5.0625, + "grad_norm_var": 1.8446451822916667, + "learning_rate": 0.0001, + "loss": 7.1079, + "loss/crossentropy": 2.2403814792633057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45384156703948975, + "step": 282 + }, + { + "epoch": 0.00568, + "grad_norm": 5.6875, + "grad_norm_var": 1.670166015625, + "learning_rate": 0.0001, + "loss": 7.4318, + "loss/crossentropy": 2.2687963247299194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46648281812667847, + "step": 284 + }, + { + "epoch": 0.00572, + "grad_norm": 5.75, + "grad_norm_var": 1.38541259765625, + "learning_rate": 0.0001, + "loss": 7.3035, + "loss/crossentropy": 2.3336217403411865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4792183041572571, + "step": 286 + }, + { + "epoch": 0.00576, + "grad_norm": 6.4375, + "grad_norm_var": 1.292431640625, + "learning_rate": 0.0001, + "loss": 7.0031, + "loss/crossentropy": 2.4006571769714355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.473391056060791, + "step": 288 + }, + { + "epoch": 0.0058, + "grad_norm": 5.09375, + "grad_norm_var": 1.18671875, + "learning_rate": 0.0001, + "loss": 6.8037, + "loss/crossentropy": 2.0306124687194824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4722501188516617, + "step": 290 + }, + { + "epoch": 0.00584, + "grad_norm": 5.34375, + "grad_norm_var": 0.38865559895833335, + "learning_rate": 0.0001, + "loss": 7.1378, + "loss/crossentropy": 2.412277102470398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48340730369091034, + "step": 292 + }, + { + "epoch": 0.00588, + "grad_norm": 6.21875, + "grad_norm_var": 0.3952433268229167, + "learning_rate": 0.0001, + "loss": 7.3589, + "loss/crossentropy": 2.3195769786834717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5074818134307861, + "step": 294 + }, + { + "epoch": 0.00592, + "grad_norm": 6.71875, + "grad_norm_var": 0.2867024739583333, + "learning_rate": 0.0001, + "loss": 7.2826, + "loss/crossentropy": 2.3265275955200195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5351093411445618, + "step": 296 + }, + { + "epoch": 0.00596, + "grad_norm": 5.625, + "grad_norm_var": 0.26171468098958334, + "learning_rate": 0.0001, + "loss": 7.0106, + "loss/crossentropy": 2.210574746131897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45469868183135986, + "step": 298 + }, + { + "epoch": 0.006, + "grad_norm": 6.3125, + "grad_norm_var": 0.3034138997395833, + "learning_rate": 0.0001, + "loss": 7.4741, + "loss/crossentropy": 2.347964644432068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45269395411014557, + "step": 300 + }, + { + "epoch": 0.00604, + "grad_norm": 5.0, + "grad_norm_var": 0.34308268229166666, + "learning_rate": 0.0001, + "loss": 6.8901, + "loss/crossentropy": 2.197494626045227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45010584592819214, + "step": 302 + }, + { + "epoch": 0.00608, + "grad_norm": 5.8125, + "grad_norm_var": 0.32224934895833335, + "learning_rate": 0.0001, + "loss": 6.909, + "loss/crossentropy": 2.3295196890830994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48989084362983704, + "step": 304 + }, + { + "epoch": 0.00612, + "grad_norm": 6.3125, + "grad_norm_var": 0.26347249348958335, + "learning_rate": 0.0001, + "loss": 7.6606, + "loss/crossentropy": 2.5208678245544434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46991507709026337, + "step": 306 + }, + { + "epoch": 0.00616, + "grad_norm": 5.40625, + "grad_norm_var": 0.22849934895833332, + "learning_rate": 0.0001, + "loss": 7.2881, + "loss/crossentropy": 2.6091307401657104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5090319812297821, + "step": 308 + }, + { + "epoch": 0.0062, + "grad_norm": 5.4375, + "grad_norm_var": 0.3083984375, + "learning_rate": 0.0001, + "loss": 7.0888, + "loss/crossentropy": 2.4142966270446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45959460735321045, + "step": 310 + }, + { + "epoch": 0.00624, + "grad_norm": 6.0625, + "grad_norm_var": 0.25006103515625, + "learning_rate": 0.0001, + "loss": 7.3054, + "loss/crossentropy": 2.3062673807144165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4381408095359802, + "step": 312 + }, + { + "epoch": 0.00628, + "grad_norm": 4.875, + "grad_norm_var": 0.29498697916666666, + "learning_rate": 0.0001, + "loss": 6.5202, + "loss/crossentropy": 2.1124885082244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4392775595188141, + "step": 314 + }, + { + "epoch": 0.00632, + "grad_norm": 5.09375, + "grad_norm_var": 0.3001302083333333, + "learning_rate": 0.0001, + "loss": 6.3297, + "loss/crossentropy": 2.0250568985939026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4133095294237137, + "step": 316 + }, + { + "epoch": 0.00636, + "grad_norm": 5.625, + "grad_norm_var": 0.31021728515625, + "learning_rate": 0.0001, + "loss": 6.9903, + "loss/crossentropy": 2.4011316299438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46330246329307556, + "step": 318 + }, + { + "epoch": 0.0064, + "grad_norm": 5.65625, + "grad_norm_var": 0.30305582682291665, + "learning_rate": 0.0001, + "loss": 7.2114, + "loss/crossentropy": 2.487559676170349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47237157821655273, + "step": 320 + }, + { + "epoch": 0.00644, + "grad_norm": 5.1875, + "grad_norm_var": 0.2775349934895833, + "learning_rate": 0.0001, + "loss": 6.5935, + "loss/crossentropy": 1.999566912651062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4165455400943756, + "step": 322 + }, + { + "epoch": 0.00648, + "grad_norm": 6.03125, + "grad_norm_var": 0.27496337890625, + "learning_rate": 0.0001, + "loss": 7.0573, + "loss/crossentropy": 2.545841693878174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4725654572248459, + "step": 324 + }, + { + "epoch": 0.00652, + "grad_norm": 5.0625, + "grad_norm_var": 0.2528483072916667, + "learning_rate": 0.0001, + "loss": 7.2351, + "loss/crossentropy": 2.119086444377899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4649319499731064, + "step": 326 + }, + { + "epoch": 0.00656, + "grad_norm": 5.21875, + "grad_norm_var": 0.221728515625, + "learning_rate": 0.0001, + "loss": 6.8367, + "loss/crossentropy": 2.365525245666504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5121739506721497, + "step": 328 + }, + { + "epoch": 0.0066, + "grad_norm": 5.25, + "grad_norm_var": 0.17467041015625, + "learning_rate": 0.0001, + "loss": 6.8384, + "loss/crossentropy": 2.2604740858078003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4704676419496536, + "step": 330 + }, + { + "epoch": 0.00664, + "grad_norm": 6.15625, + "grad_norm_var": 0.18977864583333334, + "learning_rate": 0.0001, + "loss": 7.5125, + "loss/crossentropy": 2.4891955852508545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5196183770895004, + "step": 332 + }, + { + "epoch": 0.00668, + "grad_norm": 5.46875, + "grad_norm_var": 0.1767578125, + "learning_rate": 0.0001, + "loss": 7.3139, + "loss/crossentropy": 2.430082321166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.47671228647232056, + "step": 334 + }, + { + "epoch": 0.00672, + "grad_norm": 5.53125, + "grad_norm_var": 0.187353515625, + "learning_rate": 0.0001, + "loss": 6.6969, + "loss/crossentropy": 2.2450510263442993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5124029517173767, + "step": 336 + }, + { + "epoch": 0.00676, + "grad_norm": 5.375, + "grad_norm_var": 0.18251546223958334, + "learning_rate": 0.0001, + "loss": 6.8537, + "loss/crossentropy": 2.225212812423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4358299970626831, + "step": 338 + }, + { + "epoch": 0.0068, + "grad_norm": 6.71875, + "grad_norm_var": 26.47734375, + "learning_rate": 0.0001, + "loss": 6.8775, + "loss/crossentropy": 2.320846140384674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42757023870944977, + "step": 340 + }, + { + "epoch": 0.00684, + "grad_norm": 4.84375, + "grad_norm_var": 26.739453125, + "learning_rate": 0.0001, + "loss": 6.7394, + "loss/crossentropy": 2.419093132019043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4633703678846359, + "step": 342 + }, + { + "epoch": 0.00688, + "grad_norm": 5.3125, + "grad_norm_var": 26.632405598958332, + "learning_rate": 0.0001, + "loss": 6.7304, + "loss/crossentropy": 1.939517080783844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4387335330247879, + "step": 344 + }, + { + "epoch": 0.00692, + "grad_norm": 5.75, + "grad_norm_var": 26.504410807291666, + "learning_rate": 0.0001, + "loss": 7.0914, + "loss/crossentropy": 2.695888638496399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48031529784202576, + "step": 346 + }, + { + "epoch": 0.00696, + "grad_norm": 6.625, + "grad_norm_var": 26.468094889322916, + "learning_rate": 0.0001, + "loss": 6.8381, + "loss/crossentropy": 2.245330333709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5032568573951721, + "step": 348 + }, + { + "epoch": 0.007, + "grad_norm": 4.28125, + "grad_norm_var": 26.707421875, + "learning_rate": 0.0001, + "loss": 6.4774, + "loss/crossentropy": 1.9668607115745544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4390462785959244, + "step": 350 + }, + { + "epoch": 0.00704, + "grad_norm": 5.25, + "grad_norm_var": 26.92008056640625, + "learning_rate": 0.0001, + "loss": 6.7795, + "loss/crossentropy": 2.4035123586654663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43734824657440186, + "step": 352 + }, + { + "epoch": 0.00708, + "grad_norm": 6.5, + "grad_norm_var": 26.851460774739582, + "learning_rate": 0.0001, + "loss": 7.3932, + "loss/crossentropy": 2.4636529684066772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5208825469017029, + "step": 354 + }, + { + "epoch": 0.00712, + "grad_norm": 6.4375, + "grad_norm_var": 0.5743123372395833, + "learning_rate": 0.0001, + "loss": 6.8482, + "loss/crossentropy": 2.085066556930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3971068561077118, + "step": 356 + }, + { + "epoch": 0.00716, + "grad_norm": 4.59375, + "grad_norm_var": 0.57847900390625, + "learning_rate": 0.0001, + "loss": 6.8962, + "loss/crossentropy": 2.194266200065613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41747787594795227, + "step": 358 + }, + { + "epoch": 0.0072, + "grad_norm": 4.59375, + "grad_norm_var": 0.649462890625, + "learning_rate": 0.0001, + "loss": 6.7381, + "loss/crossentropy": 2.4678618907928467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4633233994245529, + "step": 360 + }, + { + "epoch": 0.00724, + "grad_norm": 5.59375, + "grad_norm_var": 0.6476847330729166, + "learning_rate": 0.0001, + "loss": 6.5748, + "loss/crossentropy": 2.362962484359741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4360974431037903, + "step": 362 + }, + { + "epoch": 0.00728, + "grad_norm": 5.6875, + "grad_norm_var": 0.5360514322916666, + "learning_rate": 0.0001, + "loss": 7.3497, + "loss/crossentropy": 2.3162096738815308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4641396403312683, + "step": 364 + }, + { + "epoch": 0.00732, + "grad_norm": 6.0625, + "grad_norm_var": 0.4325358072916667, + "learning_rate": 0.0001, + "loss": 7.0856, + "loss/crossentropy": 2.279396176338196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4331911951303482, + "step": 366 + }, + { + "epoch": 0.00736, + "grad_norm": 4.96875, + "grad_norm_var": 0.377587890625, + "learning_rate": 0.0001, + "loss": 6.8288, + "loss/crossentropy": 2.333961606025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43770918250083923, + "step": 368 + }, + { + "epoch": 0.0074, + "grad_norm": 5.1875, + "grad_norm_var": 0.27955322265625, + "learning_rate": 0.0001, + "loss": 6.9245, + "loss/crossentropy": 2.130259871482849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4134685546159744, + "step": 370 + }, + { + "epoch": 0.00744, + "grad_norm": 5.59375, + "grad_norm_var": 0.19478759765625, + "learning_rate": 0.0001, + "loss": 6.4884, + "loss/crossentropy": 2.3000282049179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46207693219184875, + "step": 372 + }, + { + "epoch": 0.00748, + "grad_norm": 5.59375, + "grad_norm_var": 0.20399983723958334, + "learning_rate": 0.0001, + "loss": 7.3714, + "loss/crossentropy": 2.687412142753601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46891947090625763, + "step": 374 + }, + { + "epoch": 0.00752, + "grad_norm": 4.28125, + "grad_norm_var": 0.21962483723958334, + "learning_rate": 0.0001, + "loss": 6.4194, + "loss/crossentropy": 2.2366563081741333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42876073718070984, + "step": 376 + }, + { + "epoch": 0.00756, + "grad_norm": 5.71875, + "grad_norm_var": 0.23108317057291666, + "learning_rate": 0.0001, + "loss": 7.0141, + "loss/crossentropy": 2.5960274934768677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4519210159778595, + "step": 378 + }, + { + "epoch": 0.0076, + "grad_norm": 5.71875, + "grad_norm_var": 0.23435872395833332, + "learning_rate": 0.0001, + "loss": 6.9654, + "loss/crossentropy": 2.4690704345703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5009289383888245, + "step": 380 + }, + { + "epoch": 0.00764, + "grad_norm": 5.25, + "grad_norm_var": 0.17952067057291668, + "learning_rate": 0.0001, + "loss": 6.6068, + "loss/crossentropy": 2.188890814781189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4912077784538269, + "step": 382 + }, + { + "epoch": 0.00768, + "grad_norm": 5.15625, + "grad_norm_var": 0.19055582682291666, + "learning_rate": 0.0001, + "loss": 6.5789, + "loss/crossentropy": 2.2374125719070435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4437579810619354, + "step": 384 + }, + { + "epoch": 0.00772, + "grad_norm": 5.1875, + "grad_norm_var": 0.19308268229166667, + "learning_rate": 0.0001, + "loss": 6.8081, + "loss/crossentropy": 2.101546287536621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39717453718185425, + "step": 386 + }, + { + "epoch": 0.00776, + "grad_norm": 5.1875, + "grad_norm_var": 0.182666015625, + "learning_rate": 0.0001, + "loss": 6.5688, + "loss/crossentropy": 2.2907408475875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4657403528690338, + "step": 388 + }, + { + "epoch": 0.0078, + "grad_norm": 4.53125, + "grad_norm_var": 0.18203125, + "learning_rate": 0.0001, + "loss": 6.4664, + "loss/crossentropy": 1.9909976720809937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39810416102409363, + "step": 390 + }, + { + "epoch": 0.00784, + "grad_norm": 6.28125, + "grad_norm_var": 0.23746337890625, + "learning_rate": 0.0001, + "loss": 6.6015, + "loss/crossentropy": 2.109456777572632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4226441979408264, + "step": 392 + }, + { + "epoch": 0.00788, + "grad_norm": 6.03125, + "grad_norm_var": 0.26549072265625, + "learning_rate": 0.0001, + "loss": 7.2592, + "loss/crossentropy": 2.330615997314453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5082881152629852, + "step": 394 + }, + { + "epoch": 0.00792, + "grad_norm": 5.90625, + "grad_norm_var": 0.49332275390625, + "learning_rate": 0.0001, + "loss": 7.301, + "loss/crossentropy": 2.37632155418396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5454596877098083, + "step": 396 + }, + { + "epoch": 0.00796, + "grad_norm": 4.9375, + "grad_norm_var": 0.5669921875, + "learning_rate": 0.0001, + "loss": 6.6057, + "loss/crossentropy": 2.0303866863250732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3780263066291809, + "step": 398 + }, + { + "epoch": 0.008, + "grad_norm": 4.1875, + "grad_norm_var": 0.6671712239583333, + "learning_rate": 0.0001, + "loss": 6.8129, + "loss/crossentropy": 2.077945590019226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4111042767763138, + "step": 400 + }, + { + "epoch": 0.00804, + "grad_norm": 5.5625, + "grad_norm_var": 0.6867146809895833, + "learning_rate": 0.0001, + "loss": 6.919, + "loss/crossentropy": 2.3042391538619995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44224822521209717, + "step": 402 + }, + { + "epoch": 0.00808, + "grad_norm": 4.3125, + "grad_norm_var": 0.74537353515625, + "learning_rate": 0.0001, + "loss": 6.4785, + "loss/crossentropy": 2.15978467464447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44845885038375854, + "step": 404 + }, + { + "epoch": 0.00812, + "grad_norm": 4.90625, + "grad_norm_var": 0.69576416015625, + "learning_rate": 0.0001, + "loss": 6.3875, + "loss/crossentropy": 2.4571259021759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4629521369934082, + "step": 406 + }, + { + "epoch": 0.00816, + "grad_norm": 5.40625, + "grad_norm_var": 0.6654947916666667, + "learning_rate": 0.0001, + "loss": 7.1165, + "loss/crossentropy": 2.653234601020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5556869208812714, + "step": 408 + }, + { + "epoch": 0.0082, + "grad_norm": 4.53125, + "grad_norm_var": 0.66138916015625, + "learning_rate": 0.0001, + "loss": 6.6643, + "loss/crossentropy": 1.9738067388534546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42570993304252625, + "step": 410 + }, + { + "epoch": 0.00824, + "grad_norm": 4.0625, + "grad_norm_var": 0.46920166015625, + "learning_rate": 0.0001, + "loss": 6.2205, + "loss/crossentropy": 2.093988060951233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40544557571411133, + "step": 412 + }, + { + "epoch": 0.00828, + "grad_norm": 4.34375, + "grad_norm_var": 0.36213785807291665, + "learning_rate": 0.0001, + "loss": 6.6356, + "loss/crossentropy": 2.4798851013183594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4204765260219574, + "step": 414 + }, + { + "epoch": 0.00832, + "grad_norm": 5.625, + "grad_norm_var": 0.3986287434895833, + "learning_rate": 0.0001, + "loss": 6.5601, + "loss/crossentropy": 2.4342020750045776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5148662775754929, + "step": 416 + }, + { + "epoch": 0.00836, + "grad_norm": 5.40625, + "grad_norm_var": 0.3985514322916667, + "learning_rate": 0.0001, + "loss": 6.7757, + "loss/crossentropy": 2.3637804985046387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45475171506404877, + "step": 418 + }, + { + "epoch": 0.0084, + "grad_norm": 4.09375, + "grad_norm_var": 0.39586181640625, + "learning_rate": 0.0001, + "loss": 6.6923, + "loss/crossentropy": 2.4066261053085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45181581377983093, + "step": 420 + }, + { + "epoch": 0.00844, + "grad_norm": 4.0, + "grad_norm_var": 0.43176676432291666, + "learning_rate": 0.0001, + "loss": 6.2428, + "loss/crossentropy": 2.1273797750473022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3976929485797882, + "step": 422 + }, + { + "epoch": 0.00848, + "grad_norm": 4.90625, + "grad_norm_var": 0.26330973307291666, + "learning_rate": 0.0001, + "loss": 6.6524, + "loss/crossentropy": 2.4227113723754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46154454350471497, + "step": 424 + }, + { + "epoch": 0.00852, + "grad_norm": 4.65625, + "grad_norm_var": 0.261181640625, + "learning_rate": 0.0001, + "loss": 6.6558, + "loss/crossentropy": 2.3502479791641235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43580910563468933, + "step": 426 + }, + { + "epoch": 0.00856, + "grad_norm": 5.3125, + "grad_norm_var": 0.26236572265625, + "learning_rate": 0.0001, + "loss": 6.903, + "loss/crossentropy": 2.5034282207489014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4730219095945358, + "step": 428 + }, + { + "epoch": 0.0086, + "grad_norm": 4.40625, + "grad_norm_var": 0.2557576497395833, + "learning_rate": 0.0001, + "loss": 6.2148, + "loss/crossentropy": 2.0902098417282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.409458264708519, + "step": 430 + }, + { + "epoch": 0.00864, + "grad_norm": 4.9375, + "grad_norm_var": 0.19420166015625, + "learning_rate": 0.0001, + "loss": 6.4634, + "loss/crossentropy": 2.2203429341316223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41066519916057587, + "step": 432 + }, + { + "epoch": 0.00868, + "grad_norm": 5.03125, + "grad_norm_var": 0.14855143229166667, + "learning_rate": 0.0001, + "loss": 6.6943, + "loss/crossentropy": 2.568304419517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48846572637557983, + "step": 434 + }, + { + "epoch": 0.00872, + "grad_norm": 5.15625, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 0.0001, + "loss": 6.4829, + "loss/crossentropy": 2.359646439552307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43610572814941406, + "step": 436 + }, + { + "epoch": 0.00876, + "grad_norm": 5.03125, + "grad_norm_var": 0.08870035807291667, + "learning_rate": 0.0001, + "loss": 6.6119, + "loss/crossentropy": 2.2751121520996094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45837563276290894, + "step": 438 + }, + { + "epoch": 0.0088, + "grad_norm": 4.125, + "grad_norm_var": 0.13665364583333334, + "learning_rate": 0.0001, + "loss": 6.5338, + "loss/crossentropy": 2.334506392478943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4196038395166397, + "step": 440 + }, + { + "epoch": 0.00884, + "grad_norm": 4.1875, + "grad_norm_var": 0.15891927083333332, + "learning_rate": 0.0001, + "loss": 6.2206, + "loss/crossentropy": 1.9731069803237915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41661541163921356, + "step": 442 + }, + { + "epoch": 0.00888, + "grad_norm": 5.03125, + "grad_norm_var": 0.13339436848958333, + "learning_rate": 0.0001, + "loss": 6.3377, + "loss/crossentropy": 2.319058418273926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4656156301498413, + "step": 444 + }, + { + "epoch": 0.00892, + "grad_norm": 4.125, + "grad_norm_var": 0.15013020833333332, + "learning_rate": 0.0001, + "loss": 6.5345, + "loss/crossentropy": 2.309122085571289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41400712728500366, + "step": 446 + }, + { + "epoch": 0.00896, + "grad_norm": 4.53125, + "grad_norm_var": 0.14468994140625, + "learning_rate": 0.0001, + "loss": 6.5385, + "loss/crossentropy": 1.867617905139923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39080700278282166, + "step": 448 + }, + { + "epoch": 0.009, + "grad_norm": 5.21875, + "grad_norm_var": 0.79830322265625, + "learning_rate": 0.0001, + "loss": 6.799, + "loss/crossentropy": 2.205033838748932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4887084364891052, + "step": 450 + }, + { + "epoch": 0.00904, + "grad_norm": 4.9375, + "grad_norm_var": 0.826171875, + "learning_rate": 0.0001, + "loss": 6.6476, + "loss/crossentropy": 2.3054174184799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4140937328338623, + "step": 452 + }, + { + "epoch": 0.00908, + "grad_norm": 4.53125, + "grad_norm_var": 0.8572224934895833, + "learning_rate": 0.0001, + "loss": 6.7036, + "loss/crossentropy": 2.1358219981193542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3849467635154724, + "step": 454 + }, + { + "epoch": 0.00912, + "grad_norm": 4.9375, + "grad_norm_var": 0.8132120768229166, + "learning_rate": 0.0001, + "loss": 6.4024, + "loss/crossentropy": 1.9811997413635254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42263032495975494, + "step": 456 + }, + { + "epoch": 0.00916, + "grad_norm": 4.8125, + "grad_norm_var": 0.7787109375, + "learning_rate": 0.0001, + "loss": 6.8381, + "loss/crossentropy": 2.319555103778839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4539293050765991, + "step": 458 + }, + { + "epoch": 0.0092, + "grad_norm": 4.6875, + "grad_norm_var": 0.7744425455729167, + "learning_rate": 0.0001, + "loss": 6.72, + "loss/crossentropy": 2.4030569791793823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41373930871486664, + "step": 460 + }, + { + "epoch": 0.00924, + "grad_norm": 4.8125, + "grad_norm_var": 0.7218587239583333, + "learning_rate": 0.0001, + "loss": 6.7376, + "loss/crossentropy": 2.4479328393936157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.447835311293602, + "step": 462 + }, + { + "epoch": 0.00928, + "grad_norm": 5.5, + "grad_norm_var": 0.7289021809895834, + "learning_rate": 0.0001, + "loss": 7.0562, + "loss/crossentropy": 2.056324601173401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4374549984931946, + "step": 464 + }, + { + "epoch": 0.00932, + "grad_norm": 4.28125, + "grad_norm_var": 0.19256184895833334, + "learning_rate": 0.0001, + "loss": 6.6994, + "loss/crossentropy": 2.4104079008102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4148041307926178, + "step": 466 + }, + { + "epoch": 0.00936, + "grad_norm": 4.75, + "grad_norm_var": 0.13527018229166668, + "learning_rate": 0.0001, + "loss": 7.0758, + "loss/crossentropy": 2.734652876853943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.511719822883606, + "step": 468 + }, + { + "epoch": 0.0094, + "grad_norm": 5.0, + "grad_norm_var": 0.13118082682291668, + "learning_rate": 0.0001, + "loss": 6.4772, + "loss/crossentropy": 2.1748571395874023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3852091133594513, + "step": 470 + }, + { + "epoch": 0.00944, + "grad_norm": 4.625, + "grad_norm_var": 0.13912353515625, + "learning_rate": 0.0001, + "loss": 6.7127, + "loss/crossentropy": 2.476449966430664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4194856435060501, + "step": 472 + }, + { + "epoch": 0.00948, + "grad_norm": 4.40625, + "grad_norm_var": 0.16887613932291667, + "learning_rate": 0.0001, + "loss": 6.494, + "loss/crossentropy": 2.6383973360061646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44083887338638306, + "step": 474 + }, + { + "epoch": 0.00952, + "grad_norm": 4.375, + "grad_norm_var": 0.18435872395833333, + "learning_rate": 0.0001, + "loss": 6.3184, + "loss/crossentropy": 2.3149259090423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3863615244626999, + "step": 476 + }, + { + "epoch": 0.00956, + "grad_norm": 5.0625, + "grad_norm_var": 0.18388264973958332, + "learning_rate": 0.0001, + "loss": 6.6217, + "loss/crossentropy": 2.3096635341644287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40995509922504425, + "step": 478 + }, + { + "epoch": 0.0096, + "grad_norm": 4.34375, + "grad_norm_var": 0.10846354166666666, + "learning_rate": 0.0001, + "loss": 6.4849, + "loss/crossentropy": 2.6495853662490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.412080317735672, + "step": 480 + }, + { + "epoch": 0.00964, + "grad_norm": 5.21875, + "grad_norm_var": 0.13948160807291668, + "learning_rate": 0.0001, + "loss": 6.9293, + "loss/crossentropy": 2.445754885673523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45622071623802185, + "step": 482 + }, + { + "epoch": 0.00968, + "grad_norm": 4.40625, + "grad_norm_var": 0.13661702473958334, + "learning_rate": 0.0001, + "loss": 6.4119, + "loss/crossentropy": 2.418110966682434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40424875915050507, + "step": 484 + }, + { + "epoch": 0.00972, + "grad_norm": 5.0625, + "grad_norm_var": 0.15959879557291667, + "learning_rate": 0.0001, + "loss": 6.6356, + "loss/crossentropy": 1.9564435482025146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.45834848284721375, + "step": 486 + }, + { + "epoch": 0.00976, + "grad_norm": 5.375, + "grad_norm_var": 0.16946207682291667, + "learning_rate": 0.0001, + "loss": 6.7056, + "loss/crossentropy": 2.3772581815719604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4418800473213196, + "step": 488 + }, + { + "epoch": 0.0098, + "grad_norm": 4.0625, + "grad_norm_var": 0.208056640625, + "learning_rate": 0.0001, + "loss": 6.3239, + "loss/crossentropy": 1.9526153802871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3661540001630783, + "step": 490 + }, + { + "epoch": 0.00984, + "grad_norm": 4.875, + "grad_norm_var": 0.19999593098958332, + "learning_rate": 0.0001, + "loss": 6.7642, + "loss/crossentropy": 2.40561842918396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44370119273662567, + "step": 492 + }, + { + "epoch": 0.00988, + "grad_norm": 4.53125, + "grad_norm_var": 0.1943359375, + "learning_rate": 0.0001, + "loss": 6.6475, + "loss/crossentropy": 2.4316108226776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4437306672334671, + "step": 494 + }, + { + "epoch": 0.00992, + "grad_norm": 4.53125, + "grad_norm_var": 0.25276285807291665, + "learning_rate": 0.0001, + "loss": 6.4095, + "loss/crossentropy": 2.2919591665267944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4170967787504196, + "step": 496 + }, + { + "epoch": 0.00996, + "grad_norm": 5.03125, + "grad_norm_var": 0.21116129557291666, + "learning_rate": 0.0001, + "loss": 6.6291, + "loss/crossentropy": 2.571357250213623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4073399156332016, + "step": 498 + }, + { + "epoch": 0.01, + "grad_norm": 4.375, + "grad_norm_var": 0.21330973307291667, + "learning_rate": 0.0001, + "loss": 6.2903, + "loss/crossentropy": 2.389290928840637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41799379885196686, + "step": 500 + }, + { + "epoch": 0.01004, + "grad_norm": 4.90625, + "grad_norm_var": 0.20245768229166666, + "learning_rate": 0.0001, + "loss": 6.4319, + "loss/crossentropy": 2.0904359221458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3914492577314377, + "step": 502 + }, + { + "epoch": 0.01008, + "grad_norm": 3.625, + "grad_norm_var": 0.218212890625, + "learning_rate": 0.0001, + "loss": 6.5581, + "loss/crossentropy": 2.5435129404067993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4660491645336151, + "step": 504 + }, + { + "epoch": 0.01012, + "grad_norm": 4.75, + "grad_norm_var": 0.21458333333333332, + "learning_rate": 0.0001, + "loss": 6.3665, + "loss/crossentropy": 2.039083242416382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38188865780830383, + "step": 506 + }, + { + "epoch": 0.01016, + "grad_norm": 5.21875, + "grad_norm_var": 0.361181640625, + "learning_rate": 0.0001, + "loss": 6.4862, + "loss/crossentropy": 2.056805729866028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4257620573043823, + "step": 508 + }, + { + "epoch": 0.0102, + "grad_norm": 4.03125, + "grad_norm_var": 0.38592122395833334, + "learning_rate": 0.0001, + "loss": 6.4822, + "loss/crossentropy": 2.6178410053253174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43898941576480865, + "step": 510 + }, + { + "epoch": 0.01024, + "grad_norm": 4.59375, + "grad_norm_var": 0.3405558268229167, + "learning_rate": 0.0001, + "loss": 6.4638, + "loss/crossentropy": 2.232435703277588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37769296765327454, + "step": 512 + }, + { + "epoch": 0.01028, + "grad_norm": 4.40625, + "grad_norm_var": 0.33033447265625, + "learning_rate": 0.0001, + "loss": 6.378, + "loss/crossentropy": 1.8679735660552979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3805970698595047, + "step": 514 + }, + { + "epoch": 0.01032, + "grad_norm": 4.3125, + "grad_norm_var": 0.33175455729166664, + "learning_rate": 0.0001, + "loss": 6.7453, + "loss/crossentropy": 2.6537472009658813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4450981914997101, + "step": 516 + }, + { + "epoch": 0.01036, + "grad_norm": 4.03125, + "grad_norm_var": 0.34915262858072915, + "learning_rate": 0.0001, + "loss": 6.2745, + "loss/crossentropy": 2.5199841260910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4748214781284332, + "step": 518 + }, + { + "epoch": 0.0104, + "grad_norm": 5.5, + "grad_norm_var": 0.3482004801432292, + "learning_rate": 0.0001, + "loss": 6.6212, + "loss/crossentropy": 2.6603333950042725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4144483357667923, + "step": 520 + }, + { + "epoch": 0.01044, + "grad_norm": 4.28125, + "grad_norm_var": 0.3241119384765625, + "learning_rate": 0.0001, + "loss": 6.4073, + "loss/crossentropy": 2.284039616584778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42801250517368317, + "step": 522 + }, + { + "epoch": 0.01048, + "grad_norm": 3.78125, + "grad_norm_var": 0.1894683837890625, + "learning_rate": 0.0001, + "loss": 6.2153, + "loss/crossentropy": 2.473629951477051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4107852131128311, + "step": 524 + }, + { + "epoch": 0.01052, + "grad_norm": 4.5, + "grad_norm_var": 0.1827789306640625, + "learning_rate": 0.0001, + "loss": 6.6746, + "loss/crossentropy": 2.1443774700164795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3606857359409332, + "step": 526 + }, + { + "epoch": 0.01056, + "grad_norm": 3.96875, + "grad_norm_var": 0.18889058430989583, + "learning_rate": 0.0001, + "loss": 5.8493, + "loss/crossentropy": 1.8425135016441345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37770508229732513, + "step": 528 + }, + { + "epoch": 0.0106, + "grad_norm": 3.859375, + "grad_norm_var": 0.179052734375, + "learning_rate": 0.0001, + "loss": 6.3319, + "loss/crossentropy": 2.3705164194107056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4354119151830673, + "step": 530 + }, + { + "epoch": 0.01064, + "grad_norm": 4.53125, + "grad_norm_var": 0.16344401041666667, + "learning_rate": 0.0001, + "loss": 6.4823, + "loss/crossentropy": 2.1141316294670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34213581681251526, + "step": 532 + }, + { + "epoch": 0.01068, + "grad_norm": 4.3125, + "grad_norm_var": 0.15579325358072918, + "learning_rate": 0.0001, + "loss": 6.0454, + "loss/crossentropy": 2.14878511428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3756616413593292, + "step": 534 + }, + { + "epoch": 0.01072, + "grad_norm": 4.34375, + "grad_norm_var": 0.052155558268229166, + "learning_rate": 0.0001, + "loss": 6.4363, + "loss/crossentropy": 2.2513046264648438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.416190966963768, + "step": 536 + }, + { + "epoch": 0.01076, + "grad_norm": 3.921875, + "grad_norm_var": 0.049169921875, + "learning_rate": 0.0001, + "loss": 6.2674, + "loss/crossentropy": 2.1337047815322876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3839789927005768, + "step": 538 + }, + { + "epoch": 0.0108, + "grad_norm": 5.8125, + "grad_norm_var": 0.20488993326822916, + "learning_rate": 0.0001, + "loss": 6.0559, + "loss/crossentropy": 2.2019962072372437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3806414008140564, + "step": 540 + }, + { + "epoch": 0.01084, + "grad_norm": 4.125, + "grad_norm_var": 0.2133697509765625, + "learning_rate": 0.0001, + "loss": 5.8434, + "loss/crossentropy": 2.113224983215332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797851800918579, + "step": 542 + }, + { + "epoch": 0.01088, + "grad_norm": 4.0, + "grad_norm_var": 0.21507059733072917, + "learning_rate": 0.0001, + "loss": 6.5131, + "loss/crossentropy": 2.461037516593933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41689516603946686, + "step": 544 + }, + { + "epoch": 0.01092, + "grad_norm": 4.625, + "grad_norm_var": 0.24849853515625, + "learning_rate": 0.0001, + "loss": 6.4191, + "loss/crossentropy": 2.277098298072815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4108494818210602, + "step": 546 + }, + { + "epoch": 0.01096, + "grad_norm": 4.34375, + "grad_norm_var": 0.48818359375, + "learning_rate": 0.0001, + "loss": 6.3436, + "loss/crossentropy": 2.007936477661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36796192824840546, + "step": 548 + }, + { + "epoch": 0.011, + "grad_norm": 4.625, + "grad_norm_var": 0.5631795247395833, + "learning_rate": 0.0001, + "loss": 6.5793, + "loss/crossentropy": 2.197320520877838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38722094893455505, + "step": 550 + }, + { + "epoch": 0.01104, + "grad_norm": 4.6875, + "grad_norm_var": 0.5419230143229167, + "learning_rate": 0.0001, + "loss": 6.3185, + "loss/crossentropy": 2.225432515144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3928917348384857, + "step": 552 + }, + { + "epoch": 0.01108, + "grad_norm": 4.28125, + "grad_norm_var": 0.5455067952473959, + "learning_rate": 0.0001, + "loss": 5.9686, + "loss/crossentropy": 2.5253326892852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41547106206417084, + "step": 554 + }, + { + "epoch": 0.01112, + "grad_norm": 4.15625, + "grad_norm_var": 0.44996337890625, + "learning_rate": 0.0001, + "loss": 6.2158, + "loss/crossentropy": 2.488635540008545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4523312896490097, + "step": 556 + }, + { + "epoch": 0.01116, + "grad_norm": 4.03125, + "grad_norm_var": 0.46119384765625, + "learning_rate": 0.0001, + "loss": 6.3632, + "loss/crossentropy": 2.395568609237671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40695953369140625, + "step": 558 + }, + { + "epoch": 0.0112, + "grad_norm": 4.28125, + "grad_norm_var": 0.4615234375, + "learning_rate": 0.0001, + "loss": 5.998, + "loss/crossentropy": 2.383823275566101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4482497274875641, + "step": 560 + }, + { + "epoch": 0.01124, + "grad_norm": 4.46875, + "grad_norm_var": 0.46280008951822915, + "learning_rate": 0.0001, + "loss": 5.968, + "loss/crossentropy": 2.0261693000793457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36933301389217377, + "step": 562 + }, + { + "epoch": 0.01128, + "grad_norm": 4.65625, + "grad_norm_var": 0.2221588134765625, + "learning_rate": 0.0001, + "loss": 6.1467, + "loss/crossentropy": 2.131769895553589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.341948002576828, + "step": 564 + }, + { + "epoch": 0.01132, + "grad_norm": 4.8125, + "grad_norm_var": 0.0982574462890625, + "learning_rate": 0.0001, + "loss": 6.5288, + "loss/crossentropy": 2.3899158239364624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.417646586894989, + "step": 566 + }, + { + "epoch": 0.01136, + "grad_norm": 4.21875, + "grad_norm_var": 0.07683817545572917, + "learning_rate": 0.0001, + "loss": 6.6198, + "loss/crossentropy": 2.3139528036117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39010919630527496, + "step": 568 + }, + { + "epoch": 0.0114, + "grad_norm": 4.3125, + "grad_norm_var": 0.06782124837239584, + "learning_rate": 0.0001, + "loss": 6.1209, + "loss/crossentropy": 1.966201364994049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3554569333791733, + "step": 570 + }, + { + "epoch": 0.01144, + "grad_norm": 4.21875, + "grad_norm_var": 0.07773335774739583, + "learning_rate": 0.0001, + "loss": 6.1746, + "loss/crossentropy": 2.2325466871261597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4008040726184845, + "step": 572 + }, + { + "epoch": 0.01148, + "grad_norm": 4.96875, + "grad_norm_var": 0.09848531087239583, + "learning_rate": 0.0001, + "loss": 6.121, + "loss/crossentropy": 1.7670194506645203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3362845778465271, + "step": 574 + }, + { + "epoch": 0.01152, + "grad_norm": 4.125, + "grad_norm_var": 0.10274149576822916, + "learning_rate": 0.0001, + "loss": 6.3956, + "loss/crossentropy": 2.332284092903137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4190225303173065, + "step": 576 + }, + { + "epoch": 0.01156, + "grad_norm": 4.71875, + "grad_norm_var": 0.10549723307291667, + "learning_rate": 0.0001, + "loss": 6.4367, + "loss/crossentropy": 2.265386700630188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4109695851802826, + "step": 578 + }, + { + "epoch": 0.0116, + "grad_norm": 5.15625, + "grad_norm_var": 0.13811442057291667, + "learning_rate": 0.0001, + "loss": 6.4475, + "loss/crossentropy": 2.265889286994934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42450472712516785, + "step": 580 + }, + { + "epoch": 0.01164, + "grad_norm": 4.5625, + "grad_norm_var": 0.163134765625, + "learning_rate": 0.0001, + "loss": 6.1483, + "loss/crossentropy": 1.867847204208374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3627365529537201, + "step": 582 + }, + { + "epoch": 0.01168, + "grad_norm": 3.796875, + "grad_norm_var": 0.23681538899739582, + "learning_rate": 0.0001, + "loss": 6.5909, + "loss/crossentropy": 2.5827555656433105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5194894820451736, + "step": 584 + }, + { + "epoch": 0.01172, + "grad_norm": 4.5625, + "grad_norm_var": 0.23188374837239584, + "learning_rate": 0.0001, + "loss": 6.6319, + "loss/crossentropy": 2.26031893491745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.412450835108757, + "step": 586 + }, + { + "epoch": 0.01176, + "grad_norm": 5.53125, + "grad_norm_var": 0.2718739827473958, + "learning_rate": 0.0001, + "loss": 6.5782, + "loss/crossentropy": 2.1885104179382324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41636165976524353, + "step": 588 + }, + { + "epoch": 0.0118, + "grad_norm": 4.53125, + "grad_norm_var": 0.26240132649739584, + "learning_rate": 0.0001, + "loss": 6.5247, + "loss/crossentropy": 2.682767391204834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4279082715511322, + "step": 590 + }, + { + "epoch": 0.01184, + "grad_norm": 4.3125, + "grad_norm_var": 0.24807027180989583, + "learning_rate": 0.0001, + "loss": 6.5862, + "loss/crossentropy": 2.185304641723633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4266812950372696, + "step": 592 + }, + { + "epoch": 0.01188, + "grad_norm": 4.15625, + "grad_norm_var": 0.2589752197265625, + "learning_rate": 0.0001, + "loss": 6.2494, + "loss/crossentropy": 2.383716344833374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38570962846279144, + "step": 594 + }, + { + "epoch": 0.01192, + "grad_norm": 4.3125, + "grad_norm_var": 0.2337066650390625, + "learning_rate": 0.0001, + "loss": 6.3066, + "loss/crossentropy": 2.2963398694992065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43451687693595886, + "step": 596 + }, + { + "epoch": 0.01196, + "grad_norm": 4.96875, + "grad_norm_var": 0.2128570556640625, + "learning_rate": 0.0001, + "loss": 6.3368, + "loss/crossentropy": 2.2728757858276367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4280686676502228, + "step": 598 + }, + { + "epoch": 0.012, + "grad_norm": 4.34375, + "grad_norm_var": 0.141650390625, + "learning_rate": 0.0001, + "loss": 6.1147, + "loss/crossentropy": 2.3486615419387817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37224848568439484, + "step": 600 + }, + { + "epoch": 0.01204, + "grad_norm": 3.765625, + "grad_norm_var": 0.17512613932291668, + "learning_rate": 0.0001, + "loss": 6.4017, + "loss/crossentropy": 2.3265292644500732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4079990088939667, + "step": 602 + }, + { + "epoch": 0.01208, + "grad_norm": 3.84375, + "grad_norm_var": 0.08847554524739583, + "learning_rate": 0.0001, + "loss": 6.0861, + "loss/crossentropy": 2.45253849029541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42630523443222046, + "step": 604 + }, + { + "epoch": 0.01212, + "grad_norm": 4.03125, + "grad_norm_var": 0.08964742024739583, + "learning_rate": 0.0001, + "loss": 6.4101, + "loss/crossentropy": 2.4417446851730347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40342913568019867, + "step": 606 + }, + { + "epoch": 0.01216, + "grad_norm": 4.25, + "grad_norm_var": 0.08886617024739583, + "learning_rate": 0.0001, + "loss": 6.2513, + "loss/crossentropy": 2.1483529210090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3846609443426132, + "step": 608 + }, + { + "epoch": 0.0122, + "grad_norm": 4.1875, + "grad_norm_var": 0.09990132649739583, + "learning_rate": 0.0001, + "loss": 6.5479, + "loss/crossentropy": 2.481536865234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.440000057220459, + "step": 610 + }, + { + "epoch": 0.01224, + "grad_norm": 4.15625, + "grad_norm_var": 0.09464518229166667, + "learning_rate": 0.0001, + "loss": 6.4986, + "loss/crossentropy": 2.4472655057907104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41242682933807373, + "step": 612 + }, + { + "epoch": 0.01228, + "grad_norm": 4.09375, + "grad_norm_var": 0.07828369140625, + "learning_rate": 0.0001, + "loss": 6.4348, + "loss/crossentropy": 2.3511135578155518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38262398540973663, + "step": 614 + }, + { + "epoch": 0.01232, + "grad_norm": 3.71875, + "grad_norm_var": 0.085791015625, + "learning_rate": 0.0001, + "loss": 6.4821, + "loss/crossentropy": 2.5090683698654175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.44584622979164124, + "step": 616 + }, + { + "epoch": 0.01236, + "grad_norm": 4.75, + "grad_norm_var": 0.1117340087890625, + "learning_rate": 0.0001, + "loss": 6.0395, + "loss/crossentropy": 2.166012942790985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40271493792533875, + "step": 618 + }, + { + "epoch": 0.0124, + "grad_norm": 5.09375, + "grad_norm_var": 0.15511067708333334, + "learning_rate": 0.0001, + "loss": 6.4644, + "loss/crossentropy": 2.583309531211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43380285799503326, + "step": 620 + }, + { + "epoch": 0.01244, + "grad_norm": 3.984375, + "grad_norm_var": 0.19877827962239583, + "learning_rate": 0.0001, + "loss": 6.3289, + "loss/crossentropy": 2.125720262527466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40389589965343475, + "step": 622 + }, + { + "epoch": 0.01248, + "grad_norm": 6.34375, + "grad_norm_var": 0.4984283447265625, + "learning_rate": 0.0001, + "loss": 5.9651, + "loss/crossentropy": 1.7034094333648682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31466029584407806, + "step": 624 + }, + { + "epoch": 0.01252, + "grad_norm": 4.40625, + "grad_norm_var": 0.4901041666666667, + "learning_rate": 0.0001, + "loss": 5.9451, + "loss/crossentropy": 2.163281202316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38622182607650757, + "step": 626 + }, + { + "epoch": 0.01256, + "grad_norm": 3.875, + "grad_norm_var": 0.4982818603515625, + "learning_rate": 0.0001, + "loss": 5.8931, + "loss/crossentropy": 1.7754453420639038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33256995677948, + "step": 628 + }, + { + "epoch": 0.0126, + "grad_norm": 4.09375, + "grad_norm_var": 0.4894683837890625, + "learning_rate": 0.0001, + "loss": 6.8606, + "loss/crossentropy": 2.273309350013733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5319447964429855, + "step": 630 + }, + { + "epoch": 0.01264, + "grad_norm": 3.828125, + "grad_norm_var": 0.480126953125, + "learning_rate": 0.0001, + "loss": 6.2103, + "loss/crossentropy": 2.397401988506317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4255771040916443, + "step": 632 + }, + { + "epoch": 0.01268, + "grad_norm": 5.28125, + "grad_norm_var": 0.521875, + "learning_rate": 0.0001, + "loss": 6.4179, + "loss/crossentropy": 2.3898611068725586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.46314406394958496, + "step": 634 + }, + { + "epoch": 0.01272, + "grad_norm": 7.40625, + "grad_norm_var": 1.0609212239583334, + "learning_rate": 0.0001, + "loss": 6.5634, + "loss/crossentropy": 2.3740471601486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.48123428225517273, + "step": 636 + }, + { + "epoch": 0.01276, + "grad_norm": 3.90625, + "grad_norm_var": 1.0519846598307292, + "learning_rate": 0.0001, + "loss": 6.1136, + "loss/crossentropy": 2.236217498779297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39564159512519836, + "step": 638 + }, + { + "epoch": 0.0128, + "grad_norm": 4.21875, + "grad_norm_var": 0.7972157796223959, + "learning_rate": 0.0001, + "loss": 6.3134, + "loss/crossentropy": 2.4049174785614014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41312308609485626, + "step": 640 + }, + { + "epoch": 0.01284, + "grad_norm": 4.0, + "grad_norm_var": 0.80142822265625, + "learning_rate": 0.0001, + "loss": 6.2726, + "loss/crossentropy": 2.173800766468048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3983110189437866, + "step": 642 + }, + { + "epoch": 0.01288, + "grad_norm": 4.65625, + "grad_norm_var": 0.80084228515625, + "learning_rate": 0.0001, + "loss": 6.2779, + "loss/crossentropy": 2.2124537229537964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38358184695243835, + "step": 644 + }, + { + "epoch": 0.01292, + "grad_norm": 3.90625, + "grad_norm_var": 0.8355377197265625, + "learning_rate": 0.0001, + "loss": 6.1209, + "loss/crossentropy": 2.4939264059066772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3805152475833893, + "step": 646 + }, + { + "epoch": 0.01296, + "grad_norm": 4.25, + "grad_norm_var": 0.8250935872395834, + "learning_rate": 0.0001, + "loss": 6.0714, + "loss/crossentropy": 2.4526472091674805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3958089202642441, + "step": 648 + }, + { + "epoch": 0.013, + "grad_norm": 4.25, + "grad_norm_var": 0.7562978108723958, + "learning_rate": 0.0001, + "loss": 6.3648, + "loss/crossentropy": 2.5171029567718506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.446771502494812, + "step": 650 + }, + { + "epoch": 0.01304, + "grad_norm": 4.125, + "grad_norm_var": 0.1185943603515625, + "learning_rate": 0.0001, + "loss": 6.1656, + "loss/crossentropy": 2.270598888397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38787929713726044, + "step": 652 + }, + { + "epoch": 0.01308, + "grad_norm": 4.15625, + "grad_norm_var": 0.11404520670572917, + "learning_rate": 0.0001, + "loss": 5.7739, + "loss/crossentropy": 1.8847617506980896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3258303850889206, + "step": 654 + }, + { + "epoch": 0.01312, + "grad_norm": 3.9375, + "grad_norm_var": 0.11286519368489584, + "learning_rate": 0.0001, + "loss": 6.042, + "loss/crossentropy": 2.2471452951431274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38874460756778717, + "step": 656 + }, + { + "epoch": 0.01316, + "grad_norm": 4.125, + "grad_norm_var": 0.10369364420572917, + "learning_rate": 0.0001, + "loss": 6.4383, + "loss/crossentropy": 2.3776252269744873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40553848445415497, + "step": 658 + }, + { + "epoch": 0.0132, + "grad_norm": 3.84375, + "grad_norm_var": 0.06461588541666667, + "learning_rate": 0.0001, + "loss": 5.5389, + "loss/crossentropy": 2.291012167930603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3630271404981613, + "step": 660 + }, + { + "epoch": 0.01324, + "grad_norm": 3.90625, + "grad_norm_var": 0.06116536458333333, + "learning_rate": 0.0001, + "loss": 6.298, + "loss/crossentropy": 2.2029112577438354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37700483202934265, + "step": 662 + }, + { + "epoch": 0.01328, + "grad_norm": 3.984375, + "grad_norm_var": 0.0598541259765625, + "learning_rate": 0.0001, + "loss": 6.4093, + "loss/crossentropy": 2.571411967277527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.5071892440319061, + "step": 664 + }, + { + "epoch": 0.01332, + "grad_norm": 3.484375, + "grad_norm_var": 0.045947265625, + "learning_rate": 0.0001, + "loss": 5.6839, + "loss/crossentropy": 2.148792862892151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3591457009315491, + "step": 666 + }, + { + "epoch": 0.01336, + "grad_norm": 4.09375, + "grad_norm_var": 0.04755452473958333, + "learning_rate": 0.0001, + "loss": 6.4444, + "loss/crossentropy": 2.5091140270233154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3882133811712265, + "step": 668 + }, + { + "epoch": 0.0134, + "grad_norm": 4.09375, + "grad_norm_var": 0.049332682291666666, + "learning_rate": 0.0001, + "loss": 6.15, + "loss/crossentropy": 2.4669524431228638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3941466957330704, + "step": 670 + }, + { + "epoch": 0.01344, + "grad_norm": 3.9375, + "grad_norm_var": 0.06620686848958333, + "learning_rate": 0.0001, + "loss": 6.5358, + "loss/crossentropy": 2.4111422300338745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3973373472690582, + "step": 672 + }, + { + "epoch": 0.01348, + "grad_norm": 4.1875, + "grad_norm_var": 0.06495768229166667, + "learning_rate": 0.0001, + "loss": 5.765, + "loss/crossentropy": 2.1109864115715027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36189010739326477, + "step": 674 + }, + { + "epoch": 0.01352, + "grad_norm": 3.578125, + "grad_norm_var": 0.0744140625, + "learning_rate": 0.0001, + "loss": 6.0289, + "loss/crossentropy": 2.069494664669037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3562029302120209, + "step": 676 + }, + { + "epoch": 0.01356, + "grad_norm": 4.0625, + "grad_norm_var": 0.07224934895833333, + "learning_rate": 0.0001, + "loss": 6.4526, + "loss/crossentropy": 2.1924527883529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38948506116867065, + "step": 678 + }, + { + "epoch": 0.0136, + "grad_norm": 3.6875, + "grad_norm_var": 0.082421875, + "learning_rate": 0.0001, + "loss": 5.7311, + "loss/crossentropy": 2.1603400707244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40414859354496, + "step": 680 + }, + { + "epoch": 0.01364, + "grad_norm": 3.71875, + "grad_norm_var": 0.07177632649739583, + "learning_rate": 0.0001, + "loss": 6.2459, + "loss/crossentropy": 2.515262722969055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4363710880279541, + "step": 682 + }, + { + "epoch": 0.01368, + "grad_norm": 4.09375, + "grad_norm_var": 0.07869364420572916, + "learning_rate": 0.0001, + "loss": 6.1174, + "loss/crossentropy": 2.3615161180496216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35220713913440704, + "step": 684 + }, + { + "epoch": 0.01372, + "grad_norm": 6.03125, + "grad_norm_var": 0.3293690999348958, + "learning_rate": 0.0001, + "loss": 6.1941, + "loss/crossentropy": 1.920493245124817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35293935239315033, + "step": 686 + }, + { + "epoch": 0.01376, + "grad_norm": 4.03125, + "grad_norm_var": 0.31579488118489585, + "learning_rate": 0.0001, + "loss": 6.4064, + "loss/crossentropy": 2.493665337562561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4243907481431961, + "step": 688 + }, + { + "epoch": 0.0138, + "grad_norm": 4.03125, + "grad_norm_var": 0.3451324462890625, + "learning_rate": 0.0001, + "loss": 6.1519, + "loss/crossentropy": 2.1182271242141724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39624081552028656, + "step": 690 + }, + { + "epoch": 0.01384, + "grad_norm": 4.09375, + "grad_norm_var": 0.32034098307291664, + "learning_rate": 0.0001, + "loss": 6.2457, + "loss/crossentropy": 2.146227180957794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3491668850183487, + "step": 692 + }, + { + "epoch": 0.01388, + "grad_norm": 3.734375, + "grad_norm_var": 0.3376261393229167, + "learning_rate": 0.0001, + "loss": 5.7331, + "loss/crossentropy": 1.8458876609802246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34608474373817444, + "step": 694 + }, + { + "epoch": 0.01392, + "grad_norm": 4.03125, + "grad_norm_var": 0.3294911702473958, + "learning_rate": 0.0001, + "loss": 6.3661, + "loss/crossentropy": 2.270371675491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33937984704971313, + "step": 696 + }, + { + "epoch": 0.01396, + "grad_norm": 3.875, + "grad_norm_var": 0.32066141764322914, + "learning_rate": 0.0001, + "loss": 5.8264, + "loss/crossentropy": 2.040702223777771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3821101486682892, + "step": 698 + }, + { + "epoch": 0.014, + "grad_norm": 4.09375, + "grad_norm_var": 0.3197428385416667, + "learning_rate": 0.0001, + "loss": 6.1216, + "loss/crossentropy": 2.1711822152137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35383065044879913, + "step": 700 + }, + { + "epoch": 0.01404, + "grad_norm": 3.890625, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 5.8384, + "loss/crossentropy": 2.3292651176452637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37580642104148865, + "step": 702 + }, + { + "epoch": 0.01408, + "grad_norm": 4.34375, + "grad_norm_var": 0.06575520833333333, + "learning_rate": 0.0001, + "loss": 6.0789, + "loss/crossentropy": 2.243735432624817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3997037708759308, + "step": 704 + }, + { + "epoch": 0.01412, + "grad_norm": 3.921875, + "grad_norm_var": 0.0524566650390625, + "learning_rate": 0.0001, + "loss": 5.9987, + "loss/crossentropy": 1.9908145666122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33391132950782776, + "step": 706 + }, + { + "epoch": 0.01416, + "grad_norm": 4.375, + "grad_norm_var": 0.06243082682291667, + "learning_rate": 0.0001, + "loss": 5.536, + "loss/crossentropy": 2.280096471309662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3899015784263611, + "step": 708 + }, + { + "epoch": 0.0142, + "grad_norm": 3.6875, + "grad_norm_var": 0.12727762858072916, + "learning_rate": 0.0001, + "loss": 5.9373, + "loss/crossentropy": 2.0714810490608215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797626197338104, + "step": 710 + }, + { + "epoch": 0.01424, + "grad_norm": 3.875, + "grad_norm_var": 0.14480692545572918, + "learning_rate": 0.0001, + "loss": 6.2466, + "loss/crossentropy": 2.250674605369568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3948116898536682, + "step": 712 + }, + { + "epoch": 0.01428, + "grad_norm": 3.65625, + "grad_norm_var": 0.15719401041666667, + "learning_rate": 0.0001, + "loss": 5.8891, + "loss/crossentropy": 2.0895228385925293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34471337497234344, + "step": 714 + }, + { + "epoch": 0.01432, + "grad_norm": 4.71875, + "grad_norm_var": 0.1826812744140625, + "learning_rate": 0.0001, + "loss": 6.3748, + "loss/crossentropy": 2.337170124053955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4226381927728653, + "step": 716 + }, + { + "epoch": 0.01436, + "grad_norm": 4.03125, + "grad_norm_var": 0.174462890625, + "learning_rate": 0.0001, + "loss": 6.3996, + "loss/crossentropy": 2.337436556816101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3738469183444977, + "step": 718 + }, + { + "epoch": 0.0144, + "grad_norm": 4.15625, + "grad_norm_var": 0.1669921875, + "learning_rate": 0.0001, + "loss": 6.0278, + "loss/crossentropy": 1.9506489634513855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37431904673576355, + "step": 720 + }, + { + "epoch": 0.01444, + "grad_norm": 3.71875, + "grad_norm_var": 0.17283528645833332, + "learning_rate": 0.0001, + "loss": 5.8083, + "loss/crossentropy": 2.253044009208679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36500048637390137, + "step": 722 + }, + { + "epoch": 0.01448, + "grad_norm": 5.15625, + "grad_norm_var": 0.23413798014322917, + "learning_rate": 0.0001, + "loss": 6.2307, + "loss/crossentropy": 1.9980219006538391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34674490988254547, + "step": 724 + }, + { + "epoch": 0.01452, + "grad_norm": 3.609375, + "grad_norm_var": 0.1903472900390625, + "learning_rate": 0.0001, + "loss": 5.8283, + "loss/crossentropy": 1.894662618637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34921175241470337, + "step": 726 + }, + { + "epoch": 0.01456, + "grad_norm": 3.609375, + "grad_norm_var": 0.18310546875, + "learning_rate": 0.0001, + "loss": 6.1117, + "loss/crossentropy": 2.304685115814209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36858032643795013, + "step": 728 + }, + { + "epoch": 0.0146, + "grad_norm": 3.6875, + "grad_norm_var": 0.17696940104166667, + "learning_rate": 0.0001, + "loss": 6.1041, + "loss/crossentropy": 1.9020891189575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3854057639837265, + "step": 730 + }, + { + "epoch": 0.01464, + "grad_norm": 3.4375, + "grad_norm_var": 0.15485026041666666, + "learning_rate": 0.0001, + "loss": 5.5327, + "loss/crossentropy": 1.707019329071045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961048036813736, + "step": 732 + }, + { + "epoch": 0.01468, + "grad_norm": 3.828125, + "grad_norm_var": 0.15344136555989582, + "learning_rate": 0.0001, + "loss": 6.0543, + "loss/crossentropy": 2.423463463783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3739102631807327, + "step": 734 + }, + { + "epoch": 0.01472, + "grad_norm": 4.0625, + "grad_norm_var": 0.15335286458333333, + "learning_rate": 0.0001, + "loss": 6.2855, + "loss/crossentropy": 2.0490055680274963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4027387350797653, + "step": 736 + }, + { + "epoch": 0.01476, + "grad_norm": 3.984375, + "grad_norm_var": 0.15038960774739582, + "learning_rate": 0.0001, + "loss": 6.1236, + "loss/crossentropy": 2.3712635040283203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3602859079837799, + "step": 738 + }, + { + "epoch": 0.0148, + "grad_norm": 3.75, + "grad_norm_var": 0.058649698893229164, + "learning_rate": 0.0001, + "loss": 6.2938, + "loss/crossentropy": 2.306379556655884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38698625564575195, + "step": 740 + }, + { + "epoch": 0.01484, + "grad_norm": 3.984375, + "grad_norm_var": 0.055939737955729166, + "learning_rate": 0.0001, + "loss": 6.3983, + "loss/crossentropy": 2.6846178770065308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3881431221961975, + "step": 742 + }, + { + "epoch": 0.01488, + "grad_norm": 3.9375, + "grad_norm_var": 0.058690388997395836, + "learning_rate": 0.0001, + "loss": 5.7984, + "loss/crossentropy": 2.0555977821350098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36078818142414093, + "step": 744 + }, + { + "epoch": 0.01492, + "grad_norm": 4.09375, + "grad_norm_var": 0.059325154622395834, + "learning_rate": 0.0001, + "loss": 5.7834, + "loss/crossentropy": 2.0597304701805115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34516778588294983, + "step": 746 + }, + { + "epoch": 0.01496, + "grad_norm": 3.5625, + "grad_norm_var": 0.049479166666666664, + "learning_rate": 0.0001, + "loss": 5.8911, + "loss/crossentropy": 1.9607329964637756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33818933367729187, + "step": 748 + }, + { + "epoch": 0.015, + "grad_norm": 3.546875, + "grad_norm_var": 0.057291666666666664, + "learning_rate": 0.0001, + "loss": 5.8484, + "loss/crossentropy": 1.7854246497154236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3463115990161896, + "step": 750 + }, + { + "epoch": 0.01504, + "grad_norm": 3.703125, + "grad_norm_var": 0.0590240478515625, + "learning_rate": 0.0001, + "loss": 5.7788, + "loss/crossentropy": 1.845078468322754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3714388310909271, + "step": 752 + }, + { + "epoch": 0.01508, + "grad_norm": 3.9375, + "grad_norm_var": 0.05771077473958333, + "learning_rate": 0.0001, + "loss": 6.1677, + "loss/crossentropy": 2.3951027393341064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3723580837249756, + "step": 754 + }, + { + "epoch": 0.01512, + "grad_norm": 3.6875, + "grad_norm_var": 0.032486979166666666, + "learning_rate": 0.0001, + "loss": 6.219, + "loss/crossentropy": 2.500870108604431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42450079321861267, + "step": 756 + }, + { + "epoch": 0.01516, + "grad_norm": 3.75, + "grad_norm_var": 0.030060831705729166, + "learning_rate": 0.0001, + "loss": 5.886, + "loss/crossentropy": 2.1584274768829346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3798908591270447, + "step": 758 + }, + { + "epoch": 0.0152, + "grad_norm": 3.5625, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 5.7937, + "loss/crossentropy": 2.3126983642578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33131279051303864, + "step": 760 + }, + { + "epoch": 0.01524, + "grad_norm": 3.5, + "grad_norm_var": 0.020246378580729165, + "learning_rate": 0.0001, + "loss": 5.9182, + "loss/crossentropy": 2.349764347076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3910932093858719, + "step": 762 + }, + { + "epoch": 0.01528, + "grad_norm": 3.640625, + "grad_norm_var": 0.0224273681640625, + "learning_rate": 0.0001, + "loss": 6.0472, + "loss/crossentropy": 2.2232795357704163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3483322113752365, + "step": 764 + }, + { + "epoch": 0.01532, + "grad_norm": 3.65625, + "grad_norm_var": 0.0207427978515625, + "learning_rate": 0.0001, + "loss": 6.0312, + "loss/crossentropy": 2.2273412942886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38545119762420654, + "step": 766 + }, + { + "epoch": 0.01536, + "grad_norm": 4.125, + "grad_norm_var": 0.03355712890625, + "learning_rate": 0.0001, + "loss": 6.0523, + "loss/crossentropy": 2.5879149436950684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38191574811935425, + "step": 768 + }, + { + "epoch": 0.0154, + "grad_norm": 3.953125, + "grad_norm_var": 0.034032185872395836, + "learning_rate": 0.0001, + "loss": 6.027, + "loss/crossentropy": 2.3305420875549316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3704134076833725, + "step": 770 + }, + { + "epoch": 0.01544, + "grad_norm": 3.59375, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 6.121, + "loss/crossentropy": 2.0433666706085205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34387587010860443, + "step": 772 + }, + { + "epoch": 0.01548, + "grad_norm": 3.609375, + "grad_norm_var": 0.03337300618489583, + "learning_rate": 0.0001, + "loss": 5.5837, + "loss/crossentropy": 2.1127337217330933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3484109789133072, + "step": 774 + }, + { + "epoch": 0.01552, + "grad_norm": 3.859375, + "grad_norm_var": 0.029002888997395834, + "learning_rate": 0.0001, + "loss": 6.0028, + "loss/crossentropy": 2.1637459993362427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3757011145353317, + "step": 776 + }, + { + "epoch": 0.01556, + "grad_norm": 3.734375, + "grad_norm_var": 0.024446614583333335, + "learning_rate": 0.0001, + "loss": 5.9904, + "loss/crossentropy": 2.4118471145629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797076344490051, + "step": 778 + }, + { + "epoch": 0.0156, + "grad_norm": 3.828125, + "grad_norm_var": 0.022191365559895832, + "learning_rate": 0.0001, + "loss": 6.2649, + "loss/crossentropy": 1.9410768151283264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3254295587539673, + "step": 780 + }, + { + "epoch": 0.01564, + "grad_norm": 3.609375, + "grad_norm_var": 0.023558553059895834, + "learning_rate": 0.0001, + "loss": 5.9008, + "loss/crossentropy": 2.1669737100601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3583529591560364, + "step": 782 + }, + { + "epoch": 0.01568, + "grad_norm": 3.5625, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 5.9868, + "loss/crossentropy": 2.217113733291626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3503105044364929, + "step": 784 + }, + { + "epoch": 0.01572, + "grad_norm": 3.84375, + "grad_norm_var": 0.017154947916666666, + "learning_rate": 0.0001, + "loss": 6.0695, + "loss/crossentropy": 2.588438868522644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3941201716661453, + "step": 786 + }, + { + "epoch": 0.01576, + "grad_norm": 3.546875, + "grad_norm_var": 0.018310546875, + "learning_rate": 0.0001, + "loss": 5.9436, + "loss/crossentropy": 2.3925808668136597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3755808621644974, + "step": 788 + }, + { + "epoch": 0.0158, + "grad_norm": 3.625, + "grad_norm_var": 0.018684895833333333, + "learning_rate": 0.0001, + "loss": 5.7254, + "loss/crossentropy": 1.9568504691123962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3865346759557724, + "step": 790 + }, + { + "epoch": 0.01584, + "grad_norm": 3.96875, + "grad_norm_var": 0.024104817708333334, + "learning_rate": 0.0001, + "loss": 6.0174, + "loss/crossentropy": 2.336462616920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36937348544597626, + "step": 792 + }, + { + "epoch": 0.01588, + "grad_norm": 3.421875, + "grad_norm_var": 0.03623046875, + "learning_rate": 0.0001, + "loss": 5.7742, + "loss/crossentropy": 2.1867082715034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35803911089897156, + "step": 794 + }, + { + "epoch": 0.01592, + "grad_norm": 3.765625, + "grad_norm_var": 0.03877665201822917, + "learning_rate": 0.0001, + "loss": 6.2825, + "loss/crossentropy": 2.070562243461609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42781224846839905, + "step": 796 + }, + { + "epoch": 0.01596, + "grad_norm": 4.09375, + "grad_norm_var": 0.044384765625, + "learning_rate": 0.0001, + "loss": 6.401, + "loss/crossentropy": 2.160820960998535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34495553374290466, + "step": 798 + }, + { + "epoch": 0.016, + "grad_norm": 4.0, + "grad_norm_var": 0.045426432291666666, + "learning_rate": 0.0001, + "loss": 5.9628, + "loss/crossentropy": 2.3424230813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4417698383331299, + "step": 800 + }, + { + "epoch": 0.01604, + "grad_norm": 4.59375, + "grad_norm_var": 0.08385009765625, + "learning_rate": 0.0001, + "loss": 6.1984, + "loss/crossentropy": 2.090175747871399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35883304476737976, + "step": 802 + }, + { + "epoch": 0.01608, + "grad_norm": 3.90625, + "grad_norm_var": 0.0759429931640625, + "learning_rate": 0.0001, + "loss": 6.2044, + "loss/crossentropy": 2.460660457611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36723683774471283, + "step": 804 + }, + { + "epoch": 0.01612, + "grad_norm": 3.78125, + "grad_norm_var": 0.0783203125, + "learning_rate": 0.0001, + "loss": 5.8788, + "loss/crossentropy": 2.2680885791778564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3925183415412903, + "step": 806 + }, + { + "epoch": 0.01616, + "grad_norm": 3.796875, + "grad_norm_var": 0.10422261555989583, + "learning_rate": 0.0001, + "loss": 6.1179, + "loss/crossentropy": 2.272566020488739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3425859659910202, + "step": 808 + }, + { + "epoch": 0.0162, + "grad_norm": 3.546875, + "grad_norm_var": 0.10061442057291667, + "learning_rate": 0.0001, + "loss": 5.8933, + "loss/crossentropy": 2.2417107820510864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3634066879749298, + "step": 810 + }, + { + "epoch": 0.01624, + "grad_norm": 4.09375, + "grad_norm_var": 0.10075581868489583, + "learning_rate": 0.0001, + "loss": 5.9907, + "loss/crossentropy": 2.2117987275123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3669978231191635, + "step": 812 + }, + { + "epoch": 0.01628, + "grad_norm": 4.53125, + "grad_norm_var": 0.12078348795572917, + "learning_rate": 0.0001, + "loss": 6.1767, + "loss/crossentropy": 2.3471380472183228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39207911491394043, + "step": 814 + }, + { + "epoch": 0.01632, + "grad_norm": 4.1875, + "grad_norm_var": 0.12200113932291666, + "learning_rate": 0.0001, + "loss": 6.005, + "loss/crossentropy": 2.1516740322113037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3394138962030411, + "step": 816 + }, + { + "epoch": 0.01636, + "grad_norm": 3.765625, + "grad_norm_var": 0.10528055826822917, + "learning_rate": 0.0001, + "loss": 6.0827, + "loss/crossentropy": 2.5085272789001465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.39268872141838074, + "step": 818 + }, + { + "epoch": 0.0164, + "grad_norm": 3.515625, + "grad_norm_var": 0.1198883056640625, + "learning_rate": 0.0001, + "loss": 6.0363, + "loss/crossentropy": 2.3051916360855103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3798936903476715, + "step": 820 + }, + { + "epoch": 0.01644, + "grad_norm": 3.5, + "grad_norm_var": 0.14094136555989584, + "learning_rate": 0.0001, + "loss": 5.4403, + "loss/crossentropy": 2.1685640811920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3666132390499115, + "step": 822 + }, + { + "epoch": 0.01648, + "grad_norm": 3.53125, + "grad_norm_var": 0.109521484375, + "learning_rate": 0.0001, + "loss": 5.6981, + "loss/crossentropy": 2.3374987840652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32505205273628235, + "step": 824 + }, + { + "epoch": 0.01652, + "grad_norm": 5.4375, + "grad_norm_var": 0.280859375, + "learning_rate": 0.0001, + "loss": 6.1428, + "loss/crossentropy": 2.6427528858184814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4296632409095764, + "step": 826 + }, + { + "epoch": 0.01656, + "grad_norm": 4.0, + "grad_norm_var": 0.28609619140625, + "learning_rate": 0.0001, + "loss": 5.8304, + "loss/crossentropy": 2.1534847617149353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37605202198028564, + "step": 828 + }, + { + "epoch": 0.0166, + "grad_norm": 4.28125, + "grad_norm_var": 0.2637278238932292, + "learning_rate": 0.0001, + "loss": 6.2115, + "loss/crossentropy": 1.9642478227615356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43740569055080414, + "step": 830 + }, + { + "epoch": 0.01664, + "grad_norm": 4.59375, + "grad_norm_var": 3.3863433837890624, + "learning_rate": 0.0001, + "loss": 6.5306, + "loss/crossentropy": 2.283148407936096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3901669532060623, + "step": 832 + }, + { + "epoch": 0.01668, + "grad_norm": 3.46875, + "grad_norm_var": 3.382255045572917, + "learning_rate": 0.0001, + "loss": 6.0831, + "loss/crossentropy": 2.418351888656616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4011431038379669, + "step": 834 + }, + { + "epoch": 0.01672, + "grad_norm": 4.03125, + "grad_norm_var": 3.322565714518229, + "learning_rate": 0.0001, + "loss": 6.1852, + "loss/crossentropy": 2.40928852558136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40596309304237366, + "step": 836 + }, + { + "epoch": 0.01676, + "grad_norm": 3.34375, + "grad_norm_var": 3.3059234619140625, + "learning_rate": 0.0001, + "loss": 5.791, + "loss/crossentropy": 2.4211392402648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3640473484992981, + "step": 838 + }, + { + "epoch": 0.0168, + "grad_norm": 3.484375, + "grad_norm_var": 3.2890625, + "learning_rate": 0.0001, + "loss": 5.6553, + "loss/crossentropy": 2.047215461730957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3346950262784958, + "step": 840 + }, + { + "epoch": 0.01684, + "grad_norm": 3.578125, + "grad_norm_var": 3.28623046875, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.020021378993988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3442958742380142, + "step": 842 + }, + { + "epoch": 0.01688, + "grad_norm": 3.96875, + "grad_norm_var": 3.27164306640625, + "learning_rate": 0.0001, + "loss": 6.1484, + "loss/crossentropy": 2.1585127115249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3768642693758011, + "step": 844 + }, + { + "epoch": 0.01692, + "grad_norm": 3.84375, + "grad_norm_var": 3.298802693684896, + "learning_rate": 0.0001, + "loss": 5.8512, + "loss/crossentropy": 2.2717286348342896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3644135594367981, + "step": 846 + }, + { + "epoch": 0.01696, + "grad_norm": 3.828125, + "grad_norm_var": 0.06539713541666667, + "learning_rate": 0.0001, + "loss": 5.9203, + "loss/crossentropy": 2.2683321237564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3795373737812042, + "step": 848 + }, + { + "epoch": 0.017, + "grad_norm": 3.1875, + "grad_norm_var": 0.07066141764322917, + "learning_rate": 0.0001, + "loss": 5.7235, + "loss/crossentropy": 2.1189464330673218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35512739419937134, + "step": 850 + }, + { + "epoch": 0.01704, + "grad_norm": 3.65625, + "grad_norm_var": 0.060384114583333336, + "learning_rate": 0.0001, + "loss": 5.6536, + "loss/crossentropy": 2.260777235031128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35152101516723633, + "step": 852 + }, + { + "epoch": 0.01708, + "grad_norm": 3.671875, + "grad_norm_var": 0.06897379557291666, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.323577642440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32252687215805054, + "step": 854 + }, + { + "epoch": 0.01712, + "grad_norm": 3.71875, + "grad_norm_var": 0.07024637858072917, + "learning_rate": 0.0001, + "loss": 6.1006, + "loss/crossentropy": 2.5965300798416138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.43394728004932404, + "step": 856 + }, + { + "epoch": 0.01716, + "grad_norm": 3.140625, + "grad_norm_var": 0.08170572916666667, + "learning_rate": 0.0001, + "loss": 5.8612, + "loss/crossentropy": 2.078580856323242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3437638282775879, + "step": 858 + }, + { + "epoch": 0.0172, + "grad_norm": 3.359375, + "grad_norm_var": 0.08163960774739583, + "learning_rate": 0.0001, + "loss": 5.8394, + "loss/crossentropy": 2.425456404685974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3522993326187134, + "step": 860 + }, + { + "epoch": 0.01724, + "grad_norm": 3.515625, + "grad_norm_var": 0.08478190104166666, + "learning_rate": 0.0001, + "loss": 6.0154, + "loss/crossentropy": 2.2830835580825806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37188920378685, + "step": 862 + }, + { + "epoch": 0.01728, + "grad_norm": 3.8125, + "grad_norm_var": 0.06896870930989583, + "learning_rate": 0.0001, + "loss": 5.9086, + "loss/crossentropy": 2.090674340724945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32989686727523804, + "step": 864 + }, + { + "epoch": 0.01732, + "grad_norm": 3.8125, + "grad_norm_var": 0.0698394775390625, + "learning_rate": 0.0001, + "loss": 5.9617, + "loss/crossentropy": 2.304458498954773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37342821061611176, + "step": 866 + }, + { + "epoch": 0.01736, + "grad_norm": 3.421875, + "grad_norm_var": 0.0715484619140625, + "learning_rate": 0.0001, + "loss": 5.8593, + "loss/crossentropy": 2.6545844078063965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3627774566411972, + "step": 868 + }, + { + "epoch": 0.0174, + "grad_norm": 3.3125, + "grad_norm_var": 0.059789021809895836, + "learning_rate": 0.0001, + "loss": 5.6956, + "loss/crossentropy": 1.9977945685386658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.318715900182724, + "step": 870 + }, + { + "epoch": 0.01744, + "grad_norm": 3.203125, + "grad_norm_var": 0.08033854166666667, + "learning_rate": 0.0001, + "loss": 5.7408, + "loss/crossentropy": 1.9226595759391785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3187277615070343, + "step": 872 + }, + { + "epoch": 0.01748, + "grad_norm": 3.6875, + "grad_norm_var": 0.0694732666015625, + "learning_rate": 0.0001, + "loss": 5.9863, + "loss/crossentropy": 2.323302686214447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36724327504634857, + "step": 874 + }, + { + "epoch": 0.01752, + "grad_norm": 3.5625, + "grad_norm_var": 0.07043355305989583, + "learning_rate": 0.0001, + "loss": 5.9913, + "loss/crossentropy": 2.254343032836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3444042354822159, + "step": 876 + }, + { + "epoch": 0.01756, + "grad_norm": 3.296875, + "grad_norm_var": 0.0774078369140625, + "learning_rate": 0.0001, + "loss": 5.5038, + "loss/crossentropy": 2.0819836854934692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3477388769388199, + "step": 878 + }, + { + "epoch": 0.0176, + "grad_norm": 3.703125, + "grad_norm_var": 0.10198160807291666, + "learning_rate": 0.0001, + "loss": 5.9947, + "loss/crossentropy": 2.377693295478821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36597058176994324, + "step": 880 + }, + { + "epoch": 0.01764, + "grad_norm": 4.21875, + "grad_norm_var": 0.11789449055989583, + "learning_rate": 0.0001, + "loss": 6.3011, + "loss/crossentropy": 2.5598798990249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4082639515399933, + "step": 882 + }, + { + "epoch": 0.01768, + "grad_norm": 3.6875, + "grad_norm_var": 0.1294830322265625, + "learning_rate": 0.0001, + "loss": 5.9966, + "loss/crossentropy": 2.2847843170166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3531750440597534, + "step": 884 + }, + { + "epoch": 0.01772, + "grad_norm": 3.390625, + "grad_norm_var": 0.12878316243489582, + "learning_rate": 0.0001, + "loss": 5.6685, + "loss/crossentropy": 1.8283140063285828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.336555078625679, + "step": 886 + }, + { + "epoch": 0.01776, + "grad_norm": 3.65625, + "grad_norm_var": 0.09971415201822917, + "learning_rate": 0.0001, + "loss": 5.9507, + "loss/crossentropy": 2.1001436710357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3380406051874161, + "step": 888 + }, + { + "epoch": 0.0178, + "grad_norm": 4.21875, + "grad_norm_var": 0.11876627604166666, + "learning_rate": 0.0001, + "loss": 5.7552, + "loss/crossentropy": 2.0079030990600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35362809896469116, + "step": 890 + }, + { + "epoch": 0.01784, + "grad_norm": 4.46875, + "grad_norm_var": 0.15650634765625, + "learning_rate": 0.0001, + "loss": 5.7416, + "loss/crossentropy": 2.176286220550537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34369874000549316, + "step": 892 + }, + { + "epoch": 0.01788, + "grad_norm": 3.984375, + "grad_norm_var": 0.13087565104166668, + "learning_rate": 0.0001, + "loss": 5.9236, + "loss/crossentropy": 2.17675244808197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3495863378047943, + "step": 894 + }, + { + "epoch": 0.01792, + "grad_norm": 4.0, + "grad_norm_var": 0.12189127604166666, + "learning_rate": 0.0001, + "loss": 5.9109, + "loss/crossentropy": 2.312318801879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40353919565677643, + "step": 896 + }, + { + "epoch": 0.01796, + "grad_norm": 3.375, + "grad_norm_var": 0.14010009765625, + "learning_rate": 0.0001, + "loss": 6.071, + "loss/crossentropy": 2.28923499584198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3770768642425537, + "step": 898 + }, + { + "epoch": 0.018, + "grad_norm": 3.546875, + "grad_norm_var": 0.1447662353515625, + "learning_rate": 0.0001, + "loss": 6.0275, + "loss/crossentropy": 2.2060720920562744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35431359708309174, + "step": 900 + }, + { + "epoch": 0.01804, + "grad_norm": 3.296875, + "grad_norm_var": 0.17552083333333332, + "learning_rate": 0.0001, + "loss": 5.4052, + "loss/crossentropy": 2.0325206518173218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3037416934967041, + "step": 902 + }, + { + "epoch": 0.01808, + "grad_norm": 3.546875, + "grad_norm_var": 0.17635091145833334, + "learning_rate": 0.0001, + "loss": 5.9148, + "loss/crossentropy": 2.1943042278289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3509814143180847, + "step": 904 + }, + { + "epoch": 0.01812, + "grad_norm": 3.4375, + "grad_norm_var": 0.1696197509765625, + "learning_rate": 0.0001, + "loss": 5.563, + "loss/crossentropy": 1.9589214324951172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31621459126472473, + "step": 906 + }, + { + "epoch": 0.01816, + "grad_norm": 3.375, + "grad_norm_var": 0.12841389973958334, + "learning_rate": 0.0001, + "loss": 5.6511, + "loss/crossentropy": 2.329489588737488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3837278485298157, + "step": 908 + }, + { + "epoch": 0.0182, + "grad_norm": 3.5625, + "grad_norm_var": 0.09648030598958333, + "learning_rate": 0.0001, + "loss": 5.8082, + "loss/crossentropy": 2.1757726669311523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3659580200910568, + "step": 910 + }, + { + "epoch": 0.01824, + "grad_norm": 3.75, + "grad_norm_var": 0.08772786458333333, + "learning_rate": 0.0001, + "loss": 5.7372, + "loss/crossentropy": 2.1498661041259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3474857211112976, + "step": 912 + }, + { + "epoch": 0.01828, + "grad_norm": 15.8125, + "grad_norm_var": 9.504325358072917, + "learning_rate": 0.0001, + "loss": 5.9297, + "loss/crossentropy": 2.4722740650177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.505307987332344, + "step": 914 + }, + { + "epoch": 0.01832, + "grad_norm": 9.0, + "grad_norm_var": 10.75227762858073, + "learning_rate": 0.0001, + "loss": 5.6296, + "loss/crossentropy": 1.855428695678711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31347331404685974, + "step": 916 + }, + { + "epoch": 0.01836, + "grad_norm": 3.75, + "grad_norm_var": 10.50523173014323, + "learning_rate": 0.0001, + "loss": 5.9769, + "loss/crossentropy": 2.326256275177002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3996751010417938, + "step": 918 + }, + { + "epoch": 0.0184, + "grad_norm": 3.546875, + "grad_norm_var": 10.518257649739583, + "learning_rate": 0.0001, + "loss": 5.8444, + "loss/crossentropy": 2.3712844848632812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37993232905864716, + "step": 920 + }, + { + "epoch": 0.01844, + "grad_norm": 3.5, + "grad_norm_var": 10.657861328125, + "learning_rate": 0.0001, + "loss": 5.5577, + "loss/crossentropy": 2.0161430835723877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3590858578681946, + "step": 922 + }, + { + "epoch": 0.01848, + "grad_norm": 5.40625, + "grad_norm_var": 10.520926920572917, + "learning_rate": 0.0001, + "loss": 5.6675, + "loss/crossentropy": 2.2401121258735657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33380126953125, + "step": 924 + }, + { + "epoch": 0.01852, + "grad_norm": 3.328125, + "grad_norm_var": 10.598356119791667, + "learning_rate": 0.0001, + "loss": 5.9331, + "loss/crossentropy": 2.2354423999786377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34832654893398285, + "step": 926 + }, + { + "epoch": 0.01856, + "grad_norm": 3.46875, + "grad_norm_var": 10.647850545247396, + "learning_rate": 0.0001, + "loss": 5.5212, + "loss/crossentropy": 2.158566176891327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3664778769016266, + "step": 928 + }, + { + "epoch": 0.0186, + "grad_norm": 3.703125, + "grad_norm_var": 2.1230377197265624, + "learning_rate": 0.0001, + "loss": 5.9146, + "loss/crossentropy": 2.270231008529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35586032271385193, + "step": 930 + }, + { + "epoch": 0.01864, + "grad_norm": 3.609375, + "grad_norm_var": 0.3744303385416667, + "learning_rate": 0.0001, + "loss": 6.0013, + "loss/crossentropy": 2.233540892601013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3789799362421036, + "step": 932 + }, + { + "epoch": 0.01868, + "grad_norm": 3.65625, + "grad_norm_var": 0.27898763020833334, + "learning_rate": 0.0001, + "loss": 5.5019, + "loss/crossentropy": 1.9381731152534485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28776855766773224, + "step": 934 + }, + { + "epoch": 0.01872, + "grad_norm": 3.34375, + "grad_norm_var": 0.28227437337239586, + "learning_rate": 0.0001, + "loss": 5.6759, + "loss/crossentropy": 2.4225244522094727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.355392187833786, + "step": 936 + }, + { + "epoch": 0.01876, + "grad_norm": 3.15625, + "grad_norm_var": 0.28084208170572916, + "learning_rate": 0.0001, + "loss": 5.911, + "loss/crossentropy": 2.58090603351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3640855699777603, + "step": 938 + }, + { + "epoch": 0.0188, + "grad_norm": 3.5625, + "grad_norm_var": 0.03673502604166667, + "learning_rate": 0.0001, + "loss": 6.0623, + "loss/crossentropy": 2.436452269554138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797999918460846, + "step": 940 + }, + { + "epoch": 0.01884, + "grad_norm": 3.3125, + "grad_norm_var": 0.03680013020833333, + "learning_rate": 0.0001, + "loss": 5.7428, + "loss/crossentropy": 2.0378769636154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3278265744447708, + "step": 942 + }, + { + "epoch": 0.01888, + "grad_norm": 3.546875, + "grad_norm_var": 0.03860270182291667, + "learning_rate": 0.0001, + "loss": 5.6211, + "loss/crossentropy": 2.1212962865829468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34586282074451447, + "step": 944 + }, + { + "epoch": 0.01892, + "grad_norm": 3.515625, + "grad_norm_var": 0.040087890625, + "learning_rate": 0.0001, + "loss": 5.6695, + "loss/crossentropy": 2.1884353160858154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40608666837215424, + "step": 946 + }, + { + "epoch": 0.01896, + "grad_norm": 3.671875, + "grad_norm_var": 0.046858723958333334, + "learning_rate": 0.0001, + "loss": 5.6684, + "loss/crossentropy": 2.2093260288238525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3519093841314316, + "step": 948 + }, + { + "epoch": 0.019, + "grad_norm": 3.328125, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 6.0842, + "loss/crossentropy": 2.4246588945388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37247334420681, + "step": 950 + }, + { + "epoch": 0.01904, + "grad_norm": 3.671875, + "grad_norm_var": 0.031022135416666666, + "learning_rate": 0.0001, + "loss": 5.6116, + "loss/crossentropy": 1.932490050792694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32985979318618774, + "step": 952 + }, + { + "epoch": 0.01908, + "grad_norm": 3.90625, + "grad_norm_var": 0.7084920247395833, + "learning_rate": 0.0001, + "loss": 5.7393, + "loss/crossentropy": 2.4439035654067993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38164034485816956, + "step": 954 + }, + { + "epoch": 0.01912, + "grad_norm": 3.296875, + "grad_norm_var": 0.72437744140625, + "learning_rate": 0.0001, + "loss": 5.6255, + "loss/crossentropy": 1.8876591920852661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3267661929130554, + "step": 956 + }, + { + "epoch": 0.01916, + "grad_norm": 3.578125, + "grad_norm_var": 0.6990193684895833, + "learning_rate": 0.0001, + "loss": 5.7367, + "loss/crossentropy": 2.284990072250366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34810060262680054, + "step": 958 + }, + { + "epoch": 0.0192, + "grad_norm": 3.53125, + "grad_norm_var": 0.6961008707682291, + "learning_rate": 0.0001, + "loss": 5.888, + "loss/crossentropy": 2.333263397216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.42767176032066345, + "step": 960 + }, + { + "epoch": 0.01924, + "grad_norm": 3.296875, + "grad_norm_var": 0.7323527018229167, + "learning_rate": 0.0001, + "loss": 5.6021, + "loss/crossentropy": 2.2526148557662964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3284706473350525, + "step": 962 + }, + { + "epoch": 0.01928, + "grad_norm": 4.25, + "grad_norm_var": 0.7586252848307292, + "learning_rate": 0.0001, + "loss": 5.5479, + "loss/crossentropy": 2.1782984137535095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35957905650138855, + "step": 964 + }, + { + "epoch": 0.01932, + "grad_norm": 4.28125, + "grad_norm_var": 0.75205078125, + "learning_rate": 0.0001, + "loss": 6.2783, + "loss/crossentropy": 2.292098045349121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35862940549850464, + "step": 966 + }, + { + "epoch": 0.01936, + "grad_norm": 3.546875, + "grad_norm_var": 0.7503214518229167, + "learning_rate": 0.0001, + "loss": 5.9252, + "loss/crossentropy": 2.102781653404236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3527261018753052, + "step": 968 + }, + { + "epoch": 0.0194, + "grad_norm": 3.453125, + "grad_norm_var": 0.11111653645833333, + "learning_rate": 0.0001, + "loss": 5.8891, + "loss/crossentropy": 2.223380208015442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3608042299747467, + "step": 970 + }, + { + "epoch": 0.01944, + "grad_norm": 3.4375, + "grad_norm_var": 0.10501200358072917, + "learning_rate": 0.0001, + "loss": 5.3348, + "loss/crossentropy": 2.0684096813201904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31996411085128784, + "step": 972 + }, + { + "epoch": 0.01948, + "grad_norm": 4.15625, + "grad_norm_var": 0.126708984375, + "learning_rate": 0.0001, + "loss": 5.6642, + "loss/crossentropy": 2.2011090517044067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34404022991657257, + "step": 974 + }, + { + "epoch": 0.01952, + "grad_norm": 3.140625, + "grad_norm_var": 0.14937235514322916, + "learning_rate": 0.0001, + "loss": 5.5033, + "loss/crossentropy": 2.027641534805298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31718479096889496, + "step": 976 + }, + { + "epoch": 0.01956, + "grad_norm": 3.484375, + "grad_norm_var": 0.13909403483072916, + "learning_rate": 0.0001, + "loss": 5.7294, + "loss/crossentropy": 2.311842203140259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3904002010822296, + "step": 978 + }, + { + "epoch": 0.0196, + "grad_norm": 3.859375, + "grad_norm_var": 0.11236572265625, + "learning_rate": 0.0001, + "loss": 5.4402, + "loss/crossentropy": 2.3605271577835083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38789358735084534, + "step": 980 + }, + { + "epoch": 0.01964, + "grad_norm": 3.765625, + "grad_norm_var": 0.095166015625, + "learning_rate": 0.0001, + "loss": 6.1094, + "loss/crossentropy": 2.1687097549438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3534909188747406, + "step": 982 + }, + { + "epoch": 0.01968, + "grad_norm": 3.6875, + "grad_norm_var": 0.0962554931640625, + "learning_rate": 0.0001, + "loss": 5.6656, + "loss/crossentropy": 2.194393038749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34674669802188873, + "step": 984 + }, + { + "epoch": 0.01972, + "grad_norm": 3.359375, + "grad_norm_var": 0.09602762858072916, + "learning_rate": 0.0001, + "loss": 5.5513, + "loss/crossentropy": 2.1355903148651123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3180558532476425, + "step": 986 + }, + { + "epoch": 0.01976, + "grad_norm": 3.390625, + "grad_norm_var": 0.08559468587239584, + "learning_rate": 0.0001, + "loss": 5.9431, + "loss/crossentropy": 2.2688111066818237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.365144744515419, + "step": 988 + }, + { + "epoch": 0.0198, + "grad_norm": 3.65625, + "grad_norm_var": 0.06303609212239583, + "learning_rate": 0.0001, + "loss": 5.5117, + "loss/crossentropy": 2.423216700553894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34281550347805023, + "step": 990 + }, + { + "epoch": 0.01984, + "grad_norm": 3.375, + "grad_norm_var": 0.0419586181640625, + "learning_rate": 0.0001, + "loss": 5.4698, + "loss/crossentropy": 1.9360128045082092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3343205749988556, + "step": 992 + }, + { + "epoch": 0.01988, + "grad_norm": 3.421875, + "grad_norm_var": 0.04468994140625, + "learning_rate": 0.0001, + "loss": 5.8128, + "loss/crossentropy": 2.181576132774353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35567884147167206, + "step": 994 + }, + { + "epoch": 0.01992, + "grad_norm": 3.578125, + "grad_norm_var": 0.037398274739583334, + "learning_rate": 0.0001, + "loss": 5.9295, + "loss/crossentropy": 2.166663408279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34105008840560913, + "step": 996 + }, + { + "epoch": 0.01996, + "grad_norm": 3.203125, + "grad_norm_var": 0.027408854166666666, + "learning_rate": 0.0001, + "loss": 5.5579, + "loss/crossentropy": 2.285332202911377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35812389850616455, + "step": 998 + }, + { + "epoch": 0.02, + "grad_norm": 4.0, + "grad_norm_var": 35.396484375, + "learning_rate": 0.0001, + "loss": 6.3986, + "loss/crossentropy": 2.088365077972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3717931807041168, + "step": 1000 + }, + { + "epoch": 0.02004, + "grad_norm": 4.25, + "grad_norm_var": 35.223714192708336, + "learning_rate": 0.0001, + "loss": 6.1327, + "loss/crossentropy": 2.4051828384399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.420933797955513, + "step": 1002 + }, + { + "epoch": 0.02008, + "grad_norm": 3.265625, + "grad_norm_var": 35.27108968098958, + "learning_rate": 0.0001, + "loss": 5.6789, + "loss/crossentropy": 2.3092572689056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37571050226688385, + "step": 1004 + }, + { + "epoch": 0.02012, + "grad_norm": 3.640625, + "grad_norm_var": 35.29562072753906, + "learning_rate": 0.0001, + "loss": 5.6972, + "loss/crossentropy": 2.147248387336731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29974566400051117, + "step": 1006 + }, + { + "epoch": 0.02016, + "grad_norm": 3.984375, + "grad_norm_var": 35.141299438476565, + "learning_rate": 0.0001, + "loss": 5.988, + "loss/crossentropy": 2.3385868668556213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4254964739084244, + "step": 1008 + }, + { + "epoch": 0.0202, + "grad_norm": 3.625, + "grad_norm_var": 35.15125223795573, + "learning_rate": 0.0001, + "loss": 5.7553, + "loss/crossentropy": 2.142681658267975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3764440715312958, + "step": 1010 + }, + { + "epoch": 0.02024, + "grad_norm": 3.34375, + "grad_norm_var": 35.18043619791667, + "learning_rate": 0.0001, + "loss": 5.5947, + "loss/crossentropy": 2.241790771484375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3385400176048279, + "step": 1012 + }, + { + "epoch": 0.02028, + "grad_norm": 3.859375, + "grad_norm_var": 34.836360677083334, + "learning_rate": 0.0001, + "loss": 6.3228, + "loss/crossentropy": 2.1563867330551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3954617381095886, + "step": 1014 + }, + { + "epoch": 0.02032, + "grad_norm": 3.484375, + "grad_norm_var": 0.09921468098958333, + "learning_rate": 0.0001, + "loss": 5.4613, + "loss/crossentropy": 1.9462800025939941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32294341921806335, + "step": 1016 + }, + { + "epoch": 0.02036, + "grad_norm": 3.53125, + "grad_norm_var": 0.08026936848958334, + "learning_rate": 0.0001, + "loss": 5.4993, + "loss/crossentropy": 1.83676278591156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3142092078924179, + "step": 1018 + }, + { + "epoch": 0.0204, + "grad_norm": 3.84375, + "grad_norm_var": 0.09038798014322917, + "learning_rate": 0.0001, + "loss": 5.8174, + "loss/crossentropy": 1.951962649822235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3889008015394211, + "step": 1020 + }, + { + "epoch": 0.02044, + "grad_norm": 3.578125, + "grad_norm_var": 0.09976806640625, + "learning_rate": 0.0001, + "loss": 5.7173, + "loss/crossentropy": 2.299771785736084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3214885741472244, + "step": 1022 + }, + { + "epoch": 0.02048, + "grad_norm": 3.1875, + "grad_norm_var": 0.09135640462239583, + "learning_rate": 0.0001, + "loss": 5.2291, + "loss/crossentropy": 1.9117569327354431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32827669382095337, + "step": 1024 + }, + { + "epoch": 0.02052, + "grad_norm": 3.1875, + "grad_norm_var": 0.096533203125, + "learning_rate": 0.0001, + "loss": 5.7974, + "loss/crossentropy": 2.484488010406494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34361426532268524, + "step": 1026 + }, + { + "epoch": 0.02056, + "grad_norm": 3.59375, + "grad_norm_var": 0.09973551432291666, + "learning_rate": 0.0001, + "loss": 5.7044, + "loss/crossentropy": 2.3155311346054077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3734404444694519, + "step": 1028 + }, + { + "epoch": 0.0206, + "grad_norm": 3.125, + "grad_norm_var": 0.04983317057291667, + "learning_rate": 0.0001, + "loss": 5.4202, + "loss/crossentropy": 2.081188380718231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3250262886285782, + "step": 1030 + }, + { + "epoch": 0.02064, + "grad_norm": 3.09375, + "grad_norm_var": 0.0574127197265625, + "learning_rate": 0.0001, + "loss": 5.5885, + "loss/crossentropy": 2.044768512248993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34609031677246094, + "step": 1032 + }, + { + "epoch": 0.02068, + "grad_norm": 4.0, + "grad_norm_var": 0.0875640869140625, + "learning_rate": 0.0001, + "loss": 6.078, + "loss/crossentropy": 2.0666560530662537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.365878626704216, + "step": 1034 + }, + { + "epoch": 0.02072, + "grad_norm": 3.5625, + "grad_norm_var": 0.08336588541666666, + "learning_rate": 0.0001, + "loss": 5.9891, + "loss/crossentropy": 2.2933902740478516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3754771202802658, + "step": 1036 + }, + { + "epoch": 0.02076, + "grad_norm": 3.28125, + "grad_norm_var": 0.08640848795572917, + "learning_rate": 0.0001, + "loss": 5.8105, + "loss/crossentropy": 2.28829288482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3676798492670059, + "step": 1038 + }, + { + "epoch": 0.0208, + "grad_norm": 3.5, + "grad_norm_var": 0.08378499348958333, + "learning_rate": 0.0001, + "loss": 5.9305, + "loss/crossentropy": 2.5891193151474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40094244480133057, + "step": 1040 + }, + { + "epoch": 0.02084, + "grad_norm": 3.234375, + "grad_norm_var": 0.08056233723958334, + "learning_rate": 0.0001, + "loss": 5.8579, + "loss/crossentropy": 2.238967180252075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3226759731769562, + "step": 1042 + }, + { + "epoch": 0.02088, + "grad_norm": 3.296875, + "grad_norm_var": 0.07796223958333333, + "learning_rate": 0.0001, + "loss": 5.2959, + "loss/crossentropy": 2.0116711258888245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3511478453874588, + "step": 1044 + }, + { + "epoch": 0.02092, + "grad_norm": 3.515625, + "grad_norm_var": 0.07344462076822916, + "learning_rate": 0.0001, + "loss": 5.4885, + "loss/crossentropy": 2.4924051761627197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3375450670719147, + "step": 1046 + }, + { + "epoch": 0.02096, + "grad_norm": 3.234375, + "grad_norm_var": 0.059235636393229166, + "learning_rate": 0.0001, + "loss": 5.6838, + "loss/crossentropy": 2.138728439807892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34410610795021057, + "step": 1048 + }, + { + "epoch": 0.021, + "grad_norm": 3.515625, + "grad_norm_var": 0.046122233072916664, + "learning_rate": 0.0001, + "loss": 5.7371, + "loss/crossentropy": 2.3748635053634644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3580169975757599, + "step": 1050 + }, + { + "epoch": 0.02104, + "grad_norm": 3.125, + "grad_norm_var": 0.044310506184895834, + "learning_rate": 0.0001, + "loss": 5.427, + "loss/crossentropy": 2.061552882194519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31069953739643097, + "step": 1052 + }, + { + "epoch": 0.02108, + "grad_norm": 3.515625, + "grad_norm_var": 0.03769124348958333, + "learning_rate": 0.0001, + "loss": 5.3469, + "loss/crossentropy": 2.299555718898773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31592319905757904, + "step": 1054 + }, + { + "epoch": 0.02112, + "grad_norm": 3.53125, + "grad_norm_var": 0.04571940104166667, + "learning_rate": 0.0001, + "loss": 6.1254, + "loss/crossentropy": 2.4866377115249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37435297667980194, + "step": 1056 + }, + { + "epoch": 0.02116, + "grad_norm": 3.203125, + "grad_norm_var": 0.047684733072916666, + "learning_rate": 0.0001, + "loss": 5.5676, + "loss/crossentropy": 1.8185054063796997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2784232199192047, + "step": 1058 + }, + { + "epoch": 0.0212, + "grad_norm": 3.3125, + "grad_norm_var": 0.04840087890625, + "learning_rate": 0.0001, + "loss": 5.6184, + "loss/crossentropy": 2.215538501739502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3471361994743347, + "step": 1060 + }, + { + "epoch": 0.02124, + "grad_norm": 3.078125, + "grad_norm_var": 0.057616170247395834, + "learning_rate": 0.0001, + "loss": 5.7615, + "loss/crossentropy": 2.6912894248962402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35911867022514343, + "step": 1062 + }, + { + "epoch": 0.02128, + "grad_norm": 3.125, + "grad_norm_var": 0.06670633951822917, + "learning_rate": 0.0001, + "loss": 5.3436, + "loss/crossentropy": 1.9757090210914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29868973791599274, + "step": 1064 + }, + { + "epoch": 0.02132, + "grad_norm": 3.21875, + "grad_norm_var": 0.052294921875, + "learning_rate": 0.0001, + "loss": 5.5334, + "loss/crossentropy": 2.3396666049957275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3561312407255173, + "step": 1066 + }, + { + "epoch": 0.02136, + "grad_norm": 3.359375, + "grad_norm_var": 0.0477935791015625, + "learning_rate": 0.0001, + "loss": 5.7564, + "loss/crossentropy": 2.3498982191085815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3842166066169739, + "step": 1068 + }, + { + "epoch": 0.0214, + "grad_norm": 3.234375, + "grad_norm_var": 0.03717041015625, + "learning_rate": 0.0001, + "loss": 5.8936, + "loss/crossentropy": 2.037585139274597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3429017663002014, + "step": 1070 + }, + { + "epoch": 0.02144, + "grad_norm": 3.484375, + "grad_norm_var": 0.0216949462890625, + "learning_rate": 0.0001, + "loss": 5.6938, + "loss/crossentropy": 2.4804376363754272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35338981449604034, + "step": 1072 + }, + { + "epoch": 0.02148, + "grad_norm": 3.375, + "grad_norm_var": 0.027448527018229165, + "learning_rate": 0.0001, + "loss": 5.8146, + "loss/crossentropy": 2.5210201740264893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3769296407699585, + "step": 1074 + }, + { + "epoch": 0.02152, + "grad_norm": 3.40625, + "grad_norm_var": 0.03369852701822917, + "learning_rate": 0.0001, + "loss": 5.7854, + "loss/crossentropy": 2.1258187294006348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3353133201599121, + "step": 1076 + }, + { + "epoch": 0.02156, + "grad_norm": 3.328125, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 0.0001, + "loss": 5.5436, + "loss/crossentropy": 2.2107938528060913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34667155146598816, + "step": 1078 + }, + { + "epoch": 0.0216, + "grad_norm": 3.171875, + "grad_norm_var": 0.0194244384765625, + "learning_rate": 0.0001, + "loss": 5.7639, + "loss/crossentropy": 1.9614633321762085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2958581745624542, + "step": 1080 + }, + { + "epoch": 0.02164, + "grad_norm": 3.515625, + "grad_norm_var": 0.0194976806640625, + "learning_rate": 0.0001, + "loss": 5.8219, + "loss/crossentropy": 2.1403249502182007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3220343589782715, + "step": 1082 + }, + { + "epoch": 0.02168, + "grad_norm": 3.28125, + "grad_norm_var": 0.022493489583333335, + "learning_rate": 0.0001, + "loss": 5.6037, + "loss/crossentropy": 1.6533048152923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814648747444153, + "step": 1084 + }, + { + "epoch": 0.02172, + "grad_norm": 3.59375, + "grad_norm_var": 0.0232086181640625, + "learning_rate": 0.0001, + "loss": 5.7731, + "loss/crossentropy": 2.69880211353302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3835880607366562, + "step": 1086 + }, + { + "epoch": 0.02176, + "grad_norm": 3.5, + "grad_norm_var": 0.0240631103515625, + "learning_rate": 0.0001, + "loss": 5.8119, + "loss/crossentropy": 2.214504837989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31686101853847504, + "step": 1088 + }, + { + "epoch": 0.0218, + "grad_norm": 3.765625, + "grad_norm_var": 0.027164713541666666, + "learning_rate": 0.0001, + "loss": 6.0376, + "loss/crossentropy": 2.2377456426620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3682183176279068, + "step": 1090 + }, + { + "epoch": 0.02184, + "grad_norm": 3.359375, + "grad_norm_var": 0.027799479166666665, + "learning_rate": 0.0001, + "loss": 5.7387, + "loss/crossentropy": 2.0977545976638794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32313986122608185, + "step": 1092 + }, + { + "epoch": 0.02188, + "grad_norm": 3.125, + "grad_norm_var": 0.0371490478515625, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.1717870235443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3619600385427475, + "step": 1094 + }, + { + "epoch": 0.02192, + "grad_norm": 3.546875, + "grad_norm_var": 0.03674723307291667, + "learning_rate": 0.0001, + "loss": 5.554, + "loss/crossentropy": 2.2805471420288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3328556418418884, + "step": 1096 + }, + { + "epoch": 0.02196, + "grad_norm": 3.71875, + "grad_norm_var": 0.04625244140625, + "learning_rate": 0.0001, + "loss": 5.8009, + "loss/crossentropy": 2.034846782684326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3610256612300873, + "step": 1098 + }, + { + "epoch": 0.022, + "grad_norm": 3.359375, + "grad_norm_var": 0.04612528483072917, + "learning_rate": 0.0001, + "loss": 5.6121, + "loss/crossentropy": 2.0208348631858826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3311958611011505, + "step": 1100 + }, + { + "epoch": 0.02204, + "grad_norm": 3.59375, + "grad_norm_var": 0.04737040201822917, + "learning_rate": 0.0001, + "loss": 5.7135, + "loss/crossentropy": 2.1020554900169373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3638540059328079, + "step": 1102 + }, + { + "epoch": 0.02208, + "grad_norm": 3.46875, + "grad_norm_var": 0.046019490559895834, + "learning_rate": 0.0001, + "loss": 5.5099, + "loss/crossentropy": 2.28346848487854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3434429168701172, + "step": 1104 + }, + { + "epoch": 0.02212, + "grad_norm": 3.21875, + "grad_norm_var": 0.1152740478515625, + "learning_rate": 0.0001, + "loss": 5.6866, + "loss/crossentropy": 2.103623867034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34000229835510254, + "step": 1106 + }, + { + "epoch": 0.02216, + "grad_norm": 4.1875, + "grad_norm_var": 0.14846089680989583, + "learning_rate": 0.0001, + "loss": 5.5196, + "loss/crossentropy": 2.205894947052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3432965874671936, + "step": 1108 + }, + { + "epoch": 0.0222, + "grad_norm": 3.109375, + "grad_norm_var": 0.16467183430989582, + "learning_rate": 0.0001, + "loss": 5.6166, + "loss/crossentropy": 2.395035982131958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3834904432296753, + "step": 1110 + }, + { + "epoch": 0.02224, + "grad_norm": 3.296875, + "grad_norm_var": 0.16505533854166668, + "learning_rate": 0.0001, + "loss": 5.9042, + "loss/crossentropy": 2.3755353689193726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33256760239601135, + "step": 1112 + }, + { + "epoch": 0.02228, + "grad_norm": 3.28125, + "grad_norm_var": 0.16402079264322916, + "learning_rate": 0.0001, + "loss": 6.0135, + "loss/crossentropy": 2.6754449605941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3524845540523529, + "step": 1114 + }, + { + "epoch": 0.02232, + "grad_norm": 3.515625, + "grad_norm_var": 0.15730692545572916, + "learning_rate": 0.0001, + "loss": 5.6448, + "loss/crossentropy": 2.2398552894592285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.343609020113945, + "step": 1116 + }, + { + "epoch": 0.02236, + "grad_norm": 3.3125, + "grad_norm_var": 0.16770833333333332, + "learning_rate": 0.0001, + "loss": 5.3075, + "loss/crossentropy": 2.399322271347046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34514427185058594, + "step": 1118 + }, + { + "epoch": 0.0224, + "grad_norm": 3.3125, + "grad_norm_var": 0.17229410807291667, + "learning_rate": 0.0001, + "loss": 5.8886, + "loss/crossentropy": 2.4002050161361694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35784730315208435, + "step": 1120 + }, + { + "epoch": 0.02244, + "grad_norm": 3.21875, + "grad_norm_var": 0.10064188639322917, + "learning_rate": 0.0001, + "loss": 5.6239, + "loss/crossentropy": 2.293683171272278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37052902579307556, + "step": 1122 + }, + { + "epoch": 0.02248, + "grad_norm": 3.234375, + "grad_norm_var": 0.0532379150390625, + "learning_rate": 0.0001, + "loss": 5.8244, + "loss/crossentropy": 2.2177391052246094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3482564836740494, + "step": 1124 + }, + { + "epoch": 0.02252, + "grad_norm": 4.1875, + "grad_norm_var": 0.0637603759765625, + "learning_rate": 0.0001, + "loss": 5.8959, + "loss/crossentropy": 2.288211703300476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3401540517807007, + "step": 1126 + }, + { + "epoch": 0.02256, + "grad_norm": 3.1875, + "grad_norm_var": 0.0933258056640625, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.1786144971847534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.331302285194397, + "step": 1128 + }, + { + "epoch": 0.0226, + "grad_norm": 3.5, + "grad_norm_var": 0.09163004557291667, + "learning_rate": 0.0001, + "loss": 5.7604, + "loss/crossentropy": 2.0390175580978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32953669130802155, + "step": 1130 + }, + { + "epoch": 0.02264, + "grad_norm": 3.5, + "grad_norm_var": 0.0921295166015625, + "learning_rate": 0.0001, + "loss": 5.6952, + "loss/crossentropy": 2.188543677330017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3585694134235382, + "step": 1132 + }, + { + "epoch": 0.02268, + "grad_norm": 3.296875, + "grad_norm_var": 0.08284098307291667, + "learning_rate": 0.0001, + "loss": 5.732, + "loss/crossentropy": 2.0731694102287292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3121813088655472, + "step": 1134 + }, + { + "epoch": 0.02272, + "grad_norm": 3.390625, + "grad_norm_var": 0.0794097900390625, + "learning_rate": 0.0001, + "loss": 5.6306, + "loss/crossentropy": 2.144552707672119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3303966820240021, + "step": 1136 + }, + { + "epoch": 0.02276, + "grad_norm": 3.890625, + "grad_norm_var": 0.09900614420572916, + "learning_rate": 0.0001, + "loss": 5.3794, + "loss/crossentropy": 2.1617428064346313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3420899361371994, + "step": 1138 + }, + { + "epoch": 0.0228, + "grad_norm": 3.109375, + "grad_norm_var": 0.12813212076822916, + "learning_rate": 0.0001, + "loss": 5.2834, + "loss/crossentropy": 1.8098865747451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28376901149749756, + "step": 1140 + }, + { + "epoch": 0.02284, + "grad_norm": 2.984375, + "grad_norm_var": 0.09795633951822917, + "learning_rate": 0.0001, + "loss": 5.3911, + "loss/crossentropy": 2.133797824382782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3587050139904022, + "step": 1142 + }, + { + "epoch": 0.02288, + "grad_norm": 4.96875, + "grad_norm_var": 0.24221903483072918, + "learning_rate": 0.0001, + "loss": 5.8787, + "loss/crossentropy": 2.378090739250183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4244185537099838, + "step": 1144 + }, + { + "epoch": 0.02292, + "grad_norm": 3.03125, + "grad_norm_var": 0.2736887613932292, + "learning_rate": 0.0001, + "loss": 5.6692, + "loss/crossentropy": 2.4442414045333862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3569464683532715, + "step": 1146 + }, + { + "epoch": 0.02296, + "grad_norm": 3.671875, + "grad_norm_var": 0.29189453125, + "learning_rate": 0.0001, + "loss": 5.5468, + "loss/crossentropy": 2.6446973085403442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36859095096588135, + "step": 1148 + }, + { + "epoch": 0.023, + "grad_norm": 4.4375, + "grad_norm_var": 0.3506988525390625, + "learning_rate": 0.0001, + "loss": 5.9727, + "loss/crossentropy": 2.4100207090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4082309305667877, + "step": 1150 + }, + { + "epoch": 0.02304, + "grad_norm": 3.25, + "grad_norm_var": 0.3552317301432292, + "learning_rate": 0.0001, + "loss": 5.3537, + "loss/crossentropy": 2.1472485661506653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3299275189638138, + "step": 1152 + }, + { + "epoch": 0.02308, + "grad_norm": 3.21875, + "grad_norm_var": 0.3376373291015625, + "learning_rate": 0.0001, + "loss": 5.4393, + "loss/crossentropy": 2.1891872882843018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.356732040643692, + "step": 1154 + }, + { + "epoch": 0.02312, + "grad_norm": 3.71875, + "grad_norm_var": 0.30201416015625, + "learning_rate": 0.0001, + "loss": 6.0049, + "loss/crossentropy": 2.1432933807373047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.360423281788826, + "step": 1156 + }, + { + "epoch": 0.02316, + "grad_norm": 3.1875, + "grad_norm_var": 0.2923248291015625, + "learning_rate": 0.0001, + "loss": 5.5541, + "loss/crossentropy": 2.176819324493408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.340317040681839, + "step": 1158 + }, + { + "epoch": 0.0232, + "grad_norm": 2.96875, + "grad_norm_var": 0.17407938639322917, + "learning_rate": 0.0001, + "loss": 5.2312, + "loss/crossentropy": 2.325207471847534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3309635668992996, + "step": 1160 + }, + { + "epoch": 0.02324, + "grad_norm": 3.515625, + "grad_norm_var": 0.14537353515625, + "learning_rate": 0.0001, + "loss": 5.7322, + "loss/crossentropy": 2.2536743879318237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31422293186187744, + "step": 1162 + }, + { + "epoch": 0.02328, + "grad_norm": 3.328125, + "grad_norm_var": 0.12657877604166667, + "learning_rate": 0.0001, + "loss": 5.4354, + "loss/crossentropy": 2.180476427078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2956361174583435, + "step": 1164 + }, + { + "epoch": 0.02332, + "grad_norm": 3.15625, + "grad_norm_var": 0.0527008056640625, + "learning_rate": 0.0001, + "loss": 5.6067, + "loss/crossentropy": 1.995088815689087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936599552631378, + "step": 1166 + }, + { + "epoch": 0.02336, + "grad_norm": 3.1875, + "grad_norm_var": 0.05378316243489583, + "learning_rate": 0.0001, + "loss": 5.7304, + "loss/crossentropy": 2.2555994987487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3630402684211731, + "step": 1168 + }, + { + "epoch": 0.0234, + "grad_norm": 3.296875, + "grad_norm_var": 0.05856119791666667, + "learning_rate": 0.0001, + "loss": 5.1613, + "loss/crossentropy": 2.0441415905952454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2960120141506195, + "step": 1170 + }, + { + "epoch": 0.02344, + "grad_norm": 3.25, + "grad_norm_var": 0.0204498291015625, + "learning_rate": 0.0001, + "loss": 5.2238, + "loss/crossentropy": 1.8090497255325317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29750876128673553, + "step": 1172 + }, + { + "epoch": 0.02348, + "grad_norm": 3.234375, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 5.2506, + "loss/crossentropy": 2.2780312299728394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3376633822917938, + "step": 1174 + }, + { + "epoch": 0.02352, + "grad_norm": 3.4375, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 5.53, + "loss/crossentropy": 1.8047232627868652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29132279753685, + "step": 1176 + }, + { + "epoch": 0.02356, + "grad_norm": 3.265625, + "grad_norm_var": 0.020417277018229166, + "learning_rate": 0.0001, + "loss": 5.4149, + "loss/crossentropy": 2.203469753265381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124798536300659, + "step": 1178 + }, + { + "epoch": 0.0236, + "grad_norm": 3.34375, + "grad_norm_var": 0.019775390625, + "learning_rate": 0.0001, + "loss": 5.4481, + "loss/crossentropy": 2.078102231025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.331977054476738, + "step": 1180 + }, + { + "epoch": 0.02364, + "grad_norm": 3.765625, + "grad_norm_var": 0.0368316650390625, + "learning_rate": 0.0001, + "loss": 5.93, + "loss/crossentropy": 2.2701854705810547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3202142268419266, + "step": 1182 + }, + { + "epoch": 0.02368, + "grad_norm": 3.671875, + "grad_norm_var": 0.047240193684895834, + "learning_rate": 0.0001, + "loss": 5.6414, + "loss/crossentropy": 1.930423617362976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3192301094532013, + "step": 1184 + }, + { + "epoch": 0.02372, + "grad_norm": 3.546875, + "grad_norm_var": 0.06542867024739583, + "learning_rate": 0.0001, + "loss": 6.0688, + "loss/crossentropy": 2.291581869125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3759836256504059, + "step": 1186 + }, + { + "epoch": 0.02376, + "grad_norm": 3.375, + "grad_norm_var": 0.06529541015625, + "learning_rate": 0.0001, + "loss": 5.6689, + "loss/crossentropy": 1.9887789487838745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31285202503204346, + "step": 1188 + }, + { + "epoch": 0.0238, + "grad_norm": 3.109375, + "grad_norm_var": 0.054671223958333334, + "learning_rate": 0.0001, + "loss": 5.7457, + "loss/crossentropy": 2.290129065513611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3461494445800781, + "step": 1190 + }, + { + "epoch": 0.02384, + "grad_norm": 3.6875, + "grad_norm_var": 0.0604888916015625, + "learning_rate": 0.0001, + "loss": 6.288, + "loss/crossentropy": 2.3252567052841187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3817393332719803, + "step": 1192 + }, + { + "epoch": 0.02388, + "grad_norm": 3.21875, + "grad_norm_var": 0.06230367024739583, + "learning_rate": 0.0001, + "loss": 5.4498, + "loss/crossentropy": 1.736217737197876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28928153216838837, + "step": 1194 + }, + { + "epoch": 0.02392, + "grad_norm": 3.09375, + "grad_norm_var": 0.06886393229166667, + "learning_rate": 0.0001, + "loss": 5.6234, + "loss/crossentropy": 2.112093210220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3196643739938736, + "step": 1196 + }, + { + "epoch": 0.02396, + "grad_norm": 3.15625, + "grad_norm_var": 0.07258707682291667, + "learning_rate": 0.0001, + "loss": 5.3552, + "loss/crossentropy": 2.3334370851516724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3539857119321823, + "step": 1198 + }, + { + "epoch": 0.024, + "grad_norm": 3.484375, + "grad_norm_var": 0.09695536295572917, + "learning_rate": 0.0001, + "loss": 5.3293, + "loss/crossentropy": 1.7393567562103271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3211805671453476, + "step": 1200 + }, + { + "epoch": 0.02404, + "grad_norm": 3.4375, + "grad_norm_var": 0.07935791015625, + "learning_rate": 0.0001, + "loss": 5.6292, + "loss/crossentropy": 2.314788579940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37599092721939087, + "step": 1202 + }, + { + "epoch": 0.02408, + "grad_norm": 3.1875, + "grad_norm_var": 0.076318359375, + "learning_rate": 0.0001, + "loss": 5.435, + "loss/crossentropy": 2.2820088863372803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33955833315849304, + "step": 1204 + }, + { + "epoch": 0.02412, + "grad_norm": 3.46875, + "grad_norm_var": 0.07183837890625, + "learning_rate": 0.0001, + "loss": 5.87, + "loss/crossentropy": 2.012476146221161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3048281967639923, + "step": 1206 + }, + { + "epoch": 0.02416, + "grad_norm": 3.1875, + "grad_norm_var": 0.07356669108072916, + "learning_rate": 0.0001, + "loss": 5.6751, + "loss/crossentropy": 2.315858483314514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32628118991851807, + "step": 1208 + }, + { + "epoch": 0.0242, + "grad_norm": 3.359375, + "grad_norm_var": 0.14773763020833333, + "learning_rate": 0.0001, + "loss": 5.6813, + "loss/crossentropy": 2.3260581493377686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.352965384721756, + "step": 1210 + }, + { + "epoch": 0.02424, + "grad_norm": 4.0625, + "grad_norm_var": 0.1748687744140625, + "learning_rate": 0.0001, + "loss": 5.689, + "loss/crossentropy": 2.199007749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3089260905981064, + "step": 1212 + }, + { + "epoch": 0.02428, + "grad_norm": 3.703125, + "grad_norm_var": 1.4942047119140625, + "learning_rate": 0.0001, + "loss": 5.798, + "loss/crossentropy": 2.30281138420105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3629491478204727, + "step": 1214 + }, + { + "epoch": 0.02432, + "grad_norm": 3.1875, + "grad_norm_var": 1.5125640869140624, + "learning_rate": 0.0001, + "loss": 5.4927, + "loss/crossentropy": 2.238295316696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3436104357242584, + "step": 1216 + }, + { + "epoch": 0.02436, + "grad_norm": 3.9375, + "grad_norm_var": 1.501488240559896, + "learning_rate": 0.0001, + "loss": 5.5763, + "loss/crossentropy": 2.52456271648407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40382860600948334, + "step": 1218 + }, + { + "epoch": 0.0244, + "grad_norm": 3.46875, + "grad_norm_var": 1.47197265625, + "learning_rate": 0.0001, + "loss": 5.8796, + "loss/crossentropy": 2.4516665935516357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.378818541765213, + "step": 1220 + }, + { + "epoch": 0.02444, + "grad_norm": 3.046875, + "grad_norm_var": 1.485480753580729, + "learning_rate": 0.0001, + "loss": 5.5438, + "loss/crossentropy": 2.593857169151306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3743816912174225, + "step": 1222 + }, + { + "epoch": 0.02448, + "grad_norm": 3.375, + "grad_norm_var": 1.4396799723307292, + "learning_rate": 0.0001, + "loss": 5.2807, + "loss/crossentropy": 1.873874843120575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743247449398041, + "step": 1224 + }, + { + "epoch": 0.02452, + "grad_norm": 3.265625, + "grad_norm_var": 1.4642862955729166, + "learning_rate": 0.0001, + "loss": 5.6455, + "loss/crossentropy": 2.1173152923583984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.350399985909462, + "step": 1226 + }, + { + "epoch": 0.02456, + "grad_norm": 3.5, + "grad_norm_var": 1.4721018473307292, + "learning_rate": 0.0001, + "loss": 5.7318, + "loss/crossentropy": 2.3650271892547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3241463750600815, + "step": 1228 + }, + { + "epoch": 0.0246, + "grad_norm": 3.1875, + "grad_norm_var": 0.08748270670572916, + "learning_rate": 0.0001, + "loss": 5.5229, + "loss/crossentropy": 2.180622935295105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31293927133083344, + "step": 1230 + }, + { + "epoch": 0.02464, + "grad_norm": 3.546875, + "grad_norm_var": 0.08329976399739583, + "learning_rate": 0.0001, + "loss": 5.7443, + "loss/crossentropy": 2.230265259742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3405953049659729, + "step": 1232 + }, + { + "epoch": 0.02468, + "grad_norm": 3.6875, + "grad_norm_var": 0.07164713541666666, + "learning_rate": 0.0001, + "loss": 5.5518, + "loss/crossentropy": 2.050285518169403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32737791538238525, + "step": 1234 + }, + { + "epoch": 0.02472, + "grad_norm": 3.140625, + "grad_norm_var": 0.07604166666666666, + "learning_rate": 0.0001, + "loss": 5.4412, + "loss/crossentropy": 2.2357693910598755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34366659820079803, + "step": 1236 + }, + { + "epoch": 0.02476, + "grad_norm": 3.09375, + "grad_norm_var": 0.09081624348958334, + "learning_rate": 0.0001, + "loss": 5.6066, + "loss/crossentropy": 2.308778762817383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33825138211250305, + "step": 1238 + }, + { + "epoch": 0.0248, + "grad_norm": 3.390625, + "grad_norm_var": 0.0979888916015625, + "learning_rate": 0.0001, + "loss": 5.419, + "loss/crossentropy": 2.016503393650055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30135589838027954, + "step": 1240 + }, + { + "epoch": 0.02484, + "grad_norm": 3.359375, + "grad_norm_var": 0.0870025634765625, + "learning_rate": 0.0001, + "loss": 5.2494, + "loss/crossentropy": 1.8450073599815369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34388674795627594, + "step": 1242 + }, + { + "epoch": 0.02488, + "grad_norm": 3.515625, + "grad_norm_var": 0.05263671875, + "learning_rate": 0.0001, + "loss": 5.7092, + "loss/crossentropy": 2.486730694770813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35796378552913666, + "step": 1244 + }, + { + "epoch": 0.02492, + "grad_norm": 2.859375, + "grad_norm_var": 0.0688385009765625, + "learning_rate": 0.0001, + "loss": 5.2565, + "loss/crossentropy": 2.010675370693207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31364670395851135, + "step": 1246 + }, + { + "epoch": 0.02496, + "grad_norm": 3.15625, + "grad_norm_var": 0.07418619791666667, + "learning_rate": 0.0001, + "loss": 5.366, + "loss/crossentropy": 2.128747880458832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106560483574867, + "step": 1248 + }, + { + "epoch": 0.025, + "grad_norm": 3.359375, + "grad_norm_var": 0.06523335774739583, + "learning_rate": 0.0001, + "loss": 5.8005, + "loss/crossentropy": 2.4563735723495483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33769866824150085, + "step": 1250 + }, + { + "epoch": 0.02504, + "grad_norm": 3.15625, + "grad_norm_var": 0.06453450520833333, + "learning_rate": 0.0001, + "loss": 5.372, + "loss/crossentropy": 2.3785592317581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36167748272418976, + "step": 1252 + }, + { + "epoch": 0.02508, + "grad_norm": 3.21875, + "grad_norm_var": 0.029069010416666666, + "learning_rate": 0.0001, + "loss": 5.5506, + "loss/crossentropy": 2.2642308473587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3315223157405853, + "step": 1254 + }, + { + "epoch": 0.02512, + "grad_norm": 3.46875, + "grad_norm_var": 0.028416951497395832, + "learning_rate": 0.0001, + "loss": 5.4608, + "loss/crossentropy": 2.2246991395950317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33083613216876984, + "step": 1256 + }, + { + "epoch": 0.02516, + "grad_norm": 3.1875, + "grad_norm_var": 0.0290435791015625, + "learning_rate": 0.0001, + "loss": 5.762, + "loss/crossentropy": 2.129785656929016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32672610878944397, + "step": 1258 + }, + { + "epoch": 0.0252, + "grad_norm": 3.125, + "grad_norm_var": 0.0243804931640625, + "learning_rate": 0.0001, + "loss": 5.7297, + "loss/crossentropy": 2.0835453271865845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3504233658313751, + "step": 1260 + }, + { + "epoch": 0.02524, + "grad_norm": 3.703125, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 5.6707, + "loss/crossentropy": 2.443893313407898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3636191487312317, + "step": 1262 + }, + { + "epoch": 0.02528, + "grad_norm": 3.21875, + "grad_norm_var": 0.02197265625, + "learning_rate": 0.0001, + "loss": 5.6107, + "loss/crossentropy": 2.367433190345764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35734108090400696, + "step": 1264 + }, + { + "epoch": 0.02532, + "grad_norm": 3.1875, + "grad_norm_var": 0.02662353515625, + "learning_rate": 0.0001, + "loss": 5.049, + "loss/crossentropy": 1.8102024793624878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28676700592041016, + "step": 1266 + }, + { + "epoch": 0.02536, + "grad_norm": 3.0, + "grad_norm_var": 0.03738606770833333, + "learning_rate": 0.0001, + "loss": 5.3916, + "loss/crossentropy": 2.54524827003479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3380637466907501, + "step": 1268 + }, + { + "epoch": 0.0254, + "grad_norm": 2.984375, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 5.6667, + "loss/crossentropy": 2.4164276123046875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36949749290943146, + "step": 1270 + }, + { + "epoch": 0.02544, + "grad_norm": 3.515625, + "grad_norm_var": 0.043745930989583334, + "learning_rate": 0.0001, + "loss": 5.4235, + "loss/crossentropy": 2.4335602521896362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3608807325363159, + "step": 1272 + }, + { + "epoch": 0.02548, + "grad_norm": 3.1875, + "grad_norm_var": 0.0422760009765625, + "learning_rate": 0.0001, + "loss": 5.5858, + "loss/crossentropy": 2.2711308002471924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3631722033023834, + "step": 1274 + }, + { + "epoch": 0.02552, + "grad_norm": 3.140625, + "grad_norm_var": 0.0430816650390625, + "learning_rate": 0.0001, + "loss": 5.181, + "loss/crossentropy": 2.378043293952942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35628968477249146, + "step": 1276 + }, + { + "epoch": 0.02556, + "grad_norm": 3.1875, + "grad_norm_var": 0.02431640625, + "learning_rate": 0.0001, + "loss": 5.5721, + "loss/crossentropy": 1.8950039148330688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3105602264404297, + "step": 1278 + }, + { + "epoch": 0.0256, + "grad_norm": 2.96875, + "grad_norm_var": 0.026659138997395835, + "learning_rate": 0.0001, + "loss": 5.4649, + "loss/crossentropy": 1.8309656977653503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27458132803440094, + "step": 1280 + }, + { + "epoch": 0.02564, + "grad_norm": 3.40625, + "grad_norm_var": 0.034195963541666666, + "learning_rate": 0.0001, + "loss": 5.993, + "loss/crossentropy": 2.3949296474456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37693680822849274, + "step": 1282 + }, + { + "epoch": 0.02568, + "grad_norm": 3.140625, + "grad_norm_var": 0.026725260416666667, + "learning_rate": 0.0001, + "loss": 5.6157, + "loss/crossentropy": 2.497879147529602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36076007783412933, + "step": 1284 + }, + { + "epoch": 0.02572, + "grad_norm": 2.953125, + "grad_norm_var": 0.027339680989583334, + "learning_rate": 0.0001, + "loss": 5.354, + "loss/crossentropy": 2.108432352542877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3115152269601822, + "step": 1286 + }, + { + "epoch": 0.02576, + "grad_norm": 3.171875, + "grad_norm_var": 0.0201568603515625, + "learning_rate": 0.0001, + "loss": 5.5424, + "loss/crossentropy": 2.079313635826111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31575673818588257, + "step": 1288 + }, + { + "epoch": 0.0258, + "grad_norm": 3.15625, + "grad_norm_var": 0.020099894205729166, + "learning_rate": 0.0001, + "loss": 5.2594, + "loss/crossentropy": 2.3390332460403442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3293873071670532, + "step": 1290 + }, + { + "epoch": 0.02584, + "grad_norm": 3.15625, + "grad_norm_var": 0.019612630208333332, + "learning_rate": 0.0001, + "loss": 5.2895, + "loss/crossentropy": 2.180980920791626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30600421130657196, + "step": 1292 + }, + { + "epoch": 0.02588, + "grad_norm": 2.9375, + "grad_norm_var": 0.022858683268229166, + "learning_rate": 0.0001, + "loss": 5.2784, + "loss/crossentropy": 2.0647836327552795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3114248663187027, + "step": 1294 + }, + { + "epoch": 0.02592, + "grad_norm": 2.84375, + "grad_norm_var": 0.02958984375, + "learning_rate": 0.0001, + "loss": 5.5063, + "loss/crossentropy": 1.931971788406372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32757866382598877, + "step": 1296 + }, + { + "epoch": 0.02596, + "grad_norm": 3.09375, + "grad_norm_var": 0.0202301025390625, + "learning_rate": 0.0001, + "loss": 5.6389, + "loss/crossentropy": 2.1180718541145325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3272206783294678, + "step": 1298 + }, + { + "epoch": 0.026, + "grad_norm": 3.4375, + "grad_norm_var": 0.024616495768229166, + "learning_rate": 0.0001, + "loss": 5.5069, + "loss/crossentropy": 1.8535473346710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3100028932094574, + "step": 1300 + }, + { + "epoch": 0.02604, + "grad_norm": 4.5, + "grad_norm_var": 0.16031901041666666, + "learning_rate": 0.0001, + "loss": 5.574, + "loss/crossentropy": 1.9625197052955627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3074956685304642, + "step": 1302 + }, + { + "epoch": 0.02608, + "grad_norm": 3.25, + "grad_norm_var": 0.1673736572265625, + "learning_rate": 0.0001, + "loss": 5.3764, + "loss/crossentropy": 2.248521149158478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33484284579753876, + "step": 1304 + }, + { + "epoch": 0.02612, + "grad_norm": 2.921875, + "grad_norm_var": 0.1750640869140625, + "learning_rate": 0.0001, + "loss": 5.8318, + "loss/crossentropy": 2.6374051570892334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3604440838098526, + "step": 1306 + }, + { + "epoch": 0.02616, + "grad_norm": 4.0625, + "grad_norm_var": 0.22333984375, + "learning_rate": 0.0001, + "loss": 5.5151, + "loss/crossentropy": 2.3213003873825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3659953773021698, + "step": 1308 + }, + { + "epoch": 0.0262, + "grad_norm": 3.734375, + "grad_norm_var": 0.21988525390625, + "learning_rate": 0.0001, + "loss": 5.9334, + "loss/crossentropy": 2.3527311086654663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3638792932033539, + "step": 1310 + }, + { + "epoch": 0.02624, + "grad_norm": 3.140625, + "grad_norm_var": 0.20551656087239584, + "learning_rate": 0.0001, + "loss": 5.4442, + "loss/crossentropy": 1.6319801807403564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29070258140563965, + "step": 1312 + }, + { + "epoch": 0.02628, + "grad_norm": 3.125, + "grad_norm_var": 0.215966796875, + "learning_rate": 0.0001, + "loss": 5.1864, + "loss/crossentropy": 1.986265480518341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30515219271183014, + "step": 1314 + }, + { + "epoch": 0.02632, + "grad_norm": 2.828125, + "grad_norm_var": 0.23336181640625, + "learning_rate": 0.0001, + "loss": 5.5808, + "loss/crossentropy": 2.237283766269684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32876546680927277, + "step": 1316 + }, + { + "epoch": 0.02636, + "grad_norm": 3.859375, + "grad_norm_var": 0.13362223307291668, + "learning_rate": 0.0001, + "loss": 5.4868, + "loss/crossentropy": 2.215467691421509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34449851512908936, + "step": 1318 + }, + { + "epoch": 0.0264, + "grad_norm": 3.03125, + "grad_norm_var": 0.12567952473958333, + "learning_rate": 0.0001, + "loss": 5.7427, + "loss/crossentropy": 2.264952063560486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.387825608253479, + "step": 1320 + }, + { + "epoch": 0.02644, + "grad_norm": 3.125, + "grad_norm_var": 0.12009989420572917, + "learning_rate": 0.0001, + "loss": 5.5614, + "loss/crossentropy": 2.0678945779800415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3228907287120819, + "step": 1322 + }, + { + "epoch": 0.02648, + "grad_norm": 4.40625, + "grad_norm_var": 0.212841796875, + "learning_rate": 0.0001, + "loss": 5.6259, + "loss/crossentropy": 2.1414765119552612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31144315004348755, + "step": 1324 + }, + { + "epoch": 0.02652, + "grad_norm": 4.03125, + "grad_norm_var": 0.23813374837239584, + "learning_rate": 0.0001, + "loss": 5.8487, + "loss/crossentropy": 2.3898890018463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3721010088920593, + "step": 1326 + }, + { + "epoch": 0.02656, + "grad_norm": 3.421875, + "grad_norm_var": 0.23483784993489584, + "learning_rate": 0.0001, + "loss": 5.5417, + "loss/crossentropy": 2.404030442237854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36486634612083435, + "step": 1328 + }, + { + "epoch": 0.0266, + "grad_norm": 3.15625, + "grad_norm_var": 0.22004801432291668, + "learning_rate": 0.0001, + "loss": 5.7535, + "loss/crossentropy": 2.1436617970466614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091920465230942, + "step": 1330 + }, + { + "epoch": 0.02664, + "grad_norm": 3.1875, + "grad_norm_var": 0.2074615478515625, + "learning_rate": 0.0001, + "loss": 5.2832, + "loss/crossentropy": 2.106055796146393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3344803601503372, + "step": 1332 + }, + { + "epoch": 0.02668, + "grad_norm": 3.640625, + "grad_norm_var": 0.19583333333333333, + "learning_rate": 0.0001, + "loss": 6.0091, + "loss/crossentropy": 2.465435266494751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34129244089126587, + "step": 1334 + }, + { + "epoch": 0.02672, + "grad_norm": 3.21875, + "grad_norm_var": 0.18728841145833333, + "learning_rate": 0.0001, + "loss": 5.5569, + "loss/crossentropy": 1.9109253883361816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30202071368694305, + "step": 1336 + }, + { + "epoch": 0.02676, + "grad_norm": 3.0, + "grad_norm_var": 0.1890045166015625, + "learning_rate": 0.0001, + "loss": 5.7093, + "loss/crossentropy": 2.267784833908081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32984335720539093, + "step": 1338 + }, + { + "epoch": 0.0268, + "grad_norm": 3.0625, + "grad_norm_var": 0.07266337076822917, + "learning_rate": 0.0001, + "loss": 5.4432, + "loss/crossentropy": 2.6131194829940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3797858655452728, + "step": 1340 + }, + { + "epoch": 0.02684, + "grad_norm": 3.171875, + "grad_norm_var": 0.031891886393229166, + "learning_rate": 0.0001, + "loss": 5.7456, + "loss/crossentropy": 2.214663505554199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.340796560049057, + "step": 1342 + }, + { + "epoch": 0.02688, + "grad_norm": 3.0, + "grad_norm_var": 0.03245340983072917, + "learning_rate": 0.0001, + "loss": 5.4205, + "loss/crossentropy": 2.0236783027648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3101559728384018, + "step": 1344 + }, + { + "epoch": 0.02692, + "grad_norm": 3.078125, + "grad_norm_var": 0.030565388997395835, + "learning_rate": 0.0001, + "loss": 5.2671, + "loss/crossentropy": 2.3260135650634766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.339004784822464, + "step": 1346 + }, + { + "epoch": 0.02696, + "grad_norm": 2.96875, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 5.6529, + "loss/crossentropy": 2.177807927131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3171197772026062, + "step": 1348 + }, + { + "epoch": 0.027, + "grad_norm": 3.375, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 5.4682, + "loss/crossentropy": 2.351730227470398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34749266505241394, + "step": 1350 + }, + { + "epoch": 0.02704, + "grad_norm": 2.953125, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 5.3692, + "loss/crossentropy": 2.2959564924240112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3270048499107361, + "step": 1352 + }, + { + "epoch": 0.02708, + "grad_norm": 3.03125, + "grad_norm_var": 0.016890462239583334, + "learning_rate": 0.0001, + "loss": 5.5243, + "loss/crossentropy": 2.399070382118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32505376636981964, + "step": 1354 + }, + { + "epoch": 0.02712, + "grad_norm": 3.046875, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 5.3306, + "loss/crossentropy": 1.9084061980247498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3073619455099106, + "step": 1356 + }, + { + "epoch": 0.02716, + "grad_norm": 3.078125, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 5.3942, + "loss/crossentropy": 2.1204254627227783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2885961979627609, + "step": 1358 + }, + { + "epoch": 0.0272, + "grad_norm": 3.28125, + "grad_norm_var": 0.014842732747395834, + "learning_rate": 0.0001, + "loss": 5.7313, + "loss/crossentropy": 2.0167239904403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3026747703552246, + "step": 1360 + }, + { + "epoch": 0.02724, + "grad_norm": 3.171875, + "grad_norm_var": 0.014742024739583333, + "learning_rate": 0.0001, + "loss": 5.3987, + "loss/crossentropy": 1.9588357210159302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976878881454468, + "step": 1362 + }, + { + "epoch": 0.02728, + "grad_norm": 2.796875, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 5.1658, + "loss/crossentropy": 1.8169561624526978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2874959260225296, + "step": 1364 + }, + { + "epoch": 0.02732, + "grad_norm": 3.1875, + "grad_norm_var": 0.014354451497395834, + "learning_rate": 0.0001, + "loss": 5.5128, + "loss/crossentropy": 2.258527398109436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34180621802806854, + "step": 1366 + }, + { + "epoch": 0.02736, + "grad_norm": 3.09375, + "grad_norm_var": 0.017284138997395834, + "learning_rate": 0.0001, + "loss": 5.4374, + "loss/crossentropy": 2.1959571838378906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124794214963913, + "step": 1368 + }, + { + "epoch": 0.0274, + "grad_norm": 2.875, + "grad_norm_var": 0.0198150634765625, + "learning_rate": 0.0001, + "loss": 5.2299, + "loss/crossentropy": 2.1830934286117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3260872811079025, + "step": 1370 + }, + { + "epoch": 0.02744, + "grad_norm": 3.203125, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 5.6831, + "loss/crossentropy": 2.411653518676758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3575899302959442, + "step": 1372 + }, + { + "epoch": 0.02748, + "grad_norm": 3.140625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 5.3919, + "loss/crossentropy": 2.1585222482681274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30734311044216156, + "step": 1374 + }, + { + "epoch": 0.02752, + "grad_norm": 2.90625, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 5.37, + "loss/crossentropy": 2.3621217012405396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3373589664697647, + "step": 1376 + }, + { + "epoch": 0.02756, + "grad_norm": 3.109375, + "grad_norm_var": 0.023160807291666665, + "learning_rate": 0.0001, + "loss": 5.4712, + "loss/crossentropy": 2.1203317046165466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28425413370132446, + "step": 1378 + }, + { + "epoch": 0.0276, + "grad_norm": 3.296875, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 5.5438, + "loss/crossentropy": 2.2568705081939697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.296497106552124, + "step": 1380 + }, + { + "epoch": 0.02764, + "grad_norm": 3.5625, + "grad_norm_var": 0.03665364583333333, + "learning_rate": 0.0001, + "loss": 5.5613, + "loss/crossentropy": 2.260026216506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34534430503845215, + "step": 1382 + }, + { + "epoch": 0.02768, + "grad_norm": 5.5625, + "grad_norm_var": 0.4100494384765625, + "learning_rate": 0.0001, + "loss": 5.6217, + "loss/crossentropy": 1.9400787949562073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32853779196739197, + "step": 1384 + }, + { + "epoch": 0.02772, + "grad_norm": 3.390625, + "grad_norm_var": 0.39205322265625, + "learning_rate": 0.0001, + "loss": 5.2009, + "loss/crossentropy": 2.168904423713684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3421178460121155, + "step": 1386 + }, + { + "epoch": 0.02776, + "grad_norm": 2.90625, + "grad_norm_var": 0.405419921875, + "learning_rate": 0.0001, + "loss": 5.3992, + "loss/crossentropy": 2.3474777936935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31982940435409546, + "step": 1388 + }, + { + "epoch": 0.0278, + "grad_norm": 3.140625, + "grad_norm_var": 0.40615234375, + "learning_rate": 0.0001, + "loss": 5.5791, + "loss/crossentropy": 2.3416868448257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3400905281305313, + "step": 1390 + }, + { + "epoch": 0.02784, + "grad_norm": 2.8125, + "grad_norm_var": 0.4003570556640625, + "learning_rate": 0.0001, + "loss": 5.4413, + "loss/crossentropy": 2.299672842025757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143058717250824, + "step": 1392 + }, + { + "epoch": 0.02788, + "grad_norm": 3.15625, + "grad_norm_var": 0.3945058186848958, + "learning_rate": 0.0001, + "loss": 5.6231, + "loss/crossentropy": 2.3058812618255615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33790935575962067, + "step": 1394 + }, + { + "epoch": 0.02792, + "grad_norm": 3.03125, + "grad_norm_var": 0.3947987874348958, + "learning_rate": 0.0001, + "loss": 5.324, + "loss/crossentropy": 2.2137999534606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33156970143318176, + "step": 1396 + }, + { + "epoch": 0.02796, + "grad_norm": 4.1875, + "grad_norm_var": 0.4427571614583333, + "learning_rate": 0.0001, + "loss": 5.4567, + "loss/crossentropy": 2.04184353351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32263143360614777, + "step": 1398 + }, + { + "epoch": 0.028, + "grad_norm": 3.265625, + "grad_norm_var": 0.09589436848958334, + "learning_rate": 0.0001, + "loss": 5.339, + "loss/crossentropy": 2.0377472639083862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3136487454175949, + "step": 1400 + }, + { + "epoch": 0.02804, + "grad_norm": 3.171875, + "grad_norm_var": 0.09228413899739583, + "learning_rate": 0.0001, + "loss": 5.5838, + "loss/crossentropy": 2.5366055965423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3476262539625168, + "step": 1402 + }, + { + "epoch": 0.02808, + "grad_norm": 3.1875, + "grad_norm_var": 0.09063212076822917, + "learning_rate": 0.0001, + "loss": 5.2447, + "loss/crossentropy": 2.088012456893921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31111815571784973, + "step": 1404 + }, + { + "epoch": 0.02812, + "grad_norm": 3.53125, + "grad_norm_var": 0.10530598958333333, + "learning_rate": 0.0001, + "loss": 5.7316, + "loss/crossentropy": 2.1750329732894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3138856291770935, + "step": 1406 + }, + { + "epoch": 0.02816, + "grad_norm": 3.625, + "grad_norm_var": 0.09811909993489583, + "learning_rate": 0.0001, + "loss": 5.4708, + "loss/crossentropy": 1.977031648159027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3179774433374405, + "step": 1408 + }, + { + "epoch": 0.0282, + "grad_norm": 3.296875, + "grad_norm_var": 0.12561848958333333, + "learning_rate": 0.0001, + "loss": 5.3815, + "loss/crossentropy": 2.0594210028648376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32200056314468384, + "step": 1410 + }, + { + "epoch": 0.02824, + "grad_norm": 3.59375, + "grad_norm_var": 0.11237691243489584, + "learning_rate": 0.0001, + "loss": 5.443, + "loss/crossentropy": 2.3887473344802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3303599953651428, + "step": 1412 + }, + { + "epoch": 0.02828, + "grad_norm": 3.453125, + "grad_norm_var": 0.06500244140625, + "learning_rate": 0.0001, + "loss": 5.5507, + "loss/crossentropy": 2.188898801803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3378629684448242, + "step": 1414 + }, + { + "epoch": 0.02832, + "grad_norm": 3.171875, + "grad_norm_var": 0.06982014973958334, + "learning_rate": 0.0001, + "loss": 5.458, + "loss/crossentropy": 1.981561303138733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814937084913254, + "step": 1416 + }, + { + "epoch": 0.02836, + "grad_norm": 3.1875, + "grad_norm_var": 0.0756011962890625, + "learning_rate": 0.0001, + "loss": 5.271, + "loss/crossentropy": 2.2141716480255127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3429889380931854, + "step": 1418 + }, + { + "epoch": 0.0284, + "grad_norm": 3.1875, + "grad_norm_var": 0.0818267822265625, + "learning_rate": 0.0001, + "loss": 5.048, + "loss/crossentropy": 2.0720977783203125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30601558089256287, + "step": 1420 + }, + { + "epoch": 0.02844, + "grad_norm": 3.21875, + "grad_norm_var": 0.07434488932291666, + "learning_rate": 0.0001, + "loss": 5.6407, + "loss/crossentropy": 2.0516344904899597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3182393014431, + "step": 1422 + }, + { + "epoch": 0.02848, + "grad_norm": 3.28125, + "grad_norm_var": 0.070166015625, + "learning_rate": 0.0001, + "loss": 5.6532, + "loss/crossentropy": 2.161284327507019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3383013904094696, + "step": 1424 + }, + { + "epoch": 0.02852, + "grad_norm": 3.1875, + "grad_norm_var": 0.03163960774739583, + "learning_rate": 0.0001, + "loss": 5.1594, + "loss/crossentropy": 1.9955796599388123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3169983923435211, + "step": 1426 + }, + { + "epoch": 0.02856, + "grad_norm": 2.984375, + "grad_norm_var": 0.023421223958333334, + "learning_rate": 0.0001, + "loss": 5.3951, + "loss/crossentropy": 2.1046979427337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30594320595264435, + "step": 1428 + }, + { + "epoch": 0.0286, + "grad_norm": 2.90625, + "grad_norm_var": 0.020231119791666665, + "learning_rate": 0.0001, + "loss": 5.364, + "loss/crossentropy": 1.9611601829528809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3039780706167221, + "step": 1430 + }, + { + "epoch": 0.02864, + "grad_norm": 2.9375, + "grad_norm_var": 0.021610514322916666, + "learning_rate": 0.0001, + "loss": 5.8823, + "loss/crossentropy": 2.256209373474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3315662145614624, + "step": 1432 + }, + { + "epoch": 0.02868, + "grad_norm": 2.9375, + "grad_norm_var": 0.030594889322916666, + "learning_rate": 0.0001, + "loss": 5.2821, + "loss/crossentropy": 2.150561034679413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33046241104602814, + "step": 1434 + }, + { + "epoch": 0.02872, + "grad_norm": 3.078125, + "grad_norm_var": 0.024898274739583334, + "learning_rate": 0.0001, + "loss": 5.3609, + "loss/crossentropy": 2.279554605484009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3621693551540375, + "step": 1436 + }, + { + "epoch": 0.02876, + "grad_norm": 3.015625, + "grad_norm_var": 0.0259918212890625, + "learning_rate": 0.0001, + "loss": 5.4323, + "loss/crossentropy": 2.0183790922164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33361808955669403, + "step": 1438 + }, + { + "epoch": 0.0288, + "grad_norm": 4.15625, + "grad_norm_var": 0.7352203369140625, + "learning_rate": 0.0001, + "loss": 5.4781, + "loss/crossentropy": 1.964252531528473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31638333201408386, + "step": 1440 + }, + { + "epoch": 0.02884, + "grad_norm": 3.046875, + "grad_norm_var": 0.73902587890625, + "learning_rate": 0.0001, + "loss": 5.6907, + "loss/crossentropy": 1.9271634817123413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3408525586128235, + "step": 1442 + }, + { + "epoch": 0.02888, + "grad_norm": 2.953125, + "grad_norm_var": 0.7411417643229167, + "learning_rate": 0.0001, + "loss": 5.4869, + "loss/crossentropy": 2.4400887489318848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33455249667167664, + "step": 1444 + }, + { + "epoch": 0.02892, + "grad_norm": 3.078125, + "grad_norm_var": 0.7270904541015625, + "learning_rate": 0.0001, + "loss": 5.6179, + "loss/crossentropy": 2.465711832046509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3528379648923874, + "step": 1446 + }, + { + "epoch": 0.02896, + "grad_norm": 3.328125, + "grad_norm_var": 0.7147288004557292, + "learning_rate": 0.0001, + "loss": 5.5125, + "loss/crossentropy": 2.269619941711426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32898105680942535, + "step": 1448 + }, + { + "epoch": 0.029, + "grad_norm": 2.984375, + "grad_norm_var": 0.68287353515625, + "learning_rate": 0.0001, + "loss": 5.5775, + "loss/crossentropy": 2.0418076515197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3170912265777588, + "step": 1450 + }, + { + "epoch": 0.02904, + "grad_norm": 3.15625, + "grad_norm_var": 0.6841756184895833, + "learning_rate": 0.0001, + "loss": 5.5964, + "loss/crossentropy": 2.291188359260559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35694004595279694, + "step": 1452 + }, + { + "epoch": 0.02908, + "grad_norm": 2.75, + "grad_norm_var": 0.7253163655598959, + "learning_rate": 0.0001, + "loss": 5.1608, + "loss/crossentropy": 2.497802972793579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3271156847476959, + "step": 1454 + }, + { + "epoch": 0.02912, + "grad_norm": 2.984375, + "grad_norm_var": 0.0334625244140625, + "learning_rate": 0.0001, + "loss": 5.1527, + "loss/crossentropy": 2.161794900894165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32509492337703705, + "step": 1456 + }, + { + "epoch": 0.02916, + "grad_norm": 3.3125, + "grad_norm_var": 0.03470052083333333, + "learning_rate": 0.0001, + "loss": 5.7939, + "loss/crossentropy": 2.632015347480774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36461199820041656, + "step": 1458 + }, + { + "epoch": 0.0292, + "grad_norm": 3.03125, + "grad_norm_var": 0.03372395833333333, + "learning_rate": 0.0001, + "loss": 5.6233, + "loss/crossentropy": 2.2478749752044678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3311367332935333, + "step": 1460 + }, + { + "epoch": 0.02924, + "grad_norm": 4.03125, + "grad_norm_var": 0.09263916015625, + "learning_rate": 0.0001, + "loss": 5.2528, + "loss/crossentropy": 1.8451723456382751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31802159547805786, + "step": 1462 + }, + { + "epoch": 0.02928, + "grad_norm": 3.71875, + "grad_norm_var": 0.11772359212239583, + "learning_rate": 0.0001, + "loss": 5.6469, + "loss/crossentropy": 2.6684207916259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35483625531196594, + "step": 1464 + }, + { + "epoch": 0.02932, + "grad_norm": 2.921875, + "grad_norm_var": 0.11543680826822916, + "learning_rate": 0.0001, + "loss": 5.2866, + "loss/crossentropy": 2.497614622116089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3422156721353531, + "step": 1466 + }, + { + "epoch": 0.02936, + "grad_norm": 4.25, + "grad_norm_var": 0.19032796223958334, + "learning_rate": 0.0001, + "loss": 5.7424, + "loss/crossentropy": 2.750740170478821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.377558171749115, + "step": 1468 + }, + { + "epoch": 0.0294, + "grad_norm": 2.8125, + "grad_norm_var": 0.17550455729166667, + "learning_rate": 0.0001, + "loss": 5.1741, + "loss/crossentropy": 1.9961607456207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3166217654943466, + "step": 1470 + }, + { + "epoch": 0.02944, + "grad_norm": 3.359375, + "grad_norm_var": 0.1655914306640625, + "learning_rate": 0.0001, + "loss": 5.3114, + "loss/crossentropy": 2.0874768495559692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2912164479494095, + "step": 1472 + }, + { + "epoch": 0.02948, + "grad_norm": 2.953125, + "grad_norm_var": 0.174072265625, + "learning_rate": 0.0001, + "loss": 5.2102, + "loss/crossentropy": 2.112182080745697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29954925179481506, + "step": 1474 + }, + { + "epoch": 0.02952, + "grad_norm": 3.6875, + "grad_norm_var": 0.4074371337890625, + "learning_rate": 0.0001, + "loss": 5.7839, + "loss/crossentropy": 2.1319644451141357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3002544492483139, + "step": 1476 + }, + { + "epoch": 0.02956, + "grad_norm": 3.0625, + "grad_norm_var": 0.37743733723958334, + "learning_rate": 0.0001, + "loss": 5.8305, + "loss/crossentropy": 2.3029643297195435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3262677788734436, + "step": 1478 + }, + { + "epoch": 0.0296, + "grad_norm": 3.15625, + "grad_norm_var": 0.3826243082682292, + "learning_rate": 0.0001, + "loss": 5.6803, + "loss/crossentropy": 2.8598941564559937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.38816460967063904, + "step": 1480 + }, + { + "epoch": 0.02964, + "grad_norm": 2.984375, + "grad_norm_var": 0.37844950358072915, + "learning_rate": 0.0001, + "loss": 5.2177, + "loss/crossentropy": 2.134063720703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3128468096256256, + "step": 1482 + }, + { + "epoch": 0.02968, + "grad_norm": 3.125, + "grad_norm_var": 0.31961263020833336, + "learning_rate": 0.0001, + "loss": 5.5234, + "loss/crossentropy": 2.481287717819214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3209179639816284, + "step": 1484 + }, + { + "epoch": 0.02972, + "grad_norm": 3.3125, + "grad_norm_var": 0.30549723307291665, + "learning_rate": 0.0001, + "loss": 5.3571, + "loss/crossentropy": 2.0571895837783813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3121480941772461, + "step": 1486 + }, + { + "epoch": 0.02976, + "grad_norm": 3.203125, + "grad_norm_var": 0.30614827473958334, + "learning_rate": 0.0001, + "loss": 5.2725, + "loss/crossentropy": 2.1930073499679565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3031492233276367, + "step": 1488 + }, + { + "epoch": 0.0298, + "grad_norm": 3.046875, + "grad_norm_var": 0.30426025390625, + "learning_rate": 0.0001, + "loss": 5.3256, + "loss/crossentropy": 2.403902530670166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32206277549266815, + "step": 1490 + }, + { + "epoch": 0.02984, + "grad_norm": 2.9375, + "grad_norm_var": 0.0538482666015625, + "learning_rate": 0.0001, + "loss": 5.2859, + "loss/crossentropy": 2.131115198135376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31651052832603455, + "step": 1492 + }, + { + "epoch": 0.02988, + "grad_norm": 2.828125, + "grad_norm_var": 0.05826416015625, + "learning_rate": 0.0001, + "loss": 5.1232, + "loss/crossentropy": 2.024750769138336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3142934888601303, + "step": 1494 + }, + { + "epoch": 0.02992, + "grad_norm": 2.984375, + "grad_norm_var": 0.06279195149739583, + "learning_rate": 0.0001, + "loss": 5.3804, + "loss/crossentropy": 1.9606707692146301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28610387444496155, + "step": 1496 + }, + { + "epoch": 0.02996, + "grad_norm": 3.15625, + "grad_norm_var": 0.062474568684895836, + "learning_rate": 0.0001, + "loss": 5.5731, + "loss/crossentropy": 2.31516432762146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32875190675258636, + "step": 1498 + }, + { + "epoch": 0.03, + "grad_norm": 3.359375, + "grad_norm_var": 0.0711822509765625, + "learning_rate": 0.0001, + "loss": 5.7626, + "loss/crossentropy": 2.295942783355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3379499167203903, + "step": 1500 + }, + { + "epoch": 0.03004, + "grad_norm": 3.21875, + "grad_norm_var": 0.06961263020833333, + "learning_rate": 0.0001, + "loss": 5.5997, + "loss/crossentropy": 2.1859577894210815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32143887877464294, + "step": 1502 + }, + { + "epoch": 0.03008, + "grad_norm": 3.28125, + "grad_norm_var": 0.07023824055989583, + "learning_rate": 0.0001, + "loss": 5.4159, + "loss/crossentropy": 2.1852502822875977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3582807630300522, + "step": 1504 + }, + { + "epoch": 0.03012, + "grad_norm": 2.75, + "grad_norm_var": 0.07810770670572917, + "learning_rate": 0.0001, + "loss": 5.3011, + "loss/crossentropy": 2.174897611141205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33067959547042847, + "step": 1506 + }, + { + "epoch": 0.03016, + "grad_norm": 2.96875, + "grad_norm_var": 0.04057515462239583, + "learning_rate": 0.0001, + "loss": 5.2775, + "loss/crossentropy": 2.24343740940094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34563779830932617, + "step": 1508 + }, + { + "epoch": 0.0302, + "grad_norm": 2.90625, + "grad_norm_var": 0.043473307291666666, + "learning_rate": 0.0001, + "loss": 5.317, + "loss/crossentropy": 1.9828822612762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2887475937604904, + "step": 1510 + }, + { + "epoch": 0.03024, + "grad_norm": 3.0, + "grad_norm_var": 0.03655192057291667, + "learning_rate": 0.0001, + "loss": 5.3172, + "loss/crossentropy": 2.2110393047332764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32290786504745483, + "step": 1512 + }, + { + "epoch": 0.03028, + "grad_norm": 2.90625, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 5.2598, + "loss/crossentropy": 2.3797603845596313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3262799382209778, + "step": 1514 + }, + { + "epoch": 0.03032, + "grad_norm": 3.1875, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 5.4107, + "loss/crossentropy": 2.0183085799217224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2915680408477783, + "step": 1516 + }, + { + "epoch": 0.03036, + "grad_norm": 3.78125, + "grad_norm_var": 0.9888631184895833, + "learning_rate": 0.0001, + "loss": 5.4632, + "loss/crossentropy": 1.875212013721466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30001460015773773, + "step": 1518 + }, + { + "epoch": 0.0304, + "grad_norm": 3.0625, + "grad_norm_var": 0.9916300455729167, + "learning_rate": 0.0001, + "loss": 5.4406, + "loss/crossentropy": 2.1000564098358154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.354188472032547, + "step": 1520 + }, + { + "epoch": 0.03044, + "grad_norm": 2.84375, + "grad_norm_var": 0.980126953125, + "learning_rate": 0.0001, + "loss": 5.4837, + "loss/crossentropy": 2.071643114089966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.333335280418396, + "step": 1522 + }, + { + "epoch": 0.03048, + "grad_norm": 3.203125, + "grad_norm_var": 0.9749664306640625, + "learning_rate": 0.0001, + "loss": 5.2716, + "loss/crossentropy": 2.4253947734832764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31365686655044556, + "step": 1524 + }, + { + "epoch": 0.03052, + "grad_norm": 3.0625, + "grad_norm_var": 0.9605133056640625, + "learning_rate": 0.0001, + "loss": 5.1601, + "loss/crossentropy": 1.967090904712677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30677899718284607, + "step": 1526 + }, + { + "epoch": 0.03056, + "grad_norm": 3.15625, + "grad_norm_var": 0.954443359375, + "learning_rate": 0.0001, + "loss": 5.0971, + "loss/crossentropy": 2.112701952457428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3029911667108536, + "step": 1528 + }, + { + "epoch": 0.0306, + "grad_norm": 4.5625, + "grad_norm_var": 1.0084269205729166, + "learning_rate": 0.0001, + "loss": 5.6836, + "loss/crossentropy": 2.5657063722610474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3484763503074646, + "step": 1530 + }, + { + "epoch": 0.03064, + "grad_norm": 3.234375, + "grad_norm_var": 1.110497029622396, + "learning_rate": 0.0001, + "loss": 5.3888, + "loss/crossentropy": 2.1214585304260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3243858218193054, + "step": 1532 + }, + { + "epoch": 0.03068, + "grad_norm": 3.234375, + "grad_norm_var": 0.35461832682291666, + "learning_rate": 0.0001, + "loss": 5.4662, + "loss/crossentropy": 2.427902936935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3428474962711334, + "step": 1534 + }, + { + "epoch": 0.03072, + "grad_norm": 3.078125, + "grad_norm_var": 0.36741434733072914, + "learning_rate": 0.0001, + "loss": 5.2956, + "loss/crossentropy": 1.975690484046936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3136949688196182, + "step": 1536 + }, + { + "epoch": 0.03076, + "grad_norm": 2.734375, + "grad_norm_var": 0.38342692057291666, + "learning_rate": 0.0001, + "loss": 5.1233, + "loss/crossentropy": 2.295845150947571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31703390181064606, + "step": 1538 + }, + { + "epoch": 0.0308, + "grad_norm": 3.171875, + "grad_norm_var": 0.37892252604166665, + "learning_rate": 0.0001, + "loss": 5.8286, + "loss/crossentropy": 2.117497444152832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32601243257522583, + "step": 1540 + }, + { + "epoch": 0.03084, + "grad_norm": 3.140625, + "grad_norm_var": 0.49339192708333335, + "learning_rate": 0.0001, + "loss": 5.4065, + "loss/crossentropy": 2.3824862241744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3332519829273224, + "step": 1542 + }, + { + "epoch": 0.03088, + "grad_norm": 3.140625, + "grad_norm_var": 0.4940582275390625, + "learning_rate": 0.0001, + "loss": 5.0672, + "loss/crossentropy": 1.9037857055664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143462985754013, + "step": 1544 + }, + { + "epoch": 0.03092, + "grad_norm": 3.09375, + "grad_norm_var": 0.4064737955729167, + "learning_rate": 0.0001, + "loss": 5.2328, + "loss/crossentropy": 1.9191133379936218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29443541169166565, + "step": 1546 + }, + { + "epoch": 0.03096, + "grad_norm": 2.734375, + "grad_norm_var": 0.28172200520833335, + "learning_rate": 0.0001, + "loss": 5.3402, + "loss/crossentropy": 2.216462254524231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31411711871623993, + "step": 1548 + }, + { + "epoch": 0.031, + "grad_norm": 2.96875, + "grad_norm_var": 0.227685546875, + "learning_rate": 0.0001, + "loss": 5.0455, + "loss/crossentropy": 1.8064754605293274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667318135499954, + "step": 1550 + }, + { + "epoch": 0.03104, + "grad_norm": 3.453125, + "grad_norm_var": 0.46340738932291664, + "learning_rate": 0.0001, + "loss": 5.5672, + "loss/crossentropy": 2.488176465034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3422655761241913, + "step": 1552 + }, + { + "epoch": 0.03108, + "grad_norm": 2.859375, + "grad_norm_var": 0.4529205322265625, + "learning_rate": 0.0001, + "loss": 5.4404, + "loss/crossentropy": 2.5670164823532104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3462950587272644, + "step": 1554 + }, + { + "epoch": 0.03112, + "grad_norm": 3.25, + "grad_norm_var": 0.46499735514322915, + "learning_rate": 0.0001, + "loss": 5.0295, + "loss/crossentropy": 2.0630581378936768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29241877794265747, + "step": 1556 + }, + { + "epoch": 0.03116, + "grad_norm": 2.84375, + "grad_norm_var": 0.30891011555989584, + "learning_rate": 0.0001, + "loss": 5.4751, + "loss/crossentropy": 2.685954213142395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37143297493457794, + "step": 1558 + }, + { + "epoch": 0.0312, + "grad_norm": 3.421875, + "grad_norm_var": 0.311669921875, + "learning_rate": 0.0001, + "loss": 5.6995, + "loss/crossentropy": 1.9786988496780396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40276341140270233, + "step": 1560 + }, + { + "epoch": 0.03124, + "grad_norm": 3.0, + "grad_norm_var": 0.31383056640625, + "learning_rate": 0.0001, + "loss": 5.6695, + "loss/crossentropy": 2.1484411358833313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3264364004135132, + "step": 1562 + }, + { + "epoch": 0.03128, + "grad_norm": 2.875, + "grad_norm_var": 0.3103424072265625, + "learning_rate": 0.0001, + "loss": 5.3015, + "loss/crossentropy": 2.1411852836608887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3156583160161972, + "step": 1564 + }, + { + "epoch": 0.03132, + "grad_norm": 2.90625, + "grad_norm_var": 0.31302083333333336, + "learning_rate": 0.0001, + "loss": 5.2911, + "loss/crossentropy": 2.1509006023406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3066476732492447, + "step": 1566 + }, + { + "epoch": 0.03136, + "grad_norm": 2.90625, + "grad_norm_var": 0.0416412353515625, + "learning_rate": 0.0001, + "loss": 5.1904, + "loss/crossentropy": 1.7540676593780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705047130584717, + "step": 1568 + }, + { + "epoch": 0.0314, + "grad_norm": 3.046875, + "grad_norm_var": 0.0431640625, + "learning_rate": 0.0001, + "loss": 4.9637, + "loss/crossentropy": 2.2091184854507446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2920738309621811, + "step": 1570 + }, + { + "epoch": 0.03144, + "grad_norm": 2.984375, + "grad_norm_var": 0.045735677083333336, + "learning_rate": 0.0001, + "loss": 5.6048, + "loss/crossentropy": 1.8405091762542725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981649935245514, + "step": 1572 + }, + { + "epoch": 0.03148, + "grad_norm": 2.6875, + "grad_norm_var": 0.0420806884765625, + "learning_rate": 0.0001, + "loss": 5.4258, + "loss/crossentropy": 2.2292457818984985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30654460191726685, + "step": 1574 + }, + { + "epoch": 0.03152, + "grad_norm": 4.21875, + "grad_norm_var": 0.12868550618489583, + "learning_rate": 0.0001, + "loss": 5.8439, + "loss/crossentropy": 2.653907895088196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35303865373134613, + "step": 1576 + }, + { + "epoch": 0.03156, + "grad_norm": 3.234375, + "grad_norm_var": 0.13888346354166667, + "learning_rate": 0.0001, + "loss": 5.6145, + "loss/crossentropy": 2.6799376010894775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37383997440338135, + "step": 1578 + }, + { + "epoch": 0.0316, + "grad_norm": 2.828125, + "grad_norm_var": 0.14431864420572918, + "learning_rate": 0.0001, + "loss": 5.3321, + "loss/crossentropy": 2.3438535928726196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30706922709941864, + "step": 1580 + }, + { + "epoch": 0.03164, + "grad_norm": 2.96875, + "grad_norm_var": 0.14254150390625, + "learning_rate": 0.0001, + "loss": 5.2139, + "loss/crossentropy": 2.1885964274406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30707718431949615, + "step": 1582 + }, + { + "epoch": 0.03168, + "grad_norm": 2.984375, + "grad_norm_var": 0.14042561848958332, + "learning_rate": 0.0001, + "loss": 5.1661, + "loss/crossentropy": 2.0471584796905518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31723180413246155, + "step": 1584 + }, + { + "epoch": 0.03172, + "grad_norm": 3.0, + "grad_norm_var": 0.1363433837890625, + "learning_rate": 0.0001, + "loss": 5.227, + "loss/crossentropy": 2.135176420211792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29839280247688293, + "step": 1586 + }, + { + "epoch": 0.03176, + "grad_norm": 3.3125, + "grad_norm_var": 0.13482666015625, + "learning_rate": 0.0001, + "loss": 5.6412, + "loss/crossentropy": 2.4444775581359863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33574284613132477, + "step": 1588 + }, + { + "epoch": 0.0318, + "grad_norm": 3.140625, + "grad_norm_var": 0.12195536295572916, + "learning_rate": 0.0001, + "loss": 5.4289, + "loss/crossentropy": 2.1725653409957886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.310588076710701, + "step": 1590 + }, + { + "epoch": 0.03184, + "grad_norm": 2.875, + "grad_norm_var": 0.04572652180989583, + "learning_rate": 0.0001, + "loss": 5.3727, + "loss/crossentropy": 2.3610929250717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32158929109573364, + "step": 1592 + }, + { + "epoch": 0.03188, + "grad_norm": 2.84375, + "grad_norm_var": 0.0502105712890625, + "learning_rate": 0.0001, + "loss": 4.8794, + "loss/crossentropy": 1.9271156787872314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28283432126045227, + "step": 1594 + }, + { + "epoch": 0.03192, + "grad_norm": 3.203125, + "grad_norm_var": 0.04372456868489583, + "learning_rate": 0.0001, + "loss": 5.3912, + "loss/crossentropy": 2.4196890592575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3463610112667084, + "step": 1596 + }, + { + "epoch": 0.03196, + "grad_norm": 2.71875, + "grad_norm_var": 0.0500152587890625, + "learning_rate": 0.0001, + "loss": 5.1524, + "loss/crossentropy": 2.207236647605896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33829018473625183, + "step": 1598 + }, + { + "epoch": 0.032, + "grad_norm": 3.0625, + "grad_norm_var": 0.04101155598958333, + "learning_rate": 0.0001, + "loss": 5.4724, + "loss/crossentropy": 2.3757678270339966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3342677056789398, + "step": 1600 + }, + { + "epoch": 0.03204, + "grad_norm": 2.921875, + "grad_norm_var": 0.04512430826822917, + "learning_rate": 0.0001, + "loss": 5.1763, + "loss/crossentropy": 2.2605016231536865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3062159866094589, + "step": 1602 + }, + { + "epoch": 0.03208, + "grad_norm": 3.359375, + "grad_norm_var": 0.04794921875, + "learning_rate": 0.0001, + "loss": 5.7139, + "loss/crossentropy": 2.4536768198013306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36668023467063904, + "step": 1604 + }, + { + "epoch": 0.03212, + "grad_norm": 3.09375, + "grad_norm_var": 0.0523834228515625, + "learning_rate": 0.0001, + "loss": 5.137, + "loss/crossentropy": 1.9870036244392395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2974477708339691, + "step": 1606 + }, + { + "epoch": 0.03216, + "grad_norm": 3.015625, + "grad_norm_var": 0.039876302083333336, + "learning_rate": 0.0001, + "loss": 5.3926, + "loss/crossentropy": 2.1852506399154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3327452540397644, + "step": 1608 + }, + { + "epoch": 0.0322, + "grad_norm": 3.140625, + "grad_norm_var": 0.03128255208333333, + "learning_rate": 0.0001, + "loss": 5.4964, + "loss/crossentropy": 2.2226197719573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3110218793153763, + "step": 1610 + }, + { + "epoch": 0.03224, + "grad_norm": 3.09375, + "grad_norm_var": 0.029523722330729165, + "learning_rate": 0.0001, + "loss": 5.398, + "loss/crossentropy": 1.8255922198295593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28216174244880676, + "step": 1612 + }, + { + "epoch": 0.03228, + "grad_norm": 2.96875, + "grad_norm_var": 0.023200480143229167, + "learning_rate": 0.0001, + "loss": 5.2777, + "loss/crossentropy": 1.9663920998573303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3095496743917465, + "step": 1614 + }, + { + "epoch": 0.03232, + "grad_norm": 2.953125, + "grad_norm_var": 0.023908487955729165, + "learning_rate": 0.0001, + "loss": 5.1578, + "loss/crossentropy": 2.2089942693710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3307983875274658, + "step": 1616 + }, + { + "epoch": 0.03236, + "grad_norm": 3.03125, + "grad_norm_var": 0.01793212890625, + "learning_rate": 0.0001, + "loss": 5.3331, + "loss/crossentropy": 2.261039137840271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.314799427986145, + "step": 1618 + }, + { + "epoch": 0.0324, + "grad_norm": 2.859375, + "grad_norm_var": 0.015555826822916667, + "learning_rate": 0.0001, + "loss": 5.1674, + "loss/crossentropy": 2.350824236869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3515756279230118, + "step": 1620 + }, + { + "epoch": 0.03244, + "grad_norm": 2.9375, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 5.4853, + "loss/crossentropy": 2.2964736223220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3059935122728348, + "step": 1622 + }, + { + "epoch": 0.03248, + "grad_norm": 2.859375, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 5.2503, + "loss/crossentropy": 2.36459481716156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33763329684734344, + "step": 1624 + }, + { + "epoch": 0.03252, + "grad_norm": 2.984375, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 5.2242, + "loss/crossentropy": 2.165920853614807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31318019330501556, + "step": 1626 + }, + { + "epoch": 0.03256, + "grad_norm": 2.859375, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 5.2544, + "loss/crossentropy": 2.326790690422058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3044355511665344, + "step": 1628 + }, + { + "epoch": 0.0326, + "grad_norm": 2.96875, + "grad_norm_var": 0.01129150390625, + "learning_rate": 0.0001, + "loss": 5.3369, + "loss/crossentropy": 1.8848688006401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.306812584400177, + "step": 1630 + }, + { + "epoch": 0.03264, + "grad_norm": 2.953125, + "grad_norm_var": 0.0134185791015625, + "learning_rate": 0.0001, + "loss": 5.351, + "loss/crossentropy": 2.045863091945648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091462701559067, + "step": 1632 + }, + { + "epoch": 0.03268, + "grad_norm": 2.78125, + "grad_norm_var": 0.01148681640625, + "learning_rate": 0.0001, + "loss": 4.9742, + "loss/crossentropy": 2.0707273483276367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3038959503173828, + "step": 1634 + }, + { + "epoch": 0.03272, + "grad_norm": 2.859375, + "grad_norm_var": 0.011237589518229167, + "learning_rate": 0.0001, + "loss": 5.2076, + "loss/crossentropy": 2.0786932706832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29513464868068695, + "step": 1636 + }, + { + "epoch": 0.03276, + "grad_norm": 3.203125, + "grad_norm_var": 0.01900634765625, + "learning_rate": 0.0001, + "loss": 5.4237, + "loss/crossentropy": 2.1527108550071716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32102274894714355, + "step": 1638 + }, + { + "epoch": 0.0328, + "grad_norm": 3.15625, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 5.126, + "loss/crossentropy": 2.0383081436157227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3253529220819473, + "step": 1640 + }, + { + "epoch": 0.03284, + "grad_norm": 2.734375, + "grad_norm_var": 0.021809895833333332, + "learning_rate": 0.0001, + "loss": 5.0775, + "loss/crossentropy": 2.2801902294158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32200203835964203, + "step": 1642 + }, + { + "epoch": 0.03288, + "grad_norm": 2.953125, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 5.3035, + "loss/crossentropy": 2.164717435836792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2997436225414276, + "step": 1644 + }, + { + "epoch": 0.03292, + "grad_norm": 3.1875, + "grad_norm_var": 0.025191243489583334, + "learning_rate": 0.0001, + "loss": 5.5166, + "loss/crossentropy": 2.389414072036743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32998231053352356, + "step": 1646 + }, + { + "epoch": 0.03296, + "grad_norm": 2.921875, + "grad_norm_var": 0.023949178059895833, + "learning_rate": 0.0001, + "loss": 5.3225, + "loss/crossentropy": 2.09418523311615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33829881250858307, + "step": 1648 + }, + { + "epoch": 0.033, + "grad_norm": 3.109375, + "grad_norm_var": 0.021800740559895834, + "learning_rate": 0.0001, + "loss": 5.261, + "loss/crossentropy": 2.324030637741089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30337512493133545, + "step": 1650 + }, + { + "epoch": 0.03304, + "grad_norm": 3.109375, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 5.4162, + "loss/crossentropy": 1.8635556101799011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27077023684978485, + "step": 1652 + }, + { + "epoch": 0.03308, + "grad_norm": 2.8125, + "grad_norm_var": 0.0215972900390625, + "learning_rate": 0.0001, + "loss": 5.2371, + "loss/crossentropy": 2.1776190996170044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30546560883522034, + "step": 1654 + }, + { + "epoch": 0.03312, + "grad_norm": 3.1875, + "grad_norm_var": 0.021240234375, + "learning_rate": 0.0001, + "loss": 5.1721, + "loss/crossentropy": 2.1003682613372803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3304767310619354, + "step": 1656 + }, + { + "epoch": 0.03316, + "grad_norm": 3.203125, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 5.8029, + "loss/crossentropy": 2.4331823587417603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3653264045715332, + "step": 1658 + }, + { + "epoch": 0.0332, + "grad_norm": 3.125, + "grad_norm_var": 0.1084381103515625, + "learning_rate": 0.0001, + "loss": 5.7157, + "loss/crossentropy": 1.9939777851104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28948865830898285, + "step": 1660 + }, + { + "epoch": 0.03324, + "grad_norm": 2.9375, + "grad_norm_var": 0.10896809895833333, + "learning_rate": 0.0001, + "loss": 4.933, + "loss/crossentropy": 1.9093859791755676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007328063249588, + "step": 1662 + }, + { + "epoch": 0.03328, + "grad_norm": 2.734375, + "grad_norm_var": 0.11998697916666666, + "learning_rate": 0.0001, + "loss": 5.1861, + "loss/crossentropy": 2.2847355604171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2975463569164276, + "step": 1664 + }, + { + "epoch": 0.03332, + "grad_norm": 3.046875, + "grad_norm_var": 0.120166015625, + "learning_rate": 0.0001, + "loss": 5.3623, + "loss/crossentropy": 2.0081310868263245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28596948087215424, + "step": 1666 + }, + { + "epoch": 0.03336, + "grad_norm": 3.125, + "grad_norm_var": 0.11974283854166666, + "learning_rate": 0.0001, + "loss": 5.2338, + "loss/crossentropy": 2.189584493637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30838510394096375, + "step": 1668 + }, + { + "epoch": 0.0334, + "grad_norm": 2.796875, + "grad_norm_var": 0.12802327473958333, + "learning_rate": 0.0001, + "loss": 5.2749, + "loss/crossentropy": 2.204169988632202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091500401496887, + "step": 1670 + }, + { + "epoch": 0.03344, + "grad_norm": 2.75, + "grad_norm_var": 0.1338775634765625, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.195966899394989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30603383481502533, + "step": 1672 + }, + { + "epoch": 0.03348, + "grad_norm": 3.0, + "grad_norm_var": 0.1251953125, + "learning_rate": 0.0001, + "loss": 5.8768, + "loss/crossentropy": 2.5402153730392456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3657945841550827, + "step": 1674 + }, + { + "epoch": 0.03352, + "grad_norm": 3.09375, + "grad_norm_var": 0.0277252197265625, + "learning_rate": 0.0001, + "loss": 5.5387, + "loss/crossentropy": 2.1721729040145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29539716243743896, + "step": 1676 + }, + { + "epoch": 0.03356, + "grad_norm": 2.953125, + "grad_norm_var": 0.027253214518229166, + "learning_rate": 0.0001, + "loss": 5.2355, + "loss/crossentropy": 2.0591543912887573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29050062596797943, + "step": 1678 + }, + { + "epoch": 0.0336, + "grad_norm": 2.765625, + "grad_norm_var": 0.025846354166666665, + "learning_rate": 0.0001, + "loss": 5.4895, + "loss/crossentropy": 2.1396639347076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3261077404022217, + "step": 1680 + }, + { + "epoch": 0.03364, + "grad_norm": 3.015625, + "grad_norm_var": 0.024689737955729166, + "learning_rate": 0.0001, + "loss": 5.2094, + "loss/crossentropy": 1.9553123712539673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3204822689294815, + "step": 1682 + }, + { + "epoch": 0.03368, + "grad_norm": 2.78125, + "grad_norm_var": 0.023746744791666666, + "learning_rate": 0.0001, + "loss": 5.3417, + "loss/crossentropy": 2.4139883518218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32602658867836, + "step": 1684 + }, + { + "epoch": 0.03372, + "grad_norm": 2.90625, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 5.3479, + "loss/crossentropy": 1.9849293231964111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32610756158828735, + "step": 1686 + }, + { + "epoch": 0.03376, + "grad_norm": 2.953125, + "grad_norm_var": 0.008622233072916667, + "learning_rate": 0.0001, + "loss": 5.6218, + "loss/crossentropy": 2.698970675468445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3749641329050064, + "step": 1688 + }, + { + "epoch": 0.0338, + "grad_norm": 3.0625, + "grad_norm_var": 0.01285400390625, + "learning_rate": 0.0001, + "loss": 5.7617, + "loss/crossentropy": 2.715620517730713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35715436935424805, + "step": 1690 + }, + { + "epoch": 0.03384, + "grad_norm": 3.0, + "grad_norm_var": 0.01201171875, + "learning_rate": 0.0001, + "loss": 5.5073, + "loss/crossentropy": 2.7213666439056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34585659205913544, + "step": 1692 + }, + { + "epoch": 0.03388, + "grad_norm": 2.84375, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 5.2674, + "loss/crossentropy": 2.277606725692749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3140410780906677, + "step": 1694 + }, + { + "epoch": 0.03392, + "grad_norm": 3.765625, + "grad_norm_var": 0.05364176432291667, + "learning_rate": 0.0001, + "loss": 5.2633, + "loss/crossentropy": 2.332197904586792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30997559428215027, + "step": 1696 + }, + { + "epoch": 0.03396, + "grad_norm": 2.953125, + "grad_norm_var": 0.05464579264322917, + "learning_rate": 0.0001, + "loss": 5.4323, + "loss/crossentropy": 2.4413230419158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3075388967990875, + "step": 1698 + }, + { + "epoch": 0.034, + "grad_norm": 2.859375, + "grad_norm_var": 0.0567535400390625, + "learning_rate": 0.0001, + "loss": 5.1099, + "loss/crossentropy": 2.2601696252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3050367534160614, + "step": 1700 + }, + { + "epoch": 0.03404, + "grad_norm": 2.640625, + "grad_norm_var": 0.06572977701822917, + "learning_rate": 0.0001, + "loss": 4.9925, + "loss/crossentropy": 2.2910414934158325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3248990923166275, + "step": 1702 + }, + { + "epoch": 0.03408, + "grad_norm": 2.890625, + "grad_norm_var": 0.06843159993489584, + "learning_rate": 0.0001, + "loss": 5.3814, + "loss/crossentropy": 1.9898682832717896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27573561668395996, + "step": 1704 + }, + { + "epoch": 0.03412, + "grad_norm": 2.96875, + "grad_norm_var": 0.06398824055989584, + "learning_rate": 0.0001, + "loss": 5.1506, + "loss/crossentropy": 2.1234602332115173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30724021792411804, + "step": 1706 + }, + { + "epoch": 0.03416, + "grad_norm": 3.390625, + "grad_norm_var": 0.0764801025390625, + "learning_rate": 0.0001, + "loss": 5.5433, + "loss/crossentropy": 2.34807026386261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32687780261039734, + "step": 1708 + }, + { + "epoch": 0.0342, + "grad_norm": 2.71875, + "grad_norm_var": 0.07669169108072917, + "learning_rate": 0.0001, + "loss": 5.1249, + "loss/crossentropy": 2.17264860868454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3086177706718445, + "step": 1710 + }, + { + "epoch": 0.03424, + "grad_norm": 2.96875, + "grad_norm_var": 0.0303131103515625, + "learning_rate": 0.0001, + "loss": 5.3613, + "loss/crossentropy": 2.1094497442245483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29483039677143097, + "step": 1712 + }, + { + "epoch": 0.03428, + "grad_norm": 2.71875, + "grad_norm_var": 0.03284505208333333, + "learning_rate": 0.0001, + "loss": 5.256, + "loss/crossentropy": 2.2379074692726135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2799292802810669, + "step": 1714 + }, + { + "epoch": 0.03432, + "grad_norm": 3.078125, + "grad_norm_var": 0.21389567057291667, + "learning_rate": 0.0001, + "loss": 5.5834, + "loss/crossentropy": 2.4616905450820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33721986413002014, + "step": 1716 + }, + { + "epoch": 0.03436, + "grad_norm": 2.953125, + "grad_norm_var": 0.2042877197265625, + "learning_rate": 0.0001, + "loss": 5.4415, + "loss/crossentropy": 2.383226990699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3242805302143097, + "step": 1718 + }, + { + "epoch": 0.0344, + "grad_norm": 2.921875, + "grad_norm_var": 0.19931538899739584, + "learning_rate": 0.0001, + "loss": 5.5514, + "loss/crossentropy": 2.495948314666748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3131762146949768, + "step": 1720 + }, + { + "epoch": 0.03444, + "grad_norm": 3.109375, + "grad_norm_var": 0.19524739583333334, + "learning_rate": 0.0001, + "loss": 5.6767, + "loss/crossentropy": 2.1921653747558594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.320631667971611, + "step": 1722 + }, + { + "epoch": 0.03448, + "grad_norm": 3.40625, + "grad_norm_var": 0.20221354166666666, + "learning_rate": 0.0001, + "loss": 5.2, + "loss/crossentropy": 2.0795475840568542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29602357745170593, + "step": 1724 + }, + { + "epoch": 0.03452, + "grad_norm": 2.75, + "grad_norm_var": 0.201171875, + "learning_rate": 0.0001, + "loss": 4.7489, + "loss/crossentropy": 1.911207377910614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.287412166595459, + "step": 1726 + }, + { + "epoch": 0.03456, + "grad_norm": 2.96875, + "grad_norm_var": 0.20129801432291666, + "learning_rate": 0.0001, + "loss": 5.155, + "loss/crossentropy": 2.0638214349746704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32240423560142517, + "step": 1728 + }, + { + "epoch": 0.0346, + "grad_norm": 2.75, + "grad_norm_var": 0.20066630045572917, + "learning_rate": 0.0001, + "loss": 5.3464, + "loss/crossentropy": 2.355573534965515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3161381185054779, + "step": 1730 + }, + { + "epoch": 0.03464, + "grad_norm": 2.8125, + "grad_norm_var": 0.04185282389322917, + "learning_rate": 0.0001, + "loss": 5.1039, + "loss/crossentropy": 2.2227123975753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31909704208374023, + "step": 1732 + }, + { + "epoch": 0.03468, + "grad_norm": 2.765625, + "grad_norm_var": 0.03954671223958333, + "learning_rate": 0.0001, + "loss": 5.2249, + "loss/crossentropy": 2.203602910041809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3238847255706787, + "step": 1734 + }, + { + "epoch": 0.03472, + "grad_norm": 3.34375, + "grad_norm_var": 0.07062174479166666, + "learning_rate": 0.0001, + "loss": 5.0623, + "loss/crossentropy": 2.2696332335472107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3174774497747421, + "step": 1736 + }, + { + "epoch": 0.03476, + "grad_norm": 2.78125, + "grad_norm_var": 0.07385660807291666, + "learning_rate": 0.0001, + "loss": 5.0469, + "loss/crossentropy": 1.8124465942382812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2645348161458969, + "step": 1738 + }, + { + "epoch": 0.0348, + "grad_norm": 3.015625, + "grad_norm_var": 0.0578765869140625, + "learning_rate": 0.0001, + "loss": 5.589, + "loss/crossentropy": 2.1951464414596558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31606370210647583, + "step": 1740 + }, + { + "epoch": 0.03484, + "grad_norm": 3.109375, + "grad_norm_var": 0.05624593098958333, + "learning_rate": 0.0001, + "loss": 5.4867, + "loss/crossentropy": 2.4413230419158936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33688417077064514, + "step": 1742 + }, + { + "epoch": 0.03488, + "grad_norm": 3.09375, + "grad_norm_var": 0.05705973307291667, + "learning_rate": 0.0001, + "loss": 5.7898, + "loss/crossentropy": 2.1357219219207764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3476516157388687, + "step": 1744 + }, + { + "epoch": 0.03492, + "grad_norm": 2.859375, + "grad_norm_var": 0.0551666259765625, + "learning_rate": 0.0001, + "loss": 5.4465, + "loss/crossentropy": 2.1557592153549194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3080083876848221, + "step": 1746 + }, + { + "epoch": 0.03496, + "grad_norm": 2.859375, + "grad_norm_var": 0.0512603759765625, + "learning_rate": 0.0001, + "loss": 5.4949, + "loss/crossentropy": 2.3549705743789673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33814217150211334, + "step": 1748 + }, + { + "epoch": 0.035, + "grad_norm": 3.109375, + "grad_norm_var": 0.04348958333333333, + "learning_rate": 0.0001, + "loss": 5.5881, + "loss/crossentropy": 2.382844924926758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31944599747657776, + "step": 1750 + }, + { + "epoch": 0.03504, + "grad_norm": 2.765625, + "grad_norm_var": 0.02138671875, + "learning_rate": 0.0001, + "loss": 5.2937, + "loss/crossentropy": 2.3312920331954956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3346693813800812, + "step": 1752 + }, + { + "epoch": 0.03508, + "grad_norm": 3.015625, + "grad_norm_var": 0.019677734375, + "learning_rate": 0.0001, + "loss": 5.1981, + "loss/crossentropy": 2.1921491026878357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31133508682250977, + "step": 1754 + }, + { + "epoch": 0.03512, + "grad_norm": 2.71875, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 4.8796, + "loss/crossentropy": 2.0229611992836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26832816004753113, + "step": 1756 + }, + { + "epoch": 0.03516, + "grad_norm": 2.75, + "grad_norm_var": 0.025764973958333333, + "learning_rate": 0.0001, + "loss": 5.0322, + "loss/crossentropy": 1.8138108849525452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576165944337845, + "step": 1758 + }, + { + "epoch": 0.0352, + "grad_norm": 2.75, + "grad_norm_var": 0.0232574462890625, + "learning_rate": 0.0001, + "loss": 5.1276, + "loss/crossentropy": 2.0019126534461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28094957768917084, + "step": 1760 + }, + { + "epoch": 0.03524, + "grad_norm": 2.703125, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 5.1776, + "loss/crossentropy": 2.400240898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30936548113822937, + "step": 1762 + }, + { + "epoch": 0.03528, + "grad_norm": 3.328125, + "grad_norm_var": 0.03495992024739583, + "learning_rate": 0.0001, + "loss": 4.9401, + "loss/crossentropy": 2.019958734512329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27741560339927673, + "step": 1764 + }, + { + "epoch": 0.03532, + "grad_norm": 2.875, + "grad_norm_var": 0.025032552083333333, + "learning_rate": 0.0001, + "loss": 5.0912, + "loss/crossentropy": 1.9099596738815308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28769225627183914, + "step": 1766 + }, + { + "epoch": 0.03536, + "grad_norm": 2.890625, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 5.42, + "loss/crossentropy": 2.2508283853530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2945093661546707, + "step": 1768 + }, + { + "epoch": 0.0354, + "grad_norm": 2.828125, + "grad_norm_var": 0.0222320556640625, + "learning_rate": 0.0001, + "loss": 5.2516, + "loss/crossentropy": 1.9332409501075745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26304441690444946, + "step": 1770 + }, + { + "epoch": 0.03544, + "grad_norm": 2.875, + "grad_norm_var": 0.022557576497395832, + "learning_rate": 0.0001, + "loss": 5.1896, + "loss/crossentropy": 2.143627643585205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28407415747642517, + "step": 1772 + }, + { + "epoch": 0.03548, + "grad_norm": 2.859375, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 5.4763, + "loss/crossentropy": 2.32085120677948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3176119029521942, + "step": 1774 + }, + { + "epoch": 0.03552, + "grad_norm": 2.828125, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 5.1717, + "loss/crossentropy": 1.7893801927566528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27407801151275635, + "step": 1776 + }, + { + "epoch": 0.03556, + "grad_norm": 2.78125, + "grad_norm_var": 0.027228800455729167, + "learning_rate": 0.0001, + "loss": 5.42, + "loss/crossentropy": 2.206292986869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3071517199277878, + "step": 1778 + }, + { + "epoch": 0.0356, + "grad_norm": 2.84375, + "grad_norm_var": 0.013263956705729166, + "learning_rate": 0.0001, + "loss": 5.1879, + "loss/crossentropy": 2.1285043954849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3164139539003372, + "step": 1780 + }, + { + "epoch": 0.03564, + "grad_norm": 3.15625, + "grad_norm_var": 0.018635050455729166, + "learning_rate": 0.0001, + "loss": 5.2826, + "loss/crossentropy": 2.16570383310318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3052050769329071, + "step": 1782 + }, + { + "epoch": 0.03568, + "grad_norm": 2.734375, + "grad_norm_var": 0.019950358072916667, + "learning_rate": 0.0001, + "loss": 5.0921, + "loss/crossentropy": 1.9799941778182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655785381793976, + "step": 1784 + }, + { + "epoch": 0.03572, + "grad_norm": 2.765625, + "grad_norm_var": 0.0209625244140625, + "learning_rate": 0.0001, + "loss": 5.2468, + "loss/crossentropy": 1.9801498651504517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2824363112449646, + "step": 1786 + }, + { + "epoch": 0.03576, + "grad_norm": 2.828125, + "grad_norm_var": 0.020947265625, + "learning_rate": 0.0001, + "loss": 5.0131, + "loss/crossentropy": 1.5805786848068237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25234321504831314, + "step": 1788 + }, + { + "epoch": 0.0358, + "grad_norm": 3.15625, + "grad_norm_var": 0.021484375, + "learning_rate": 0.0001, + "loss": 5.296, + "loss/crossentropy": 2.2434048652648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2896551638841629, + "step": 1790 + }, + { + "epoch": 0.03584, + "grad_norm": 2.78125, + "grad_norm_var": 0.022411092122395834, + "learning_rate": 0.0001, + "loss": 5.0179, + "loss/crossentropy": 1.9738762378692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2959328889846802, + "step": 1792 + }, + { + "epoch": 0.03588, + "grad_norm": 3.0625, + "grad_norm_var": 0.0272857666015625, + "learning_rate": 0.0001, + "loss": 5.2317, + "loss/crossentropy": 2.222583770751953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29915711283683777, + "step": 1794 + }, + { + "epoch": 0.03592, + "grad_norm": 2.8125, + "grad_norm_var": 0.032942708333333334, + "learning_rate": 0.0001, + "loss": 5.4192, + "loss/crossentropy": 2.188909649848938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.294817179441452, + "step": 1796 + }, + { + "epoch": 0.03596, + "grad_norm": 3.015625, + "grad_norm_var": 0.030907185872395833, + "learning_rate": 0.0001, + "loss": 5.6303, + "loss/crossentropy": 2.4745373725891113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32838208973407745, + "step": 1798 + }, + { + "epoch": 0.036, + "grad_norm": 2.890625, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 5.3466, + "loss/crossentropy": 1.9314215779304504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29102426767349243, + "step": 1800 + }, + { + "epoch": 0.03604, + "grad_norm": 2.796875, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 4.9782, + "loss/crossentropy": 2.0099900364875793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30170293152332306, + "step": 1802 + }, + { + "epoch": 0.03608, + "grad_norm": 2.859375, + "grad_norm_var": 0.024388631184895832, + "learning_rate": 0.0001, + "loss": 5.2506, + "loss/crossentropy": 2.1273564100265503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019937574863434, + "step": 1804 + }, + { + "epoch": 0.03612, + "grad_norm": 2.96875, + "grad_norm_var": 0.016341145833333334, + "learning_rate": 0.0001, + "loss": 5.1003, + "loss/crossentropy": 2.065160095691681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2817380279302597, + "step": 1806 + }, + { + "epoch": 0.03616, + "grad_norm": 2.828125, + "grad_norm_var": 0.018561808268229167, + "learning_rate": 0.0001, + "loss": 5.1569, + "loss/crossentropy": 2.262821078300476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33245067298412323, + "step": 1808 + }, + { + "epoch": 0.0362, + "grad_norm": 2.625, + "grad_norm_var": 0.020003255208333334, + "learning_rate": 0.0001, + "loss": 4.8323, + "loss/crossentropy": 2.164163827896118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26008155941963196, + "step": 1810 + }, + { + "epoch": 0.03624, + "grad_norm": 2.78125, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 5.2517, + "loss/crossentropy": 2.147629737854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3073730617761612, + "step": 1812 + }, + { + "epoch": 0.03628, + "grad_norm": 2.75, + "grad_norm_var": 0.023900349934895832, + "learning_rate": 0.0001, + "loss": 5.1255, + "loss/crossentropy": 2.04233980178833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981158718466759, + "step": 1814 + }, + { + "epoch": 0.03632, + "grad_norm": 3.0, + "grad_norm_var": 0.026439412434895834, + "learning_rate": 0.0001, + "loss": 4.9465, + "loss/crossentropy": 2.264409065246582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32124973833560944, + "step": 1816 + }, + { + "epoch": 0.03636, + "grad_norm": 3.109375, + "grad_norm_var": 0.033610026041666664, + "learning_rate": 0.0001, + "loss": 5.4324, + "loss/crossentropy": 2.092079997062683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3318764269351959, + "step": 1818 + }, + { + "epoch": 0.0364, + "grad_norm": 2.828125, + "grad_norm_var": 0.03850504557291667, + "learning_rate": 0.0001, + "loss": 5.2766, + "loss/crossentropy": 2.1007314324378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2930498272180557, + "step": 1820 + }, + { + "epoch": 0.03644, + "grad_norm": 2.84375, + "grad_norm_var": 0.03697001139322917, + "learning_rate": 0.0001, + "loss": 5.129, + "loss/crossentropy": 2.2377375960350037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30473431944847107, + "step": 1822 + }, + { + "epoch": 0.03648, + "grad_norm": 3.296875, + "grad_norm_var": 0.04684244791666667, + "learning_rate": 0.0001, + "loss": 5.4209, + "loss/crossentropy": 2.0965787172317505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30526305735111237, + "step": 1824 + }, + { + "epoch": 0.03652, + "grad_norm": 3.078125, + "grad_norm_var": 0.04755859375, + "learning_rate": 0.0001, + "loss": 5.0359, + "loss/crossentropy": 2.208711862564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2727830111980438, + "step": 1826 + }, + { + "epoch": 0.03656, + "grad_norm": 2.796875, + "grad_norm_var": 0.050633748372395836, + "learning_rate": 0.0001, + "loss": 5.1692, + "loss/crossentropy": 2.1706738471984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3054092824459076, + "step": 1828 + }, + { + "epoch": 0.0366, + "grad_norm": 2.796875, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 5.2311, + "loss/crossentropy": 2.0891621112823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3111531287431717, + "step": 1830 + }, + { + "epoch": 0.03664, + "grad_norm": 2.859375, + "grad_norm_var": 0.03560791015625, + "learning_rate": 0.0001, + "loss": 5.0537, + "loss/crossentropy": 2.0721256732940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30227208137512207, + "step": 1832 + }, + { + "epoch": 0.03668, + "grad_norm": 3.015625, + "grad_norm_var": 0.03023681640625, + "learning_rate": 0.0001, + "loss": 5.1267, + "loss/crossentropy": 2.2015734910964966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3185647875070572, + "step": 1834 + }, + { + "epoch": 0.03672, + "grad_norm": 3.03125, + "grad_norm_var": 0.026493326822916666, + "learning_rate": 0.0001, + "loss": 5.1973, + "loss/crossentropy": 2.1985132694244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3032165467739105, + "step": 1836 + }, + { + "epoch": 0.03676, + "grad_norm": 2.890625, + "grad_norm_var": 0.026334635416666665, + "learning_rate": 0.0001, + "loss": 5.2041, + "loss/crossentropy": 2.170067548751831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3295029550790787, + "step": 1838 + }, + { + "epoch": 0.0368, + "grad_norm": 2.921875, + "grad_norm_var": 0.014615885416666667, + "learning_rate": 0.0001, + "loss": 5.0049, + "loss/crossentropy": 2.113592267036438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775915116071701, + "step": 1840 + }, + { + "epoch": 0.03684, + "grad_norm": 2.78125, + "grad_norm_var": 0.0117828369140625, + "learning_rate": 0.0001, + "loss": 5.4221, + "loss/crossentropy": 2.1905024647712708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33297547698020935, + "step": 1842 + }, + { + "epoch": 0.03688, + "grad_norm": 2.828125, + "grad_norm_var": 0.012919108072916666, + "learning_rate": 0.0001, + "loss": 5.1859, + "loss/crossentropy": 2.252369999885559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29375749826431274, + "step": 1844 + }, + { + "epoch": 0.03692, + "grad_norm": 3.015625, + "grad_norm_var": 0.020182291666666668, + "learning_rate": 0.0001, + "loss": 4.8942, + "loss/crossentropy": 1.7526759505271912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2707225978374481, + "step": 1846 + }, + { + "epoch": 0.03696, + "grad_norm": 3.21875, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 5.6529, + "loss/crossentropy": 2.592544913291931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3520146906375885, + "step": 1848 + }, + { + "epoch": 0.037, + "grad_norm": 2.8125, + "grad_norm_var": 0.0278961181640625, + "learning_rate": 0.0001, + "loss": 4.9816, + "loss/crossentropy": 1.8699345588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2852860391139984, + "step": 1850 + }, + { + "epoch": 0.03704, + "grad_norm": 2.640625, + "grad_norm_var": 0.03209228515625, + "learning_rate": 0.0001, + "loss": 5.1326, + "loss/crossentropy": 2.2219313383102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3210798054933548, + "step": 1852 + }, + { + "epoch": 0.03708, + "grad_norm": 2.796875, + "grad_norm_var": 0.0327056884765625, + "learning_rate": 0.0001, + "loss": 5.2301, + "loss/crossentropy": 1.9926818013191223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26928839832544327, + "step": 1854 + }, + { + "epoch": 0.03712, + "grad_norm": 2.828125, + "grad_norm_var": 0.032515462239583334, + "learning_rate": 0.0001, + "loss": 5.0372, + "loss/crossentropy": 2.019917130470276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28046920895576477, + "step": 1856 + }, + { + "epoch": 0.03716, + "grad_norm": 4.84375, + "grad_norm_var": 0.2762858072916667, + "learning_rate": 0.0001, + "loss": 5.6297, + "loss/crossentropy": 2.2585690021514893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30598941445350647, + "step": 1858 + }, + { + "epoch": 0.0372, + "grad_norm": 2.984375, + "grad_norm_var": 0.27327067057291665, + "learning_rate": 0.0001, + "loss": 5.3508, + "loss/crossentropy": 2.298324942588806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3494870364665985, + "step": 1860 + }, + { + "epoch": 0.03724, + "grad_norm": 3.125, + "grad_norm_var": 0.25745035807291666, + "learning_rate": 0.0001, + "loss": 5.54, + "loss/crossentropy": 2.430496573448181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3227449208498001, + "step": 1862 + }, + { + "epoch": 0.03728, + "grad_norm": 2.953125, + "grad_norm_var": 0.26183980305989585, + "learning_rate": 0.0001, + "loss": 5.1876, + "loss/crossentropy": 2.090576171875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790074646472931, + "step": 1864 + }, + { + "epoch": 0.03732, + "grad_norm": 3.0625, + "grad_norm_var": 0.25806884765625, + "learning_rate": 0.0001, + "loss": 5.1799, + "loss/crossentropy": 2.2794109582901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135898858308792, + "step": 1866 + }, + { + "epoch": 0.03736, + "grad_norm": 2.640625, + "grad_norm_var": 0.2597076416015625, + "learning_rate": 0.0001, + "loss": 4.9333, + "loss/crossentropy": 2.2433481216430664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27558377385139465, + "step": 1868 + }, + { + "epoch": 0.0374, + "grad_norm": 2.59375, + "grad_norm_var": 0.26806233723958334, + "learning_rate": 0.0001, + "loss": 5.292, + "loss/crossentropy": 2.3111730813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3172578364610672, + "step": 1870 + }, + { + "epoch": 0.03744, + "grad_norm": 2.921875, + "grad_norm_var": 0.270654296875, + "learning_rate": 0.0001, + "loss": 5.2364, + "loss/crossentropy": 2.0028095841407776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35644619166851044, + "step": 1872 + }, + { + "epoch": 0.03748, + "grad_norm": 2.828125, + "grad_norm_var": 0.04501546223958333, + "learning_rate": 0.0001, + "loss": 5.2183, + "loss/crossentropy": 2.347644329071045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3119680881500244, + "step": 1874 + }, + { + "epoch": 0.03752, + "grad_norm": 2.65625, + "grad_norm_var": 0.03474833170572917, + "learning_rate": 0.0001, + "loss": 5.0787, + "loss/crossentropy": 2.118954062461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28975334763526917, + "step": 1876 + }, + { + "epoch": 0.03756, + "grad_norm": 3.0625, + "grad_norm_var": 0.033568318684895834, + "learning_rate": 0.0001, + "loss": 5.6649, + "loss/crossentropy": 2.4376548528671265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3358190506696701, + "step": 1878 + }, + { + "epoch": 0.0376, + "grad_norm": 3.28125, + "grad_norm_var": 0.042724609375, + "learning_rate": 0.0001, + "loss": 5.4924, + "loss/crossentropy": 2.5907636880874634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3234570771455765, + "step": 1880 + }, + { + "epoch": 0.03764, + "grad_norm": 2.859375, + "grad_norm_var": 0.04352925618489583, + "learning_rate": 0.0001, + "loss": 5.0298, + "loss/crossentropy": 1.574956238269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26118964701890945, + "step": 1882 + }, + { + "epoch": 0.03768, + "grad_norm": 3.203125, + "grad_norm_var": 0.04599609375, + "learning_rate": 0.0001, + "loss": 5.3339, + "loss/crossentropy": 2.3571736812591553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.308853879570961, + "step": 1884 + }, + { + "epoch": 0.03772, + "grad_norm": 2.546875, + "grad_norm_var": 0.03877665201822917, + "learning_rate": 0.0001, + "loss": 5.0807, + "loss/crossentropy": 2.1266958117485046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.307782918214798, + "step": 1886 + }, + { + "epoch": 0.03776, + "grad_norm": 3.0, + "grad_norm_var": 0.03876546223958333, + "learning_rate": 0.0001, + "loss": 5.5402, + "loss/crossentropy": 2.3529077768325806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32287272810935974, + "step": 1888 + }, + { + "epoch": 0.0378, + "grad_norm": 3.015625, + "grad_norm_var": 0.03905843098958333, + "learning_rate": 0.0001, + "loss": 5.4555, + "loss/crossentropy": 2.277345299720764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32319171726703644, + "step": 1890 + }, + { + "epoch": 0.03784, + "grad_norm": 2.859375, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 5.01, + "loss/crossentropy": 2.102940857410431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292771652340889, + "step": 1892 + }, + { + "epoch": 0.03788, + "grad_norm": 2.640625, + "grad_norm_var": 0.03998921712239583, + "learning_rate": 0.0001, + "loss": 5.0784, + "loss/crossentropy": 1.9744033813476562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844446450471878, + "step": 1894 + }, + { + "epoch": 0.03792, + "grad_norm": 2.625, + "grad_norm_var": 0.03250325520833333, + "learning_rate": 0.0001, + "loss": 4.9859, + "loss/crossentropy": 1.8222022652626038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28611525893211365, + "step": 1896 + }, + { + "epoch": 0.03796, + "grad_norm": 2.96875, + "grad_norm_var": 0.03135477701822917, + "learning_rate": 0.0001, + "loss": 5.0704, + "loss/crossentropy": 2.1963966488838196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2988738566637039, + "step": 1898 + }, + { + "epoch": 0.038, + "grad_norm": 3.046875, + "grad_norm_var": 0.025927734375, + "learning_rate": 0.0001, + "loss": 5.1155, + "loss/crossentropy": 1.9982789754867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.312205046415329, + "step": 1900 + }, + { + "epoch": 0.03804, + "grad_norm": 2.75, + "grad_norm_var": 0.020612589518229165, + "learning_rate": 0.0001, + "loss": 5.2097, + "loss/crossentropy": 2.1999258995056152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29916079342365265, + "step": 1902 + }, + { + "epoch": 0.03808, + "grad_norm": 2.859375, + "grad_norm_var": 0.016999308268229166, + "learning_rate": 0.0001, + "loss": 5.2233, + "loss/crossentropy": 2.0725532174110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3093830645084381, + "step": 1904 + }, + { + "epoch": 0.03812, + "grad_norm": 3.03125, + "grad_norm_var": 0.027079264322916668, + "learning_rate": 0.0001, + "loss": 5.7014, + "loss/crossentropy": 2.2504276037216187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.332836389541626, + "step": 1906 + }, + { + "epoch": 0.03816, + "grad_norm": 2.875, + "grad_norm_var": 0.026927693684895834, + "learning_rate": 0.0001, + "loss": 5.3413, + "loss/crossentropy": 2.1570577025413513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3210139572620392, + "step": 1908 + }, + { + "epoch": 0.0382, + "grad_norm": 3.59375, + "grad_norm_var": 0.05137430826822917, + "learning_rate": 0.0001, + "loss": 5.6215, + "loss/crossentropy": 2.0739041566848755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091907352209091, + "step": 1910 + }, + { + "epoch": 0.03824, + "grad_norm": 2.703125, + "grad_norm_var": 0.0502838134765625, + "learning_rate": 0.0001, + "loss": 5.0772, + "loss/crossentropy": 2.0542168021202087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791624963283539, + "step": 1912 + }, + { + "epoch": 0.03828, + "grad_norm": 2.65625, + "grad_norm_var": 0.0561431884765625, + "learning_rate": 0.0001, + "loss": 4.9386, + "loss/crossentropy": 1.9705287218093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25483617186546326, + "step": 1914 + }, + { + "epoch": 0.03832, + "grad_norm": 2.875, + "grad_norm_var": 0.05819905598958333, + "learning_rate": 0.0001, + "loss": 5.054, + "loss/crossentropy": 2.0234111547470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3110152333974838, + "step": 1916 + }, + { + "epoch": 0.03836, + "grad_norm": 3.15625, + "grad_norm_var": 0.06004130045572917, + "learning_rate": 0.0001, + "loss": 5.2723, + "loss/crossentropy": 2.010735273361206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2943515181541443, + "step": 1918 + }, + { + "epoch": 0.0384, + "grad_norm": 2.734375, + "grad_norm_var": 0.06575419108072916, + "learning_rate": 0.0001, + "loss": 4.9471, + "loss/crossentropy": 2.1912686824798584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27914971113204956, + "step": 1920 + }, + { + "epoch": 0.03844, + "grad_norm": 2.421875, + "grad_norm_var": 0.0709869384765625, + "learning_rate": 0.0001, + "loss": 5.085, + "loss/crossentropy": 1.9889940023422241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26166442036628723, + "step": 1922 + }, + { + "epoch": 0.03848, + "grad_norm": 2.96875, + "grad_norm_var": 0.07668863932291667, + "learning_rate": 0.0001, + "loss": 5.3534, + "loss/crossentropy": 2.154898941516876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31905338168144226, + "step": 1924 + }, + { + "epoch": 0.03852, + "grad_norm": 2.796875, + "grad_norm_var": 0.03997395833333333, + "learning_rate": 0.0001, + "loss": 5.0847, + "loss/crossentropy": 2.44269061088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28736811876296997, + "step": 1926 + }, + { + "epoch": 0.03856, + "grad_norm": 2.6875, + "grad_norm_var": 0.04108784993489583, + "learning_rate": 0.0001, + "loss": 5.0585, + "loss/crossentropy": 1.6790328621864319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32607489824295044, + "step": 1928 + }, + { + "epoch": 0.0386, + "grad_norm": 3.046875, + "grad_norm_var": 0.05115559895833333, + "learning_rate": 0.0001, + "loss": 5.336, + "loss/crossentropy": 2.0223641991615295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2871186435222626, + "step": 1930 + }, + { + "epoch": 0.03864, + "grad_norm": 2.8125, + "grad_norm_var": 0.05756734212239583, + "learning_rate": 0.0001, + "loss": 5.549, + "loss/crossentropy": 2.451051712036133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33973076939582825, + "step": 1932 + }, + { + "epoch": 0.03868, + "grad_norm": 2.765625, + "grad_norm_var": 0.052262369791666666, + "learning_rate": 0.0001, + "loss": 5.4403, + "loss/crossentropy": 2.2884862422943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29729554057121277, + "step": 1934 + }, + { + "epoch": 0.03872, + "grad_norm": 2.890625, + "grad_norm_var": 0.04889322916666667, + "learning_rate": 0.0001, + "loss": 5.6203, + "loss/crossentropy": 2.113345444202423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.36945630609989166, + "step": 1936 + }, + { + "epoch": 0.03876, + "grad_norm": 2.71875, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 0.0001, + "loss": 5.1845, + "loss/crossentropy": 2.139409840106964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2899101823568344, + "step": 1938 + }, + { + "epoch": 0.0388, + "grad_norm": 2.953125, + "grad_norm_var": 0.03622639973958333, + "learning_rate": 0.0001, + "loss": 5.5397, + "loss/crossentropy": 2.1029305458068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29830390214920044, + "step": 1940 + }, + { + "epoch": 0.03884, + "grad_norm": 2.734375, + "grad_norm_var": 0.03816630045572917, + "learning_rate": 0.0001, + "loss": 5.1383, + "loss/crossentropy": 1.7736502885818481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27648696303367615, + "step": 1942 + }, + { + "epoch": 0.03888, + "grad_norm": 2.671875, + "grad_norm_var": 0.03942057291666667, + "learning_rate": 0.0001, + "loss": 5.0885, + "loss/crossentropy": 2.0281469225883484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30329641699790955, + "step": 1944 + }, + { + "epoch": 0.03892, + "grad_norm": 2.890625, + "grad_norm_var": 0.03528544108072917, + "learning_rate": 0.0001, + "loss": 5.3265, + "loss/crossentropy": 2.404891610145569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3563212752342224, + "step": 1946 + }, + { + "epoch": 0.03896, + "grad_norm": 2.53125, + "grad_norm_var": 0.03572489420572917, + "learning_rate": 0.0001, + "loss": 5.0657, + "loss/crossentropy": 2.2187922596931458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2999056503176689, + "step": 1948 + }, + { + "epoch": 0.039, + "grad_norm": 2.875, + "grad_norm_var": 0.03566792805989583, + "learning_rate": 0.0001, + "loss": 5.0499, + "loss/crossentropy": 2.3901994228363037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3288661539554596, + "step": 1950 + }, + { + "epoch": 0.03904, + "grad_norm": 3.15625, + "grad_norm_var": 0.04069722493489583, + "learning_rate": 0.0001, + "loss": 5.252, + "loss/crossentropy": 2.274617314338684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2928028404712677, + "step": 1952 + }, + { + "epoch": 0.03908, + "grad_norm": 2.859375, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 5.4463, + "loss/crossentropy": 2.2478950023651123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3027127981185913, + "step": 1954 + }, + { + "epoch": 0.03912, + "grad_norm": 2.78125, + "grad_norm_var": 0.030321248372395835, + "learning_rate": 0.0001, + "loss": 5.2707, + "loss/crossentropy": 2.0634876489639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29374830424785614, + "step": 1956 + }, + { + "epoch": 0.03916, + "grad_norm": 2.90625, + "grad_norm_var": 0.0287994384765625, + "learning_rate": 0.0001, + "loss": 5.3441, + "loss/crossentropy": 2.171326994895935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2995557487010956, + "step": 1958 + }, + { + "epoch": 0.0392, + "grad_norm": 2.515625, + "grad_norm_var": 0.040266927083333334, + "learning_rate": 0.0001, + "loss": 4.9937, + "loss/crossentropy": 2.142563223838806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27376608550548553, + "step": 1960 + }, + { + "epoch": 0.03924, + "grad_norm": 5.34375, + "grad_norm_var": 0.42568359375, + "learning_rate": 0.0001, + "loss": 5.4145, + "loss/crossentropy": 2.3189245462417603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.37109237909317017, + "step": 1962 + }, + { + "epoch": 0.03928, + "grad_norm": 3.203125, + "grad_norm_var": 0.40812886555989586, + "learning_rate": 0.0001, + "loss": 5.1826, + "loss/crossentropy": 2.1483139991760254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32499393820762634, + "step": 1964 + }, + { + "epoch": 0.03932, + "grad_norm": 3.046875, + "grad_norm_var": 0.404736328125, + "learning_rate": 0.0001, + "loss": 5.445, + "loss/crossentropy": 2.2916383743286133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29367291927337646, + "step": 1966 + }, + { + "epoch": 0.03936, + "grad_norm": 2.9375, + "grad_norm_var": 0.3999582926432292, + "learning_rate": 0.0001, + "loss": 5.1663, + "loss/crossentropy": 2.4217371940612793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31309331953525543, + "step": 1968 + }, + { + "epoch": 0.0394, + "grad_norm": 2.9375, + "grad_norm_var": 0.40103759765625, + "learning_rate": 0.0001, + "loss": 5.2057, + "loss/crossentropy": 1.9491975903511047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29921913146972656, + "step": 1970 + }, + { + "epoch": 0.03944, + "grad_norm": 2.78125, + "grad_norm_var": 0.39126688639322915, + "learning_rate": 0.0001, + "loss": 5.1788, + "loss/crossentropy": 2.144432306289673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29162150621414185, + "step": 1972 + }, + { + "epoch": 0.03948, + "grad_norm": 2.765625, + "grad_norm_var": 0.40748291015625, + "learning_rate": 0.0001, + "loss": 5.1336, + "loss/crossentropy": 1.9492529034614563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2875414192676544, + "step": 1974 + }, + { + "epoch": 0.03952, + "grad_norm": 2.75, + "grad_norm_var": 0.4002593994140625, + "learning_rate": 0.0001, + "loss": 5.053, + "loss/crossentropy": 1.9269813895225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2755381464958191, + "step": 1976 + }, + { + "epoch": 0.03956, + "grad_norm": 2.828125, + "grad_norm_var": 0.03798726399739583, + "learning_rate": 0.0001, + "loss": 4.8879, + "loss/crossentropy": 2.074360489845276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2987503558397293, + "step": 1978 + }, + { + "epoch": 0.0396, + "grad_norm": 2.953125, + "grad_norm_var": 0.02086181640625, + "learning_rate": 0.0001, + "loss": 4.8834, + "loss/crossentropy": 2.257633090019226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992274910211563, + "step": 1980 + }, + { + "epoch": 0.03964, + "grad_norm": 2.75, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.9533, + "loss/crossentropy": 1.8207083940505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752758115530014, + "step": 1982 + }, + { + "epoch": 0.03968, + "grad_norm": 2.875, + "grad_norm_var": 0.015623982747395833, + "learning_rate": 0.0001, + "loss": 5.343, + "loss/crossentropy": 2.105292797088623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3131226450204849, + "step": 1984 + }, + { + "epoch": 0.03972, + "grad_norm": 3.484375, + "grad_norm_var": 0.05191650390625, + "learning_rate": 0.0001, + "loss": 5.4785, + "loss/crossentropy": 2.1191373467445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3175530731678009, + "step": 1986 + }, + { + "epoch": 0.03976, + "grad_norm": 2.75, + "grad_norm_var": 0.051878865559895834, + "learning_rate": 0.0001, + "loss": 4.9236, + "loss/crossentropy": 2.2214397192001343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3091724067926407, + "step": 1988 + }, + { + "epoch": 0.0398, + "grad_norm": 2.875, + "grad_norm_var": 0.05133056640625, + "learning_rate": 0.0001, + "loss": 5.0031, + "loss/crossentropy": 1.7347424626350403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825440764427185, + "step": 1990 + }, + { + "epoch": 0.03984, + "grad_norm": 2.890625, + "grad_norm_var": 0.0500396728515625, + "learning_rate": 0.0001, + "loss": 5.0951, + "loss/crossentropy": 2.1566559076309204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961925268173218, + "step": 1992 + }, + { + "epoch": 0.03988, + "grad_norm": 2.703125, + "grad_norm_var": 0.06396077473958334, + "learning_rate": 0.0001, + "loss": 4.9195, + "loss/crossentropy": 2.2129205465316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29672613739967346, + "step": 1994 + }, + { + "epoch": 0.03992, + "grad_norm": 3.3125, + "grad_norm_var": 0.07595113118489584, + "learning_rate": 0.0001, + "loss": 5.7534, + "loss/crossentropy": 2.4702744483947754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3707122802734375, + "step": 1996 + }, + { + "epoch": 0.03996, + "grad_norm": 2.84375, + "grad_norm_var": 0.06731669108072917, + "learning_rate": 0.0001, + "loss": 5.0296, + "loss/crossentropy": 2.0463536977767944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3124735355377197, + "step": 1998 + }, + { + "epoch": 0.04, + "grad_norm": 2.796875, + "grad_norm_var": 0.07281494140625, + "learning_rate": 0.0001, + "loss": 4.959, + "loss/crossentropy": 2.1550235748291016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32177163660526276, + "step": 2000 + }, + { + "epoch": 0.04004, + "grad_norm": 2.71875, + "grad_norm_var": 0.059798177083333334, + "learning_rate": 0.0001, + "loss": 5.2078, + "loss/crossentropy": 2.1312190890312195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29013920575380325, + "step": 2002 + }, + { + "epoch": 0.04008, + "grad_norm": 2.75, + "grad_norm_var": 0.06082356770833333, + "learning_rate": 0.0001, + "loss": 5.0086, + "loss/crossentropy": 1.8546085357666016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619543671607971, + "step": 2004 + }, + { + "epoch": 0.04012, + "grad_norm": 3.015625, + "grad_norm_var": 0.0561920166015625, + "learning_rate": 0.0001, + "loss": 5.3416, + "loss/crossentropy": 2.262398660182953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28742220997810364, + "step": 2006 + }, + { + "epoch": 0.04016, + "grad_norm": 3.109375, + "grad_norm_var": 0.05734049479166667, + "learning_rate": 0.0001, + "loss": 5.4315, + "loss/crossentropy": 2.156043767929077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30488699674606323, + "step": 2008 + }, + { + "epoch": 0.0402, + "grad_norm": 2.765625, + "grad_norm_var": 0.04290364583333333, + "learning_rate": 0.0001, + "loss": 4.8761, + "loss/crossentropy": 1.925516963005066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795708477497101, + "step": 2010 + }, + { + "epoch": 0.04024, + "grad_norm": 2.6875, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 5.0729, + "loss/crossentropy": 1.947714388370514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27192793786525726, + "step": 2012 + }, + { + "epoch": 0.04028, + "grad_norm": 2.625, + "grad_norm_var": 0.038939412434895834, + "learning_rate": 0.0001, + "loss": 4.6214, + "loss/crossentropy": 1.9584010243415833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835424840450287, + "step": 2014 + }, + { + "epoch": 0.04032, + "grad_norm": 2.578125, + "grad_norm_var": 0.04108784993489583, + "learning_rate": 0.0001, + "loss": 5.1974, + "loss/crossentropy": 2.461808919906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3104698956012726, + "step": 2016 + }, + { + "epoch": 0.04036, + "grad_norm": 2.734375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 5.286, + "loss/crossentropy": 2.094545900821686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30010756850242615, + "step": 2018 + }, + { + "epoch": 0.0404, + "grad_norm": 2.78125, + "grad_norm_var": 0.021923828125, + "learning_rate": 0.0001, + "loss": 5.1274, + "loss/crossentropy": 2.353589177131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29775144159793854, + "step": 2020 + }, + { + "epoch": 0.04044, + "grad_norm": 3.375, + "grad_norm_var": 10.55537821451823, + "learning_rate": 0.0001, + "loss": 5.3359, + "loss/crossentropy": 2.4468252658843994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.34534354507923126, + "step": 2022 + }, + { + "epoch": 0.04048, + "grad_norm": 2.984375, + "grad_norm_var": 10.529678344726562, + "learning_rate": 0.0001, + "loss": 5.5028, + "loss/crossentropy": 2.2037755250930786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.303710475564003, + "step": 2024 + }, + { + "epoch": 0.04052, + "grad_norm": 2.71875, + "grad_norm_var": 10.546240234375, + "learning_rate": 0.0001, + "loss": 4.9229, + "loss/crossentropy": 1.9658318161964417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2829178273677826, + "step": 2026 + }, + { + "epoch": 0.04056, + "grad_norm": 2.734375, + "grad_norm_var": 10.555729166666667, + "learning_rate": 0.0001, + "loss": 4.8996, + "loss/crossentropy": 2.118351697921753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29469528794288635, + "step": 2028 + }, + { + "epoch": 0.0406, + "grad_norm": 2.875, + "grad_norm_var": 10.507957967122396, + "learning_rate": 0.0001, + "loss": 5.5817, + "loss/crossentropy": 2.172826111316681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35036201775074005, + "step": 2030 + }, + { + "epoch": 0.04064, + "grad_norm": 3.953125, + "grad_norm_var": 10.432124837239583, + "learning_rate": 0.0001, + "loss": 5.4846, + "loss/crossentropy": 2.185975730419159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3397497236728668, + "step": 2032 + }, + { + "epoch": 0.04068, + "grad_norm": 2.78125, + "grad_norm_var": 10.411026000976562, + "learning_rate": 0.0001, + "loss": 5.0145, + "loss/crossentropy": 2.043874442577362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3034070134162903, + "step": 2034 + }, + { + "epoch": 0.04072, + "grad_norm": 2.984375, + "grad_norm_var": 10.380106608072916, + "learning_rate": 0.0001, + "loss": 5.3958, + "loss/crossentropy": 2.3315287828445435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32198067009449005, + "step": 2036 + }, + { + "epoch": 0.04076, + "grad_norm": 2.828125, + "grad_norm_var": 0.1112213134765625, + "learning_rate": 0.0001, + "loss": 5.0266, + "loss/crossentropy": 2.05656898021698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30919137597084045, + "step": 2038 + }, + { + "epoch": 0.0408, + "grad_norm": 2.578125, + "grad_norm_var": 0.10188395182291667, + "learning_rate": 0.0001, + "loss": 5.3205, + "loss/crossentropy": 2.2451635599136353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3412973880767822, + "step": 2040 + }, + { + "epoch": 0.04084, + "grad_norm": 3.1875, + "grad_norm_var": 0.10715738932291667, + "learning_rate": 0.0001, + "loss": 5.1734, + "loss/crossentropy": 2.527924060821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3178148865699768, + "step": 2042 + }, + { + "epoch": 0.04088, + "grad_norm": 2.78125, + "grad_norm_var": 0.10305582682291667, + "learning_rate": 0.0001, + "loss": 4.8441, + "loss/crossentropy": 2.03126460313797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29549214243888855, + "step": 2044 + }, + { + "epoch": 0.04092, + "grad_norm": 2.515625, + "grad_norm_var": 0.11038004557291667, + "learning_rate": 0.0001, + "loss": 5.0134, + "loss/crossentropy": 2.029997706413269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27938202023506165, + "step": 2046 + }, + { + "epoch": 0.04096, + "grad_norm": 2.703125, + "grad_norm_var": 0.03211263020833333, + "learning_rate": 0.0001, + "loss": 4.9321, + "loss/crossentropy": 1.764098048210144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27712512016296387, + "step": 2048 + }, + { + "epoch": 0.041, + "grad_norm": 3.046875, + "grad_norm_var": 0.0356597900390625, + "learning_rate": 0.0001, + "loss": 5.435, + "loss/crossentropy": 2.605324864387512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3540754020214081, + "step": 2050 + }, + { + "epoch": 0.04104, + "grad_norm": 2.875, + "grad_norm_var": 0.0349761962890625, + "learning_rate": 0.0001, + "loss": 5.0207, + "loss/crossentropy": 1.9333613514900208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29670488089323044, + "step": 2052 + }, + { + "epoch": 0.04108, + "grad_norm": 3.078125, + "grad_norm_var": 0.040022786458333334, + "learning_rate": 0.0001, + "loss": 5.0056, + "loss/crossentropy": 1.7876797914505005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579014301300049, + "step": 2054 + }, + { + "epoch": 0.04112, + "grad_norm": 3.125, + "grad_norm_var": 0.0417877197265625, + "learning_rate": 0.0001, + "loss": 5.1401, + "loss/crossentropy": 2.0947588682174683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3334304690361023, + "step": 2056 + }, + { + "epoch": 0.04116, + "grad_norm": 3.0, + "grad_norm_var": 0.035965983072916666, + "learning_rate": 0.0001, + "loss": 4.9009, + "loss/crossentropy": 2.1838767528533936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.306907519698143, + "step": 2058 + }, + { + "epoch": 0.0412, + "grad_norm": 3.015625, + "grad_norm_var": 0.0341461181640625, + "learning_rate": 0.0001, + "loss": 5.3724, + "loss/crossentropy": 2.2180997133255005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.314627081155777, + "step": 2060 + }, + { + "epoch": 0.04124, + "grad_norm": 2.875, + "grad_norm_var": 0.025419108072916665, + "learning_rate": 0.0001, + "loss": 4.8362, + "loss/crossentropy": 1.914646863937378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2848198413848877, + "step": 2062 + }, + { + "epoch": 0.04128, + "grad_norm": 2.609375, + "grad_norm_var": 0.0262603759765625, + "learning_rate": 0.0001, + "loss": 5.4538, + "loss/crossentropy": 2.42458713054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31268230080604553, + "step": 2064 + }, + { + "epoch": 0.04132, + "grad_norm": 2.828125, + "grad_norm_var": 0.02457275390625, + "learning_rate": 0.0001, + "loss": 5.2497, + "loss/crossentropy": 2.23202121257782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30988384783267975, + "step": 2066 + }, + { + "epoch": 0.04136, + "grad_norm": 2.5625, + "grad_norm_var": 0.0284820556640625, + "learning_rate": 0.0001, + "loss": 5.0416, + "loss/crossentropy": 2.0225483179092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26953594386577606, + "step": 2068 + }, + { + "epoch": 0.0414, + "grad_norm": 2.875, + "grad_norm_var": 0.028669230143229165, + "learning_rate": 0.0001, + "loss": 4.8259, + "loss/crossentropy": 1.8593338131904602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27706706523895264, + "step": 2070 + }, + { + "epoch": 0.04144, + "grad_norm": 2.59375, + "grad_norm_var": 0.023387654622395834, + "learning_rate": 0.0001, + "loss": 4.9949, + "loss/crossentropy": 2.373727560043335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3000074476003647, + "step": 2072 + }, + { + "epoch": 0.04148, + "grad_norm": 2.578125, + "grad_norm_var": 0.022704060872395834, + "learning_rate": 0.0001, + "loss": 4.9438, + "loss/crossentropy": 1.959564983844757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26365046203136444, + "step": 2074 + }, + { + "epoch": 0.04152, + "grad_norm": 2.921875, + "grad_norm_var": 0.014159138997395833, + "learning_rate": 0.0001, + "loss": 4.983, + "loss/crossentropy": 2.0590676069259644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2816864550113678, + "step": 2076 + }, + { + "epoch": 0.04156, + "grad_norm": 2.84375, + "grad_norm_var": 0.017121378580729166, + "learning_rate": 0.0001, + "loss": 5.2049, + "loss/crossentropy": 2.147680163383484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29019051790237427, + "step": 2078 + }, + { + "epoch": 0.0416, + "grad_norm": 2.609375, + "grad_norm_var": 0.020198567708333334, + "learning_rate": 0.0001, + "loss": 5.6103, + "loss/crossentropy": 2.212267220020294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2804659754037857, + "step": 2080 + }, + { + "epoch": 0.04164, + "grad_norm": 2.65625, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 5.0489, + "loss/crossentropy": 2.144743025302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2946523129940033, + "step": 2082 + }, + { + "epoch": 0.04168, + "grad_norm": 2.703125, + "grad_norm_var": 0.019172159830729167, + "learning_rate": 0.0001, + "loss": 4.9311, + "loss/crossentropy": 2.3702481985092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2923210561275482, + "step": 2084 + }, + { + "epoch": 0.04172, + "grad_norm": 2.96875, + "grad_norm_var": 0.019684855143229166, + "learning_rate": 0.0001, + "loss": 5.2291, + "loss/crossentropy": 1.9512975811958313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28704553842544556, + "step": 2086 + }, + { + "epoch": 0.04176, + "grad_norm": 3.25, + "grad_norm_var": 0.25388895670572914, + "learning_rate": 0.0001, + "loss": 4.9572, + "loss/crossentropy": 2.180745005607605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28361976146698, + "step": 2088 + }, + { + "epoch": 0.0418, + "grad_norm": 2.671875, + "grad_norm_var": 0.24712626139322916, + "learning_rate": 0.0001, + "loss": 4.8579, + "loss/crossentropy": 1.9768275022506714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28042787313461304, + "step": 2090 + }, + { + "epoch": 0.04184, + "grad_norm": 2.90625, + "grad_norm_var": 0.24544270833333334, + "learning_rate": 0.0001, + "loss": 5.2602, + "loss/crossentropy": 2.148472547531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30099035799503326, + "step": 2092 + }, + { + "epoch": 0.04188, + "grad_norm": 2.734375, + "grad_norm_var": 0.24763081868489584, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.1698715686798096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3140450567007065, + "step": 2094 + }, + { + "epoch": 0.04192, + "grad_norm": 2.828125, + "grad_norm_var": 0.24172770182291667, + "learning_rate": 0.0001, + "loss": 4.8679, + "loss/crossentropy": 2.1142334938049316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2847675681114197, + "step": 2096 + }, + { + "epoch": 0.04196, + "grad_norm": 2.703125, + "grad_norm_var": 0.2395660400390625, + "learning_rate": 0.0001, + "loss": 5.2185, + "loss/crossentropy": 2.1908479928970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28702451288700104, + "step": 2098 + }, + { + "epoch": 0.042, + "grad_norm": 2.6875, + "grad_norm_var": 0.23321024576822916, + "learning_rate": 0.0001, + "loss": 5.0212, + "loss/crossentropy": 2.0519612431526184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29224735498428345, + "step": 2100 + }, + { + "epoch": 0.04204, + "grad_norm": 2.96875, + "grad_norm_var": 0.2412994384765625, + "learning_rate": 0.0001, + "loss": 4.871, + "loss/crossentropy": 1.9304961562156677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28785137832164764, + "step": 2102 + }, + { + "epoch": 0.04208, + "grad_norm": 2.765625, + "grad_norm_var": 0.0134185791015625, + "learning_rate": 0.0001, + "loss": 5.2462, + "loss/crossentropy": 2.297300934791565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.302143856883049, + "step": 2104 + }, + { + "epoch": 0.04212, + "grad_norm": 2.453125, + "grad_norm_var": 0.019782511393229167, + "learning_rate": 0.0001, + "loss": 5.0491, + "loss/crossentropy": 2.2764381170272827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28773219883441925, + "step": 2106 + }, + { + "epoch": 0.04216, + "grad_norm": 2.625, + "grad_norm_var": 0.019391886393229165, + "learning_rate": 0.0001, + "loss": 5.0563, + "loss/crossentropy": 2.141321837902069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3108212947845459, + "step": 2108 + }, + { + "epoch": 0.0422, + "grad_norm": 2.71875, + "grad_norm_var": 0.0185699462890625, + "learning_rate": 0.0001, + "loss": 5.0362, + "loss/crossentropy": 1.9619495272636414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2938811331987381, + "step": 2110 + }, + { + "epoch": 0.04224, + "grad_norm": 3.015625, + "grad_norm_var": 0.026325480143229166, + "learning_rate": 0.0001, + "loss": 5.4496, + "loss/crossentropy": 1.9741051197052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28835102915763855, + "step": 2112 + }, + { + "epoch": 0.04228, + "grad_norm": 2.546875, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 4.9303, + "loss/crossentropy": 1.9510936737060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2839510589838028, + "step": 2114 + }, + { + "epoch": 0.04232, + "grad_norm": 2.828125, + "grad_norm_var": 0.026753743489583332, + "learning_rate": 0.0001, + "loss": 5.2446, + "loss/crossentropy": 2.0201885104179382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30627067387104034, + "step": 2116 + }, + { + "epoch": 0.04236, + "grad_norm": 2.6875, + "grad_norm_var": 0.022493489583333335, + "learning_rate": 0.0001, + "loss": 5.1411, + "loss/crossentropy": 2.4522262811660767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.308579683303833, + "step": 2118 + }, + { + "epoch": 0.0424, + "grad_norm": 3.09375, + "grad_norm_var": 0.04345296223958333, + "learning_rate": 0.0001, + "loss": 5.5535, + "loss/crossentropy": 1.9289590120315552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29346515238285065, + "step": 2120 + }, + { + "epoch": 0.04244, + "grad_norm": 2.8125, + "grad_norm_var": 0.03437398274739583, + "learning_rate": 0.0001, + "loss": 5.0588, + "loss/crossentropy": 2.2020061016082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992282509803772, + "step": 2122 + }, + { + "epoch": 0.04248, + "grad_norm": 2.78125, + "grad_norm_var": 0.031494140625, + "learning_rate": 0.0001, + "loss": 5.2466, + "loss/crossentropy": 2.180301785469055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28323256969451904, + "step": 2124 + }, + { + "epoch": 0.04252, + "grad_norm": 2.78125, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 0.0001, + "loss": 4.9061, + "loss/crossentropy": 2.1250513792037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.272533118724823, + "step": 2126 + }, + { + "epoch": 0.04256, + "grad_norm": 2.78125, + "grad_norm_var": 0.037873331705729166, + "learning_rate": 0.0001, + "loss": 5.4375, + "loss/crossentropy": 2.3509981632232666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3719516545534134, + "step": 2128 + }, + { + "epoch": 0.0426, + "grad_norm": 3.078125, + "grad_norm_var": 0.03508707682291667, + "learning_rate": 0.0001, + "loss": 5.3067, + "loss/crossentropy": 2.135426163673401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3582882583141327, + "step": 2130 + }, + { + "epoch": 0.04264, + "grad_norm": 2.578125, + "grad_norm_var": 0.0398590087890625, + "learning_rate": 0.0001, + "loss": 5.2406, + "loss/crossentropy": 2.316452383995056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30162203311920166, + "step": 2132 + }, + { + "epoch": 0.04268, + "grad_norm": 2.765625, + "grad_norm_var": 0.03846028645833333, + "learning_rate": 0.0001, + "loss": 5.0372, + "loss/crossentropy": 2.0325432419776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.287256121635437, + "step": 2134 + }, + { + "epoch": 0.04272, + "grad_norm": 2.6875, + "grad_norm_var": 0.0236236572265625, + "learning_rate": 0.0001, + "loss": 5.1985, + "loss/crossentropy": 2.070056974887848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552843391895294, + "step": 2136 + }, + { + "epoch": 0.04276, + "grad_norm": 3.078125, + "grad_norm_var": 0.029157511393229165, + "learning_rate": 0.0001, + "loss": 5.0623, + "loss/crossentropy": 1.7005944848060608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24984879791736603, + "step": 2138 + }, + { + "epoch": 0.0428, + "grad_norm": 2.75, + "grad_norm_var": 0.03203125, + "learning_rate": 0.0001, + "loss": 5.0862, + "loss/crossentropy": 1.6700931787490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2572527676820755, + "step": 2140 + }, + { + "epoch": 0.04284, + "grad_norm": 2.65625, + "grad_norm_var": 0.03125, + "learning_rate": 0.0001, + "loss": 5.0186, + "loss/crossentropy": 2.3074774742126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31137382984161377, + "step": 2142 + }, + { + "epoch": 0.04288, + "grad_norm": 2.84375, + "grad_norm_var": 0.0229400634765625, + "learning_rate": 0.0001, + "loss": 5.1973, + "loss/crossentropy": 2.103408098220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30157215893268585, + "step": 2144 + }, + { + "epoch": 0.04292, + "grad_norm": 2.796875, + "grad_norm_var": 0.020979817708333334, + "learning_rate": 0.0001, + "loss": 4.8206, + "loss/crossentropy": 1.8602584600448608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28300634026527405, + "step": 2146 + }, + { + "epoch": 0.04296, + "grad_norm": 2.671875, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 5.0525, + "loss/crossentropy": 2.337582588195801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28086431324481964, + "step": 2148 + }, + { + "epoch": 0.043, + "grad_norm": 2.796875, + "grad_norm_var": 0.026439412434895834, + "learning_rate": 0.0001, + "loss": 5.2405, + "loss/crossentropy": 2.2635254859924316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3113311231136322, + "step": 2150 + }, + { + "epoch": 0.04304, + "grad_norm": 2.96875, + "grad_norm_var": 0.0277984619140625, + "learning_rate": 0.0001, + "loss": 5.187, + "loss/crossentropy": 2.3971948623657227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32285284996032715, + "step": 2152 + }, + { + "epoch": 0.04308, + "grad_norm": 2.875, + "grad_norm_var": 0.021891276041666668, + "learning_rate": 0.0001, + "loss": 5.1438, + "loss/crossentropy": 1.8900776505470276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28163351118564606, + "step": 2154 + }, + { + "epoch": 0.04312, + "grad_norm": 2.765625, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 4.9774, + "loss/crossentropy": 1.908443808555603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2901918590068817, + "step": 2156 + }, + { + "epoch": 0.04316, + "grad_norm": 2.75, + "grad_norm_var": 0.020409138997395833, + "learning_rate": 0.0001, + "loss": 4.7808, + "loss/crossentropy": 1.8003268837928772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2762569487094879, + "step": 2158 + }, + { + "epoch": 0.0432, + "grad_norm": 2.640625, + "grad_norm_var": 0.0221832275390625, + "learning_rate": 0.0001, + "loss": 5.0477, + "loss/crossentropy": 1.996739387512207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24626458436250687, + "step": 2160 + }, + { + "epoch": 0.04324, + "grad_norm": 2.84375, + "grad_norm_var": 0.018880208333333332, + "learning_rate": 0.0001, + "loss": 5.1737, + "loss/crossentropy": 2.0175461173057556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31059183180332184, + "step": 2162 + }, + { + "epoch": 0.04328, + "grad_norm": 2.6875, + "grad_norm_var": 0.019266764322916668, + "learning_rate": 0.0001, + "loss": 5.0448, + "loss/crossentropy": 2.0009909868240356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835536003112793, + "step": 2164 + }, + { + "epoch": 0.04332, + "grad_norm": 2.65625, + "grad_norm_var": 0.017967732747395833, + "learning_rate": 0.0001, + "loss": 4.9035, + "loss/crossentropy": 1.9848785400390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26657119393348694, + "step": 2166 + }, + { + "epoch": 0.04336, + "grad_norm": 2.734375, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 4.9108, + "loss/crossentropy": 2.076065957546234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2801993191242218, + "step": 2168 + }, + { + "epoch": 0.0434, + "grad_norm": 2.828125, + "grad_norm_var": 0.011617024739583334, + "learning_rate": 0.0001, + "loss": 5.1425, + "loss/crossentropy": 2.1208528876304626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3152369260787964, + "step": 2170 + }, + { + "epoch": 0.04344, + "grad_norm": 2.625, + "grad_norm_var": 0.011400349934895833, + "learning_rate": 0.0001, + "loss": 5.0608, + "loss/crossentropy": 2.1971306204795837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31046128273010254, + "step": 2172 + }, + { + "epoch": 0.04348, + "grad_norm": 2.734375, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 5.2445, + "loss/crossentropy": 2.275176525115967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3090529441833496, + "step": 2174 + }, + { + "epoch": 0.04352, + "grad_norm": 2.8125, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.9366, + "loss/crossentropy": 2.1574501395225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280165433883667, + "step": 2176 + }, + { + "epoch": 0.04356, + "grad_norm": 2.65625, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 5.2338, + "loss/crossentropy": 2.4236754179000854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3061629384756088, + "step": 2178 + }, + { + "epoch": 0.0436, + "grad_norm": 2.78125, + "grad_norm_var": 0.012809244791666667, + "learning_rate": 0.0001, + "loss": 5.1707, + "loss/crossentropy": 2.1282758712768555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3050261586904526, + "step": 2180 + }, + { + "epoch": 0.04364, + "grad_norm": 2.828125, + "grad_norm_var": 0.008561197916666667, + "learning_rate": 0.0001, + "loss": 5.4629, + "loss/crossentropy": 2.4244707822799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33464157581329346, + "step": 2182 + }, + { + "epoch": 0.04368, + "grad_norm": 2.640625, + "grad_norm_var": 0.010904947916666666, + "learning_rate": 0.0001, + "loss": 5.3891, + "loss/crossentropy": 2.289917469024658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3181813210248947, + "step": 2184 + }, + { + "epoch": 0.04372, + "grad_norm": 2.890625, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 5.4207, + "loss/crossentropy": 2.1540024280548096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.298043891787529, + "step": 2186 + }, + { + "epoch": 0.04376, + "grad_norm": 2.609375, + "grad_norm_var": 0.0145660400390625, + "learning_rate": 0.0001, + "loss": 4.8595, + "loss/crossentropy": 1.6615915298461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23485098034143448, + "step": 2188 + }, + { + "epoch": 0.0438, + "grad_norm": 2.796875, + "grad_norm_var": 0.01357421875, + "learning_rate": 0.0001, + "loss": 5.0595, + "loss/crossentropy": 2.352560341358185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2965056747198105, + "step": 2190 + }, + { + "epoch": 0.04384, + "grad_norm": 2.625, + "grad_norm_var": 0.0183746337890625, + "learning_rate": 0.0001, + "loss": 5.1463, + "loss/crossentropy": 2.0864007472991943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2859686613082886, + "step": 2192 + }, + { + "epoch": 0.04388, + "grad_norm": 2.765625, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 5.2599, + "loss/crossentropy": 1.8934992551803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24994677305221558, + "step": 2194 + }, + { + "epoch": 0.04392, + "grad_norm": 2.578125, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 4.9976, + "loss/crossentropy": 2.2395824193954468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2719078063964844, + "step": 2196 + }, + { + "epoch": 0.04396, + "grad_norm": 2.84375, + "grad_norm_var": 0.02515869140625, + "learning_rate": 0.0001, + "loss": 5.2631, + "loss/crossentropy": 2.089230954647064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759791761636734, + "step": 2198 + }, + { + "epoch": 0.044, + "grad_norm": 2.75, + "grad_norm_var": 0.023567708333333333, + "learning_rate": 0.0001, + "loss": 5.298, + "loss/crossentropy": 2.2770241498947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31005042791366577, + "step": 2200 + }, + { + "epoch": 0.04404, + "grad_norm": 2.46875, + "grad_norm_var": 0.028938802083333333, + "learning_rate": 0.0001, + "loss": 4.6842, + "loss/crossentropy": 2.067028760910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30712655186653137, + "step": 2202 + }, + { + "epoch": 0.04408, + "grad_norm": 2.46875, + "grad_norm_var": 0.026395670572916665, + "learning_rate": 0.0001, + "loss": 5.0557, + "loss/crossentropy": 2.4397774934768677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3508765548467636, + "step": 2204 + }, + { + "epoch": 0.04412, + "grad_norm": 2.890625, + "grad_norm_var": 0.028880818684895834, + "learning_rate": 0.0001, + "loss": 4.966, + "loss/crossentropy": 1.8136217594146729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495090439915657, + "step": 2206 + }, + { + "epoch": 0.04416, + "grad_norm": 2.71875, + "grad_norm_var": 0.022945149739583334, + "learning_rate": 0.0001, + "loss": 5.11, + "loss/crossentropy": 2.4620203971862793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31419822573661804, + "step": 2208 + }, + { + "epoch": 0.0442, + "grad_norm": 2.625, + "grad_norm_var": 0.020042928059895833, + "learning_rate": 0.0001, + "loss": 4.9756, + "loss/crossentropy": 1.8817986249923706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705220878124237, + "step": 2210 + }, + { + "epoch": 0.04424, + "grad_norm": 3.125, + "grad_norm_var": 0.02769775390625, + "learning_rate": 0.0001, + "loss": 5.0069, + "loss/crossentropy": 1.9593598246574402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.262384794652462, + "step": 2212 + }, + { + "epoch": 0.04428, + "grad_norm": 2.484375, + "grad_norm_var": 0.03303120930989583, + "learning_rate": 0.0001, + "loss": 4.9133, + "loss/crossentropy": 2.1003851294517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28965799510478973, + "step": 2214 + }, + { + "epoch": 0.04432, + "grad_norm": 3.46875, + "grad_norm_var": 0.07888081868489584, + "learning_rate": 0.0001, + "loss": 5.3243, + "loss/crossentropy": 2.23227858543396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3120953291654587, + "step": 2216 + }, + { + "epoch": 0.04436, + "grad_norm": 2.625, + "grad_norm_var": 0.0726226806640625, + "learning_rate": 0.0001, + "loss": 5.0901, + "loss/crossentropy": 1.8880399465560913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2740217447280884, + "step": 2218 + }, + { + "epoch": 0.0444, + "grad_norm": 2.5, + "grad_norm_var": 0.07111714680989584, + "learning_rate": 0.0001, + "loss": 4.9444, + "loss/crossentropy": 2.132355511188507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27536119520664215, + "step": 2220 + }, + { + "epoch": 0.04444, + "grad_norm": 2.578125, + "grad_norm_var": 0.07419331868489583, + "learning_rate": 0.0001, + "loss": 4.7325, + "loss/crossentropy": 1.831633746623993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26966987550258636, + "step": 2222 + }, + { + "epoch": 0.04448, + "grad_norm": 2.703125, + "grad_norm_var": 0.08056538899739583, + "learning_rate": 0.0001, + "loss": 5.004, + "loss/crossentropy": 2.066656529903412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27303647994995117, + "step": 2224 + }, + { + "epoch": 0.04452, + "grad_norm": 2.75, + "grad_norm_var": 0.0809234619140625, + "learning_rate": 0.0001, + "loss": 4.9265, + "loss/crossentropy": 2.1416667699813843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2781240791082382, + "step": 2226 + }, + { + "epoch": 0.04456, + "grad_norm": 2.671875, + "grad_norm_var": 0.07366536458333334, + "learning_rate": 0.0001, + "loss": 5.0701, + "loss/crossentropy": 1.7953566908836365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27528999745845795, + "step": 2228 + }, + { + "epoch": 0.0446, + "grad_norm": 2.84375, + "grad_norm_var": 0.06728515625, + "learning_rate": 0.0001, + "loss": 5.0182, + "loss/crossentropy": 2.1580333709716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2998732179403305, + "step": 2230 + }, + { + "epoch": 0.04464, + "grad_norm": 2.9375, + "grad_norm_var": 0.034326171875, + "learning_rate": 0.0001, + "loss": 5.3619, + "loss/crossentropy": 2.1685701608657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3457205891609192, + "step": 2232 + }, + { + "epoch": 0.04468, + "grad_norm": 2.578125, + "grad_norm_var": 0.03345438639322917, + "learning_rate": 0.0001, + "loss": 4.7602, + "loss/crossentropy": 1.9424286484718323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728651314973831, + "step": 2234 + }, + { + "epoch": 0.04472, + "grad_norm": 2.765625, + "grad_norm_var": 0.029736328125, + "learning_rate": 0.0001, + "loss": 5.2351, + "loss/crossentropy": 2.2802772521972656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30141082406044006, + "step": 2236 + }, + { + "epoch": 0.04476, + "grad_norm": 2.515625, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.8906, + "loss/crossentropy": 1.9490987062454224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2573640048503876, + "step": 2238 + }, + { + "epoch": 0.0448, + "grad_norm": 3.171875, + "grad_norm_var": 0.037679036458333336, + "learning_rate": 0.0001, + "loss": 5.2112, + "loss/crossentropy": 1.993924081325531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25628305971622467, + "step": 2240 + }, + { + "epoch": 0.04484, + "grad_norm": 2.6875, + "grad_norm_var": 0.0359283447265625, + "learning_rate": 0.0001, + "loss": 5.268, + "loss/crossentropy": 2.5151875019073486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.326447993516922, + "step": 2242 + }, + { + "epoch": 0.04488, + "grad_norm": 2.609375, + "grad_norm_var": 0.04641520182291667, + "learning_rate": 0.0001, + "loss": 5.0191, + "loss/crossentropy": 2.5175565481185913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3036232739686966, + "step": 2244 + }, + { + "epoch": 0.04492, + "grad_norm": 2.546875, + "grad_norm_var": 0.05388997395833333, + "learning_rate": 0.0001, + "loss": 4.8489, + "loss/crossentropy": 2.020721971988678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28087201714515686, + "step": 2246 + }, + { + "epoch": 0.04496, + "grad_norm": 2.96875, + "grad_norm_var": 0.045703125, + "learning_rate": 0.0001, + "loss": 5.6809, + "loss/crossentropy": 2.4800511598587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3100287467241287, + "step": 2248 + }, + { + "epoch": 0.045, + "grad_norm": 2.59375, + "grad_norm_var": 0.0449127197265625, + "learning_rate": 0.0001, + "loss": 4.9055, + "loss/crossentropy": 1.826172411441803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2782330811023712, + "step": 2250 + }, + { + "epoch": 0.04504, + "grad_norm": 2.828125, + "grad_norm_var": 0.04533589680989583, + "learning_rate": 0.0001, + "loss": 5.133, + "loss/crossentropy": 2.256316304206848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3070906698703766, + "step": 2252 + }, + { + "epoch": 0.04508, + "grad_norm": 2.765625, + "grad_norm_var": 0.041402180989583336, + "learning_rate": 0.0001, + "loss": 5.173, + "loss/crossentropy": 1.9046601057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2894355356693268, + "step": 2254 + }, + { + "epoch": 0.04512, + "grad_norm": 2.828125, + "grad_norm_var": 0.03288472493489583, + "learning_rate": 0.0001, + "loss": 5.0311, + "loss/crossentropy": 1.8359373211860657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705196440219879, + "step": 2256 + }, + { + "epoch": 0.04516, + "grad_norm": 2.96875, + "grad_norm_var": 0.0349273681640625, + "learning_rate": 0.0001, + "loss": 4.717, + "loss/crossentropy": 2.096512258052826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26635921001434326, + "step": 2258 + }, + { + "epoch": 0.0452, + "grad_norm": 2.984375, + "grad_norm_var": 0.031371053059895834, + "learning_rate": 0.0001, + "loss": 5.6736, + "loss/crossentropy": 2.4621278047561646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3513137400150299, + "step": 2260 + }, + { + "epoch": 0.04524, + "grad_norm": 2.828125, + "grad_norm_var": 0.020442708333333334, + "learning_rate": 0.0001, + "loss": 5.1413, + "loss/crossentropy": 1.8345229029655457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2414976954460144, + "step": 2262 + }, + { + "epoch": 0.04528, + "grad_norm": 2.65625, + "grad_norm_var": 0.026432291666666666, + "learning_rate": 0.0001, + "loss": 4.9746, + "loss/crossentropy": 2.24505877494812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31794628500938416, + "step": 2264 + }, + { + "epoch": 0.04532, + "grad_norm": 2.609375, + "grad_norm_var": 0.0285552978515625, + "learning_rate": 0.0001, + "loss": 5.1691, + "loss/crossentropy": 2.2141382694244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2925301343202591, + "step": 2266 + }, + { + "epoch": 0.04536, + "grad_norm": 3.1875, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 5.0559, + "loss/crossentropy": 2.1515613794326782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2843547910451889, + "step": 2268 + }, + { + "epoch": 0.0454, + "grad_norm": 2.765625, + "grad_norm_var": 0.03422749837239583, + "learning_rate": 0.0001, + "loss": 4.6899, + "loss/crossentropy": 2.1234883666038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981575280427933, + "step": 2270 + }, + { + "epoch": 0.04544, + "grad_norm": 2.65625, + "grad_norm_var": 0.03806864420572917, + "learning_rate": 0.0001, + "loss": 4.9871, + "loss/crossentropy": 2.1212490797042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.276496559381485, + "step": 2272 + }, + { + "epoch": 0.04548, + "grad_norm": 2.921875, + "grad_norm_var": 0.03574930826822917, + "learning_rate": 0.0001, + "loss": 5.2925, + "loss/crossentropy": 2.4330636262893677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2815839499235153, + "step": 2274 + }, + { + "epoch": 0.04552, + "grad_norm": 2.65625, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 0.0001, + "loss": 5.2874, + "loss/crossentropy": 2.110591411590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2832919806241989, + "step": 2276 + }, + { + "epoch": 0.04556, + "grad_norm": 2.875, + "grad_norm_var": 0.20754801432291667, + "learning_rate": 0.0001, + "loss": 5.0303, + "loss/crossentropy": 2.231989800930023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28183089196681976, + "step": 2278 + }, + { + "epoch": 0.0456, + "grad_norm": 2.84375, + "grad_norm_var": 0.194873046875, + "learning_rate": 0.0001, + "loss": 5.3746, + "loss/crossentropy": 2.1275558471679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29279497265815735, + "step": 2280 + }, + { + "epoch": 0.04564, + "grad_norm": 2.8125, + "grad_norm_var": 0.1891510009765625, + "learning_rate": 0.0001, + "loss": 5.2023, + "loss/crossentropy": 1.7988306283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625636160373688, + "step": 2282 + }, + { + "epoch": 0.04568, + "grad_norm": 2.65625, + "grad_norm_var": 0.18694254557291667, + "learning_rate": 0.0001, + "loss": 5.2017, + "loss/crossentropy": 2.3405990600585938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30855217576026917, + "step": 2284 + }, + { + "epoch": 0.04572, + "grad_norm": 2.828125, + "grad_norm_var": 0.18612874348958333, + "learning_rate": 0.0001, + "loss": 5.4582, + "loss/crossentropy": 2.2062121629714966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30755001306533813, + "step": 2286 + }, + { + "epoch": 0.04576, + "grad_norm": 2.5625, + "grad_norm_var": 0.18968098958333332, + "learning_rate": 0.0001, + "loss": 4.8984, + "loss/crossentropy": 1.9439310431480408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26844222843647003, + "step": 2288 + }, + { + "epoch": 0.0458, + "grad_norm": 2.78125, + "grad_norm_var": 0.19010416666666666, + "learning_rate": 0.0001, + "loss": 5.2097, + "loss/crossentropy": 2.3106162548065186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30821681022644043, + "step": 2290 + }, + { + "epoch": 0.04584, + "grad_norm": 2.640625, + "grad_norm_var": 0.19246317545572916, + "learning_rate": 0.0001, + "loss": 5.1401, + "loss/crossentropy": 2.3809561729431152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3307010233402252, + "step": 2292 + }, + { + "epoch": 0.04588, + "grad_norm": 2.703125, + "grad_norm_var": 0.01103515625, + "learning_rate": 0.0001, + "loss": 5.4066, + "loss/crossentropy": 2.209702253341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795914113521576, + "step": 2294 + }, + { + "epoch": 0.04592, + "grad_norm": 2.5625, + "grad_norm_var": 0.0158111572265625, + "learning_rate": 0.0001, + "loss": 4.8772, + "loss/crossentropy": 2.4084372520446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31208573281764984, + "step": 2296 + }, + { + "epoch": 0.04596, + "grad_norm": 2.6875, + "grad_norm_var": 0.0146392822265625, + "learning_rate": 0.0001, + "loss": 4.7757, + "loss/crossentropy": 1.9384723901748657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625032365322113, + "step": 2298 + }, + { + "epoch": 0.046, + "grad_norm": 2.78125, + "grad_norm_var": 0.013704427083333333, + "learning_rate": 0.0001, + "loss": 5.2455, + "loss/crossentropy": 2.150592088699341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.278359517455101, + "step": 2300 + }, + { + "epoch": 0.04604, + "grad_norm": 2.65625, + "grad_norm_var": 0.01256103515625, + "learning_rate": 0.0001, + "loss": 5.0788, + "loss/crossentropy": 1.8317970037460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2927433103322983, + "step": 2302 + }, + { + "epoch": 0.04608, + "grad_norm": 3.09375, + "grad_norm_var": 0.029564412434895833, + "learning_rate": 0.0001, + "loss": 5.091, + "loss/crossentropy": 2.323367118835449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2943577915430069, + "step": 2304 + }, + { + "epoch": 0.04612, + "grad_norm": 2.859375, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 5.5301, + "loss/crossentropy": 2.369907855987549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2990037202835083, + "step": 2306 + }, + { + "epoch": 0.04616, + "grad_norm": 2.921875, + "grad_norm_var": 0.04127197265625, + "learning_rate": 0.0001, + "loss": 4.7508, + "loss/crossentropy": 1.691443145275116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27280642092227936, + "step": 2308 + }, + { + "epoch": 0.0462, + "grad_norm": 2.71875, + "grad_norm_var": 0.0464508056640625, + "learning_rate": 0.0001, + "loss": 4.9413, + "loss/crossentropy": 2.2883838415145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3064710944890976, + "step": 2310 + }, + { + "epoch": 0.04624, + "grad_norm": 2.671875, + "grad_norm_var": 0.035380045572916664, + "learning_rate": 0.0001, + "loss": 5.4165, + "loss/crossentropy": 2.2042444944381714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3241504430770874, + "step": 2312 + }, + { + "epoch": 0.04628, + "grad_norm": 2.609375, + "grad_norm_var": 0.048680623372395836, + "learning_rate": 0.0001, + "loss": 4.6657, + "loss/crossentropy": 1.977793574333191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775610163807869, + "step": 2314 + }, + { + "epoch": 0.04632, + "grad_norm": 24.875, + "grad_norm_var": 30.570881144205728, + "learning_rate": 0.0001, + "loss": 5.8585, + "loss/crossentropy": 2.034530758857727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27170561254024506, + "step": 2316 + }, + { + "epoch": 0.04636, + "grad_norm": 2.875, + "grad_norm_var": 30.404881795247395, + "learning_rate": 0.0001, + "loss": 5.1565, + "loss/crossentropy": 2.439123511314392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2999258190393448, + "step": 2318 + }, + { + "epoch": 0.0464, + "grad_norm": 2.5, + "grad_norm_var": 30.530557250976564, + "learning_rate": 0.0001, + "loss": 4.8907, + "loss/crossentropy": 2.2600624561309814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2952795475721359, + "step": 2320 + }, + { + "epoch": 0.04644, + "grad_norm": 2.859375, + "grad_norm_var": 30.544131469726562, + "learning_rate": 0.0001, + "loss": 5.0521, + "loss/crossentropy": 2.144679367542267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28713342547416687, + "step": 2322 + }, + { + "epoch": 0.04648, + "grad_norm": 2.890625, + "grad_norm_var": 30.469155883789064, + "learning_rate": 0.0001, + "loss": 5.5054, + "loss/crossentropy": 2.34474778175354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292633980512619, + "step": 2324 + }, + { + "epoch": 0.04652, + "grad_norm": 2.609375, + "grad_norm_var": 30.491536458333332, + "learning_rate": 0.0001, + "loss": 4.5903, + "loss/crossentropy": 1.96743243932724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.265610933303833, + "step": 2326 + }, + { + "epoch": 0.04656, + "grad_norm": 2.53125, + "grad_norm_var": 30.524051920572916, + "learning_rate": 0.0001, + "loss": 5.2353, + "loss/crossentropy": 2.3895785808563232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3287513107061386, + "step": 2328 + }, + { + "epoch": 0.0466, + "grad_norm": 2.984375, + "grad_norm_var": 30.446451822916668, + "learning_rate": 0.0001, + "loss": 4.9017, + "loss/crossentropy": 2.0607098937034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30049796402454376, + "step": 2330 + }, + { + "epoch": 0.04664, + "grad_norm": 2.96875, + "grad_norm_var": 0.08322652180989583, + "learning_rate": 0.0001, + "loss": 5.1698, + "loss/crossentropy": 2.162124752998352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2983020693063736, + "step": 2332 + }, + { + "epoch": 0.04668, + "grad_norm": 2.890625, + "grad_norm_var": 0.05139567057291667, + "learning_rate": 0.0001, + "loss": 5.2702, + "loss/crossentropy": 2.34970760345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3322293907403946, + "step": 2334 + }, + { + "epoch": 0.04672, + "grad_norm": 2.703125, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 4.9763, + "loss/crossentropy": 1.9286972284317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752893418073654, + "step": 2336 + }, + { + "epoch": 0.04676, + "grad_norm": 2.828125, + "grad_norm_var": 0.04159749348958333, + "learning_rate": 0.0001, + "loss": 5.1677, + "loss/crossentropy": 2.182044267654419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30808278918266296, + "step": 2338 + }, + { + "epoch": 0.0468, + "grad_norm": 2.96875, + "grad_norm_var": 0.03916727701822917, + "learning_rate": 0.0001, + "loss": 5.28, + "loss/crossentropy": 2.0002610087394714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2944178581237793, + "step": 2340 + }, + { + "epoch": 0.04684, + "grad_norm": 2.609375, + "grad_norm_var": 0.034830729166666664, + "learning_rate": 0.0001, + "loss": 4.984, + "loss/crossentropy": 2.0721842646598816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2769011855125427, + "step": 2342 + }, + { + "epoch": 0.04688, + "grad_norm": 2.625, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.0911, + "loss/crossentropy": 1.9710460305213928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28619086742401123, + "step": 2344 + }, + { + "epoch": 0.04692, + "grad_norm": 3.28125, + "grad_norm_var": 1.54605712890625, + "learning_rate": 0.0001, + "loss": 5.5092, + "loss/crossentropy": 2.0506762266159058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31705181300640106, + "step": 2346 + }, + { + "epoch": 0.04696, + "grad_norm": 2.609375, + "grad_norm_var": 1.5601064046223958, + "learning_rate": 0.0001, + "loss": 5.0045, + "loss/crossentropy": 2.0095930695533752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2831149846315384, + "step": 2348 + }, + { + "epoch": 0.047, + "grad_norm": 2.890625, + "grad_norm_var": 1.573631795247396, + "learning_rate": 0.0001, + "loss": 5.19, + "loss/crossentropy": 2.023163616657257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2813292294740677, + "step": 2350 + }, + { + "epoch": 0.04704, + "grad_norm": 2.671875, + "grad_norm_var": 1.561424763997396, + "learning_rate": 0.0001, + "loss": 5.2907, + "loss/crossentropy": 2.230435371398926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30891451239585876, + "step": 2352 + }, + { + "epoch": 0.04708, + "grad_norm": 3.0625, + "grad_norm_var": 1.5700154622395834, + "learning_rate": 0.0001, + "loss": 4.9261, + "loss/crossentropy": 2.1553521156311035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29839709401130676, + "step": 2354 + }, + { + "epoch": 0.04712, + "grad_norm": 2.96875, + "grad_norm_var": 1.5770792643229166, + "learning_rate": 0.0001, + "loss": 5.2553, + "loss/crossentropy": 2.175648272037506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972148358821869, + "step": 2356 + }, + { + "epoch": 0.04716, + "grad_norm": 2.703125, + "grad_norm_var": 1.5980377197265625, + "learning_rate": 0.0001, + "loss": 5.0436, + "loss/crossentropy": 2.3852503299713135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3166612535715103, + "step": 2358 + }, + { + "epoch": 0.0472, + "grad_norm": 2.75, + "grad_norm_var": 1.5826456705729166, + "learning_rate": 0.0001, + "loss": 5.1827, + "loss/crossentropy": 2.1905999183654785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29487256705760956, + "step": 2360 + }, + { + "epoch": 0.04724, + "grad_norm": 2.5625, + "grad_norm_var": 0.0413482666015625, + "learning_rate": 0.0001, + "loss": 4.9295, + "loss/crossentropy": 1.9224759340286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27821336686611176, + "step": 2362 + }, + { + "epoch": 0.04728, + "grad_norm": 3.109375, + "grad_norm_var": 0.04057515462239583, + "learning_rate": 0.0001, + "loss": 5.1078, + "loss/crossentropy": 2.5025261640548706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3345927745103836, + "step": 2364 + }, + { + "epoch": 0.04732, + "grad_norm": 2.640625, + "grad_norm_var": 0.04006245930989583, + "learning_rate": 0.0001, + "loss": 5.0502, + "loss/crossentropy": 2.2385451793670654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27095621824264526, + "step": 2366 + }, + { + "epoch": 0.04736, + "grad_norm": 4.5, + "grad_norm_var": 0.22924702962239582, + "learning_rate": 0.0001, + "loss": 5.2761, + "loss/crossentropy": 2.0266553163528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2856762409210205, + "step": 2368 + }, + { + "epoch": 0.0474, + "grad_norm": 3.359375, + "grad_norm_var": 0.24001363118489583, + "learning_rate": 0.0001, + "loss": 5.4918, + "loss/crossentropy": 2.5139355659484863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3186872750520706, + "step": 2370 + }, + { + "epoch": 0.04744, + "grad_norm": 2.609375, + "grad_norm_var": 0.24321187337239583, + "learning_rate": 0.0001, + "loss": 5.1302, + "loss/crossentropy": 2.066476881504059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2889470160007477, + "step": 2372 + }, + { + "epoch": 0.04748, + "grad_norm": 2.671875, + "grad_norm_var": 0.23772786458333334, + "learning_rate": 0.0001, + "loss": 5.0031, + "loss/crossentropy": 2.0537307262420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31795741617679596, + "step": 2374 + }, + { + "epoch": 0.04752, + "grad_norm": 2.546875, + "grad_norm_var": 0.24492085774739583, + "learning_rate": 0.0001, + "loss": 4.9922, + "loss/crossentropy": 1.9254986643791199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25411880016326904, + "step": 2376 + }, + { + "epoch": 0.04756, + "grad_norm": 2.671875, + "grad_norm_var": 0.23855692545572918, + "learning_rate": 0.0001, + "loss": 5.0284, + "loss/crossentropy": 2.221043348312378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28534361720085144, + "step": 2378 + }, + { + "epoch": 0.0476, + "grad_norm": 2.90625, + "grad_norm_var": 0.228271484375, + "learning_rate": 0.0001, + "loss": 5.2106, + "loss/crossentropy": 2.3516281843185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3128499984741211, + "step": 2380 + }, + { + "epoch": 0.04764, + "grad_norm": 2.734375, + "grad_norm_var": 0.22349853515625, + "learning_rate": 0.0001, + "loss": 5.6266, + "loss/crossentropy": 2.2144338488578796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3071902245283127, + "step": 2382 + }, + { + "epoch": 0.04768, + "grad_norm": 2.625, + "grad_norm_var": 0.0567291259765625, + "learning_rate": 0.0001, + "loss": 5.2429, + "loss/crossentropy": 2.3324203491210938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.322970449924469, + "step": 2384 + }, + { + "epoch": 0.04772, + "grad_norm": 2.546875, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 4.7732, + "loss/crossentropy": 2.08349871635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2774003893136978, + "step": 2386 + }, + { + "epoch": 0.04776, + "grad_norm": 2.4375, + "grad_norm_var": 0.03902587890625, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 1.9565780758857727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29790589213371277, + "step": 2388 + }, + { + "epoch": 0.0478, + "grad_norm": 2.625, + "grad_norm_var": 0.022782389322916666, + "learning_rate": 0.0001, + "loss": 5.0176, + "loss/crossentropy": 2.261398434638977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3143853694200516, + "step": 2390 + }, + { + "epoch": 0.04784, + "grad_norm": 2.96875, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 5.0688, + "loss/crossentropy": 1.9077460169792175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25433091819286346, + "step": 2392 + }, + { + "epoch": 0.04788, + "grad_norm": 2.65625, + "grad_norm_var": 0.0240142822265625, + "learning_rate": 0.0001, + "loss": 4.9531, + "loss/crossentropy": 1.9948468208312988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598777562379837, + "step": 2394 + }, + { + "epoch": 0.04792, + "grad_norm": 2.9375, + "grad_norm_var": 0.45455729166666664, + "learning_rate": 0.0001, + "loss": 5.0972, + "loss/crossentropy": 2.1177526116371155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775426208972931, + "step": 2396 + }, + { + "epoch": 0.04796, + "grad_norm": 2.5, + "grad_norm_var": 0.4634348551432292, + "learning_rate": 0.0001, + "loss": 4.8571, + "loss/crossentropy": 2.1756062507629395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33678852021694183, + "step": 2398 + }, + { + "epoch": 0.048, + "grad_norm": 2.984375, + "grad_norm_var": 0.4576171875, + "learning_rate": 0.0001, + "loss": 5.2617, + "loss/crossentropy": 2.0923725366592407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3255026638507843, + "step": 2400 + }, + { + "epoch": 0.04804, + "grad_norm": 2.84375, + "grad_norm_var": 0.4471181233723958, + "learning_rate": 0.0001, + "loss": 4.9421, + "loss/crossentropy": 1.9236682653427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642124071717262, + "step": 2402 + }, + { + "epoch": 0.04808, + "grad_norm": 2.84375, + "grad_norm_var": 0.43757222493489584, + "learning_rate": 0.0001, + "loss": 5.0604, + "loss/crossentropy": 2.1742242574691772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30006398260593414, + "step": 2404 + }, + { + "epoch": 0.04812, + "grad_norm": 2.6875, + "grad_norm_var": 0.43835347493489585, + "learning_rate": 0.0001, + "loss": 4.7077, + "loss/crossentropy": 1.7445701956748962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25791122019290924, + "step": 2406 + }, + { + "epoch": 0.04816, + "grad_norm": 4.34375, + "grad_norm_var": 0.5614735921223958, + "learning_rate": 0.0001, + "loss": 5.1289, + "loss/crossentropy": 1.8616467714309692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532489001750946, + "step": 2408 + }, + { + "epoch": 0.0482, + "grad_norm": 3.203125, + "grad_norm_var": 0.5608723958333334, + "learning_rate": 0.0001, + "loss": 5.0486, + "loss/crossentropy": 1.9146783351898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2659924626350403, + "step": 2410 + }, + { + "epoch": 0.04824, + "grad_norm": 2.859375, + "grad_norm_var": 0.21818745930989583, + "learning_rate": 0.0001, + "loss": 5.0972, + "loss/crossentropy": 2.179564118385315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29064056277275085, + "step": 2412 + }, + { + "epoch": 0.04828, + "grad_norm": 2.65625, + "grad_norm_var": 0.21357421875, + "learning_rate": 0.0001, + "loss": 4.9578, + "loss/crossentropy": 2.04409658908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27891072630882263, + "step": 2414 + }, + { + "epoch": 0.04832, + "grad_norm": 2.625, + "grad_norm_var": 0.22568257649739584, + "learning_rate": 0.0001, + "loss": 4.8833, + "loss/crossentropy": 2.590337038040161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3158426731824875, + "step": 2416 + }, + { + "epoch": 0.04836, + "grad_norm": 2.625, + "grad_norm_var": 0.23155924479166667, + "learning_rate": 0.0001, + "loss": 4.6919, + "loss/crossentropy": 1.8753941059112549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2669401317834854, + "step": 2418 + }, + { + "epoch": 0.0484, + "grad_norm": 2.734375, + "grad_norm_var": 0.23361714680989584, + "learning_rate": 0.0001, + "loss": 5.0231, + "loss/crossentropy": 2.1412659287452698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2839512526988983, + "step": 2420 + }, + { + "epoch": 0.04844, + "grad_norm": 2.65625, + "grad_norm_var": 0.23371988932291668, + "learning_rate": 0.0001, + "loss": 5.1187, + "loss/crossentropy": 2.545991063117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3331379294395447, + "step": 2422 + }, + { + "epoch": 0.04848, + "grad_norm": 3.25, + "grad_norm_var": 0.089599609375, + "learning_rate": 0.0001, + "loss": 5.1246, + "loss/crossentropy": 2.12838077545166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3043065369129181, + "step": 2424 + }, + { + "epoch": 0.04852, + "grad_norm": 2.953125, + "grad_norm_var": 0.03660380045572917, + "learning_rate": 0.0001, + "loss": 5.3821, + "loss/crossentropy": 2.1983221769332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2846619784832001, + "step": 2426 + }, + { + "epoch": 0.04856, + "grad_norm": 3.0625, + "grad_norm_var": 0.03819986979166667, + "learning_rate": 0.0001, + "loss": 5.1044, + "loss/crossentropy": 2.241136312484741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30275705456733704, + "step": 2428 + }, + { + "epoch": 0.0486, + "grad_norm": 2.890625, + "grad_norm_var": 0.03831278483072917, + "learning_rate": 0.0001, + "loss": 5.203, + "loss/crossentropy": 2.097459554672241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2986721396446228, + "step": 2430 + }, + { + "epoch": 0.04864, + "grad_norm": 4.4375, + "grad_norm_var": 0.20159403483072916, + "learning_rate": 0.0001, + "loss": 5.1158, + "loss/crossentropy": 2.333081007003784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2975248098373413, + "step": 2432 + }, + { + "epoch": 0.04868, + "grad_norm": 2.625, + "grad_norm_var": 0.19758707682291668, + "learning_rate": 0.0001, + "loss": 4.9183, + "loss/crossentropy": 2.3171510696411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28452740609645844, + "step": 2434 + }, + { + "epoch": 0.04872, + "grad_norm": 2.375, + "grad_norm_var": 0.2141754150390625, + "learning_rate": 0.0001, + "loss": 4.8853, + "loss/crossentropy": 1.8334497213363647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25271379947662354, + "step": 2436 + }, + { + "epoch": 0.04876, + "grad_norm": 2.78125, + "grad_norm_var": 0.20258687337239584, + "learning_rate": 0.0001, + "loss": 5.3809, + "loss/crossentropy": 2.2712661027908325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30827929079532623, + "step": 2438 + }, + { + "epoch": 0.0488, + "grad_norm": 3.3125, + "grad_norm_var": 0.20465087890625, + "learning_rate": 0.0001, + "loss": 5.0863, + "loss/crossentropy": 2.160263180732727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3078690320253372, + "step": 2440 + }, + { + "epoch": 0.04884, + "grad_norm": 2.96875, + "grad_norm_var": 0.20429585774739584, + "learning_rate": 0.0001, + "loss": 5.2224, + "loss/crossentropy": 2.071319878101349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2733730524778366, + "step": 2442 + }, + { + "epoch": 0.04888, + "grad_norm": 2.796875, + "grad_norm_var": 0.203076171875, + "learning_rate": 0.0001, + "loss": 5.1476, + "loss/crossentropy": 2.0742560029029846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835587412118912, + "step": 2444 + }, + { + "epoch": 0.04892, + "grad_norm": 2.765625, + "grad_norm_var": 0.24807535807291667, + "learning_rate": 0.0001, + "loss": 5.0211, + "loss/crossentropy": 1.8836837410926819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3089100867509842, + "step": 2446 + }, + { + "epoch": 0.04896, + "grad_norm": 2.875, + "grad_norm_var": 0.177099609375, + "learning_rate": 0.0001, + "loss": 4.9537, + "loss/crossentropy": 1.8539315462112427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697101980447769, + "step": 2448 + }, + { + "epoch": 0.049, + "grad_norm": 2.828125, + "grad_norm_var": 0.17392578125, + "learning_rate": 0.0001, + "loss": 5.1907, + "loss/crossentropy": 2.219490647315979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2898380011320114, + "step": 2450 + }, + { + "epoch": 0.04904, + "grad_norm": 3.1875, + "grad_norm_var": 0.13853251139322917, + "learning_rate": 0.0001, + "loss": 5.468, + "loss/crossentropy": 2.328765392303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.4013051837682724, + "step": 2452 + }, + { + "epoch": 0.04908, + "grad_norm": 2.5625, + "grad_norm_var": 0.14879557291666667, + "learning_rate": 0.0001, + "loss": 4.8967, + "loss/crossentropy": 1.9204192161560059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25281261652708054, + "step": 2454 + }, + { + "epoch": 0.04912, + "grad_norm": 2.765625, + "grad_norm_var": 0.15563151041666667, + "learning_rate": 0.0001, + "loss": 5.0935, + "loss/crossentropy": 2.377043604850769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2928486764431, + "step": 2456 + }, + { + "epoch": 0.04916, + "grad_norm": 2.796875, + "grad_norm_var": 0.15930989583333333, + "learning_rate": 0.0001, + "loss": 5.4528, + "loss/crossentropy": 2.4364209175109863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3111976683139801, + "step": 2458 + }, + { + "epoch": 0.0492, + "grad_norm": 2.703125, + "grad_norm_var": 0.15985921223958333, + "learning_rate": 0.0001, + "loss": 5.1357, + "loss/crossentropy": 2.3738330602645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019126206636429, + "step": 2460 + }, + { + "epoch": 0.04924, + "grad_norm": 2.765625, + "grad_norm_var": 0.1141265869140625, + "learning_rate": 0.0001, + "loss": 5.2876, + "loss/crossentropy": 2.4575772285461426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3428986072540283, + "step": 2462 + }, + { + "epoch": 0.04928, + "grad_norm": 2.734375, + "grad_norm_var": 0.028888956705729166, + "learning_rate": 0.0001, + "loss": 4.6505, + "loss/crossentropy": 1.9849627017974854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27611130475997925, + "step": 2464 + }, + { + "epoch": 0.04932, + "grad_norm": 2.625, + "grad_norm_var": 0.030402628580729167, + "learning_rate": 0.0001, + "loss": 4.8482, + "loss/crossentropy": 1.9617170691490173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.257377490401268, + "step": 2466 + }, + { + "epoch": 0.04936, + "grad_norm": 2.640625, + "grad_norm_var": 0.01871337890625, + "learning_rate": 0.0001, + "loss": 5.1739, + "loss/crossentropy": 2.4031273126602173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28843145072460175, + "step": 2468 + }, + { + "epoch": 0.0494, + "grad_norm": 2.484375, + "grad_norm_var": 0.0165435791015625, + "learning_rate": 0.0001, + "loss": 4.8452, + "loss/crossentropy": 1.7263792753219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25591571629047394, + "step": 2470 + }, + { + "epoch": 0.04944, + "grad_norm": 2.734375, + "grad_norm_var": 0.0191802978515625, + "learning_rate": 0.0001, + "loss": 4.7154, + "loss/crossentropy": 2.106898784637451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280623197555542, + "step": 2472 + }, + { + "epoch": 0.04948, + "grad_norm": 2.6875, + "grad_norm_var": 0.018358357747395835, + "learning_rate": 0.0001, + "loss": 5.0092, + "loss/crossentropy": 2.325208902359009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27794161438941956, + "step": 2474 + }, + { + "epoch": 0.04952, + "grad_norm": 2.65625, + "grad_norm_var": 0.013736979166666666, + "learning_rate": 0.0001, + "loss": 5.1128, + "loss/crossentropy": 2.367414712905884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31346653401851654, + "step": 2476 + }, + { + "epoch": 0.04956, + "grad_norm": 2.59375, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 5.2611, + "loss/crossentropy": 2.191115140914917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30560287833213806, + "step": 2478 + }, + { + "epoch": 0.0496, + "grad_norm": 2.765625, + "grad_norm_var": 0.012430826822916666, + "learning_rate": 0.0001, + "loss": 4.9993, + "loss/crossentropy": 2.2559624314308167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3029457628726959, + "step": 2480 + }, + { + "epoch": 0.04964, + "grad_norm": 3.390625, + "grad_norm_var": 0.04160868326822917, + "learning_rate": 0.0001, + "loss": 5.125, + "loss/crossentropy": 2.0088382363319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2923784404993057, + "step": 2482 + }, + { + "epoch": 0.04968, + "grad_norm": 2.71875, + "grad_norm_var": 0.03943583170572917, + "learning_rate": 0.0001, + "loss": 4.8858, + "loss/crossentropy": 1.8445329070091248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2587483897805214, + "step": 2484 + }, + { + "epoch": 0.04972, + "grad_norm": 2.8125, + "grad_norm_var": 0.03534749348958333, + "learning_rate": 0.0001, + "loss": 4.7918, + "loss/crossentropy": 2.015140950679779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27480585873126984, + "step": 2486 + }, + { + "epoch": 0.04976, + "grad_norm": 2.765625, + "grad_norm_var": 0.030833943684895834, + "learning_rate": 0.0001, + "loss": 5.1959, + "loss/crossentropy": 1.918801188468933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28786011040210724, + "step": 2488 + }, + { + "epoch": 0.0498, + "grad_norm": 2.734375, + "grad_norm_var": 0.030269368489583334, + "learning_rate": 0.0001, + "loss": 5.0108, + "loss/crossentropy": 1.9899121522903442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25822708010673523, + "step": 2490 + }, + { + "epoch": 0.04984, + "grad_norm": 2.59375, + "grad_norm_var": 0.037206013997395836, + "learning_rate": 0.0001, + "loss": 4.7259, + "loss/crossentropy": 2.3775535821914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31345370411872864, + "step": 2492 + }, + { + "epoch": 0.04988, + "grad_norm": 2.71875, + "grad_norm_var": 0.03791402180989583, + "learning_rate": 0.0001, + "loss": 4.9496, + "loss/crossentropy": 2.0874632596969604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2836592495441437, + "step": 2494 + }, + { + "epoch": 0.04992, + "grad_norm": 2.78125, + "grad_norm_var": 0.046019490559895834, + "learning_rate": 0.0001, + "loss": 5.2404, + "loss/crossentropy": 2.226976454257965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27826404571533203, + "step": 2496 + }, + { + "epoch": 0.04996, + "grad_norm": 2.71875, + "grad_norm_var": 0.023900349934895832, + "learning_rate": 0.0001, + "loss": 5.1503, + "loss/crossentropy": 2.4569294452667236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31981319189071655, + "step": 2498 + }, + { + "epoch": 0.05, + "grad_norm": 2.703125, + "grad_norm_var": 0.025581868489583333, + "learning_rate": 0.0001, + "loss": 4.922, + "loss/crossentropy": 2.1134212017059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28260529041290283, + "step": 2500 + }, + { + "epoch": 0.05004, + "grad_norm": 3.0, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 5.3555, + "loss/crossentropy": 2.2914888858795166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3182682394981384, + "step": 2502 + }, + { + "epoch": 0.05008, + "grad_norm": 2.65625, + "grad_norm_var": 0.030078125, + "learning_rate": 0.0001, + "loss": 4.9644, + "loss/crossentropy": 2.3261003494262695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30558250844478607, + "step": 2504 + }, + { + "epoch": 0.05012, + "grad_norm": 2.703125, + "grad_norm_var": 0.030598958333333332, + "learning_rate": 0.0001, + "loss": 4.9517, + "loss/crossentropy": 2.351989507675171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2934701144695282, + "step": 2506 + }, + { + "epoch": 0.05016, + "grad_norm": 2.5625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.71, + "loss/crossentropy": 1.8742690086364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26305729895830154, + "step": 2508 + }, + { + "epoch": 0.0502, + "grad_norm": 2.78125, + "grad_norm_var": 0.0247222900390625, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.1668856143951416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728075534105301, + "step": 2510 + }, + { + "epoch": 0.05024, + "grad_norm": 2.734375, + "grad_norm_var": 0.0137115478515625, + "learning_rate": 0.0001, + "loss": 4.7171, + "loss/crossentropy": 1.773424208164215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26808495819568634, + "step": 2512 + }, + { + "epoch": 0.05028, + "grad_norm": 2.59375, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 4.9224, + "loss/crossentropy": 1.7541643977165222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2600491940975189, + "step": 2514 + }, + { + "epoch": 0.05032, + "grad_norm": 2.796875, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 5.129, + "loss/crossentropy": 1.9693496227264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844541072845459, + "step": 2516 + }, + { + "epoch": 0.05036, + "grad_norm": 2.765625, + "grad_norm_var": 0.009065755208333333, + "learning_rate": 0.0001, + "loss": 4.9202, + "loss/crossentropy": 1.7539461851119995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542608380317688, + "step": 2518 + }, + { + "epoch": 0.0504, + "grad_norm": 2.984375, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 5.0686, + "loss/crossentropy": 2.155138313770294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3023010194301605, + "step": 2520 + }, + { + "epoch": 0.05044, + "grad_norm": 2.609375, + "grad_norm_var": 0.018619791666666666, + "learning_rate": 0.0001, + "loss": 4.9674, + "loss/crossentropy": 2.069350838661194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26573850214481354, + "step": 2522 + }, + { + "epoch": 0.05048, + "grad_norm": 2.5, + "grad_norm_var": 0.022101847330729167, + "learning_rate": 0.0001, + "loss": 5.0295, + "loss/crossentropy": 2.1864534616470337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27845603227615356, + "step": 2524 + }, + { + "epoch": 0.05052, + "grad_norm": 2.5, + "grad_norm_var": 0.023509724934895834, + "learning_rate": 0.0001, + "loss": 5.1173, + "loss/crossentropy": 2.2462135553359985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29221346974372864, + "step": 2526 + }, + { + "epoch": 0.05056, + "grad_norm": 2.78125, + "grad_norm_var": 0.030492146809895832, + "learning_rate": 0.0001, + "loss": 5.0027, + "loss/crossentropy": 2.043266773223877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25857964158058167, + "step": 2528 + }, + { + "epoch": 0.0506, + "grad_norm": 2.65625, + "grad_norm_var": 0.0299957275390625, + "learning_rate": 0.0001, + "loss": 4.9672, + "loss/crossentropy": 1.8892702460289001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2879898101091385, + "step": 2530 + }, + { + "epoch": 0.05064, + "grad_norm": 2.46875, + "grad_norm_var": 0.03238525390625, + "learning_rate": 0.0001, + "loss": 4.5332, + "loss/crossentropy": 1.9220558404922485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26652154326438904, + "step": 2532 + }, + { + "epoch": 0.05068, + "grad_norm": 2.515625, + "grad_norm_var": 0.03430989583333333, + "learning_rate": 0.0001, + "loss": 4.4176, + "loss/crossentropy": 1.7282914519309998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2367371916770935, + "step": 2534 + }, + { + "epoch": 0.05072, + "grad_norm": 2.78125, + "grad_norm_var": 0.021092732747395832, + "learning_rate": 0.0001, + "loss": 4.9589, + "loss/crossentropy": 1.729803204536438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25624871999025345, + "step": 2536 + }, + { + "epoch": 0.05076, + "grad_norm": 2.578125, + "grad_norm_var": 0.020873006184895834, + "learning_rate": 0.0001, + "loss": 4.6952, + "loss/crossentropy": 2.0921449661254883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855897545814514, + "step": 2538 + }, + { + "epoch": 0.0508, + "grad_norm": 2.6875, + "grad_norm_var": 0.015404256184895833, + "learning_rate": 0.0001, + "loss": 4.8967, + "loss/crossentropy": 2.0569751858711243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2829667925834656, + "step": 2540 + }, + { + "epoch": 0.05084, + "grad_norm": 2.609375, + "grad_norm_var": 0.015973917643229165, + "learning_rate": 0.0001, + "loss": 5.2438, + "loss/crossentropy": 1.983904242515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30939212441444397, + "step": 2542 + }, + { + "epoch": 0.05088, + "grad_norm": 2.703125, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.9968, + "loss/crossentropy": 2.2631462812423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27867285907268524, + "step": 2544 + }, + { + "epoch": 0.05092, + "grad_norm": 3.078125, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 4.9621, + "loss/crossentropy": 1.9918989539146423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27703428268432617, + "step": 2546 + }, + { + "epoch": 0.05096, + "grad_norm": 2.78125, + "grad_norm_var": 0.021110026041666667, + "learning_rate": 0.0001, + "loss": 5.2041, + "loss/crossentropy": 2.1356931924819946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27815964818000793, + "step": 2548 + }, + { + "epoch": 0.051, + "grad_norm": 2.46875, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 4.6514, + "loss/crossentropy": 2.172751545906067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2971910834312439, + "step": 2550 + }, + { + "epoch": 0.05104, + "grad_norm": 2.578125, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 5.0178, + "loss/crossentropy": 2.0799094438552856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30459292232990265, + "step": 2552 + }, + { + "epoch": 0.05108, + "grad_norm": 2.5, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 4.8235, + "loss/crossentropy": 1.7769129872322083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2526697665452957, + "step": 2554 + }, + { + "epoch": 0.05112, + "grad_norm": 2.8125, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 4.8457, + "loss/crossentropy": 2.044790804386139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2821648418903351, + "step": 2556 + }, + { + "epoch": 0.05116, + "grad_norm": 2.921875, + "grad_norm_var": 0.0287109375, + "learning_rate": 0.0001, + "loss": 5.5336, + "loss/crossentropy": 2.3708614110946655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30810464918613434, + "step": 2558 + }, + { + "epoch": 0.0512, + "grad_norm": 2.71875, + "grad_norm_var": 0.028709920247395833, + "learning_rate": 0.0001, + "loss": 5.2385, + "loss/crossentropy": 2.2216718196868896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2919304668903351, + "step": 2560 + }, + { + "epoch": 0.05124, + "grad_norm": 2.671875, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 5.28, + "loss/crossentropy": 2.4692097902297974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30620162189006805, + "step": 2562 + }, + { + "epoch": 0.05128, + "grad_norm": 2.984375, + "grad_norm_var": 0.026936848958333332, + "learning_rate": 0.0001, + "loss": 4.9476, + "loss/crossentropy": 2.0491623282432556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25617313385009766, + "step": 2564 + }, + { + "epoch": 0.05132, + "grad_norm": 2.8125, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 4.958, + "loss/crossentropy": 1.8305597305297852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25237561762332916, + "step": 2566 + }, + { + "epoch": 0.05136, + "grad_norm": 2.640625, + "grad_norm_var": 0.0223297119140625, + "learning_rate": 0.0001, + "loss": 4.9853, + "loss/crossentropy": 1.9471853971481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2712964415550232, + "step": 2568 + }, + { + "epoch": 0.0514, + "grad_norm": 2.765625, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 5.0932, + "loss/crossentropy": 2.575412631034851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106851130723953, + "step": 2570 + }, + { + "epoch": 0.05144, + "grad_norm": 2.515625, + "grad_norm_var": 0.020686848958333334, + "learning_rate": 0.0001, + "loss": 4.6755, + "loss/crossentropy": 2.0210241079330444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269680991768837, + "step": 2572 + }, + { + "epoch": 0.05148, + "grad_norm": 2.4375, + "grad_norm_var": 0.02076416015625, + "learning_rate": 0.0001, + "loss": 4.6308, + "loss/crossentropy": 1.9054389595985413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24933087825775146, + "step": 2574 + }, + { + "epoch": 0.05152, + "grad_norm": 2.625, + "grad_norm_var": 0.021284993489583334, + "learning_rate": 0.0001, + "loss": 4.9682, + "loss/crossentropy": 2.142069697380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3162301778793335, + "step": 2576 + }, + { + "epoch": 0.05156, + "grad_norm": 2.59375, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 5.018, + "loss/crossentropy": 1.9952309727668762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560836151242256, + "step": 2578 + }, + { + "epoch": 0.0516, + "grad_norm": 2.765625, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 4.8889, + "loss/crossentropy": 2.0579317212104797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26214616745710373, + "step": 2580 + }, + { + "epoch": 0.05164, + "grad_norm": 2.46875, + "grad_norm_var": 0.016927083333333332, + "learning_rate": 0.0001, + "loss": 4.6306, + "loss/crossentropy": 2.076499104499817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26244185864925385, + "step": 2582 + }, + { + "epoch": 0.05168, + "grad_norm": 2.890625, + "grad_norm_var": 0.026146443684895833, + "learning_rate": 0.0001, + "loss": 4.9393, + "loss/crossentropy": 2.2277501821517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32310059666633606, + "step": 2584 + }, + { + "epoch": 0.05172, + "grad_norm": 2.53125, + "grad_norm_var": 0.03141988118489583, + "learning_rate": 0.0001, + "loss": 5.0929, + "loss/crossentropy": 2.101436138153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26044395565986633, + "step": 2586 + }, + { + "epoch": 0.05176, + "grad_norm": 2.703125, + "grad_norm_var": 0.030939737955729168, + "learning_rate": 0.0001, + "loss": 5.045, + "loss/crossentropy": 2.2617305517196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2768043726682663, + "step": 2588 + }, + { + "epoch": 0.0518, + "grad_norm": 2.515625, + "grad_norm_var": 0.0349029541015625, + "learning_rate": 0.0001, + "loss": 4.989, + "loss/crossentropy": 2.2669495940208435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2941794842481613, + "step": 2590 + }, + { + "epoch": 0.05184, + "grad_norm": 2.546875, + "grad_norm_var": 0.03871968587239583, + "learning_rate": 0.0001, + "loss": 4.7676, + "loss/crossentropy": 1.8102391958236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24766983091831207, + "step": 2592 + }, + { + "epoch": 0.05188, + "grad_norm": 2.53125, + "grad_norm_var": 0.04006754557291667, + "learning_rate": 0.0001, + "loss": 5.0022, + "loss/crossentropy": 2.1426219940185547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2808763086795807, + "step": 2594 + }, + { + "epoch": 0.05192, + "grad_norm": 2.75, + "grad_norm_var": 0.0368804931640625, + "learning_rate": 0.0001, + "loss": 4.8649, + "loss/crossentropy": 2.0731321573257446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28768520057201385, + "step": 2596 + }, + { + "epoch": 0.05196, + "grad_norm": 2.46875, + "grad_norm_var": 0.03619384765625, + "learning_rate": 0.0001, + "loss": 4.8019, + "loss/crossentropy": 1.8902159333229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27209727466106415, + "step": 2598 + }, + { + "epoch": 0.052, + "grad_norm": 2.390625, + "grad_norm_var": 0.030855305989583335, + "learning_rate": 0.0001, + "loss": 4.837, + "loss/crossentropy": 2.1860097646713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3191404938697815, + "step": 2600 + }, + { + "epoch": 0.05204, + "grad_norm": 2.78125, + "grad_norm_var": 0.027074178059895832, + "learning_rate": 0.0001, + "loss": 4.8471, + "loss/crossentropy": 2.1010658740997314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2924908995628357, + "step": 2602 + }, + { + "epoch": 0.05208, + "grad_norm": 2.71875, + "grad_norm_var": 0.0285064697265625, + "learning_rate": 0.0001, + "loss": 5.1725, + "loss/crossentropy": 2.0668399930000305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2691944092512131, + "step": 2604 + }, + { + "epoch": 0.05212, + "grad_norm": 2.8125, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 5.0115, + "loss/crossentropy": 2.310541272163391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31448885798454285, + "step": 2606 + }, + { + "epoch": 0.05216, + "grad_norm": 2.75, + "grad_norm_var": 0.0222808837890625, + "learning_rate": 0.0001, + "loss": 4.7731, + "loss/crossentropy": 2.023577332496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28528447449207306, + "step": 2608 + }, + { + "epoch": 0.0522, + "grad_norm": 2.5625, + "grad_norm_var": 0.0217193603515625, + "learning_rate": 0.0001, + "loss": 4.8408, + "loss/crossentropy": 2.0232901573181152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25540125370025635, + "step": 2610 + }, + { + "epoch": 0.05224, + "grad_norm": 2.5625, + "grad_norm_var": 0.019554646809895833, + "learning_rate": 0.0001, + "loss": 4.8374, + "loss/crossentropy": 2.147561550140381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2914447784423828, + "step": 2612 + }, + { + "epoch": 0.05228, + "grad_norm": 2.46875, + "grad_norm_var": 0.0193359375, + "learning_rate": 0.0001, + "loss": 4.8044, + "loss/crossentropy": 1.9136184453964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27879445254802704, + "step": 2614 + }, + { + "epoch": 0.05232, + "grad_norm": 3.03125, + "grad_norm_var": 0.0606842041015625, + "learning_rate": 0.0001, + "loss": 5.1245, + "loss/crossentropy": 2.1118494272232056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2990800142288208, + "step": 2616 + }, + { + "epoch": 0.05236, + "grad_norm": 2.640625, + "grad_norm_var": 0.0599761962890625, + "learning_rate": 0.0001, + "loss": 5.2189, + "loss/crossentropy": 1.9564325213432312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2805679142475128, + "step": 2618 + }, + { + "epoch": 0.0524, + "grad_norm": 3.046875, + "grad_norm_var": 0.06641337076822916, + "learning_rate": 0.0001, + "loss": 4.9511, + "loss/crossentropy": 2.0683051347732544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2906472980976105, + "step": 2620 + }, + { + "epoch": 0.05244, + "grad_norm": 2.609375, + "grad_norm_var": 0.06653645833333334, + "learning_rate": 0.0001, + "loss": 5.058, + "loss/crossentropy": 2.0510823130607605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28148986399173737, + "step": 2622 + }, + { + "epoch": 0.05248, + "grad_norm": 2.640625, + "grad_norm_var": 0.06256103515625, + "learning_rate": 0.0001, + "loss": 5.2113, + "loss/crossentropy": 2.2972904443740845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32902073860168457, + "step": 2624 + }, + { + "epoch": 0.05252, + "grad_norm": 2.890625, + "grad_norm_var": 0.0582427978515625, + "learning_rate": 0.0001, + "loss": 5.0939, + "loss/crossentropy": 1.9502179026603699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2844041585922241, + "step": 2626 + }, + { + "epoch": 0.05256, + "grad_norm": 2.640625, + "grad_norm_var": 0.05607808430989583, + "learning_rate": 0.0001, + "loss": 5.1078, + "loss/crossentropy": 2.1577298045158386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2673380598425865, + "step": 2628 + }, + { + "epoch": 0.0526, + "grad_norm": 2.59375, + "grad_norm_var": 0.05271708170572917, + "learning_rate": 0.0001, + "loss": 5.0514, + "loss/crossentropy": 2.1707664132118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697260081768036, + "step": 2630 + }, + { + "epoch": 0.05264, + "grad_norm": 2.71875, + "grad_norm_var": 0.017606608072916665, + "learning_rate": 0.0001, + "loss": 4.9617, + "loss/crossentropy": 2.0975311398506165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27033862471580505, + "step": 2632 + }, + { + "epoch": 0.05268, + "grad_norm": 3.25, + "grad_norm_var": 0.033869425455729164, + "learning_rate": 0.0001, + "loss": 5.1841, + "loss/crossentropy": 2.197197914123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28717951476573944, + "step": 2634 + }, + { + "epoch": 0.05272, + "grad_norm": 2.765625, + "grad_norm_var": 0.03570556640625, + "learning_rate": 0.0001, + "loss": 5.2558, + "loss/crossentropy": 2.2898266315460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31359314918518066, + "step": 2636 + }, + { + "epoch": 0.05276, + "grad_norm": 2.796875, + "grad_norm_var": 0.0347564697265625, + "learning_rate": 0.0001, + "loss": 5.4849, + "loss/crossentropy": 2.525710701942444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3199215829372406, + "step": 2638 + }, + { + "epoch": 0.0528, + "grad_norm": 2.796875, + "grad_norm_var": 0.03349609375, + "learning_rate": 0.0001, + "loss": 5.0495, + "loss/crossentropy": 2.2799761295318604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28715676069259644, + "step": 2640 + }, + { + "epoch": 0.05284, + "grad_norm": 2.796875, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 4.9837, + "loss/crossentropy": 1.8681190013885498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25435833632946014, + "step": 2642 + }, + { + "epoch": 0.05288, + "grad_norm": 2.578125, + "grad_norm_var": 0.03730061848958333, + "learning_rate": 0.0001, + "loss": 4.807, + "loss/crossentropy": 1.9067540168762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25444281101226807, + "step": 2644 + }, + { + "epoch": 0.05292, + "grad_norm": 2.53125, + "grad_norm_var": 0.03732096354166667, + "learning_rate": 0.0001, + "loss": 5.0326, + "loss/crossentropy": 2.370971202850342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31173495948314667, + "step": 2646 + }, + { + "epoch": 0.05296, + "grad_norm": 2.671875, + "grad_norm_var": 0.03752848307291667, + "learning_rate": 0.0001, + "loss": 5.1148, + "loss/crossentropy": 2.1829749941825867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29970400035381317, + "step": 2648 + }, + { + "epoch": 0.053, + "grad_norm": 2.609375, + "grad_norm_var": 0.020052083333333335, + "learning_rate": 0.0001, + "loss": 5.1086, + "loss/crossentropy": 2.0818498134613037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27712464332580566, + "step": 2650 + }, + { + "epoch": 0.05304, + "grad_norm": 2.609375, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 5.1071, + "loss/crossentropy": 2.2080377340316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27552157640457153, + "step": 2652 + }, + { + "epoch": 0.05308, + "grad_norm": 2.53125, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 4.9685, + "loss/crossentropy": 1.7115904092788696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23266373574733734, + "step": 2654 + }, + { + "epoch": 0.05312, + "grad_norm": 2.59375, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.8654, + "loss/crossentropy": 2.1736810207366943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29696571826934814, + "step": 2656 + }, + { + "epoch": 0.05316, + "grad_norm": 2.765625, + "grad_norm_var": 0.012886555989583333, + "learning_rate": 0.0001, + "loss": 5.0834, + "loss/crossentropy": 2.2366485595703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3062315583229065, + "step": 2658 + }, + { + "epoch": 0.0532, + "grad_norm": 2.609375, + "grad_norm_var": 0.018748982747395834, + "learning_rate": 0.0001, + "loss": 5.0888, + "loss/crossentropy": 1.9835070371627808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.35663366317749023, + "step": 2660 + }, + { + "epoch": 0.05324, + "grad_norm": 2.4375, + "grad_norm_var": 0.026688639322916666, + "learning_rate": 0.0001, + "loss": 4.6725, + "loss/crossentropy": 2.148723840713501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687191218137741, + "step": 2662 + }, + { + "epoch": 0.05328, + "grad_norm": 2.625, + "grad_norm_var": 0.02789306640625, + "learning_rate": 0.0001, + "loss": 4.9191, + "loss/crossentropy": 2.2642128467559814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2738381028175354, + "step": 2664 + }, + { + "epoch": 0.05332, + "grad_norm": 2.546875, + "grad_norm_var": 0.028483072916666668, + "learning_rate": 0.0001, + "loss": 4.8209, + "loss/crossentropy": 1.839052438735962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566695362329483, + "step": 2666 + }, + { + "epoch": 0.05336, + "grad_norm": 2.6875, + "grad_norm_var": 0.026927693684895834, + "learning_rate": 0.0001, + "loss": 4.8771, + "loss/crossentropy": 2.083684980869293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2632910907268524, + "step": 2668 + }, + { + "epoch": 0.0534, + "grad_norm": 2.75, + "grad_norm_var": 0.028434244791666667, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 2.2125936150550842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31363190710544586, + "step": 2670 + }, + { + "epoch": 0.05344, + "grad_norm": 2.734375, + "grad_norm_var": 0.029157511393229165, + "learning_rate": 0.0001, + "loss": 5.0354, + "loss/crossentropy": 2.2075263261795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2850564122200012, + "step": 2672 + }, + { + "epoch": 0.05348, + "grad_norm": 2.859375, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 5.0145, + "loss/crossentropy": 2.0876463651657104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972792685031891, + "step": 2674 + }, + { + "epoch": 0.05352, + "grad_norm": 2.84375, + "grad_norm_var": 0.020406087239583332, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.4033172130584717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.304116889834404, + "step": 2676 + }, + { + "epoch": 0.05356, + "grad_norm": 2.734375, + "grad_norm_var": 0.014501953125, + "learning_rate": 0.0001, + "loss": 5.0921, + "loss/crossentropy": 2.30586314201355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3264722675085068, + "step": 2678 + }, + { + "epoch": 0.0536, + "grad_norm": 2.46875, + "grad_norm_var": 0.015152994791666667, + "learning_rate": 0.0001, + "loss": 5.0941, + "loss/crossentropy": 2.2175174951553345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2661540359258652, + "step": 2680 + }, + { + "epoch": 0.05364, + "grad_norm": 2.921875, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 5.1162, + "loss/crossentropy": 2.0583900213241577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26995618641376495, + "step": 2682 + }, + { + "epoch": 0.05368, + "grad_norm": 2.59375, + "grad_norm_var": 0.020173136393229166, + "learning_rate": 0.0001, + "loss": 4.9357, + "loss/crossentropy": 2.310309052467346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29642508924007416, + "step": 2684 + }, + { + "epoch": 0.05372, + "grad_norm": 3.15625, + "grad_norm_var": 0.03332926432291667, + "learning_rate": 0.0001, + "loss": 4.7945, + "loss/crossentropy": 2.134134352207184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619224041700363, + "step": 2686 + }, + { + "epoch": 0.05376, + "grad_norm": 2.609375, + "grad_norm_var": 0.03306884765625, + "learning_rate": 0.0001, + "loss": 4.8411, + "loss/crossentropy": 1.930562138557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27108173072338104, + "step": 2688 + }, + { + "epoch": 0.0538, + "grad_norm": 2.46875, + "grad_norm_var": 0.03455403645833333, + "learning_rate": 0.0001, + "loss": 4.9591, + "loss/crossentropy": 2.3414769172668457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270420178771019, + "step": 2690 + }, + { + "epoch": 0.05384, + "grad_norm": 2.78125, + "grad_norm_var": 0.0356842041015625, + "learning_rate": 0.0001, + "loss": 4.83, + "loss/crossentropy": 2.1726938486099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710433751344681, + "step": 2692 + }, + { + "epoch": 0.05388, + "grad_norm": 2.640625, + "grad_norm_var": 0.0361968994140625, + "learning_rate": 0.0001, + "loss": 5.0163, + "loss/crossentropy": 2.220117926597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3212582617998123, + "step": 2694 + }, + { + "epoch": 0.05392, + "grad_norm": 2.71875, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 5.1845, + "loss/crossentropy": 2.2557668685913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3096832036972046, + "step": 2696 + }, + { + "epoch": 0.05396, + "grad_norm": 2.734375, + "grad_norm_var": 0.0246246337890625, + "learning_rate": 0.0001, + "loss": 5.0399, + "loss/crossentropy": 1.94975346326828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2530350238084793, + "step": 2698 + }, + { + "epoch": 0.054, + "grad_norm": 2.6875, + "grad_norm_var": 0.024169921875, + "learning_rate": 0.0001, + "loss": 5.19, + "loss/crossentropy": 2.4622775316238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135389983654022, + "step": 2700 + }, + { + "epoch": 0.05404, + "grad_norm": 2.65625, + "grad_norm_var": 0.0110748291015625, + "learning_rate": 0.0001, + "loss": 5.2005, + "loss/crossentropy": 2.5367363691329956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30742934346199036, + "step": 2702 + }, + { + "epoch": 0.05408, + "grad_norm": 2.5, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 5.146, + "loss/crossentropy": 2.5733184814453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33410580456256866, + "step": 2704 + }, + { + "epoch": 0.05412, + "grad_norm": 2.6875, + "grad_norm_var": 0.011725870768229167, + "learning_rate": 0.0001, + "loss": 4.8888, + "loss/crossentropy": 1.9339997172355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28760699927806854, + "step": 2706 + }, + { + "epoch": 0.05416, + "grad_norm": 2.484375, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.8719, + "loss/crossentropy": 1.8515672087669373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23648252338171005, + "step": 2708 + }, + { + "epoch": 0.0542, + "grad_norm": 2.546875, + "grad_norm_var": 0.014420572916666667, + "learning_rate": 0.0001, + "loss": 4.6598, + "loss/crossentropy": 2.0973429083824158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2605738639831543, + "step": 2710 + }, + { + "epoch": 0.05424, + "grad_norm": 2.90625, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.2795, + "loss/crossentropy": 2.406570076942444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2992263287305832, + "step": 2712 + }, + { + "epoch": 0.05428, + "grad_norm": 2.53125, + "grad_norm_var": 0.024958292643229168, + "learning_rate": 0.0001, + "loss": 4.8591, + "loss/crossentropy": 2.040315330028534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27726222574710846, + "step": 2714 + }, + { + "epoch": 0.05432, + "grad_norm": 2.40625, + "grad_norm_var": 0.025178019205729166, + "learning_rate": 0.0001, + "loss": 4.7879, + "loss/crossentropy": 2.250051259994507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2764698565006256, + "step": 2716 + }, + { + "epoch": 0.05436, + "grad_norm": 2.4375, + "grad_norm_var": 0.026985677083333333, + "learning_rate": 0.0001, + "loss": 4.8813, + "loss/crossentropy": 2.25112247467041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3101722151041031, + "step": 2718 + }, + { + "epoch": 0.0544, + "grad_norm": 2.421875, + "grad_norm_var": 0.028709920247395833, + "learning_rate": 0.0001, + "loss": 4.7242, + "loss/crossentropy": 2.261968731880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2805032432079315, + "step": 2720 + }, + { + "epoch": 0.05444, + "grad_norm": 2.59375, + "grad_norm_var": 0.030078125, + "learning_rate": 0.0001, + "loss": 5.0449, + "loss/crossentropy": 2.376634955406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25400323420763016, + "step": 2722 + }, + { + "epoch": 0.05448, + "grad_norm": 2.515625, + "grad_norm_var": 0.03351949055989583, + "learning_rate": 0.0001, + "loss": 5.2325, + "loss/crossentropy": 2.61246657371521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3170415759086609, + "step": 2724 + }, + { + "epoch": 0.05452, + "grad_norm": 2.59375, + "grad_norm_var": 0.0296051025390625, + "learning_rate": 0.0001, + "loss": 5.0433, + "loss/crossentropy": 2.3982752561569214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2730572074651718, + "step": 2726 + }, + { + "epoch": 0.05456, + "grad_norm": 2.609375, + "grad_norm_var": 0.0207916259765625, + "learning_rate": 0.0001, + "loss": 4.8836, + "loss/crossentropy": 1.9890516996383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26453813910484314, + "step": 2728 + }, + { + "epoch": 0.0546, + "grad_norm": 2.4375, + "grad_norm_var": 0.016559855143229166, + "learning_rate": 0.0001, + "loss": 4.7252, + "loss/crossentropy": 2.1825047731399536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28369753062725067, + "step": 2730 + }, + { + "epoch": 0.05464, + "grad_norm": 2.5, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.9445, + "loss/crossentropy": 1.9745987057685852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2662041634321213, + "step": 2732 + }, + { + "epoch": 0.05468, + "grad_norm": 2.640625, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 4.7591, + "loss/crossentropy": 2.2962852716445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2986691743135452, + "step": 2734 + }, + { + "epoch": 0.05472, + "grad_norm": 2.65625, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 4.9712, + "loss/crossentropy": 2.0517550110816956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27293455600738525, + "step": 2736 + }, + { + "epoch": 0.05476, + "grad_norm": 2.859375, + "grad_norm_var": 0.021434529622395834, + "learning_rate": 0.0001, + "loss": 5.4052, + "loss/crossentropy": 2.327734112739563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3019224554300308, + "step": 2738 + }, + { + "epoch": 0.0548, + "grad_norm": 2.5, + "grad_norm_var": 0.018680826822916666, + "learning_rate": 0.0001, + "loss": 4.7002, + "loss/crossentropy": 2.236689567565918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2816064953804016, + "step": 2740 + }, + { + "epoch": 0.05484, + "grad_norm": 2.75, + "grad_norm_var": 0.020894368489583332, + "learning_rate": 0.0001, + "loss": 5.1321, + "loss/crossentropy": 2.0209690928459167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2734442874789238, + "step": 2742 + }, + { + "epoch": 0.05488, + "grad_norm": 2.640625, + "grad_norm_var": 0.026276652018229166, + "learning_rate": 0.0001, + "loss": 4.9841, + "loss/crossentropy": 2.2264864444732666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28523482382297516, + "step": 2744 + }, + { + "epoch": 0.05492, + "grad_norm": 2.421875, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.8661, + "loss/crossentropy": 1.9897980690002441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26623376458883286, + "step": 2746 + }, + { + "epoch": 0.05496, + "grad_norm": 2.53125, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.9299, + "loss/crossentropy": 2.0583779215812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2803298681974411, + "step": 2748 + }, + { + "epoch": 0.055, + "grad_norm": 2.515625, + "grad_norm_var": 0.022001139322916665, + "learning_rate": 0.0001, + "loss": 4.9818, + "loss/crossentropy": 1.8448269367218018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2606130689382553, + "step": 2750 + }, + { + "epoch": 0.05504, + "grad_norm": 2.640625, + "grad_norm_var": 0.02301025390625, + "learning_rate": 0.0001, + "loss": 5.0885, + "loss/crossentropy": 2.294836401939392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29403699934482574, + "step": 2752 + }, + { + "epoch": 0.05508, + "grad_norm": 2.296875, + "grad_norm_var": 0.02808837890625, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 2.2402058839797974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28176650404930115, + "step": 2754 + }, + { + "epoch": 0.05512, + "grad_norm": 2.734375, + "grad_norm_var": 0.0298980712890625, + "learning_rate": 0.0001, + "loss": 4.7805, + "loss/crossentropy": 1.7882421612739563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25275079905986786, + "step": 2756 + }, + { + "epoch": 0.05516, + "grad_norm": 2.8125, + "grad_norm_var": 0.02919921875, + "learning_rate": 0.0001, + "loss": 5.1277, + "loss/crossentropy": 2.4185458421707153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30663780868053436, + "step": 2758 + }, + { + "epoch": 0.0552, + "grad_norm": 2.484375, + "grad_norm_var": 0.0318511962890625, + "learning_rate": 0.0001, + "loss": 4.9259, + "loss/crossentropy": 2.2588642835617065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26785464584827423, + "step": 2760 + }, + { + "epoch": 0.05524, + "grad_norm": 2.484375, + "grad_norm_var": 0.034012858072916666, + "learning_rate": 0.0001, + "loss": 4.8359, + "loss/crossentropy": 2.145754337310791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710302472114563, + "step": 2762 + }, + { + "epoch": 0.05528, + "grad_norm": 2.609375, + "grad_norm_var": 0.03648681640625, + "learning_rate": 0.0001, + "loss": 4.739, + "loss/crossentropy": 2.3627569675445557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2926081120967865, + "step": 2764 + }, + { + "epoch": 0.05532, + "grad_norm": 2.609375, + "grad_norm_var": 0.03632405598958333, + "learning_rate": 0.0001, + "loss": 4.8512, + "loss/crossentropy": 1.988103210926056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24308288842439651, + "step": 2766 + }, + { + "epoch": 0.05536, + "grad_norm": 2.828125, + "grad_norm_var": 0.040445963541666664, + "learning_rate": 0.0001, + "loss": 5.1075, + "loss/crossentropy": 2.2497689723968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.313574954867363, + "step": 2768 + }, + { + "epoch": 0.0554, + "grad_norm": 2.609375, + "grad_norm_var": 0.030492146809895832, + "learning_rate": 0.0001, + "loss": 4.8978, + "loss/crossentropy": 2.2603683471679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27997657656669617, + "step": 2770 + }, + { + "epoch": 0.05544, + "grad_norm": 2.796875, + "grad_norm_var": 0.030614217122395832, + "learning_rate": 0.0001, + "loss": 4.9485, + "loss/crossentropy": 2.2585065364837646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2818940281867981, + "step": 2772 + }, + { + "epoch": 0.05548, + "grad_norm": 2.578125, + "grad_norm_var": 0.027730305989583332, + "learning_rate": 0.0001, + "loss": 5.2222, + "loss/crossentropy": 2.1413429975509644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2950669527053833, + "step": 2774 + }, + { + "epoch": 0.05552, + "grad_norm": 2.734375, + "grad_norm_var": 0.0171875, + "learning_rate": 0.0001, + "loss": 5.1567, + "loss/crossentropy": 1.9994583129882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3946940451860428, + "step": 2776 + }, + { + "epoch": 0.05556, + "grad_norm": 2.796875, + "grad_norm_var": 0.014762369791666667, + "learning_rate": 0.0001, + "loss": 5.2877, + "loss/crossentropy": 2.423824667930603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30817069113254547, + "step": 2778 + }, + { + "epoch": 0.0556, + "grad_norm": 2.4375, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.6657, + "loss/crossentropy": 1.8579126000404358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24312064796686172, + "step": 2780 + }, + { + "epoch": 0.05564, + "grad_norm": 2.390625, + "grad_norm_var": 0.018485514322916667, + "learning_rate": 0.0001, + "loss": 4.7653, + "loss/crossentropy": 2.3444939851760864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28466545045375824, + "step": 2782 + }, + { + "epoch": 0.05568, + "grad_norm": 2.71875, + "grad_norm_var": 0.03806864420572917, + "learning_rate": 0.0001, + "loss": 5.1187, + "loss/crossentropy": 2.221144914627075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2694521099328995, + "step": 2784 + }, + { + "epoch": 0.05572, + "grad_norm": 2.578125, + "grad_norm_var": 0.0376861572265625, + "learning_rate": 0.0001, + "loss": 5.0401, + "loss/crossentropy": 1.9372909665107727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698594778776169, + "step": 2786 + }, + { + "epoch": 0.05576, + "grad_norm": 2.765625, + "grad_norm_var": 0.038834635416666666, + "learning_rate": 0.0001, + "loss": 4.834, + "loss/crossentropy": 2.129204750061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27333614230155945, + "step": 2788 + }, + { + "epoch": 0.0558, + "grad_norm": 2.5, + "grad_norm_var": 0.04045817057291667, + "learning_rate": 0.0001, + "loss": 4.8462, + "loss/crossentropy": 1.6917370557785034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24290545284748077, + "step": 2790 + }, + { + "epoch": 0.05584, + "grad_norm": 2.765625, + "grad_norm_var": 0.04096577962239583, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 1.7883749604225159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554834187030792, + "step": 2792 + }, + { + "epoch": 0.05588, + "grad_norm": 2.765625, + "grad_norm_var": 0.04560445149739583, + "learning_rate": 0.0001, + "loss": 4.6835, + "loss/crossentropy": 1.867136001586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667583078145981, + "step": 2794 + }, + { + "epoch": 0.05592, + "grad_norm": 2.546875, + "grad_norm_var": 0.04143473307291667, + "learning_rate": 0.0001, + "loss": 4.9686, + "loss/crossentropy": 2.032800853252411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580890506505966, + "step": 2796 + }, + { + "epoch": 0.05596, + "grad_norm": 3.03125, + "grad_norm_var": 0.04389546712239583, + "learning_rate": 0.0001, + "loss": 5.018, + "loss/crossentropy": 1.9867863655090332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2694792151451111, + "step": 2798 + }, + { + "epoch": 0.056, + "grad_norm": 2.59375, + "grad_norm_var": 0.026786295572916667, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.1026757955551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2664051130414009, + "step": 2800 + }, + { + "epoch": 0.05604, + "grad_norm": 2.65625, + "grad_norm_var": 0.0259918212890625, + "learning_rate": 0.0001, + "loss": 4.9883, + "loss/crossentropy": 2.0649160742759705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2473129704594612, + "step": 2802 + }, + { + "epoch": 0.05608, + "grad_norm": 2.921875, + "grad_norm_var": 0.030451456705729168, + "learning_rate": 0.0001, + "loss": 4.8959, + "loss/crossentropy": 2.2110248804092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27131715416908264, + "step": 2804 + }, + { + "epoch": 0.05612, + "grad_norm": 2.65625, + "grad_norm_var": 0.032933553059895836, + "learning_rate": 0.0001, + "loss": 4.9603, + "loss/crossentropy": 2.3483060598373413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27071496844291687, + "step": 2806 + }, + { + "epoch": 0.05616, + "grad_norm": 2.640625, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.9273, + "loss/crossentropy": 1.9061944484710693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550426125526428, + "step": 2808 + }, + { + "epoch": 0.0562, + "grad_norm": 2.671875, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 5.1734, + "loss/crossentropy": 2.3073103427886963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30051568150520325, + "step": 2810 + }, + { + "epoch": 0.05624, + "grad_norm": 2.671875, + "grad_norm_var": 0.027106730143229167, + "learning_rate": 0.0001, + "loss": 5.0268, + "loss/crossentropy": 2.393891453742981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29226459562778473, + "step": 2812 + }, + { + "epoch": 0.05628, + "grad_norm": 2.484375, + "grad_norm_var": 0.018578084309895833, + "learning_rate": 0.0001, + "loss": 5.0839, + "loss/crossentropy": 2.3082761764526367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29539754986763, + "step": 2814 + }, + { + "epoch": 0.05632, + "grad_norm": 2.515625, + "grad_norm_var": 0.021776326497395835, + "learning_rate": 0.0001, + "loss": 4.8973, + "loss/crossentropy": 2.7815494537353516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33258646726608276, + "step": 2816 + }, + { + "epoch": 0.05636, + "grad_norm": 2.390625, + "grad_norm_var": 0.031538899739583334, + "learning_rate": 0.0001, + "loss": 4.7095, + "loss/crossentropy": 2.077984571456909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.261022225022316, + "step": 2818 + }, + { + "epoch": 0.0564, + "grad_norm": 2.484375, + "grad_norm_var": 0.0251617431640625, + "learning_rate": 0.0001, + "loss": 4.6389, + "loss/crossentropy": 2.0524688363075256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936979830265045, + "step": 2820 + }, + { + "epoch": 0.05644, + "grad_norm": 2.5, + "grad_norm_var": 0.018212890625, + "learning_rate": 0.0001, + "loss": 4.9657, + "loss/crossentropy": 1.8323140740394592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26426824927330017, + "step": 2822 + }, + { + "epoch": 0.05648, + "grad_norm": 2.765625, + "grad_norm_var": 0.0201171875, + "learning_rate": 0.0001, + "loss": 4.9513, + "loss/crossentropy": 2.2290462255477905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28852027654647827, + "step": 2824 + }, + { + "epoch": 0.05652, + "grad_norm": 2.5625, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.7828, + "loss/crossentropy": 1.8788678050041199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24312467128038406, + "step": 2826 + }, + { + "epoch": 0.05656, + "grad_norm": 2.46875, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 4.7882, + "loss/crossentropy": 1.9402090311050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25435876101255417, + "step": 2828 + }, + { + "epoch": 0.0566, + "grad_norm": 2.390625, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.5823, + "loss/crossentropy": 2.4833847284317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27234241366386414, + "step": 2830 + }, + { + "epoch": 0.05664, + "grad_norm": 2.515625, + "grad_norm_var": 0.0154296875, + "learning_rate": 0.0001, + "loss": 5.0521, + "loss/crossentropy": 2.351561665534973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28028184175491333, + "step": 2832 + }, + { + "epoch": 0.05668, + "grad_norm": 2.953125, + "grad_norm_var": 0.019722493489583333, + "learning_rate": 0.0001, + "loss": 5.0688, + "loss/crossentropy": 2.0372352600097656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26005299389362335, + "step": 2834 + }, + { + "epoch": 0.05672, + "grad_norm": 2.828125, + "grad_norm_var": 0.021744791666666666, + "learning_rate": 0.0001, + "loss": 4.4282, + "loss/crossentropy": 1.6522082090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26683834940195084, + "step": 2836 + }, + { + "epoch": 0.05676, + "grad_norm": 2.390625, + "grad_norm_var": 0.023778279622395832, + "learning_rate": 0.0001, + "loss": 4.7165, + "loss/crossentropy": 1.8886643052101135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23464814573526382, + "step": 2838 + }, + { + "epoch": 0.0568, + "grad_norm": 2.734375, + "grad_norm_var": 0.023119099934895835, + "learning_rate": 0.0001, + "loss": 4.9477, + "loss/crossentropy": 2.2161459922790527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29381541907787323, + "step": 2840 + }, + { + "epoch": 0.05684, + "grad_norm": 2.421875, + "grad_norm_var": 0.024779256184895834, + "learning_rate": 0.0001, + "loss": 4.7408, + "loss/crossentropy": 2.1542043685913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2753119468688965, + "step": 2842 + }, + { + "epoch": 0.05688, + "grad_norm": 2.78125, + "grad_norm_var": 0.026090494791666665, + "learning_rate": 0.0001, + "loss": 5.1288, + "loss/crossentropy": 2.5605628490448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.329460546374321, + "step": 2844 + }, + { + "epoch": 0.05692, + "grad_norm": 2.609375, + "grad_norm_var": 0.023224894205729166, + "learning_rate": 0.0001, + "loss": 4.9955, + "loss/crossentropy": 2.1000319719314575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.273986279964447, + "step": 2846 + }, + { + "epoch": 0.05696, + "grad_norm": 2.484375, + "grad_norm_var": 0.023558553059895834, + "learning_rate": 0.0001, + "loss": 4.6813, + "loss/crossentropy": 2.1036760210990906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26293954253196716, + "step": 2848 + }, + { + "epoch": 0.057, + "grad_norm": 2.640625, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 4.9799, + "loss/crossentropy": 2.2130608558654785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2977828085422516, + "step": 2850 + }, + { + "epoch": 0.05704, + "grad_norm": 2.578125, + "grad_norm_var": 0.01461181640625, + "learning_rate": 0.0001, + "loss": 5.1761, + "loss/crossentropy": 2.1878823041915894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27322643995285034, + "step": 2852 + }, + { + "epoch": 0.05708, + "grad_norm": 2.703125, + "grad_norm_var": 0.012547810872395834, + "learning_rate": 0.0001, + "loss": 4.9448, + "loss/crossentropy": 2.1559258699417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32625503838062286, + "step": 2854 + }, + { + "epoch": 0.05712, + "grad_norm": 2.390625, + "grad_norm_var": 0.015848795572916668, + "learning_rate": 0.0001, + "loss": 4.4611, + "loss/crossentropy": 2.0818992257118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2677721679210663, + "step": 2856 + }, + { + "epoch": 0.05716, + "grad_norm": 2.578125, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 5.2755, + "loss/crossentropy": 2.544907331466675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30028045177459717, + "step": 2858 + }, + { + "epoch": 0.0572, + "grad_norm": 3.203125, + "grad_norm_var": 0.04491780598958333, + "learning_rate": 0.0001, + "loss": 5.2855, + "loss/crossentropy": 2.5932188034057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30431173741817474, + "step": 2860 + }, + { + "epoch": 0.05724, + "grad_norm": 2.453125, + "grad_norm_var": 0.046219889322916666, + "learning_rate": 0.0001, + "loss": 4.9799, + "loss/crossentropy": 2.1007986068725586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2989690601825714, + "step": 2862 + }, + { + "epoch": 0.05728, + "grad_norm": 2.5625, + "grad_norm_var": 0.044169108072916664, + "learning_rate": 0.0001, + "loss": 4.7706, + "loss/crossentropy": 2.0102875232696533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26936179399490356, + "step": 2864 + }, + { + "epoch": 0.05732, + "grad_norm": 3.109375, + "grad_norm_var": 0.056559244791666664, + "learning_rate": 0.0001, + "loss": 5.2099, + "loss/crossentropy": 2.3457159996032715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29194609820842743, + "step": 2866 + }, + { + "epoch": 0.05736, + "grad_norm": 2.625, + "grad_norm_var": 0.055582682291666664, + "learning_rate": 0.0001, + "loss": 4.8768, + "loss/crossentropy": 2.559054732322693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3018783777952194, + "step": 2868 + }, + { + "epoch": 0.0574, + "grad_norm": 2.703125, + "grad_norm_var": 0.05831705729166667, + "learning_rate": 0.0001, + "loss": 4.9863, + "loss/crossentropy": 2.0641059279441833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2789234220981598, + "step": 2870 + }, + { + "epoch": 0.05744, + "grad_norm": 2.734375, + "grad_norm_var": 0.04683837890625, + "learning_rate": 0.0001, + "loss": 4.9222, + "loss/crossentropy": 2.0819749236106873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26825186610221863, + "step": 2872 + }, + { + "epoch": 0.05748, + "grad_norm": 2.703125, + "grad_norm_var": 0.046263631184895834, + "learning_rate": 0.0001, + "loss": 4.9586, + "loss/crossentropy": 2.1114020347595215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613547742366791, + "step": 2874 + }, + { + "epoch": 0.05752, + "grad_norm": 2.4375, + "grad_norm_var": 0.02916259765625, + "learning_rate": 0.0001, + "loss": 4.9474, + "loss/crossentropy": 2.174915075302124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29503974318504333, + "step": 2876 + }, + { + "epoch": 0.05756, + "grad_norm": 2.6875, + "grad_norm_var": 0.0265289306640625, + "learning_rate": 0.0001, + "loss": 5.0541, + "loss/crossentropy": 2.4342113733291626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30623678863048553, + "step": 2878 + }, + { + "epoch": 0.0576, + "grad_norm": 2.703125, + "grad_norm_var": 0.024918619791666666, + "learning_rate": 0.0001, + "loss": 5.0316, + "loss/crossentropy": 2.0518307089805603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28422991931438446, + "step": 2880 + }, + { + "epoch": 0.05764, + "grad_norm": 2.6875, + "grad_norm_var": 0.011921183268229166, + "learning_rate": 0.0001, + "loss": 5.0806, + "loss/crossentropy": 2.5378612279891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29730531573295593, + "step": 2882 + }, + { + "epoch": 0.05768, + "grad_norm": 2.5, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.8685, + "loss/crossentropy": 2.2670027017593384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26347288489341736, + "step": 2884 + }, + { + "epoch": 0.05772, + "grad_norm": 3.671875, + "grad_norm_var": 0.07822265625, + "learning_rate": 0.0001, + "loss": 5.0385, + "loss/crossentropy": 2.351823568344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3020637035369873, + "step": 2886 + }, + { + "epoch": 0.05776, + "grad_norm": 2.65625, + "grad_norm_var": 0.0792633056640625, + "learning_rate": 0.0001, + "loss": 4.9699, + "loss/crossentropy": 2.190839111804962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2639586254954338, + "step": 2888 + }, + { + "epoch": 0.0578, + "grad_norm": 2.3125, + "grad_norm_var": 0.08765869140625, + "learning_rate": 0.0001, + "loss": 4.6882, + "loss/crossentropy": 2.148400902748108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27762140333652496, + "step": 2890 + }, + { + "epoch": 0.05784, + "grad_norm": 2.984375, + "grad_norm_var": 0.09696858723958333, + "learning_rate": 0.0001, + "loss": 5.1033, + "loss/crossentropy": 2.01130074262619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2678648605942726, + "step": 2892 + }, + { + "epoch": 0.05788, + "grad_norm": 2.75, + "grad_norm_var": 0.09986572265625, + "learning_rate": 0.0001, + "loss": 5.0829, + "loss/crossentropy": 2.2239269018173218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2971164286136627, + "step": 2894 + }, + { + "epoch": 0.05792, + "grad_norm": 2.328125, + "grad_norm_var": 0.10974934895833334, + "learning_rate": 0.0001, + "loss": 4.48, + "loss/crossentropy": 1.9573850631713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.241651751101017, + "step": 2896 + }, + { + "epoch": 0.05796, + "grad_norm": 2.5, + "grad_norm_var": 0.11169331868489583, + "learning_rate": 0.0001, + "loss": 4.8806, + "loss/crossentropy": 1.9522782564163208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2685594707727432, + "step": 2898 + }, + { + "epoch": 0.058, + "grad_norm": 2.671875, + "grad_norm_var": 0.10886128743489583, + "learning_rate": 0.0001, + "loss": 4.9335, + "loss/crossentropy": 2.1742069721221924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27867090702056885, + "step": 2900 + }, + { + "epoch": 0.05804, + "grad_norm": 5.15625, + "grad_norm_var": 0.44882405598958336, + "learning_rate": 0.0001, + "loss": 4.9164, + "loss/crossentropy": 2.201782703399658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2731190174818039, + "step": 2902 + }, + { + "epoch": 0.05808, + "grad_norm": 2.734375, + "grad_norm_var": 0.4452301025390625, + "learning_rate": 0.0001, + "loss": 4.9787, + "loss/crossentropy": 2.159709095954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2940017879009247, + "step": 2904 + }, + { + "epoch": 0.05812, + "grad_norm": 2.71875, + "grad_norm_var": 0.4297271728515625, + "learning_rate": 0.0001, + "loss": 4.9192, + "loss/crossentropy": 2.1989885568618774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2776087671518326, + "step": 2906 + }, + { + "epoch": 0.05816, + "grad_norm": 2.546875, + "grad_norm_var": 0.42688395182291666, + "learning_rate": 0.0001, + "loss": 4.8851, + "loss/crossentropy": 1.9634575247764587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27917809784412384, + "step": 2908 + }, + { + "epoch": 0.0582, + "grad_norm": 2.390625, + "grad_norm_var": 0.43651936848958334, + "learning_rate": 0.0001, + "loss": 4.7776, + "loss/crossentropy": 2.1553479433059692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28876082599163055, + "step": 2910 + }, + { + "epoch": 0.05824, + "grad_norm": 2.515625, + "grad_norm_var": 0.42451883951822916, + "learning_rate": 0.0001, + "loss": 4.881, + "loss/crossentropy": 2.1035598516464233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2851293236017227, + "step": 2912 + }, + { + "epoch": 0.05828, + "grad_norm": 2.890625, + "grad_norm_var": 0.42097880045572916, + "learning_rate": 0.0001, + "loss": 5.2733, + "loss/crossentropy": 2.17076575756073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28258734941482544, + "step": 2914 + }, + { + "epoch": 0.05832, + "grad_norm": 2.484375, + "grad_norm_var": 0.42477213541666664, + "learning_rate": 0.0001, + "loss": 4.9147, + "loss/crossentropy": 2.2914711236953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29451698064804077, + "step": 2916 + }, + { + "epoch": 0.05836, + "grad_norm": 2.703125, + "grad_norm_var": 0.021410115559895835, + "learning_rate": 0.0001, + "loss": 5.1395, + "loss/crossentropy": 2.554638981819153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30992935597896576, + "step": 2918 + }, + { + "epoch": 0.0584, + "grad_norm": 2.546875, + "grad_norm_var": 0.05735575358072917, + "learning_rate": 0.0001, + "loss": 4.6262, + "loss/crossentropy": 1.808376431465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22574126720428467, + "step": 2920 + }, + { + "epoch": 0.05844, + "grad_norm": 2.78125, + "grad_norm_var": 0.05660400390625, + "learning_rate": 0.0001, + "loss": 5.1636, + "loss/crossentropy": 2.1471784114837646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27325738966464996, + "step": 2922 + }, + { + "epoch": 0.05848, + "grad_norm": 2.765625, + "grad_norm_var": 0.054182942708333334, + "learning_rate": 0.0001, + "loss": 5.071, + "loss/crossentropy": 2.2175731658935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3013547956943512, + "step": 2924 + }, + { + "epoch": 0.05852, + "grad_norm": 2.953125, + "grad_norm_var": 0.04949544270833333, + "learning_rate": 0.0001, + "loss": 5.467, + "loss/crossentropy": 2.369373917579651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29249751567840576, + "step": 2926 + }, + { + "epoch": 0.05856, + "grad_norm": 2.453125, + "grad_norm_var": 0.054915364583333334, + "learning_rate": 0.0001, + "loss": 4.8778, + "loss/crossentropy": 2.1758522987365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28846102952957153, + "step": 2928 + }, + { + "epoch": 0.0586, + "grad_norm": 2.546875, + "grad_norm_var": 0.05413411458333333, + "learning_rate": 0.0001, + "loss": 4.929, + "loss/crossentropy": 2.46218478679657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2824682295322418, + "step": 2930 + }, + { + "epoch": 0.05864, + "grad_norm": 2.40625, + "grad_norm_var": 0.0606597900390625, + "learning_rate": 0.0001, + "loss": 4.5557, + "loss/crossentropy": 2.058937907218933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26767465472221375, + "step": 2932 + }, + { + "epoch": 0.05868, + "grad_norm": 2.546875, + "grad_norm_var": 0.06109619140625, + "learning_rate": 0.0001, + "loss": 4.9103, + "loss/crossentropy": 2.312442421913147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2952658236026764, + "step": 2934 + }, + { + "epoch": 0.05872, + "grad_norm": 2.71875, + "grad_norm_var": 0.026911417643229168, + "learning_rate": 0.0001, + "loss": 5.1452, + "loss/crossentropy": 2.18839955329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31788623332977295, + "step": 2936 + }, + { + "epoch": 0.05876, + "grad_norm": 2.671875, + "grad_norm_var": 0.025537109375, + "learning_rate": 0.0001, + "loss": 5.1358, + "loss/crossentropy": 2.1330811977386475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28829698264598846, + "step": 2938 + }, + { + "epoch": 0.0588, + "grad_norm": 2.9375, + "grad_norm_var": 0.0294586181640625, + "learning_rate": 0.0001, + "loss": 5.1071, + "loss/crossentropy": 2.124038338661194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2708826810121536, + "step": 2940 + }, + { + "epoch": 0.05884, + "grad_norm": 2.53125, + "grad_norm_var": 0.024312337239583332, + "learning_rate": 0.0001, + "loss": 4.8479, + "loss/crossentropy": 2.1934449076652527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27314090728759766, + "step": 2942 + }, + { + "epoch": 0.05888, + "grad_norm": 2.6875, + "grad_norm_var": 0.024535115559895834, + "learning_rate": 0.0001, + "loss": 4.9904, + "loss/crossentropy": 1.967636525630951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23761005699634552, + "step": 2944 + }, + { + "epoch": 0.05892, + "grad_norm": 3.015625, + "grad_norm_var": 0.034821573893229166, + "learning_rate": 0.0001, + "loss": 4.9214, + "loss/crossentropy": 2.2380826473236084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3402934819459915, + "step": 2946 + }, + { + "epoch": 0.05896, + "grad_norm": 2.5625, + "grad_norm_var": 0.0299468994140625, + "learning_rate": 0.0001, + "loss": 4.7702, + "loss/crossentropy": 2.227039933204651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27934837341308594, + "step": 2948 + }, + { + "epoch": 0.059, + "grad_norm": 2.6875, + "grad_norm_var": 0.03258056640625, + "learning_rate": 0.0001, + "loss": 4.789, + "loss/crossentropy": 1.999170958995819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.275806725025177, + "step": 2950 + }, + { + "epoch": 0.05904, + "grad_norm": 2.546875, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 4.5754, + "loss/crossentropy": 1.8843520879745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26203446090221405, + "step": 2952 + }, + { + "epoch": 0.05908, + "grad_norm": 2.5, + "grad_norm_var": 0.033426920572916664, + "learning_rate": 0.0001, + "loss": 4.5485, + "loss/crossentropy": 2.0742241740226746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28777699172496796, + "step": 2954 + }, + { + "epoch": 0.05912, + "grad_norm": 2.6875, + "grad_norm_var": 0.028206380208333333, + "learning_rate": 0.0001, + "loss": 4.9676, + "loss/crossentropy": 2.2272751331329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27761510014533997, + "step": 2956 + }, + { + "epoch": 0.05916, + "grad_norm": 2.734375, + "grad_norm_var": 0.029878743489583335, + "learning_rate": 0.0001, + "loss": 4.6329, + "loss/crossentropy": 2.2758638858795166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28144824504852295, + "step": 2958 + }, + { + "epoch": 0.0592, + "grad_norm": 2.46875, + "grad_norm_var": 0.060530598958333334, + "learning_rate": 0.0001, + "loss": 4.9543, + "loss/crossentropy": 2.245158016681671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3030128926038742, + "step": 2960 + }, + { + "epoch": 0.05924, + "grad_norm": 2.453125, + "grad_norm_var": 0.053498331705729166, + "learning_rate": 0.0001, + "loss": 4.567, + "loss/crossentropy": 2.3337708711624146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29228493571281433, + "step": 2962 + }, + { + "epoch": 0.05928, + "grad_norm": 2.5, + "grad_norm_var": 0.05788472493489583, + "learning_rate": 0.0001, + "loss": 4.6477, + "loss/crossentropy": 2.3765406608581543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2715871036052704, + "step": 2964 + }, + { + "epoch": 0.05932, + "grad_norm": 2.375, + "grad_norm_var": 0.0562652587890625, + "learning_rate": 0.0001, + "loss": 4.8649, + "loss/crossentropy": 2.090362787246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505037933588028, + "step": 2966 + }, + { + "epoch": 0.05936, + "grad_norm": 2.453125, + "grad_norm_var": 0.056396484375, + "learning_rate": 0.0001, + "loss": 4.8669, + "loss/crossentropy": 2.011539399623871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27219071984291077, + "step": 2968 + }, + { + "epoch": 0.0594, + "grad_norm": 2.734375, + "grad_norm_var": 0.05693359375, + "learning_rate": 0.0001, + "loss": 5.2081, + "loss/crossentropy": 2.1791563034057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2796429842710495, + "step": 2970 + }, + { + "epoch": 0.05944, + "grad_norm": 2.65625, + "grad_norm_var": 0.05869852701822917, + "learning_rate": 0.0001, + "loss": 5.2689, + "loss/crossentropy": 2.4645297527313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3106658458709717, + "step": 2972 + }, + { + "epoch": 0.05948, + "grad_norm": 7.0, + "grad_norm_var": 1.25670166015625, + "learning_rate": 0.0001, + "loss": 5.0715, + "loss/crossentropy": 2.2050880193710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26860976219177246, + "step": 2974 + }, + { + "epoch": 0.05952, + "grad_norm": 2.828125, + "grad_norm_var": 1.236034138997396, + "learning_rate": 0.0001, + "loss": 4.5815, + "loss/crossentropy": 2.0141921639442444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27980539202690125, + "step": 2976 + }, + { + "epoch": 0.05956, + "grad_norm": 4.4375, + "grad_norm_var": 1.3665924072265625, + "learning_rate": 0.0001, + "loss": 4.8067, + "loss/crossentropy": 1.9399088025093079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26379524916410446, + "step": 2978 + }, + { + "epoch": 0.0596, + "grad_norm": 2.796875, + "grad_norm_var": 1.3243886311848958, + "learning_rate": 0.0001, + "loss": 5.1743, + "loss/crossentropy": 2.5415157079696655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291620597243309, + "step": 2980 + }, + { + "epoch": 0.05964, + "grad_norm": 2.5625, + "grad_norm_var": 1.3170562744140626, + "learning_rate": 0.0001, + "loss": 5.0695, + "loss/crossentropy": 2.1215542554855347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26189403235912323, + "step": 2982 + }, + { + "epoch": 0.05968, + "grad_norm": 2.578125, + "grad_norm_var": 1.32301025390625, + "learning_rate": 0.0001, + "loss": 5.0049, + "loss/crossentropy": 1.7749422788619995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27546317875385284, + "step": 2984 + }, + { + "epoch": 0.05972, + "grad_norm": 2.59375, + "grad_norm_var": 1.3371490478515624, + "learning_rate": 0.0001, + "loss": 4.8331, + "loss/crossentropy": 2.3383208513259888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790983319282532, + "step": 2986 + }, + { + "epoch": 0.05976, + "grad_norm": 2.5625, + "grad_norm_var": 1.35230712890625, + "learning_rate": 0.0001, + "loss": 4.8656, + "loss/crossentropy": 2.3688780069351196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752939611673355, + "step": 2988 + }, + { + "epoch": 0.0598, + "grad_norm": 2.6875, + "grad_norm_var": 0.22967020670572916, + "learning_rate": 0.0001, + "loss": 4.7299, + "loss/crossentropy": 2.261468529701233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725592106580734, + "step": 2990 + }, + { + "epoch": 0.05984, + "grad_norm": 2.671875, + "grad_norm_var": 0.2226226806640625, + "learning_rate": 0.0001, + "loss": 4.7417, + "loss/crossentropy": 2.0632028579711914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617062032222748, + "step": 2992 + }, + { + "epoch": 0.05988, + "grad_norm": 2.25, + "grad_norm_var": 0.022151692708333334, + "learning_rate": 0.0001, + "loss": 4.4282, + "loss/crossentropy": 2.1409813165664673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657589614391327, + "step": 2994 + }, + { + "epoch": 0.05992, + "grad_norm": 2.921875, + "grad_norm_var": 0.0258941650390625, + "learning_rate": 0.0001, + "loss": 5.1816, + "loss/crossentropy": 2.431404948234558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3087555915117264, + "step": 2996 + }, + { + "epoch": 0.05996, + "grad_norm": 2.390625, + "grad_norm_var": 0.029743448893229166, + "learning_rate": 0.0001, + "loss": 4.8253, + "loss/crossentropy": 2.165238618850708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29382775723934174, + "step": 2998 + }, + { + "epoch": 0.06, + "grad_norm": 2.546875, + "grad_norm_var": 0.034566243489583336, + "learning_rate": 0.0001, + "loss": 4.6328, + "loss/crossentropy": 1.9987847208976746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24466252326965332, + "step": 3000 + }, + { + "epoch": 0.06004, + "grad_norm": 2.203125, + "grad_norm_var": 0.04084370930989583, + "learning_rate": 0.0001, + "loss": 4.5474, + "loss/crossentropy": 2.1153565049171448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435970976948738, + "step": 3002 + }, + { + "epoch": 0.06008, + "grad_norm": 2.3125, + "grad_norm_var": 0.046402994791666666, + "learning_rate": 0.0001, + "loss": 4.7765, + "loss/crossentropy": 1.8660866618156433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24413185566663742, + "step": 3004 + }, + { + "epoch": 0.06012, + "grad_norm": 2.40625, + "grad_norm_var": 0.04309794108072917, + "learning_rate": 0.0001, + "loss": 4.7476, + "loss/crossentropy": 2.1816678047180176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28010787069797516, + "step": 3006 + }, + { + "epoch": 0.06016, + "grad_norm": 2.53125, + "grad_norm_var": 0.040827433268229164, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.1673622131347656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2659531533718109, + "step": 3008 + }, + { + "epoch": 0.0602, + "grad_norm": 2.78125, + "grad_norm_var": 0.040913899739583336, + "learning_rate": 0.0001, + "loss": 5.2466, + "loss/crossentropy": 2.277818202972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2929365783929825, + "step": 3010 + }, + { + "epoch": 0.06024, + "grad_norm": 2.65625, + "grad_norm_var": 0.0284576416015625, + "learning_rate": 0.0001, + "loss": 4.8904, + "loss/crossentropy": 2.294926404953003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2753874659538269, + "step": 3012 + }, + { + "epoch": 0.06028, + "grad_norm": 2.359375, + "grad_norm_var": 0.028173828125, + "learning_rate": 0.0001, + "loss": 4.735, + "loss/crossentropy": 2.0222257375717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27295801043510437, + "step": 3014 + }, + { + "epoch": 0.06032, + "grad_norm": 2.484375, + "grad_norm_var": 0.023628743489583333, + "learning_rate": 0.0001, + "loss": 4.818, + "loss/crossentropy": 2.43736469745636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2934436649084091, + "step": 3016 + }, + { + "epoch": 0.06036, + "grad_norm": 2.703125, + "grad_norm_var": 0.020426432291666668, + "learning_rate": 0.0001, + "loss": 5.0053, + "loss/crossentropy": 2.027767241001129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26353244483470917, + "step": 3018 + }, + { + "epoch": 0.0604, + "grad_norm": 2.59375, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 4.9059, + "loss/crossentropy": 2.150290071964264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27290941774845123, + "step": 3020 + }, + { + "epoch": 0.06044, + "grad_norm": 2.421875, + "grad_norm_var": 0.014697265625, + "learning_rate": 0.0001, + "loss": 4.8307, + "loss/crossentropy": 1.8709848523139954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25272610783576965, + "step": 3022 + }, + { + "epoch": 0.06048, + "grad_norm": 2.46875, + "grad_norm_var": 0.015949503580729166, + "learning_rate": 0.0001, + "loss": 5.0166, + "loss/crossentropy": 2.2170883417129517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29401010274887085, + "step": 3024 + }, + { + "epoch": 0.06052, + "grad_norm": 2.53125, + "grad_norm_var": 0.0109375, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 1.728318691253662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24246910959482193, + "step": 3026 + }, + { + "epoch": 0.06056, + "grad_norm": 2.34375, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.5597, + "loss/crossentropy": 1.8453290462493896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24357211589813232, + "step": 3028 + }, + { + "epoch": 0.0606, + "grad_norm": 2.640625, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.6617, + "loss/crossentropy": 2.273196220397949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29239149391651154, + "step": 3030 + }, + { + "epoch": 0.06064, + "grad_norm": 2.484375, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 4.9464, + "loss/crossentropy": 2.031871259212494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2604901194572449, + "step": 3032 + }, + { + "epoch": 0.06068, + "grad_norm": 2.515625, + "grad_norm_var": 0.008915201822916666, + "learning_rate": 0.0001, + "loss": 5.1618, + "loss/crossentropy": 2.1781771183013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2853371948003769, + "step": 3034 + }, + { + "epoch": 0.06072, + "grad_norm": 2.484375, + "grad_norm_var": 0.006590779622395833, + "learning_rate": 0.0001, + "loss": 4.816, + "loss/crossentropy": 1.9578353762626648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23352904617786407, + "step": 3036 + }, + { + "epoch": 0.06076, + "grad_norm": 2.671875, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 5.3012, + "loss/crossentropy": 2.470985770225525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28890977799892426, + "step": 3038 + }, + { + "epoch": 0.0608, + "grad_norm": 2.65625, + "grad_norm_var": 0.013361612955729166, + "learning_rate": 0.0001, + "loss": 4.9243, + "loss/crossentropy": 2.056231141090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2963344603776932, + "step": 3040 + }, + { + "epoch": 0.06084, + "grad_norm": 2.609375, + "grad_norm_var": 0.013646443684895834, + "learning_rate": 0.0001, + "loss": 4.8612, + "loss/crossentropy": 2.000037968158722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27152783423662186, + "step": 3042 + }, + { + "epoch": 0.06088, + "grad_norm": 2.59375, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.8576, + "loss/crossentropy": 2.323809027671814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29995349049568176, + "step": 3044 + }, + { + "epoch": 0.06092, + "grad_norm": 2.4375, + "grad_norm_var": 0.010407511393229167, + "learning_rate": 0.0001, + "loss": 5.0094, + "loss/crossentropy": 2.06933856010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26831966638565063, + "step": 3046 + }, + { + "epoch": 0.06096, + "grad_norm": 2.53125, + "grad_norm_var": 0.008463541666666666, + "learning_rate": 0.0001, + "loss": 5.042, + "loss/crossentropy": 2.1903880834579468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2833031117916107, + "step": 3048 + }, + { + "epoch": 0.061, + "grad_norm": 2.46875, + "grad_norm_var": 0.008893839518229167, + "learning_rate": 0.0001, + "loss": 4.9557, + "loss/crossentropy": 1.910680890083313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2708408683538437, + "step": 3050 + }, + { + "epoch": 0.06104, + "grad_norm": 2.546875, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.8902, + "loss/crossentropy": 2.5203059911727905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2981104403734207, + "step": 3052 + }, + { + "epoch": 0.06108, + "grad_norm": 2.5, + "grad_norm_var": 0.00611572265625, + "learning_rate": 0.0001, + "loss": 4.68, + "loss/crossentropy": 1.7918179035186768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2462354451417923, + "step": 3054 + }, + { + "epoch": 0.06112, + "grad_norm": 2.65625, + "grad_norm_var": 0.006441243489583333, + "learning_rate": 0.0001, + "loss": 4.9908, + "loss/crossentropy": 2.185121774673462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3046490252017975, + "step": 3056 + }, + { + "epoch": 0.06116, + "grad_norm": 2.28125, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.8041, + "loss/crossentropy": 1.859390914440155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523205131292343, + "step": 3058 + }, + { + "epoch": 0.0612, + "grad_norm": 2.40625, + "grad_norm_var": 0.011930338541666667, + "learning_rate": 0.0001, + "loss": 4.7388, + "loss/crossentropy": 1.9202255606651306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26971636712551117, + "step": 3060 + }, + { + "epoch": 0.06124, + "grad_norm": 2.578125, + "grad_norm_var": 0.011481730143229167, + "learning_rate": 0.0001, + "loss": 4.7421, + "loss/crossentropy": 2.059127449989319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25721821188926697, + "step": 3062 + }, + { + "epoch": 0.06128, + "grad_norm": 2.6875, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 5.1392, + "loss/crossentropy": 2.3587669134140015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30162671208381653, + "step": 3064 + }, + { + "epoch": 0.06132, + "grad_norm": 2.59375, + "grad_norm_var": 0.018050130208333334, + "learning_rate": 0.0001, + "loss": 5.2748, + "loss/crossentropy": 2.239704966545105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713697552680969, + "step": 3066 + }, + { + "epoch": 0.06136, + "grad_norm": 2.796875, + "grad_norm_var": 0.02076416015625, + "learning_rate": 0.0001, + "loss": 5.0534, + "loss/crossentropy": 2.230944514274597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27418144047260284, + "step": 3068 + }, + { + "epoch": 0.0614, + "grad_norm": 2.515625, + "grad_norm_var": 0.020563761393229168, + "learning_rate": 0.0001, + "loss": 4.9145, + "loss/crossentropy": 2.196586310863495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24946682900190353, + "step": 3070 + }, + { + "epoch": 0.06144, + "grad_norm": 2.65625, + "grad_norm_var": 0.019873046875, + "learning_rate": 0.0001, + "loss": 5.058, + "loss/crossentropy": 2.115275800228119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2668849602341652, + "step": 3072 + }, + { + "epoch": 0.06148, + "grad_norm": 2.765625, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 5.1311, + "loss/crossentropy": 2.0227994322776794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2587117701768875, + "step": 3074 + }, + { + "epoch": 0.06152, + "grad_norm": 2.421875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.711, + "loss/crossentropy": 2.231368660926819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621329501271248, + "step": 3076 + }, + { + "epoch": 0.06156, + "grad_norm": 2.421875, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 4.6707, + "loss/crossentropy": 2.469847083091736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2758704572916031, + "step": 3078 + }, + { + "epoch": 0.0616, + "grad_norm": 2.5, + "grad_norm_var": 0.01666259765625, + "learning_rate": 0.0001, + "loss": 4.5753, + "loss/crossentropy": 2.257850766181946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28196004033088684, + "step": 3080 + }, + { + "epoch": 0.06164, + "grad_norm": 2.46875, + "grad_norm_var": 0.014969889322916667, + "learning_rate": 0.0001, + "loss": 4.8783, + "loss/crossentropy": 2.52754008769989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.301498681306839, + "step": 3082 + }, + { + "epoch": 0.06168, + "grad_norm": 2.484375, + "grad_norm_var": 0.012279256184895834, + "learning_rate": 0.0001, + "loss": 4.6595, + "loss/crossentropy": 2.120129644870758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30073782801628113, + "step": 3084 + }, + { + "epoch": 0.06172, + "grad_norm": 2.546875, + "grad_norm_var": 0.0231353759765625, + "learning_rate": 0.0001, + "loss": 5.017, + "loss/crossentropy": 2.1483632922172546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29400819540023804, + "step": 3086 + }, + { + "epoch": 0.06176, + "grad_norm": 2.59375, + "grad_norm_var": 0.022297159830729166, + "learning_rate": 0.0001, + "loss": 5.2447, + "loss/crossentropy": 2.5588048696517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3099561035633087, + "step": 3088 + }, + { + "epoch": 0.0618, + "grad_norm": 2.359375, + "grad_norm_var": 0.019852701822916666, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.338167190551758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638262137770653, + "step": 3090 + }, + { + "epoch": 0.06184, + "grad_norm": 2.5, + "grad_norm_var": 0.019603474934895834, + "learning_rate": 0.0001, + "loss": 5.0471, + "loss/crossentropy": 2.61361825466156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29420773684978485, + "step": 3092 + }, + { + "epoch": 0.06188, + "grad_norm": 2.5625, + "grad_norm_var": 0.019108072916666666, + "learning_rate": 0.0001, + "loss": 4.6931, + "loss/crossentropy": 2.1078842878341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27283959090709686, + "step": 3094 + }, + { + "epoch": 0.06192, + "grad_norm": 2.296875, + "grad_norm_var": 0.023958333333333335, + "learning_rate": 0.0001, + "loss": 4.463, + "loss/crossentropy": 2.1501123905181885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3052368611097336, + "step": 3096 + }, + { + "epoch": 0.06196, + "grad_norm": 2.671875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 5.0482, + "loss/crossentropy": 2.3622714281082153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2851791977882385, + "step": 3098 + }, + { + "epoch": 0.062, + "grad_norm": 2.671875, + "grad_norm_var": 0.05203450520833333, + "learning_rate": 0.0001, + "loss": 4.3506, + "loss/crossentropy": 1.9757406115531921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24451570957899094, + "step": 3100 + }, + { + "epoch": 0.06204, + "grad_norm": 2.859375, + "grad_norm_var": 0.07011311848958333, + "learning_rate": 0.0001, + "loss": 5.0409, + "loss/crossentropy": 2.1915838718414307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2749434858560562, + "step": 3102 + }, + { + "epoch": 0.06208, + "grad_norm": 2.78125, + "grad_norm_var": 0.07214253743489583, + "learning_rate": 0.0001, + "loss": 4.8408, + "loss/crossentropy": 2.130649447441101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2590179592370987, + "step": 3104 + }, + { + "epoch": 0.06212, + "grad_norm": 2.59375, + "grad_norm_var": 0.06317952473958334, + "learning_rate": 0.0001, + "loss": 4.9083, + "loss/crossentropy": 2.4478825330734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30854369699954987, + "step": 3106 + }, + { + "epoch": 0.06216, + "grad_norm": 2.578125, + "grad_norm_var": 0.06524149576822917, + "learning_rate": 0.0001, + "loss": 4.8249, + "loss/crossentropy": 1.7108858227729797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22502756118774414, + "step": 3108 + }, + { + "epoch": 0.0622, + "grad_norm": 2.46875, + "grad_norm_var": 0.06788736979166667, + "learning_rate": 0.0001, + "loss": 4.5355, + "loss/crossentropy": 1.79804128408432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24767793715000153, + "step": 3110 + }, + { + "epoch": 0.06224, + "grad_norm": 3.28125, + "grad_norm_var": 0.07480061848958333, + "learning_rate": 0.0001, + "loss": 4.8779, + "loss/crossentropy": 1.899846076965332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23476070165634155, + "step": 3112 + }, + { + "epoch": 0.06228, + "grad_norm": 2.453125, + "grad_norm_var": 0.0790679931640625, + "learning_rate": 0.0001, + "loss": 4.7506, + "loss/crossentropy": 1.8458155393600464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24417708814144135, + "step": 3114 + }, + { + "epoch": 0.06232, + "grad_norm": 2.5, + "grad_norm_var": 0.06979166666666667, + "learning_rate": 0.0001, + "loss": 4.814, + "loss/crossentropy": 1.947302520275116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566901594400406, + "step": 3116 + }, + { + "epoch": 0.06236, + "grad_norm": 2.546875, + "grad_norm_var": 0.046923828125, + "learning_rate": 0.0001, + "loss": 4.8424, + "loss/crossentropy": 2.039161205291748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26955537497997284, + "step": 3118 + }, + { + "epoch": 0.0624, + "grad_norm": 2.65625, + "grad_norm_var": 0.04488016764322917, + "learning_rate": 0.0001, + "loss": 5.1656, + "loss/crossentropy": 2.1528661251068115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291052982211113, + "step": 3120 + }, + { + "epoch": 0.06244, + "grad_norm": 2.640625, + "grad_norm_var": 0.04644775390625, + "learning_rate": 0.0001, + "loss": 4.7941, + "loss/crossentropy": 2.4113996028900146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2962254136800766, + "step": 3122 + }, + { + "epoch": 0.06248, + "grad_norm": 2.65625, + "grad_norm_var": 0.044722493489583334, + "learning_rate": 0.0001, + "loss": 4.8463, + "loss/crossentropy": 2.0718825459480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24676042795181274, + "step": 3124 + }, + { + "epoch": 0.06252, + "grad_norm": 2.546875, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 4.9723, + "loss/crossentropy": 2.433539032936096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.263284370303154, + "step": 3126 + }, + { + "epoch": 0.06256, + "grad_norm": 2.421875, + "grad_norm_var": 0.0087799072265625, + "learning_rate": 0.0001, + "loss": 4.7688, + "loss/crossentropy": 2.047150671482086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655043303966522, + "step": 3128 + }, + { + "epoch": 0.0626, + "grad_norm": 2.546875, + "grad_norm_var": 0.008219401041666666, + "learning_rate": 0.0001, + "loss": 4.9257, + "loss/crossentropy": 1.8897674679756165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314881831407547, + "step": 3130 + }, + { + "epoch": 0.06264, + "grad_norm": 2.390625, + "grad_norm_var": 0.00797119140625, + "learning_rate": 0.0001, + "loss": 4.8053, + "loss/crossentropy": 2.1436809301376343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26931750774383545, + "step": 3132 + }, + { + "epoch": 0.06268, + "grad_norm": 2.421875, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.67, + "loss/crossentropy": 1.9773973226547241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26880529522895813, + "step": 3134 + }, + { + "epoch": 0.06272, + "grad_norm": 2.984375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 5.1236, + "loss/crossentropy": 2.208973228931427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29979653656482697, + "step": 3136 + }, + { + "epoch": 0.06276, + "grad_norm": 2.8125, + "grad_norm_var": 0.026676432291666666, + "learning_rate": 0.0001, + "loss": 5.0619, + "loss/crossentropy": 2.64120090007782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29249200224876404, + "step": 3138 + }, + { + "epoch": 0.0628, + "grad_norm": 2.703125, + "grad_norm_var": 0.0306549072265625, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.293095588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2689897269010544, + "step": 3140 + }, + { + "epoch": 0.06284, + "grad_norm": 2.5625, + "grad_norm_var": 0.028758748372395834, + "learning_rate": 0.0001, + "loss": 4.71, + "loss/crossentropy": 2.455227494239807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3202812373638153, + "step": 3142 + }, + { + "epoch": 0.06288, + "grad_norm": 2.578125, + "grad_norm_var": 0.0253814697265625, + "learning_rate": 0.0001, + "loss": 4.9622, + "loss/crossentropy": 1.9667487740516663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2702512592077255, + "step": 3144 + }, + { + "epoch": 0.06292, + "grad_norm": 2.578125, + "grad_norm_var": 0.024637858072916668, + "learning_rate": 0.0001, + "loss": 4.7756, + "loss/crossentropy": 1.8181490898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310035452246666, + "step": 3146 + }, + { + "epoch": 0.06296, + "grad_norm": 2.640625, + "grad_norm_var": 0.0373687744140625, + "learning_rate": 0.0001, + "loss": 4.9556, + "loss/crossentropy": 1.9985284805297852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2944463640451431, + "step": 3148 + }, + { + "epoch": 0.063, + "grad_norm": 2.28125, + "grad_norm_var": 0.04142964680989583, + "learning_rate": 0.0001, + "loss": 5.0316, + "loss/crossentropy": 2.3850419521331787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29605095088481903, + "step": 3150 + }, + { + "epoch": 0.06304, + "grad_norm": 4.125, + "grad_norm_var": 0.17021077473958332, + "learning_rate": 0.0001, + "loss": 5.2062, + "loss/crossentropy": 2.3815460205078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30247962474823, + "step": 3152 + }, + { + "epoch": 0.06308, + "grad_norm": 3.421875, + "grad_norm_var": 0.19840087890625, + "learning_rate": 0.0001, + "loss": 4.5851, + "loss/crossentropy": 1.995104193687439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2506560683250427, + "step": 3154 + }, + { + "epoch": 0.06312, + "grad_norm": 3.328125, + "grad_norm_var": 0.2305084228515625, + "learning_rate": 0.0001, + "loss": 4.5885, + "loss/crossentropy": 2.2037696838378906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2716705799102783, + "step": 3156 + }, + { + "epoch": 0.06316, + "grad_norm": 2.84375, + "grad_norm_var": 0.23944905598958333, + "learning_rate": 0.0001, + "loss": 4.7155, + "loss/crossentropy": 2.067265272140503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25495442003011703, + "step": 3158 + }, + { + "epoch": 0.0632, + "grad_norm": 2.78125, + "grad_norm_var": 0.23763020833333334, + "learning_rate": 0.0001, + "loss": 5.1219, + "loss/crossentropy": 2.227355480194092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27113111317157745, + "step": 3160 + }, + { + "epoch": 0.06324, + "grad_norm": 2.546875, + "grad_norm_var": 0.23921610514322916, + "learning_rate": 0.0001, + "loss": 4.6777, + "loss/crossentropy": 1.9077441096305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2630976662039757, + "step": 3162 + }, + { + "epoch": 0.06328, + "grad_norm": 2.59375, + "grad_norm_var": 0.23515523274739583, + "learning_rate": 0.0001, + "loss": 5.0084, + "loss/crossentropy": 2.1737552881240845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28365984559059143, + "step": 3164 + }, + { + "epoch": 0.06332, + "grad_norm": 2.5625, + "grad_norm_var": 0.22542317708333334, + "learning_rate": 0.0001, + "loss": 4.9822, + "loss/crossentropy": 1.9357402920722961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27245980501174927, + "step": 3166 + }, + { + "epoch": 0.06336, + "grad_norm": 2.390625, + "grad_norm_var": 0.0995269775390625, + "learning_rate": 0.0001, + "loss": 4.7681, + "loss/crossentropy": 1.8484191298484802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2933400571346283, + "step": 3168 + }, + { + "epoch": 0.0634, + "grad_norm": 2.609375, + "grad_norm_var": 0.057738240559895834, + "learning_rate": 0.0001, + "loss": 5.0719, + "loss/crossentropy": 2.3085306882858276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2926720678806305, + "step": 3170 + }, + { + "epoch": 0.06344, + "grad_norm": 2.78125, + "grad_norm_var": 0.040380859375, + "learning_rate": 0.0001, + "loss": 5.1464, + "loss/crossentropy": 2.080895185470581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638559564948082, + "step": 3172 + }, + { + "epoch": 0.06348, + "grad_norm": 2.53125, + "grad_norm_var": 0.03287353515625, + "learning_rate": 0.0001, + "loss": 4.6857, + "loss/crossentropy": 2.035506248474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692716419696808, + "step": 3174 + }, + { + "epoch": 0.06352, + "grad_norm": 2.859375, + "grad_norm_var": 0.035252888997395836, + "learning_rate": 0.0001, + "loss": 4.9479, + "loss/crossentropy": 2.1712740659713745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26579485833644867, + "step": 3176 + }, + { + "epoch": 0.06356, + "grad_norm": 2.96875, + "grad_norm_var": 0.04252827962239583, + "learning_rate": 0.0001, + "loss": 5.1654, + "loss/crossentropy": 2.2256147861480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27605514228343964, + "step": 3178 + }, + { + "epoch": 0.0636, + "grad_norm": 2.359375, + "grad_norm_var": 0.046507771809895834, + "learning_rate": 0.0001, + "loss": 4.6489, + "loss/crossentropy": 2.0917118191719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560323476791382, + "step": 3180 + }, + { + "epoch": 0.06364, + "grad_norm": 2.671875, + "grad_norm_var": 0.045491536458333336, + "learning_rate": 0.0001, + "loss": 4.9833, + "loss/crossentropy": 2.3109938502311707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27040477097034454, + "step": 3182 + }, + { + "epoch": 0.06368, + "grad_norm": 2.65625, + "grad_norm_var": 0.03984273274739583, + "learning_rate": 0.0001, + "loss": 5.1441, + "loss/crossentropy": 2.3505672812461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29180124402046204, + "step": 3184 + }, + { + "epoch": 0.06372, + "grad_norm": 2.578125, + "grad_norm_var": 0.04014383951822917, + "learning_rate": 0.0001, + "loss": 5.0224, + "loss/crossentropy": 2.2471169233322144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2676118314266205, + "step": 3186 + }, + { + "epoch": 0.06376, + "grad_norm": 2.46875, + "grad_norm_var": 0.0264312744140625, + "learning_rate": 0.0001, + "loss": 4.8247, + "loss/crossentropy": 2.3312125205993652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29243919253349304, + "step": 3188 + }, + { + "epoch": 0.0638, + "grad_norm": 2.4375, + "grad_norm_var": 0.02681884765625, + "learning_rate": 0.0001, + "loss": 4.9374, + "loss/crossentropy": 2.1709930896759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2786664366722107, + "step": 3190 + }, + { + "epoch": 0.06384, + "grad_norm": 2.609375, + "grad_norm_var": 0.02476806640625, + "learning_rate": 0.0001, + "loss": 4.7391, + "loss/crossentropy": 1.8477665185928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26132869720458984, + "step": 3192 + }, + { + "epoch": 0.06388, + "grad_norm": 2.6875, + "grad_norm_var": 0.01724853515625, + "learning_rate": 0.0001, + "loss": 4.8167, + "loss/crossentropy": 2.2102121114730835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26027651876211166, + "step": 3194 + }, + { + "epoch": 0.06392, + "grad_norm": 2.640625, + "grad_norm_var": 0.0168365478515625, + "learning_rate": 0.0001, + "loss": 4.6381, + "loss/crossentropy": 2.0011088252067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25629256665706635, + "step": 3196 + }, + { + "epoch": 0.06396, + "grad_norm": 2.625, + "grad_norm_var": 0.01685791015625, + "learning_rate": 0.0001, + "loss": 4.9215, + "loss/crossentropy": 2.319412350654602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28315384685993195, + "step": 3198 + }, + { + "epoch": 0.064, + "grad_norm": 2.328125, + "grad_norm_var": 0.0145172119140625, + "learning_rate": 0.0001, + "loss": 4.7936, + "loss/crossentropy": 2.192026138305664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.264492504298687, + "step": 3200 + }, + { + "epoch": 0.06404, + "grad_norm": 2.78125, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 4.8178, + "loss/crossentropy": 2.1745734214782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29369065165519714, + "step": 3202 + }, + { + "epoch": 0.06408, + "grad_norm": 3.109375, + "grad_norm_var": 0.042313639322916666, + "learning_rate": 0.0001, + "loss": 5.577, + "loss/crossentropy": 2.5039013624191284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835303544998169, + "step": 3204 + }, + { + "epoch": 0.06412, + "grad_norm": 2.53125, + "grad_norm_var": 0.04075419108072917, + "learning_rate": 0.0001, + "loss": 5.0308, + "loss/crossentropy": 2.4255706071853638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3014884740114212, + "step": 3206 + }, + { + "epoch": 0.06416, + "grad_norm": 2.5625, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 5.002, + "loss/crossentropy": 2.266150116920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698349952697754, + "step": 3208 + }, + { + "epoch": 0.0642, + "grad_norm": 2.4375, + "grad_norm_var": 0.03805338541666667, + "learning_rate": 0.0001, + "loss": 5.0173, + "loss/crossentropy": 2.2042946815490723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28830648958683014, + "step": 3210 + }, + { + "epoch": 0.06424, + "grad_norm": 2.5625, + "grad_norm_var": 0.03578999837239583, + "learning_rate": 0.0001, + "loss": 5.0292, + "loss/crossentropy": 2.4034690856933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2834039032459259, + "step": 3212 + }, + { + "epoch": 0.06428, + "grad_norm": 2.4375, + "grad_norm_var": 0.0444000244140625, + "learning_rate": 0.0001, + "loss": 4.6655, + "loss/crossentropy": 2.1347755193710327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26405099034309387, + "step": 3214 + }, + { + "epoch": 0.06432, + "grad_norm": 3.34375, + "grad_norm_var": 0.07415262858072917, + "learning_rate": 0.0001, + "loss": 5.0862, + "loss/crossentropy": 2.018262207508087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27144598215818405, + "step": 3216 + }, + { + "epoch": 0.06436, + "grad_norm": 2.46875, + "grad_norm_var": 0.076953125, + "learning_rate": 0.0001, + "loss": 4.3969, + "loss/crossentropy": 2.0254003405570984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24976836144924164, + "step": 3218 + }, + { + "epoch": 0.0644, + "grad_norm": 2.78125, + "grad_norm_var": 0.06026102701822917, + "learning_rate": 0.0001, + "loss": 4.6779, + "loss/crossentropy": 2.1952659487724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2786559462547302, + "step": 3220 + }, + { + "epoch": 0.06444, + "grad_norm": 2.609375, + "grad_norm_var": 0.06024983723958333, + "learning_rate": 0.0001, + "loss": 4.8828, + "loss/crossentropy": 2.1383036375045776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2570757120847702, + "step": 3222 + }, + { + "epoch": 0.06448, + "grad_norm": 2.65625, + "grad_norm_var": 0.05915425618489583, + "learning_rate": 0.0001, + "loss": 4.8248, + "loss/crossentropy": 2.267812967300415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2774328589439392, + "step": 3224 + }, + { + "epoch": 0.06452, + "grad_norm": 2.515625, + "grad_norm_var": 0.0587066650390625, + "learning_rate": 0.0001, + "loss": 4.9389, + "loss/crossentropy": 1.906205415725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528962790966034, + "step": 3226 + }, + { + "epoch": 0.06456, + "grad_norm": 2.453125, + "grad_norm_var": 0.05895182291666667, + "learning_rate": 0.0001, + "loss": 4.9823, + "loss/crossentropy": 2.1628336906433105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27087944746017456, + "step": 3228 + }, + { + "epoch": 0.0646, + "grad_norm": 2.390625, + "grad_norm_var": 0.05807291666666667, + "learning_rate": 0.0001, + "loss": 4.5893, + "loss/crossentropy": 1.8845162391662598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241554707288742, + "step": 3230 + }, + { + "epoch": 0.06464, + "grad_norm": 2.390625, + "grad_norm_var": 0.019237263997395834, + "learning_rate": 0.0001, + "loss": 4.9143, + "loss/crossentropy": 2.4157146215438843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2770439088344574, + "step": 3232 + }, + { + "epoch": 0.06468, + "grad_norm": 2.515625, + "grad_norm_var": 0.021825154622395832, + "learning_rate": 0.0001, + "loss": 4.832, + "loss/crossentropy": 1.9979816675186157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23696578294038773, + "step": 3234 + }, + { + "epoch": 0.06472, + "grad_norm": 2.484375, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 5.1123, + "loss/crossentropy": 2.1790190935134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.252600260078907, + "step": 3236 + }, + { + "epoch": 0.06476, + "grad_norm": 2.4375, + "grad_norm_var": 0.016624959309895833, + "learning_rate": 0.0001, + "loss": 4.5943, + "loss/crossentropy": 2.1612111926078796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27929094433784485, + "step": 3238 + }, + { + "epoch": 0.0648, + "grad_norm": 2.546875, + "grad_norm_var": 0.08124898274739584, + "learning_rate": 0.0001, + "loss": 4.8293, + "loss/crossentropy": 2.261025071144104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3034510314464569, + "step": 3240 + }, + { + "epoch": 0.06484, + "grad_norm": 2.46875, + "grad_norm_var": 0.081689453125, + "learning_rate": 0.0001, + "loss": 4.8243, + "loss/crossentropy": 2.02247554063797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655429244041443, + "step": 3242 + }, + { + "epoch": 0.06488, + "grad_norm": 2.390625, + "grad_norm_var": 0.08289286295572916, + "learning_rate": 0.0001, + "loss": 4.9428, + "loss/crossentropy": 2.495269775390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32837189733982086, + "step": 3244 + }, + { + "epoch": 0.06492, + "grad_norm": 2.53125, + "grad_norm_var": 0.07550455729166666, + "learning_rate": 0.0001, + "loss": 4.9928, + "loss/crossentropy": 2.365166425704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27000221610069275, + "step": 3246 + }, + { + "epoch": 0.06496, + "grad_norm": 2.4375, + "grad_norm_var": 0.0746246337890625, + "learning_rate": 0.0001, + "loss": 4.837, + "loss/crossentropy": 1.8728906512260437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2711311876773834, + "step": 3248 + }, + { + "epoch": 0.065, + "grad_norm": 2.375, + "grad_norm_var": 0.07427978515625, + "learning_rate": 0.0001, + "loss": 4.7916, + "loss/crossentropy": 1.916531264781952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23036544770002365, + "step": 3250 + }, + { + "epoch": 0.06504, + "grad_norm": 2.421875, + "grad_norm_var": 0.07463277180989583, + "learning_rate": 0.0001, + "loss": 4.9399, + "loss/crossentropy": 1.996503233909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24317056685686111, + "step": 3252 + }, + { + "epoch": 0.06508, + "grad_norm": 2.5625, + "grad_norm_var": 0.07366536458333334, + "learning_rate": 0.0001, + "loss": 4.8746, + "loss/crossentropy": 2.101921260356903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775810658931732, + "step": 3254 + }, + { + "epoch": 0.06512, + "grad_norm": 2.40625, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.7526, + "loss/crossentropy": 1.9286046028137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23964455723762512, + "step": 3256 + }, + { + "epoch": 0.06516, + "grad_norm": 2.40625, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.8216, + "loss/crossentropy": 2.224915862083435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27516382932662964, + "step": 3258 + }, + { + "epoch": 0.0652, + "grad_norm": 2.34375, + "grad_norm_var": 0.0133697509765625, + "learning_rate": 0.0001, + "loss": 4.6553, + "loss/crossentropy": 2.259337306022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27185751497745514, + "step": 3260 + }, + { + "epoch": 0.06524, + "grad_norm": 2.484375, + "grad_norm_var": 0.013117472330729166, + "learning_rate": 0.0001, + "loss": 4.7095, + "loss/crossentropy": 2.1081044673919678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23740330338478088, + "step": 3262 + }, + { + "epoch": 0.06528, + "grad_norm": 2.546875, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.6375, + "loss/crossentropy": 1.9032058119773865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508246824145317, + "step": 3264 + }, + { + "epoch": 0.06532, + "grad_norm": 2.671875, + "grad_norm_var": 0.013353474934895833, + "learning_rate": 0.0001, + "loss": 4.7165, + "loss/crossentropy": 2.0773792266845703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2797544598579407, + "step": 3266 + }, + { + "epoch": 0.06536, + "grad_norm": 2.40625, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.9136, + "loss/crossentropy": 2.1591526865959167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2540442571043968, + "step": 3268 + }, + { + "epoch": 0.0654, + "grad_norm": 2.546875, + "grad_norm_var": 0.014351399739583333, + "learning_rate": 0.0001, + "loss": 4.6248, + "loss/crossentropy": 1.7735809683799744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24107928574085236, + "step": 3270 + }, + { + "epoch": 0.06544, + "grad_norm": 2.5, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.069553792476654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26356737315654755, + "step": 3272 + }, + { + "epoch": 0.06548, + "grad_norm": 2.453125, + "grad_norm_var": 0.014623006184895834, + "learning_rate": 0.0001, + "loss": 4.7274, + "loss/crossentropy": 1.9688642024993896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29453104734420776, + "step": 3274 + }, + { + "epoch": 0.06552, + "grad_norm": 2.296875, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.4264, + "loss/crossentropy": 1.785252034664154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23213861882686615, + "step": 3276 + }, + { + "epoch": 0.06556, + "grad_norm": 2.65625, + "grad_norm_var": 0.013255818684895834, + "learning_rate": 0.0001, + "loss": 4.779, + "loss/crossentropy": 2.155774712562561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758183747529984, + "step": 3278 + }, + { + "epoch": 0.0656, + "grad_norm": 2.5625, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 5.0453, + "loss/crossentropy": 2.0403348803520203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25007252395153046, + "step": 3280 + }, + { + "epoch": 0.06564, + "grad_norm": 2.4375, + "grad_norm_var": 0.015208943684895834, + "learning_rate": 0.0001, + "loss": 4.7161, + "loss/crossentropy": 1.9963608384132385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608266994357109, + "step": 3282 + }, + { + "epoch": 0.06568, + "grad_norm": 2.75, + "grad_norm_var": 0.019364420572916666, + "learning_rate": 0.0001, + "loss": 4.9275, + "loss/crossentropy": 2.2021098732948303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28361018002033234, + "step": 3284 + }, + { + "epoch": 0.06572, + "grad_norm": 2.59375, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 4.6533, + "loss/crossentropy": 2.0363903641700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2641526162624359, + "step": 3286 + }, + { + "epoch": 0.06576, + "grad_norm": 2.671875, + "grad_norm_var": 0.020466105143229166, + "learning_rate": 0.0001, + "loss": 4.6884, + "loss/crossentropy": 2.1052531003952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344927817583084, + "step": 3288 + }, + { + "epoch": 0.0658, + "grad_norm": 2.578125, + "grad_norm_var": 0.02027587890625, + "learning_rate": 0.0001, + "loss": 4.9246, + "loss/crossentropy": 2.0664029717445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25679293274879456, + "step": 3290 + }, + { + "epoch": 0.06584, + "grad_norm": 2.625, + "grad_norm_var": 0.013678995768229167, + "learning_rate": 0.0001, + "loss": 4.7802, + "loss/crossentropy": 2.168351709842682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759328931570053, + "step": 3292 + }, + { + "epoch": 0.06588, + "grad_norm": 2.515625, + "grad_norm_var": 0.013703409830729167, + "learning_rate": 0.0001, + "loss": 4.8481, + "loss/crossentropy": 1.9305949211120605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26165173947811127, + "step": 3294 + }, + { + "epoch": 0.06592, + "grad_norm": 2.4375, + "grad_norm_var": 0.017476399739583332, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 1.9257155060768127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24499841034412384, + "step": 3296 + }, + { + "epoch": 0.06596, + "grad_norm": 2.484375, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.8941, + "loss/crossentropy": 2.1406426429748535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26676414906978607, + "step": 3298 + }, + { + "epoch": 0.066, + "grad_norm": 2.734375, + "grad_norm_var": 0.016422526041666666, + "learning_rate": 0.0001, + "loss": 4.8293, + "loss/crossentropy": 2.099781036376953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2916935384273529, + "step": 3300 + }, + { + "epoch": 0.06604, + "grad_norm": 2.875, + "grad_norm_var": 0.023094685872395833, + "learning_rate": 0.0001, + "loss": 4.9504, + "loss/crossentropy": 2.1077913641929626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667912393808365, + "step": 3302 + }, + { + "epoch": 0.06608, + "grad_norm": 3.90625, + "grad_norm_var": 0.1297271728515625, + "learning_rate": 0.0001, + "loss": 5.2952, + "loss/crossentropy": 2.2583223581314087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2907385528087616, + "step": 3304 + }, + { + "epoch": 0.06612, + "grad_norm": 2.71875, + "grad_norm_var": 0.13405659993489583, + "learning_rate": 0.0001, + "loss": 4.5813, + "loss/crossentropy": 1.847477912902832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2272188812494278, + "step": 3306 + }, + { + "epoch": 0.06616, + "grad_norm": 2.40625, + "grad_norm_var": 0.1392486572265625, + "learning_rate": 0.0001, + "loss": 4.6502, + "loss/crossentropy": 1.7610225677490234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23973071575164795, + "step": 3308 + }, + { + "epoch": 0.0662, + "grad_norm": 2.328125, + "grad_norm_var": 0.1464019775390625, + "learning_rate": 0.0001, + "loss": 4.7123, + "loss/crossentropy": 1.977916419506073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25456882268190384, + "step": 3310 + }, + { + "epoch": 0.06624, + "grad_norm": 2.734375, + "grad_norm_var": 0.14095052083333334, + "learning_rate": 0.0001, + "loss": 4.7353, + "loss/crossentropy": 2.2196428775787354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563377171754837, + "step": 3312 + }, + { + "epoch": 0.06628, + "grad_norm": 2.53125, + "grad_norm_var": 0.14537760416666667, + "learning_rate": 0.0001, + "loss": 4.6907, + "loss/crossentropy": 2.0861470699310303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27121224999427795, + "step": 3314 + }, + { + "epoch": 0.06632, + "grad_norm": 2.53125, + "grad_norm_var": 0.14287821451822916, + "learning_rate": 0.0001, + "loss": 4.6484, + "loss/crossentropy": 1.9716283679008484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.277963787317276, + "step": 3316 + }, + { + "epoch": 0.06636, + "grad_norm": 2.609375, + "grad_norm_var": 0.1375, + "learning_rate": 0.0001, + "loss": 4.7985, + "loss/crossentropy": 2.5604729652404785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31031742691993713, + "step": 3318 + }, + { + "epoch": 0.0664, + "grad_norm": 2.71875, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 5.4329, + "loss/crossentropy": 2.2672252655029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29225634038448334, + "step": 3320 + }, + { + "epoch": 0.06644, + "grad_norm": 2.671875, + "grad_norm_var": 0.020015462239583334, + "learning_rate": 0.0001, + "loss": 5.0557, + "loss/crossentropy": 2.0992931723594666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648598402738571, + "step": 3322 + }, + { + "epoch": 0.06648, + "grad_norm": 2.65625, + "grad_norm_var": 0.014827473958333334, + "learning_rate": 0.0001, + "loss": 5.0895, + "loss/crossentropy": 2.208917260169983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2998420298099518, + "step": 3324 + }, + { + "epoch": 0.06652, + "grad_norm": 2.671875, + "grad_norm_var": 0.012791951497395834, + "learning_rate": 0.0001, + "loss": 4.8834, + "loss/crossentropy": 2.290796995162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638905793428421, + "step": 3326 + }, + { + "epoch": 0.06656, + "grad_norm": 2.375, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.8521, + "loss/crossentropy": 2.4156445264816284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28743064403533936, + "step": 3328 + }, + { + "epoch": 0.0666, + "grad_norm": 2.734375, + "grad_norm_var": 0.012132771809895833, + "learning_rate": 0.0001, + "loss": 5.2205, + "loss/crossentropy": 2.4604904651641846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27585768699645996, + "step": 3330 + }, + { + "epoch": 0.06664, + "grad_norm": 2.421875, + "grad_norm_var": 0.013818359375, + "learning_rate": 0.0001, + "loss": 4.8296, + "loss/crossentropy": 1.8613844513893127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27038969844579697, + "step": 3332 + }, + { + "epoch": 0.06668, + "grad_norm": 2.53125, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.8777, + "loss/crossentropy": 2.232776403427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2686954140663147, + "step": 3334 + }, + { + "epoch": 0.06672, + "grad_norm": 2.5625, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 4.9536, + "loss/crossentropy": 2.070033550262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26176171004772186, + "step": 3336 + }, + { + "epoch": 0.06676, + "grad_norm": 2.4375, + "grad_norm_var": 0.01187744140625, + "learning_rate": 0.0001, + "loss": 4.8616, + "loss/crossentropy": 2.38937509059906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2539200633764267, + "step": 3338 + }, + { + "epoch": 0.0668, + "grad_norm": 2.53125, + "grad_norm_var": 0.010933430989583333, + "learning_rate": 0.0001, + "loss": 5.1104, + "loss/crossentropy": 2.272845983505249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2856733053922653, + "step": 3340 + }, + { + "epoch": 0.06684, + "grad_norm": 2.5625, + "grad_norm_var": 0.01181640625, + "learning_rate": 0.0001, + "loss": 4.7935, + "loss/crossentropy": 2.427489161491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30572691559791565, + "step": 3342 + }, + { + "epoch": 0.06688, + "grad_norm": 2.59375, + "grad_norm_var": 0.009407552083333333, + "learning_rate": 0.0001, + "loss": 4.9801, + "loss/crossentropy": 2.4701327085494995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28469331562519073, + "step": 3344 + }, + { + "epoch": 0.06692, + "grad_norm": 2.421875, + "grad_norm_var": 0.006917317708333333, + "learning_rate": 0.0001, + "loss": 4.815, + "loss/crossentropy": 2.0733558535575867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.292859822511673, + "step": 3346 + }, + { + "epoch": 0.06696, + "grad_norm": 2.640625, + "grad_norm_var": 0.0061187744140625, + "learning_rate": 0.0001, + "loss": 4.8379, + "loss/crossentropy": 2.301755905151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3174070864915848, + "step": 3348 + }, + { + "epoch": 0.067, + "grad_norm": 2.234375, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 4.3359, + "loss/crossentropy": 2.039419114589691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24543144553899765, + "step": 3350 + }, + { + "epoch": 0.06704, + "grad_norm": 2.4375, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.7938, + "loss/crossentropy": 1.9247611165046692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595005929470062, + "step": 3352 + }, + { + "epoch": 0.06708, + "grad_norm": 2.34375, + "grad_norm_var": 0.01630859375, + "learning_rate": 0.0001, + "loss": 4.8294, + "loss/crossentropy": 2.224974751472473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26880575716495514, + "step": 3354 + }, + { + "epoch": 0.06712, + "grad_norm": 2.734375, + "grad_norm_var": 0.0195220947265625, + "learning_rate": 0.0001, + "loss": 4.8922, + "loss/crossentropy": 2.2601993083953857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.279633030295372, + "step": 3356 + }, + { + "epoch": 0.06716, + "grad_norm": 2.4375, + "grad_norm_var": 0.017085774739583334, + "learning_rate": 0.0001, + "loss": 4.5911, + "loss/crossentropy": 1.8156417608261108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22118539363145828, + "step": 3358 + }, + { + "epoch": 0.0672, + "grad_norm": 2.734375, + "grad_norm_var": 0.020140584309895834, + "learning_rate": 0.0001, + "loss": 5.2032, + "loss/crossentropy": 2.207859516143799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2771998345851898, + "step": 3360 + }, + { + "epoch": 0.06724, + "grad_norm": 2.453125, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 4.9352, + "loss/crossentropy": 1.9886083602905273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545766308903694, + "step": 3362 + }, + { + "epoch": 0.06728, + "grad_norm": 2.234375, + "grad_norm_var": 0.025777180989583332, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 1.7046592235565186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207612618803978, + "step": 3364 + }, + { + "epoch": 0.06732, + "grad_norm": 2.453125, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 5.0209, + "loss/crossentropy": 2.0283663868904114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743644416332245, + "step": 3366 + }, + { + "epoch": 0.06736, + "grad_norm": 2.453125, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4788, + "loss/crossentropy": 1.975772500038147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505730241537094, + "step": 3368 + }, + { + "epoch": 0.0674, + "grad_norm": 2.5, + "grad_norm_var": 0.019950358072916667, + "learning_rate": 0.0001, + "loss": 4.6863, + "loss/crossentropy": 1.9021872282028198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25430944561958313, + "step": 3370 + }, + { + "epoch": 0.06744, + "grad_norm": 2.671875, + "grad_norm_var": 0.0182037353515625, + "learning_rate": 0.0001, + "loss": 5.1859, + "loss/crossentropy": 2.3888463973999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27896443009376526, + "step": 3372 + }, + { + "epoch": 0.06748, + "grad_norm": 2.71875, + "grad_norm_var": 0.02086181640625, + "learning_rate": 0.0001, + "loss": 4.6041, + "loss/crossentropy": 1.7844690680503845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536455988883972, + "step": 3374 + }, + { + "epoch": 0.06752, + "grad_norm": 2.703125, + "grad_norm_var": 0.021442667643229166, + "learning_rate": 0.0001, + "loss": 4.4538, + "loss/crossentropy": 1.919598639011383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26129309833049774, + "step": 3376 + }, + { + "epoch": 0.06756, + "grad_norm": 2.671875, + "grad_norm_var": 0.022835286458333333, + "learning_rate": 0.0001, + "loss": 5.0568, + "loss/crossentropy": 2.2292110919952393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25612247735261917, + "step": 3378 + }, + { + "epoch": 0.0676, + "grad_norm": 3.4375, + "grad_norm_var": 0.0779296875, + "learning_rate": 0.0001, + "loss": 4.8674, + "loss/crossentropy": 2.2235841751098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2607487738132477, + "step": 3380 + }, + { + "epoch": 0.06764, + "grad_norm": 2.34375, + "grad_norm_var": 0.08089192708333333, + "learning_rate": 0.0001, + "loss": 4.5872, + "loss/crossentropy": 2.1375235319137573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2344205006957054, + "step": 3382 + }, + { + "epoch": 0.06768, + "grad_norm": 2.28125, + "grad_norm_var": 0.08501688639322917, + "learning_rate": 0.0001, + "loss": 4.6076, + "loss/crossentropy": 2.020237445831299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550392523407936, + "step": 3384 + }, + { + "epoch": 0.06772, + "grad_norm": 2.734375, + "grad_norm_var": 0.07911783854166667, + "learning_rate": 0.0001, + "loss": 4.6939, + "loss/crossentropy": 2.138959765434265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2742607295513153, + "step": 3386 + }, + { + "epoch": 0.06776, + "grad_norm": 2.578125, + "grad_norm_var": 0.07864481608072917, + "learning_rate": 0.0001, + "loss": 4.9065, + "loss/crossentropy": 2.521559953689575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2892928719520569, + "step": 3388 + }, + { + "epoch": 0.0678, + "grad_norm": 2.375, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 0.0001, + "loss": 5.049, + "loss/crossentropy": 2.169550120830536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27750439941883087, + "step": 3390 + }, + { + "epoch": 0.06784, + "grad_norm": 2.6875, + "grad_norm_var": 0.07464090983072917, + "learning_rate": 0.0001, + "loss": 4.7401, + "loss/crossentropy": 1.975584864616394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557516545057297, + "step": 3392 + }, + { + "epoch": 0.06788, + "grad_norm": 2.6875, + "grad_norm_var": 0.07617899576822916, + "learning_rate": 0.0001, + "loss": 4.9207, + "loss/crossentropy": 2.5837322473526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3444886952638626, + "step": 3394 + }, + { + "epoch": 0.06792, + "grad_norm": 2.546875, + "grad_norm_var": 0.022294108072916666, + "learning_rate": 0.0001, + "loss": 4.743, + "loss/crossentropy": 1.963110864162445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24120041728019714, + "step": 3396 + }, + { + "epoch": 0.06796, + "grad_norm": 2.65625, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 4.5759, + "loss/crossentropy": 2.381603956222534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26319295167922974, + "step": 3398 + }, + { + "epoch": 0.068, + "grad_norm": 2.25, + "grad_norm_var": 0.023078409830729167, + "learning_rate": 0.0001, + "loss": 4.5642, + "loss/crossentropy": 2.054026961326599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24540280550718307, + "step": 3400 + }, + { + "epoch": 0.06804, + "grad_norm": 2.46875, + "grad_norm_var": 0.0187652587890625, + "learning_rate": 0.0001, + "loss": 4.3661, + "loss/crossentropy": 2.047453820705414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524164840579033, + "step": 3402 + }, + { + "epoch": 0.06808, + "grad_norm": 2.390625, + "grad_norm_var": 0.018317667643229167, + "learning_rate": 0.0001, + "loss": 4.7839, + "loss/crossentropy": 1.8616933226585388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23905867338180542, + "step": 3404 + }, + { + "epoch": 0.06812, + "grad_norm": 2.4375, + "grad_norm_var": 0.016422526041666666, + "learning_rate": 0.0001, + "loss": 4.6932, + "loss/crossentropy": 2.4304568767547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2802550047636032, + "step": 3406 + }, + { + "epoch": 0.06816, + "grad_norm": 2.5, + "grad_norm_var": 0.01318359375, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 1.8178748488426208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24266308546066284, + "step": 3408 + }, + { + "epoch": 0.0682, + "grad_norm": 2.625, + "grad_norm_var": 0.0116363525390625, + "learning_rate": 0.0001, + "loss": 5.0152, + "loss/crossentropy": 2.025859773159027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.282450333237648, + "step": 3410 + }, + { + "epoch": 0.06824, + "grad_norm": 2.53125, + "grad_norm_var": 0.010383097330729167, + "learning_rate": 0.0001, + "loss": 4.737, + "loss/crossentropy": 2.032994568347931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26047058403491974, + "step": 3412 + }, + { + "epoch": 0.06828, + "grad_norm": 2.46875, + "grad_norm_var": 0.007840983072916667, + "learning_rate": 0.0001, + "loss": 4.6428, + "loss/crossentropy": 2.2468607425689697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814445495605469, + "step": 3414 + }, + { + "epoch": 0.06832, + "grad_norm": 2.75, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 4.6854, + "loss/crossentropy": 2.2534161806106567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3231179416179657, + "step": 3416 + }, + { + "epoch": 0.06836, + "grad_norm": 2.65625, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 4.9599, + "loss/crossentropy": 2.1678181886672974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2840191572904587, + "step": 3418 + }, + { + "epoch": 0.0684, + "grad_norm": 2.328125, + "grad_norm_var": 0.03453776041666667, + "learning_rate": 0.0001, + "loss": 4.8667, + "loss/crossentropy": 2.053459882736206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.271483838558197, + "step": 3420 + }, + { + "epoch": 0.06844, + "grad_norm": 2.53125, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 4.714, + "loss/crossentropy": 2.2278919219970703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2781473994255066, + "step": 3422 + }, + { + "epoch": 0.06848, + "grad_norm": 2.296875, + "grad_norm_var": 0.0384674072265625, + "learning_rate": 0.0001, + "loss": 4.5262, + "loss/crossentropy": 1.9582479000091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24171434342861176, + "step": 3424 + }, + { + "epoch": 0.06852, + "grad_norm": 2.359375, + "grad_norm_var": 0.04121805826822917, + "learning_rate": 0.0001, + "loss": 4.4734, + "loss/crossentropy": 1.808964192867279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22458729147911072, + "step": 3426 + }, + { + "epoch": 0.06856, + "grad_norm": 2.40625, + "grad_norm_var": 0.04296468098958333, + "learning_rate": 0.0001, + "loss": 4.464, + "loss/crossentropy": 1.9225260019302368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508438527584076, + "step": 3428 + }, + { + "epoch": 0.0686, + "grad_norm": 2.578125, + "grad_norm_var": 0.044465128580729166, + "learning_rate": 0.0001, + "loss": 4.9647, + "loss/crossentropy": 2.2446881532669067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26919613778591156, + "step": 3430 + }, + { + "epoch": 0.06864, + "grad_norm": 2.515625, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 4.6865, + "loss/crossentropy": 2.2047033309936523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2581355720758438, + "step": 3432 + }, + { + "epoch": 0.06868, + "grad_norm": 2.46875, + "grad_norm_var": 0.015729777018229165, + "learning_rate": 0.0001, + "loss": 4.7139, + "loss/crossentropy": 2.1223543882369995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25814756751060486, + "step": 3434 + }, + { + "epoch": 0.06872, + "grad_norm": 2.59375, + "grad_norm_var": 0.016813151041666665, + "learning_rate": 0.0001, + "loss": 4.6771, + "loss/crossentropy": 2.1641053557395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25222497433423996, + "step": 3436 + }, + { + "epoch": 0.06876, + "grad_norm": 2.640625, + "grad_norm_var": 0.0189605712890625, + "learning_rate": 0.0001, + "loss": 4.6972, + "loss/crossentropy": 1.9569795727729797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681007981300354, + "step": 3438 + }, + { + "epoch": 0.0688, + "grad_norm": 2.453125, + "grad_norm_var": 0.013216145833333333, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.212220251560211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28191742300987244, + "step": 3440 + }, + { + "epoch": 0.06884, + "grad_norm": 2.34375, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.4162, + "loss/crossentropy": 1.9564262628555298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560906559228897, + "step": 3442 + }, + { + "epoch": 0.06888, + "grad_norm": 2.46875, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 1.8553346395492554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24626825004816055, + "step": 3444 + }, + { + "epoch": 0.06892, + "grad_norm": 2.3125, + "grad_norm_var": 0.012007649739583333, + "learning_rate": 0.0001, + "loss": 4.7914, + "loss/crossentropy": 1.9803723692893982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2712126225233078, + "step": 3446 + }, + { + "epoch": 0.06896, + "grad_norm": 2.46875, + "grad_norm_var": 0.010423787434895833, + "learning_rate": 0.0001, + "loss": 4.857, + "loss/crossentropy": 1.9914751648902893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565220594406128, + "step": 3448 + }, + { + "epoch": 0.069, + "grad_norm": 2.484375, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.8013, + "loss/crossentropy": 2.2114094495773315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28365010023117065, + "step": 3450 + }, + { + "epoch": 0.06904, + "grad_norm": 2.453125, + "grad_norm_var": 0.009577433268229166, + "learning_rate": 0.0001, + "loss": 5.0322, + "loss/crossentropy": 2.4366514682769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30788426101207733, + "step": 3452 + }, + { + "epoch": 0.06908, + "grad_norm": 2.671875, + "grad_norm_var": 0.010595703125, + "learning_rate": 0.0001, + "loss": 4.7902, + "loss/crossentropy": 2.304569959640503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2835633158683777, + "step": 3454 + }, + { + "epoch": 0.06912, + "grad_norm": 2.65625, + "grad_norm_var": 0.027318318684895832, + "learning_rate": 0.0001, + "loss": 5.151, + "loss/crossentropy": 2.2518080472946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.278719499707222, + "step": 3456 + }, + { + "epoch": 0.06916, + "grad_norm": 2.71875, + "grad_norm_var": 0.027904256184895834, + "learning_rate": 0.0001, + "loss": 5.1688, + "loss/crossentropy": 2.333768129348755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28694969415664673, + "step": 3458 + }, + { + "epoch": 0.0692, + "grad_norm": 2.46875, + "grad_norm_var": 0.0260650634765625, + "learning_rate": 0.0001, + "loss": 5.1421, + "loss/crossentropy": 2.3534432649612427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3005864769220352, + "step": 3460 + }, + { + "epoch": 0.06924, + "grad_norm": 2.4375, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 0.0001, + "loss": 4.9502, + "loss/crossentropy": 2.0703017711639404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25969168543815613, + "step": 3462 + }, + { + "epoch": 0.06928, + "grad_norm": 2.65625, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 5.0258, + "loss/crossentropy": 2.167420506477356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32322582602500916, + "step": 3464 + }, + { + "epoch": 0.06932, + "grad_norm": 2.515625, + "grad_norm_var": 0.021222941080729165, + "learning_rate": 0.0001, + "loss": 4.5342, + "loss/crossentropy": 2.0845181941986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26804475486278534, + "step": 3466 + }, + { + "epoch": 0.06936, + "grad_norm": 2.4375, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 5.0459, + "loss/crossentropy": 2.4165114164352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2866530567407608, + "step": 3468 + }, + { + "epoch": 0.0694, + "grad_norm": 2.765625, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 5.0938, + "loss/crossentropy": 2.4152863025665283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30185502767562866, + "step": 3470 + }, + { + "epoch": 0.06944, + "grad_norm": 2.28125, + "grad_norm_var": 0.014924112955729167, + "learning_rate": 0.0001, + "loss": 4.6362, + "loss/crossentropy": 2.1878501176834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736133933067322, + "step": 3472 + }, + { + "epoch": 0.06948, + "grad_norm": 2.4375, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 4.812, + "loss/crossentropy": 2.055173695087433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2609568238258362, + "step": 3474 + }, + { + "epoch": 0.06952, + "grad_norm": 2.453125, + "grad_norm_var": 0.015550740559895833, + "learning_rate": 0.0001, + "loss": 4.8201, + "loss/crossentropy": 2.050000250339508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713121324777603, + "step": 3476 + }, + { + "epoch": 0.06956, + "grad_norm": 2.65625, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.9916, + "loss/crossentropy": 2.227464199066162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710970640182495, + "step": 3478 + }, + { + "epoch": 0.0696, + "grad_norm": 2.46875, + "grad_norm_var": 0.016828409830729165, + "learning_rate": 0.0001, + "loss": 4.7435, + "loss/crossentropy": 2.096015691757202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29556676745414734, + "step": 3480 + }, + { + "epoch": 0.06964, + "grad_norm": 2.921875, + "grad_norm_var": 0.028804524739583334, + "learning_rate": 0.0001, + "loss": 4.6738, + "loss/crossentropy": 1.9252901673316956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976933419704437, + "step": 3482 + }, + { + "epoch": 0.06968, + "grad_norm": 2.25, + "grad_norm_var": 0.03385009765625, + "learning_rate": 0.0001, + "loss": 4.7679, + "loss/crossentropy": 2.258090019226074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27399829030036926, + "step": 3484 + }, + { + "epoch": 0.06972, + "grad_norm": 2.265625, + "grad_norm_var": 0.03192952473958333, + "learning_rate": 0.0001, + "loss": 4.7614, + "loss/crossentropy": 2.2776867151260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29224833846092224, + "step": 3486 + }, + { + "epoch": 0.06976, + "grad_norm": 2.703125, + "grad_norm_var": 0.031819661458333336, + "learning_rate": 0.0001, + "loss": 5.1628, + "loss/crossentropy": 2.097061276435852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26103661954402924, + "step": 3488 + }, + { + "epoch": 0.0698, + "grad_norm": 2.28125, + "grad_norm_var": 0.03264567057291667, + "learning_rate": 0.0001, + "loss": 4.7364, + "loss/crossentropy": 2.206419885158539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2775641232728958, + "step": 3490 + }, + { + "epoch": 0.06984, + "grad_norm": 2.59375, + "grad_norm_var": 0.035563151041666664, + "learning_rate": 0.0001, + "loss": 4.7216, + "loss/crossentropy": 1.962704062461853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2421455979347229, + "step": 3492 + }, + { + "epoch": 0.06988, + "grad_norm": 2.4375, + "grad_norm_var": 0.032698567708333334, + "learning_rate": 0.0001, + "loss": 4.7358, + "loss/crossentropy": 2.223302483558655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2709382176399231, + "step": 3494 + }, + { + "epoch": 0.06992, + "grad_norm": 2.390625, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.8563, + "loss/crossentropy": 2.259950876235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29514479637145996, + "step": 3496 + }, + { + "epoch": 0.06996, + "grad_norm": 2.359375, + "grad_norm_var": 0.021141560872395833, + "learning_rate": 0.0001, + "loss": 4.6147, + "loss/crossentropy": 2.2337416410446167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585422098636627, + "step": 3498 + }, + { + "epoch": 0.07, + "grad_norm": 2.75, + "grad_norm_var": 0.022663370768229166, + "learning_rate": 0.0001, + "loss": 4.9278, + "loss/crossentropy": 2.131904423236847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32493171095848083, + "step": 3500 + }, + { + "epoch": 0.07004, + "grad_norm": 3.109375, + "grad_norm_var": 0.04553629557291667, + "learning_rate": 0.0001, + "loss": 4.9285, + "loss/crossentropy": 2.461912155151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2917899489402771, + "step": 3502 + }, + { + "epoch": 0.07008, + "grad_norm": 2.40625, + "grad_norm_var": 0.04366861979166667, + "learning_rate": 0.0001, + "loss": 4.6466, + "loss/crossentropy": 2.05659943819046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24786780774593353, + "step": 3504 + }, + { + "epoch": 0.07012, + "grad_norm": 2.625, + "grad_norm_var": 0.0412261962890625, + "learning_rate": 0.0001, + "loss": 4.7282, + "loss/crossentropy": 2.3972705602645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29012058675289154, + "step": 3506 + }, + { + "epoch": 0.07016, + "grad_norm": 2.390625, + "grad_norm_var": 0.03945210774739583, + "learning_rate": 0.0001, + "loss": 4.6561, + "loss/crossentropy": 1.8465647101402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26938022673130035, + "step": 3508 + }, + { + "epoch": 0.0702, + "grad_norm": 2.65625, + "grad_norm_var": 0.0434722900390625, + "learning_rate": 0.0001, + "loss": 4.9007, + "loss/crossentropy": 2.2447493076324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27260421216487885, + "step": 3510 + }, + { + "epoch": 0.07024, + "grad_norm": 2.453125, + "grad_norm_var": 0.042464192708333334, + "learning_rate": 0.0001, + "loss": 4.7774, + "loss/crossentropy": 2.4258209466934204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.285652831196785, + "step": 3512 + }, + { + "epoch": 0.07028, + "grad_norm": 2.328125, + "grad_norm_var": 0.04248046875, + "learning_rate": 0.0001, + "loss": 4.7353, + "loss/crossentropy": 2.1033068895339966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628294378519058, + "step": 3514 + }, + { + "epoch": 0.07032, + "grad_norm": 2.5, + "grad_norm_var": 0.03819071451822917, + "learning_rate": 0.0001, + "loss": 4.8036, + "loss/crossentropy": 2.2740964889526367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27228833734989166, + "step": 3516 + }, + { + "epoch": 0.07036, + "grad_norm": 2.265625, + "grad_norm_var": 0.015104166666666667, + "learning_rate": 0.0001, + "loss": 4.7699, + "loss/crossentropy": 2.2315655946731567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26613348722457886, + "step": 3518 + }, + { + "epoch": 0.0704, + "grad_norm": 2.359375, + "grad_norm_var": 0.014720662434895834, + "learning_rate": 0.0001, + "loss": 4.6038, + "loss/crossentropy": 1.9001839756965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.248238705098629, + "step": 3520 + }, + { + "epoch": 0.07044, + "grad_norm": 2.59375, + "grad_norm_var": 0.013655598958333333, + "learning_rate": 0.0001, + "loss": 4.7034, + "loss/crossentropy": 2.0940937399864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28850243985652924, + "step": 3522 + }, + { + "epoch": 0.07048, + "grad_norm": 2.421875, + "grad_norm_var": 0.014338175455729166, + "learning_rate": 0.0001, + "loss": 4.9187, + "loss/crossentropy": 1.9088054299354553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2577759325504303, + "step": 3524 + }, + { + "epoch": 0.07052, + "grad_norm": 2.359375, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 4.7852, + "loss/crossentropy": 2.1965672969818115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696071192622185, + "step": 3526 + }, + { + "epoch": 0.07056, + "grad_norm": 2.59375, + "grad_norm_var": 0.010770670572916667, + "learning_rate": 0.0001, + "loss": 4.7657, + "loss/crossentropy": 2.245758891105652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2646239101886749, + "step": 3528 + }, + { + "epoch": 0.0706, + "grad_norm": 2.546875, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.7794, + "loss/crossentropy": 2.180204927921295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2582162171602249, + "step": 3530 + }, + { + "epoch": 0.07064, + "grad_norm": 2.71875, + "grad_norm_var": 0.019562784830729166, + "learning_rate": 0.0001, + "loss": 5.04, + "loss/crossentropy": 2.193474531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.40557755529880524, + "step": 3532 + }, + { + "epoch": 0.07068, + "grad_norm": 2.78125, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 4.9562, + "loss/crossentropy": 2.2667607069015503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28741554915905, + "step": 3534 + }, + { + "epoch": 0.07072, + "grad_norm": 2.3125, + "grad_norm_var": 0.022093709309895834, + "learning_rate": 0.0001, + "loss": 4.8215, + "loss/crossentropy": 1.9890388250350952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612537443637848, + "step": 3536 + }, + { + "epoch": 0.07076, + "grad_norm": 2.390625, + "grad_norm_var": 0.02392578125, + "learning_rate": 0.0001, + "loss": 4.7544, + "loss/crossentropy": 1.9390615820884705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23259633034467697, + "step": 3538 + }, + { + "epoch": 0.0708, + "grad_norm": 2.515625, + "grad_norm_var": 0.023193359375, + "learning_rate": 0.0001, + "loss": 4.8227, + "loss/crossentropy": 2.3050389289855957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2738967537879944, + "step": 3540 + }, + { + "epoch": 0.07084, + "grad_norm": 2.265625, + "grad_norm_var": 0.025862630208333334, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 2.3832077980041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29863911867141724, + "step": 3542 + }, + { + "epoch": 0.07088, + "grad_norm": 2.4375, + "grad_norm_var": 0.02822265625, + "learning_rate": 0.0001, + "loss": 4.7101, + "loss/crossentropy": 1.8186699748039246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366883605718613, + "step": 3544 + }, + { + "epoch": 0.07092, + "grad_norm": 2.46875, + "grad_norm_var": 0.024235026041666666, + "learning_rate": 0.0001, + "loss": 4.8151, + "loss/crossentropy": 2.2650288343429565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2733971029520035, + "step": 3546 + }, + { + "epoch": 0.07096, + "grad_norm": 2.53125, + "grad_norm_var": 0.0191070556640625, + "learning_rate": 0.0001, + "loss": 4.7514, + "loss/crossentropy": 2.432945966720581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269818976521492, + "step": 3548 + }, + { + "epoch": 0.071, + "grad_norm": 4.15625, + "grad_norm_var": 0.19250386555989582, + "learning_rate": 0.0001, + "loss": 4.9461, + "loss/crossentropy": 2.021497666835785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488800287246704, + "step": 3550 + }, + { + "epoch": 0.07104, + "grad_norm": 2.3125, + "grad_norm_var": 0.2001129150390625, + "learning_rate": 0.0001, + "loss": 5.0341, + "loss/crossentropy": 2.0840535163879395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2780974507331848, + "step": 3552 + }, + { + "epoch": 0.07108, + "grad_norm": 2.515625, + "grad_norm_var": 0.19724833170572917, + "learning_rate": 0.0001, + "loss": 4.8123, + "loss/crossentropy": 2.352238416671753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2830911874771118, + "step": 3554 + }, + { + "epoch": 0.07112, + "grad_norm": 2.484375, + "grad_norm_var": 0.19795633951822916, + "learning_rate": 0.0001, + "loss": 4.9632, + "loss/crossentropy": 2.395397186279297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2861028015613556, + "step": 3556 + }, + { + "epoch": 0.07116, + "grad_norm": 2.84375, + "grad_norm_var": 0.19630533854166668, + "learning_rate": 0.0001, + "loss": 5.0203, + "loss/crossentropy": 2.6454248428344727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28888577222824097, + "step": 3558 + }, + { + "epoch": 0.0712, + "grad_norm": 2.609375, + "grad_norm_var": 0.19394429524739584, + "learning_rate": 0.0001, + "loss": 4.8228, + "loss/crossentropy": 2.21127188205719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2676347494125366, + "step": 3560 + }, + { + "epoch": 0.07124, + "grad_norm": 2.5625, + "grad_norm_var": 0.19245503743489584, + "learning_rate": 0.0001, + "loss": 4.7502, + "loss/crossentropy": 2.0187097787857056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24967321753501892, + "step": 3562 + }, + { + "epoch": 0.07128, + "grad_norm": 2.4375, + "grad_norm_var": 0.19409077962239582, + "learning_rate": 0.0001, + "loss": 4.9548, + "loss/crossentropy": 2.1822619438171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24893341958522797, + "step": 3564 + }, + { + "epoch": 0.07132, + "grad_norm": 2.65625, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 5.0912, + "loss/crossentropy": 2.486607313156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29665203392505646, + "step": 3566 + }, + { + "epoch": 0.07136, + "grad_norm": 2.546875, + "grad_norm_var": 0.020048014322916665, + "learning_rate": 0.0001, + "loss": 5.0494, + "loss/crossentropy": 2.2631163597106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.268096499145031, + "step": 3568 + }, + { + "epoch": 0.0714, + "grad_norm": 2.671875, + "grad_norm_var": 0.02760009765625, + "learning_rate": 0.0001, + "loss": 4.7668, + "loss/crossentropy": 2.3393882513046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29145532846450806, + "step": 3570 + }, + { + "epoch": 0.07144, + "grad_norm": 2.65625, + "grad_norm_var": 0.03980712890625, + "learning_rate": 0.0001, + "loss": 4.8281, + "loss/crossentropy": 2.007299244403839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30512700974941254, + "step": 3572 + }, + { + "epoch": 0.07148, + "grad_norm": 2.609375, + "grad_norm_var": 0.0369049072265625, + "learning_rate": 0.0001, + "loss": 4.7891, + "loss/crossentropy": 1.9879329800605774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22510841488838196, + "step": 3574 + }, + { + "epoch": 0.07152, + "grad_norm": 2.46875, + "grad_norm_var": 0.03443603515625, + "learning_rate": 0.0001, + "loss": 4.9688, + "loss/crossentropy": 2.387833833694458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007010221481323, + "step": 3576 + }, + { + "epoch": 0.07156, + "grad_norm": 2.625, + "grad_norm_var": 0.03664957682291667, + "learning_rate": 0.0001, + "loss": 4.7975, + "loss/crossentropy": 2.3306411504745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269440695643425, + "step": 3578 + }, + { + "epoch": 0.0716, + "grad_norm": 2.5, + "grad_norm_var": 0.03462626139322917, + "learning_rate": 0.0001, + "loss": 4.5348, + "loss/crossentropy": 1.8359156847000122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23353691399097443, + "step": 3580 + }, + { + "epoch": 0.07164, + "grad_norm": 2.5625, + "grad_norm_var": 0.033080037434895834, + "learning_rate": 0.0001, + "loss": 4.9226, + "loss/crossentropy": 2.257680654525757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2764490246772766, + "step": 3582 + }, + { + "epoch": 0.07168, + "grad_norm": 2.4375, + "grad_norm_var": 0.03310445149739583, + "learning_rate": 0.0001, + "loss": 4.6434, + "loss/crossentropy": 2.589483857154846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28172188997268677, + "step": 3584 + }, + { + "epoch": 0.07172, + "grad_norm": 2.390625, + "grad_norm_var": 0.04182840983072917, + "learning_rate": 0.0001, + "loss": 4.5434, + "loss/crossentropy": 1.8202016949653625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22330023348331451, + "step": 3586 + }, + { + "epoch": 0.07176, + "grad_norm": 2.421875, + "grad_norm_var": 0.028609212239583334, + "learning_rate": 0.0001, + "loss": 4.8344, + "loss/crossentropy": 2.0311816334724426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622206509113312, + "step": 3588 + }, + { + "epoch": 0.0718, + "grad_norm": 2.296875, + "grad_norm_var": 0.028400675455729166, + "learning_rate": 0.0001, + "loss": 4.5158, + "loss/crossentropy": 1.7991753220558167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23093865811824799, + "step": 3590 + }, + { + "epoch": 0.07184, + "grad_norm": 2.265625, + "grad_norm_var": 0.030924479166666668, + "learning_rate": 0.0001, + "loss": 4.3148, + "loss/crossentropy": 1.6141473054885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21914401650428772, + "step": 3592 + }, + { + "epoch": 0.07188, + "grad_norm": 2.34375, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 4.8007, + "loss/crossentropy": 2.3337208032608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612123265862465, + "step": 3594 + }, + { + "epoch": 0.07192, + "grad_norm": 2.265625, + "grad_norm_var": 0.0322662353515625, + "learning_rate": 0.0001, + "loss": 4.5811, + "loss/crossentropy": 2.191028594970703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598741352558136, + "step": 3596 + }, + { + "epoch": 0.07196, + "grad_norm": 2.546875, + "grad_norm_var": 0.03178609212239583, + "learning_rate": 0.0001, + "loss": 4.9562, + "loss/crossentropy": 2.0293691158294678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27143266797065735, + "step": 3598 + }, + { + "epoch": 0.072, + "grad_norm": 2.640625, + "grad_norm_var": 0.03437398274739583, + "learning_rate": 0.0001, + "loss": 5.1335, + "loss/crossentropy": 2.1257725954055786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33300966024398804, + "step": 3600 + }, + { + "epoch": 0.07204, + "grad_norm": 2.375, + "grad_norm_var": 0.0171875, + "learning_rate": 0.0001, + "loss": 4.8223, + "loss/crossentropy": 2.296278953552246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28724825382232666, + "step": 3602 + }, + { + "epoch": 0.07208, + "grad_norm": 2.453125, + "grad_norm_var": 0.016731770833333333, + "learning_rate": 0.0001, + "loss": 4.8925, + "loss/crossentropy": 2.258358597755432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26510895788669586, + "step": 3604 + }, + { + "epoch": 0.07212, + "grad_norm": 2.421875, + "grad_norm_var": 0.0170318603515625, + "learning_rate": 0.0001, + "loss": 5.0383, + "loss/crossentropy": 2.0454649925231934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725224494934082, + "step": 3606 + }, + { + "epoch": 0.07216, + "grad_norm": 2.390625, + "grad_norm_var": 0.015250651041666667, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.1844204664230347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729914039373398, + "step": 3608 + }, + { + "epoch": 0.0722, + "grad_norm": 2.484375, + "grad_norm_var": 0.015608723958333333, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.8897106647491455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26307350397109985, + "step": 3610 + }, + { + "epoch": 0.07224, + "grad_norm": 2.5, + "grad_norm_var": 0.013377888997395834, + "learning_rate": 0.0001, + "loss": 4.5695, + "loss/crossentropy": 1.9441962838172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454529583454132, + "step": 3612 + }, + { + "epoch": 0.07228, + "grad_norm": 2.6875, + "grad_norm_var": 0.063623046875, + "learning_rate": 0.0001, + "loss": 4.7654, + "loss/crossentropy": 2.10969078540802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2900787442922592, + "step": 3614 + }, + { + "epoch": 0.07232, + "grad_norm": 2.453125, + "grad_norm_var": 0.06301676432291667, + "learning_rate": 0.0001, + "loss": 4.5195, + "loss/crossentropy": 2.1384644508361816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2722831964492798, + "step": 3616 + }, + { + "epoch": 0.07236, + "grad_norm": 2.296875, + "grad_norm_var": 0.06412760416666667, + "learning_rate": 0.0001, + "loss": 4.4995, + "loss/crossentropy": 2.0648157596588135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24860350787639618, + "step": 3618 + }, + { + "epoch": 0.0724, + "grad_norm": 2.421875, + "grad_norm_var": 0.06441650390625, + "learning_rate": 0.0001, + "loss": 4.7863, + "loss/crossentropy": 2.2188034057617188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2476331740617752, + "step": 3620 + }, + { + "epoch": 0.07244, + "grad_norm": 2.4375, + "grad_norm_var": 0.06516927083333333, + "learning_rate": 0.0001, + "loss": 4.792, + "loss/crossentropy": 2.1361395120620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24962469190359116, + "step": 3622 + }, + { + "epoch": 0.07248, + "grad_norm": 2.46875, + "grad_norm_var": 0.06457417805989583, + "learning_rate": 0.0001, + "loss": 4.739, + "loss/crossentropy": 2.2646392583847046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.260420486330986, + "step": 3624 + }, + { + "epoch": 0.07252, + "grad_norm": 2.421875, + "grad_norm_var": 0.0630523681640625, + "learning_rate": 0.0001, + "loss": 4.6937, + "loss/crossentropy": 2.2822424173355103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585812509059906, + "step": 3626 + }, + { + "epoch": 0.07256, + "grad_norm": 2.203125, + "grad_norm_var": 0.06852925618489583, + "learning_rate": 0.0001, + "loss": 4.2868, + "loss/crossentropy": 1.8197516798973083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24321961402893066, + "step": 3628 + }, + { + "epoch": 0.0726, + "grad_norm": 2.4375, + "grad_norm_var": 0.007649739583333333, + "learning_rate": 0.0001, + "loss": 4.4835, + "loss/crossentropy": 2.1475032567977905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687895894050598, + "step": 3630 + }, + { + "epoch": 0.07264, + "grad_norm": 2.25, + "grad_norm_var": 0.006538899739583334, + "learning_rate": 0.0001, + "loss": 4.2353, + "loss/crossentropy": 1.9497992992401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24942665547132492, + "step": 3632 + }, + { + "epoch": 0.07268, + "grad_norm": 2.5, + "grad_norm_var": 0.007059733072916667, + "learning_rate": 0.0001, + "loss": 4.7455, + "loss/crossentropy": 1.8786492347717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23713821917772293, + "step": 3634 + }, + { + "epoch": 0.07272, + "grad_norm": 2.4375, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 5.089, + "loss/crossentropy": 2.474532127380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3133770227432251, + "step": 3636 + }, + { + "epoch": 0.07276, + "grad_norm": 2.484375, + "grad_norm_var": 0.0073394775390625, + "learning_rate": 0.0001, + "loss": 4.912, + "loss/crossentropy": 2.1231455206871033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2547219544649124, + "step": 3638 + }, + { + "epoch": 0.0728, + "grad_norm": 2.546875, + "grad_norm_var": 0.009007771809895834, + "learning_rate": 0.0001, + "loss": 4.4727, + "loss/crossentropy": 2.0511630177497864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26947300136089325, + "step": 3640 + }, + { + "epoch": 0.07284, + "grad_norm": 2.5625, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.7332, + "loss/crossentropy": 1.7076187133789062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21764510869979858, + "step": 3642 + }, + { + "epoch": 0.07288, + "grad_norm": 2.359375, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.8396, + "loss/crossentropy": 2.069926142692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26851218193769455, + "step": 3644 + }, + { + "epoch": 0.07292, + "grad_norm": 2.609375, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 5.0802, + "loss/crossentropy": 2.1369277238845825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27878354489803314, + "step": 3646 + }, + { + "epoch": 0.07296, + "grad_norm": 2.375, + "grad_norm_var": 0.008072916666666667, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 2.095974624156952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25285808742046356, + "step": 3648 + }, + { + "epoch": 0.073, + "grad_norm": 2.5625, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.9301, + "loss/crossentropy": 2.240627646446228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24894578754901886, + "step": 3650 + }, + { + "epoch": 0.07304, + "grad_norm": 2.359375, + "grad_norm_var": 0.013899739583333333, + "learning_rate": 0.0001, + "loss": 4.6588, + "loss/crossentropy": 2.2270851135253906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26908691227436066, + "step": 3652 + }, + { + "epoch": 0.07308, + "grad_norm": 2.671875, + "grad_norm_var": 0.017039998372395834, + "learning_rate": 0.0001, + "loss": 4.7385, + "loss/crossentropy": 2.2684017419815063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28442947566509247, + "step": 3654 + }, + { + "epoch": 0.07312, + "grad_norm": 2.46875, + "grad_norm_var": 0.07785542805989583, + "learning_rate": 0.0001, + "loss": 4.7585, + "loss/crossentropy": 2.0922030806541443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621888816356659, + "step": 3656 + }, + { + "epoch": 0.07316, + "grad_norm": 2.3125, + "grad_norm_var": 0.08323567708333333, + "learning_rate": 0.0001, + "loss": 4.7425, + "loss/crossentropy": 2.0134947896003723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2717447876930237, + "step": 3658 + }, + { + "epoch": 0.0732, + "grad_norm": 2.421875, + "grad_norm_var": 0.08206278483072917, + "learning_rate": 0.0001, + "loss": 4.5573, + "loss/crossentropy": 1.9246947765350342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25095127522945404, + "step": 3660 + }, + { + "epoch": 0.07324, + "grad_norm": 2.5625, + "grad_norm_var": 0.08561197916666667, + "learning_rate": 0.0001, + "loss": 4.8616, + "loss/crossentropy": 2.0655113458633423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24166538566350937, + "step": 3662 + }, + { + "epoch": 0.07328, + "grad_norm": 2.46875, + "grad_norm_var": 0.08198954264322916, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.2706735730171204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26484307646751404, + "step": 3664 + }, + { + "epoch": 0.07332, + "grad_norm": 2.359375, + "grad_norm_var": 0.0883453369140625, + "learning_rate": 0.0001, + "loss": 4.2911, + "loss/crossentropy": 1.7969809770584106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407897353172302, + "step": 3666 + }, + { + "epoch": 0.07336, + "grad_norm": 2.421875, + "grad_norm_var": 0.09109700520833333, + "learning_rate": 0.0001, + "loss": 4.7081, + "loss/crossentropy": 2.0398870706558228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502745985984802, + "step": 3668 + }, + { + "epoch": 0.0734, + "grad_norm": 2.40625, + "grad_norm_var": 0.09381103515625, + "learning_rate": 0.0001, + "loss": 4.8738, + "loss/crossentropy": 2.090283453464508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2936585247516632, + "step": 3670 + }, + { + "epoch": 0.07344, + "grad_norm": 2.5625, + "grad_norm_var": 0.035008748372395836, + "learning_rate": 0.0001, + "loss": 4.8305, + "loss/crossentropy": 2.286925792694092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27951307594776154, + "step": 3672 + }, + { + "epoch": 0.07348, + "grad_norm": 2.40625, + "grad_norm_var": 0.032059733072916666, + "learning_rate": 0.0001, + "loss": 4.4885, + "loss/crossentropy": 2.0264610052108765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24908316135406494, + "step": 3674 + }, + { + "epoch": 0.07352, + "grad_norm": 2.6875, + "grad_norm_var": 0.0372467041015625, + "learning_rate": 0.0001, + "loss": 4.9239, + "loss/crossentropy": 2.1947755217552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2865990996360779, + "step": 3676 + }, + { + "epoch": 0.07356, + "grad_norm": 2.703125, + "grad_norm_var": 0.0422515869140625, + "learning_rate": 0.0001, + "loss": 4.8784, + "loss/crossentropy": 2.0050416588783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23362033069133759, + "step": 3678 + }, + { + "epoch": 0.0736, + "grad_norm": 2.4375, + "grad_norm_var": 0.041112263997395836, + "learning_rate": 0.0001, + "loss": 4.7423, + "loss/crossentropy": 1.8935424089431763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532464489340782, + "step": 3680 + }, + { + "epoch": 0.07364, + "grad_norm": 2.40625, + "grad_norm_var": 0.0350250244140625, + "learning_rate": 0.0001, + "loss": 4.7389, + "loss/crossentropy": 2.0181053280830383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2544455900788307, + "step": 3682 + }, + { + "epoch": 0.07368, + "grad_norm": 2.359375, + "grad_norm_var": 0.03144124348958333, + "learning_rate": 0.0001, + "loss": 4.7099, + "loss/crossentropy": 2.1172796487808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27609144151210785, + "step": 3684 + }, + { + "epoch": 0.07372, + "grad_norm": 2.375, + "grad_norm_var": 0.023737589518229168, + "learning_rate": 0.0001, + "loss": 4.7185, + "loss/crossentropy": 2.3926355838775635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28892992436885834, + "step": 3686 + }, + { + "epoch": 0.07376, + "grad_norm": 2.21875, + "grad_norm_var": 0.026883951822916665, + "learning_rate": 0.0001, + "loss": 4.9086, + "loss/crossentropy": 2.2512835264205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26166096329689026, + "step": 3688 + }, + { + "epoch": 0.0738, + "grad_norm": 2.578125, + "grad_norm_var": 0.0264312744140625, + "learning_rate": 0.0001, + "loss": 4.6311, + "loss/crossentropy": 2.0656538009643555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25399819016456604, + "step": 3690 + }, + { + "epoch": 0.07384, + "grad_norm": 2.453125, + "grad_norm_var": 0.0222808837890625, + "learning_rate": 0.0001, + "loss": 4.9565, + "loss/crossentropy": 2.454928994178772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2964669317007065, + "step": 3692 + }, + { + "epoch": 0.07388, + "grad_norm": 2.4375, + "grad_norm_var": 0.009859212239583333, + "learning_rate": 0.0001, + "loss": 4.5703, + "loss/crossentropy": 1.988040804862976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27345800399780273, + "step": 3694 + }, + { + "epoch": 0.07392, + "grad_norm": 2.671875, + "grad_norm_var": 0.013752237955729166, + "learning_rate": 0.0001, + "loss": 4.9418, + "loss/crossentropy": 1.910742998123169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24037255346775055, + "step": 3696 + }, + { + "epoch": 0.07396, + "grad_norm": 2.421875, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 2.19545578956604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2778017520904541, + "step": 3698 + }, + { + "epoch": 0.074, + "grad_norm": 2.46875, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.3839, + "loss/crossentropy": 2.436691403388977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26254843175411224, + "step": 3700 + }, + { + "epoch": 0.07404, + "grad_norm": 2.5, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 4.833, + "loss/crossentropy": 2.7458308935165405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27952516078948975, + "step": 3702 + }, + { + "epoch": 0.07408, + "grad_norm": 2.78125, + "grad_norm_var": 0.014876302083333333, + "learning_rate": 0.0001, + "loss": 4.8308, + "loss/crossentropy": 2.2321633100509644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2633504122495651, + "step": 3704 + }, + { + "epoch": 0.07412, + "grad_norm": 2.359375, + "grad_norm_var": 0.013801066080729167, + "learning_rate": 0.0001, + "loss": 4.8159, + "loss/crossentropy": 1.9883576035499573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25319087505340576, + "step": 3706 + }, + { + "epoch": 0.07416, + "grad_norm": 2.4375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 4.5546, + "loss/crossentropy": 1.7647870182991028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21732009947299957, + "step": 3708 + }, + { + "epoch": 0.0742, + "grad_norm": 2.734375, + "grad_norm_var": 0.046793619791666664, + "learning_rate": 0.0001, + "loss": 4.9271, + "loss/crossentropy": 2.1113381385803223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25888970494270325, + "step": 3710 + }, + { + "epoch": 0.07424, + "grad_norm": 2.34375, + "grad_norm_var": 0.04690348307291667, + "learning_rate": 0.0001, + "loss": 4.5878, + "loss/crossentropy": 1.975549578666687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23899925500154495, + "step": 3712 + }, + { + "epoch": 0.07428, + "grad_norm": 2.40625, + "grad_norm_var": 0.04664306640625, + "learning_rate": 0.0001, + "loss": 4.9262, + "loss/crossentropy": 2.0562495589256287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31428879499435425, + "step": 3714 + }, + { + "epoch": 0.07432, + "grad_norm": 2.265625, + "grad_norm_var": 0.04951883951822917, + "learning_rate": 0.0001, + "loss": 4.3719, + "loss/crossentropy": 2.114805221557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25172004848718643, + "step": 3716 + }, + { + "epoch": 0.07436, + "grad_norm": 2.65625, + "grad_norm_var": 0.052783203125, + "learning_rate": 0.0001, + "loss": 4.6032, + "loss/crossentropy": 2.1865739822387695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.252632200717926, + "step": 3718 + }, + { + "epoch": 0.0744, + "grad_norm": 2.53125, + "grad_norm_var": 0.04739176432291667, + "learning_rate": 0.0001, + "loss": 4.8493, + "loss/crossentropy": 2.2550876140594482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27425335347652435, + "step": 3720 + }, + { + "epoch": 0.07444, + "grad_norm": 2.640625, + "grad_norm_var": 0.047379557291666666, + "learning_rate": 0.0001, + "loss": 4.9072, + "loss/crossentropy": 2.293414354324341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26979324221611023, + "step": 3722 + }, + { + "epoch": 0.07448, + "grad_norm": 2.59375, + "grad_norm_var": 0.043745930989583334, + "learning_rate": 0.0001, + "loss": 4.4962, + "loss/crossentropy": 2.014510452747345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25709769129753113, + "step": 3724 + }, + { + "epoch": 0.07452, + "grad_norm": 2.40625, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.6485, + "loss/crossentropy": 2.0332603454589844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585446834564209, + "step": 3726 + }, + { + "epoch": 0.07456, + "grad_norm": 2.296875, + "grad_norm_var": 0.0165191650390625, + "learning_rate": 0.0001, + "loss": 4.7268, + "loss/crossentropy": 1.9425334930419922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22921039909124374, + "step": 3728 + }, + { + "epoch": 0.0746, + "grad_norm": 2.71875, + "grad_norm_var": 0.019090779622395835, + "learning_rate": 0.0001, + "loss": 4.9306, + "loss/crossentropy": 2.1233898997306824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26014500856399536, + "step": 3730 + }, + { + "epoch": 0.07464, + "grad_norm": 2.609375, + "grad_norm_var": 0.023566691080729167, + "learning_rate": 0.0001, + "loss": 4.9958, + "loss/crossentropy": 2.3929240703582764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791339308023453, + "step": 3732 + }, + { + "epoch": 0.07468, + "grad_norm": 2.546875, + "grad_norm_var": 0.021370442708333333, + "learning_rate": 0.0001, + "loss": 4.6072, + "loss/crossentropy": 2.163137674331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26793332397937775, + "step": 3734 + }, + { + "epoch": 0.07472, + "grad_norm": 2.453125, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.723, + "loss/crossentropy": 2.1300129294395447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533607929944992, + "step": 3736 + }, + { + "epoch": 0.07476, + "grad_norm": 2.5, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 1.9808942675590515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25632843375205994, + "step": 3738 + }, + { + "epoch": 0.0748, + "grad_norm": 2.453125, + "grad_norm_var": 0.03328348795572917, + "learning_rate": 0.0001, + "loss": 4.8219, + "loss/crossentropy": 2.161437451839447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2529585212469101, + "step": 3740 + }, + { + "epoch": 0.07484, + "grad_norm": 2.53125, + "grad_norm_var": 0.03369140625, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 1.852737545967102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333993762731552, + "step": 3742 + }, + { + "epoch": 0.07488, + "grad_norm": 2.265625, + "grad_norm_var": 0.03439127604166667, + "learning_rate": 0.0001, + "loss": 4.4355, + "loss/crossentropy": 1.664733350276947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20309723168611526, + "step": 3744 + }, + { + "epoch": 0.07492, + "grad_norm": 2.203125, + "grad_norm_var": 0.03276265462239583, + "learning_rate": 0.0001, + "loss": 4.4554, + "loss/crossentropy": 2.1815799474716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2470541000366211, + "step": 3746 + }, + { + "epoch": 0.07496, + "grad_norm": 2.421875, + "grad_norm_var": 0.024657185872395834, + "learning_rate": 0.0001, + "loss": 4.7423, + "loss/crossentropy": 1.9546562433242798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2685271203517914, + "step": 3748 + }, + { + "epoch": 0.075, + "grad_norm": 2.25, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.5171, + "loss/crossentropy": 1.920817255973816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558089941740036, + "step": 3750 + }, + { + "epoch": 0.07504, + "grad_norm": 2.265625, + "grad_norm_var": 0.024982706705729166, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 1.7570490837097168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265816479921341, + "step": 3752 + }, + { + "epoch": 0.07508, + "grad_norm": 2.890625, + "grad_norm_var": 0.203564453125, + "learning_rate": 0.0001, + "loss": 4.756, + "loss/crossentropy": 2.1815105676651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536233216524124, + "step": 3754 + }, + { + "epoch": 0.07512, + "grad_norm": 2.4375, + "grad_norm_var": 0.19650777180989584, + "learning_rate": 0.0001, + "loss": 4.7178, + "loss/crossentropy": 2.1385504603385925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445325404405594, + "step": 3756 + }, + { + "epoch": 0.07516, + "grad_norm": 2.34375, + "grad_norm_var": 0.19650777180989584, + "learning_rate": 0.0001, + "loss": 4.6449, + "loss/crossentropy": 1.7325092554092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419673353433609, + "step": 3758 + }, + { + "epoch": 0.0752, + "grad_norm": 2.21875, + "grad_norm_var": 0.19572652180989583, + "learning_rate": 0.0001, + "loss": 4.6096, + "loss/crossentropy": 2.358627676963806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713842839002609, + "step": 3760 + }, + { + "epoch": 0.07524, + "grad_norm": 3.078125, + "grad_norm_var": 0.20392252604166666, + "learning_rate": 0.0001, + "loss": 5.1712, + "loss/crossentropy": 2.048672080039978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3007172644138336, + "step": 3762 + }, + { + "epoch": 0.07528, + "grad_norm": 2.609375, + "grad_norm_var": 0.19885660807291666, + "learning_rate": 0.0001, + "loss": 4.8473, + "loss/crossentropy": 2.2967183589935303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28548331558704376, + "step": 3764 + }, + { + "epoch": 0.07532, + "grad_norm": 2.3125, + "grad_norm_var": 0.19228108723958334, + "learning_rate": 0.0001, + "loss": 4.7541, + "loss/crossentropy": 2.1280438899993896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825516611337662, + "step": 3766 + }, + { + "epoch": 0.07536, + "grad_norm": 2.375, + "grad_norm_var": 0.19146728515625, + "learning_rate": 0.0001, + "loss": 4.8404, + "loss/crossentropy": 2.5528002977371216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28681764006614685, + "step": 3768 + }, + { + "epoch": 0.0754, + "grad_norm": 2.546875, + "grad_norm_var": 0.0549468994140625, + "learning_rate": 0.0001, + "loss": 4.729, + "loss/crossentropy": 2.235885262489319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259520560503006, + "step": 3770 + }, + { + "epoch": 0.07544, + "grad_norm": 2.421875, + "grad_norm_var": 0.05718994140625, + "learning_rate": 0.0001, + "loss": 4.4705, + "loss/crossentropy": 1.8836966753005981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360890954732895, + "step": 3772 + }, + { + "epoch": 0.07548, + "grad_norm": 2.6875, + "grad_norm_var": 0.056538899739583336, + "learning_rate": 0.0001, + "loss": 4.9291, + "loss/crossentropy": 2.3396376371383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30392636358737946, + "step": 3774 + }, + { + "epoch": 0.07552, + "grad_norm": 2.359375, + "grad_norm_var": 0.051301066080729166, + "learning_rate": 0.0001, + "loss": 4.7518, + "loss/crossentropy": 2.4024877548217773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2858506590127945, + "step": 3776 + }, + { + "epoch": 0.07556, + "grad_norm": 2.25, + "grad_norm_var": 0.0353515625, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.138229727745056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25458595901727676, + "step": 3778 + }, + { + "epoch": 0.0756, + "grad_norm": 2.328125, + "grad_norm_var": 0.0359375, + "learning_rate": 0.0001, + "loss": 4.3882, + "loss/crossentropy": 1.8413254618644714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23022352159023285, + "step": 3780 + }, + { + "epoch": 0.07564, + "grad_norm": 2.390625, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.613, + "loss/crossentropy": 1.8554572463035583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25644390285015106, + "step": 3782 + }, + { + "epoch": 0.07568, + "grad_norm": 2.34375, + "grad_norm_var": 0.018452962239583332, + "learning_rate": 0.0001, + "loss": 4.7013, + "loss/crossentropy": 2.0096731781959534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25370142608880997, + "step": 3784 + }, + { + "epoch": 0.07572, + "grad_norm": 2.65625, + "grad_norm_var": 0.020018513997395834, + "learning_rate": 0.0001, + "loss": 4.9317, + "loss/crossentropy": 1.7932413220405579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426912784576416, + "step": 3786 + }, + { + "epoch": 0.07576, + "grad_norm": 2.125, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 4.1599, + "loss/crossentropy": 2.0372042655944824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24203844368457794, + "step": 3788 + }, + { + "epoch": 0.0758, + "grad_norm": 2.296875, + "grad_norm_var": 0.021773274739583334, + "learning_rate": 0.0001, + "loss": 4.3627, + "loss/crossentropy": 1.8986076712608337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24614746868610382, + "step": 3790 + }, + { + "epoch": 0.07584, + "grad_norm": 2.234375, + "grad_norm_var": 0.022135416666666668, + "learning_rate": 0.0001, + "loss": 4.563, + "loss/crossentropy": 1.8080393075942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23206676542758942, + "step": 3792 + }, + { + "epoch": 0.07588, + "grad_norm": 2.25, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 4.6857, + "loss/crossentropy": 1.7578041553497314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21686340868473053, + "step": 3794 + }, + { + "epoch": 0.07592, + "grad_norm": 2.40625, + "grad_norm_var": 0.015087890625, + "learning_rate": 0.0001, + "loss": 4.4938, + "loss/crossentropy": 2.0115376710891724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651440501213074, + "step": 3796 + }, + { + "epoch": 0.07596, + "grad_norm": 2.40625, + "grad_norm_var": 0.015803019205729168, + "learning_rate": 0.0001, + "loss": 4.598, + "loss/crossentropy": 2.028555393218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26161982119083405, + "step": 3798 + }, + { + "epoch": 0.076, + "grad_norm": 2.515625, + "grad_norm_var": 0.016429646809895834, + "learning_rate": 0.0001, + "loss": 4.8315, + "loss/crossentropy": 2.158663272857666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28110067546367645, + "step": 3800 + }, + { + "epoch": 0.07604, + "grad_norm": 2.734375, + "grad_norm_var": 0.019652303059895834, + "learning_rate": 0.0001, + "loss": 5.2376, + "loss/crossentropy": 2.2959556579589844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28477419912815094, + "step": 3802 + }, + { + "epoch": 0.07608, + "grad_norm": 2.296875, + "grad_norm_var": 0.014997355143229167, + "learning_rate": 0.0001, + "loss": 4.5289, + "loss/crossentropy": 2.149766206741333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25216003507375717, + "step": 3804 + }, + { + "epoch": 0.07612, + "grad_norm": 2.390625, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.7496, + "loss/crossentropy": 1.9866302609443665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26193149387836456, + "step": 3806 + }, + { + "epoch": 0.07616, + "grad_norm": 2.46875, + "grad_norm_var": 0.0122955322265625, + "learning_rate": 0.0001, + "loss": 4.9968, + "loss/crossentropy": 2.4230403900146484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27897247672080994, + "step": 3808 + }, + { + "epoch": 0.0762, + "grad_norm": 2.65625, + "grad_norm_var": 0.012613932291666666, + "learning_rate": 0.0001, + "loss": 4.9133, + "loss/crossentropy": 2.2995522022247314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27623192965984344, + "step": 3810 + }, + { + "epoch": 0.07624, + "grad_norm": 2.484375, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.6632, + "loss/crossentropy": 2.167468547821045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26121728122234344, + "step": 3812 + }, + { + "epoch": 0.07628, + "grad_norm": 2.5, + "grad_norm_var": 0.013353474934895833, + "learning_rate": 0.0001, + "loss": 4.8435, + "loss/crossentropy": 2.3259944915771484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29026439785957336, + "step": 3814 + }, + { + "epoch": 0.07632, + "grad_norm": 2.46875, + "grad_norm_var": 0.041825358072916666, + "learning_rate": 0.0001, + "loss": 4.8704, + "loss/crossentropy": 2.18080472946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550061345100403, + "step": 3816 + }, + { + "epoch": 0.07636, + "grad_norm": 2.390625, + "grad_norm_var": 0.03871968587239583, + "learning_rate": 0.0001, + "loss": 4.5681, + "loss/crossentropy": 2.1185330748558044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25014883279800415, + "step": 3818 + }, + { + "epoch": 0.0764, + "grad_norm": 2.28125, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 4.5776, + "loss/crossentropy": 1.9028193354606628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22502654790878296, + "step": 3820 + }, + { + "epoch": 0.07644, + "grad_norm": 2.640625, + "grad_norm_var": 0.039713541666666664, + "learning_rate": 0.0001, + "loss": 5.0188, + "loss/crossentropy": 2.266402840614319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24728389829397202, + "step": 3822 + }, + { + "epoch": 0.07648, + "grad_norm": 2.546875, + "grad_norm_var": 0.04279683430989583, + "learning_rate": 0.0001, + "loss": 4.7036, + "loss/crossentropy": 2.0918440222740173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2669295519590378, + "step": 3824 + }, + { + "epoch": 0.07652, + "grad_norm": 2.296875, + "grad_norm_var": 0.04456380208333333, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.6120481491088867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21910656243562698, + "step": 3826 + }, + { + "epoch": 0.07656, + "grad_norm": 2.609375, + "grad_norm_var": 0.044331868489583336, + "learning_rate": 0.0001, + "loss": 4.6352, + "loss/crossentropy": 2.2294809818267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2720891535282135, + "step": 3828 + }, + { + "epoch": 0.0766, + "grad_norm": 2.765625, + "grad_norm_var": 0.0524322509765625, + "learning_rate": 0.0001, + "loss": 4.873, + "loss/crossentropy": 2.2588730454444885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613115608692169, + "step": 3830 + }, + { + "epoch": 0.07664, + "grad_norm": 2.28125, + "grad_norm_var": 0.02584228515625, + "learning_rate": 0.0001, + "loss": 4.5881, + "loss/crossentropy": 2.2658292055130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729395925998688, + "step": 3832 + }, + { + "epoch": 0.07668, + "grad_norm": 2.484375, + "grad_norm_var": 0.02662353515625, + "learning_rate": 0.0001, + "loss": 5.1787, + "loss/crossentropy": 2.314574718475342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28539060056209564, + "step": 3834 + }, + { + "epoch": 0.07672, + "grad_norm": 2.546875, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 5.0388, + "loss/crossentropy": 2.4684417247772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29843954741954803, + "step": 3836 + }, + { + "epoch": 0.07676, + "grad_norm": 2.546875, + "grad_norm_var": 0.025944010416666666, + "learning_rate": 0.0001, + "loss": 4.5976, + "loss/crossentropy": 2.528733253479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696636766195297, + "step": 3838 + }, + { + "epoch": 0.0768, + "grad_norm": 2.390625, + "grad_norm_var": 0.025340779622395834, + "learning_rate": 0.0001, + "loss": 4.6449, + "loss/crossentropy": 2.203901529312134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26598773896694183, + "step": 3840 + }, + { + "epoch": 0.07684, + "grad_norm": 2.5625, + "grad_norm_var": 0.021907552083333334, + "learning_rate": 0.0001, + "loss": 4.7322, + "loss/crossentropy": 1.9192892909049988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25631098449230194, + "step": 3842 + }, + { + "epoch": 0.07688, + "grad_norm": 2.3125, + "grad_norm_var": 0.022077433268229165, + "learning_rate": 0.0001, + "loss": 4.6372, + "loss/crossentropy": 1.8314838409423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22842589765787125, + "step": 3844 + }, + { + "epoch": 0.07692, + "grad_norm": 2.3125, + "grad_norm_var": 0.013395182291666667, + "learning_rate": 0.0001, + "loss": 4.6023, + "loss/crossentropy": 2.2416744232177734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24890189617872238, + "step": 3846 + }, + { + "epoch": 0.07696, + "grad_norm": 2.46875, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 4.8454, + "loss/crossentropy": 2.034749209880829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29580502212047577, + "step": 3848 + }, + { + "epoch": 0.077, + "grad_norm": 2.4375, + "grad_norm_var": 0.0117095947265625, + "learning_rate": 0.0001, + "loss": 4.5923, + "loss/crossentropy": 1.9982805848121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618003487586975, + "step": 3850 + }, + { + "epoch": 0.07704, + "grad_norm": 2.59375, + "grad_norm_var": 0.0127105712890625, + "learning_rate": 0.0001, + "loss": 4.7704, + "loss/crossentropy": 2.065816104412079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2571987137198448, + "step": 3852 + }, + { + "epoch": 0.07708, + "grad_norm": 2.46875, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 4.7493, + "loss/crossentropy": 1.933334231376648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24824900180101395, + "step": 3854 + }, + { + "epoch": 0.07712, + "grad_norm": 2.28125, + "grad_norm_var": 0.010514322916666667, + "learning_rate": 0.0001, + "loss": 4.5805, + "loss/crossentropy": 1.9197405576705933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2559673935174942, + "step": 3856 + }, + { + "epoch": 0.07716, + "grad_norm": 2.3125, + "grad_norm_var": 0.010252888997395833, + "learning_rate": 0.0001, + "loss": 4.429, + "loss/crossentropy": 2.307250142097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613677680492401, + "step": 3858 + }, + { + "epoch": 0.0772, + "grad_norm": 2.28125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.4494, + "loss/crossentropy": 2.1120635271072388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24830741435289383, + "step": 3860 + }, + { + "epoch": 0.07724, + "grad_norm": 2.484375, + "grad_norm_var": 0.039013671875, + "learning_rate": 0.0001, + "loss": 4.8585, + "loss/crossentropy": 2.404169201850891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2960711419582367, + "step": 3862 + }, + { + "epoch": 0.07728, + "grad_norm": 2.421875, + "grad_norm_var": 0.038899739583333336, + "learning_rate": 0.0001, + "loss": 5.0257, + "loss/crossentropy": 2.2490307688713074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2569812461733818, + "step": 3864 + }, + { + "epoch": 0.07732, + "grad_norm": 2.34375, + "grad_norm_var": 0.0386871337890625, + "learning_rate": 0.0001, + "loss": 4.9086, + "loss/crossentropy": 2.0773178339004517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505042105913162, + "step": 3866 + }, + { + "epoch": 0.07736, + "grad_norm": 2.40625, + "grad_norm_var": 0.038899739583333336, + "learning_rate": 0.0001, + "loss": 4.4621, + "loss/crossentropy": 1.83626389503479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22876176983118057, + "step": 3868 + }, + { + "epoch": 0.0774, + "grad_norm": 2.34375, + "grad_norm_var": 0.03911031087239583, + "learning_rate": 0.0001, + "loss": 4.5298, + "loss/crossentropy": 1.8159971833229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22787011414766312, + "step": 3870 + }, + { + "epoch": 0.07744, + "grad_norm": 2.453125, + "grad_norm_var": 0.0378326416015625, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.0361026525497437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23802263289690018, + "step": 3872 + }, + { + "epoch": 0.07748, + "grad_norm": 2.703125, + "grad_norm_var": 0.04433492024739583, + "learning_rate": 0.0001, + "loss": 4.9506, + "loss/crossentropy": 2.2464375495910645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26598919928073883, + "step": 3874 + }, + { + "epoch": 0.07752, + "grad_norm": 2.453125, + "grad_norm_var": 0.0408355712890625, + "learning_rate": 0.0001, + "loss": 4.9416, + "loss/crossentropy": 2.163287401199341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29068198800086975, + "step": 3876 + }, + { + "epoch": 0.07756, + "grad_norm": 2.5, + "grad_norm_var": 0.22014567057291667, + "learning_rate": 0.0001, + "loss": 4.9478, + "loss/crossentropy": 2.1638875007629395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2609352171421051, + "step": 3878 + }, + { + "epoch": 0.0776, + "grad_norm": 2.703125, + "grad_norm_var": 0.21923421223958334, + "learning_rate": 0.0001, + "loss": 4.6247, + "loss/crossentropy": 1.9216270446777344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25062238425016403, + "step": 3880 + }, + { + "epoch": 0.07764, + "grad_norm": 2.828125, + "grad_norm_var": 0.21585184733072918, + "learning_rate": 0.0001, + "loss": 5.0786, + "loss/crossentropy": 2.036958694458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2897229939699173, + "step": 3882 + }, + { + "epoch": 0.07768, + "grad_norm": 2.5625, + "grad_norm_var": 0.2094635009765625, + "learning_rate": 0.0001, + "loss": 4.6062, + "loss/crossentropy": 2.1493492126464844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258526012301445, + "step": 3884 + }, + { + "epoch": 0.07772, + "grad_norm": 2.484375, + "grad_norm_var": 0.20414937337239583, + "learning_rate": 0.0001, + "loss": 4.468, + "loss/crossentropy": 2.0496288537979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2386137992143631, + "step": 3886 + }, + { + "epoch": 0.07776, + "grad_norm": 2.40625, + "grad_norm_var": 0.20829671223958332, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 1.5763422846794128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227308303117752, + "step": 3888 + }, + { + "epoch": 0.0778, + "grad_norm": 2.640625, + "grad_norm_var": 0.20852864583333333, + "learning_rate": 0.0001, + "loss": 5.0582, + "loss/crossentropy": 2.5032416582107544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3047761619091034, + "step": 3890 + }, + { + "epoch": 0.07784, + "grad_norm": 2.375, + "grad_norm_var": 0.21199544270833334, + "learning_rate": 0.0001, + "loss": 4.7694, + "loss/crossentropy": 2.3609601259231567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27848154306411743, + "step": 3892 + }, + { + "epoch": 0.07788, + "grad_norm": 2.328125, + "grad_norm_var": 0.0202056884765625, + "learning_rate": 0.0001, + "loss": 4.776, + "loss/crossentropy": 2.188078999519348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2570301741361618, + "step": 3894 + }, + { + "epoch": 0.07792, + "grad_norm": 2.546875, + "grad_norm_var": 0.018733723958333334, + "learning_rate": 0.0001, + "loss": 4.8374, + "loss/crossentropy": 1.9860637784004211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25376833230257034, + "step": 3896 + }, + { + "epoch": 0.07796, + "grad_norm": 2.234375, + "grad_norm_var": 0.010399373372395833, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.0886037945747375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24766233563423157, + "step": 3898 + }, + { + "epoch": 0.078, + "grad_norm": 2.3125, + "grad_norm_var": 0.009845987955729166, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 2.373010039329529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27916407585144043, + "step": 3900 + }, + { + "epoch": 0.07804, + "grad_norm": 2.359375, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 4.6295, + "loss/crossentropy": 1.6733890771865845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211885154247284, + "step": 3902 + }, + { + "epoch": 0.07808, + "grad_norm": 2.46875, + "grad_norm_var": 0.010184733072916667, + "learning_rate": 0.0001, + "loss": 4.6588, + "loss/crossentropy": 2.0506675243377686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27091431617736816, + "step": 3904 + }, + { + "epoch": 0.07812, + "grad_norm": 2.28125, + "grad_norm_var": 0.008784993489583334, + "learning_rate": 0.0001, + "loss": 4.712, + "loss/crossentropy": 2.3200724124908447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26983049511909485, + "step": 3906 + }, + { + "epoch": 0.07816, + "grad_norm": 2.765625, + "grad_norm_var": 0.0164459228515625, + "learning_rate": 0.0001, + "loss": 4.7171, + "loss/crossentropy": 1.928814709186554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2423655241727829, + "step": 3908 + }, + { + "epoch": 0.0782, + "grad_norm": 2.40625, + "grad_norm_var": 0.016389973958333335, + "learning_rate": 0.0001, + "loss": 4.6944, + "loss/crossentropy": 2.007555842399597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2444767728447914, + "step": 3910 + }, + { + "epoch": 0.07824, + "grad_norm": 2.875, + "grad_norm_var": 0.027469889322916666, + "learning_rate": 0.0001, + "loss": 4.7955, + "loss/crossentropy": 2.2054057121276855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2849784791469574, + "step": 3912 + }, + { + "epoch": 0.07828, + "grad_norm": 2.53125, + "grad_norm_var": 0.03986002604166667, + "learning_rate": 0.0001, + "loss": 4.6885, + "loss/crossentropy": 2.331532597541809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2820790112018585, + "step": 3914 + }, + { + "epoch": 0.07832, + "grad_norm": 2.40625, + "grad_norm_var": 0.036519368489583336, + "learning_rate": 0.0001, + "loss": 4.6967, + "loss/crossentropy": 2.142041563987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25799722969532013, + "step": 3916 + }, + { + "epoch": 0.07836, + "grad_norm": 2.625, + "grad_norm_var": 0.0394683837890625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.010735809803009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24202881753444672, + "step": 3918 + }, + { + "epoch": 0.0784, + "grad_norm": 2.4375, + "grad_norm_var": 0.03970947265625, + "learning_rate": 0.0001, + "loss": 5.037, + "loss/crossentropy": 2.382808804512024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27038049697875977, + "step": 3920 + }, + { + "epoch": 0.07844, + "grad_norm": 2.375, + "grad_norm_var": 0.0372955322265625, + "learning_rate": 0.0001, + "loss": 4.7708, + "loss/crossentropy": 2.099658191204071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557126358151436, + "step": 3922 + }, + { + "epoch": 0.07848, + "grad_norm": 2.375, + "grad_norm_var": 0.03968098958333333, + "learning_rate": 0.0001, + "loss": 4.3775, + "loss/crossentropy": 1.7840275764465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22102414071559906, + "step": 3924 + }, + { + "epoch": 0.07852, + "grad_norm": 2.5, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 4.6637, + "loss/crossentropy": 1.8730725049972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219943791627884, + "step": 3926 + }, + { + "epoch": 0.07856, + "grad_norm": 2.203125, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 4.4432, + "loss/crossentropy": 1.9218623638153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23333143442869186, + "step": 3928 + }, + { + "epoch": 0.0786, + "grad_norm": 2.46875, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 4.8944, + "loss/crossentropy": 1.9885727763175964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2596626430749893, + "step": 3930 + }, + { + "epoch": 0.07864, + "grad_norm": 2.390625, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.9358, + "loss/crossentropy": 2.397018015384674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2790713906288147, + "step": 3932 + }, + { + "epoch": 0.07868, + "grad_norm": 2.1875, + "grad_norm_var": 0.014383951822916666, + "learning_rate": 0.0001, + "loss": 4.3062, + "loss/crossentropy": 1.7345170378684998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23668452352285385, + "step": 3934 + }, + { + "epoch": 0.07872, + "grad_norm": 2.25, + "grad_norm_var": 0.013309733072916666, + "learning_rate": 0.0001, + "loss": 4.6173, + "loss/crossentropy": 1.8630162477493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2371346428990364, + "step": 3936 + }, + { + "epoch": 0.07876, + "grad_norm": 2.40625, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 4.5774, + "loss/crossentropy": 2.0738128423690796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24340355396270752, + "step": 3938 + }, + { + "epoch": 0.0788, + "grad_norm": 2.40625, + "grad_norm_var": 0.010835774739583333, + "learning_rate": 0.0001, + "loss": 5.0027, + "loss/crossentropy": 2.2932467460632324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527567446231842, + "step": 3940 + }, + { + "epoch": 0.07884, + "grad_norm": 2.46875, + "grad_norm_var": 0.009956868489583333, + "learning_rate": 0.0001, + "loss": 4.6785, + "loss/crossentropy": 2.0000113248825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23461253196001053, + "step": 3942 + }, + { + "epoch": 0.07888, + "grad_norm": 2.296875, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 4.7307, + "loss/crossentropy": 2.0753955841064453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535083442926407, + "step": 3944 + }, + { + "epoch": 0.07892, + "grad_norm": 2.40625, + "grad_norm_var": 0.008153279622395834, + "learning_rate": 0.0001, + "loss": 4.6906, + "loss/crossentropy": 2.167261242866516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24734552949666977, + "step": 3946 + }, + { + "epoch": 0.07896, + "grad_norm": 2.578125, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.7347, + "loss/crossentropy": 1.9755831956863403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24861737340688705, + "step": 3948 + }, + { + "epoch": 0.079, + "grad_norm": 2.484375, + "grad_norm_var": 0.009261067708333333, + "learning_rate": 0.0001, + "loss": 4.6813, + "loss/crossentropy": 2.195417881011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2518697530031204, + "step": 3950 + }, + { + "epoch": 0.07904, + "grad_norm": 2.828125, + "grad_norm_var": 0.01920166015625, + "learning_rate": 0.0001, + "loss": 5.0172, + "loss/crossentropy": 2.5771371126174927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28423887491226196, + "step": 3952 + }, + { + "epoch": 0.07908, + "grad_norm": 2.375, + "grad_norm_var": 0.017870076497395835, + "learning_rate": 0.0001, + "loss": 4.7071, + "loss/crossentropy": 1.683276355266571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075405865907669, + "step": 3954 + }, + { + "epoch": 0.07912, + "grad_norm": 2.515625, + "grad_norm_var": 0.018993123372395834, + "learning_rate": 0.0001, + "loss": 4.7128, + "loss/crossentropy": 2.2983756065368652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28010235726833344, + "step": 3956 + }, + { + "epoch": 0.07916, + "grad_norm": 2.421875, + "grad_norm_var": 0.020361328125, + "learning_rate": 0.0001, + "loss": 4.7896, + "loss/crossentropy": 2.3263272047042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28625819087028503, + "step": 3958 + }, + { + "epoch": 0.0792, + "grad_norm": 2.40625, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.5201, + "loss/crossentropy": 1.9820871353149414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2594129145145416, + "step": 3960 + }, + { + "epoch": 0.07924, + "grad_norm": 2.171875, + "grad_norm_var": 0.025634765625, + "learning_rate": 0.0001, + "loss": 4.4754, + "loss/crossentropy": 1.8991515636444092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23694587498903275, + "step": 3962 + }, + { + "epoch": 0.07928, + "grad_norm": 2.921875, + "grad_norm_var": 0.03843994140625, + "learning_rate": 0.0001, + "loss": 4.9865, + "loss/crossentropy": 2.485508918762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2957809865474701, + "step": 3964 + }, + { + "epoch": 0.07932, + "grad_norm": 2.390625, + "grad_norm_var": 0.037262980143229166, + "learning_rate": 0.0001, + "loss": 4.8871, + "loss/crossentropy": 2.156081974506378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27474651485681534, + "step": 3966 + }, + { + "epoch": 0.07936, + "grad_norm": 2.5625, + "grad_norm_var": 0.028544108072916668, + "learning_rate": 0.0001, + "loss": 4.8694, + "loss/crossentropy": 2.0370571613311768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2732074484229088, + "step": 3968 + }, + { + "epoch": 0.0794, + "grad_norm": 2.421875, + "grad_norm_var": 0.028251139322916667, + "learning_rate": 0.0001, + "loss": 5.062, + "loss/crossentropy": 2.3039989471435547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31054411828517914, + "step": 3970 + }, + { + "epoch": 0.07944, + "grad_norm": 2.34375, + "grad_norm_var": 0.028641764322916666, + "learning_rate": 0.0001, + "loss": 4.7875, + "loss/crossentropy": 2.2280107736587524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2711277902126312, + "step": 3972 + }, + { + "epoch": 0.07948, + "grad_norm": 2.625, + "grad_norm_var": 0.02935791015625, + "learning_rate": 0.0001, + "loss": 4.8992, + "loss/crossentropy": 2.0609869956970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24142058193683624, + "step": 3974 + }, + { + "epoch": 0.07952, + "grad_norm": 2.296875, + "grad_norm_var": 0.03277587890625, + "learning_rate": 0.0001, + "loss": 4.1391, + "loss/crossentropy": 2.0541876554489136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24537865817546844, + "step": 3976 + }, + { + "epoch": 0.07956, + "grad_norm": 2.578125, + "grad_norm_var": 0.6102701822916666, + "learning_rate": 0.0001, + "loss": 4.9591, + "loss/crossentropy": 2.344236969947815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2770863175392151, + "step": 3978 + }, + { + "epoch": 0.0796, + "grad_norm": 2.625, + "grad_norm_var": 0.65465087890625, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 1.7899338603019714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23951984196901321, + "step": 3980 + }, + { + "epoch": 0.07964, + "grad_norm": 2.328125, + "grad_norm_var": 0.6591471354166667, + "learning_rate": 0.0001, + "loss": 4.6771, + "loss/crossentropy": 2.3253976106643677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2691802680492401, + "step": 3982 + }, + { + "epoch": 0.07968, + "grad_norm": 2.453125, + "grad_norm_var": 0.65885009765625, + "learning_rate": 0.0001, + "loss": 4.8223, + "loss/crossentropy": 2.14141583442688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2603989467024803, + "step": 3984 + }, + { + "epoch": 0.07972, + "grad_norm": 2.53125, + "grad_norm_var": 0.6512196858723959, + "learning_rate": 0.0001, + "loss": 4.9059, + "loss/crossentropy": 2.262465476989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.291474387049675, + "step": 3986 + }, + { + "epoch": 0.07976, + "grad_norm": 2.328125, + "grad_norm_var": 0.66064453125, + "learning_rate": 0.0001, + "loss": 4.5626, + "loss/crossentropy": 2.1835561990737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23615659773349762, + "step": 3988 + }, + { + "epoch": 0.0798, + "grad_norm": 2.359375, + "grad_norm_var": 0.6716145833333333, + "learning_rate": 0.0001, + "loss": 4.8482, + "loss/crossentropy": 2.020140767097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2593151703476906, + "step": 3990 + }, + { + "epoch": 0.07984, + "grad_norm": 2.59375, + "grad_norm_var": 0.6518513997395833, + "learning_rate": 0.0001, + "loss": 4.7986, + "loss/crossentropy": 2.277661681175232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2721617519855499, + "step": 3992 + }, + { + "epoch": 0.07988, + "grad_norm": 2.34375, + "grad_norm_var": 0.09846089680989584, + "learning_rate": 0.0001, + "loss": 4.5113, + "loss/crossentropy": 2.1883193254470825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610222101211548, + "step": 3994 + }, + { + "epoch": 0.07992, + "grad_norm": 2.421875, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.5987, + "loss/crossentropy": 2.152850031852722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27418845891952515, + "step": 3996 + }, + { + "epoch": 0.07996, + "grad_norm": 2.546875, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.8214, + "loss/crossentropy": 2.3313716650009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27529503405094147, + "step": 3998 + }, + { + "epoch": 0.08, + "grad_norm": 2.640625, + "grad_norm_var": 0.020319620768229168, + "learning_rate": 0.0001, + "loss": 5.086, + "loss/crossentropy": 2.250498414039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26368021965026855, + "step": 4000 + }, + { + "epoch": 0.08004, + "grad_norm": 2.4375, + "grad_norm_var": 0.011188761393229166, + "learning_rate": 0.0001, + "loss": 4.8005, + "loss/crossentropy": 2.322459101676941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2868216335773468, + "step": 4002 + }, + { + "epoch": 0.08008, + "grad_norm": 2.40625, + "grad_norm_var": 0.0142730712890625, + "learning_rate": 0.0001, + "loss": 4.8693, + "loss/crossentropy": 1.9340506792068481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316983863711357, + "step": 4004 + }, + { + "epoch": 0.08012, + "grad_norm": 2.359375, + "grad_norm_var": 0.0150054931640625, + "learning_rate": 0.0001, + "loss": 4.7395, + "loss/crossentropy": 1.8635645508766174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22694773972034454, + "step": 4006 + }, + { + "epoch": 0.08016, + "grad_norm": 10.375, + "grad_norm_var": 3.9615631103515625, + "learning_rate": 0.0001, + "loss": 4.8916, + "loss/crossentropy": 1.9252317547798157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24893560260534286, + "step": 4008 + }, + { + "epoch": 0.0802, + "grad_norm": 2.671875, + "grad_norm_var": 3.9093424479166665, + "learning_rate": 0.0001, + "loss": 5.2636, + "loss/crossentropy": 2.1964328289031982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567693591117859, + "step": 4010 + }, + { + "epoch": 0.08024, + "grad_norm": 2.671875, + "grad_norm_var": 3.8960113525390625, + "learning_rate": 0.0001, + "loss": 4.9054, + "loss/crossentropy": 2.296012043952942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27360107749700546, + "step": 4012 + }, + { + "epoch": 0.08028, + "grad_norm": 2.453125, + "grad_norm_var": 3.9009724934895833, + "learning_rate": 0.0001, + "loss": 4.8894, + "loss/crossentropy": 2.360015869140625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27541905641555786, + "step": 4014 + }, + { + "epoch": 0.08032, + "grad_norm": 2.390625, + "grad_norm_var": 3.906591796875, + "learning_rate": 0.0001, + "loss": 4.8865, + "loss/crossentropy": 2.36370050907135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27881547808647156, + "step": 4016 + }, + { + "epoch": 0.08036, + "grad_norm": 2.25, + "grad_norm_var": 3.9068593343098956, + "learning_rate": 0.0001, + "loss": 4.6461, + "loss/crossentropy": 1.8704780340194702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2512069493532181, + "step": 4018 + }, + { + "epoch": 0.0804, + "grad_norm": 2.4375, + "grad_norm_var": 3.9156483968098956, + "learning_rate": 0.0001, + "loss": 4.733, + "loss/crossentropy": 2.1989234685897827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2755106985569, + "step": 4020 + }, + { + "epoch": 0.08044, + "grad_norm": 2.21875, + "grad_norm_var": 3.9420237223307293, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 1.9905433058738708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579897418618202, + "step": 4022 + }, + { + "epoch": 0.08048, + "grad_norm": 2.515625, + "grad_norm_var": 0.08837788899739583, + "learning_rate": 0.0001, + "loss": 4.9025, + "loss/crossentropy": 2.270000696182251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693728804588318, + "step": 4024 + }, + { + "epoch": 0.08052, + "grad_norm": 2.375, + "grad_norm_var": 0.08504130045572916, + "learning_rate": 0.0001, + "loss": 4.7569, + "loss/crossentropy": 2.178301692008972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617819905281067, + "step": 4026 + }, + { + "epoch": 0.08056, + "grad_norm": 2.25, + "grad_norm_var": 0.08346354166666667, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.518654465675354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29282887279987335, + "step": 4028 + }, + { + "epoch": 0.0806, + "grad_norm": 2.25, + "grad_norm_var": 0.08859049479166667, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 1.9181422591209412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210199177265167, + "step": 4030 + }, + { + "epoch": 0.08064, + "grad_norm": 2.359375, + "grad_norm_var": 0.08816630045572917, + "learning_rate": 0.0001, + "loss": 4.6407, + "loss/crossentropy": 2.343222141265869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28908614814281464, + "step": 4032 + }, + { + "epoch": 0.08068, + "grad_norm": 2.390625, + "grad_norm_var": 0.0302886962890625, + "learning_rate": 0.0001, + "loss": 4.6879, + "loss/crossentropy": 2.0816845893859863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2571762502193451, + "step": 4034 + }, + { + "epoch": 0.08072, + "grad_norm": 2.296875, + "grad_norm_var": 0.029878743489583335, + "learning_rate": 0.0001, + "loss": 4.3324, + "loss/crossentropy": 1.9679544568061829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316160574555397, + "step": 4036 + }, + { + "epoch": 0.08076, + "grad_norm": 2.328125, + "grad_norm_var": 0.029195149739583332, + "learning_rate": 0.0001, + "loss": 4.7335, + "loss/crossentropy": 2.1553682684898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25725623965263367, + "step": 4038 + }, + { + "epoch": 0.0808, + "grad_norm": 2.421875, + "grad_norm_var": 0.004964192708333333, + "learning_rate": 0.0001, + "loss": 4.6892, + "loss/crossentropy": 2.0269790291786194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26298412680625916, + "step": 4040 + }, + { + "epoch": 0.08084, + "grad_norm": 2.484375, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 4.6686, + "loss/crossentropy": 1.984773874282837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24288517236709595, + "step": 4042 + }, + { + "epoch": 0.08088, + "grad_norm": 2.3125, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.9282, + "loss/crossentropy": 2.178356111049652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2684163451194763, + "step": 4044 + }, + { + "epoch": 0.08092, + "grad_norm": 2.46875, + "grad_norm_var": 0.0053670247395833336, + "learning_rate": 0.0001, + "loss": 4.8191, + "loss/crossentropy": 2.235984683036804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26377667486667633, + "step": 4046 + }, + { + "epoch": 0.08096, + "grad_norm": 2.34375, + "grad_norm_var": 0.005826822916666667, + "learning_rate": 0.0001, + "loss": 4.7026, + "loss/crossentropy": 2.085321545600891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23403701931238174, + "step": 4048 + }, + { + "epoch": 0.081, + "grad_norm": 2.3125, + "grad_norm_var": 0.0052642822265625, + "learning_rate": 0.0001, + "loss": 4.9932, + "loss/crossentropy": 2.419228672981262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27236658334732056, + "step": 4050 + }, + { + "epoch": 0.08104, + "grad_norm": 2.421875, + "grad_norm_var": 0.0054972330729166664, + "learning_rate": 0.0001, + "loss": 4.6105, + "loss/crossentropy": 2.153620958328247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28071053326129913, + "step": 4052 + }, + { + "epoch": 0.08108, + "grad_norm": 2.71875, + "grad_norm_var": 0.012743123372395833, + "learning_rate": 0.0001, + "loss": 4.8775, + "loss/crossentropy": 2.1466477513313293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27491800487041473, + "step": 4054 + }, + { + "epoch": 0.08112, + "grad_norm": 2.265625, + "grad_norm_var": 0.013932291666666667, + "learning_rate": 0.0001, + "loss": 4.3707, + "loss/crossentropy": 2.2020710706710815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25149518996477127, + "step": 4056 + }, + { + "epoch": 0.08116, + "grad_norm": 2.34375, + "grad_norm_var": 0.013623046875, + "learning_rate": 0.0001, + "loss": 4.8458, + "loss/crossentropy": 2.264205574989319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27612583339214325, + "step": 4058 + }, + { + "epoch": 0.0812, + "grad_norm": 2.4375, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 4.8644, + "loss/crossentropy": 2.269905209541321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692303955554962, + "step": 4060 + }, + { + "epoch": 0.08124, + "grad_norm": 2.34375, + "grad_norm_var": 0.018505859375, + "learning_rate": 0.0001, + "loss": 4.5057, + "loss/crossentropy": 1.920631766319275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550494968891144, + "step": 4062 + }, + { + "epoch": 0.08128, + "grad_norm": 2.53125, + "grad_norm_var": 0.019749959309895832, + "learning_rate": 0.0001, + "loss": 5.0796, + "loss/crossentropy": 2.307617664337158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22719035297632217, + "step": 4064 + }, + { + "epoch": 0.08132, + "grad_norm": 2.375, + "grad_norm_var": 0.022102864583333333, + "learning_rate": 0.0001, + "loss": 4.6167, + "loss/crossentropy": 2.113444685935974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240354023873806, + "step": 4066 + }, + { + "epoch": 0.08136, + "grad_norm": 2.375, + "grad_norm_var": 0.02232666015625, + "learning_rate": 0.0001, + "loss": 4.9152, + "loss/crossentropy": 2.4516230821609497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27590544521808624, + "step": 4068 + }, + { + "epoch": 0.0814, + "grad_norm": 2.34375, + "grad_norm_var": 0.08772379557291667, + "learning_rate": 0.0001, + "loss": 4.5976, + "loss/crossentropy": 1.8287339806556702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22503511607646942, + "step": 4070 + }, + { + "epoch": 0.08144, + "grad_norm": 2.5, + "grad_norm_var": 0.08479715983072916, + "learning_rate": 0.0001, + "loss": 5.1623, + "loss/crossentropy": 2.3468997478485107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2773839682340622, + "step": 4072 + }, + { + "epoch": 0.08148, + "grad_norm": 2.296875, + "grad_norm_var": 0.08782145182291666, + "learning_rate": 0.0001, + "loss": 4.5413, + "loss/crossentropy": 2.1307512521743774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24247504770755768, + "step": 4074 + }, + { + "epoch": 0.08152, + "grad_norm": 2.21875, + "grad_norm_var": 0.08982645670572917, + "learning_rate": 0.0001, + "loss": 4.7447, + "loss/crossentropy": 2.248755097389221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2696859538555145, + "step": 4076 + }, + { + "epoch": 0.08156, + "grad_norm": 2.265625, + "grad_norm_var": 0.09045817057291666, + "learning_rate": 0.0001, + "loss": 4.2948, + "loss/crossentropy": 2.0233980417251587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370016872882843, + "step": 4078 + }, + { + "epoch": 0.0816, + "grad_norm": 2.234375, + "grad_norm_var": 0.0923828125, + "learning_rate": 0.0001, + "loss": 4.432, + "loss/crossentropy": 1.9536627531051636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23242933303117752, + "step": 4080 + }, + { + "epoch": 0.08164, + "grad_norm": 2.375, + "grad_norm_var": 0.0889801025390625, + "learning_rate": 0.0001, + "loss": 4.5037, + "loss/crossentropy": 1.9631904363632202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2337196245789528, + "step": 4082 + }, + { + "epoch": 0.08168, + "grad_norm": 2.546875, + "grad_norm_var": 0.08935139973958334, + "learning_rate": 0.0001, + "loss": 4.7406, + "loss/crossentropy": 2.193789482116699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687358558177948, + "step": 4084 + }, + { + "epoch": 0.08172, + "grad_norm": 2.40625, + "grad_norm_var": 0.020210774739583333, + "learning_rate": 0.0001, + "loss": 4.6454, + "loss/crossentropy": 2.308240056037903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679348289966583, + "step": 4086 + }, + { + "epoch": 0.08176, + "grad_norm": 2.359375, + "grad_norm_var": 0.021903483072916667, + "learning_rate": 0.0001, + "loss": 4.9126, + "loss/crossentropy": 2.343047261238098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3056950569152832, + "step": 4088 + }, + { + "epoch": 0.0818, + "grad_norm": 2.234375, + "grad_norm_var": 0.022526041666666666, + "learning_rate": 0.0001, + "loss": 4.7315, + "loss/crossentropy": 1.9583085179328918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23911744356155396, + "step": 4090 + }, + { + "epoch": 0.08184, + "grad_norm": 2.296875, + "grad_norm_var": 0.021198527018229166, + "learning_rate": 0.0001, + "loss": 4.6559, + "loss/crossentropy": 2.341569185256958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26053962111473083, + "step": 4092 + }, + { + "epoch": 0.08188, + "grad_norm": 2.921875, + "grad_norm_var": 0.034601847330729164, + "learning_rate": 0.0001, + "loss": 4.5773, + "loss/crossentropy": 2.068669080734253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25753986835479736, + "step": 4094 + }, + { + "epoch": 0.08192, + "grad_norm": 2.625, + "grad_norm_var": 0.04309488932291667, + "learning_rate": 0.0001, + "loss": 5.0253, + "loss/crossentropy": 2.1461241841316223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628704681992531, + "step": 4096 + }, + { + "epoch": 0.08196, + "grad_norm": 2.3125, + "grad_norm_var": 0.04810791015625, + "learning_rate": 0.0001, + "loss": 4.5587, + "loss/crossentropy": 2.0718055963516235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24923217296600342, + "step": 4098 + }, + { + "epoch": 0.082, + "grad_norm": 2.359375, + "grad_norm_var": 0.0503082275390625, + "learning_rate": 0.0001, + "loss": 4.379, + "loss/crossentropy": 1.9812004566192627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316955253481865, + "step": 4100 + }, + { + "epoch": 0.08204, + "grad_norm": 2.3125, + "grad_norm_var": 0.0465240478515625, + "learning_rate": 0.0001, + "loss": 4.7909, + "loss/crossentropy": 2.2669100761413574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2806926518678665, + "step": 4102 + }, + { + "epoch": 0.08208, + "grad_norm": 2.375, + "grad_norm_var": 0.043196614583333334, + "learning_rate": 0.0001, + "loss": 4.7502, + "loss/crossentropy": 2.0620261430740356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610047310590744, + "step": 4104 + }, + { + "epoch": 0.08212, + "grad_norm": 3.046875, + "grad_norm_var": 0.06297098795572917, + "learning_rate": 0.0001, + "loss": 4.6672, + "loss/crossentropy": 2.249971866607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2783341556787491, + "step": 4106 + }, + { + "epoch": 0.08216, + "grad_norm": 2.328125, + "grad_norm_var": 0.06320699055989583, + "learning_rate": 0.0001, + "loss": 4.834, + "loss/crossentropy": 2.1064823865890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27534550428390503, + "step": 4108 + }, + { + "epoch": 0.0822, + "grad_norm": 2.28125, + "grad_norm_var": 0.05406901041666667, + "learning_rate": 0.0001, + "loss": 4.3944, + "loss/crossentropy": 1.886509656906128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2315160632133484, + "step": 4110 + }, + { + "epoch": 0.08224, + "grad_norm": 2.40625, + "grad_norm_var": 0.039290364583333334, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 1.6429635286331177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20593100041151047, + "step": 4112 + }, + { + "epoch": 0.08228, + "grad_norm": 2.40625, + "grad_norm_var": 0.0370025634765625, + "learning_rate": 0.0001, + "loss": 4.4581, + "loss/crossentropy": 2.3236618041992188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674623131752014, + "step": 4114 + }, + { + "epoch": 0.08232, + "grad_norm": 2.578125, + "grad_norm_var": 0.03658447265625, + "learning_rate": 0.0001, + "loss": 4.9734, + "loss/crossentropy": 2.1479567885398865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25003983825445175, + "step": 4116 + }, + { + "epoch": 0.08236, + "grad_norm": 2.578125, + "grad_norm_var": 0.03611551920572917, + "learning_rate": 0.0001, + "loss": 5.0477, + "loss/crossentropy": 2.140569031238556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24322029948234558, + "step": 4118 + }, + { + "epoch": 0.0824, + "grad_norm": 2.328125, + "grad_norm_var": 0.03762613932291667, + "learning_rate": 0.0001, + "loss": 4.6061, + "loss/crossentropy": 2.126375436782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27629759907722473, + "step": 4120 + }, + { + "epoch": 0.08244, + "grad_norm": 2.28125, + "grad_norm_var": 0.015843709309895832, + "learning_rate": 0.0001, + "loss": 4.9143, + "loss/crossentropy": 2.3699214458465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692546397447586, + "step": 4122 + }, + { + "epoch": 0.08248, + "grad_norm": 2.296875, + "grad_norm_var": 0.010904947916666666, + "learning_rate": 0.0001, + "loss": 4.5672, + "loss/crossentropy": 2.013331353664398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24442073702812195, + "step": 4124 + }, + { + "epoch": 0.08252, + "grad_norm": 2.359375, + "grad_norm_var": 0.010152180989583334, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.1869460344314575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250150203704834, + "step": 4126 + }, + { + "epoch": 0.08256, + "grad_norm": 2.203125, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.4564, + "loss/crossentropy": 2.2725884914398193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2810261696577072, + "step": 4128 + }, + { + "epoch": 0.0826, + "grad_norm": 2.328125, + "grad_norm_var": 0.013688151041666667, + "learning_rate": 0.0001, + "loss": 4.7311, + "loss/crossentropy": 1.9190022945404053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693525403738022, + "step": 4130 + }, + { + "epoch": 0.08264, + "grad_norm": 2.65625, + "grad_norm_var": 0.016112263997395834, + "learning_rate": 0.0001, + "loss": 4.7967, + "loss/crossentropy": 2.5477795600891113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27040669322013855, + "step": 4132 + }, + { + "epoch": 0.08268, + "grad_norm": 2.40625, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.7617, + "loss/crossentropy": 2.231198728084564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28101250529289246, + "step": 4134 + }, + { + "epoch": 0.08272, + "grad_norm": 2.296875, + "grad_norm_var": 0.011693318684895834, + "learning_rate": 0.0001, + "loss": 4.6334, + "loss/crossentropy": 2.17776882648468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24743208289146423, + "step": 4136 + }, + { + "epoch": 0.08276, + "grad_norm": 2.46875, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 5.0233, + "loss/crossentropy": 2.418373703956604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2954525351524353, + "step": 4138 + }, + { + "epoch": 0.0828, + "grad_norm": 2.3125, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.5564, + "loss/crossentropy": 2.054605543613434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23863950371742249, + "step": 4140 + }, + { + "epoch": 0.08284, + "grad_norm": 2.5, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 4.8983, + "loss/crossentropy": 2.054013967514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29851172864437103, + "step": 4142 + }, + { + "epoch": 0.08288, + "grad_norm": 2.46875, + "grad_norm_var": 0.008610026041666666, + "learning_rate": 0.0001, + "loss": 4.7425, + "loss/crossentropy": 2.193961024284363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563214898109436, + "step": 4144 + }, + { + "epoch": 0.08292, + "grad_norm": 2.40625, + "grad_norm_var": 0.008382161458333334, + "learning_rate": 0.0001, + "loss": 4.7995, + "loss/crossentropy": 2.460008382797241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26705074310302734, + "step": 4146 + }, + { + "epoch": 0.08296, + "grad_norm": 2.3125, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.894, + "loss/crossentropy": 2.508321523666382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26217761635780334, + "step": 4148 + }, + { + "epoch": 0.083, + "grad_norm": 2.328125, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 4.6103, + "loss/crossentropy": 1.8445284366607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23396535962820053, + "step": 4150 + }, + { + "epoch": 0.08304, + "grad_norm": 2.375, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 4.8048, + "loss/crossentropy": 2.433600902557373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2825329154729843, + "step": 4152 + }, + { + "epoch": 0.08308, + "grad_norm": 2.421875, + "grad_norm_var": 0.0053212483723958336, + "learning_rate": 0.0001, + "loss": 4.8632, + "loss/crossentropy": 2.386221170425415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2941686511039734, + "step": 4154 + }, + { + "epoch": 0.08312, + "grad_norm": 2.421875, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.7486, + "loss/crossentropy": 1.9578949809074402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24851053953170776, + "step": 4156 + }, + { + "epoch": 0.08316, + "grad_norm": 2.3125, + "grad_norm_var": 0.004150390625, + "learning_rate": 0.0001, + "loss": 4.6443, + "loss/crossentropy": 2.0034408569335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25524984300136566, + "step": 4158 + }, + { + "epoch": 0.0832, + "grad_norm": 2.28125, + "grad_norm_var": 0.0069488525390625, + "learning_rate": 0.0001, + "loss": 4.6856, + "loss/crossentropy": 2.3342589139938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31023962795734406, + "step": 4160 + }, + { + "epoch": 0.08324, + "grad_norm": 2.46875, + "grad_norm_var": 0.042867024739583336, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 2.3941839933395386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2641746401786804, + "step": 4162 + }, + { + "epoch": 0.08328, + "grad_norm": 2.453125, + "grad_norm_var": 0.04168294270833333, + "learning_rate": 0.0001, + "loss": 4.7149, + "loss/crossentropy": 2.371219038963318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24197939038276672, + "step": 4164 + }, + { + "epoch": 0.08332, + "grad_norm": 2.734375, + "grad_norm_var": 0.04442952473958333, + "learning_rate": 0.0001, + "loss": 4.7949, + "loss/crossentropy": 2.133378028869629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25449906289577484, + "step": 4166 + }, + { + "epoch": 0.08336, + "grad_norm": 2.4375, + "grad_norm_var": 0.04496968587239583, + "learning_rate": 0.0001, + "loss": 4.5974, + "loss/crossentropy": 1.7460412979125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157151699066162, + "step": 4168 + }, + { + "epoch": 0.0834, + "grad_norm": 2.609375, + "grad_norm_var": 0.046873982747395834, + "learning_rate": 0.0001, + "loss": 4.7234, + "loss/crossentropy": 2.215083122253418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2823774367570877, + "step": 4170 + }, + { + "epoch": 0.08344, + "grad_norm": 2.453125, + "grad_norm_var": 0.04820048014322917, + "learning_rate": 0.0001, + "loss": 4.5189, + "loss/crossentropy": 2.0528377890586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551003098487854, + "step": 4172 + }, + { + "epoch": 0.08348, + "grad_norm": 2.359375, + "grad_norm_var": 0.05054423014322917, + "learning_rate": 0.0001, + "loss": 4.4008, + "loss/crossentropy": 1.7953855395317078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23185917735099792, + "step": 4174 + }, + { + "epoch": 0.08352, + "grad_norm": 2.34375, + "grad_norm_var": 0.058934529622395836, + "learning_rate": 0.0001, + "loss": 4.4879, + "loss/crossentropy": 2.0794734954833984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23386041820049286, + "step": 4176 + }, + { + "epoch": 0.08356, + "grad_norm": 2.234375, + "grad_norm_var": 0.031086222330729166, + "learning_rate": 0.0001, + "loss": 4.3802, + "loss/crossentropy": 2.1685845851898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26074862480163574, + "step": 4178 + }, + { + "epoch": 0.0836, + "grad_norm": 2.484375, + "grad_norm_var": 0.031412760416666664, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.1495825052261353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2689145505428314, + "step": 4180 + }, + { + "epoch": 0.08364, + "grad_norm": 2.328125, + "grad_norm_var": 0.024967447916666666, + "learning_rate": 0.0001, + "loss": 4.5258, + "loss/crossentropy": 2.043331503868103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2498578578233719, + "step": 4182 + }, + { + "epoch": 0.08368, + "grad_norm": 2.734375, + "grad_norm_var": 0.07787984212239583, + "learning_rate": 0.0001, + "loss": 5.0198, + "loss/crossentropy": 2.04026997089386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2873340845108032, + "step": 4184 + }, + { + "epoch": 0.08372, + "grad_norm": 2.515625, + "grad_norm_var": 0.09128316243489583, + "learning_rate": 0.0001, + "loss": 4.9537, + "loss/crossentropy": 2.4653968811035156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2911294251680374, + "step": 4186 + }, + { + "epoch": 0.08376, + "grad_norm": 2.421875, + "grad_norm_var": 0.09215494791666666, + "learning_rate": 0.0001, + "loss": 4.6589, + "loss/crossentropy": 2.2960848808288574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984040319919586, + "step": 4188 + }, + { + "epoch": 0.0838, + "grad_norm": 2.515625, + "grad_norm_var": 0.09599202473958333, + "learning_rate": 0.0001, + "loss": 4.4324, + "loss/crossentropy": 2.011807084083557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523125037550926, + "step": 4190 + }, + { + "epoch": 0.08384, + "grad_norm": 2.421875, + "grad_norm_var": 0.0886871337890625, + "learning_rate": 0.0001, + "loss": 4.8437, + "loss/crossentropy": 2.0016889572143555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291206791996956, + "step": 4192 + }, + { + "epoch": 0.08388, + "grad_norm": 2.1875, + "grad_norm_var": 0.09378255208333333, + "learning_rate": 0.0001, + "loss": 4.3604, + "loss/crossentropy": 1.97197824716568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595779076218605, + "step": 4194 + }, + { + "epoch": 0.08392, + "grad_norm": 2.34375, + "grad_norm_var": 0.09763895670572917, + "learning_rate": 0.0001, + "loss": 4.5823, + "loss/crossentropy": 2.2910103797912598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24870596826076508, + "step": 4196 + }, + { + "epoch": 0.08396, + "grad_norm": 2.234375, + "grad_norm_var": 0.1001617431640625, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.1453208923339844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25001347810029984, + "step": 4198 + }, + { + "epoch": 0.084, + "grad_norm": 2.359375, + "grad_norm_var": 0.0398834228515625, + "learning_rate": 0.0001, + "loss": 4.8835, + "loss/crossentropy": 2.1935043334960938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26807525753974915, + "step": 4200 + }, + { + "epoch": 0.08404, + "grad_norm": 2.3125, + "grad_norm_var": 0.009765625, + "learning_rate": 0.0001, + "loss": 4.5912, + "loss/crossentropy": 2.039341926574707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460380420088768, + "step": 4202 + }, + { + "epoch": 0.08408, + "grad_norm": 3.09375, + "grad_norm_var": 0.0478179931640625, + "learning_rate": 0.0001, + "loss": 4.8243, + "loss/crossentropy": 2.4660122394561768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3042101263999939, + "step": 4204 + }, + { + "epoch": 0.08412, + "grad_norm": 2.453125, + "grad_norm_var": 0.08088785807291667, + "learning_rate": 0.0001, + "loss": 4.8635, + "loss/crossentropy": 1.9346272349357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25924334675073624, + "step": 4206 + }, + { + "epoch": 0.08416, + "grad_norm": 2.1875, + "grad_norm_var": 0.08311258951822917, + "learning_rate": 0.0001, + "loss": 4.5152, + "loss/crossentropy": 2.0120063424110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24327433109283447, + "step": 4208 + }, + { + "epoch": 0.0842, + "grad_norm": 2.703125, + "grad_norm_var": 0.0852935791015625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.2359931468963623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25374244898557663, + "step": 4210 + }, + { + "epoch": 0.08424, + "grad_norm": 2.53125, + "grad_norm_var": 0.08185221354166666, + "learning_rate": 0.0001, + "loss": 4.5751, + "loss/crossentropy": 2.0038134455680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22315364331007004, + "step": 4212 + }, + { + "epoch": 0.08428, + "grad_norm": 2.296875, + "grad_norm_var": 0.08567301432291667, + "learning_rate": 0.0001, + "loss": 4.721, + "loss/crossentropy": 2.2041471004486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655494213104248, + "step": 4214 + }, + { + "epoch": 0.08432, + "grad_norm": 2.390625, + "grad_norm_var": 0.08399149576822916, + "learning_rate": 0.0001, + "loss": 4.8091, + "loss/crossentropy": 2.344551682472229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2743126451969147, + "step": 4216 + }, + { + "epoch": 0.08436, + "grad_norm": 2.453125, + "grad_norm_var": 0.08025614420572917, + "learning_rate": 0.0001, + "loss": 4.7162, + "loss/crossentropy": 1.9694250226020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24425261467695236, + "step": 4218 + }, + { + "epoch": 0.0844, + "grad_norm": 2.40625, + "grad_norm_var": 0.05671284993489583, + "learning_rate": 0.0001, + "loss": 4.8526, + "loss/crossentropy": 2.164921760559082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.272469699382782, + "step": 4220 + }, + { + "epoch": 0.08444, + "grad_norm": 2.484375, + "grad_norm_var": 0.02642822265625, + "learning_rate": 0.0001, + "loss": 4.5513, + "loss/crossentropy": 1.944575309753418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23848393559455872, + "step": 4222 + }, + { + "epoch": 0.08448, + "grad_norm": 2.359375, + "grad_norm_var": 0.025386555989583334, + "learning_rate": 0.0001, + "loss": 4.7416, + "loss/crossentropy": 2.278227686882019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558315545320511, + "step": 4224 + }, + { + "epoch": 0.08452, + "grad_norm": 2.453125, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.7318, + "loss/crossentropy": 2.035117268562317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2585935667157173, + "step": 4226 + }, + { + "epoch": 0.08456, + "grad_norm": 2.234375, + "grad_norm_var": 0.020466105143229166, + "learning_rate": 0.0001, + "loss": 4.5674, + "loss/crossentropy": 2.0172035694122314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23832131922245026, + "step": 4228 + }, + { + "epoch": 0.0846, + "grad_norm": 2.484375, + "grad_norm_var": 0.011165364583333334, + "learning_rate": 0.0001, + "loss": 4.8113, + "loss/crossentropy": 2.0574535727500916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23829226195812225, + "step": 4230 + }, + { + "epoch": 0.08464, + "grad_norm": 2.3125, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.6776, + "loss/crossentropy": 2.5003366470336914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27905476093292236, + "step": 4232 + }, + { + "epoch": 0.08468, + "grad_norm": 2.578125, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.9137, + "loss/crossentropy": 2.207367777824402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27426937222480774, + "step": 4234 + }, + { + "epoch": 0.08472, + "grad_norm": 2.1875, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.716, + "loss/crossentropy": 2.240189790725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26208513230085373, + "step": 4236 + }, + { + "epoch": 0.08476, + "grad_norm": 2.3125, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.4569, + "loss/crossentropy": 2.1357412338256836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108420312404633, + "step": 4238 + }, + { + "epoch": 0.0848, + "grad_norm": 2.296875, + "grad_norm_var": 0.013004557291666666, + "learning_rate": 0.0001, + "loss": 4.7249, + "loss/crossentropy": 2.1073816418647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25213342159986496, + "step": 4240 + }, + { + "epoch": 0.08484, + "grad_norm": 2.40625, + "grad_norm_var": 0.013102213541666666, + "learning_rate": 0.0001, + "loss": 4.9558, + "loss/crossentropy": 2.158124566078186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505848854780197, + "step": 4242 + }, + { + "epoch": 0.08488, + "grad_norm": 2.328125, + "grad_norm_var": 0.011555989583333334, + "learning_rate": 0.0001, + "loss": 4.7743, + "loss/crossentropy": 2.253539562225342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2675466388463974, + "step": 4244 + }, + { + "epoch": 0.08492, + "grad_norm": 2.5, + "grad_norm_var": 0.01396484375, + "learning_rate": 0.0001, + "loss": 4.272, + "loss/crossentropy": 1.7170023918151855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20157357305288315, + "step": 4246 + }, + { + "epoch": 0.08496, + "grad_norm": 2.40625, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 4.4479, + "loss/crossentropy": 2.082640767097473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2600134015083313, + "step": 4248 + }, + { + "epoch": 0.085, + "grad_norm": 2.34375, + "grad_norm_var": 0.0165679931640625, + "learning_rate": 0.0001, + "loss": 4.4176, + "loss/crossentropy": 2.044301390647888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23220707476139069, + "step": 4250 + }, + { + "epoch": 0.08504, + "grad_norm": 2.453125, + "grad_norm_var": 0.0168121337890625, + "learning_rate": 0.0001, + "loss": 4.648, + "loss/crossentropy": 2.293405532836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28027066588401794, + "step": 4252 + }, + { + "epoch": 0.08508, + "grad_norm": 2.265625, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.55, + "loss/crossentropy": 2.2604206800460815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25545646995306015, + "step": 4254 + }, + { + "epoch": 0.08512, + "grad_norm": 2.65625, + "grad_norm_var": 0.023949178059895833, + "learning_rate": 0.0001, + "loss": 4.6258, + "loss/crossentropy": 2.118361234664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24799348413944244, + "step": 4256 + }, + { + "epoch": 0.08516, + "grad_norm": 2.390625, + "grad_norm_var": 0.022847493489583332, + "learning_rate": 0.0001, + "loss": 4.6751, + "loss/crossentropy": 1.9369969964027405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24676478654146194, + "step": 4258 + }, + { + "epoch": 0.0852, + "grad_norm": 2.40625, + "grad_norm_var": 0.021728515625, + "learning_rate": 0.0001, + "loss": 4.5197, + "loss/crossentropy": 2.075170874595642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21742676943540573, + "step": 4260 + }, + { + "epoch": 0.08524, + "grad_norm": 2.296875, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.4112, + "loss/crossentropy": 2.056099236011505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490842342376709, + "step": 4262 + }, + { + "epoch": 0.08528, + "grad_norm": 2.328125, + "grad_norm_var": 0.014436848958333333, + "learning_rate": 0.0001, + "loss": 4.6169, + "loss/crossentropy": 2.2279993891716003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24964337795972824, + "step": 4264 + }, + { + "epoch": 0.08532, + "grad_norm": 2.375, + "grad_norm_var": 0.011393229166666666, + "learning_rate": 0.0001, + "loss": 4.6686, + "loss/crossentropy": 2.1645933389663696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24576786905527115, + "step": 4266 + }, + { + "epoch": 0.08536, + "grad_norm": 2.25, + "grad_norm_var": 0.010887654622395833, + "learning_rate": 0.0001, + "loss": 4.4458, + "loss/crossentropy": 1.9033920764923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23393237590789795, + "step": 4268 + }, + { + "epoch": 0.0854, + "grad_norm": 2.46875, + "grad_norm_var": 0.011865234375, + "learning_rate": 0.0001, + "loss": 4.4022, + "loss/crossentropy": 2.153634190559387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2496839165687561, + "step": 4270 + }, + { + "epoch": 0.08544, + "grad_norm": 2.28125, + "grad_norm_var": 0.0045206705729166664, + "learning_rate": 0.0001, + "loss": 4.4781, + "loss/crossentropy": 1.9188589453697205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544569313526154, + "step": 4272 + }, + { + "epoch": 0.08548, + "grad_norm": 2.328125, + "grad_norm_var": 0.004264322916666666, + "learning_rate": 0.0001, + "loss": 4.704, + "loss/crossentropy": 2.4337977170944214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2951700836420059, + "step": 4274 + }, + { + "epoch": 0.08552, + "grad_norm": 2.359375, + "grad_norm_var": 0.003902180989583333, + "learning_rate": 0.0001, + "loss": 4.7051, + "loss/crossentropy": 1.9108383059501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24459081888198853, + "step": 4276 + }, + { + "epoch": 0.08556, + "grad_norm": 2.375, + "grad_norm_var": 0.003123982747395833, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 1.6632736921310425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22495487332344055, + "step": 4278 + }, + { + "epoch": 0.0856, + "grad_norm": 2.296875, + "grad_norm_var": 0.021312459309895834, + "learning_rate": 0.0001, + "loss": 4.8144, + "loss/crossentropy": 2.519997477531433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657178193330765, + "step": 4280 + }, + { + "epoch": 0.08564, + "grad_norm": 2.640625, + "grad_norm_var": 0.025275675455729167, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 2.150822162628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30192580074071884, + "step": 4282 + }, + { + "epoch": 0.08568, + "grad_norm": 2.421875, + "grad_norm_var": 0.023763020833333332, + "learning_rate": 0.0001, + "loss": 4.7411, + "loss/crossentropy": 1.9970062971115112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24695321917533875, + "step": 4284 + }, + { + "epoch": 0.08572, + "grad_norm": 2.25, + "grad_norm_var": 0.024372355143229166, + "learning_rate": 0.0001, + "loss": 4.5361, + "loss/crossentropy": 2.3136098384857178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25082121044397354, + "step": 4286 + }, + { + "epoch": 0.08576, + "grad_norm": 2.5625, + "grad_norm_var": 0.025992838541666667, + "learning_rate": 0.0001, + "loss": 4.9171, + "loss/crossentropy": 2.112035870552063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27019062638282776, + "step": 4288 + }, + { + "epoch": 0.0858, + "grad_norm": 2.328125, + "grad_norm_var": 0.025992838541666667, + "learning_rate": 0.0001, + "loss": 4.4985, + "loss/crossentropy": 2.068653643131256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24297921359539032, + "step": 4290 + }, + { + "epoch": 0.08584, + "grad_norm": 2.375, + "grad_norm_var": 0.025520833333333333, + "learning_rate": 0.0001, + "loss": 4.5182, + "loss/crossentropy": 1.9013578295707703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23058265447616577, + "step": 4292 + }, + { + "epoch": 0.08588, + "grad_norm": 2.15625, + "grad_norm_var": 0.0294830322265625, + "learning_rate": 0.0001, + "loss": 4.6825, + "loss/crossentropy": 2.149984359741211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25223904848098755, + "step": 4294 + }, + { + "epoch": 0.08592, + "grad_norm": 2.328125, + "grad_norm_var": 0.014469401041666666, + "learning_rate": 0.0001, + "loss": 4.4109, + "loss/crossentropy": 1.894010066986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323223054409027, + "step": 4296 + }, + { + "epoch": 0.08596, + "grad_norm": 2.421875, + "grad_norm_var": 0.010123697916666667, + "learning_rate": 0.0001, + "loss": 4.7653, + "loss/crossentropy": 2.3351621627807617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27194930613040924, + "step": 4298 + }, + { + "epoch": 0.086, + "grad_norm": 2.328125, + "grad_norm_var": 0.0097320556640625, + "learning_rate": 0.0001, + "loss": 4.741, + "loss/crossentropy": 2.224352180957794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24884501099586487, + "step": 4300 + }, + { + "epoch": 0.08604, + "grad_norm": 2.421875, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.6592, + "loss/crossentropy": 1.908318042755127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24747492372989655, + "step": 4302 + }, + { + "epoch": 0.08608, + "grad_norm": 2.296875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.8566, + "loss/crossentropy": 2.1990396976470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27217794954776764, + "step": 4304 + }, + { + "epoch": 0.08612, + "grad_norm": 2.390625, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.6432, + "loss/crossentropy": 2.3146010637283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26875850558280945, + "step": 4306 + }, + { + "epoch": 0.08616, + "grad_norm": 2.46875, + "grad_norm_var": 0.015208943684895834, + "learning_rate": 0.0001, + "loss": 4.8254, + "loss/crossentropy": 2.2507941722869873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27052539587020874, + "step": 4308 + }, + { + "epoch": 0.0862, + "grad_norm": 2.234375, + "grad_norm_var": 0.013199869791666667, + "learning_rate": 0.0001, + "loss": 4.4067, + "loss/crossentropy": 1.9077125787734985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22987178713083267, + "step": 4310 + }, + { + "epoch": 0.08624, + "grad_norm": 2.515625, + "grad_norm_var": 0.01353759765625, + "learning_rate": 0.0001, + "loss": 4.4822, + "loss/crossentropy": 1.951395332813263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392890453338623, + "step": 4312 + }, + { + "epoch": 0.08628, + "grad_norm": 2.53125, + "grad_norm_var": 0.033854166666666664, + "learning_rate": 0.0001, + "loss": 4.5371, + "loss/crossentropy": 1.9426860213279724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24374966323375702, + "step": 4314 + }, + { + "epoch": 0.08632, + "grad_norm": 3.390625, + "grad_norm_var": 0.09062398274739583, + "learning_rate": 0.0001, + "loss": 5.253, + "loss/crossentropy": 2.2508288621902466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3862452507019043, + "step": 4316 + }, + { + "epoch": 0.08636, + "grad_norm": 2.359375, + "grad_norm_var": 0.09058329264322916, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.1161463260650635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557579278945923, + "step": 4318 + }, + { + "epoch": 0.0864, + "grad_norm": 2.3125, + "grad_norm_var": 0.09374593098958334, + "learning_rate": 0.0001, + "loss": 5.1146, + "loss/crossentropy": 2.2570544481277466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31447017192840576, + "step": 4320 + }, + { + "epoch": 0.08644, + "grad_norm": 2.328125, + "grad_norm_var": 0.08740234375, + "learning_rate": 0.0001, + "loss": 4.9724, + "loss/crossentropy": 2.3211100101470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26107798516750336, + "step": 4322 + }, + { + "epoch": 0.08648, + "grad_norm": 2.25, + "grad_norm_var": 0.09231669108072917, + "learning_rate": 0.0001, + "loss": 4.5236, + "loss/crossentropy": 2.1451058387756348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23633568733930588, + "step": 4324 + }, + { + "epoch": 0.08652, + "grad_norm": 2.21875, + "grad_norm_var": 0.09463602701822917, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 1.9880141615867615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688867688179016, + "step": 4326 + }, + { + "epoch": 0.08656, + "grad_norm": 2.296875, + "grad_norm_var": 0.09724934895833333, + "learning_rate": 0.0001, + "loss": 4.7098, + "loss/crossentropy": 2.021056890487671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2543798238039017, + "step": 4328 + }, + { + "epoch": 0.0866, + "grad_norm": 2.359375, + "grad_norm_var": 0.0814453125, + "learning_rate": 0.0001, + "loss": 4.6439, + "loss/crossentropy": 2.1323755979537964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27410852909088135, + "step": 4330 + }, + { + "epoch": 0.08664, + "grad_norm": 2.296875, + "grad_norm_var": 0.014940388997395833, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 1.9674875736236572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24818265438079834, + "step": 4332 + }, + { + "epoch": 0.08668, + "grad_norm": 2.25, + "grad_norm_var": 0.01490478515625, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.0466583967208862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567761391401291, + "step": 4334 + }, + { + "epoch": 0.08672, + "grad_norm": 2.375, + "grad_norm_var": 0.004423014322916667, + "learning_rate": 0.0001, + "loss": 4.5937, + "loss/crossentropy": 2.138857126235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26430967450141907, + "step": 4336 + }, + { + "epoch": 0.08676, + "grad_norm": 2.390625, + "grad_norm_var": 0.003902180989583333, + "learning_rate": 0.0001, + "loss": 4.7168, + "loss/crossentropy": 2.164841413497925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24389629065990448, + "step": 4338 + }, + { + "epoch": 0.0868, + "grad_norm": 2.4375, + "grad_norm_var": 0.005322265625, + "learning_rate": 0.0001, + "loss": 4.6017, + "loss/crossentropy": 2.2220189571380615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24361558258533478, + "step": 4340 + }, + { + "epoch": 0.08684, + "grad_norm": 2.390625, + "grad_norm_var": 0.004792277018229167, + "learning_rate": 0.0001, + "loss": 4.3088, + "loss/crossentropy": 1.7106285095214844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21558403968811035, + "step": 4342 + }, + { + "epoch": 0.08688, + "grad_norm": 2.203125, + "grad_norm_var": 0.0053670247395833336, + "learning_rate": 0.0001, + "loss": 4.1647, + "loss/crossentropy": 1.9200173020362854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502904310822487, + "step": 4344 + }, + { + "epoch": 0.08692, + "grad_norm": 2.828125, + "grad_norm_var": 0.020796712239583334, + "learning_rate": 0.0001, + "loss": 4.6817, + "loss/crossentropy": 1.883722960948944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24818243086338043, + "step": 4346 + }, + { + "epoch": 0.08696, + "grad_norm": 2.484375, + "grad_norm_var": 0.026146443684895833, + "learning_rate": 0.0001, + "loss": 4.7509, + "loss/crossentropy": 2.2069878578186035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2518744319677353, + "step": 4348 + }, + { + "epoch": 0.087, + "grad_norm": 2.484375, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 4.7235, + "loss/crossentropy": 2.2158325910568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551049590110779, + "step": 4350 + }, + { + "epoch": 0.08704, + "grad_norm": 2.359375, + "grad_norm_var": 0.02730712890625, + "learning_rate": 0.0001, + "loss": 4.8406, + "loss/crossentropy": 2.0580105781555176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2742728739976883, + "step": 4352 + }, + { + "epoch": 0.08708, + "grad_norm": 2.34375, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 4.5267, + "loss/crossentropy": 2.190276265144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.274506613612175, + "step": 4354 + }, + { + "epoch": 0.08712, + "grad_norm": 2.3125, + "grad_norm_var": 0.027242024739583332, + "learning_rate": 0.0001, + "loss": 4.7499, + "loss/crossentropy": 2.2595328092575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25295622646808624, + "step": 4356 + }, + { + "epoch": 0.08716, + "grad_norm": 2.4375, + "grad_norm_var": 0.027391560872395835, + "learning_rate": 0.0001, + "loss": 5.1247, + "loss/crossentropy": 2.322342872619629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490948587656021, + "step": 4358 + }, + { + "epoch": 0.0872, + "grad_norm": 2.328125, + "grad_norm_var": 0.024665323893229167, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.108223795890808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25404803454875946, + "step": 4360 + }, + { + "epoch": 0.08724, + "grad_norm": 2.3125, + "grad_norm_var": 0.013825480143229167, + "learning_rate": 0.0001, + "loss": 4.5332, + "loss/crossentropy": 2.1363136768341064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679157853126526, + "step": 4362 + }, + { + "epoch": 0.08728, + "grad_norm": 2.390625, + "grad_norm_var": 0.0076171875, + "learning_rate": 0.0001, + "loss": 4.6418, + "loss/crossentropy": 2.3207738399505615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2725762128829956, + "step": 4364 + }, + { + "epoch": 0.08732, + "grad_norm": 2.40625, + "grad_norm_var": 0.004813639322916666, + "learning_rate": 0.0001, + "loss": 4.7431, + "loss/crossentropy": 2.3179128170013428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27275949716567993, + "step": 4366 + }, + { + "epoch": 0.08736, + "grad_norm": 2.328125, + "grad_norm_var": 0.005182902018229167, + "learning_rate": 0.0001, + "loss": 4.7355, + "loss/crossentropy": 2.2130206823349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2787477523088455, + "step": 4368 + }, + { + "epoch": 0.0874, + "grad_norm": 2.40625, + "grad_norm_var": 0.004548136393229167, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.3350926637649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26483266800642014, + "step": 4370 + }, + { + "epoch": 0.08744, + "grad_norm": 2.4375, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 0.0001, + "loss": 4.7329, + "loss/crossentropy": 1.9162638187408447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748356848955154, + "step": 4372 + }, + { + "epoch": 0.08748, + "grad_norm": 2.484375, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.2708429098129272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2709425985813141, + "step": 4374 + }, + { + "epoch": 0.08752, + "grad_norm": 2.234375, + "grad_norm_var": 0.011604817708333333, + "learning_rate": 0.0001, + "loss": 4.3481, + "loss/crossentropy": 1.7216318845748901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22498781234025955, + "step": 4376 + }, + { + "epoch": 0.08756, + "grad_norm": 2.328125, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.4261, + "loss/crossentropy": 2.144331693649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25400668382644653, + "step": 4378 + }, + { + "epoch": 0.0876, + "grad_norm": 2.328125, + "grad_norm_var": 0.011750284830729167, + "learning_rate": 0.0001, + "loss": 4.5617, + "loss/crossentropy": 2.305369734764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29137127101421356, + "step": 4380 + }, + { + "epoch": 0.08764, + "grad_norm": 2.65625, + "grad_norm_var": 0.018115234375, + "learning_rate": 0.0001, + "loss": 4.6861, + "loss/crossentropy": 2.1156765818595886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24203064292669296, + "step": 4382 + }, + { + "epoch": 0.08768, + "grad_norm": 2.28125, + "grad_norm_var": 0.016630045572916665, + "learning_rate": 0.0001, + "loss": 4.4544, + "loss/crossentropy": 1.9081769585609436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22396781295537949, + "step": 4384 + }, + { + "epoch": 0.08772, + "grad_norm": 2.265625, + "grad_norm_var": 0.017769368489583333, + "learning_rate": 0.0001, + "loss": 4.704, + "loss/crossentropy": 2.0938609838485718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27330371737480164, + "step": 4386 + }, + { + "epoch": 0.08776, + "grad_norm": 2.34375, + "grad_norm_var": 0.017748006184895835, + "learning_rate": 0.0001, + "loss": 4.1179, + "loss/crossentropy": 1.9685207605361938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287725731730461, + "step": 4388 + }, + { + "epoch": 0.0878, + "grad_norm": 2.3125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 4.5233, + "loss/crossentropy": 1.7536925673484802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345883920788765, + "step": 4390 + }, + { + "epoch": 0.08784, + "grad_norm": 2.359375, + "grad_norm_var": 0.029150390625, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 2.145567834377289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26126645505428314, + "step": 4392 + }, + { + "epoch": 0.08788, + "grad_norm": 2.390625, + "grad_norm_var": 0.028348795572916665, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 2.2211687564849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3332909345626831, + "step": 4394 + }, + { + "epoch": 0.08792, + "grad_norm": 2.375, + "grad_norm_var": 0.028539021809895832, + "learning_rate": 0.0001, + "loss": 4.5545, + "loss/crossentropy": 1.919084072113037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480143904685974, + "step": 4396 + }, + { + "epoch": 0.08796, + "grad_norm": 2.421875, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 4.6408, + "loss/crossentropy": 2.1769548654556274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.269208699464798, + "step": 4398 + }, + { + "epoch": 0.088, + "grad_norm": 2.5, + "grad_norm_var": 0.028706868489583332, + "learning_rate": 0.0001, + "loss": 4.7372, + "loss/crossentropy": 1.8480086922645569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2432136833667755, + "step": 4400 + }, + { + "epoch": 0.08804, + "grad_norm": 2.46875, + "grad_norm_var": 0.026838175455729165, + "learning_rate": 0.0001, + "loss": 4.7303, + "loss/crossentropy": 2.1948903799057007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28328536450862885, + "step": 4402 + }, + { + "epoch": 0.08808, + "grad_norm": 2.40625, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 4.6929, + "loss/crossentropy": 2.163570761680603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.282375693321228, + "step": 4404 + }, + { + "epoch": 0.08812, + "grad_norm": 2.34375, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 4.6882, + "loss/crossentropy": 2.3737696409225464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26703887432813644, + "step": 4406 + }, + { + "epoch": 0.08816, + "grad_norm": 2.71875, + "grad_norm_var": 0.015192667643229166, + "learning_rate": 0.0001, + "loss": 4.6959, + "loss/crossentropy": 2.2449779510498047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2744368612766266, + "step": 4408 + }, + { + "epoch": 0.0882, + "grad_norm": 2.515625, + "grad_norm_var": 0.015348307291666667, + "learning_rate": 0.0001, + "loss": 4.6602, + "loss/crossentropy": 2.1167399287223816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28688907623291016, + "step": 4410 + }, + { + "epoch": 0.08824, + "grad_norm": 2.3125, + "grad_norm_var": 0.017118326822916665, + "learning_rate": 0.0001, + "loss": 4.3777, + "loss/crossentropy": 2.2249929904937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24534232914447784, + "step": 4412 + }, + { + "epoch": 0.08828, + "grad_norm": 2.421875, + "grad_norm_var": 0.019261678059895832, + "learning_rate": 0.0001, + "loss": 4.7013, + "loss/crossentropy": 2.172566771507263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2783561646938324, + "step": 4414 + }, + { + "epoch": 0.08832, + "grad_norm": 2.234375, + "grad_norm_var": 0.018684895833333333, + "learning_rate": 0.0001, + "loss": 4.3536, + "loss/crossentropy": 2.0709031224250793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419627606868744, + "step": 4416 + }, + { + "epoch": 0.08836, + "grad_norm": 3.359375, + "grad_norm_var": 0.0735260009765625, + "learning_rate": 0.0001, + "loss": 4.8378, + "loss/crossentropy": 2.2390655279159546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27328142523765564, + "step": 4418 + }, + { + "epoch": 0.0884, + "grad_norm": 2.59375, + "grad_norm_var": 0.07681884765625, + "learning_rate": 0.0001, + "loss": 4.6879, + "loss/crossentropy": 2.061118960380554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23206621408462524, + "step": 4420 + }, + { + "epoch": 0.08844, + "grad_norm": 2.140625, + "grad_norm_var": 0.09147847493489583, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 1.4919558763504028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17881463468074799, + "step": 4422 + }, + { + "epoch": 0.08848, + "grad_norm": 2.53125, + "grad_norm_var": 0.08662109375, + "learning_rate": 0.0001, + "loss": 4.7705, + "loss/crossentropy": 2.267301082611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27718164026737213, + "step": 4424 + }, + { + "epoch": 0.08852, + "grad_norm": 2.375, + "grad_norm_var": 0.0862457275390625, + "learning_rate": 0.0001, + "loss": 4.635, + "loss/crossentropy": 1.9008094668388367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22865734994411469, + "step": 4426 + }, + { + "epoch": 0.08856, + "grad_norm": 2.421875, + "grad_norm_var": 0.090966796875, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 1.8788060545921326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23521529138088226, + "step": 4428 + }, + { + "epoch": 0.0886, + "grad_norm": 2.546875, + "grad_norm_var": 0.08844401041666666, + "learning_rate": 0.0001, + "loss": 4.6703, + "loss/crossentropy": 2.0775802731513977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3402235209941864, + "step": 4430 + }, + { + "epoch": 0.08864, + "grad_norm": 2.515625, + "grad_norm_var": 0.08642171223958334, + "learning_rate": 0.0001, + "loss": 4.5722, + "loss/crossentropy": 2.0950201749801636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2665044367313385, + "step": 4432 + }, + { + "epoch": 0.08868, + "grad_norm": 2.40625, + "grad_norm_var": 0.026167805989583334, + "learning_rate": 0.0001, + "loss": 4.9235, + "loss/crossentropy": 2.328200340270996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24911007285118103, + "step": 4434 + }, + { + "epoch": 0.08872, + "grad_norm": 2.515625, + "grad_norm_var": 0.023273722330729166, + "learning_rate": 0.0001, + "loss": 4.7462, + "loss/crossentropy": 2.1840142011642456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855416387319565, + "step": 4436 + }, + { + "epoch": 0.08876, + "grad_norm": 2.25, + "grad_norm_var": 0.015559895833333334, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.6167555451393127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21646380424499512, + "step": 4438 + }, + { + "epoch": 0.0888, + "grad_norm": 2.25, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 4.5274, + "loss/crossentropy": 1.9902858138084412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237789124250412, + "step": 4440 + }, + { + "epoch": 0.08884, + "grad_norm": 2.359375, + "grad_norm_var": 0.016357421875, + "learning_rate": 0.0001, + "loss": 4.6682, + "loss/crossentropy": 2.4779287576675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2726414203643799, + "step": 4442 + }, + { + "epoch": 0.08888, + "grad_norm": 2.46875, + "grad_norm_var": 0.0116119384765625, + "learning_rate": 0.0001, + "loss": 4.7415, + "loss/crossentropy": 2.0914896726608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887279838323593, + "step": 4444 + }, + { + "epoch": 0.08892, + "grad_norm": 2.265625, + "grad_norm_var": 0.010570271809895834, + "learning_rate": 0.0001, + "loss": 4.498, + "loss/crossentropy": 2.0526055693626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2509382963180542, + "step": 4446 + }, + { + "epoch": 0.08896, + "grad_norm": 2.15625, + "grad_norm_var": 0.010347493489583333, + "learning_rate": 0.0001, + "loss": 4.4306, + "loss/crossentropy": 1.9779084920883179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320951297879219, + "step": 4448 + }, + { + "epoch": 0.089, + "grad_norm": 2.421875, + "grad_norm_var": 0.010399373372395833, + "learning_rate": 0.0001, + "loss": 4.7725, + "loss/crossentropy": 2.2081698179244995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29023079574108124, + "step": 4450 + }, + { + "epoch": 0.08904, + "grad_norm": 2.296875, + "grad_norm_var": 0.0074127197265625, + "learning_rate": 0.0001, + "loss": 4.542, + "loss/crossentropy": 1.834806501865387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23430980741977692, + "step": 4452 + }, + { + "epoch": 0.08908, + "grad_norm": 2.28125, + "grad_norm_var": 0.006884765625, + "learning_rate": 0.0001, + "loss": 4.7087, + "loss/crossentropy": 2.4750468730926514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27945323288440704, + "step": 4454 + }, + { + "epoch": 0.08912, + "grad_norm": 2.28125, + "grad_norm_var": 0.007255045572916666, + "learning_rate": 0.0001, + "loss": 4.6822, + "loss/crossentropy": 2.1766942739486694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28881968557834625, + "step": 4456 + }, + { + "epoch": 0.08916, + "grad_norm": 2.28125, + "grad_norm_var": 0.009065755208333333, + "learning_rate": 0.0001, + "loss": 4.9204, + "loss/crossentropy": 2.265195846557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2616717368364334, + "step": 4458 + }, + { + "epoch": 0.0892, + "grad_norm": 2.171875, + "grad_norm_var": 0.011473592122395833, + "learning_rate": 0.0001, + "loss": 4.5747, + "loss/crossentropy": 2.2438716888427734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2630545049905777, + "step": 4460 + }, + { + "epoch": 0.08924, + "grad_norm": 2.28125, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 2.060324013233185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639824986457825, + "step": 4462 + }, + { + "epoch": 0.08928, + "grad_norm": 2.453125, + "grad_norm_var": 0.0331207275390625, + "learning_rate": 0.0001, + "loss": 4.733, + "loss/crossentropy": 1.8830525279045105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016474813222885, + "step": 4464 + }, + { + "epoch": 0.08932, + "grad_norm": 2.484375, + "grad_norm_var": 0.03435872395833333, + "learning_rate": 0.0001, + "loss": 4.7571, + "loss/crossentropy": 2.3420846462249756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28108468651771545, + "step": 4466 + }, + { + "epoch": 0.08936, + "grad_norm": 2.515625, + "grad_norm_var": 0.0337310791015625, + "learning_rate": 0.0001, + "loss": 4.6851, + "loss/crossentropy": 1.9140342473983765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22587041556835175, + "step": 4468 + }, + { + "epoch": 0.0894, + "grad_norm": 2.390625, + "grad_norm_var": 0.032613118489583336, + "learning_rate": 0.0001, + "loss": 4.79, + "loss/crossentropy": 1.9753122925758362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24506878852844238, + "step": 4470 + }, + { + "epoch": 0.08944, + "grad_norm": 2.3125, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 4.7769, + "loss/crossentropy": 2.0735195875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2763114273548126, + "step": 4472 + }, + { + "epoch": 0.08948, + "grad_norm": 2.28125, + "grad_norm_var": 0.034403483072916664, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 1.7784460186958313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22829323261976242, + "step": 4474 + }, + { + "epoch": 0.08952, + "grad_norm": 2.625, + "grad_norm_var": 0.051041666666666666, + "learning_rate": 0.0001, + "loss": 4.9451, + "loss/crossentropy": 2.1188095808029175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27410270273685455, + "step": 4476 + }, + { + "epoch": 0.08956, + "grad_norm": 2.421875, + "grad_norm_var": 0.051634724934895834, + "learning_rate": 0.0001, + "loss": 4.3392, + "loss/crossentropy": 2.320235252380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27514997124671936, + "step": 4478 + }, + { + "epoch": 0.0896, + "grad_norm": 2.265625, + "grad_norm_var": 0.03766988118489583, + "learning_rate": 0.0001, + "loss": 4.8345, + "loss/crossentropy": 2.3023080825805664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2516366094350815, + "step": 4480 + }, + { + "epoch": 0.08964, + "grad_norm": 2.390625, + "grad_norm_var": 0.039567057291666666, + "learning_rate": 0.0001, + "loss": 4.6698, + "loss/crossentropy": 1.9677563905715942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362382560968399, + "step": 4482 + }, + { + "epoch": 0.08968, + "grad_norm": 2.34375, + "grad_norm_var": 0.03951416015625, + "learning_rate": 0.0001, + "loss": 4.9159, + "loss/crossentropy": 2.2005198001861572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760336175560951, + "step": 4484 + }, + { + "epoch": 0.08972, + "grad_norm": 2.0625, + "grad_norm_var": 0.0478912353515625, + "learning_rate": 0.0001, + "loss": 4.4418, + "loss/crossentropy": 1.9799931049346924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25132423639297485, + "step": 4486 + }, + { + "epoch": 0.08976, + "grad_norm": 2.46875, + "grad_norm_var": 0.10212300618489584, + "learning_rate": 0.0001, + "loss": 4.9643, + "loss/crossentropy": 2.205371141433716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2695635259151459, + "step": 4488 + }, + { + "epoch": 0.0898, + "grad_norm": 2.421875, + "grad_norm_var": 0.09692281087239583, + "learning_rate": 0.0001, + "loss": 4.9695, + "loss/crossentropy": 2.3500062227249146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736963629722595, + "step": 4490 + }, + { + "epoch": 0.08984, + "grad_norm": 2.59375, + "grad_norm_var": 0.07867431640625, + "learning_rate": 0.0001, + "loss": 4.9083, + "loss/crossentropy": 2.386352837085724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2472890019416809, + "step": 4492 + }, + { + "epoch": 0.08988, + "grad_norm": 2.265625, + "grad_norm_var": 0.07701416015625, + "learning_rate": 0.0001, + "loss": 4.3598, + "loss/crossentropy": 1.9863982200622559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288871705532074, + "step": 4494 + }, + { + "epoch": 0.08992, + "grad_norm": 2.34375, + "grad_norm_var": 0.07757059733072917, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.4088594913482666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24304527044296265, + "step": 4496 + }, + { + "epoch": 0.08996, + "grad_norm": 2.5, + "grad_norm_var": 0.07517801920572917, + "learning_rate": 0.0001, + "loss": 4.7457, + "loss/crossentropy": 2.1663140058517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681227922439575, + "step": 4498 + }, + { + "epoch": 0.09, + "grad_norm": 2.171875, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 1.9233570098876953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169977217912674, + "step": 4500 + }, + { + "epoch": 0.09004, + "grad_norm": 2.40625, + "grad_norm_var": 0.07579752604166666, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 2.2940425872802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24809539318084717, + "step": 4502 + }, + { + "epoch": 0.09008, + "grad_norm": 2.21875, + "grad_norm_var": 0.015771484375, + "learning_rate": 0.0001, + "loss": 4.5333, + "loss/crossentropy": 2.1874176263809204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26286616921424866, + "step": 4504 + }, + { + "epoch": 0.09012, + "grad_norm": 2.28125, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 4.5029, + "loss/crossentropy": 2.190543472766876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26766398549079895, + "step": 4506 + }, + { + "epoch": 0.09016, + "grad_norm": 2.296875, + "grad_norm_var": 0.010823567708333334, + "learning_rate": 0.0001, + "loss": 4.464, + "loss/crossentropy": 2.132491707801819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514026165008545, + "step": 4508 + }, + { + "epoch": 0.0902, + "grad_norm": 2.296875, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 4.5032, + "loss/crossentropy": 2.1492353677749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25663119554519653, + "step": 4510 + }, + { + "epoch": 0.09024, + "grad_norm": 2.203125, + "grad_norm_var": 0.011454264322916666, + "learning_rate": 0.0001, + "loss": 4.6166, + "loss/crossentropy": 2.2471927404403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24812395125627518, + "step": 4512 + }, + { + "epoch": 0.09028, + "grad_norm": 2.34375, + "grad_norm_var": 0.007013956705729167, + "learning_rate": 0.0001, + "loss": 4.5651, + "loss/crossentropy": 2.1944304704666138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26567137241363525, + "step": 4514 + }, + { + "epoch": 0.09032, + "grad_norm": 2.203125, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 4.3054, + "loss/crossentropy": 1.7537739872932434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21784386038780212, + "step": 4516 + }, + { + "epoch": 0.09036, + "grad_norm": 2.46875, + "grad_norm_var": 0.006981404622395834, + "learning_rate": 0.0001, + "loss": 4.6961, + "loss/crossentropy": 2.133580207824707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27865441143512726, + "step": 4518 + }, + { + "epoch": 0.0904, + "grad_norm": 2.5, + "grad_norm_var": 0.008723958333333334, + "learning_rate": 0.0001, + "loss": 4.5486, + "loss/crossentropy": 2.0858315229415894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26390860974788666, + "step": 4520 + }, + { + "epoch": 0.09044, + "grad_norm": 2.3125, + "grad_norm_var": 0.008101399739583333, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 1.9084516763687134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23013630509376526, + "step": 4522 + }, + { + "epoch": 0.09048, + "grad_norm": 2.234375, + "grad_norm_var": 0.00859375, + "learning_rate": 0.0001, + "loss": 4.5817, + "loss/crossentropy": 2.2123712301254272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2739466577768326, + "step": 4524 + }, + { + "epoch": 0.09052, + "grad_norm": 2.453125, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 4.9898, + "loss/crossentropy": 2.3532934188842773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25509175658226013, + "step": 4526 + }, + { + "epoch": 0.09056, + "grad_norm": 2.328125, + "grad_norm_var": 0.008137003580729166, + "learning_rate": 0.0001, + "loss": 4.7786, + "loss/crossentropy": 2.1543048620224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639082163572311, + "step": 4528 + }, + { + "epoch": 0.0906, + "grad_norm": 2.4375, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 4.647, + "loss/crossentropy": 2.1322286128997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23874164372682571, + "step": 4530 + }, + { + "epoch": 0.09064, + "grad_norm": 2.25, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.7623, + "loss/crossentropy": 2.5552597045898438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24949757009744644, + "step": 4532 + }, + { + "epoch": 0.09068, + "grad_norm": 2.421875, + "grad_norm_var": 0.0117095947265625, + "learning_rate": 0.0001, + "loss": 4.6999, + "loss/crossentropy": 2.1248819231987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2795845717191696, + "step": 4534 + }, + { + "epoch": 0.09072, + "grad_norm": 2.390625, + "grad_norm_var": 0.014655558268229167, + "learning_rate": 0.0001, + "loss": 4.7739, + "loss/crossentropy": 1.985984206199646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22764238715171814, + "step": 4536 + }, + { + "epoch": 0.09076, + "grad_norm": 2.328125, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 4.64, + "loss/crossentropy": 2.220720648765564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2693287283182144, + "step": 4538 + }, + { + "epoch": 0.0908, + "grad_norm": 2.3125, + "grad_norm_var": 0.01324462890625, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 1.9541595578193665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419680804014206, + "step": 4540 + }, + { + "epoch": 0.09084, + "grad_norm": 2.53125, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 4.963, + "loss/crossentropy": 2.275113582611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2562567666172981, + "step": 4542 + }, + { + "epoch": 0.09088, + "grad_norm": 2.3125, + "grad_norm_var": 0.017122395833333335, + "learning_rate": 0.0001, + "loss": 4.7535, + "loss/crossentropy": 2.4411803483963013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25798996537923813, + "step": 4544 + }, + { + "epoch": 0.09092, + "grad_norm": 2.34375, + "grad_norm_var": 0.019823201497395835, + "learning_rate": 0.0001, + "loss": 4.7339, + "loss/crossentropy": 2.035769820213318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25313758105039597, + "step": 4546 + }, + { + "epoch": 0.09096, + "grad_norm": 2.171875, + "grad_norm_var": 0.018701171875, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 1.9237529039382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24456002563238144, + "step": 4548 + }, + { + "epoch": 0.091, + "grad_norm": 2.234375, + "grad_norm_var": 0.02281494140625, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 2.179704189300537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2681535929441452, + "step": 4550 + }, + { + "epoch": 0.09104, + "grad_norm": 2.46875, + "grad_norm_var": 0.024137369791666665, + "learning_rate": 0.0001, + "loss": 4.6899, + "loss/crossentropy": 2.013023316860199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22643990069627762, + "step": 4552 + }, + { + "epoch": 0.09108, + "grad_norm": 2.453125, + "grad_norm_var": 0.024201456705729166, + "learning_rate": 0.0001, + "loss": 4.6527, + "loss/crossentropy": 2.173883557319641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26495447754859924, + "step": 4554 + }, + { + "epoch": 0.09112, + "grad_norm": 2.296875, + "grad_norm_var": 0.025121053059895832, + "learning_rate": 0.0001, + "loss": 4.5323, + "loss/crossentropy": 1.9398870468139648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24307211488485336, + "step": 4556 + }, + { + "epoch": 0.09116, + "grad_norm": 2.28125, + "grad_norm_var": 0.025423177083333335, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.8551223874092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21659143269062042, + "step": 4558 + }, + { + "epoch": 0.0912, + "grad_norm": 2.375, + "grad_norm_var": 0.0254058837890625, + "learning_rate": 0.0001, + "loss": 4.7813, + "loss/crossentropy": 2.104207456111908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26515253633260727, + "step": 4560 + }, + { + "epoch": 0.09124, + "grad_norm": 2.484375, + "grad_norm_var": 0.023875935872395834, + "learning_rate": 0.0001, + "loss": 4.6241, + "loss/crossentropy": 2.31631863117218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25454505532979965, + "step": 4562 + }, + { + "epoch": 0.09128, + "grad_norm": 2.265625, + "grad_norm_var": 0.021800740559895834, + "learning_rate": 0.0001, + "loss": 4.5732, + "loss/crossentropy": 2.331356406211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2667195200920105, + "step": 4564 + }, + { + "epoch": 0.09132, + "grad_norm": 2.515625, + "grad_norm_var": 0.0176910400390625, + "learning_rate": 0.0001, + "loss": 4.6714, + "loss/crossentropy": 2.2126184701919556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24132181704044342, + "step": 4566 + }, + { + "epoch": 0.09136, + "grad_norm": 2.25, + "grad_norm_var": 0.013167317708333333, + "learning_rate": 0.0001, + "loss": 4.5753, + "loss/crossentropy": 2.330659508705139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2838585078716278, + "step": 4568 + }, + { + "epoch": 0.0914, + "grad_norm": 2.28125, + "grad_norm_var": 0.0132720947265625, + "learning_rate": 0.0001, + "loss": 4.4935, + "loss/crossentropy": 2.167214035987854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24021611362695694, + "step": 4570 + }, + { + "epoch": 0.09144, + "grad_norm": 2.15625, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.5847, + "loss/crossentropy": 1.770102322101593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108836993575096, + "step": 4572 + }, + { + "epoch": 0.09148, + "grad_norm": 2.234375, + "grad_norm_var": 0.015425618489583333, + "learning_rate": 0.0001, + "loss": 4.2906, + "loss/crossentropy": 1.9293717741966248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2186967208981514, + "step": 4574 + }, + { + "epoch": 0.09152, + "grad_norm": 2.484375, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.8128, + "loss/crossentropy": 2.4099135398864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25587645173072815, + "step": 4576 + }, + { + "epoch": 0.09156, + "grad_norm": 2.359375, + "grad_norm_var": 0.06220296223958333, + "learning_rate": 0.0001, + "loss": 4.5727, + "loss/crossentropy": 1.7967870831489563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2521464377641678, + "step": 4578 + }, + { + "epoch": 0.0916, + "grad_norm": 2.46875, + "grad_norm_var": 0.0616851806640625, + "learning_rate": 0.0001, + "loss": 4.7604, + "loss/crossentropy": 1.989583432674408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22892683744430542, + "step": 4580 + }, + { + "epoch": 0.09164, + "grad_norm": 2.625, + "grad_norm_var": 0.0626953125, + "learning_rate": 0.0001, + "loss": 4.4986, + "loss/crossentropy": 2.167198657989502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24552703648805618, + "step": 4582 + }, + { + "epoch": 0.09168, + "grad_norm": 2.453125, + "grad_norm_var": 0.06129150390625, + "learning_rate": 0.0001, + "loss": 4.8237, + "loss/crossentropy": 2.213137984275818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2614079788327217, + "step": 4584 + }, + { + "epoch": 0.09172, + "grad_norm": 2.265625, + "grad_norm_var": 0.06194661458333333, + "learning_rate": 0.0001, + "loss": 4.977, + "loss/crossentropy": 2.3586392998695374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527218610048294, + "step": 4586 + }, + { + "epoch": 0.09176, + "grad_norm": 2.53125, + "grad_norm_var": 0.06054280598958333, + "learning_rate": 0.0001, + "loss": 4.5118, + "loss/crossentropy": 2.326598286628723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2684827446937561, + "step": 4588 + }, + { + "epoch": 0.0918, + "grad_norm": 2.234375, + "grad_norm_var": 0.055562337239583336, + "learning_rate": 0.0001, + "loss": 4.4296, + "loss/crossentropy": 2.1365907192230225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2705482095479965, + "step": 4590 + }, + { + "epoch": 0.09184, + "grad_norm": 2.328125, + "grad_norm_var": 0.056005859375, + "learning_rate": 0.0001, + "loss": 4.6895, + "loss/crossentropy": 1.816661775112152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188858687877655, + "step": 4592 + }, + { + "epoch": 0.09188, + "grad_norm": 2.203125, + "grad_norm_var": 0.020731608072916668, + "learning_rate": 0.0001, + "loss": 4.5953, + "loss/crossentropy": 2.1533923149108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372177392244339, + "step": 4594 + }, + { + "epoch": 0.09192, + "grad_norm": 2.984375, + "grad_norm_var": 0.04504801432291667, + "learning_rate": 0.0001, + "loss": 4.9472, + "loss/crossentropy": 2.2175614833831787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27555912733078003, + "step": 4596 + }, + { + "epoch": 0.09196, + "grad_norm": 2.640625, + "grad_norm_var": 0.04644775390625, + "learning_rate": 0.0001, + "loss": 5.11, + "loss/crossentropy": 2.583792209625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2970864772796631, + "step": 4598 + }, + { + "epoch": 0.092, + "grad_norm": 2.40625, + "grad_norm_var": 0.046751912434895834, + "learning_rate": 0.0001, + "loss": 4.5761, + "loss/crossentropy": 2.227868676185608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259210504591465, + "step": 4600 + }, + { + "epoch": 0.09204, + "grad_norm": 2.265625, + "grad_norm_var": 0.047118123372395834, + "learning_rate": 0.0001, + "loss": 4.4125, + "loss/crossentropy": 2.146941900253296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26443855464458466, + "step": 4602 + }, + { + "epoch": 0.09208, + "grad_norm": 2.296875, + "grad_norm_var": 0.04439697265625, + "learning_rate": 0.0001, + "loss": 4.8655, + "loss/crossentropy": 2.129795551300049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2451881766319275, + "step": 4604 + }, + { + "epoch": 0.09212, + "grad_norm": 2.8125, + "grad_norm_var": 0.0506256103515625, + "learning_rate": 0.0001, + "loss": 4.895, + "loss/crossentropy": 2.2696213722229004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26037923991680145, + "step": 4606 + }, + { + "epoch": 0.09216, + "grad_norm": 3.03125, + "grad_norm_var": 0.07105712890625, + "learning_rate": 0.0001, + "loss": 4.7424, + "loss/crossentropy": 2.1916056871414185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648691013455391, + "step": 4608 + }, + { + "epoch": 0.0922, + "grad_norm": 2.15625, + "grad_norm_var": 0.07119038899739584, + "learning_rate": 0.0001, + "loss": 4.5021, + "loss/crossentropy": 1.975761890411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22248996049165726, + "step": 4610 + }, + { + "epoch": 0.09224, + "grad_norm": 2.265625, + "grad_norm_var": 0.053511555989583334, + "learning_rate": 0.0001, + "loss": 4.6596, + "loss/crossentropy": 2.403375506401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24553030729293823, + "step": 4612 + }, + { + "epoch": 0.09228, + "grad_norm": 2.390625, + "grad_norm_var": 0.049637858072916666, + "learning_rate": 0.0001, + "loss": 4.4542, + "loss/crossentropy": 1.9468475580215454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24023501574993134, + "step": 4614 + }, + { + "epoch": 0.09232, + "grad_norm": 2.40625, + "grad_norm_var": 0.0534088134765625, + "learning_rate": 0.0001, + "loss": 4.8095, + "loss/crossentropy": 2.013557195663452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23829826712608337, + "step": 4616 + }, + { + "epoch": 0.09236, + "grad_norm": 2.265625, + "grad_norm_var": 0.0519439697265625, + "learning_rate": 0.0001, + "loss": 4.5736, + "loss/crossentropy": 2.0552549958229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391991689801216, + "step": 4618 + }, + { + "epoch": 0.0924, + "grad_norm": 2.28125, + "grad_norm_var": 0.053694661458333334, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 1.8311110734939575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23821169883012772, + "step": 4620 + }, + { + "epoch": 0.09244, + "grad_norm": 2.28125, + "grad_norm_var": 0.04431966145833333, + "learning_rate": 0.0001, + "loss": 4.5034, + "loss/crossentropy": 2.0346454977989197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688799142837524, + "step": 4622 + }, + { + "epoch": 0.09248, + "grad_norm": 2.484375, + "grad_norm_var": 0.016304524739583333, + "learning_rate": 0.0001, + "loss": 4.5329, + "loss/crossentropy": 1.8475716710090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23444947600364685, + "step": 4624 + }, + { + "epoch": 0.09252, + "grad_norm": 2.328125, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 2.075626790523529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2663211151957512, + "step": 4626 + }, + { + "epoch": 0.09256, + "grad_norm": 2.46875, + "grad_norm_var": 0.0130035400390625, + "learning_rate": 0.0001, + "loss": 4.9214, + "loss/crossentropy": 2.413028359413147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29516373574733734, + "step": 4628 + }, + { + "epoch": 0.0926, + "grad_norm": 2.53125, + "grad_norm_var": 0.014143880208333333, + "learning_rate": 0.0001, + "loss": 4.6878, + "loss/crossentropy": 1.8549358248710632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23275860399007797, + "step": 4630 + }, + { + "epoch": 0.09264, + "grad_norm": 2.390625, + "grad_norm_var": 0.0076405843098958336, + "learning_rate": 0.0001, + "loss": 4.8594, + "loss/crossentropy": 2.273250460624695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2511187419295311, + "step": 4632 + }, + { + "epoch": 0.09268, + "grad_norm": 2.4375, + "grad_norm_var": 0.013093058268229167, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 1.929758369922638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25462278723716736, + "step": 4634 + }, + { + "epoch": 0.09272, + "grad_norm": 2.296875, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.5888, + "loss/crossentropy": 2.0929598212242126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23592843115329742, + "step": 4636 + }, + { + "epoch": 0.09276, + "grad_norm": 2.21875, + "grad_norm_var": 0.015282185872395833, + "learning_rate": 0.0001, + "loss": 4.4595, + "loss/crossentropy": 1.973824143409729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24049720913171768, + "step": 4638 + }, + { + "epoch": 0.0928, + "grad_norm": 2.25, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 4.2606, + "loss/crossentropy": 1.8606626987457275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698658913373947, + "step": 4640 + }, + { + "epoch": 0.09284, + "grad_norm": 2.5625, + "grad_norm_var": 0.01875, + "learning_rate": 0.0001, + "loss": 4.9827, + "loss/crossentropy": 2.241386890411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2831972986459732, + "step": 4642 + }, + { + "epoch": 0.09288, + "grad_norm": 2.34375, + "grad_norm_var": 0.0166656494140625, + "learning_rate": 0.0001, + "loss": 4.7046, + "loss/crossentropy": 2.2538920640945435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2482161968946457, + "step": 4644 + }, + { + "epoch": 0.09292, + "grad_norm": 2.578125, + "grad_norm_var": 0.01812744140625, + "learning_rate": 0.0001, + "loss": 4.8557, + "loss/crossentropy": 2.067206382751465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26665763556957245, + "step": 4646 + }, + { + "epoch": 0.09296, + "grad_norm": 2.5625, + "grad_norm_var": 0.021751912434895833, + "learning_rate": 0.0001, + "loss": 4.9474, + "loss/crossentropy": 2.33401358127594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2752760946750641, + "step": 4648 + }, + { + "epoch": 0.093, + "grad_norm": 2.4375, + "grad_norm_var": 0.04914449055989583, + "learning_rate": 0.0001, + "loss": 4.8658, + "loss/crossentropy": 2.1510735750198364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24588150531053543, + "step": 4650 + }, + { + "epoch": 0.09304, + "grad_norm": 2.28125, + "grad_norm_var": 0.049397786458333336, + "learning_rate": 0.0001, + "loss": 4.579, + "loss/crossentropy": 2.086448848247528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.261296346783638, + "step": 4652 + }, + { + "epoch": 0.09308, + "grad_norm": 2.5625, + "grad_norm_var": 0.0446685791015625, + "learning_rate": 0.0001, + "loss": 4.7495, + "loss/crossentropy": 1.5951193571090698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312234491109848, + "step": 4654 + }, + { + "epoch": 0.09312, + "grad_norm": 2.328125, + "grad_norm_var": 0.04052734375, + "learning_rate": 0.0001, + "loss": 4.2273, + "loss/crossentropy": 2.0098360776901245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22354383766651154, + "step": 4656 + }, + { + "epoch": 0.09316, + "grad_norm": 2.375, + "grad_norm_var": 0.04079488118489583, + "learning_rate": 0.0001, + "loss": 4.6047, + "loss/crossentropy": 2.263219714164734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26900506019592285, + "step": 4658 + }, + { + "epoch": 0.0932, + "grad_norm": 2.828125, + "grad_norm_var": 0.0485504150390625, + "learning_rate": 0.0001, + "loss": 4.5575, + "loss/crossentropy": 1.9481555819511414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24665354192256927, + "step": 4660 + }, + { + "epoch": 0.09324, + "grad_norm": 2.265625, + "grad_norm_var": 0.04988505045572917, + "learning_rate": 0.0001, + "loss": 4.5895, + "loss/crossentropy": 1.8751367926597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24356309324502945, + "step": 4662 + }, + { + "epoch": 0.09328, + "grad_norm": 2.28125, + "grad_norm_var": 0.06110026041666667, + "learning_rate": 0.0001, + "loss": 4.7739, + "loss/crossentropy": 2.343896746635437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2558425962924957, + "step": 4664 + }, + { + "epoch": 0.09332, + "grad_norm": 2.40625, + "grad_norm_var": 0.032811482747395836, + "learning_rate": 0.0001, + "loss": 4.779, + "loss/crossentropy": 2.067797303199768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25030098110437393, + "step": 4666 + }, + { + "epoch": 0.09336, + "grad_norm": 2.359375, + "grad_norm_var": 0.029816691080729166, + "learning_rate": 0.0001, + "loss": 4.7427, + "loss/crossentropy": 2.4221161603927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26934675872325897, + "step": 4668 + }, + { + "epoch": 0.0934, + "grad_norm": 2.8125, + "grad_norm_var": 0.038182576497395836, + "learning_rate": 0.0001, + "loss": 4.845, + "loss/crossentropy": 2.2733672857284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536270022392273, + "step": 4670 + }, + { + "epoch": 0.09344, + "grad_norm": 2.390625, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 4.7721, + "loss/crossentropy": 2.179157257080078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24323320388793945, + "step": 4672 + }, + { + "epoch": 0.09348, + "grad_norm": 2.578125, + "grad_norm_var": 0.03875325520833333, + "learning_rate": 0.0001, + "loss": 4.8574, + "loss/crossentropy": 2.3613970279693604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567686140537262, + "step": 4674 + }, + { + "epoch": 0.09352, + "grad_norm": 2.5, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 4.787, + "loss/crossentropy": 2.0755810141563416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349473536014557, + "step": 4676 + }, + { + "epoch": 0.09356, + "grad_norm": 2.25, + "grad_norm_var": 0.03134358723958333, + "learning_rate": 0.0001, + "loss": 4.6235, + "loss/crossentropy": 1.9971619248390198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22000454366207123, + "step": 4678 + }, + { + "epoch": 0.0936, + "grad_norm": 2.3125, + "grad_norm_var": 0.020068359375, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 1.9419977068901062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21052303910255432, + "step": 4680 + }, + { + "epoch": 0.09364, + "grad_norm": 2.3125, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 4.6314, + "loss/crossentropy": 2.1175334453582764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23567666858434677, + "step": 4682 + }, + { + "epoch": 0.09368, + "grad_norm": 2.40625, + "grad_norm_var": 0.021122233072916666, + "learning_rate": 0.0001, + "loss": 4.5955, + "loss/crossentropy": 2.0300605297088623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29177258908748627, + "step": 4684 + }, + { + "epoch": 0.09372, + "grad_norm": 2.296875, + "grad_norm_var": 0.009228515625, + "learning_rate": 0.0001, + "loss": 4.5583, + "loss/crossentropy": 2.052473723888397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22480525821447372, + "step": 4686 + }, + { + "epoch": 0.09376, + "grad_norm": 2.359375, + "grad_norm_var": 0.008275349934895834, + "learning_rate": 0.0001, + "loss": 4.4925, + "loss/crossentropy": 2.2506834268569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26876674592494965, + "step": 4688 + }, + { + "epoch": 0.0938, + "grad_norm": 2.125, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.291, + "loss/crossentropy": 1.930393099784851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22246946394443512, + "step": 4690 + }, + { + "epoch": 0.09384, + "grad_norm": 2.5, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.9317, + "loss/crossentropy": 2.5477651357650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27808643877506256, + "step": 4692 + }, + { + "epoch": 0.09388, + "grad_norm": 2.15625, + "grad_norm_var": 0.009033203125, + "learning_rate": 0.0001, + "loss": 4.5313, + "loss/crossentropy": 2.1059221625328064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253792941570282, + "step": 4694 + }, + { + "epoch": 0.09392, + "grad_norm": 2.265625, + "grad_norm_var": 0.0091217041015625, + "learning_rate": 0.0001, + "loss": 4.607, + "loss/crossentropy": 2.0058358907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23942459374666214, + "step": 4696 + }, + { + "epoch": 0.09396, + "grad_norm": 2.25, + "grad_norm_var": 0.0103515625, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 1.9118947982788086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300974577665329, + "step": 4698 + }, + { + "epoch": 0.094, + "grad_norm": 2.5625, + "grad_norm_var": 0.013337198893229167, + "learning_rate": 0.0001, + "loss": 4.6541, + "loss/crossentropy": 2.1172733902931213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24785596132278442, + "step": 4700 + }, + { + "epoch": 0.09404, + "grad_norm": 2.5, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 4.9005, + "loss/crossentropy": 2.0064170956611633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25045372545719147, + "step": 4702 + }, + { + "epoch": 0.09408, + "grad_norm": 2.3125, + "grad_norm_var": 0.015738932291666667, + "learning_rate": 0.0001, + "loss": 4.8139, + "loss/crossentropy": 2.5816656351089478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27938568592071533, + "step": 4704 + }, + { + "epoch": 0.09412, + "grad_norm": 2.25, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 4.866, + "loss/crossentropy": 2.5768171548843384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27975398302078247, + "step": 4706 + }, + { + "epoch": 0.09416, + "grad_norm": 2.234375, + "grad_norm_var": 0.0153717041015625, + "learning_rate": 0.0001, + "loss": 4.6075, + "loss/crossentropy": 2.323893189430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608431279659271, + "step": 4708 + }, + { + "epoch": 0.0942, + "grad_norm": 2.265625, + "grad_norm_var": 0.01357421875, + "learning_rate": 0.0001, + "loss": 4.3564, + "loss/crossentropy": 1.7910810708999634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380099654197693, + "step": 4710 + }, + { + "epoch": 0.09424, + "grad_norm": 2.21875, + "grad_norm_var": 0.015848795572916668, + "learning_rate": 0.0001, + "loss": 4.6374, + "loss/crossentropy": 2.214052677154541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24833911657333374, + "step": 4712 + }, + { + "epoch": 0.09428, + "grad_norm": 2.40625, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 4.9987, + "loss/crossentropy": 2.0850380063056946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419341504573822, + "step": 4714 + }, + { + "epoch": 0.09432, + "grad_norm": 2.359375, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 4.2961, + "loss/crossentropy": 2.0707927346229553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499033510684967, + "step": 4716 + }, + { + "epoch": 0.09436, + "grad_norm": 2.3125, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 4.4057, + "loss/crossentropy": 2.11221444606781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25916673243045807, + "step": 4718 + }, + { + "epoch": 0.0944, + "grad_norm": 2.390625, + "grad_norm_var": 0.012718709309895833, + "learning_rate": 0.0001, + "loss": 4.6057, + "loss/crossentropy": 1.8400230407714844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771906107664108, + "step": 4720 + }, + { + "epoch": 0.09444, + "grad_norm": 2.28125, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 4.4929, + "loss/crossentropy": 2.2271865606307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2574647441506386, + "step": 4722 + }, + { + "epoch": 0.09448, + "grad_norm": 2.203125, + "grad_norm_var": 0.008649698893229167, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 2.4532920122146606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27140843868255615, + "step": 4724 + }, + { + "epoch": 0.09452, + "grad_norm": 2.375, + "grad_norm_var": 0.009504191080729167, + "learning_rate": 0.0001, + "loss": 4.731, + "loss/crossentropy": 1.951303780078888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333502620458603, + "step": 4726 + }, + { + "epoch": 0.09456, + "grad_norm": 2.40625, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 4.6334, + "loss/crossentropy": 2.209821343421936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24776015430688858, + "step": 4728 + }, + { + "epoch": 0.0946, + "grad_norm": 2.3125, + "grad_norm_var": 0.010872395833333333, + "learning_rate": 0.0001, + "loss": 4.861, + "loss/crossentropy": 2.434941530227661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2747645229101181, + "step": 4730 + }, + { + "epoch": 0.09464, + "grad_norm": 2.1875, + "grad_norm_var": 0.010602823893229167, + "learning_rate": 0.0001, + "loss": 4.6145, + "loss/crossentropy": 2.051652252674103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24515582621097565, + "step": 4732 + }, + { + "epoch": 0.09468, + "grad_norm": 2.265625, + "grad_norm_var": 0.009797159830729167, + "learning_rate": 0.0001, + "loss": 4.6804, + "loss/crossentropy": 2.0039377212524414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23073314130306244, + "step": 4734 + }, + { + "epoch": 0.09472, + "grad_norm": 2.4375, + "grad_norm_var": 0.19903971354166666, + "learning_rate": 0.0001, + "loss": 4.8333, + "loss/crossentropy": 2.166410982608795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25871724635362625, + "step": 4736 + }, + { + "epoch": 0.09476, + "grad_norm": 2.3125, + "grad_norm_var": 0.2066802978515625, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 1.9289153218269348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183659464120865, + "step": 4738 + }, + { + "epoch": 0.0948, + "grad_norm": 2.46875, + "grad_norm_var": 0.20640360514322917, + "learning_rate": 0.0001, + "loss": 4.5903, + "loss/crossentropy": 2.270771861076355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26335832476615906, + "step": 4740 + }, + { + "epoch": 0.09484, + "grad_norm": 2.4375, + "grad_norm_var": 0.206494140625, + "learning_rate": 0.0001, + "loss": 4.7142, + "loss/crossentropy": 2.395651936531067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28527122735977173, + "step": 4742 + }, + { + "epoch": 0.09488, + "grad_norm": 2.390625, + "grad_norm_var": 0.20437723795572918, + "learning_rate": 0.0001, + "loss": 4.5083, + "loss/crossentropy": 1.8597867488861084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224149189889431, + "step": 4744 + }, + { + "epoch": 0.09492, + "grad_norm": 2.265625, + "grad_norm_var": 0.20693257649739583, + "learning_rate": 0.0001, + "loss": 4.4415, + "loss/crossentropy": 1.7795116305351257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23233074694871902, + "step": 4746 + }, + { + "epoch": 0.09496, + "grad_norm": 2.4375, + "grad_norm_var": 0.20255533854166666, + "learning_rate": 0.0001, + "loss": 4.7749, + "loss/crossentropy": 1.9449282884597778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524118423461914, + "step": 4748 + }, + { + "epoch": 0.095, + "grad_norm": 2.53125, + "grad_norm_var": 0.2005035400390625, + "learning_rate": 0.0001, + "loss": 4.7047, + "loss/crossentropy": 2.195169448852539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349853590130806, + "step": 4750 + }, + { + "epoch": 0.09504, + "grad_norm": 2.671875, + "grad_norm_var": 0.022468058268229167, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 1.6628928184509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975274682044983, + "step": 4752 + }, + { + "epoch": 0.09508, + "grad_norm": 2.25, + "grad_norm_var": 0.018488566080729168, + "learning_rate": 0.0001, + "loss": 4.5862, + "loss/crossentropy": 2.0991236567497253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341374844312668, + "step": 4754 + }, + { + "epoch": 0.09512, + "grad_norm": 2.40625, + "grad_norm_var": 0.017024739583333334, + "learning_rate": 0.0001, + "loss": 4.8338, + "loss/crossentropy": 2.350286066532135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2648291736841202, + "step": 4756 + }, + { + "epoch": 0.09516, + "grad_norm": 2.375, + "grad_norm_var": 0.017430623372395832, + "learning_rate": 0.0001, + "loss": 4.5788, + "loss/crossentropy": 2.02384877204895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24653150886297226, + "step": 4758 + }, + { + "epoch": 0.0952, + "grad_norm": 2.203125, + "grad_norm_var": 0.01734619140625, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.264349341392517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25881223380565643, + "step": 4760 + }, + { + "epoch": 0.09524, + "grad_norm": 2.890625, + "grad_norm_var": 0.03779296875, + "learning_rate": 0.0001, + "loss": 4.8216, + "loss/crossentropy": 2.197356939315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565019279718399, + "step": 4762 + }, + { + "epoch": 0.09528, + "grad_norm": 2.515625, + "grad_norm_var": 0.038655598958333336, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 1.8794787526130676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21771979331970215, + "step": 4764 + }, + { + "epoch": 0.09532, + "grad_norm": 2.3125, + "grad_norm_var": 0.03795572916666667, + "learning_rate": 0.0001, + "loss": 4.7391, + "loss/crossentropy": 2.107520580291748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350333333015442, + "step": 4766 + }, + { + "epoch": 0.09536, + "grad_norm": 2.3125, + "grad_norm_var": 0.03297119140625, + "learning_rate": 0.0001, + "loss": 4.5194, + "loss/crossentropy": 2.082249402999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24860269576311111, + "step": 4768 + }, + { + "epoch": 0.0954, + "grad_norm": 2.296875, + "grad_norm_var": 0.03062744140625, + "learning_rate": 0.0001, + "loss": 4.3985, + "loss/crossentropy": 1.9632289409637451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24099770188331604, + "step": 4770 + }, + { + "epoch": 0.09544, + "grad_norm": 2.359375, + "grad_norm_var": 0.02994384765625, + "learning_rate": 0.0001, + "loss": 4.4526, + "loss/crossentropy": 2.3403327465057373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2573448717594147, + "step": 4772 + }, + { + "epoch": 0.09548, + "grad_norm": 2.40625, + "grad_norm_var": 0.03181050618489583, + "learning_rate": 0.0001, + "loss": 4.7711, + "loss/crossentropy": 2.302455425262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28050975501537323, + "step": 4774 + }, + { + "epoch": 0.09552, + "grad_norm": 2.28125, + "grad_norm_var": 0.031103515625, + "learning_rate": 0.0001, + "loss": 4.6092, + "loss/crossentropy": 2.2875213623046875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23910009860992432, + "step": 4776 + }, + { + "epoch": 0.09556, + "grad_norm": 2.765625, + "grad_norm_var": 0.022721354166666666, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.0440263748168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640783667564392, + "step": 4778 + }, + { + "epoch": 0.0956, + "grad_norm": 2.53125, + "grad_norm_var": 0.02613525390625, + "learning_rate": 0.0001, + "loss": 4.6328, + "loss/crossentropy": 2.1597142219543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24566050618886948, + "step": 4780 + }, + { + "epoch": 0.09564, + "grad_norm": 2.203125, + "grad_norm_var": 0.027765909830729168, + "learning_rate": 0.0001, + "loss": 4.7138, + "loss/crossentropy": 2.2846235036849976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598261833190918, + "step": 4782 + }, + { + "epoch": 0.09568, + "grad_norm": 2.1875, + "grad_norm_var": 0.029683430989583332, + "learning_rate": 0.0001, + "loss": 4.4527, + "loss/crossentropy": 2.032243251800537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488306611776352, + "step": 4784 + }, + { + "epoch": 0.09572, + "grad_norm": 2.5, + "grad_norm_var": 0.037230428059895834, + "learning_rate": 0.0001, + "loss": 4.7651, + "loss/crossentropy": 2.4999172687530518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25949685275554657, + "step": 4786 + }, + { + "epoch": 0.09576, + "grad_norm": 2.296875, + "grad_norm_var": 0.0375152587890625, + "learning_rate": 0.0001, + "loss": 4.6068, + "loss/crossentropy": 2.1610575914382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2414923906326294, + "step": 4788 + }, + { + "epoch": 0.0958, + "grad_norm": 2.390625, + "grad_norm_var": 0.03135477701822917, + "learning_rate": 0.0001, + "loss": 4.5309, + "loss/crossentropy": 1.8679919838905334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21853071451187134, + "step": 4790 + }, + { + "epoch": 0.09584, + "grad_norm": 2.390625, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 4.697, + "loss/crossentropy": 2.072615623474121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24943216145038605, + "step": 4792 + }, + { + "epoch": 0.09588, + "grad_norm": 2.765625, + "grad_norm_var": 0.032624308268229166, + "learning_rate": 0.0001, + "loss": 4.5555, + "loss/crossentropy": 2.420115113258362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2840816229581833, + "step": 4794 + }, + { + "epoch": 0.09592, + "grad_norm": 2.6875, + "grad_norm_var": 0.0455078125, + "learning_rate": 0.0001, + "loss": 5.102, + "loss/crossentropy": 2.2809360027313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2739071249961853, + "step": 4796 + }, + { + "epoch": 0.09596, + "grad_norm": 2.328125, + "grad_norm_var": 0.04319559733072917, + "learning_rate": 0.0001, + "loss": 4.5418, + "loss/crossentropy": 2.1809465289115906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24680403620004654, + "step": 4798 + }, + { + "epoch": 0.096, + "grad_norm": 2.15625, + "grad_norm_var": 0.044611612955729164, + "learning_rate": 0.0001, + "loss": 4.6994, + "loss/crossentropy": 1.9710316061973572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598233222961426, + "step": 4800 + }, + { + "epoch": 0.09604, + "grad_norm": 2.40625, + "grad_norm_var": 0.035477701822916666, + "learning_rate": 0.0001, + "loss": 4.9811, + "loss/crossentropy": 2.4378572702407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2665309011936188, + "step": 4802 + }, + { + "epoch": 0.09608, + "grad_norm": 2.390625, + "grad_norm_var": 0.03759358723958333, + "learning_rate": 0.0001, + "loss": 4.604, + "loss/crossentropy": 1.8578800559043884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24051420390605927, + "step": 4804 + }, + { + "epoch": 0.09612, + "grad_norm": 2.40625, + "grad_norm_var": 0.034520467122395836, + "learning_rate": 0.0001, + "loss": 4.4567, + "loss/crossentropy": 2.191601037979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25017624348402023, + "step": 4806 + }, + { + "epoch": 0.09616, + "grad_norm": 2.296875, + "grad_norm_var": 0.038374837239583334, + "learning_rate": 0.0001, + "loss": 4.5777, + "loss/crossentropy": 2.2077550888061523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31426818668842316, + "step": 4808 + }, + { + "epoch": 0.0962, + "grad_norm": 2.390625, + "grad_norm_var": 0.030659993489583332, + "learning_rate": 0.0001, + "loss": 4.917, + "loss/crossentropy": 2.2983503341674805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26998060941696167, + "step": 4810 + }, + { + "epoch": 0.09624, + "grad_norm": 2.703125, + "grad_norm_var": 0.020824178059895834, + "learning_rate": 0.0001, + "loss": 4.8691, + "loss/crossentropy": 2.0875505208969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2673826217651367, + "step": 4812 + }, + { + "epoch": 0.09628, + "grad_norm": 2.03125, + "grad_norm_var": 0.031403605143229166, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 1.9439318776130676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2379719465970993, + "step": 4814 + }, + { + "epoch": 0.09632, + "grad_norm": 2.359375, + "grad_norm_var": 0.03611551920572917, + "learning_rate": 0.0001, + "loss": 4.4698, + "loss/crossentropy": 1.9822518229484558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24425340443849564, + "step": 4816 + }, + { + "epoch": 0.09636, + "grad_norm": 2.296875, + "grad_norm_var": 0.03902994791666667, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 2.0238336324691772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409110590815544, + "step": 4818 + }, + { + "epoch": 0.0964, + "grad_norm": 2.34375, + "grad_norm_var": 0.03707275390625, + "learning_rate": 0.0001, + "loss": 4.5401, + "loss/crossentropy": 2.454450249671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26127950847148895, + "step": 4820 + }, + { + "epoch": 0.09644, + "grad_norm": 2.296875, + "grad_norm_var": 0.038407389322916666, + "learning_rate": 0.0001, + "loss": 4.5682, + "loss/crossentropy": 2.210579752922058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23662539571523666, + "step": 4822 + }, + { + "epoch": 0.09648, + "grad_norm": 2.28125, + "grad_norm_var": 0.034601847330729164, + "learning_rate": 0.0001, + "loss": 4.5527, + "loss/crossentropy": 1.8737664222717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884630411863327, + "step": 4824 + }, + { + "epoch": 0.09652, + "grad_norm": 2.484375, + "grad_norm_var": 0.0335357666015625, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 2.1939562559127808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25145241618156433, + "step": 4826 + }, + { + "epoch": 0.09656, + "grad_norm": 2.328125, + "grad_norm_var": 0.0278472900390625, + "learning_rate": 0.0001, + "loss": 4.6685, + "loss/crossentropy": 2.133625030517578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25248992443084717, + "step": 4828 + }, + { + "epoch": 0.0966, + "grad_norm": 2.328125, + "grad_norm_var": 0.0197906494140625, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.259738326072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24727293848991394, + "step": 4830 + }, + { + "epoch": 0.09664, + "grad_norm": 2.984375, + "grad_norm_var": 0.043635050455729164, + "learning_rate": 0.0001, + "loss": 4.7497, + "loss/crossentropy": 2.0186268091201782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27940231561660767, + "step": 4832 + }, + { + "epoch": 0.09668, + "grad_norm": 2.234375, + "grad_norm_var": 0.04185282389322917, + "learning_rate": 0.0001, + "loss": 4.5867, + "loss/crossentropy": 1.9686395525932312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23021705448627472, + "step": 4834 + }, + { + "epoch": 0.09672, + "grad_norm": 2.296875, + "grad_norm_var": 0.0424224853515625, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.1677820682525635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2766892910003662, + "step": 4836 + }, + { + "epoch": 0.09676, + "grad_norm": 2.1875, + "grad_norm_var": 0.04670308430989583, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 1.8458788990974426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23294126987457275, + "step": 4838 + }, + { + "epoch": 0.0968, + "grad_norm": 2.640625, + "grad_norm_var": 0.0495269775390625, + "learning_rate": 0.0001, + "loss": 4.8931, + "loss/crossentropy": 1.7898097038269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23436476290225983, + "step": 4840 + }, + { + "epoch": 0.09684, + "grad_norm": 2.375, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 0.0001, + "loss": 4.5169, + "loss/crossentropy": 2.4594497680664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618473023176193, + "step": 4842 + }, + { + "epoch": 0.09688, + "grad_norm": 2.4375, + "grad_norm_var": 0.04752197265625, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.061558425426483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523345798254013, + "step": 4844 + }, + { + "epoch": 0.09692, + "grad_norm": 2.484375, + "grad_norm_var": 0.04462890625, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.060658574104309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2493506520986557, + "step": 4846 + }, + { + "epoch": 0.09696, + "grad_norm": 2.25, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.8028, + "loss/crossentropy": 2.201029062271118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2629493921995163, + "step": 4848 + }, + { + "epoch": 0.097, + "grad_norm": 2.34375, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 4.7404, + "loss/crossentropy": 2.199273705482483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27054519951343536, + "step": 4850 + }, + { + "epoch": 0.09704, + "grad_norm": 2.109375, + "grad_norm_var": 0.0189453125, + "learning_rate": 0.0001, + "loss": 4.3008, + "loss/crossentropy": 2.2466784715652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2678111642599106, + "step": 4852 + }, + { + "epoch": 0.09708, + "grad_norm": 2.71875, + "grad_norm_var": 0.022054036458333332, + "learning_rate": 0.0001, + "loss": 4.6762, + "loss/crossentropy": 1.9152815341949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441672906279564, + "step": 4854 + }, + { + "epoch": 0.09712, + "grad_norm": 2.21875, + "grad_norm_var": 0.017577107747395834, + "learning_rate": 0.0001, + "loss": 4.3818, + "loss/crossentropy": 2.092438578605652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24061349034309387, + "step": 4856 + }, + { + "epoch": 0.09716, + "grad_norm": 2.4375, + "grad_norm_var": 0.01802978515625, + "learning_rate": 0.0001, + "loss": 4.3224, + "loss/crossentropy": 1.8620384335517883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23738765716552734, + "step": 4858 + }, + { + "epoch": 0.0972, + "grad_norm": 2.953125, + "grad_norm_var": 0.04138997395833333, + "learning_rate": 0.0001, + "loss": 4.8798, + "loss/crossentropy": 2.199701428413391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26521213352680206, + "step": 4860 + }, + { + "epoch": 0.09724, + "grad_norm": 2.53125, + "grad_norm_var": 0.042170206705729164, + "learning_rate": 0.0001, + "loss": 4.7598, + "loss/crossentropy": 2.3637821674346924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2707534506917, + "step": 4862 + }, + { + "epoch": 0.09728, + "grad_norm": 2.21875, + "grad_norm_var": 0.045491536458333336, + "learning_rate": 0.0001, + "loss": 4.5184, + "loss/crossentropy": 2.207235813140869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28080131858587265, + "step": 4864 + }, + { + "epoch": 0.09732, + "grad_norm": 2.265625, + "grad_norm_var": 0.0466461181640625, + "learning_rate": 0.0001, + "loss": 4.3798, + "loss/crossentropy": 1.901595950126648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23459318280220032, + "step": 4866 + }, + { + "epoch": 0.09736, + "grad_norm": 2.3125, + "grad_norm_var": 0.042769368489583334, + "learning_rate": 0.0001, + "loss": 4.4289, + "loss/crossentropy": 2.298312544822693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580094337463379, + "step": 4868 + }, + { + "epoch": 0.0974, + "grad_norm": 2.3125, + "grad_norm_var": 0.03658854166666667, + "learning_rate": 0.0001, + "loss": 4.7576, + "loss/crossentropy": 2.29680597782135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24749226868152618, + "step": 4870 + }, + { + "epoch": 0.09744, + "grad_norm": 2.25, + "grad_norm_var": 0.0361968994140625, + "learning_rate": 0.0001, + "loss": 4.5839, + "loss/crossentropy": 2.1169378757476807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565944790840149, + "step": 4872 + }, + { + "epoch": 0.09748, + "grad_norm": 2.140625, + "grad_norm_var": 0.04035542805989583, + "learning_rate": 0.0001, + "loss": 4.5364, + "loss/crossentropy": 2.0863184928894043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527791038155556, + "step": 4874 + }, + { + "epoch": 0.09752, + "grad_norm": 2.296875, + "grad_norm_var": 0.015625, + "learning_rate": 0.0001, + "loss": 4.6176, + "loss/crossentropy": 2.146193563938141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23968011140823364, + "step": 4876 + }, + { + "epoch": 0.09756, + "grad_norm": 2.375, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.1125508546829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2498009204864502, + "step": 4878 + }, + { + "epoch": 0.0976, + "grad_norm": 2.375, + "grad_norm_var": 0.004130045572916667, + "learning_rate": 0.0001, + "loss": 4.772, + "loss/crossentropy": 2.13166081905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640175372362137, + "step": 4880 + }, + { + "epoch": 0.09764, + "grad_norm": 2.15625, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.5276, + "loss/crossentropy": 2.047860622406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980721086263657, + "step": 4882 + }, + { + "epoch": 0.09768, + "grad_norm": 2.515625, + "grad_norm_var": 0.010477701822916666, + "learning_rate": 0.0001, + "loss": 4.8519, + "loss/crossentropy": 2.362569808959961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27908293902873993, + "step": 4884 + }, + { + "epoch": 0.09772, + "grad_norm": 2.109375, + "grad_norm_var": 0.012398274739583333, + "learning_rate": 0.0001, + "loss": 4.2709, + "loss/crossentropy": 1.8781500458717346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341010719537735, + "step": 4886 + }, + { + "epoch": 0.09776, + "grad_norm": 2.390625, + "grad_norm_var": 0.019807942708333335, + "learning_rate": 0.0001, + "loss": 4.9408, + "loss/crossentropy": 2.1138893365859985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533891350030899, + "step": 4888 + }, + { + "epoch": 0.0978, + "grad_norm": 2.484375, + "grad_norm_var": 0.0188140869140625, + "learning_rate": 0.0001, + "loss": 4.5981, + "loss/crossentropy": 2.2212090492248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.279159352183342, + "step": 4890 + }, + { + "epoch": 0.09784, + "grad_norm": 2.453125, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.7245, + "loss/crossentropy": 1.9747707843780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24078013002872467, + "step": 4892 + }, + { + "epoch": 0.09788, + "grad_norm": 2.53125, + "grad_norm_var": 0.020015462239583334, + "learning_rate": 0.0001, + "loss": 4.4051, + "loss/crossentropy": 1.8607316613197327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25342857837677, + "step": 4894 + }, + { + "epoch": 0.09792, + "grad_norm": 2.359375, + "grad_norm_var": 0.020963541666666665, + "learning_rate": 0.0001, + "loss": 4.6759, + "loss/crossentropy": 2.245271682739258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24260863661766052, + "step": 4896 + }, + { + "epoch": 0.09796, + "grad_norm": 2.390625, + "grad_norm_var": 0.017560831705729165, + "learning_rate": 0.0001, + "loss": 4.5006, + "loss/crossentropy": 2.0503702759742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25429168343544006, + "step": 4898 + }, + { + "epoch": 0.098, + "grad_norm": 2.3125, + "grad_norm_var": 0.017236328125, + "learning_rate": 0.0001, + "loss": 4.421, + "loss/crossentropy": 1.7784077525138855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2249627709388733, + "step": 4900 + }, + { + "epoch": 0.09804, + "grad_norm": 2.390625, + "grad_norm_var": 0.0114898681640625, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 1.9827336072921753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001181542873383, + "step": 4902 + }, + { + "epoch": 0.09808, + "grad_norm": 2.53125, + "grad_norm_var": 1.638996378580729, + "learning_rate": 0.0001, + "loss": 4.8149, + "loss/crossentropy": 2.1003565788269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2550586014986038, + "step": 4904 + }, + { + "epoch": 0.09812, + "grad_norm": 2.484375, + "grad_norm_var": 1.6366933186848958, + "learning_rate": 0.0001, + "loss": 4.5099, + "loss/crossentropy": 1.9565055966377258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25481177121400833, + "step": 4906 + }, + { + "epoch": 0.09816, + "grad_norm": 2.234375, + "grad_norm_var": 1.6479777018229167, + "learning_rate": 0.0001, + "loss": 4.6214, + "loss/crossentropy": 2.1693456172943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23336851596832275, + "step": 4908 + }, + { + "epoch": 0.0982, + "grad_norm": 2.40625, + "grad_norm_var": 1.6428995768229167, + "learning_rate": 0.0001, + "loss": 4.7637, + "loss/crossentropy": 2.050541341304779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25821831077337265, + "step": 4910 + }, + { + "epoch": 0.09824, + "grad_norm": 2.40625, + "grad_norm_var": 1.62906494140625, + "learning_rate": 0.0001, + "loss": 4.8515, + "loss/crossentropy": 2.168497681617737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674206495285034, + "step": 4912 + }, + { + "epoch": 0.09828, + "grad_norm": 2.609375, + "grad_norm_var": 1.6097320556640624, + "learning_rate": 0.0001, + "loss": 5.1667, + "loss/crossentropy": 2.147577404975891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26900771260261536, + "step": 4914 + }, + { + "epoch": 0.09832, + "grad_norm": 2.09375, + "grad_norm_var": 1.6371734619140625, + "learning_rate": 0.0001, + "loss": 4.3958, + "loss/crossentropy": 2.4436198472976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2504816800355911, + "step": 4916 + }, + { + "epoch": 0.09836, + "grad_norm": 2.25, + "grad_norm_var": 1.6447265625, + "learning_rate": 0.0001, + "loss": 4.7244, + "loss/crossentropy": 2.097359538078308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24005448818206787, + "step": 4918 + }, + { + "epoch": 0.0984, + "grad_norm": 2.078125, + "grad_norm_var": 0.0267974853515625, + "learning_rate": 0.0001, + "loss": 4.2305, + "loss/crossentropy": 1.8467384576797485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22932368516921997, + "step": 4920 + }, + { + "epoch": 0.09844, + "grad_norm": 2.859375, + "grad_norm_var": 0.04263916015625, + "learning_rate": 0.0001, + "loss": 4.4407, + "loss/crossentropy": 2.1454135179519653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2634875178337097, + "step": 4922 + }, + { + "epoch": 0.09848, + "grad_norm": 2.765625, + "grad_norm_var": 0.05164286295572917, + "learning_rate": 0.0001, + "loss": 4.5775, + "loss/crossentropy": 1.9837967157363892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980768024921417, + "step": 4924 + }, + { + "epoch": 0.09852, + "grad_norm": 2.265625, + "grad_norm_var": 0.054032389322916666, + "learning_rate": 0.0001, + "loss": 4.6888, + "loss/crossentropy": 2.099667489528656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23482084274291992, + "step": 4926 + }, + { + "epoch": 0.09856, + "grad_norm": 3.125, + "grad_norm_var": 0.08787333170572917, + "learning_rate": 0.0001, + "loss": 4.5685, + "loss/crossentropy": 1.8467332124710083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.256004735827446, + "step": 4928 + }, + { + "epoch": 0.0986, + "grad_norm": 2.53125, + "grad_norm_var": 0.08548075358072917, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 2.036003887653351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2377806007862091, + "step": 4930 + }, + { + "epoch": 0.09864, + "grad_norm": 2.484375, + "grad_norm_var": 0.07834370930989583, + "learning_rate": 0.0001, + "loss": 4.8258, + "loss/crossentropy": 2.091457724571228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24746537953615189, + "step": 4932 + }, + { + "epoch": 0.09868, + "grad_norm": 2.296875, + "grad_norm_var": 0.07929280598958334, + "learning_rate": 0.0001, + "loss": 4.6583, + "loss/crossentropy": 2.245227336883545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24141517281532288, + "step": 4934 + }, + { + "epoch": 0.09872, + "grad_norm": 2.40625, + "grad_norm_var": 0.0661285400390625, + "learning_rate": 0.0001, + "loss": 4.6514, + "loss/crossentropy": 2.295682668685913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28035247325897217, + "step": 4936 + }, + { + "epoch": 0.09876, + "grad_norm": 2.234375, + "grad_norm_var": 0.05524800618489583, + "learning_rate": 0.0001, + "loss": 4.5525, + "loss/crossentropy": 1.9251704812049866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23609354346990585, + "step": 4938 + }, + { + "epoch": 0.0988, + "grad_norm": 2.5, + "grad_norm_var": 0.048314412434895836, + "learning_rate": 0.0001, + "loss": 5.1583, + "loss/crossentropy": 2.289652466773987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30698196589946747, + "step": 4940 + }, + { + "epoch": 0.09884, + "grad_norm": 2.09375, + "grad_norm_var": 0.0534820556640625, + "learning_rate": 0.0001, + "loss": 4.7759, + "loss/crossentropy": 2.3339043855667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2449272722005844, + "step": 4942 + }, + { + "epoch": 0.09888, + "grad_norm": 2.203125, + "grad_norm_var": 0.01949462890625, + "learning_rate": 0.0001, + "loss": 4.6678, + "loss/crossentropy": 2.0610195994377136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553517669439316, + "step": 4944 + }, + { + "epoch": 0.09892, + "grad_norm": 2.203125, + "grad_norm_var": 0.018122355143229168, + "learning_rate": 0.0001, + "loss": 4.5849, + "loss/crossentropy": 2.1093825101852417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24767974764108658, + "step": 4946 + }, + { + "epoch": 0.09896, + "grad_norm": 2.328125, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.3631, + "loss/crossentropy": 2.1742878556251526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640457332134247, + "step": 4948 + }, + { + "epoch": 0.099, + "grad_norm": 2.390625, + "grad_norm_var": 0.0144195556640625, + "learning_rate": 0.0001, + "loss": 4.8319, + "loss/crossentropy": 2.4237486124038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2610916793346405, + "step": 4950 + }, + { + "epoch": 0.09904, + "grad_norm": 2.15625, + "grad_norm_var": 0.015445963541666666, + "learning_rate": 0.0001, + "loss": 4.3397, + "loss/crossentropy": 2.197754144668579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275979220867157, + "step": 4952 + }, + { + "epoch": 0.09908, + "grad_norm": 2.21875, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 4.4533, + "loss/crossentropy": 2.267225503921509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23391032963991165, + "step": 4954 + }, + { + "epoch": 0.09912, + "grad_norm": 2.265625, + "grad_norm_var": 0.009504191080729167, + "learning_rate": 0.0001, + "loss": 4.47, + "loss/crossentropy": 2.04893159866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23763196170330048, + "step": 4956 + }, + { + "epoch": 0.09916, + "grad_norm": 2.4375, + "grad_norm_var": 0.010716756184895834, + "learning_rate": 0.0001, + "loss": 4.97, + "loss/crossentropy": 2.4489223957061768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27698180079460144, + "step": 4958 + }, + { + "epoch": 0.0992, + "grad_norm": 2.328125, + "grad_norm_var": 0.011930338541666667, + "learning_rate": 0.0001, + "loss": 4.8604, + "loss/crossentropy": 2.3654375076293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495984137058258, + "step": 4960 + }, + { + "epoch": 0.09924, + "grad_norm": 2.328125, + "grad_norm_var": 0.011812337239583333, + "learning_rate": 0.0001, + "loss": 4.7671, + "loss/crossentropy": 1.8583308458328247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21832667291164398, + "step": 4962 + }, + { + "epoch": 0.09928, + "grad_norm": 2.265625, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 4.4951, + "loss/crossentropy": 2.0762425661087036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280900850892067, + "step": 4964 + }, + { + "epoch": 0.09932, + "grad_norm": 2.15625, + "grad_norm_var": 0.012532552083333334, + "learning_rate": 0.0001, + "loss": 4.7461, + "loss/crossentropy": 2.228640913963318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26595622301101685, + "step": 4966 + }, + { + "epoch": 0.09936, + "grad_norm": 2.234375, + "grad_norm_var": 0.0111236572265625, + "learning_rate": 0.0001, + "loss": 4.521, + "loss/crossentropy": 2.1447466611862183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542327791452408, + "step": 4968 + }, + { + "epoch": 0.0994, + "grad_norm": 2.09375, + "grad_norm_var": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 4.3959, + "loss/crossentropy": 2.0750887989997864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362435683608055, + "step": 4970 + }, + { + "epoch": 0.09944, + "grad_norm": 2.328125, + "grad_norm_var": 0.009666951497395833, + "learning_rate": 0.0001, + "loss": 4.6705, + "loss/crossentropy": 1.9413353204727173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24418477714061737, + "step": 4972 + }, + { + "epoch": 0.09948, + "grad_norm": 2.265625, + "grad_norm_var": 0.00904541015625, + "learning_rate": 0.0001, + "loss": 4.609, + "loss/crossentropy": 2.102766752243042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24170882254838943, + "step": 4974 + }, + { + "epoch": 0.09952, + "grad_norm": 2.359375, + "grad_norm_var": 0.0090240478515625, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.431061267852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855495512485504, + "step": 4976 + }, + { + "epoch": 0.09956, + "grad_norm": 2.453125, + "grad_norm_var": 0.010448201497395834, + "learning_rate": 0.0001, + "loss": 4.7449, + "loss/crossentropy": 1.9656312465667725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26410341262817383, + "step": 4978 + }, + { + "epoch": 0.0996, + "grad_norm": 2.578125, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 5.0286, + "loss/crossentropy": 2.2365923523902893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27946531772613525, + "step": 4980 + }, + { + "epoch": 0.09964, + "grad_norm": 2.4375, + "grad_norm_var": 0.015184529622395833, + "learning_rate": 0.0001, + "loss": 4.5791, + "loss/crossentropy": 2.203595757484436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25247688591480255, + "step": 4982 + }, + { + "epoch": 0.09968, + "grad_norm": 2.296875, + "grad_norm_var": 0.0163482666015625, + "learning_rate": 0.0001, + "loss": 4.6824, + "loss/crossentropy": 2.1462446451187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2539386674761772, + "step": 4984 + }, + { + "epoch": 0.09972, + "grad_norm": 2.25, + "grad_norm_var": 0.014127604166666667, + "learning_rate": 0.0001, + "loss": 4.7121, + "loss/crossentropy": 2.4518587589263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24366125464439392, + "step": 4986 + }, + { + "epoch": 0.09976, + "grad_norm": 2.25, + "grad_norm_var": 0.01402587890625, + "learning_rate": 0.0001, + "loss": 4.5131, + "loss/crossentropy": 2.003869950771332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22874485701322556, + "step": 4988 + }, + { + "epoch": 0.0998, + "grad_norm": 2.328125, + "grad_norm_var": 0.017476399739583332, + "learning_rate": 0.0001, + "loss": 4.3221, + "loss/crossentropy": 2.0251912474632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240670546889305, + "step": 4990 + }, + { + "epoch": 0.09984, + "grad_norm": 2.140625, + "grad_norm_var": 0.020166015625, + "learning_rate": 0.0001, + "loss": 4.5297, + "loss/crossentropy": 2.199779748916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916750609874725, + "step": 4992 + }, + { + "epoch": 0.09988, + "grad_norm": 2.40625, + "grad_norm_var": 0.0207672119140625, + "learning_rate": 0.0001, + "loss": 4.8066, + "loss/crossentropy": 2.3852288722991943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26489999890327454, + "step": 4994 + }, + { + "epoch": 0.09992, + "grad_norm": 2.359375, + "grad_norm_var": 0.03874409993489583, + "learning_rate": 0.0001, + "loss": 4.7245, + "loss/crossentropy": 1.9446094632148743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24112944304943085, + "step": 4996 + }, + { + "epoch": 0.09996, + "grad_norm": 2.234375, + "grad_norm_var": 0.04269205729166667, + "learning_rate": 0.0001, + "loss": 4.7063, + "loss/crossentropy": 2.585115671157837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26535044610500336, + "step": 4998 + }, + { + "epoch": 0.1, + "grad_norm": 2.8125, + "grad_norm_var": 1.14400634765625, + "learning_rate": 0.0001, + "loss": 4.6848, + "loss/crossentropy": 1.9871427416801453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24747569859027863, + "step": 5000 + }, + { + "epoch": 0.10004, + "grad_norm": 2.359375, + "grad_norm_var": 1.1424967447916667, + "learning_rate": 0.0001, + "loss": 4.6058, + "loss/crossentropy": 1.8981972336769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24131165444850922, + "step": 5002 + }, + { + "epoch": 0.10008, + "grad_norm": 2.28125, + "grad_norm_var": 1.1522786458333334, + "learning_rate": 0.0001, + "loss": 4.5731, + "loss/crossentropy": 2.323825240135193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2584778293967247, + "step": 5004 + }, + { + "epoch": 0.10012, + "grad_norm": 2.390625, + "grad_norm_var": 1.1363118489583333, + "learning_rate": 0.0001, + "loss": 4.7065, + "loss/crossentropy": 1.728028118610382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2997850477695465, + "step": 5006 + }, + { + "epoch": 0.10016, + "grad_norm": 2.28125, + "grad_norm_var": 1.12437744140625, + "learning_rate": 0.0001, + "loss": 4.8001, + "loss/crossentropy": 2.1486289501190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25335805118083954, + "step": 5008 + }, + { + "epoch": 0.1002, + "grad_norm": 2.5, + "grad_norm_var": 1.1099761962890624, + "learning_rate": 0.0001, + "loss": 4.936, + "loss/crossentropy": 2.3132145404815674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26033517718315125, + "step": 5010 + }, + { + "epoch": 0.10024, + "grad_norm": 2.671875, + "grad_norm_var": 1.1134928385416667, + "learning_rate": 0.0001, + "loss": 4.493, + "loss/crossentropy": 1.9233656525611877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471245899796486, + "step": 5012 + }, + { + "epoch": 0.10028, + "grad_norm": 2.21875, + "grad_norm_var": 1.1301747639973958, + "learning_rate": 0.0001, + "loss": 4.5221, + "loss/crossentropy": 1.9435511827468872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24730819463729858, + "step": 5014 + }, + { + "epoch": 0.10032, + "grad_norm": 2.1875, + "grad_norm_var": 0.018928019205729167, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.2031294107437134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23213820159435272, + "step": 5016 + }, + { + "epoch": 0.10036, + "grad_norm": 2.703125, + "grad_norm_var": 0.0276763916015625, + "learning_rate": 0.0001, + "loss": 4.7247, + "loss/crossentropy": 2.285850405693054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25809091329574585, + "step": 5018 + }, + { + "epoch": 0.1004, + "grad_norm": 2.328125, + "grad_norm_var": 0.025211588541666666, + "learning_rate": 0.0001, + "loss": 4.7697, + "loss/crossentropy": 1.8660435676574707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2436714917421341, + "step": 5020 + }, + { + "epoch": 0.10044, + "grad_norm": 2.421875, + "grad_norm_var": 0.025877888997395834, + "learning_rate": 0.0001, + "loss": 4.3716, + "loss/crossentropy": 2.0659420490264893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24248096346855164, + "step": 5022 + }, + { + "epoch": 0.10048, + "grad_norm": 2.328125, + "grad_norm_var": 0.030516560872395834, + "learning_rate": 0.0001, + "loss": 4.7093, + "loss/crossentropy": 2.213133215904236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2423061951994896, + "step": 5024 + }, + { + "epoch": 0.10052, + "grad_norm": 2.234375, + "grad_norm_var": 0.0313385009765625, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.3052343130111694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24757324904203415, + "step": 5026 + }, + { + "epoch": 0.10056, + "grad_norm": 2.796875, + "grad_norm_var": 0.039094034830729166, + "learning_rate": 0.0001, + "loss": 4.8709, + "loss/crossentropy": 2.226990580558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25892695784568787, + "step": 5028 + }, + { + "epoch": 0.1006, + "grad_norm": 2.71875, + "grad_norm_var": 0.04163004557291667, + "learning_rate": 0.0001, + "loss": 4.9444, + "loss/crossentropy": 2.3460742235183716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27760128676891327, + "step": 5030 + }, + { + "epoch": 0.10064, + "grad_norm": 2.125, + "grad_norm_var": 0.04345703125, + "learning_rate": 0.0001, + "loss": 4.3597, + "loss/crossentropy": 1.9782095551490784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349972277879715, + "step": 5032 + }, + { + "epoch": 0.10068, + "grad_norm": 2.96875, + "grad_norm_var": 0.06148681640625, + "learning_rate": 0.0001, + "loss": 4.5602, + "loss/crossentropy": 1.847929298877716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22401423752307892, + "step": 5034 + }, + { + "epoch": 0.10072, + "grad_norm": 2.46875, + "grad_norm_var": 0.06073811848958333, + "learning_rate": 0.0001, + "loss": 4.4989, + "loss/crossentropy": 2.172071158885956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2656550332903862, + "step": 5036 + }, + { + "epoch": 0.10076, + "grad_norm": 2.234375, + "grad_norm_var": 0.07026265462239584, + "learning_rate": 0.0001, + "loss": 4.3892, + "loss/crossentropy": 2.3497499227523804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26085225492715836, + "step": 5038 + }, + { + "epoch": 0.1008, + "grad_norm": 2.328125, + "grad_norm_var": 0.07284749348958333, + "learning_rate": 0.0001, + "loss": 4.2583, + "loss/crossentropy": 2.0916348695755005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129868924617767, + "step": 5040 + }, + { + "epoch": 0.10084, + "grad_norm": 2.59375, + "grad_norm_var": 0.07073160807291666, + "learning_rate": 0.0001, + "loss": 4.8931, + "loss/crossentropy": 2.243077278137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2657035142183304, + "step": 5042 + }, + { + "epoch": 0.10088, + "grad_norm": 10.0, + "grad_norm_var": 3.600194295247396, + "learning_rate": 0.0001, + "loss": 4.8887, + "loss/crossentropy": 1.9361066222190857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.41255413740873337, + "step": 5044 + }, + { + "epoch": 0.10092, + "grad_norm": 3.703125, + "grad_norm_var": 3.6162760416666666, + "learning_rate": 0.0001, + "loss": 4.7632, + "loss/crossentropy": 2.0139313340187073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2591235190629959, + "step": 5046 + }, + { + "epoch": 0.10096, + "grad_norm": 2.203125, + "grad_norm_var": 3.589704386393229, + "learning_rate": 0.0001, + "loss": 4.6642, + "loss/crossentropy": 2.335710287094116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26457205414772034, + "step": 5048 + }, + { + "epoch": 0.101, + "grad_norm": 2.34375, + "grad_norm_var": 3.604325358072917, + "learning_rate": 0.0001, + "loss": 4.3329, + "loss/crossentropy": 1.9269848465919495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23867928236722946, + "step": 5050 + }, + { + "epoch": 0.10104, + "grad_norm": 2.34375, + "grad_norm_var": 3.627415974934896, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 1.9732608795166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24961821734905243, + "step": 5052 + }, + { + "epoch": 0.10108, + "grad_norm": 2.359375, + "grad_norm_var": 3.6287506103515623, + "learning_rate": 0.0001, + "loss": 4.7218, + "loss/crossentropy": 2.2659696340560913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24580512940883636, + "step": 5054 + }, + { + "epoch": 0.10112, + "grad_norm": 2.84375, + "grad_norm_var": 3.57880859375, + "learning_rate": 0.0001, + "loss": 4.4318, + "loss/crossentropy": 1.6600720882415771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20283473283052444, + "step": 5056 + }, + { + "epoch": 0.10116, + "grad_norm": 2.375, + "grad_norm_var": 3.602855428059896, + "learning_rate": 0.0001, + "loss": 4.8249, + "loss/crossentropy": 2.0175185799598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23329314589500427, + "step": 5058 + }, + { + "epoch": 0.1012, + "grad_norm": 2.4375, + "grad_norm_var": 0.146484375, + "learning_rate": 0.0001, + "loss": 4.5132, + "loss/crossentropy": 2.259668231010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692077234387398, + "step": 5060 + }, + { + "epoch": 0.10124, + "grad_norm": 2.46875, + "grad_norm_var": 0.02144775390625, + "learning_rate": 0.0001, + "loss": 4.6392, + "loss/crossentropy": 2.260953903198242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28225430846214294, + "step": 5062 + }, + { + "epoch": 0.10128, + "grad_norm": 2.171875, + "grad_norm_var": 0.021686808268229166, + "learning_rate": 0.0001, + "loss": 4.3624, + "loss/crossentropy": 1.9721892476081848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2336483597755432, + "step": 5064 + }, + { + "epoch": 0.10132, + "grad_norm": 2.40625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.6257, + "loss/crossentropy": 2.188947319984436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23959602415561676, + "step": 5066 + }, + { + "epoch": 0.10136, + "grad_norm": 3.078125, + "grad_norm_var": 0.04843343098958333, + "learning_rate": 0.0001, + "loss": 4.5601, + "loss/crossentropy": 1.7914190292358398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2234276980161667, + "step": 5068 + }, + { + "epoch": 0.1014, + "grad_norm": 2.265625, + "grad_norm_var": 0.049925740559895834, + "learning_rate": 0.0001, + "loss": 4.5806, + "loss/crossentropy": 2.1124663949012756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22523616254329681, + "step": 5070 + }, + { + "epoch": 0.10144, + "grad_norm": 2.359375, + "grad_norm_var": 0.03764546712239583, + "learning_rate": 0.0001, + "loss": 4.7098, + "loss/crossentropy": 2.1146361231803894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483345866203308, + "step": 5072 + }, + { + "epoch": 0.10148, + "grad_norm": 2.171875, + "grad_norm_var": 0.04121805826822917, + "learning_rate": 0.0001, + "loss": 4.5076, + "loss/crossentropy": 2.335755705833435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.256914846599102, + "step": 5074 + }, + { + "epoch": 0.10152, + "grad_norm": 2.34375, + "grad_norm_var": 0.04346415201822917, + "learning_rate": 0.0001, + "loss": 4.8665, + "loss/crossentropy": 2.2911819219589233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26038119196891785, + "step": 5076 + }, + { + "epoch": 0.10156, + "grad_norm": 2.171875, + "grad_norm_var": 0.047587076822916664, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.059769034385681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004006385803223, + "step": 5078 + }, + { + "epoch": 0.1016, + "grad_norm": 2.109375, + "grad_norm_var": 0.04888916015625, + "learning_rate": 0.0001, + "loss": 4.5028, + "loss/crossentropy": 2.141201138496399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454539686441422, + "step": 5080 + }, + { + "epoch": 0.10164, + "grad_norm": 2.421875, + "grad_norm_var": 0.050146484375, + "learning_rate": 0.0001, + "loss": 4.7527, + "loss/crossentropy": 1.9538633823394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24477297067642212, + "step": 5082 + }, + { + "epoch": 0.10168, + "grad_norm": 2.1875, + "grad_norm_var": 0.00992431640625, + "learning_rate": 0.0001, + "loss": 4.4799, + "loss/crossentropy": 2.1555078625679016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2425938919186592, + "step": 5084 + }, + { + "epoch": 0.10172, + "grad_norm": 2.296875, + "grad_norm_var": 0.009235636393229166, + "learning_rate": 0.0001, + "loss": 4.5583, + "loss/crossentropy": 2.2306214570999146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445632517337799, + "step": 5086 + }, + { + "epoch": 0.10176, + "grad_norm": 2.03125, + "grad_norm_var": 0.010179646809895833, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 1.9713392853736877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22239256650209427, + "step": 5088 + }, + { + "epoch": 0.1018, + "grad_norm": 2.234375, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 4.5181, + "loss/crossentropy": 1.951128602027893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24310563504695892, + "step": 5090 + }, + { + "epoch": 0.10184, + "grad_norm": 2.28125, + "grad_norm_var": 0.011051432291666666, + "learning_rate": 0.0001, + "loss": 4.3617, + "loss/crossentropy": 2.0100057125091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23054375499486923, + "step": 5092 + }, + { + "epoch": 0.10188, + "grad_norm": 2.140625, + "grad_norm_var": 0.011042277018229166, + "learning_rate": 0.0001, + "loss": 4.4573, + "loss/crossentropy": 2.1898789405822754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24956130981445312, + "step": 5094 + }, + { + "epoch": 0.10192, + "grad_norm": 2.21875, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.6576, + "loss/crossentropy": 1.5666239857673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20010025054216385, + "step": 5096 + }, + { + "epoch": 0.10196, + "grad_norm": 2.296875, + "grad_norm_var": 0.022468058268229167, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.884951651096344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2484818547964096, + "step": 5098 + }, + { + "epoch": 0.102, + "grad_norm": 2.34375, + "grad_norm_var": 0.022554524739583335, + "learning_rate": 0.0001, + "loss": 4.5207, + "loss/crossentropy": 1.976080298423767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23380043357610703, + "step": 5100 + }, + { + "epoch": 0.10204, + "grad_norm": 2.375, + "grad_norm_var": 0.0229156494140625, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.1262658834457397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441401332616806, + "step": 5102 + }, + { + "epoch": 0.10208, + "grad_norm": 2.515625, + "grad_norm_var": 0.08055013020833333, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 1.7588757276535034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101762592792511, + "step": 5104 + }, + { + "epoch": 0.10212, + "grad_norm": 2.578125, + "grad_norm_var": 0.08059488932291667, + "learning_rate": 0.0001, + "loss": 4.6721, + "loss/crossentropy": 2.292783260345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26928654313087463, + "step": 5106 + }, + { + "epoch": 0.10216, + "grad_norm": 2.65625, + "grad_norm_var": 0.4834218343098958, + "learning_rate": 0.0001, + "loss": 4.7475, + "loss/crossentropy": 2.0670888423919678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2644767463207245, + "step": 5108 + }, + { + "epoch": 0.1022, + "grad_norm": 2.1875, + "grad_norm_var": 0.4729563395182292, + "learning_rate": 0.0001, + "loss": 4.4113, + "loss/crossentropy": 2.0522598028182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2410094290971756, + "step": 5110 + }, + { + "epoch": 0.10224, + "grad_norm": 2.359375, + "grad_norm_var": 0.4729075113932292, + "learning_rate": 0.0001, + "loss": 4.4914, + "loss/crossentropy": 2.0756974816322327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27077721059322357, + "step": 5112 + }, + { + "epoch": 0.10228, + "grad_norm": 2.3125, + "grad_norm_var": 0.47226155598958336, + "learning_rate": 0.0001, + "loss": 4.6524, + "loss/crossentropy": 2.1569767594337463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23582034558057785, + "step": 5114 + }, + { + "epoch": 0.10232, + "grad_norm": 2.21875, + "grad_norm_var": 0.4847819010416667, + "learning_rate": 0.0001, + "loss": 4.2821, + "loss/crossentropy": 1.9736077785491943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984025418758392, + "step": 5116 + }, + { + "epoch": 0.10236, + "grad_norm": 2.3125, + "grad_norm_var": 0.4942698160807292, + "learning_rate": 0.0001, + "loss": 4.3047, + "loss/crossentropy": 2.1400066614151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2347380667924881, + "step": 5118 + }, + { + "epoch": 0.1024, + "grad_norm": 2.171875, + "grad_norm_var": 0.46923726399739585, + "learning_rate": 0.0001, + "loss": 4.3531, + "loss/crossentropy": 1.989999234676361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22508147358894348, + "step": 5120 + }, + { + "epoch": 0.10244, + "grad_norm": 2.5, + "grad_norm_var": 0.4704498291015625, + "learning_rate": 0.0001, + "loss": 4.8817, + "loss/crossentropy": 2.132390856742859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3074522316455841, + "step": 5122 + }, + { + "epoch": 0.10248, + "grad_norm": 2.296875, + "grad_norm_var": 0.011002604166666667, + "learning_rate": 0.0001, + "loss": 4.3606, + "loss/crossentropy": 1.7906856536865234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126675397157669, + "step": 5124 + }, + { + "epoch": 0.10252, + "grad_norm": 2.265625, + "grad_norm_var": 0.0112457275390625, + "learning_rate": 0.0001, + "loss": 4.7045, + "loss/crossentropy": 2.0576369762420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27211636304855347, + "step": 5126 + }, + { + "epoch": 0.10256, + "grad_norm": 2.265625, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.5402, + "loss/crossentropy": 2.1174184679985046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2344469577074051, + "step": 5128 + }, + { + "epoch": 0.1026, + "grad_norm": 2.28125, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.7227, + "loss/crossentropy": 1.9139717817306519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362786829471588, + "step": 5130 + }, + { + "epoch": 0.10264, + "grad_norm": 2.3125, + "grad_norm_var": 0.008495076497395834, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.022357940673828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307513877749443, + "step": 5132 + }, + { + "epoch": 0.10268, + "grad_norm": 2.203125, + "grad_norm_var": 0.007721964518229167, + "learning_rate": 0.0001, + "loss": 4.3963, + "loss/crossentropy": 2.038477897644043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22871223092079163, + "step": 5134 + }, + { + "epoch": 0.10272, + "grad_norm": 2.421875, + "grad_norm_var": 0.007079060872395833, + "learning_rate": 0.0001, + "loss": 4.7283, + "loss/crossentropy": 2.0895442962646484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23719681799411774, + "step": 5136 + }, + { + "epoch": 0.10276, + "grad_norm": 2.5625, + "grad_norm_var": 0.008820597330729167, + "learning_rate": 0.0001, + "loss": 4.7059, + "loss/crossentropy": 2.17978835105896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519204765558243, + "step": 5138 + }, + { + "epoch": 0.1028, + "grad_norm": 2.46875, + "grad_norm_var": 0.00953369140625, + "learning_rate": 0.0001, + "loss": 4.7318, + "loss/crossentropy": 2.19089937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24376338720321655, + "step": 5140 + }, + { + "epoch": 0.10284, + "grad_norm": 2.515625, + "grad_norm_var": 0.011002604166666667, + "learning_rate": 0.0001, + "loss": 4.3961, + "loss/crossentropy": 2.018259823322296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23707401752471924, + "step": 5142 + }, + { + "epoch": 0.10288, + "grad_norm": 2.484375, + "grad_norm_var": 0.012287394205729166, + "learning_rate": 0.0001, + "loss": 4.7556, + "loss/crossentropy": 2.110401153564453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409561812877655, + "step": 5144 + }, + { + "epoch": 0.10292, + "grad_norm": 2.390625, + "grad_norm_var": 0.012239583333333333, + "learning_rate": 0.0001, + "loss": 4.615, + "loss/crossentropy": 2.2096832990646362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2580249160528183, + "step": 5146 + }, + { + "epoch": 0.10296, + "grad_norm": 2.09375, + "grad_norm_var": 0.015803019205729168, + "learning_rate": 0.0001, + "loss": 4.5218, + "loss/crossentropy": 2.1825822591781616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23592937737703323, + "step": 5148 + }, + { + "epoch": 0.103, + "grad_norm": 2.359375, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 2.2440234422683716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741164803504944, + "step": 5150 + }, + { + "epoch": 0.10304, + "grad_norm": 2.4375, + "grad_norm_var": 0.015071614583333334, + "learning_rate": 0.0001, + "loss": 4.962, + "loss/crossentropy": 2.3818799257278442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642097622156143, + "step": 5152 + }, + { + "epoch": 0.10308, + "grad_norm": 2.296875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.5706, + "loss/crossentropy": 2.1905806064605713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26945509016513824, + "step": 5154 + }, + { + "epoch": 0.10312, + "grad_norm": 2.140625, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 4.5678, + "loss/crossentropy": 2.096913695335388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2540033459663391, + "step": 5156 + }, + { + "epoch": 0.10316, + "grad_norm": 2.375, + "grad_norm_var": 0.015721638997395832, + "learning_rate": 0.0001, + "loss": 4.5286, + "loss/crossentropy": 1.7916489243507385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22504088282585144, + "step": 5158 + }, + { + "epoch": 0.1032, + "grad_norm": 2.34375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.6256, + "loss/crossentropy": 1.9366079568862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25474467873573303, + "step": 5160 + }, + { + "epoch": 0.10324, + "grad_norm": 2.40625, + "grad_norm_var": 0.014533487955729167, + "learning_rate": 0.0001, + "loss": 4.5054, + "loss/crossentropy": 2.0233771800994873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394355177879333, + "step": 5162 + }, + { + "epoch": 0.10328, + "grad_norm": 2.234375, + "grad_norm_var": 0.010693359375, + "learning_rate": 0.0001, + "loss": 4.7803, + "loss/crossentropy": 2.442312717437744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28390438854694366, + "step": 5164 + }, + { + "epoch": 0.10332, + "grad_norm": 2.3125, + "grad_norm_var": 0.014435831705729167, + "learning_rate": 0.0001, + "loss": 4.7849, + "loss/crossentropy": 1.9547526836395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298036813735962, + "step": 5166 + }, + { + "epoch": 0.10336, + "grad_norm": 2.25, + "grad_norm_var": 0.018766276041666665, + "learning_rate": 0.0001, + "loss": 4.3225, + "loss/crossentropy": 1.7974739074707031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959447860717773, + "step": 5168 + }, + { + "epoch": 0.1034, + "grad_norm": 2.328125, + "grad_norm_var": 0.020075480143229168, + "learning_rate": 0.0001, + "loss": 4.5438, + "loss/crossentropy": 1.9860564470291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24661653488874435, + "step": 5170 + }, + { + "epoch": 0.10344, + "grad_norm": 2.34375, + "grad_norm_var": 0.019050089518229167, + "learning_rate": 0.0001, + "loss": 4.5084, + "loss/crossentropy": 1.6198940873146057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20592768490314484, + "step": 5172 + }, + { + "epoch": 0.10348, + "grad_norm": 2.25, + "grad_norm_var": 0.019554646809895833, + "learning_rate": 0.0001, + "loss": 4.4831, + "loss/crossentropy": 2.193474531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24437790364027023, + "step": 5174 + }, + { + "epoch": 0.10352, + "grad_norm": 2.5, + "grad_norm_var": 0.022557576497395832, + "learning_rate": 0.0001, + "loss": 4.9209, + "loss/crossentropy": 2.221992254257202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2931511402130127, + "step": 5176 + }, + { + "epoch": 0.10356, + "grad_norm": 2.28125, + "grad_norm_var": 0.022272745768229168, + "learning_rate": 0.0001, + "loss": 4.4611, + "loss/crossentropy": 2.006419837474823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335921749472618, + "step": 5178 + }, + { + "epoch": 0.1036, + "grad_norm": 2.171875, + "grad_norm_var": 0.022175089518229166, + "learning_rate": 0.0001, + "loss": 4.4477, + "loss/crossentropy": 2.2861804962158203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2549041658639908, + "step": 5180 + }, + { + "epoch": 0.10364, + "grad_norm": 2.171875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.217998743057251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24741299450397491, + "step": 5182 + }, + { + "epoch": 0.10368, + "grad_norm": 2.328125, + "grad_norm_var": 0.0123931884765625, + "learning_rate": 0.0001, + "loss": 4.7344, + "loss/crossentropy": 2.1875526905059814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23636415600776672, + "step": 5184 + }, + { + "epoch": 0.10372, + "grad_norm": 2.078125, + "grad_norm_var": 0.014850870768229166, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 1.8408135175704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278112709522247, + "step": 5186 + }, + { + "epoch": 0.10376, + "grad_norm": 2.359375, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.6848, + "loss/crossentropy": 1.7936646342277527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23859456181526184, + "step": 5188 + }, + { + "epoch": 0.1038, + "grad_norm": 2.21875, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 2.0800318717956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155340552330017, + "step": 5190 + }, + { + "epoch": 0.10384, + "grad_norm": 2.265625, + "grad_norm_var": 0.0066802978515625, + "learning_rate": 0.0001, + "loss": 4.9645, + "loss/crossentropy": 2.277778387069702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25029345601797104, + "step": 5192 + }, + { + "epoch": 0.10388, + "grad_norm": 2.09375, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.2486, + "loss/crossentropy": 1.9658478498458862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22655482590198517, + "step": 5194 + }, + { + "epoch": 0.10392, + "grad_norm": 2.21875, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.7503, + "loss/crossentropy": 2.214509129524231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.254493810236454, + "step": 5196 + }, + { + "epoch": 0.10396, + "grad_norm": 2.21875, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 1.9465742707252502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23288530111312866, + "step": 5198 + }, + { + "epoch": 0.104, + "grad_norm": 2.21875, + "grad_norm_var": 0.010277303059895833, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.062779188156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21423730999231339, + "step": 5200 + }, + { + "epoch": 0.10404, + "grad_norm": 2.390625, + "grad_norm_var": 0.010350545247395834, + "learning_rate": 0.0001, + "loss": 4.1645, + "loss/crossentropy": 1.777470588684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22998665273189545, + "step": 5202 + }, + { + "epoch": 0.10408, + "grad_norm": 2.46875, + "grad_norm_var": 0.05396728515625, + "learning_rate": 0.0001, + "loss": 4.8979, + "loss/crossentropy": 2.2505980730056763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30352361500263214, + "step": 5204 + }, + { + "epoch": 0.10412, + "grad_norm": 3.359375, + "grad_norm_var": 0.12148335774739584, + "learning_rate": 0.0001, + "loss": 4.4093, + "loss/crossentropy": 1.8989517092704773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210940569639206, + "step": 5206 + }, + { + "epoch": 0.10416, + "grad_norm": 2.734375, + "grad_norm_var": 0.12923177083333334, + "learning_rate": 0.0001, + "loss": 4.5535, + "loss/crossentropy": 2.378798723220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25575730204582214, + "step": 5208 + }, + { + "epoch": 0.1042, + "grad_norm": 2.34375, + "grad_norm_var": 0.11901041666666666, + "learning_rate": 0.0001, + "loss": 4.6763, + "loss/crossentropy": 1.7642006278038025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105304896831512, + "step": 5210 + }, + { + "epoch": 0.10424, + "grad_norm": 2.375, + "grad_norm_var": 0.11607666015625, + "learning_rate": 0.0001, + "loss": 4.8945, + "loss/crossentropy": 2.188746988773346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24192160367965698, + "step": 5212 + }, + { + "epoch": 0.10428, + "grad_norm": 2.3125, + "grad_norm_var": 0.11298421223958334, + "learning_rate": 0.0001, + "loss": 4.6001, + "loss/crossentropy": 2.116630494594574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2879187613725662, + "step": 5214 + }, + { + "epoch": 0.10432, + "grad_norm": 2.15625, + "grad_norm_var": 0.11013895670572917, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 1.8329599499702454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21425092220306396, + "step": 5216 + }, + { + "epoch": 0.10436, + "grad_norm": 2.296875, + "grad_norm_var": 0.1064849853515625, + "learning_rate": 0.0001, + "loss": 4.3966, + "loss/crossentropy": 2.1063259840011597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23704984784126282, + "step": 5218 + }, + { + "epoch": 0.1044, + "grad_norm": 2.25, + "grad_norm_var": 0.08238525390625, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 1.7994996309280396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21408653259277344, + "step": 5220 + }, + { + "epoch": 0.10444, + "grad_norm": 2.65625, + "grad_norm_var": 0.0281158447265625, + "learning_rate": 0.0001, + "loss": 4.2437, + "loss/crossentropy": 1.880006492137909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362730979919434, + "step": 5222 + }, + { + "epoch": 0.10448, + "grad_norm": 2.25, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.2726, + "loss/crossentropy": 2.010268449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22580985724925995, + "step": 5224 + }, + { + "epoch": 0.10452, + "grad_norm": 2.3125, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.6839, + "loss/crossentropy": 2.057162046432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23878254741430283, + "step": 5226 + }, + { + "epoch": 0.10456, + "grad_norm": 2.21875, + "grad_norm_var": 0.019603474934895834, + "learning_rate": 0.0001, + "loss": 4.3774, + "loss/crossentropy": 2.0615930557250977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22890077531337738, + "step": 5228 + }, + { + "epoch": 0.1046, + "grad_norm": 2.40625, + "grad_norm_var": 0.020817057291666666, + "learning_rate": 0.0001, + "loss": 4.7693, + "loss/crossentropy": 1.9013472199440002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22539222240447998, + "step": 5230 + }, + { + "epoch": 0.10464, + "grad_norm": 2.296875, + "grad_norm_var": 0.019482421875, + "learning_rate": 0.0001, + "loss": 4.5531, + "loss/crossentropy": 2.239185929298401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24706681817770004, + "step": 5232 + }, + { + "epoch": 0.10468, + "grad_norm": 2.234375, + "grad_norm_var": 0.020540364583333335, + "learning_rate": 0.0001, + "loss": 4.3902, + "loss/crossentropy": 1.9881523251533508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24434638023376465, + "step": 5234 + }, + { + "epoch": 0.10472, + "grad_norm": 2.3125, + "grad_norm_var": 0.020524088541666666, + "learning_rate": 0.0001, + "loss": 4.7908, + "loss/crossentropy": 1.9529212713241577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2321469634771347, + "step": 5236 + }, + { + "epoch": 0.10476, + "grad_norm": 2.40625, + "grad_norm_var": 0.01041259765625, + "learning_rate": 0.0001, + "loss": 4.8045, + "loss/crossentropy": 2.196424722671509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24054434895515442, + "step": 5238 + }, + { + "epoch": 0.1048, + "grad_norm": 2.234375, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 4.5895, + "loss/crossentropy": 2.082987070083618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760557308793068, + "step": 5240 + }, + { + "epoch": 0.10484, + "grad_norm": 2.359375, + "grad_norm_var": 0.011652628580729166, + "learning_rate": 0.0001, + "loss": 4.5961, + "loss/crossentropy": 2.2369720935821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25305214524269104, + "step": 5242 + }, + { + "epoch": 0.10488, + "grad_norm": 2.1875, + "grad_norm_var": 0.0119537353515625, + "learning_rate": 0.0001, + "loss": 4.7268, + "loss/crossentropy": 2.3372031450271606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26765232533216476, + "step": 5244 + }, + { + "epoch": 0.10492, + "grad_norm": 2.3125, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.23944628238678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26508912444114685, + "step": 5246 + }, + { + "epoch": 0.10496, + "grad_norm": 2.21875, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.3453, + "loss/crossentropy": 2.2701858282089233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551077604293823, + "step": 5248 + }, + { + "epoch": 0.105, + "grad_norm": 2.234375, + "grad_norm_var": 0.010887654622395833, + "learning_rate": 0.0001, + "loss": 4.6455, + "loss/crossentropy": 2.293464779853821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25016437470912933, + "step": 5250 + }, + { + "epoch": 0.10504, + "grad_norm": 2.46875, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 4.5798, + "loss/crossentropy": 2.3072171211242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27824972569942474, + "step": 5252 + }, + { + "epoch": 0.10508, + "grad_norm": 2.453125, + "grad_norm_var": 0.010498046875, + "learning_rate": 0.0001, + "loss": 4.5948, + "loss/crossentropy": 1.9855756759643555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312464788556099, + "step": 5254 + }, + { + "epoch": 0.10512, + "grad_norm": 2.3125, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 4.8104, + "loss/crossentropy": 1.9584077596664429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2386016771197319, + "step": 5256 + }, + { + "epoch": 0.10516, + "grad_norm": 2.265625, + "grad_norm_var": 0.010741170247395833, + "learning_rate": 0.0001, + "loss": 4.6103, + "loss/crossentropy": 2.2184669375419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25914129614830017, + "step": 5258 + }, + { + "epoch": 0.1052, + "grad_norm": 2.1875, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 1.6798554062843323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037271112203598, + "step": 5260 + }, + { + "epoch": 0.10524, + "grad_norm": 2.203125, + "grad_norm_var": 0.011995442708333333, + "learning_rate": 0.0001, + "loss": 4.2718, + "loss/crossentropy": 1.8675006031990051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21148693561553955, + "step": 5262 + }, + { + "epoch": 0.10528, + "grad_norm": 2.390625, + "grad_norm_var": 0.012336222330729167, + "learning_rate": 0.0001, + "loss": 4.5136, + "loss/crossentropy": 1.9439855813980103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24611759185791016, + "step": 5264 + }, + { + "epoch": 0.10532, + "grad_norm": 2.171875, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.5298, + "loss/crossentropy": 2.1550748348236084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428945228457451, + "step": 5266 + }, + { + "epoch": 0.10536, + "grad_norm": 2.34375, + "grad_norm_var": 0.0117584228515625, + "learning_rate": 0.0001, + "loss": 4.6777, + "loss/crossentropy": 1.9924054741859436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866007149219513, + "step": 5268 + }, + { + "epoch": 0.1054, + "grad_norm": 2.40625, + "grad_norm_var": 0.011571248372395834, + "learning_rate": 0.0001, + "loss": 4.5359, + "loss/crossentropy": 2.0413920879364014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2579498365521431, + "step": 5270 + }, + { + "epoch": 0.10544, + "grad_norm": 2.578125, + "grad_norm_var": 0.016380818684895833, + "learning_rate": 0.0001, + "loss": 4.6406, + "loss/crossentropy": 2.062632381916046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24655035883188248, + "step": 5272 + }, + { + "epoch": 0.10548, + "grad_norm": 2.421875, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 4.8133, + "loss/crossentropy": 2.0620386600494385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3041655272245407, + "step": 5274 + }, + { + "epoch": 0.10552, + "grad_norm": 2.234375, + "grad_norm_var": 0.016657511393229168, + "learning_rate": 0.0001, + "loss": 4.4775, + "loss/crossentropy": 1.966421365737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333864122629166, + "step": 5276 + }, + { + "epoch": 0.10556, + "grad_norm": 2.25, + "grad_norm_var": 0.017650349934895834, + "learning_rate": 0.0001, + "loss": 4.4323, + "loss/crossentropy": 1.9120238423347473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22857370972633362, + "step": 5278 + }, + { + "epoch": 0.1056, + "grad_norm": 2.40625, + "grad_norm_var": 0.019701131184895835, + "learning_rate": 0.0001, + "loss": 4.5179, + "loss/crossentropy": 2.084389805793762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2556762397289276, + "step": 5280 + }, + { + "epoch": 0.10564, + "grad_norm": 2.328125, + "grad_norm_var": 0.019758097330729165, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 1.8707188367843628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22955116629600525, + "step": 5282 + }, + { + "epoch": 0.10568, + "grad_norm": 2.265625, + "grad_norm_var": 0.019775390625, + "learning_rate": 0.0001, + "loss": 4.3538, + "loss/crossentropy": 1.8243692517280579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088172495365143, + "step": 5284 + }, + { + "epoch": 0.10572, + "grad_norm": 2.359375, + "grad_norm_var": 0.0192535400390625, + "learning_rate": 0.0001, + "loss": 4.5046, + "loss/crossentropy": 2.111305356025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595943957567215, + "step": 5286 + }, + { + "epoch": 0.10576, + "grad_norm": 2.15625, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 4.4598, + "loss/crossentropy": 2.3729283809661865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2632211595773697, + "step": 5288 + }, + { + "epoch": 0.1058, + "grad_norm": 2.140625, + "grad_norm_var": 0.007249959309895833, + "learning_rate": 0.0001, + "loss": 4.6256, + "loss/crossentropy": 2.3542696237564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25645585358142853, + "step": 5290 + }, + { + "epoch": 0.10584, + "grad_norm": 2.4375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.4863, + "loss/crossentropy": 2.0140068531036377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23033145815134048, + "step": 5292 + }, + { + "epoch": 0.10588, + "grad_norm": 2.375, + "grad_norm_var": 0.0099761962890625, + "learning_rate": 0.0001, + "loss": 4.8126, + "loss/crossentropy": 1.9499077796936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21458172798156738, + "step": 5294 + }, + { + "epoch": 0.10592, + "grad_norm": 2.25, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.1434344053268433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552379444241524, + "step": 5296 + }, + { + "epoch": 0.10596, + "grad_norm": 2.359375, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 4.7149, + "loss/crossentropy": 1.9951340556144714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21683495491743088, + "step": 5298 + }, + { + "epoch": 0.106, + "grad_norm": 2.5625, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 4.7855, + "loss/crossentropy": 2.380235195159912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2756577730178833, + "step": 5300 + }, + { + "epoch": 0.10604, + "grad_norm": 2.3125, + "grad_norm_var": 0.02330322265625, + "learning_rate": 0.0001, + "loss": 4.8596, + "loss/crossentropy": 2.298241972923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2598598450422287, + "step": 5302 + }, + { + "epoch": 0.10608, + "grad_norm": 2.5, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.8615, + "loss/crossentropy": 2.093233823776245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809552192687988, + "step": 5304 + }, + { + "epoch": 0.10612, + "grad_norm": 2.15625, + "grad_norm_var": 0.031493123372395834, + "learning_rate": 0.0001, + "loss": 4.5271, + "loss/crossentropy": 2.0901564955711365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24874016642570496, + "step": 5306 + }, + { + "epoch": 0.10616, + "grad_norm": 2.453125, + "grad_norm_var": 0.0304107666015625, + "learning_rate": 0.0001, + "loss": 4.3193, + "loss/crossentropy": 1.8029736280441284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20973137766122818, + "step": 5308 + }, + { + "epoch": 0.1062, + "grad_norm": 2.25, + "grad_norm_var": 0.03316650390625, + "learning_rate": 0.0001, + "loss": 4.4255, + "loss/crossentropy": 2.4068437814712524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25310443341732025, + "step": 5310 + }, + { + "epoch": 0.10624, + "grad_norm": 2.265625, + "grad_norm_var": 0.031029256184895833, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.1125503182411194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2455870360136032, + "step": 5312 + }, + { + "epoch": 0.10628, + "grad_norm": 2.234375, + "grad_norm_var": 0.0323150634765625, + "learning_rate": 0.0001, + "loss": 4.4752, + "loss/crossentropy": 2.0995737314224243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418016716837883, + "step": 5314 + }, + { + "epoch": 0.10632, + "grad_norm": 2.234375, + "grad_norm_var": 0.0212890625, + "learning_rate": 0.0001, + "loss": 4.5873, + "loss/crossentropy": 1.8753212690353394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24125799536705017, + "step": 5316 + }, + { + "epoch": 0.10636, + "grad_norm": 2.390625, + "grad_norm_var": 0.021451822916666665, + "learning_rate": 0.0001, + "loss": 4.7269, + "loss/crossentropy": 2.0175408720970154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23516137897968292, + "step": 5318 + }, + { + "epoch": 0.1064, + "grad_norm": 2.109375, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 4.4953, + "loss/crossentropy": 2.2671592235565186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356845736503601, + "step": 5320 + }, + { + "epoch": 0.10644, + "grad_norm": 2.140625, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.5328, + "loss/crossentropy": 2.142452359199524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26422591507434845, + "step": 5322 + }, + { + "epoch": 0.10648, + "grad_norm": 2.3125, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 1.9664896726608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25856246054172516, + "step": 5324 + }, + { + "epoch": 0.10652, + "grad_norm": 2.203125, + "grad_norm_var": 0.007222493489583333, + "learning_rate": 0.0001, + "loss": 4.5531, + "loss/crossentropy": 2.168110191822052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23274105042219162, + "step": 5326 + }, + { + "epoch": 0.10656, + "grad_norm": 2.21875, + "grad_norm_var": 0.00552978515625, + "learning_rate": 0.0001, + "loss": 4.5242, + "loss/crossentropy": 2.006514251232147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532104551792145, + "step": 5328 + }, + { + "epoch": 0.1066, + "grad_norm": 2.296875, + "grad_norm_var": 0.0070220947265625, + "learning_rate": 0.0001, + "loss": 4.5593, + "loss/crossentropy": 2.462701439857483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2771739661693573, + "step": 5330 + }, + { + "epoch": 0.10664, + "grad_norm": 2.203125, + "grad_norm_var": 0.0076324462890625, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 2.0889209508895874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479858472943306, + "step": 5332 + }, + { + "epoch": 0.10668, + "grad_norm": 2.25, + "grad_norm_var": 0.008356730143229166, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 1.8056100606918335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22194840013980865, + "step": 5334 + }, + { + "epoch": 0.10672, + "grad_norm": 2.171875, + "grad_norm_var": 0.0075185139973958336, + "learning_rate": 0.0001, + "loss": 4.703, + "loss/crossentropy": 2.32460880279541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28185322880744934, + "step": 5336 + }, + { + "epoch": 0.10676, + "grad_norm": 2.140625, + "grad_norm_var": 0.0075185139973958336, + "learning_rate": 0.0001, + "loss": 4.6388, + "loss/crossentropy": 2.238978862762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314726889133453, + "step": 5338 + }, + { + "epoch": 0.1068, + "grad_norm": 2.0625, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 4.4161, + "loss/crossentropy": 1.8914734721183777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20767460763454437, + "step": 5340 + }, + { + "epoch": 0.10684, + "grad_norm": 2.328125, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 4.5628, + "loss/crossentropy": 1.9704068899154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23617815226316452, + "step": 5342 + }, + { + "epoch": 0.10688, + "grad_norm": 2.40625, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.214, + "loss/crossentropy": 1.8539690971374512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218247577548027, + "step": 5344 + }, + { + "epoch": 0.10692, + "grad_norm": 2.28125, + "grad_norm_var": 0.012596638997395833, + "learning_rate": 0.0001, + "loss": 4.6077, + "loss/crossentropy": 1.982038140296936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23828908801078796, + "step": 5346 + }, + { + "epoch": 0.10696, + "grad_norm": 2.28125, + "grad_norm_var": 0.013313802083333333, + "learning_rate": 0.0001, + "loss": 4.2879, + "loss/crossentropy": 1.8247870802879333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22487633675336838, + "step": 5348 + }, + { + "epoch": 0.107, + "grad_norm": 2.1875, + "grad_norm_var": 0.011253865559895833, + "learning_rate": 0.0001, + "loss": 4.382, + "loss/crossentropy": 2.0704214572906494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332743138074875, + "step": 5350 + }, + { + "epoch": 0.10704, + "grad_norm": 2.5, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.709, + "loss/crossentropy": 2.037345290184021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2515557184815407, + "step": 5352 + }, + { + "epoch": 0.10708, + "grad_norm": 2.609375, + "grad_norm_var": 0.020702107747395834, + "learning_rate": 0.0001, + "loss": 4.5427, + "loss/crossentropy": 2.0561426877975464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24030926823616028, + "step": 5354 + }, + { + "epoch": 0.10712, + "grad_norm": 2.3125, + "grad_norm_var": 0.017943318684895834, + "learning_rate": 0.0001, + "loss": 4.3108, + "loss/crossentropy": 1.6871100068092346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2526541203260422, + "step": 5356 + }, + { + "epoch": 0.10716, + "grad_norm": 2.390625, + "grad_norm_var": 0.01978759765625, + "learning_rate": 0.0001, + "loss": 4.5238, + "loss/crossentropy": 2.0133201479911804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23663413524627686, + "step": 5358 + }, + { + "epoch": 0.1072, + "grad_norm": 2.34375, + "grad_norm_var": 0.0170562744140625, + "learning_rate": 0.0001, + "loss": 4.557, + "loss/crossentropy": 2.0627574920654297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24395380914211273, + "step": 5360 + }, + { + "epoch": 0.10724, + "grad_norm": 2.140625, + "grad_norm_var": 0.020929972330729168, + "learning_rate": 0.0001, + "loss": 4.5495, + "loss/crossentropy": 2.280818462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2806383967399597, + "step": 5362 + }, + { + "epoch": 0.10728, + "grad_norm": 2.5, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.4743, + "loss/crossentropy": 2.002636671066284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23986798524856567, + "step": 5364 + }, + { + "epoch": 0.10732, + "grad_norm": 2.453125, + "grad_norm_var": 0.022298177083333332, + "learning_rate": 0.0001, + "loss": 4.6816, + "loss/crossentropy": 2.042721927165985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23199205100536346, + "step": 5366 + }, + { + "epoch": 0.10736, + "grad_norm": 2.25, + "grad_norm_var": 0.021540323893229168, + "learning_rate": 0.0001, + "loss": 4.3225, + "loss/crossentropy": 2.1047908663749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23987916857004166, + "step": 5368 + }, + { + "epoch": 0.1074, + "grad_norm": 2.1875, + "grad_norm_var": 0.014774576822916666, + "learning_rate": 0.0001, + "loss": 4.4827, + "loss/crossentropy": 2.0513075590133667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242512047290802, + "step": 5370 + }, + { + "epoch": 0.10744, + "grad_norm": 2.4375, + "grad_norm_var": 0.015913899739583334, + "learning_rate": 0.0001, + "loss": 4.4452, + "loss/crossentropy": 1.9151215553283691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23007020354270935, + "step": 5372 + }, + { + "epoch": 0.10748, + "grad_norm": 2.125, + "grad_norm_var": 0.019303385416666666, + "learning_rate": 0.0001, + "loss": 4.2344, + "loss/crossentropy": 1.8759313821792603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068658247590065, + "step": 5374 + }, + { + "epoch": 0.10752, + "grad_norm": 2.46875, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 4.4667, + "loss/crossentropy": 2.1500572562217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24392293393611908, + "step": 5376 + }, + { + "epoch": 0.10756, + "grad_norm": 2.109375, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 4.0917, + "loss/crossentropy": 1.6089633703231812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20190145075321198, + "step": 5378 + }, + { + "epoch": 0.1076, + "grad_norm": 2.1875, + "grad_norm_var": 0.014452107747395833, + "learning_rate": 0.0001, + "loss": 4.6521, + "loss/crossentropy": 2.1967561841011047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2749984338879585, + "step": 5380 + }, + { + "epoch": 0.10764, + "grad_norm": 2.171875, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.8293656706809998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20650158822536469, + "step": 5382 + }, + { + "epoch": 0.10768, + "grad_norm": 2.125, + "grad_norm_var": 0.011847941080729167, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.3964673280715942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2814205437898636, + "step": 5384 + }, + { + "epoch": 0.10772, + "grad_norm": 2.4375, + "grad_norm_var": 0.021842447916666667, + "learning_rate": 0.0001, + "loss": 4.8872, + "loss/crossentropy": 2.4995274543762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28704003244638443, + "step": 5386 + }, + { + "epoch": 0.10776, + "grad_norm": 2.296875, + "grad_norm_var": 0.019462076822916667, + "learning_rate": 0.0001, + "loss": 4.6066, + "loss/crossentropy": 2.0650060176849365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23606212437152863, + "step": 5388 + }, + { + "epoch": 0.1078, + "grad_norm": 2.203125, + "grad_norm_var": 0.01640625, + "learning_rate": 0.0001, + "loss": 4.3723, + "loss/crossentropy": 2.3049341440200806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23550046980381012, + "step": 5390 + }, + { + "epoch": 0.10784, + "grad_norm": 2.21875, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 4.813, + "loss/crossentropy": 2.2687143087387085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2854095697402954, + "step": 5392 + }, + { + "epoch": 0.10788, + "grad_norm": 2.28125, + "grad_norm_var": 0.014557902018229167, + "learning_rate": 0.0001, + "loss": 4.6267, + "loss/crossentropy": 2.029325544834137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23444775491952896, + "step": 5394 + }, + { + "epoch": 0.10792, + "grad_norm": 2.3125, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.5214, + "loss/crossentropy": 2.2012031078338623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27986764907836914, + "step": 5396 + }, + { + "epoch": 0.10796, + "grad_norm": 2.3125, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 4.65, + "loss/crossentropy": 2.2396020889282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24839117377996445, + "step": 5398 + }, + { + "epoch": 0.108, + "grad_norm": 2.3125, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 4.4634, + "loss/crossentropy": 2.1481886506080627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545444592833519, + "step": 5400 + }, + { + "epoch": 0.10804, + "grad_norm": 2.21875, + "grad_norm_var": 0.006005859375, + "learning_rate": 0.0001, + "loss": 4.6109, + "loss/crossentropy": 1.9799351692199707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296385258436203, + "step": 5402 + }, + { + "epoch": 0.10808, + "grad_norm": 2.34375, + "grad_norm_var": 0.006180826822916667, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 1.845237910747528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316332384943962, + "step": 5404 + }, + { + "epoch": 0.10812, + "grad_norm": 2.234375, + "grad_norm_var": 0.005692545572916667, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 2.078865647315979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251836359500885, + "step": 5406 + }, + { + "epoch": 0.10816, + "grad_norm": 2.171875, + "grad_norm_var": 0.006245930989583333, + "learning_rate": 0.0001, + "loss": 4.4409, + "loss/crossentropy": 2.031971752643585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23658733069896698, + "step": 5408 + }, + { + "epoch": 0.1082, + "grad_norm": 2.21875, + "grad_norm_var": 0.00504150390625, + "learning_rate": 0.0001, + "loss": 4.3034, + "loss/crossentropy": 1.8173908591270447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169174626469612, + "step": 5410 + }, + { + "epoch": 0.10824, + "grad_norm": 2.25, + "grad_norm_var": 0.0086090087890625, + "learning_rate": 0.0001, + "loss": 4.838, + "loss/crossentropy": 2.2501285672187805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.240458145737648, + "step": 5412 + }, + { + "epoch": 0.10828, + "grad_norm": 2.484375, + "grad_norm_var": 0.015168253580729167, + "learning_rate": 0.0001, + "loss": 4.5449, + "loss/crossentropy": 2.256573438644409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26129382848739624, + "step": 5414 + }, + { + "epoch": 0.10832, + "grad_norm": 2.359375, + "grad_norm_var": 0.0153717041015625, + "learning_rate": 0.0001, + "loss": 4.7704, + "loss/crossentropy": 2.2014705538749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334136888384819, + "step": 5416 + }, + { + "epoch": 0.10836, + "grad_norm": 2.34375, + "grad_norm_var": 0.013016764322916667, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 1.8590609431266785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2271011471748352, + "step": 5418 + }, + { + "epoch": 0.1084, + "grad_norm": 2.40625, + "grad_norm_var": 0.017378743489583334, + "learning_rate": 0.0001, + "loss": 4.9419, + "loss/crossentropy": 2.2923961877822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25833888351917267, + "step": 5420 + }, + { + "epoch": 0.10844, + "grad_norm": 2.21875, + "grad_norm_var": 0.017723592122395833, + "learning_rate": 0.0001, + "loss": 4.4535, + "loss/crossentropy": 2.1932299733161926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26209479570388794, + "step": 5422 + }, + { + "epoch": 0.10848, + "grad_norm": 2.390625, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 4.7057, + "loss/crossentropy": 2.3909924030303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2525275945663452, + "step": 5424 + }, + { + "epoch": 0.10852, + "grad_norm": 2.296875, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 4.7509, + "loss/crossentropy": 2.423817992210388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2635423541069031, + "step": 5426 + }, + { + "epoch": 0.10856, + "grad_norm": 2.34375, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.7082, + "loss/crossentropy": 1.9641632437705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460889369249344, + "step": 5428 + }, + { + "epoch": 0.1086, + "grad_norm": 2.34375, + "grad_norm_var": 0.0077707926432291664, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.027769148349762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2357894703745842, + "step": 5430 + }, + { + "epoch": 0.10864, + "grad_norm": 2.53125, + "grad_norm_var": 0.0128570556640625, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.122319996356964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2445123866200447, + "step": 5432 + }, + { + "epoch": 0.10868, + "grad_norm": 3.53125, + "grad_norm_var": 0.0993072509765625, + "learning_rate": 0.0001, + "loss": 4.6332, + "loss/crossentropy": 1.8631052374839783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23456327617168427, + "step": 5434 + }, + { + "epoch": 0.10872, + "grad_norm": 2.59375, + "grad_norm_var": 0.1000885009765625, + "learning_rate": 0.0001, + "loss": 4.6022, + "loss/crossentropy": 2.184281885623932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24073782563209534, + "step": 5436 + }, + { + "epoch": 0.10876, + "grad_norm": 2.453125, + "grad_norm_var": 0.09521077473958334, + "learning_rate": 0.0001, + "loss": 4.7912, + "loss/crossentropy": 1.9587833881378174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22247321158647537, + "step": 5438 + }, + { + "epoch": 0.1088, + "grad_norm": 2.296875, + "grad_norm_var": 0.09562886555989583, + "learning_rate": 0.0001, + "loss": 4.5185, + "loss/crossentropy": 2.334655284881592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24817728251218796, + "step": 5440 + }, + { + "epoch": 0.10884, + "grad_norm": 2.109375, + "grad_norm_var": 0.10161031087239583, + "learning_rate": 0.0001, + "loss": 4.3817, + "loss/crossentropy": 2.1424371004104614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622353136539459, + "step": 5442 + }, + { + "epoch": 0.10888, + "grad_norm": 2.203125, + "grad_norm_var": 0.10598958333333333, + "learning_rate": 0.0001, + "loss": 4.5876, + "loss/crossentropy": 2.0363662242889404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2450125440955162, + "step": 5444 + }, + { + "epoch": 0.10892, + "grad_norm": 2.1875, + "grad_norm_var": 0.110791015625, + "learning_rate": 0.0001, + "loss": 4.4015, + "loss/crossentropy": 2.0536006689071655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23283181339502335, + "step": 5446 + }, + { + "epoch": 0.10896, + "grad_norm": 2.234375, + "grad_norm_var": 0.10741780598958334, + "learning_rate": 0.0001, + "loss": 4.5194, + "loss/crossentropy": 2.2678059339523315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24513862282037735, + "step": 5448 + }, + { + "epoch": 0.109, + "grad_norm": 2.203125, + "grad_norm_var": 0.02115478515625, + "learning_rate": 0.0001, + "loss": 4.7404, + "loss/crossentropy": 2.406686782836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2901146113872528, + "step": 5450 + }, + { + "epoch": 0.10904, + "grad_norm": 2.296875, + "grad_norm_var": 0.017801920572916668, + "learning_rate": 0.0001, + "loss": 4.4724, + "loss/crossentropy": 2.352605938911438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25215400755405426, + "step": 5452 + }, + { + "epoch": 0.10908, + "grad_norm": 2.203125, + "grad_norm_var": 0.014546712239583334, + "learning_rate": 0.0001, + "loss": 4.5593, + "loss/crossentropy": 1.9139850735664368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22682765871286392, + "step": 5454 + }, + { + "epoch": 0.10912, + "grad_norm": 2.171875, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.6163, + "loss/crossentropy": 2.3240445852279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24574412405490875, + "step": 5456 + }, + { + "epoch": 0.10916, + "grad_norm": 2.21875, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 1.9347040057182312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159302830696106, + "step": 5458 + }, + { + "epoch": 0.1092, + "grad_norm": 2.21875, + "grad_norm_var": 0.012360636393229167, + "learning_rate": 0.0001, + "loss": 4.6539, + "loss/crossentropy": 1.8933109641075134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.284725621342659, + "step": 5460 + }, + { + "epoch": 0.10924, + "grad_norm": 2.15625, + "grad_norm_var": 0.0129058837890625, + "learning_rate": 0.0001, + "loss": 4.2488, + "loss/crossentropy": 2.3611297607421875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651517689228058, + "step": 5462 + }, + { + "epoch": 0.10928, + "grad_norm": 2.140625, + "grad_norm_var": 0.0235504150390625, + "learning_rate": 0.0001, + "loss": 4.4564, + "loss/crossentropy": 1.828608751296997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2337760180234909, + "step": 5464 + }, + { + "epoch": 0.10932, + "grad_norm": 2.140625, + "grad_norm_var": 0.015412394205729167, + "learning_rate": 0.0001, + "loss": 4.321, + "loss/crossentropy": 2.1374374628067017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2525453567504883, + "step": 5466 + }, + { + "epoch": 0.10936, + "grad_norm": 2.125, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 1.8054441213607788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2252344712615013, + "step": 5468 + }, + { + "epoch": 0.1094, + "grad_norm": 2.421875, + "grad_norm_var": 0.022972615559895833, + "learning_rate": 0.0001, + "loss": 4.616, + "loss/crossentropy": 2.1468498706817627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2604861631989479, + "step": 5470 + }, + { + "epoch": 0.10944, + "grad_norm": 2.359375, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.7298, + "loss/crossentropy": 2.2180548906326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2603686898946762, + "step": 5472 + }, + { + "epoch": 0.10948, + "grad_norm": 2.265625, + "grad_norm_var": 0.022261555989583334, + "learning_rate": 0.0001, + "loss": 4.5263, + "loss/crossentropy": 1.9773708581924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23676057159900665, + "step": 5474 + }, + { + "epoch": 0.10952, + "grad_norm": 2.40625, + "grad_norm_var": 0.023485310872395835, + "learning_rate": 0.0001, + "loss": 4.6156, + "loss/crossentropy": 1.9277283549308777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822558879852295, + "step": 5476 + }, + { + "epoch": 0.10956, + "grad_norm": 2.34375, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 4.5529, + "loss/crossentropy": 2.0625431537628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565220817923546, + "step": 5478 + }, + { + "epoch": 0.1096, + "grad_norm": 2.25, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.7956, + "loss/crossentropy": 2.383894443511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563806623220444, + "step": 5480 + }, + { + "epoch": 0.10964, + "grad_norm": 2.171875, + "grad_norm_var": 0.0120758056640625, + "learning_rate": 0.0001, + "loss": 4.5226, + "loss/crossentropy": 2.409442663192749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2637571170926094, + "step": 5482 + }, + { + "epoch": 0.10968, + "grad_norm": 2.203125, + "grad_norm_var": 0.0151763916015625, + "learning_rate": 0.0001, + "loss": 4.743, + "loss/crossentropy": 2.1789854764938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23846548050642014, + "step": 5484 + }, + { + "epoch": 0.10972, + "grad_norm": 2.265625, + "grad_norm_var": 0.011839803059895833, + "learning_rate": 0.0001, + "loss": 4.4108, + "loss/crossentropy": 2.127842903137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2622772455215454, + "step": 5486 + }, + { + "epoch": 0.10976, + "grad_norm": 2.1875, + "grad_norm_var": 0.01412353515625, + "learning_rate": 0.0001, + "loss": 4.6032, + "loss/crossentropy": 2.107556462287903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24714312702417374, + "step": 5488 + }, + { + "epoch": 0.1098, + "grad_norm": 2.578125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 4.6525, + "loss/crossentropy": 2.1959601640701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568396270275116, + "step": 5490 + }, + { + "epoch": 0.10984, + "grad_norm": 2.171875, + "grad_norm_var": 0.021826171875, + "learning_rate": 0.0001, + "loss": 4.5584, + "loss/crossentropy": 2.1246761083602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24083472788333893, + "step": 5492 + }, + { + "epoch": 0.10988, + "grad_norm": 2.15625, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.6436, + "loss/crossentropy": 2.091724157333374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527346611022949, + "step": 5494 + }, + { + "epoch": 0.10992, + "grad_norm": 2.109375, + "grad_norm_var": 0.026056925455729168, + "learning_rate": 0.0001, + "loss": 4.3207, + "loss/crossentropy": 1.8898470997810364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916878432035446, + "step": 5496 + }, + { + "epoch": 0.10996, + "grad_norm": 2.21875, + "grad_norm_var": 0.025951131184895834, + "learning_rate": 0.0001, + "loss": 4.399, + "loss/crossentropy": 2.1901716589927673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409309297800064, + "step": 5498 + }, + { + "epoch": 0.11, + "grad_norm": 2.234375, + "grad_norm_var": 0.017626953125, + "learning_rate": 0.0001, + "loss": 4.6897, + "loss/crossentropy": 2.1018574237823486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24559657275676727, + "step": 5500 + }, + { + "epoch": 0.11004, + "grad_norm": 2.1875, + "grad_norm_var": 0.019449869791666668, + "learning_rate": 0.0001, + "loss": 3.8159, + "loss/crossentropy": 2.0575350522994995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23721858859062195, + "step": 5502 + }, + { + "epoch": 0.11008, + "grad_norm": 2.15625, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.1846336126327515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2378462702035904, + "step": 5504 + }, + { + "epoch": 0.11012, + "grad_norm": 2.4375, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 4.4028, + "loss/crossentropy": 2.1359363198280334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26211391389369965, + "step": 5506 + }, + { + "epoch": 0.11016, + "grad_norm": 2.265625, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 4.5621, + "loss/crossentropy": 2.236825942993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557792589068413, + "step": 5508 + }, + { + "epoch": 0.1102, + "grad_norm": 2.25, + "grad_norm_var": 0.009521484375, + "learning_rate": 0.0001, + "loss": 4.3234, + "loss/crossentropy": 2.3140580654144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554958164691925, + "step": 5510 + }, + { + "epoch": 0.11024, + "grad_norm": 2.140625, + "grad_norm_var": 0.00904541015625, + "learning_rate": 0.0001, + "loss": 4.4382, + "loss/crossentropy": 1.7190355062484741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964009523391724, + "step": 5512 + }, + { + "epoch": 0.11028, + "grad_norm": 2.203125, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 1.8326427340507507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199154868721962, + "step": 5514 + }, + { + "epoch": 0.11032, + "grad_norm": 2.109375, + "grad_norm_var": 0.01549072265625, + "learning_rate": 0.0001, + "loss": 4.4494, + "loss/crossentropy": 1.9013121724128723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20293578505516052, + "step": 5516 + }, + { + "epoch": 0.11036, + "grad_norm": 2.1875, + "grad_norm_var": 0.0138671875, + "learning_rate": 0.0001, + "loss": 4.7056, + "loss/crossentropy": 2.0221983790397644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23435519635677338, + "step": 5518 + }, + { + "epoch": 0.1104, + "grad_norm": 2.359375, + "grad_norm_var": 0.0140533447265625, + "learning_rate": 0.0001, + "loss": 4.6157, + "loss/crossentropy": 2.153970956802368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127598524093628, + "step": 5520 + }, + { + "epoch": 0.11044, + "grad_norm": 2.125, + "grad_norm_var": 0.010835774739583333, + "learning_rate": 0.0001, + "loss": 4.4516, + "loss/crossentropy": 1.8674496412277222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232261061668396, + "step": 5522 + }, + { + "epoch": 0.11048, + "grad_norm": 2.078125, + "grad_norm_var": 0.012495930989583333, + "learning_rate": 0.0001, + "loss": 4.6528, + "loss/crossentropy": 2.1575759649276733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22218701988458633, + "step": 5524 + }, + { + "epoch": 0.11052, + "grad_norm": 2.25, + "grad_norm_var": 0.012434895833333333, + "learning_rate": 0.0001, + "loss": 4.5553, + "loss/crossentropy": 2.054452419281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25137215852737427, + "step": 5526 + }, + { + "epoch": 0.11056, + "grad_norm": 2.296875, + "grad_norm_var": 0.022639973958333334, + "learning_rate": 0.0001, + "loss": 4.4789, + "loss/crossentropy": 1.966478705406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23289234936237335, + "step": 5528 + }, + { + "epoch": 0.1106, + "grad_norm": 2.234375, + "grad_norm_var": 0.017626953125, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.171034336090088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23373593389987946, + "step": 5530 + }, + { + "epoch": 0.11064, + "grad_norm": 1.953125, + "grad_norm_var": 0.022362263997395833, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 2.2416292428970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24835532158613205, + "step": 5532 + }, + { + "epoch": 0.11068, + "grad_norm": 2.53125, + "grad_norm_var": 0.02720947265625, + "learning_rate": 0.0001, + "loss": 4.6252, + "loss/crossentropy": 2.2599780559539795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24948878586292267, + "step": 5534 + }, + { + "epoch": 0.11072, + "grad_norm": 2.34375, + "grad_norm_var": 0.028902180989583335, + "learning_rate": 0.0001, + "loss": 4.6984, + "loss/crossentropy": 2.2292014360427856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26616473495960236, + "step": 5536 + }, + { + "epoch": 0.11076, + "grad_norm": 2.375, + "grad_norm_var": 0.027424112955729166, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 1.9285388588905334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22964124381542206, + "step": 5538 + }, + { + "epoch": 0.1108, + "grad_norm": 2.171875, + "grad_norm_var": 0.0251953125, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 1.864789366722107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2264304906129837, + "step": 5540 + }, + { + "epoch": 0.11084, + "grad_norm": 2.234375, + "grad_norm_var": 0.0250885009765625, + "learning_rate": 0.0001, + "loss": 4.5163, + "loss/crossentropy": 1.8676912188529968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919699758291245, + "step": 5542 + }, + { + "epoch": 0.11088, + "grad_norm": 2.234375, + "grad_norm_var": 0.0190338134765625, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 2.34523469209671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25638002157211304, + "step": 5544 + }, + { + "epoch": 0.11092, + "grad_norm": 2.09375, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 4.5165, + "loss/crossentropy": 2.2903120517730713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25111711025238037, + "step": 5546 + }, + { + "epoch": 0.11096, + "grad_norm": 2.15625, + "grad_norm_var": 0.017671712239583335, + "learning_rate": 0.0001, + "loss": 4.5336, + "loss/crossentropy": 2.2106658220291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22340595722198486, + "step": 5548 + }, + { + "epoch": 0.111, + "grad_norm": 2.203125, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.6305, + "loss/crossentropy": 2.0777581334114075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2446284517645836, + "step": 5550 + }, + { + "epoch": 0.11104, + "grad_norm": 2.234375, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.5507, + "loss/crossentropy": 2.131237506866455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24113191664218903, + "step": 5552 + }, + { + "epoch": 0.11108, + "grad_norm": 2.578125, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 4.6337, + "loss/crossentropy": 2.190987467765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528213635087013, + "step": 5554 + }, + { + "epoch": 0.11112, + "grad_norm": 2.25, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.26843523979187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27781064808368683, + "step": 5556 + }, + { + "epoch": 0.11116, + "grad_norm": 2.25, + "grad_norm_var": 0.019527180989583334, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 1.9507999420166016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23172564804553986, + "step": 5558 + }, + { + "epoch": 0.1112, + "grad_norm": 2.28125, + "grad_norm_var": 0.016966756184895834, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 2.0738234519958496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24497026205062866, + "step": 5560 + }, + { + "epoch": 0.11124, + "grad_norm": 2.515625, + "grad_norm_var": 0.019261678059895832, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.273179054260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2917838394641876, + "step": 5562 + }, + { + "epoch": 0.11128, + "grad_norm": 2.359375, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 4.774, + "loss/crossentropy": 2.085157036781311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2488701120018959, + "step": 5564 + }, + { + "epoch": 0.11132, + "grad_norm": 3.109375, + "grad_norm_var": 0.05950113932291667, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 2.0528116822242737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24717991054058075, + "step": 5566 + }, + { + "epoch": 0.11136, + "grad_norm": 7.0, + "grad_norm_var": 1.3981597900390625, + "learning_rate": 0.0001, + "loss": 4.4443, + "loss/crossentropy": 2.0651500821113586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26806148886680603, + "step": 5568 + }, + { + "epoch": 0.1114, + "grad_norm": 2.328125, + "grad_norm_var": 1.4721588134765624, + "learning_rate": 0.0001, + "loss": 4.6162, + "loss/crossentropy": 2.1860616207122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23045460134744644, + "step": 5570 + }, + { + "epoch": 0.11144, + "grad_norm": 2.328125, + "grad_norm_var": 1.460399373372396, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 1.6931262016296387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20816385000944138, + "step": 5572 + }, + { + "epoch": 0.11148, + "grad_norm": 2.28125, + "grad_norm_var": 1.4581858317057292, + "learning_rate": 0.0001, + "loss": 4.3376, + "loss/crossentropy": 2.199341118335724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2490597665309906, + "step": 5574 + }, + { + "epoch": 0.11152, + "grad_norm": 2.484375, + "grad_norm_var": 1.4561513264973958, + "learning_rate": 0.0001, + "loss": 4.6627, + "loss/crossentropy": 2.2010069489479065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24095547199249268, + "step": 5576 + }, + { + "epoch": 0.11156, + "grad_norm": 2.234375, + "grad_norm_var": 1.4810129801432292, + "learning_rate": 0.0001, + "loss": 4.5243, + "loss/crossentropy": 1.9907150864601135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122703418135643, + "step": 5578 + }, + { + "epoch": 0.1116, + "grad_norm": 2.21875, + "grad_norm_var": 1.5023508707682292, + "learning_rate": 0.0001, + "loss": 4.2618, + "loss/crossentropy": 2.196335554122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25111766904592514, + "step": 5580 + }, + { + "epoch": 0.11164, + "grad_norm": 2.171875, + "grad_norm_var": 1.5003000895182292, + "learning_rate": 0.0001, + "loss": 4.5104, + "loss/crossentropy": 2.0762988924980164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773670941591263, + "step": 5582 + }, + { + "epoch": 0.11168, + "grad_norm": 2.140625, + "grad_norm_var": 0.15530192057291667, + "learning_rate": 0.0001, + "loss": 4.3166, + "loss/crossentropy": 2.0803143978118896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22162869572639465, + "step": 5584 + }, + { + "epoch": 0.11172, + "grad_norm": 2.3125, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.5596, + "loss/crossentropy": 2.1821994185447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26280316710472107, + "step": 5586 + }, + { + "epoch": 0.11176, + "grad_norm": 2.171875, + "grad_norm_var": 0.011847941080729167, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 2.1899439096450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24677179753780365, + "step": 5588 + }, + { + "epoch": 0.1118, + "grad_norm": 2.21875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.4361, + "loss/crossentropy": 2.334734559059143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2481135129928589, + "step": 5590 + }, + { + "epoch": 0.11184, + "grad_norm": 2.34375, + "grad_norm_var": 0.007291666666666667, + "learning_rate": 0.0001, + "loss": 4.7838, + "loss/crossentropy": 2.2976341247558594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454354390501976, + "step": 5592 + }, + { + "epoch": 0.11188, + "grad_norm": 2.3125, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.7148, + "loss/crossentropy": 2.3243749141693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2735731601715088, + "step": 5594 + }, + { + "epoch": 0.11192, + "grad_norm": 2.28125, + "grad_norm_var": 0.007372029622395833, + "learning_rate": 0.0001, + "loss": 4.7124, + "loss/crossentropy": 2.0328271985054016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919244527816772, + "step": 5596 + }, + { + "epoch": 0.11196, + "grad_norm": 2.234375, + "grad_norm_var": 0.0077789306640625, + "learning_rate": 0.0001, + "loss": 4.4584, + "loss/crossentropy": 2.249367594718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2577860951423645, + "step": 5598 + }, + { + "epoch": 0.112, + "grad_norm": 2.046875, + "grad_norm_var": 0.009505208333333333, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 1.9844761490821838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142435386776924, + "step": 5600 + }, + { + "epoch": 0.11204, + "grad_norm": 2.375, + "grad_norm_var": 0.0111328125, + "learning_rate": 0.0001, + "loss": 4.6043, + "loss/crossentropy": 2.1334372758865356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23296397179365158, + "step": 5602 + }, + { + "epoch": 0.11208, + "grad_norm": 2.3125, + "grad_norm_var": 0.0078084309895833336, + "learning_rate": 0.0001, + "loss": 4.5308, + "loss/crossentropy": 2.119946002960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398507386446, + "step": 5604 + }, + { + "epoch": 0.11212, + "grad_norm": 2.25, + "grad_norm_var": 0.008040364583333333, + "learning_rate": 0.0001, + "loss": 4.5011, + "loss/crossentropy": 2.0414544343948364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23420867323875427, + "step": 5606 + }, + { + "epoch": 0.11216, + "grad_norm": 2.125, + "grad_norm_var": 0.009130859375, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.9592725038528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23922864347696304, + "step": 5608 + }, + { + "epoch": 0.1122, + "grad_norm": 2.4375, + "grad_norm_var": 0.011188761393229166, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.150269627571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2528265416622162, + "step": 5610 + }, + { + "epoch": 0.11224, + "grad_norm": 2.578125, + "grad_norm_var": 0.3099772135416667, + "learning_rate": 0.0001, + "loss": 4.6234, + "loss/crossentropy": 2.0591378211975098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2430240362882614, + "step": 5612 + }, + { + "epoch": 0.11228, + "grad_norm": 2.171875, + "grad_norm_var": 0.30794169108072916, + "learning_rate": 0.0001, + "loss": 4.4251, + "loss/crossentropy": 2.2132861614227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22676552087068558, + "step": 5614 + }, + { + "epoch": 0.11232, + "grad_norm": 2.375, + "grad_norm_var": 0.3002237955729167, + "learning_rate": 0.0001, + "loss": 4.4332, + "loss/crossentropy": 1.9607917070388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23068836331367493, + "step": 5616 + }, + { + "epoch": 0.11236, + "grad_norm": 2.25, + "grad_norm_var": 0.3021321614583333, + "learning_rate": 0.0001, + "loss": 4.549, + "loss/crossentropy": 2.2245940566062927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25029121339321136, + "step": 5618 + }, + { + "epoch": 0.1124, + "grad_norm": 2.234375, + "grad_norm_var": 0.30278218587239586, + "learning_rate": 0.0001, + "loss": 4.4539, + "loss/crossentropy": 2.2511253356933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25143079459667206, + "step": 5620 + }, + { + "epoch": 0.11244, + "grad_norm": 2.328125, + "grad_norm_var": 0.3026194254557292, + "learning_rate": 0.0001, + "loss": 4.4632, + "loss/crossentropy": 2.2945470809936523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196483463048935, + "step": 5622 + }, + { + "epoch": 0.11248, + "grad_norm": 2.359375, + "grad_norm_var": 0.29273681640625, + "learning_rate": 0.0001, + "loss": 4.8769, + "loss/crossentropy": 2.2266393899917603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24020668864250183, + "step": 5624 + }, + { + "epoch": 0.11252, + "grad_norm": 2.359375, + "grad_norm_var": 0.29136454264322914, + "learning_rate": 0.0001, + "loss": 4.742, + "loss/crossentropy": 2.2835845947265625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27199871838092804, + "step": 5626 + }, + { + "epoch": 0.11256, + "grad_norm": 2.859375, + "grad_norm_var": 0.026383463541666666, + "learning_rate": 0.0001, + "loss": 4.4213, + "loss/crossentropy": 1.9576718211174011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21716968715190887, + "step": 5628 + }, + { + "epoch": 0.1126, + "grad_norm": 2.375, + "grad_norm_var": 0.044611612955729164, + "learning_rate": 0.0001, + "loss": 4.7695, + "loss/crossentropy": 2.0955676436424255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24259109795093536, + "step": 5630 + }, + { + "epoch": 0.11264, + "grad_norm": 2.5625, + "grad_norm_var": 0.044840494791666664, + "learning_rate": 0.0001, + "loss": 4.4127, + "loss/crossentropy": 2.119523346424103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2478165179491043, + "step": 5632 + }, + { + "epoch": 0.11268, + "grad_norm": 2.265625, + "grad_norm_var": 0.0444488525390625, + "learning_rate": 0.0001, + "loss": 4.5591, + "loss/crossentropy": 2.189425826072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24532486498355865, + "step": 5634 + }, + { + "epoch": 0.11272, + "grad_norm": 2.390625, + "grad_norm_var": 0.0467193603515625, + "learning_rate": 0.0001, + "loss": 4.4969, + "loss/crossentropy": 2.215874433517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24533041566610336, + "step": 5636 + }, + { + "epoch": 0.11276, + "grad_norm": 2.515625, + "grad_norm_var": 0.04868876139322917, + "learning_rate": 0.0001, + "loss": 4.5657, + "loss/crossentropy": 2.226451873779297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545652836561203, + "step": 5638 + }, + { + "epoch": 0.1128, + "grad_norm": 2.296875, + "grad_norm_var": 0.05181884765625, + "learning_rate": 0.0001, + "loss": 4.4779, + "loss/crossentropy": 2.0343592762947083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2503928989171982, + "step": 5640 + }, + { + "epoch": 0.11284, + "grad_norm": 2.25, + "grad_norm_var": 0.0535552978515625, + "learning_rate": 0.0001, + "loss": 4.6253, + "loss/crossentropy": 2.142001748085022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24624411761760712, + "step": 5642 + }, + { + "epoch": 0.11288, + "grad_norm": 2.140625, + "grad_norm_var": 0.039383951822916666, + "learning_rate": 0.0001, + "loss": 4.4212, + "loss/crossentropy": 1.822394609451294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315739303827286, + "step": 5644 + }, + { + "epoch": 0.11292, + "grad_norm": 2.140625, + "grad_norm_var": 0.018863932291666666, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.3221731185913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24407917261123657, + "step": 5646 + }, + { + "epoch": 0.11296, + "grad_norm": 2.234375, + "grad_norm_var": 0.012531534830729166, + "learning_rate": 0.0001, + "loss": 4.5591, + "loss/crossentropy": 1.9784467816352844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22578515857458115, + "step": 5648 + }, + { + "epoch": 0.113, + "grad_norm": 2.328125, + "grad_norm_var": 0.0142730712890625, + "learning_rate": 0.0001, + "loss": 4.636, + "loss/crossentropy": 2.2148635387420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25031210482120514, + "step": 5650 + }, + { + "epoch": 0.11304, + "grad_norm": 2.21875, + "grad_norm_var": 0.012369791666666666, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 1.9204095602035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22073335200548172, + "step": 5652 + }, + { + "epoch": 0.11308, + "grad_norm": 2.421875, + "grad_norm_var": 0.009764607747395833, + "learning_rate": 0.0001, + "loss": 4.6601, + "loss/crossentropy": 2.092605173587799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161123484373093, + "step": 5654 + }, + { + "epoch": 0.11312, + "grad_norm": 2.546875, + "grad_norm_var": 0.0163726806640625, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 1.9546263217926025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22741927206516266, + "step": 5656 + }, + { + "epoch": 0.11316, + "grad_norm": 2.28125, + "grad_norm_var": 0.016974894205729167, + "learning_rate": 0.0001, + "loss": 4.6585, + "loss/crossentropy": 2.256605863571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560018301010132, + "step": 5658 + }, + { + "epoch": 0.1132, + "grad_norm": 2.21875, + "grad_norm_var": 0.0157135009765625, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.24527370929718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270521879196167, + "step": 5660 + }, + { + "epoch": 0.11324, + "grad_norm": 2.5, + "grad_norm_var": 0.014029947916666667, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 1.8154722452163696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22786997258663177, + "step": 5662 + }, + { + "epoch": 0.11328, + "grad_norm": 2.359375, + "grad_norm_var": 0.0143951416015625, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.135699689388275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253468282520771, + "step": 5664 + }, + { + "epoch": 0.11332, + "grad_norm": 2.203125, + "grad_norm_var": 0.015087890625, + "learning_rate": 0.0001, + "loss": 4.5381, + "loss/crossentropy": 2.15896338224411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25651170313358307, + "step": 5666 + }, + { + "epoch": 0.11336, + "grad_norm": 2.40625, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.7877, + "loss/crossentropy": 2.115864336490631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28546491265296936, + "step": 5668 + }, + { + "epoch": 0.1134, + "grad_norm": 2.25, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.4283, + "loss/crossentropy": 2.2036256790161133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2397892326116562, + "step": 5670 + }, + { + "epoch": 0.11344, + "grad_norm": 2.234375, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 4.598, + "loss/crossentropy": 2.4966647624969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2984588146209717, + "step": 5672 + }, + { + "epoch": 0.11348, + "grad_norm": 2.234375, + "grad_norm_var": 0.008202107747395833, + "learning_rate": 0.0001, + "loss": 4.5289, + "loss/crossentropy": 2.051860749721527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2748461365699768, + "step": 5674 + }, + { + "epoch": 0.11352, + "grad_norm": 2.328125, + "grad_norm_var": 0.007938639322916666, + "learning_rate": 0.0001, + "loss": 4.597, + "loss/crossentropy": 2.046416461467743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320190668106079, + "step": 5676 + }, + { + "epoch": 0.11356, + "grad_norm": 2.21875, + "grad_norm_var": 0.0054514567057291664, + "learning_rate": 0.0001, + "loss": 4.7389, + "loss/crossentropy": 2.2385342121124268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21859309077262878, + "step": 5678 + }, + { + "epoch": 0.1136, + "grad_norm": 2.265625, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.2487, + "loss/crossentropy": 1.8511550426483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065181285142899, + "step": 5680 + }, + { + "epoch": 0.11364, + "grad_norm": 2.828125, + "grad_norm_var": 0.031477864583333334, + "learning_rate": 0.0001, + "loss": 4.642, + "loss/crossentropy": 2.304056167602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29507844150066376, + "step": 5682 + }, + { + "epoch": 0.11368, + "grad_norm": 2.4375, + "grad_norm_var": 0.03329671223958333, + "learning_rate": 0.0001, + "loss": 4.5519, + "loss/crossentropy": 1.993275225162506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33826952427625656, + "step": 5684 + }, + { + "epoch": 0.11372, + "grad_norm": 2.140625, + "grad_norm_var": 0.03498433430989583, + "learning_rate": 0.0001, + "loss": 4.4729, + "loss/crossentropy": 2.1836347579956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24539872258901596, + "step": 5686 + }, + { + "epoch": 0.11376, + "grad_norm": 2.296875, + "grad_norm_var": 0.03504231770833333, + "learning_rate": 0.0001, + "loss": 4.5908, + "loss/crossentropy": 1.9467885494232178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180013582110405, + "step": 5688 + }, + { + "epoch": 0.1138, + "grad_norm": 2.78125, + "grad_norm_var": 0.04907938639322917, + "learning_rate": 0.0001, + "loss": 4.2888, + "loss/crossentropy": 1.9907563924789429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2551623433828354, + "step": 5690 + }, + { + "epoch": 0.11384, + "grad_norm": 2.21875, + "grad_norm_var": 0.049153645833333336, + "learning_rate": 0.0001, + "loss": 4.2186, + "loss/crossentropy": 1.9452654719352722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263620376586914, + "step": 5692 + }, + { + "epoch": 0.11388, + "grad_norm": 2.171875, + "grad_norm_var": 0.050093587239583334, + "learning_rate": 0.0001, + "loss": 4.5816, + "loss/crossentropy": 2.2448811531066895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2508034110069275, + "step": 5694 + }, + { + "epoch": 0.11392, + "grad_norm": 2.65625, + "grad_norm_var": 0.055712890625, + "learning_rate": 0.0001, + "loss": 4.1868, + "loss/crossentropy": 1.991935908794403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22709138691425323, + "step": 5696 + }, + { + "epoch": 0.11396, + "grad_norm": 2.625, + "grad_norm_var": 0.47700907389322916, + "learning_rate": 0.0001, + "loss": 4.8208, + "loss/crossentropy": 2.0479623675346375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980189859867096, + "step": 5698 + }, + { + "epoch": 0.114, + "grad_norm": 2.15625, + "grad_norm_var": 0.4779581705729167, + "learning_rate": 0.0001, + "loss": 4.4742, + "loss/crossentropy": 1.9319151639938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295292615890503, + "step": 5700 + }, + { + "epoch": 0.11404, + "grad_norm": 2.296875, + "grad_norm_var": 0.47330322265625, + "learning_rate": 0.0001, + "loss": 4.32, + "loss/crossentropy": 1.985447645187378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246543288230896, + "step": 5702 + }, + { + "epoch": 0.11408, + "grad_norm": 2.84375, + "grad_norm_var": 0.478271484375, + "learning_rate": 0.0001, + "loss": 4.6602, + "loss/crossentropy": 2.0016521215438843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25966253876686096, + "step": 5704 + }, + { + "epoch": 0.11412, + "grad_norm": 2.421875, + "grad_norm_var": 0.47038472493489586, + "learning_rate": 0.0001, + "loss": 4.9207, + "loss/crossentropy": 2.112374246120453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3251144737005234, + "step": 5706 + }, + { + "epoch": 0.11416, + "grad_norm": 2.328125, + "grad_norm_var": 0.4698720296223958, + "learning_rate": 0.0001, + "loss": 4.5223, + "loss/crossentropy": 2.0931158661842346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275928407907486, + "step": 5708 + }, + { + "epoch": 0.1142, + "grad_norm": 2.328125, + "grad_norm_var": 0.46104227701822914, + "learning_rate": 0.0001, + "loss": 4.4481, + "loss/crossentropy": 1.9867743849754333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2419440597295761, + "step": 5710 + }, + { + "epoch": 0.11424, + "grad_norm": 2.171875, + "grad_norm_var": 0.4639312744140625, + "learning_rate": 0.0001, + "loss": 4.3874, + "loss/crossentropy": 2.1822216510772705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437271624803543, + "step": 5712 + }, + { + "epoch": 0.11428, + "grad_norm": 6.46875, + "grad_norm_var": 1.1088775634765624, + "learning_rate": 0.0001, + "loss": 4.5307, + "loss/crossentropy": 2.412580370903015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3222763389348984, + "step": 5714 + }, + { + "epoch": 0.11432, + "grad_norm": 2.28125, + "grad_norm_var": 1.1019490559895833, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 1.676392376422882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24499019235372543, + "step": 5716 + }, + { + "epoch": 0.11436, + "grad_norm": 2.328125, + "grad_norm_var": 1.1039876302083333, + "learning_rate": 0.0001, + "loss": 4.4741, + "loss/crossentropy": 1.7818856835365295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22935030609369278, + "step": 5718 + }, + { + "epoch": 0.1144, + "grad_norm": 2.40625, + "grad_norm_var": 1.0944732666015624, + "learning_rate": 0.0001, + "loss": 4.5445, + "loss/crossentropy": 2.012014925479889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231942281126976, + "step": 5720 + }, + { + "epoch": 0.11444, + "grad_norm": 2.296875, + "grad_norm_var": 1.1023834228515625, + "learning_rate": 0.0001, + "loss": 4.5061, + "loss/crossentropy": 2.3663965463638306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.254768967628479, + "step": 5722 + }, + { + "epoch": 0.11448, + "grad_norm": 2.34375, + "grad_norm_var": 1.107982381184896, + "learning_rate": 0.0001, + "loss": 4.5878, + "loss/crossentropy": 2.343206286430359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26609115302562714, + "step": 5724 + }, + { + "epoch": 0.11452, + "grad_norm": 2.21875, + "grad_norm_var": 1.1131337483723958, + "learning_rate": 0.0001, + "loss": 4.5047, + "loss/crossentropy": 1.8696978092193604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24282050132751465, + "step": 5726 + }, + { + "epoch": 0.11456, + "grad_norm": 2.0625, + "grad_norm_var": 1.1170644124348958, + "learning_rate": 0.0001, + "loss": 4.2198, + "loss/crossentropy": 2.1430450677871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23211582750082016, + "step": 5728 + }, + { + "epoch": 0.1146, + "grad_norm": 2.421875, + "grad_norm_var": 0.013841756184895833, + "learning_rate": 0.0001, + "loss": 4.6868, + "loss/crossentropy": 2.2231308221817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23736387491226196, + "step": 5730 + }, + { + "epoch": 0.11464, + "grad_norm": 2.421875, + "grad_norm_var": 0.014069620768229167, + "learning_rate": 0.0001, + "loss": 4.4911, + "loss/crossentropy": 1.8789280652999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24081497639417648, + "step": 5732 + }, + { + "epoch": 0.11468, + "grad_norm": 2.1875, + "grad_norm_var": 0.014875284830729167, + "learning_rate": 0.0001, + "loss": 4.4478, + "loss/crossentropy": 2.0491732358932495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530463755130768, + "step": 5734 + }, + { + "epoch": 0.11472, + "grad_norm": 2.296875, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 4.4352, + "loss/crossentropy": 2.067046642303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24701344966888428, + "step": 5736 + }, + { + "epoch": 0.11476, + "grad_norm": 2.234375, + "grad_norm_var": 0.012791951497395834, + "learning_rate": 0.0001, + "loss": 4.4515, + "loss/crossentropy": 2.0207647681236267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243770807981491, + "step": 5738 + }, + { + "epoch": 0.1148, + "grad_norm": 2.21875, + "grad_norm_var": 0.013084920247395833, + "learning_rate": 0.0001, + "loss": 4.9205, + "loss/crossentropy": 2.230514347553253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24377264082431793, + "step": 5740 + }, + { + "epoch": 0.11484, + "grad_norm": 2.078125, + "grad_norm_var": 0.013997395833333334, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 2.38068687915802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24760407954454422, + "step": 5742 + }, + { + "epoch": 0.11488, + "grad_norm": 2.3125, + "grad_norm_var": 0.012483723958333333, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.238909125328064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2503668889403343, + "step": 5744 + }, + { + "epoch": 0.11492, + "grad_norm": 2.1875, + "grad_norm_var": 0.015265909830729167, + "learning_rate": 0.0001, + "loss": 4.4848, + "loss/crossentropy": 1.7423101663589478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20424582809209824, + "step": 5746 + }, + { + "epoch": 0.11496, + "grad_norm": 2.25, + "grad_norm_var": 0.0181304931640625, + "learning_rate": 0.0001, + "loss": 4.6754, + "loss/crossentropy": 2.5906827449798584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24731651693582535, + "step": 5748 + }, + { + "epoch": 0.115, + "grad_norm": 2.6875, + "grad_norm_var": 0.025830078125, + "learning_rate": 0.0001, + "loss": 4.7427, + "loss/crossentropy": 2.418861746788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505089193582535, + "step": 5750 + }, + { + "epoch": 0.11504, + "grad_norm": 2.765625, + "grad_norm_var": 0.03658447265625, + "learning_rate": 0.0001, + "loss": 4.869, + "loss/crossentropy": 2.158658504486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2381347045302391, + "step": 5752 + }, + { + "epoch": 0.11508, + "grad_norm": 2.40625, + "grad_norm_var": 0.03860270182291667, + "learning_rate": 0.0001, + "loss": 4.5073, + "loss/crossentropy": 2.0938435196876526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639065027236938, + "step": 5754 + }, + { + "epoch": 0.11512, + "grad_norm": 2.046875, + "grad_norm_var": 0.0441314697265625, + "learning_rate": 0.0001, + "loss": 4.2227, + "loss/crossentropy": 2.023799479007721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24510329961776733, + "step": 5756 + }, + { + "epoch": 0.11516, + "grad_norm": 2.265625, + "grad_norm_var": 0.03854166666666667, + "learning_rate": 0.0001, + "loss": 4.5155, + "loss/crossentropy": 2.3589184284210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2778936177492142, + "step": 5758 + }, + { + "epoch": 0.1152, + "grad_norm": 2.1875, + "grad_norm_var": 0.041731770833333334, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.4897998571395874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546544462442398, + "step": 5760 + }, + { + "epoch": 0.11524, + "grad_norm": 2.1875, + "grad_norm_var": 0.041747029622395834, + "learning_rate": 0.0001, + "loss": 4.4619, + "loss/crossentropy": 2.0426196455955505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22708147019147873, + "step": 5762 + }, + { + "epoch": 0.11528, + "grad_norm": 2.28125, + "grad_norm_var": 0.0397369384765625, + "learning_rate": 0.0001, + "loss": 4.6661, + "loss/crossentropy": 2.2582051753997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25646351277828217, + "step": 5764 + }, + { + "epoch": 0.11532, + "grad_norm": 2.21875, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 4.5929, + "loss/crossentropy": 2.181105613708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2380223199725151, + "step": 5766 + }, + { + "epoch": 0.11536, + "grad_norm": 2.46875, + "grad_norm_var": 0.026200358072916666, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.236580967903137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26678355038166046, + "step": 5768 + }, + { + "epoch": 0.1154, + "grad_norm": 2.203125, + "grad_norm_var": 0.020361328125, + "learning_rate": 0.0001, + "loss": 4.6232, + "loss/crossentropy": 1.9383749961853027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24920085072517395, + "step": 5770 + }, + { + "epoch": 0.11544, + "grad_norm": 2.15625, + "grad_norm_var": 0.018529256184895832, + "learning_rate": 0.0001, + "loss": 4.6044, + "loss/crossentropy": 2.2856688499450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258881650865078, + "step": 5772 + }, + { + "epoch": 0.11548, + "grad_norm": 2.28125, + "grad_norm_var": 0.0239410400390625, + "learning_rate": 0.0001, + "loss": 4.8391, + "loss/crossentropy": 2.2897390127182007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24453644454479218, + "step": 5774 + }, + { + "epoch": 0.11552, + "grad_norm": 2.25, + "grad_norm_var": 0.014549763997395833, + "learning_rate": 0.0001, + "loss": 4.5698, + "loss/crossentropy": 2.0502785444259644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24106161296367645, + "step": 5776 + }, + { + "epoch": 0.11556, + "grad_norm": 2.21875, + "grad_norm_var": 0.01441650390625, + "learning_rate": 0.0001, + "loss": 4.5665, + "loss/crossentropy": 2.344050645828247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24690424650907516, + "step": 5778 + }, + { + "epoch": 0.1156, + "grad_norm": 2.609375, + "grad_norm_var": 0.021239217122395834, + "learning_rate": 0.0001, + "loss": 4.5273, + "loss/crossentropy": 2.2274389266967773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25618284940719604, + "step": 5780 + }, + { + "epoch": 0.11564, + "grad_norm": 2.203125, + "grad_norm_var": 0.022184244791666665, + "learning_rate": 0.0001, + "loss": 4.4888, + "loss/crossentropy": 2.063184678554535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606099396944046, + "step": 5782 + }, + { + "epoch": 0.11568, + "grad_norm": 2.203125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 4.2333, + "loss/crossentropy": 2.093947410583496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23588553071022034, + "step": 5784 + }, + { + "epoch": 0.11572, + "grad_norm": 2.015625, + "grad_norm_var": 0.0246490478515625, + "learning_rate": 0.0001, + "loss": 4.426, + "loss/crossentropy": 2.1599318981170654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20322780311107635, + "step": 5786 + }, + { + "epoch": 0.11576, + "grad_norm": 2.40625, + "grad_norm_var": 0.02506103515625, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 2.274693012237549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2704206556081772, + "step": 5788 + }, + { + "epoch": 0.1158, + "grad_norm": 2.421875, + "grad_norm_var": 0.0222076416015625, + "learning_rate": 0.0001, + "loss": 4.6755, + "loss/crossentropy": 1.9712103009223938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28300437331199646, + "step": 5790 + }, + { + "epoch": 0.11584, + "grad_norm": 2.09375, + "grad_norm_var": 0.024344889322916667, + "learning_rate": 0.0001, + "loss": 4.2035, + "loss/crossentropy": 2.027747690677643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419110268354416, + "step": 5792 + }, + { + "epoch": 0.11588, + "grad_norm": 2.34375, + "grad_norm_var": 0.024251302083333332, + "learning_rate": 0.0001, + "loss": 4.5614, + "loss/crossentropy": 2.162364959716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2522001415491104, + "step": 5794 + }, + { + "epoch": 0.11592, + "grad_norm": 2.28125, + "grad_norm_var": 0.0166656494140625, + "learning_rate": 0.0001, + "loss": 4.4711, + "loss/crossentropy": 2.558881998062134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26168718934059143, + "step": 5796 + }, + { + "epoch": 0.11596, + "grad_norm": 2.125, + "grad_norm_var": 0.0162994384765625, + "learning_rate": 0.0001, + "loss": 4.4108, + "loss/crossentropy": 2.1027071475982666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24049362540245056, + "step": 5798 + }, + { + "epoch": 0.116, + "grad_norm": 2.234375, + "grad_norm_var": 0.015034993489583334, + "learning_rate": 0.0001, + "loss": 4.6059, + "loss/crossentropy": 2.488932490348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697457820177078, + "step": 5800 + }, + { + "epoch": 0.11604, + "grad_norm": 2.375, + "grad_norm_var": 0.0121734619140625, + "learning_rate": 0.0001, + "loss": 4.6706, + "loss/crossentropy": 2.0441418886184692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23980429768562317, + "step": 5802 + }, + { + "epoch": 0.11608, + "grad_norm": 2.375, + "grad_norm_var": 0.0107421875, + "learning_rate": 0.0001, + "loss": 4.8471, + "loss/crossentropy": 2.2873395681381226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3105090409517288, + "step": 5804 + }, + { + "epoch": 0.11612, + "grad_norm": 2.265625, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 4.5388, + "loss/crossentropy": 1.8773444890975952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640414953231812, + "step": 5806 + }, + { + "epoch": 0.11616, + "grad_norm": 2.375, + "grad_norm_var": 0.007502237955729167, + "learning_rate": 0.0001, + "loss": 4.7338, + "loss/crossentropy": 2.3736027479171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2679155319929123, + "step": 5808 + }, + { + "epoch": 0.1162, + "grad_norm": 2.28125, + "grad_norm_var": 0.010676066080729166, + "learning_rate": 0.0001, + "loss": 4.0496, + "loss/crossentropy": 2.071690082550049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21868111193180084, + "step": 5810 + }, + { + "epoch": 0.11624, + "grad_norm": 2.390625, + "grad_norm_var": 0.015425618489583333, + "learning_rate": 0.0001, + "loss": 4.4029, + "loss/crossentropy": 2.0531184673309326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24457989633083344, + "step": 5812 + }, + { + "epoch": 0.11628, + "grad_norm": 2.3125, + "grad_norm_var": 0.015718587239583335, + "learning_rate": 0.0001, + "loss": 4.6511, + "loss/crossentropy": 2.1161271929740906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22809523344039917, + "step": 5814 + }, + { + "epoch": 0.11632, + "grad_norm": 2.046875, + "grad_norm_var": 0.0196929931640625, + "learning_rate": 0.0001, + "loss": 4.2736, + "loss/crossentropy": 1.6557151675224304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20782940834760666, + "step": 5816 + }, + { + "epoch": 0.11636, + "grad_norm": 2.3125, + "grad_norm_var": 0.03704020182291667, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 2.2499040365219116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713513821363449, + "step": 5818 + }, + { + "epoch": 0.1164, + "grad_norm": 2.390625, + "grad_norm_var": 0.03723958333333333, + "learning_rate": 0.0001, + "loss": 4.6417, + "loss/crossentropy": 2.3616446256637573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2613145411014557, + "step": 5820 + }, + { + "epoch": 0.11644, + "grad_norm": 2.265625, + "grad_norm_var": 0.0353515625, + "learning_rate": 0.0001, + "loss": 4.3744, + "loss/crossentropy": 2.0932790637016296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23843754082918167, + "step": 5822 + }, + { + "epoch": 0.11648, + "grad_norm": 2.25, + "grad_norm_var": 0.03540751139322917, + "learning_rate": 0.0001, + "loss": 4.4089, + "loss/crossentropy": 2.128177046775818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26579540967941284, + "step": 5824 + }, + { + "epoch": 0.11652, + "grad_norm": 2.296875, + "grad_norm_var": 0.031281534830729166, + "learning_rate": 0.0001, + "loss": 4.6942, + "loss/crossentropy": 2.332372784614563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27953268587589264, + "step": 5826 + }, + { + "epoch": 0.11656, + "grad_norm": 2.1875, + "grad_norm_var": 0.0263092041015625, + "learning_rate": 0.0001, + "loss": 4.4615, + "loss/crossentropy": 2.360959053039551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26249848306179047, + "step": 5828 + }, + { + "epoch": 0.1166, + "grad_norm": 2.0, + "grad_norm_var": 0.030745442708333334, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 1.9800339341163635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22766301035881042, + "step": 5830 + }, + { + "epoch": 0.11664, + "grad_norm": 2.390625, + "grad_norm_var": 0.029442342122395833, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 2.082980155944824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368428260087967, + "step": 5832 + }, + { + "epoch": 0.11668, + "grad_norm": 2.34375, + "grad_norm_var": 0.011986287434895833, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 2.093027710914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21121951937675476, + "step": 5834 + }, + { + "epoch": 0.11672, + "grad_norm": 2.296875, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 4.5008, + "loss/crossentropy": 2.1329175233840942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25303877890110016, + "step": 5836 + }, + { + "epoch": 0.11676, + "grad_norm": 2.21875, + "grad_norm_var": 0.022932942708333334, + "learning_rate": 0.0001, + "loss": 4.5174, + "loss/crossentropy": 1.79305762052536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20448636263608932, + "step": 5838 + }, + { + "epoch": 0.1168, + "grad_norm": 2.28125, + "grad_norm_var": 0.024933878580729166, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 1.957836627960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22745755314826965, + "step": 5840 + }, + { + "epoch": 0.11684, + "grad_norm": 2.25, + "grad_norm_var": 0.025007120768229165, + "learning_rate": 0.0001, + "loss": 4.3773, + "loss/crossentropy": 2.1167174577713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437898963689804, + "step": 5842 + }, + { + "epoch": 0.11688, + "grad_norm": 2.1875, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 4.4856, + "loss/crossentropy": 2.3288447856903076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355937883257866, + "step": 5844 + }, + { + "epoch": 0.11692, + "grad_norm": 2.125, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 4.4774, + "loss/crossentropy": 2.2526416778564453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24917340278625488, + "step": 5846 + }, + { + "epoch": 0.11696, + "grad_norm": 2.25, + "grad_norm_var": 0.0198883056640625, + "learning_rate": 0.0001, + "loss": 4.391, + "loss/crossentropy": 2.194224774837494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2625589966773987, + "step": 5848 + }, + { + "epoch": 0.117, + "grad_norm": 2.34375, + "grad_norm_var": 0.020116170247395832, + "learning_rate": 0.0001, + "loss": 4.4955, + "loss/crossentropy": 2.0156877040863037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22422078251838684, + "step": 5850 + }, + { + "epoch": 0.11704, + "grad_norm": 2.21875, + "grad_norm_var": 0.01962890625, + "learning_rate": 0.0001, + "loss": 4.6231, + "loss/crossentropy": 2.2480785846710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24705884605646133, + "step": 5852 + }, + { + "epoch": 0.11708, + "grad_norm": 2.25, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 4.4817, + "loss/crossentropy": 2.0915993452072144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24669666588306427, + "step": 5854 + }, + { + "epoch": 0.11712, + "grad_norm": 2.171875, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 4.4444, + "loss/crossentropy": 2.1283876299858093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24228712916374207, + "step": 5856 + }, + { + "epoch": 0.11716, + "grad_norm": 2.625, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.028861939907074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2281404584646225, + "step": 5858 + }, + { + "epoch": 0.1172, + "grad_norm": 2.15625, + "grad_norm_var": 0.015852864583333334, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.154610753059387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24233703315258026, + "step": 5860 + }, + { + "epoch": 0.11724, + "grad_norm": 2.421875, + "grad_norm_var": 0.015787760416666668, + "learning_rate": 0.0001, + "loss": 4.5059, + "loss/crossentropy": 1.9396602511405945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22303076088428497, + "step": 5862 + }, + { + "epoch": 0.11728, + "grad_norm": 2.578125, + "grad_norm_var": 0.021061197916666666, + "learning_rate": 0.0001, + "loss": 4.7447, + "loss/crossentropy": 2.053893029689789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26897912472486496, + "step": 5864 + }, + { + "epoch": 0.11732, + "grad_norm": 2.0625, + "grad_norm_var": 0.0263336181640625, + "learning_rate": 0.0001, + "loss": 4.3529, + "loss/crossentropy": 1.990949273109436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22180908918380737, + "step": 5866 + }, + { + "epoch": 0.11736, + "grad_norm": 2.453125, + "grad_norm_var": 0.030085245768229168, + "learning_rate": 0.0001, + "loss": 4.6434, + "loss/crossentropy": 2.0929455161094666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24055248498916626, + "step": 5868 + }, + { + "epoch": 0.1174, + "grad_norm": 2.46875, + "grad_norm_var": 0.03168843587239583, + "learning_rate": 0.0001, + "loss": 4.5465, + "loss/crossentropy": 2.1476733684539795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2416895627975464, + "step": 5870 + }, + { + "epoch": 0.11744, + "grad_norm": 2.21875, + "grad_norm_var": 0.027962239583333333, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 1.74330335855484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001302987337112, + "step": 5872 + }, + { + "epoch": 0.11748, + "grad_norm": 2.265625, + "grad_norm_var": 0.021507771809895833, + "learning_rate": 0.0001, + "loss": 4.2725, + "loss/crossentropy": 1.8903921246528625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950766444206238, + "step": 5874 + }, + { + "epoch": 0.11752, + "grad_norm": 2.140625, + "grad_norm_var": 0.022069295247395832, + "learning_rate": 0.0001, + "loss": 4.3673, + "loss/crossentropy": 1.798406720161438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20271167159080505, + "step": 5876 + }, + { + "epoch": 0.11756, + "grad_norm": 2.25, + "grad_norm_var": 0.021089680989583335, + "learning_rate": 0.0001, + "loss": 4.7128, + "loss/crossentropy": 1.9651959538459778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256517618894577, + "step": 5878 + }, + { + "epoch": 0.1176, + "grad_norm": 2.109375, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 4.4158, + "loss/crossentropy": 2.039245307445526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23225219547748566, + "step": 5880 + }, + { + "epoch": 0.11764, + "grad_norm": 2.40625, + "grad_norm_var": 0.013329060872395833, + "learning_rate": 0.0001, + "loss": 4.8741, + "loss/crossentropy": 2.500381350517273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31757834553718567, + "step": 5882 + }, + { + "epoch": 0.11768, + "grad_norm": 2.59375, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 4.6463, + "loss/crossentropy": 2.0540305972099304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2474198415875435, + "step": 5884 + }, + { + "epoch": 0.11772, + "grad_norm": 2.359375, + "grad_norm_var": 0.016462198893229165, + "learning_rate": 0.0001, + "loss": 4.5047, + "loss/crossentropy": 2.072624385356903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2482014298439026, + "step": 5886 + }, + { + "epoch": 0.11776, + "grad_norm": 2.328125, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.153423309326172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234448604285717, + "step": 5888 + }, + { + "epoch": 0.1178, + "grad_norm": 2.5625, + "grad_norm_var": 0.025325520833333334, + "learning_rate": 0.0001, + "loss": 4.2698, + "loss/crossentropy": 1.5941627621650696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982372760772705, + "step": 5890 + }, + { + "epoch": 0.11784, + "grad_norm": 2.390625, + "grad_norm_var": 0.025886027018229167, + "learning_rate": 0.0001, + "loss": 4.5168, + "loss/crossentropy": 2.2858930826187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24936140328645706, + "step": 5892 + }, + { + "epoch": 0.11788, + "grad_norm": 2.296875, + "grad_norm_var": 0.028076171875, + "learning_rate": 0.0001, + "loss": 4.536, + "loss/crossentropy": 2.333559274673462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26152122020721436, + "step": 5894 + }, + { + "epoch": 0.11792, + "grad_norm": 2.296875, + "grad_norm_var": 0.027106730143229167, + "learning_rate": 0.0001, + "loss": 4.412, + "loss/crossentropy": 2.134613037109375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929091334342957, + "step": 5896 + }, + { + "epoch": 0.11796, + "grad_norm": 2.296875, + "grad_norm_var": 0.026423136393229168, + "learning_rate": 0.0001, + "loss": 4.4052, + "loss/crossentropy": 2.179764688014984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24523546546697617, + "step": 5898 + }, + { + "epoch": 0.118, + "grad_norm": 2.109375, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.3431, + "loss/crossentropy": 2.1184223294258118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21715932339429855, + "step": 5900 + }, + { + "epoch": 0.11804, + "grad_norm": 2.078125, + "grad_norm_var": 0.021882120768229166, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.024593770503998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23016374558210373, + "step": 5902 + }, + { + "epoch": 0.11808, + "grad_norm": 2.265625, + "grad_norm_var": 0.0204986572265625, + "learning_rate": 0.0001, + "loss": 4.3646, + "loss/crossentropy": 2.077186107635498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2538782134652138, + "step": 5904 + }, + { + "epoch": 0.11812, + "grad_norm": 2.359375, + "grad_norm_var": 0.013190714518229167, + "learning_rate": 0.0001, + "loss": 4.9135, + "loss/crossentropy": 2.2535945177078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25025132298469543, + "step": 5906 + }, + { + "epoch": 0.11816, + "grad_norm": 2.21875, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.5512, + "loss/crossentropy": 2.5321284532546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729681059718132, + "step": 5908 + }, + { + "epoch": 0.1182, + "grad_norm": 2.234375, + "grad_norm_var": 0.006403605143229167, + "learning_rate": 0.0001, + "loss": 4.332, + "loss/crossentropy": 2.043885111808777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23721691966056824, + "step": 5910 + }, + { + "epoch": 0.11824, + "grad_norm": 2.296875, + "grad_norm_var": 0.005204264322916667, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.1343676447868347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23325734585523605, + "step": 5912 + }, + { + "epoch": 0.11828, + "grad_norm": 2.03125, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.1164477467536926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24526448547840118, + "step": 5914 + }, + { + "epoch": 0.11832, + "grad_norm": 2.1875, + "grad_norm_var": 0.0064737955729166664, + "learning_rate": 0.0001, + "loss": 4.5162, + "loss/crossentropy": 2.268216848373413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24837128818035126, + "step": 5916 + }, + { + "epoch": 0.11836, + "grad_norm": 2.25, + "grad_norm_var": 0.0057037353515625, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.074695885181427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24956009536981583, + "step": 5918 + }, + { + "epoch": 0.1184, + "grad_norm": 2.203125, + "grad_norm_var": 0.007372029622395833, + "learning_rate": 0.0001, + "loss": 4.3165, + "loss/crossentropy": 1.9369722604751587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316487580537796, + "step": 5920 + }, + { + "epoch": 0.11844, + "grad_norm": 2.15625, + "grad_norm_var": 0.00611572265625, + "learning_rate": 0.0001, + "loss": 4.2789, + "loss/crossentropy": 2.2189531326293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23884039372205734, + "step": 5922 + }, + { + "epoch": 0.11848, + "grad_norm": 2.34375, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 4.5965, + "loss/crossentropy": 2.3833028078079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2504591718316078, + "step": 5924 + }, + { + "epoch": 0.11852, + "grad_norm": 2.234375, + "grad_norm_var": 0.007005818684895833, + "learning_rate": 0.0001, + "loss": 4.6346, + "loss/crossentropy": 2.0443845987319946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23422807455062866, + "step": 5926 + }, + { + "epoch": 0.11856, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.9061, + "loss/crossentropy": 2.223625063896179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24985776841640472, + "step": 5928 + }, + { + "epoch": 0.1186, + "grad_norm": 2.171875, + "grad_norm_var": 0.005126953125, + "learning_rate": 0.0001, + "loss": 4.493, + "loss/crossentropy": 2.1353545784950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846727818250656, + "step": 5930 + }, + { + "epoch": 0.11864, + "grad_norm": 2.203125, + "grad_norm_var": 0.005159505208333333, + "learning_rate": 0.0001, + "loss": 4.5786, + "loss/crossentropy": 2.0497827529907227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21461189538240433, + "step": 5932 + }, + { + "epoch": 0.11868, + "grad_norm": 2.28125, + "grad_norm_var": 0.004715983072916667, + "learning_rate": 0.0001, + "loss": 4.5071, + "loss/crossentropy": 1.9257362484931946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216043472290039, + "step": 5934 + }, + { + "epoch": 0.11872, + "grad_norm": 2.171875, + "grad_norm_var": 0.0034006754557291668, + "learning_rate": 0.0001, + "loss": 4.4212, + "loss/crossentropy": 2.0458216071128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23162957280874252, + "step": 5936 + }, + { + "epoch": 0.11876, + "grad_norm": 2.1875, + "grad_norm_var": 0.0033274332682291666, + "learning_rate": 0.0001, + "loss": 4.6109, + "loss/crossentropy": 2.0786396861076355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22303655743598938, + "step": 5938 + }, + { + "epoch": 0.1188, + "grad_norm": 2.546875, + "grad_norm_var": 0.04327799479166667, + "learning_rate": 0.0001, + "loss": 4.4861, + "loss/crossentropy": 1.9800568222999573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22236331552267075, + "step": 5940 + }, + { + "epoch": 0.11884, + "grad_norm": 2.0625, + "grad_norm_var": 0.0502349853515625, + "learning_rate": 0.0001, + "loss": 4.056, + "loss/crossentropy": 1.882250189781189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21664132922887802, + "step": 5942 + }, + { + "epoch": 0.11888, + "grad_norm": 2.203125, + "grad_norm_var": 0.05025634765625, + "learning_rate": 0.0001, + "loss": 4.5965, + "loss/crossentropy": 2.3682695627212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2698971778154373, + "step": 5944 + }, + { + "epoch": 0.11892, + "grad_norm": 2.265625, + "grad_norm_var": 0.0505523681640625, + "learning_rate": 0.0001, + "loss": 4.6364, + "loss/crossentropy": 2.225574493408203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2642917186021805, + "step": 5946 + }, + { + "epoch": 0.11896, + "grad_norm": 2.1875, + "grad_norm_var": 0.05032145182291667, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 1.8634169697761536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170402556657791, + "step": 5948 + }, + { + "epoch": 0.119, + "grad_norm": 2.3125, + "grad_norm_var": 0.05123291015625, + "learning_rate": 0.0001, + "loss": 5.0392, + "loss/crossentropy": 2.4500025510787964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27845144271850586, + "step": 5950 + }, + { + "epoch": 0.11904, + "grad_norm": 2.046875, + "grad_norm_var": 0.054108683268229166, + "learning_rate": 0.0001, + "loss": 4.1179, + "loss/crossentropy": 2.1532052755355835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24952176213264465, + "step": 5952 + }, + { + "epoch": 0.11908, + "grad_norm": 2.140625, + "grad_norm_var": 0.0561676025390625, + "learning_rate": 0.0001, + "loss": 4.2592, + "loss/crossentropy": 2.066560387611389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356283888220787, + "step": 5954 + }, + { + "epoch": 0.11912, + "grad_norm": 2.265625, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.6187, + "loss/crossentropy": 1.9679089784622192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23729706555604935, + "step": 5956 + }, + { + "epoch": 0.11916, + "grad_norm": 2.234375, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.3463, + "loss/crossentropy": 2.3490394353866577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26400282233953476, + "step": 5958 + }, + { + "epoch": 0.1192, + "grad_norm": 2.296875, + "grad_norm_var": 0.008576456705729167, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 2.145757555961609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23329483717679977, + "step": 5960 + }, + { + "epoch": 0.11924, + "grad_norm": 2.1875, + "grad_norm_var": 0.005399576822916667, + "learning_rate": 0.0001, + "loss": 4.2376, + "loss/crossentropy": 2.0764617919921875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22371648252010345, + "step": 5962 + }, + { + "epoch": 0.11928, + "grad_norm": 2.46875, + "grad_norm_var": 0.010380045572916666, + "learning_rate": 0.0001, + "loss": 4.8106, + "loss/crossentropy": 2.199389696121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210449516773224, + "step": 5964 + }, + { + "epoch": 0.11932, + "grad_norm": 2.09375, + "grad_norm_var": 0.01064453125, + "learning_rate": 0.0001, + "loss": 4.3515, + "loss/crossentropy": 1.9008439183235168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474837511777878, + "step": 5966 + }, + { + "epoch": 0.11936, + "grad_norm": 2.0, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 4.4603, + "loss/crossentropy": 2.1779539585113525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23485051095485687, + "step": 5968 + }, + { + "epoch": 0.1194, + "grad_norm": 2.15625, + "grad_norm_var": 0.014176432291666667, + "learning_rate": 0.0001, + "loss": 4.428, + "loss/crossentropy": 2.267147421836853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24924689531326294, + "step": 5970 + }, + { + "epoch": 0.11944, + "grad_norm": 2.1875, + "grad_norm_var": 0.016292317708333334, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 2.0618110299110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24352984875440598, + "step": 5972 + }, + { + "epoch": 0.11948, + "grad_norm": 2.34375, + "grad_norm_var": 0.023758951822916666, + "learning_rate": 0.0001, + "loss": 4.5802, + "loss/crossentropy": 2.1419676542282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23680058121681213, + "step": 5974 + }, + { + "epoch": 0.11952, + "grad_norm": 2.265625, + "grad_norm_var": 0.023368326822916667, + "learning_rate": 0.0001, + "loss": 4.6554, + "loss/crossentropy": 2.0376622080802917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22515031695365906, + "step": 5976 + }, + { + "epoch": 0.11956, + "grad_norm": 2.46875, + "grad_norm_var": 0.025739542643229165, + "learning_rate": 0.0001, + "loss": 4.4213, + "loss/crossentropy": 1.7675965428352356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219834603369236, + "step": 5978 + }, + { + "epoch": 0.1196, + "grad_norm": 2.1875, + "grad_norm_var": 0.024030558268229165, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 1.8919905424118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2256327122449875, + "step": 5980 + }, + { + "epoch": 0.11964, + "grad_norm": 2.15625, + "grad_norm_var": 0.0228424072265625, + "learning_rate": 0.0001, + "loss": 4.6148, + "loss/crossentropy": 2.287980794906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25571687519550323, + "step": 5982 + }, + { + "epoch": 0.11968, + "grad_norm": 2.15625, + "grad_norm_var": 0.0169830322265625, + "learning_rate": 0.0001, + "loss": 4.3527, + "loss/crossentropy": 2.1030094027519226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23401429504156113, + "step": 5984 + }, + { + "epoch": 0.11972, + "grad_norm": 2.421875, + "grad_norm_var": 0.0166412353515625, + "learning_rate": 0.0001, + "loss": 4.8857, + "loss/crossentropy": 2.469533920288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548582851886749, + "step": 5986 + }, + { + "epoch": 0.11976, + "grad_norm": 2.328125, + "grad_norm_var": 0.018047841389973958, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.140324354171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22603372484445572, + "step": 5988 + }, + { + "epoch": 0.1198, + "grad_norm": 2.140625, + "grad_norm_var": 0.015773264567057292, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.1848061084747314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22712922096252441, + "step": 5990 + }, + { + "epoch": 0.11984, + "grad_norm": 2.34375, + "grad_norm_var": 0.015380605061848959, + "learning_rate": 0.0001, + "loss": 4.6457, + "loss/crossentropy": 2.2872358560562134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352529615163803, + "step": 5992 + }, + { + "epoch": 0.11988, + "grad_norm": 2.25, + "grad_norm_var": 0.012318674723307292, + "learning_rate": 0.0001, + "loss": 4.6136, + "loss/crossentropy": 2.082811713218689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24107889831066132, + "step": 5994 + }, + { + "epoch": 0.11992, + "grad_norm": 2.109375, + "grad_norm_var": 0.013602447509765626, + "learning_rate": 0.0001, + "loss": 4.2089, + "loss/crossentropy": 2.23664391040802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23429522663354874, + "step": 5996 + }, + { + "epoch": 0.11996, + "grad_norm": 2.265625, + "grad_norm_var": 0.013242340087890625, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.451270341873169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2541816979646683, + "step": 5998 + }, + { + "epoch": 0.12, + "grad_norm": 2.109375, + "grad_norm_var": 0.013561757405598958, + "learning_rate": 0.0001, + "loss": 4.5559, + "loss/crossentropy": 2.1744157671928406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22349119931459427, + "step": 6000 + }, + { + "epoch": 0.12004, + "grad_norm": 2.09375, + "grad_norm_var": 0.013171132405598958, + "learning_rate": 0.0001, + "loss": 4.5532, + "loss/crossentropy": 2.0316836833953857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259635865688324, + "step": 6002 + }, + { + "epoch": 0.12008, + "grad_norm": 2.609375, + "grad_norm_var": 0.022337849934895834, + "learning_rate": 0.0001, + "loss": 4.3674, + "loss/crossentropy": 2.0989437103271484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25056491047143936, + "step": 6004 + }, + { + "epoch": 0.12012, + "grad_norm": 2.5625, + "grad_norm_var": 0.02633056640625, + "learning_rate": 0.0001, + "loss": 4.0883, + "loss/crossentropy": 1.9609100818634033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22969383746385574, + "step": 6006 + }, + { + "epoch": 0.12016, + "grad_norm": 2.15625, + "grad_norm_var": 0.027665201822916666, + "learning_rate": 0.0001, + "loss": 4.2094, + "loss/crossentropy": 2.077883243560791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22850078344345093, + "step": 6008 + }, + { + "epoch": 0.1202, + "grad_norm": 2.34375, + "grad_norm_var": 1.6465779622395833, + "learning_rate": 0.0001, + "loss": 4.5659, + "loss/crossentropy": 1.7967591285705566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126556932926178, + "step": 6010 + }, + { + "epoch": 0.12024, + "grad_norm": 2.234375, + "grad_norm_var": 1.62222900390625, + "learning_rate": 0.0001, + "loss": 4.4018, + "loss/crossentropy": 1.6516226530075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188895046710968, + "step": 6012 + }, + { + "epoch": 0.12028, + "grad_norm": 2.21875, + "grad_norm_var": 1.62984619140625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.161388635635376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23447516560554504, + "step": 6014 + }, + { + "epoch": 0.12032, + "grad_norm": 2.328125, + "grad_norm_var": 1.6235636393229167, + "learning_rate": 0.0001, + "loss": 4.5112, + "loss/crossentropy": 1.9205461740493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797179013490677, + "step": 6016 + }, + { + "epoch": 0.12036, + "grad_norm": 2.3125, + "grad_norm_var": 1.616844685872396, + "learning_rate": 0.0001, + "loss": 4.8019, + "loss/crossentropy": 2.025223135948181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25120896100997925, + "step": 6018 + }, + { + "epoch": 0.1204, + "grad_norm": 2.25, + "grad_norm_var": 1.6347005208333334, + "learning_rate": 0.0001, + "loss": 4.4819, + "loss/crossentropy": 1.957942008972168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22228525578975677, + "step": 6020 + }, + { + "epoch": 0.12044, + "grad_norm": 2.171875, + "grad_norm_var": 1.674201456705729, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.3325445652008057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26250994950532913, + "step": 6022 + }, + { + "epoch": 0.12048, + "grad_norm": 2.03125, + "grad_norm_var": 1.6957997639973958, + "learning_rate": 0.0001, + "loss": 4.264, + "loss/crossentropy": 2.0049667954444885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22990728169679642, + "step": 6024 + }, + { + "epoch": 0.12052, + "grad_norm": 2.265625, + "grad_norm_var": 0.08680013020833334, + "learning_rate": 0.0001, + "loss": 4.5996, + "loss/crossentropy": 2.0473387241363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23628919571638107, + "step": 6026 + }, + { + "epoch": 0.12056, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09107640584309896, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 2.013141930103302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21728236973285675, + "step": 6028 + }, + { + "epoch": 0.1206, + "grad_norm": 2.109375, + "grad_norm_var": 0.09145278930664062, + "learning_rate": 0.0001, + "loss": 4.3869, + "loss/crossentropy": 2.1269132494926453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25139085948467255, + "step": 6030 + }, + { + "epoch": 0.12064, + "grad_norm": 2.234375, + "grad_norm_var": 0.09058405558268229, + "learning_rate": 0.0001, + "loss": 4.5568, + "loss/crossentropy": 2.5267512798309326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27998340129852295, + "step": 6032 + }, + { + "epoch": 0.12068, + "grad_norm": 2.78125, + "grad_norm_var": 0.10746027628580729, + "learning_rate": 0.0001, + "loss": 4.2615, + "loss/crossentropy": 1.8502249717712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22863731533288956, + "step": 6034 + }, + { + "epoch": 0.12072, + "grad_norm": 2.34375, + "grad_norm_var": 0.10850601196289063, + "learning_rate": 0.0001, + "loss": 4.2197, + "loss/crossentropy": 1.7754456400871277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809141665697098, + "step": 6036 + }, + { + "epoch": 0.12076, + "grad_norm": 2.203125, + "grad_norm_var": 0.033760325113932295, + "learning_rate": 0.0001, + "loss": 4.4714, + "loss/crossentropy": 1.9596920609474182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111937776207924, + "step": 6038 + }, + { + "epoch": 0.1208, + "grad_norm": 2.203125, + "grad_norm_var": 0.02995580037434896, + "learning_rate": 0.0001, + "loss": 4.3079, + "loss/crossentropy": 1.888563334941864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698038280010223, + "step": 6040 + }, + { + "epoch": 0.12084, + "grad_norm": 2.296875, + "grad_norm_var": 0.031404368082682294, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.1646993160247803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22735021263360977, + "step": 6042 + }, + { + "epoch": 0.12088, + "grad_norm": 2.1875, + "grad_norm_var": 0.0258941650390625, + "learning_rate": 0.0001, + "loss": 4.525, + "loss/crossentropy": 2.0790343284606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193024456501007, + "step": 6044 + }, + { + "epoch": 0.12092, + "grad_norm": 2.109375, + "grad_norm_var": 0.025211588541666666, + "learning_rate": 0.0001, + "loss": 4.3538, + "loss/crossentropy": 2.2733768224716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.249516561627388, + "step": 6046 + }, + { + "epoch": 0.12096, + "grad_norm": 2.21875, + "grad_norm_var": 0.02431640625, + "learning_rate": 0.0001, + "loss": 4.7286, + "loss/crossentropy": 2.5003533363342285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22928690910339355, + "step": 6048 + }, + { + "epoch": 0.121, + "grad_norm": 2.453125, + "grad_norm_var": 0.03629150390625, + "learning_rate": 0.0001, + "loss": 4.6269, + "loss/crossentropy": 1.9606900215148926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152085080742836, + "step": 6050 + }, + { + "epoch": 0.12104, + "grad_norm": 2.390625, + "grad_norm_var": 0.0384765625, + "learning_rate": 0.0001, + "loss": 4.5724, + "loss/crossentropy": 2.266395926475525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26055608689785004, + "step": 6052 + }, + { + "epoch": 0.12108, + "grad_norm": 2.296875, + "grad_norm_var": 0.037821451822916664, + "learning_rate": 0.0001, + "loss": 4.5841, + "loss/crossentropy": 2.1753041744232178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327822595834732, + "step": 6054 + }, + { + "epoch": 0.12112, + "grad_norm": 2.15625, + "grad_norm_var": 0.041304524739583334, + "learning_rate": 0.0001, + "loss": 4.2198, + "loss/crossentropy": 1.775630235671997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073052078485489, + "step": 6056 + }, + { + "epoch": 0.12116, + "grad_norm": 2.21875, + "grad_norm_var": 0.03975321451822917, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 2.415855050086975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.270721398293972, + "step": 6058 + }, + { + "epoch": 0.1212, + "grad_norm": 2.34375, + "grad_norm_var": 0.039453125, + "learning_rate": 0.0001, + "loss": 4.6647, + "loss/crossentropy": 2.2122162580490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24723708629608154, + "step": 6060 + }, + { + "epoch": 0.12124, + "grad_norm": 2.375, + "grad_norm_var": 0.041792805989583334, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 1.9540830850601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21385761350393295, + "step": 6062 + }, + { + "epoch": 0.12128, + "grad_norm": 2.34375, + "grad_norm_var": 0.0413726806640625, + "learning_rate": 0.0001, + "loss": 4.979, + "loss/crossentropy": 1.923313319683075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22986505925655365, + "step": 6064 + }, + { + "epoch": 0.12132, + "grad_norm": 3.03125, + "grad_norm_var": 0.05182291666666667, + "learning_rate": 0.0001, + "loss": 4.6895, + "loss/crossentropy": 2.4042444229125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30044034123420715, + "step": 6066 + }, + { + "epoch": 0.12136, + "grad_norm": 2.25, + "grad_norm_var": 0.049225870768229166, + "learning_rate": 0.0001, + "loss": 4.5857, + "loss/crossentropy": 2.2610549926757812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628016769886017, + "step": 6068 + }, + { + "epoch": 0.1214, + "grad_norm": 2.140625, + "grad_norm_var": 0.0513671875, + "learning_rate": 0.0001, + "loss": 4.5024, + "loss/crossentropy": 1.9100797176361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22663169354200363, + "step": 6070 + }, + { + "epoch": 0.12144, + "grad_norm": 2.34375, + "grad_norm_var": 0.04830729166666667, + "learning_rate": 0.0001, + "loss": 4.5916, + "loss/crossentropy": 2.3971651792526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26831263303756714, + "step": 6072 + }, + { + "epoch": 0.12148, + "grad_norm": 2.453125, + "grad_norm_var": 0.0492095947265625, + "learning_rate": 0.0001, + "loss": 4.4553, + "loss/crossentropy": 2.106821596622467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23786011338233948, + "step": 6074 + }, + { + "epoch": 0.12152, + "grad_norm": 2.125, + "grad_norm_var": 0.052469889322916664, + "learning_rate": 0.0001, + "loss": 4.4225, + "loss/crossentropy": 2.1920535564422607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23812127113342285, + "step": 6076 + }, + { + "epoch": 0.12156, + "grad_norm": 2.203125, + "grad_norm_var": 0.049088541666666666, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.3014419078826904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2380141019821167, + "step": 6078 + }, + { + "epoch": 0.1216, + "grad_norm": 2.265625, + "grad_norm_var": 0.04903055826822917, + "learning_rate": 0.0001, + "loss": 4.4558, + "loss/crossentropy": 2.1721781492233276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24158813059329987, + "step": 6080 + }, + { + "epoch": 0.12164, + "grad_norm": 2.21875, + "grad_norm_var": 0.013232421875, + "learning_rate": 0.0001, + "loss": 4.5192, + "loss/crossentropy": 2.1230934858322144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23764102160930634, + "step": 6082 + }, + { + "epoch": 0.12168, + "grad_norm": 2.296875, + "grad_norm_var": 0.0166412353515625, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 1.9570311307907104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24687693268060684, + "step": 6084 + }, + { + "epoch": 0.12172, + "grad_norm": 2.15625, + "grad_norm_var": 0.014997355143229167, + "learning_rate": 0.0001, + "loss": 4.2063, + "loss/crossentropy": 2.272383213043213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2567671462893486, + "step": 6086 + }, + { + "epoch": 0.12176, + "grad_norm": 2.1875, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.645, + "loss/crossentropy": 2.216492176055908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24377377331256866, + "step": 6088 + }, + { + "epoch": 0.1218, + "grad_norm": 2.140625, + "grad_norm_var": 0.007201131184895833, + "learning_rate": 0.0001, + "loss": 4.5956, + "loss/crossentropy": 2.25177001953125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23176074773073196, + "step": 6090 + }, + { + "epoch": 0.12184, + "grad_norm": 2.21875, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.2452452182769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24528006464242935, + "step": 6092 + }, + { + "epoch": 0.12188, + "grad_norm": 2.25, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3548, + "loss/crossentropy": 2.078445553779602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276337668299675, + "step": 6094 + }, + { + "epoch": 0.12192, + "grad_norm": 2.125, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 4.4435, + "loss/crossentropy": 2.2938032150268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2784377336502075, + "step": 6096 + }, + { + "epoch": 0.12196, + "grad_norm": 2.015625, + "grad_norm_var": 0.0136627197265625, + "learning_rate": 0.0001, + "loss": 4.1588, + "loss/crossentropy": 2.028180480003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23716723918914795, + "step": 6098 + }, + { + "epoch": 0.122, + "grad_norm": 2.609375, + "grad_norm_var": 0.020580037434895834, + "learning_rate": 0.0001, + "loss": 4.6345, + "loss/crossentropy": 2.4959323406219482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2510451450943947, + "step": 6100 + }, + { + "epoch": 0.12204, + "grad_norm": 2.21875, + "grad_norm_var": 0.023856608072916667, + "learning_rate": 0.0001, + "loss": 4.6639, + "loss/crossentropy": 2.186043620109558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23511512577533722, + "step": 6102 + }, + { + "epoch": 0.12208, + "grad_norm": 2.359375, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.6727, + "loss/crossentropy": 2.6631078720092773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25689610838890076, + "step": 6104 + }, + { + "epoch": 0.12212, + "grad_norm": 2.453125, + "grad_norm_var": 0.025609334309895832, + "learning_rate": 0.0001, + "loss": 4.7943, + "loss/crossentropy": 2.310486674308777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26937828958034515, + "step": 6106 + }, + { + "epoch": 0.12216, + "grad_norm": 2.359375, + "grad_norm_var": 0.0247711181640625, + "learning_rate": 0.0001, + "loss": 4.9892, + "loss/crossentropy": 2.2036240100860596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24968907237052917, + "step": 6108 + }, + { + "epoch": 0.1222, + "grad_norm": 2.984375, + "grad_norm_var": 0.058251953125, + "learning_rate": 0.0001, + "loss": 4.3527, + "loss/crossentropy": 1.9434874057769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23247701674699783, + "step": 6110 + }, + { + "epoch": 0.12224, + "grad_norm": 2.296875, + "grad_norm_var": 0.05579020182291667, + "learning_rate": 0.0001, + "loss": 4.4149, + "loss/crossentropy": 2.0525330305099487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24810528755187988, + "step": 6112 + }, + { + "epoch": 0.12228, + "grad_norm": 2.328125, + "grad_norm_var": 0.04482320149739583, + "learning_rate": 0.0001, + "loss": 4.5205, + "loss/crossentropy": 2.0085532665252686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514277398586273, + "step": 6114 + }, + { + "epoch": 0.12232, + "grad_norm": 2.328125, + "grad_norm_var": 0.04120686848958333, + "learning_rate": 0.0001, + "loss": 4.5745, + "loss/crossentropy": 1.7393967509269714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239851951599121, + "step": 6116 + }, + { + "epoch": 0.12236, + "grad_norm": 2.546875, + "grad_norm_var": 0.04327799479166667, + "learning_rate": 0.0001, + "loss": 4.7332, + "loss/crossentropy": 1.8714343905448914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22823140025138855, + "step": 6118 + }, + { + "epoch": 0.1224, + "grad_norm": 2.3125, + "grad_norm_var": 0.04192708333333333, + "learning_rate": 0.0001, + "loss": 4.6177, + "loss/crossentropy": 1.9353562593460083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22261886298656464, + "step": 6120 + }, + { + "epoch": 0.12244, + "grad_norm": 2.171875, + "grad_norm_var": 0.0469635009765625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.0409420132637024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319282591342926, + "step": 6122 + }, + { + "epoch": 0.12248, + "grad_norm": 2.125, + "grad_norm_var": 0.04882405598958333, + "learning_rate": 0.0001, + "loss": 4.4764, + "loss/crossentropy": 2.2461307048797607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25047267973423004, + "step": 6124 + }, + { + "epoch": 0.12252, + "grad_norm": 2.203125, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 4.4863, + "loss/crossentropy": 2.1060246229171753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24702349305152893, + "step": 6126 + }, + { + "epoch": 0.12256, + "grad_norm": 2.21875, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 3.9093, + "loss/crossentropy": 1.7481068968772888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19611438363790512, + "step": 6128 + }, + { + "epoch": 0.1226, + "grad_norm": 2.21875, + "grad_norm_var": 0.017329661051432292, + "learning_rate": 0.0001, + "loss": 4.5506, + "loss/crossentropy": 1.8341861963272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660528540611267, + "step": 6130 + }, + { + "epoch": 0.12264, + "grad_norm": 2.421875, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.3388, + "loss/crossentropy": 1.7332024574279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21615543216466904, + "step": 6132 + }, + { + "epoch": 0.12268, + "grad_norm": 2.078125, + "grad_norm_var": 0.013203684488932292, + "learning_rate": 0.0001, + "loss": 4.3027, + "loss/crossentropy": 1.8959643244743347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219951793551445, + "step": 6134 + }, + { + "epoch": 0.12272, + "grad_norm": 2.265625, + "grad_norm_var": 0.012601470947265625, + "learning_rate": 0.0001, + "loss": 4.6945, + "loss/crossentropy": 2.234723210334778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2342153787612915, + "step": 6136 + }, + { + "epoch": 0.12276, + "grad_norm": 2.3125, + "grad_norm_var": 0.011637115478515625, + "learning_rate": 0.0001, + "loss": 4.7394, + "loss/crossentropy": 2.2297592759132385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24097590148448944, + "step": 6138 + }, + { + "epoch": 0.1228, + "grad_norm": 2.40625, + "grad_norm_var": 0.013586171468098958, + "learning_rate": 0.0001, + "loss": 4.592, + "loss/crossentropy": 2.013442814350128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532622739672661, + "step": 6140 + }, + { + "epoch": 0.12284, + "grad_norm": 2.15625, + "grad_norm_var": 0.014833323160807292, + "learning_rate": 0.0001, + "loss": 4.417, + "loss/crossentropy": 2.3066656589508057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24480555206537247, + "step": 6142 + }, + { + "epoch": 0.12288, + "grad_norm": 2.4375, + "grad_norm_var": 0.014241536458333334, + "learning_rate": 0.0001, + "loss": 4.4156, + "loss/crossentropy": 1.9281310439109802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22363708168268204, + "step": 6144 + }, + { + "epoch": 0.12292, + "grad_norm": 2.078125, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 4.4314, + "loss/crossentropy": 1.965875267982483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22153983265161514, + "step": 6146 + }, + { + "epoch": 0.12296, + "grad_norm": 2.8125, + "grad_norm_var": 0.03536783854166667, + "learning_rate": 0.0001, + "loss": 4.6139, + "loss/crossentropy": 1.811126947402954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21779045462608337, + "step": 6148 + }, + { + "epoch": 0.123, + "grad_norm": 2.34375, + "grad_norm_var": 0.0311676025390625, + "learning_rate": 0.0001, + "loss": 4.4855, + "loss/crossentropy": 2.5114762783050537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542252689599991, + "step": 6150 + }, + { + "epoch": 0.12304, + "grad_norm": 2.296875, + "grad_norm_var": 0.028271484375, + "learning_rate": 0.0001, + "loss": 4.1269, + "loss/crossentropy": 1.8784565925598145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24637068808078766, + "step": 6152 + }, + { + "epoch": 0.12308, + "grad_norm": 2.234375, + "grad_norm_var": 0.028685506184895834, + "learning_rate": 0.0001, + "loss": 4.3377, + "loss/crossentropy": 1.6900760531425476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170872688293457, + "step": 6154 + }, + { + "epoch": 0.12312, + "grad_norm": 2.21875, + "grad_norm_var": 0.02867431640625, + "learning_rate": 0.0001, + "loss": 4.5705, + "loss/crossentropy": 2.413783550262451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2624947279691696, + "step": 6156 + }, + { + "epoch": 0.12316, + "grad_norm": 2.03125, + "grad_norm_var": 0.030126953125, + "learning_rate": 0.0001, + "loss": 4.3463, + "loss/crossentropy": 2.0598058104515076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071695551276207, + "step": 6158 + }, + { + "epoch": 0.1232, + "grad_norm": 2.375, + "grad_norm_var": 0.029781087239583334, + "learning_rate": 0.0001, + "loss": 4.6702, + "loss/crossentropy": 2.0407246947288513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24353720247745514, + "step": 6160 + }, + { + "epoch": 0.12324, + "grad_norm": 2.171875, + "grad_norm_var": 0.028450520833333333, + "learning_rate": 0.0001, + "loss": 4.3116, + "loss/crossentropy": 1.9608840346336365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20965111255645752, + "step": 6162 + }, + { + "epoch": 0.12328, + "grad_norm": 2.046875, + "grad_norm_var": 0.01640625, + "learning_rate": 0.0001, + "loss": 4.2146, + "loss/crossentropy": 2.0231454372406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219897098839283, + "step": 6164 + }, + { + "epoch": 0.12332, + "grad_norm": 2.25, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 4.614, + "loss/crossentropy": 2.217663288116455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2466331645846367, + "step": 6166 + }, + { + "epoch": 0.12336, + "grad_norm": 2.234375, + "grad_norm_var": 0.01597900390625, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 1.6825732588768005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999632865190506, + "step": 6168 + }, + { + "epoch": 0.1234, + "grad_norm": 2.5625, + "grad_norm_var": 0.020897420247395833, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.288950800895691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26286834478378296, + "step": 6170 + }, + { + "epoch": 0.12344, + "grad_norm": 2.453125, + "grad_norm_var": 0.023225911458333335, + "learning_rate": 0.0001, + "loss": 4.3376, + "loss/crossentropy": 1.8202561140060425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22252517193555832, + "step": 6172 + }, + { + "epoch": 0.12348, + "grad_norm": 2.3125, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.5023, + "loss/crossentropy": 2.1007159948349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24983422458171844, + "step": 6174 + }, + { + "epoch": 0.12352, + "grad_norm": 2.171875, + "grad_norm_var": 0.0199371337890625, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.0953084230422974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25545743107795715, + "step": 6176 + }, + { + "epoch": 0.12356, + "grad_norm": 2.171875, + "grad_norm_var": 0.019001261393229166, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 2.107246518135071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22741339355707169, + "step": 6178 + }, + { + "epoch": 0.1236, + "grad_norm": 2.265625, + "grad_norm_var": 0.013118489583333334, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 2.1219626665115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22499094158411026, + "step": 6180 + }, + { + "epoch": 0.12364, + "grad_norm": 2.25, + "grad_norm_var": 0.013093058268229167, + "learning_rate": 0.0001, + "loss": 4.4263, + "loss/crossentropy": 1.8892266154289246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255062311887741, + "step": 6182 + }, + { + "epoch": 0.12368, + "grad_norm": 2.21875, + "grad_norm_var": 0.015315755208333334, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.0765860080718994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24712087213993073, + "step": 6184 + }, + { + "epoch": 0.12372, + "grad_norm": 2.296875, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 4.5431, + "loss/crossentropy": 2.301337718963623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419523775577545, + "step": 6186 + }, + { + "epoch": 0.12376, + "grad_norm": 2.359375, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.5043, + "loss/crossentropy": 2.0489712953567505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25768278539180756, + "step": 6188 + }, + { + "epoch": 0.1238, + "grad_norm": 2.140625, + "grad_norm_var": 0.0128082275390625, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 1.9328826069831848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916094958782196, + "step": 6190 + }, + { + "epoch": 0.12384, + "grad_norm": 2.25, + "grad_norm_var": 0.01171875, + "learning_rate": 0.0001, + "loss": 4.1574, + "loss/crossentropy": 1.6330446004867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600398629903793, + "step": 6192 + }, + { + "epoch": 0.12388, + "grad_norm": 2.234375, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 4.4769, + "loss/crossentropy": 1.988203227519989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22711507230997086, + "step": 6194 + }, + { + "epoch": 0.12392, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01590143839518229, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.036049246788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110549360513687, + "step": 6196 + }, + { + "epoch": 0.12396, + "grad_norm": 2.265625, + "grad_norm_var": 0.01624120076497396, + "learning_rate": 0.0001, + "loss": 4.5546, + "loss/crossentropy": 2.217681884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23705793917179108, + "step": 6198 + }, + { + "epoch": 0.124, + "grad_norm": 2.234375, + "grad_norm_var": 0.015276845296223958, + "learning_rate": 0.0001, + "loss": 4.4803, + "loss/crossentropy": 2.3878824710845947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24000737071037292, + "step": 6200 + }, + { + "epoch": 0.12404, + "grad_norm": 2.265625, + "grad_norm_var": 0.015852610270182293, + "learning_rate": 0.0001, + "loss": 4.4754, + "loss/crossentropy": 2.2790249586105347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26150786131620407, + "step": 6202 + }, + { + "epoch": 0.12408, + "grad_norm": 2.1875, + "grad_norm_var": 0.015036773681640626, + "learning_rate": 0.0001, + "loss": 4.4703, + "loss/crossentropy": 2.251123785972595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25231797993183136, + "step": 6204 + }, + { + "epoch": 0.12412, + "grad_norm": 2.171875, + "grad_norm_var": 0.010802968343098959, + "learning_rate": 0.0001, + "loss": 4.5294, + "loss/crossentropy": 1.8977670073509216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21783004701137543, + "step": 6206 + }, + { + "epoch": 0.12416, + "grad_norm": 2.265625, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.6458, + "loss/crossentropy": 2.385319232940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29223111271858215, + "step": 6208 + }, + { + "epoch": 0.1242, + "grad_norm": 2.40625, + "grad_norm_var": 0.015964508056640625, + "learning_rate": 0.0001, + "loss": 4.6555, + "loss/crossentropy": 1.9274529218673706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22827968001365662, + "step": 6210 + }, + { + "epoch": 0.12424, + "grad_norm": 2.40625, + "grad_norm_var": 0.01226806640625, + "learning_rate": 0.0001, + "loss": 4.8232, + "loss/crossentropy": 2.1861478090286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424854040145874, + "step": 6212 + }, + { + "epoch": 0.12428, + "grad_norm": 2.171875, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 4.3002, + "loss/crossentropy": 2.2234357595443726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471691370010376, + "step": 6214 + }, + { + "epoch": 0.12432, + "grad_norm": 2.625, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.5216, + "loss/crossentropy": 2.365513563156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640291824936867, + "step": 6216 + }, + { + "epoch": 0.12436, + "grad_norm": 2.21875, + "grad_norm_var": 0.021955362955729165, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 1.964316964149475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341662645339966, + "step": 6218 + }, + { + "epoch": 0.1244, + "grad_norm": 2.3125, + "grad_norm_var": 0.021805826822916666, + "learning_rate": 0.0001, + "loss": 4.7078, + "loss/crossentropy": 2.3704408407211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24599966406822205, + "step": 6220 + }, + { + "epoch": 0.12444, + "grad_norm": 2.1875, + "grad_norm_var": 0.0193756103515625, + "learning_rate": 0.0001, + "loss": 4.7558, + "loss/crossentropy": 2.1461408138275146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25205330550670624, + "step": 6222 + }, + { + "epoch": 0.12448, + "grad_norm": 2.21875, + "grad_norm_var": 0.018745930989583333, + "learning_rate": 0.0001, + "loss": 4.7032, + "loss/crossentropy": 2.3997104167938232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23371660709381104, + "step": 6224 + }, + { + "epoch": 0.12452, + "grad_norm": 2.09375, + "grad_norm_var": 0.0212890625, + "learning_rate": 0.0001, + "loss": 4.1842, + "loss/crossentropy": 1.7976875305175781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662414073944092, + "step": 6226 + }, + { + "epoch": 0.12456, + "grad_norm": 2.140625, + "grad_norm_var": 0.02008056640625, + "learning_rate": 0.0001, + "loss": 4.2214, + "loss/crossentropy": 1.9929583668708801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21378497034311295, + "step": 6228 + }, + { + "epoch": 0.1246, + "grad_norm": 2.21875, + "grad_norm_var": 0.02017822265625, + "learning_rate": 0.0001, + "loss": 4.2982, + "loss/crossentropy": 1.8853623867034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21267645806074142, + "step": 6230 + }, + { + "epoch": 0.12464, + "grad_norm": 2.234375, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 4.2638, + "loss/crossentropy": 2.2534812688827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23003943264484406, + "step": 6232 + }, + { + "epoch": 0.12468, + "grad_norm": 2.0, + "grad_norm_var": 0.010863240559895833, + "learning_rate": 0.0001, + "loss": 4.2504, + "loss/crossentropy": 2.026564121246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172931283712387, + "step": 6234 + }, + { + "epoch": 0.12472, + "grad_norm": 2.25, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.3555, + "loss/crossentropy": 1.9323118925094604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22170638293027878, + "step": 6236 + }, + { + "epoch": 0.12476, + "grad_norm": 2.296875, + "grad_norm_var": 0.007938639322916666, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.056011915206909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2656880244612694, + "step": 6238 + }, + { + "epoch": 0.1248, + "grad_norm": 2.296875, + "grad_norm_var": 0.00830078125, + "learning_rate": 0.0001, + "loss": 4.6288, + "loss/crossentropy": 2.095108926296234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341026157140732, + "step": 6240 + }, + { + "epoch": 0.12484, + "grad_norm": 2.4375, + "grad_norm_var": 0.011555989583333334, + "learning_rate": 0.0001, + "loss": 4.737, + "loss/crossentropy": 2.0198334455490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24095112830400467, + "step": 6242 + }, + { + "epoch": 0.12488, + "grad_norm": 2.09375, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 2.0397544503211975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22567399591207504, + "step": 6244 + }, + { + "epoch": 0.12492, + "grad_norm": 2.1875, + "grad_norm_var": 0.0120513916015625, + "learning_rate": 0.0001, + "loss": 4.3633, + "loss/crossentropy": 1.9094319343566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22690637409687042, + "step": 6246 + }, + { + "epoch": 0.12496, + "grad_norm": 2.265625, + "grad_norm_var": 0.011295572916666666, + "learning_rate": 0.0001, + "loss": 4.6474, + "loss/crossentropy": 2.27765429019928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25075703859329224, + "step": 6248 + }, + { + "epoch": 0.125, + "grad_norm": 2.609375, + "grad_norm_var": 0.017023722330729168, + "learning_rate": 0.0001, + "loss": 4.5769, + "loss/crossentropy": 2.0027456283569336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267998337745667, + "step": 6250 + }, + { + "epoch": 0.12504, + "grad_norm": 2.359375, + "grad_norm_var": 0.017723592122395833, + "learning_rate": 0.0001, + "loss": 4.5398, + "loss/crossentropy": 2.276857614517212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048637598752975, + "step": 6252 + }, + { + "epoch": 0.12508, + "grad_norm": 2.390625, + "grad_norm_var": 0.018065388997395834, + "learning_rate": 0.0001, + "loss": 4.7603, + "loss/crossentropy": 2.1234214305877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23481453210115433, + "step": 6254 + }, + { + "epoch": 0.12512, + "grad_norm": 2.28125, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 4.7537, + "loss/crossentropy": 2.201158881187439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23473594337701797, + "step": 6256 + }, + { + "epoch": 0.12516, + "grad_norm": 2.296875, + "grad_norm_var": 0.017220052083333333, + "learning_rate": 0.0001, + "loss": 4.7202, + "loss/crossentropy": 2.3437804579734802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24881581962108612, + "step": 6258 + }, + { + "epoch": 0.1252, + "grad_norm": 2.265625, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 4.5938, + "loss/crossentropy": 2.0984586477279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332843720912933, + "step": 6260 + }, + { + "epoch": 0.12524, + "grad_norm": 2.390625, + "grad_norm_var": 0.016942342122395832, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 1.7946080565452576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21510492265224457, + "step": 6262 + }, + { + "epoch": 0.12528, + "grad_norm": 2.171875, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 4.3889, + "loss/crossentropy": 2.022711932659149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22890903800725937, + "step": 6264 + }, + { + "epoch": 0.12532, + "grad_norm": 2.046875, + "grad_norm_var": 0.009577433268229166, + "learning_rate": 0.0001, + "loss": 4.288, + "loss/crossentropy": 1.9752087593078613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21972095221281052, + "step": 6266 + }, + { + "epoch": 0.12536, + "grad_norm": 2.234375, + "grad_norm_var": 0.008625284830729166, + "learning_rate": 0.0001, + "loss": 4.4952, + "loss/crossentropy": 1.7267251014709473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305006742477417, + "step": 6268 + }, + { + "epoch": 0.1254, + "grad_norm": 2.109375, + "grad_norm_var": 0.008869425455729166, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.249786615371704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24875187873840332, + "step": 6270 + }, + { + "epoch": 0.12544, + "grad_norm": 2.46875, + "grad_norm_var": 0.012190755208333333, + "learning_rate": 0.0001, + "loss": 4.693, + "loss/crossentropy": 2.402994990348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25536587834358215, + "step": 6272 + }, + { + "epoch": 0.12548, + "grad_norm": 2.46875, + "grad_norm_var": 0.016243489583333333, + "learning_rate": 0.0001, + "loss": 4.606, + "loss/crossentropy": 2.240913987159729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3013365715742111, + "step": 6274 + }, + { + "epoch": 0.12552, + "grad_norm": 2.390625, + "grad_norm_var": 0.019071451822916665, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.0767332911491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22113215178251266, + "step": 6276 + }, + { + "epoch": 0.12556, + "grad_norm": 2.140625, + "grad_norm_var": 0.02041015625, + "learning_rate": 0.0001, + "loss": 4.608, + "loss/crossentropy": 1.8625048995018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20321352779865265, + "step": 6278 + }, + { + "epoch": 0.1256, + "grad_norm": 2.3125, + "grad_norm_var": 0.024072265625, + "learning_rate": 0.0001, + "loss": 4.0386, + "loss/crossentropy": 1.9143638610839844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085773944854736, + "step": 6280 + }, + { + "epoch": 0.12564, + "grad_norm": 2.34375, + "grad_norm_var": 0.023192342122395834, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.3756210803985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26027245819568634, + "step": 6282 + }, + { + "epoch": 0.12568, + "grad_norm": 2.21875, + "grad_norm_var": 0.02427978515625, + "learning_rate": 0.0001, + "loss": 4.5526, + "loss/crossentropy": 1.9310896396636963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086925357580185, + "step": 6284 + }, + { + "epoch": 0.12572, + "grad_norm": 2.34375, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 4.4751, + "loss/crossentropy": 2.1757423877716064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.241075336933136, + "step": 6286 + }, + { + "epoch": 0.12576, + "grad_norm": 2.265625, + "grad_norm_var": 0.021284993489583334, + "learning_rate": 0.0001, + "loss": 4.0946, + "loss/crossentropy": 2.3057546615600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24882248044013977, + "step": 6288 + }, + { + "epoch": 0.1258, + "grad_norm": 3.140625, + "grad_norm_var": 0.06883036295572917, + "learning_rate": 0.0001, + "loss": 4.5211, + "loss/crossentropy": 2.1551633477211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2796258181333542, + "step": 6290 + }, + { + "epoch": 0.12584, + "grad_norm": 2.296875, + "grad_norm_var": 0.06608072916666667, + "learning_rate": 0.0001, + "loss": 4.4915, + "loss/crossentropy": 2.0627459287643433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24781616777181625, + "step": 6292 + }, + { + "epoch": 0.12588, + "grad_norm": 2.453125, + "grad_norm_var": 0.06463216145833334, + "learning_rate": 0.0001, + "loss": 4.5896, + "loss/crossentropy": 1.795321524143219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113223671913147, + "step": 6294 + }, + { + "epoch": 0.12592, + "grad_norm": 2.171875, + "grad_norm_var": 0.06004130045572917, + "learning_rate": 0.0001, + "loss": 4.4988, + "loss/crossentropy": 2.323319673538208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24748852849006653, + "step": 6296 + }, + { + "epoch": 0.12596, + "grad_norm": 2.296875, + "grad_norm_var": 0.05998942057291667, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.053200662136078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278411090373993, + "step": 6298 + }, + { + "epoch": 0.126, + "grad_norm": 2.4375, + "grad_norm_var": 0.060155232747395836, + "learning_rate": 0.0001, + "loss": 4.4605, + "loss/crossentropy": 2.102539896965027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26071713864803314, + "step": 6300 + }, + { + "epoch": 0.12604, + "grad_norm": 2.265625, + "grad_norm_var": 0.059554036458333334, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.9316620826721191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23007915914058685, + "step": 6302 + }, + { + "epoch": 0.12608, + "grad_norm": 2.6875, + "grad_norm_var": 0.06288655598958333, + "learning_rate": 0.0001, + "loss": 4.4194, + "loss/crossentropy": 1.9966526627540588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424854189157486, + "step": 6304 + }, + { + "epoch": 0.12612, + "grad_norm": 2.046875, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.2382, + "loss/crossentropy": 1.895868957042694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255951091647148, + "step": 6306 + }, + { + "epoch": 0.12616, + "grad_norm": 2.171875, + "grad_norm_var": 0.023688761393229167, + "learning_rate": 0.0001, + "loss": 4.7758, + "loss/crossentropy": 2.3897345066070557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24461720883846283, + "step": 6308 + }, + { + "epoch": 0.1262, + "grad_norm": 2.40625, + "grad_norm_var": 0.022858683268229166, + "learning_rate": 0.0001, + "loss": 4.8275, + "loss/crossentropy": 2.059292197227478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29264035820961, + "step": 6310 + }, + { + "epoch": 0.12624, + "grad_norm": 2.390625, + "grad_norm_var": 0.0221343994140625, + "learning_rate": 0.0001, + "loss": 4.5098, + "loss/crossentropy": 1.8142406344413757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127485275268555, + "step": 6312 + }, + { + "epoch": 0.12628, + "grad_norm": 2.109375, + "grad_norm_var": 0.0245025634765625, + "learning_rate": 0.0001, + "loss": 4.2349, + "loss/crossentropy": 2.0186127424240112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217330664396286, + "step": 6314 + }, + { + "epoch": 0.12632, + "grad_norm": 2.265625, + "grad_norm_var": 0.024820963541666668, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 1.9439889192581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108919695019722, + "step": 6316 + }, + { + "epoch": 0.12636, + "grad_norm": 2.28125, + "grad_norm_var": 0.024690755208333335, + "learning_rate": 0.0001, + "loss": 4.8672, + "loss/crossentropy": 2.617791175842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2756097912788391, + "step": 6318 + }, + { + "epoch": 0.1264, + "grad_norm": 2.515625, + "grad_norm_var": 0.017609659830729166, + "learning_rate": 0.0001, + "loss": 4.3918, + "loss/crossentropy": 1.9074286818504333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22491587698459625, + "step": 6320 + }, + { + "epoch": 0.12644, + "grad_norm": 2.09375, + "grad_norm_var": 0.016331990559895832, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.1414809226989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22703612595796585, + "step": 6322 + }, + { + "epoch": 0.12648, + "grad_norm": 2.28125, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 4.4982, + "loss/crossentropy": 2.291784942150116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24490328133106232, + "step": 6324 + }, + { + "epoch": 0.12652, + "grad_norm": 2.1875, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 1.938249409198761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22611317038536072, + "step": 6326 + }, + { + "epoch": 0.12656, + "grad_norm": 2.03125, + "grad_norm_var": 0.0187408447265625, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 1.7510024905204773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30439992994070053, + "step": 6328 + }, + { + "epoch": 0.1266, + "grad_norm": 2.46875, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 4.4351, + "loss/crossentropy": 2.3168221712112427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25660980492830276, + "step": 6330 + }, + { + "epoch": 0.12664, + "grad_norm": 2.046875, + "grad_norm_var": 0.024409993489583334, + "learning_rate": 0.0001, + "loss": 4.5002, + "loss/crossentropy": 1.9957427978515625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420849114656448, + "step": 6332 + }, + { + "epoch": 0.12668, + "grad_norm": 2.0625, + "grad_norm_var": 0.025104777018229166, + "learning_rate": 0.0001, + "loss": 4.3583, + "loss/crossentropy": 2.4371464252471924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576342821121216, + "step": 6334 + }, + { + "epoch": 0.12672, + "grad_norm": 2.171875, + "grad_norm_var": 0.021458943684895832, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 1.910677433013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360614150762558, + "step": 6336 + }, + { + "epoch": 0.12676, + "grad_norm": 2.203125, + "grad_norm_var": 0.020750935872395834, + "learning_rate": 0.0001, + "loss": 4.2679, + "loss/crossentropy": 2.057362914085388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23079442232847214, + "step": 6338 + }, + { + "epoch": 0.1268, + "grad_norm": 2.21875, + "grad_norm_var": 0.0191070556640625, + "learning_rate": 0.0001, + "loss": 4.1818, + "loss/crossentropy": 2.0249438881874084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24038050323724747, + "step": 6340 + }, + { + "epoch": 0.12684, + "grad_norm": 2.25, + "grad_norm_var": 0.019074503580729166, + "learning_rate": 0.0001, + "loss": 4.5374, + "loss/crossentropy": 2.0371533632278442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355760782957077, + "step": 6342 + }, + { + "epoch": 0.12688, + "grad_norm": 2.203125, + "grad_norm_var": 0.0130523681640625, + "learning_rate": 0.0001, + "loss": 4.4849, + "loss/crossentropy": 2.3101218938827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161705374717712, + "step": 6344 + }, + { + "epoch": 0.12692, + "grad_norm": 2.25, + "grad_norm_var": 0.010261027018229167, + "learning_rate": 0.0001, + "loss": 4.6399, + "loss/crossentropy": 2.1990097761154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2537754699587822, + "step": 6346 + }, + { + "epoch": 0.12696, + "grad_norm": 2.125, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 4.2697, + "loss/crossentropy": 2.1783597469329834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106732353568077, + "step": 6348 + }, + { + "epoch": 0.127, + "grad_norm": 2.359375, + "grad_norm_var": 0.012987263997395833, + "learning_rate": 0.0001, + "loss": 4.5127, + "loss/crossentropy": 2.2316598892211914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23841773718595505, + "step": 6350 + }, + { + "epoch": 0.12704, + "grad_norm": 2.265625, + "grad_norm_var": 0.00943603515625, + "learning_rate": 0.0001, + "loss": 4.5651, + "loss/crossentropy": 2.0329924821853638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755083441734314, + "step": 6352 + }, + { + "epoch": 0.12708, + "grad_norm": 2.078125, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 4.1888, + "loss/crossentropy": 1.9174052476882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000722259283066, + "step": 6354 + }, + { + "epoch": 0.12712, + "grad_norm": 2.15625, + "grad_norm_var": 0.0154693603515625, + "learning_rate": 0.0001, + "loss": 4.3321, + "loss/crossentropy": 1.968774676322937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194407731294632, + "step": 6356 + }, + { + "epoch": 0.12716, + "grad_norm": 2.296875, + "grad_norm_var": 0.0183013916015625, + "learning_rate": 0.0001, + "loss": 4.5768, + "loss/crossentropy": 2.1767213344573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24483858048915863, + "step": 6358 + }, + { + "epoch": 0.1272, + "grad_norm": 2.171875, + "grad_norm_var": 0.018391927083333332, + "learning_rate": 0.0001, + "loss": 4.5063, + "loss/crossentropy": 1.8390987515449524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21677500754594803, + "step": 6360 + }, + { + "epoch": 0.12724, + "grad_norm": 2.203125, + "grad_norm_var": 0.016039021809895835, + "learning_rate": 0.0001, + "loss": 4.2635, + "loss/crossentropy": 2.1923086643218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21362057328224182, + "step": 6362 + }, + { + "epoch": 0.12728, + "grad_norm": 2.25, + "grad_norm_var": 0.010347493489583333, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 2.030683994293213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884567826986313, + "step": 6364 + }, + { + "epoch": 0.12732, + "grad_norm": 2.203125, + "grad_norm_var": 0.00816650390625, + "learning_rate": 0.0001, + "loss": 4.452, + "loss/crossentropy": 2.330837845802307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24233026057481766, + "step": 6366 + }, + { + "epoch": 0.12736, + "grad_norm": 2.453125, + "grad_norm_var": 0.012040201822916667, + "learning_rate": 0.0001, + "loss": 4.6404, + "loss/crossentropy": 2.001612663269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332654967904091, + "step": 6368 + }, + { + "epoch": 0.1274, + "grad_norm": 2.25, + "grad_norm_var": 0.011750284830729167, + "learning_rate": 0.0001, + "loss": 4.7101, + "loss/crossentropy": 1.9234941601753235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741928488016129, + "step": 6370 + }, + { + "epoch": 0.12744, + "grad_norm": 2.078125, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.059934139251709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24736596643924713, + "step": 6372 + }, + { + "epoch": 0.12748, + "grad_norm": 2.125, + "grad_norm_var": 0.010798136393229166, + "learning_rate": 0.0001, + "loss": 4.3916, + "loss/crossentropy": 2.25182843208313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24349220097064972, + "step": 6374 + }, + { + "epoch": 0.12752, + "grad_norm": 2.15625, + "grad_norm_var": 0.010888671875, + "learning_rate": 0.0001, + "loss": 4.3742, + "loss/crossentropy": 1.845405638217926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138313353061676, + "step": 6376 + }, + { + "epoch": 0.12756, + "grad_norm": 2.546875, + "grad_norm_var": 0.0164215087890625, + "learning_rate": 0.0001, + "loss": 4.568, + "loss/crossentropy": 1.8833998441696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25304871797561646, + "step": 6378 + }, + { + "epoch": 0.1276, + "grad_norm": 2.28125, + "grad_norm_var": 0.017072550455729165, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 2.1066314578056335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231372445821762, + "step": 6380 + }, + { + "epoch": 0.12764, + "grad_norm": 2.328125, + "grad_norm_var": 0.018561808268229167, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.024670898914337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22260528802871704, + "step": 6382 + }, + { + "epoch": 0.12768, + "grad_norm": 2.109375, + "grad_norm_var": 0.01763916015625, + "learning_rate": 0.0001, + "loss": 4.8132, + "loss/crossentropy": 2.425115466117859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2429298236966133, + "step": 6384 + }, + { + "epoch": 0.12772, + "grad_norm": 2.109375, + "grad_norm_var": 0.016597493489583334, + "learning_rate": 0.0001, + "loss": 4.2387, + "loss/crossentropy": 2.1847925186157227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2495381161570549, + "step": 6386 + }, + { + "epoch": 0.12776, + "grad_norm": 2.265625, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.6919, + "loss/crossentropy": 2.1463611125946045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303348332643509, + "step": 6388 + }, + { + "epoch": 0.1278, + "grad_norm": 2.171875, + "grad_norm_var": 0.014839680989583333, + "learning_rate": 0.0001, + "loss": 4.2714, + "loss/crossentropy": 1.9755331873893738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232731431722641, + "step": 6390 + }, + { + "epoch": 0.12784, + "grad_norm": 2.078125, + "grad_norm_var": 0.015973917643229165, + "learning_rate": 0.0001, + "loss": 4.5255, + "loss/crossentropy": 2.0741465091705322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912710905075073, + "step": 6392 + }, + { + "epoch": 0.12788, + "grad_norm": 2.203125, + "grad_norm_var": 0.009471638997395834, + "learning_rate": 0.0001, + "loss": 4.3834, + "loss/crossentropy": 2.1314439177513123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22910036891698837, + "step": 6394 + }, + { + "epoch": 0.12792, + "grad_norm": 2.109375, + "grad_norm_var": 0.0107086181640625, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 1.976862907409668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233707495033741, + "step": 6396 + }, + { + "epoch": 0.12796, + "grad_norm": 2.171875, + "grad_norm_var": 0.010807291666666666, + "learning_rate": 0.0001, + "loss": 4.3939, + "loss/crossentropy": 2.0534666180610657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424879133701324, + "step": 6398 + }, + { + "epoch": 0.128, + "grad_norm": 2.109375, + "grad_norm_var": 0.008088175455729167, + "learning_rate": 0.0001, + "loss": 4.6647, + "loss/crossentropy": 2.2200992107391357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2836414724588394, + "step": 6400 + }, + { + "epoch": 0.12804, + "grad_norm": 2.234375, + "grad_norm_var": 0.007542928059895833, + "learning_rate": 0.0001, + "loss": 4.3802, + "loss/crossentropy": 1.8642286658287048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438255906105042, + "step": 6402 + }, + { + "epoch": 0.12808, + "grad_norm": 2.15625, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.3868, + "loss/crossentropy": 1.9571613073349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22038634121418, + "step": 6404 + }, + { + "epoch": 0.12812, + "grad_norm": 2.125, + "grad_norm_var": 0.008980305989583333, + "learning_rate": 0.0001, + "loss": 4.6888, + "loss/crossentropy": 2.0420787930488586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532814294099808, + "step": 6406 + }, + { + "epoch": 0.12816, + "grad_norm": 2.109375, + "grad_norm_var": 0.008349609375, + "learning_rate": 0.0001, + "loss": 4.3357, + "loss/crossentropy": 1.7511736750602722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067284658551216, + "step": 6408 + }, + { + "epoch": 0.1282, + "grad_norm": 2.21875, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.5568, + "loss/crossentropy": 2.0310762524604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2763645648956299, + "step": 6410 + }, + { + "epoch": 0.12824, + "grad_norm": 2.28125, + "grad_norm_var": 0.008617146809895834, + "learning_rate": 0.0001, + "loss": 4.575, + "loss/crossentropy": 2.112699866294861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2409205138683319, + "step": 6412 + }, + { + "epoch": 0.12828, + "grad_norm": 2.296875, + "grad_norm_var": 0.14166259765625, + "learning_rate": 0.0001, + "loss": 4.3023, + "loss/crossentropy": 1.8243904113769531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079915553331375, + "step": 6414 + }, + { + "epoch": 0.12832, + "grad_norm": 2.125, + "grad_norm_var": 0.14143473307291668, + "learning_rate": 0.0001, + "loss": 4.5395, + "loss/crossentropy": 1.8310211896896362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304991096258163, + "step": 6416 + }, + { + "epoch": 0.12836, + "grad_norm": 2.140625, + "grad_norm_var": 0.14011128743489584, + "learning_rate": 0.0001, + "loss": 4.3806, + "loss/crossentropy": 1.9653990268707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22207710891962051, + "step": 6418 + }, + { + "epoch": 0.1284, + "grad_norm": 2.15625, + "grad_norm_var": 0.13982645670572916, + "learning_rate": 0.0001, + "loss": 4.5869, + "loss/crossentropy": 2.0583431124687195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23768695443868637, + "step": 6420 + }, + { + "epoch": 0.12844, + "grad_norm": 2.296875, + "grad_norm_var": 0.1391998291015625, + "learning_rate": 0.0001, + "loss": 4.6512, + "loss/crossentropy": 2.1162279844284058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23963302373886108, + "step": 6422 + }, + { + "epoch": 0.12848, + "grad_norm": 2.28125, + "grad_norm_var": 0.13347066243489583, + "learning_rate": 0.0001, + "loss": 4.6268, + "loss/crossentropy": 2.068794012069702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21967273205518723, + "step": 6424 + }, + { + "epoch": 0.12852, + "grad_norm": 2.109375, + "grad_norm_var": 0.13585611979166667, + "learning_rate": 0.0001, + "loss": 4.4004, + "loss/crossentropy": 2.165997266769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22991405427455902, + "step": 6426 + }, + { + "epoch": 0.12856, + "grad_norm": 2.25, + "grad_norm_var": 0.14031473795572916, + "learning_rate": 0.0001, + "loss": 4.1816, + "loss/crossentropy": 1.9116491675376892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22784583270549774, + "step": 6428 + }, + { + "epoch": 0.1286, + "grad_norm": 2.296875, + "grad_norm_var": 0.0087890625, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 2.296347498893738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2618508040904999, + "step": 6430 + }, + { + "epoch": 0.12864, + "grad_norm": 2.125, + "grad_norm_var": 0.0100250244140625, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 2.3961373567581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27667778730392456, + "step": 6432 + }, + { + "epoch": 0.12868, + "grad_norm": 2.171875, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.3993, + "loss/crossentropy": 1.9141033291816711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22818633913993835, + "step": 6434 + }, + { + "epoch": 0.12872, + "grad_norm": 2.15625, + "grad_norm_var": 0.0070149739583333336, + "learning_rate": 0.0001, + "loss": 4.481, + "loss/crossentropy": 2.4431718587875366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24852856248617172, + "step": 6436 + }, + { + "epoch": 0.12876, + "grad_norm": 2.171875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 4.5153, + "loss/crossentropy": 1.9343949556350708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22181283682584763, + "step": 6438 + }, + { + "epoch": 0.1288, + "grad_norm": 2.171875, + "grad_norm_var": 0.005549112955729167, + "learning_rate": 0.0001, + "loss": 4.4547, + "loss/crossentropy": 2.203734040260315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23536919057369232, + "step": 6440 + }, + { + "epoch": 0.12884, + "grad_norm": 2.046875, + "grad_norm_var": 0.0055572509765625, + "learning_rate": 0.0001, + "loss": 4.354, + "loss/crossentropy": 2.0419046878814697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121758982539177, + "step": 6442 + }, + { + "epoch": 0.12888, + "grad_norm": 2.328125, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 4.5738, + "loss/crossentropy": 2.3551554679870605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23853591084480286, + "step": 6444 + }, + { + "epoch": 0.12892, + "grad_norm": 2.21875, + "grad_norm_var": 0.004813639322916666, + "learning_rate": 0.0001, + "loss": 4.4334, + "loss/crossentropy": 1.9079387784004211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22546496242284775, + "step": 6446 + }, + { + "epoch": 0.12896, + "grad_norm": 2.140625, + "grad_norm_var": 0.00390625, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 1.9344156980514526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923892199993134, + "step": 6448 + }, + { + "epoch": 0.129, + "grad_norm": 2.296875, + "grad_norm_var": 0.0047271728515625, + "learning_rate": 0.0001, + "loss": 4.3219, + "loss/crossentropy": 1.7627189755439758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20726975798606873, + "step": 6450 + }, + { + "epoch": 0.12904, + "grad_norm": 2.28125, + "grad_norm_var": 0.0048736572265625, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 1.9988782405853271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156572133302689, + "step": 6452 + }, + { + "epoch": 0.12908, + "grad_norm": 2.1875, + "grad_norm_var": 0.004621378580729167, + "learning_rate": 0.0001, + "loss": 4.6551, + "loss/crossentropy": 2.3970296382904053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24723558872938156, + "step": 6454 + }, + { + "epoch": 0.12912, + "grad_norm": 2.234375, + "grad_norm_var": 0.00455322265625, + "learning_rate": 0.0001, + "loss": 4.3785, + "loss/crossentropy": 2.009088099002838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188432812690735, + "step": 6456 + }, + { + "epoch": 0.12916, + "grad_norm": 2.203125, + "grad_norm_var": 0.005882771809895834, + "learning_rate": 0.0001, + "loss": 4.9189, + "loss/crossentropy": 2.166573464870453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22506655752658844, + "step": 6458 + }, + { + "epoch": 0.1292, + "grad_norm": 2.109375, + "grad_norm_var": 0.00670166015625, + "learning_rate": 0.0001, + "loss": 4.3281, + "loss/crossentropy": 2.1567386388778687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23143760859966278, + "step": 6460 + }, + { + "epoch": 0.12924, + "grad_norm": 2.203125, + "grad_norm_var": 0.007307942708333333, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 1.8458901643753052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22627578675746918, + "step": 6462 + }, + { + "epoch": 0.12928, + "grad_norm": 2.21875, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 1.7933887243270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944974958896637, + "step": 6464 + }, + { + "epoch": 0.12932, + "grad_norm": 2.328125, + "grad_norm_var": 0.0077707926432291664, + "learning_rate": 0.0001, + "loss": 4.3712, + "loss/crossentropy": 2.14457631111145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2461908757686615, + "step": 6466 + }, + { + "epoch": 0.12936, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011563873291015625, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.2191081047058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24845656007528305, + "step": 6468 + }, + { + "epoch": 0.1294, + "grad_norm": 2.3125, + "grad_norm_var": 0.012284088134765624, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.2655181884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24961909651756287, + "step": 6470 + }, + { + "epoch": 0.12944, + "grad_norm": 2.171875, + "grad_norm_var": 0.012617746988932291, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 1.9992872476577759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180924773216248, + "step": 6472 + }, + { + "epoch": 0.12948, + "grad_norm": 2.078125, + "grad_norm_var": 0.009348297119140625, + "learning_rate": 0.0001, + "loss": 4.2865, + "loss/crossentropy": 2.1950103044509888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23401658236980438, + "step": 6474 + }, + { + "epoch": 0.12952, + "grad_norm": 2.234375, + "grad_norm_var": 0.010027821858723958, + "learning_rate": 0.0001, + "loss": 4.2614, + "loss/crossentropy": 2.148743689060211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146785706281662, + "step": 6476 + }, + { + "epoch": 0.12956, + "grad_norm": 2.140625, + "grad_norm_var": 0.010253651936848959, + "learning_rate": 0.0001, + "loss": 4.7196, + "loss/crossentropy": 2.4312193393707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2596089243888855, + "step": 6478 + }, + { + "epoch": 0.1296, + "grad_norm": 2.25, + "grad_norm_var": 0.011844635009765625, + "learning_rate": 0.0001, + "loss": 4.7232, + "loss/crossentropy": 2.061104893684387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24564718455076218, + "step": 6480 + }, + { + "epoch": 0.12964, + "grad_norm": 2.21875, + "grad_norm_var": 0.011224110921223959, + "learning_rate": 0.0001, + "loss": 4.6833, + "loss/crossentropy": 2.1994687914848328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352369725704193, + "step": 6482 + }, + { + "epoch": 0.12968, + "grad_norm": 2.203125, + "grad_norm_var": 0.010692342122395834, + "learning_rate": 0.0001, + "loss": 4.2378, + "loss/crossentropy": 2.085163116455078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2309635877609253, + "step": 6484 + }, + { + "epoch": 0.12972, + "grad_norm": 2.1875, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.5964, + "loss/crossentropy": 2.3360198736190796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24846713244915009, + "step": 6486 + }, + { + "epoch": 0.12976, + "grad_norm": 2.265625, + "grad_norm_var": 0.010602823893229167, + "learning_rate": 0.0001, + "loss": 4.4454, + "loss/crossentropy": 2.089003086090088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24980145692825317, + "step": 6488 + }, + { + "epoch": 0.1298, + "grad_norm": 2.125, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.4446, + "loss/crossentropy": 2.0884299874305725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23875930905342102, + "step": 6490 + }, + { + "epoch": 0.12984, + "grad_norm": 2.09375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 1.9070702195167542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21942836046218872, + "step": 6492 + }, + { + "epoch": 0.12988, + "grad_norm": 2.609375, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 4.6558, + "loss/crossentropy": 2.1617711782455444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22820374369621277, + "step": 6494 + }, + { + "epoch": 0.12992, + "grad_norm": 2.15625, + "grad_norm_var": 0.0203765869140625, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 1.7782898545265198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22162485867738724, + "step": 6496 + }, + { + "epoch": 0.12996, + "grad_norm": 2.1875, + "grad_norm_var": 0.021028645833333335, + "learning_rate": 0.0001, + "loss": 4.4198, + "loss/crossentropy": 2.1364612579345703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490044683218002, + "step": 6498 + }, + { + "epoch": 0.13, + "grad_norm": 2.34375, + "grad_norm_var": 0.020213826497395834, + "learning_rate": 0.0001, + "loss": 4.569, + "loss/crossentropy": 2.282773971557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24553906172513962, + "step": 6500 + }, + { + "epoch": 0.13004, + "grad_norm": 2.171875, + "grad_norm_var": 0.020905558268229166, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.008001983165741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152648121118546, + "step": 6502 + }, + { + "epoch": 0.13008, + "grad_norm": 2.328125, + "grad_norm_var": 0.021833292643229165, + "learning_rate": 0.0001, + "loss": 4.4535, + "loss/crossentropy": 2.1681981086730957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26258186995983124, + "step": 6504 + }, + { + "epoch": 0.13012, + "grad_norm": 2.328125, + "grad_norm_var": 0.021761067708333335, + "learning_rate": 0.0001, + "loss": 4.6254, + "loss/crossentropy": 2.5754435062408447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26315446197986603, + "step": 6506 + }, + { + "epoch": 0.13016, + "grad_norm": 2.234375, + "grad_norm_var": 0.021187337239583333, + "learning_rate": 0.0001, + "loss": 4.1505, + "loss/crossentropy": 1.7897658348083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22884277999401093, + "step": 6508 + }, + { + "epoch": 0.1302, + "grad_norm": 2.46875, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 4.6315, + "loss/crossentropy": 1.8081435561180115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21860769391059875, + "step": 6510 + }, + { + "epoch": 0.13024, + "grad_norm": 2.359375, + "grad_norm_var": 0.0140289306640625, + "learning_rate": 0.0001, + "loss": 4.7371, + "loss/crossentropy": 2.243759036064148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2545628473162651, + "step": 6512 + }, + { + "epoch": 0.13028, + "grad_norm": 2.1875, + "grad_norm_var": 0.03447265625, + "learning_rate": 0.0001, + "loss": 4.6147, + "loss/crossentropy": 2.069986939430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862630754709244, + "step": 6514 + }, + { + "epoch": 0.13032, + "grad_norm": 2.203125, + "grad_norm_var": 0.0349761962890625, + "learning_rate": 0.0001, + "loss": 4.4758, + "loss/crossentropy": 2.052341878414154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460884153842926, + "step": 6516 + }, + { + "epoch": 0.13036, + "grad_norm": 2.390625, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 4.6369, + "loss/crossentropy": 2.3376708030700684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25382688641548157, + "step": 6518 + }, + { + "epoch": 0.1304, + "grad_norm": 2.125, + "grad_norm_var": 0.0329498291015625, + "learning_rate": 0.0001, + "loss": 4.3048, + "loss/crossentropy": 1.8329171538352966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135135680437088, + "step": 6520 + }, + { + "epoch": 0.13044, + "grad_norm": 2.421875, + "grad_norm_var": 0.035791015625, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.077217698097229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265045866370201, + "step": 6522 + }, + { + "epoch": 0.13048, + "grad_norm": 2.171875, + "grad_norm_var": 0.036783854166666664, + "learning_rate": 0.0001, + "loss": 4.2189, + "loss/crossentropy": 2.0677568912506104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2468869537115097, + "step": 6524 + }, + { + "epoch": 0.13052, + "grad_norm": 2.296875, + "grad_norm_var": 0.03591206868489583, + "learning_rate": 0.0001, + "loss": 4.5246, + "loss/crossentropy": 2.0692074298858643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2602302059531212, + "step": 6526 + }, + { + "epoch": 0.13056, + "grad_norm": 2.421875, + "grad_norm_var": 0.03583984375, + "learning_rate": 0.0001, + "loss": 4.6709, + "loss/crossentropy": 1.9767250418663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232941836118698, + "step": 6528 + }, + { + "epoch": 0.1306, + "grad_norm": 2.234375, + "grad_norm_var": 0.01480712890625, + "learning_rate": 0.0001, + "loss": 4.4851, + "loss/crossentropy": 1.7538996934890747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20786649733781815, + "step": 6530 + }, + { + "epoch": 0.13064, + "grad_norm": 2.203125, + "grad_norm_var": 0.01578369140625, + "learning_rate": 0.0001, + "loss": 4.5598, + "loss/crossentropy": 2.061249256134033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238955438137054, + "step": 6532 + }, + { + "epoch": 0.13068, + "grad_norm": 2.171875, + "grad_norm_var": 0.0167877197265625, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 1.9083253145217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22536182403564453, + "step": 6534 + }, + { + "epoch": 0.13072, + "grad_norm": 2.046875, + "grad_norm_var": 0.017952473958333333, + "learning_rate": 0.0001, + "loss": 4.3004, + "loss/crossentropy": 2.052153766155243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2343958392739296, + "step": 6536 + }, + { + "epoch": 0.13076, + "grad_norm": 2.125, + "grad_norm_var": 0.010091145833333334, + "learning_rate": 0.0001, + "loss": 4.3638, + "loss/crossentropy": 1.9979270100593567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102836191654205, + "step": 6538 + }, + { + "epoch": 0.1308, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012031809488932291, + "learning_rate": 0.0001, + "loss": 4.019, + "loss/crossentropy": 1.618333637714386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146921813488007, + "step": 6540 + }, + { + "epoch": 0.13084, + "grad_norm": 2.078125, + "grad_norm_var": 0.011940256754557291, + "learning_rate": 0.0001, + "loss": 4.2514, + "loss/crossentropy": 1.771790623664856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21661869436502457, + "step": 6542 + }, + { + "epoch": 0.13088, + "grad_norm": 2.25, + "grad_norm_var": 0.007165273030598958, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.2347733974456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2507361173629761, + "step": 6544 + }, + { + "epoch": 0.13092, + "grad_norm": 2.328125, + "grad_norm_var": 0.012389882405598959, + "learning_rate": 0.0001, + "loss": 4.4333, + "loss/crossentropy": 2.226397395133972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.257301464676857, + "step": 6546 + }, + { + "epoch": 0.13096, + "grad_norm": 2.125, + "grad_norm_var": 0.012776438395182292, + "learning_rate": 0.0001, + "loss": 4.1629, + "loss/crossentropy": 1.9884281158447266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2213538959622383, + "step": 6548 + }, + { + "epoch": 0.131, + "grad_norm": 2.0625, + "grad_norm_var": 0.01755549112955729, + "learning_rate": 0.0001, + "loss": 4.549, + "loss/crossentropy": 2.0790398120880127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259911745786667, + "step": 6550 + }, + { + "epoch": 0.13104, + "grad_norm": 2.28125, + "grad_norm_var": 0.016294097900390624, + "learning_rate": 0.0001, + "loss": 4.5118, + "loss/crossentropy": 2.1309107542037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23984672129154205, + "step": 6552 + }, + { + "epoch": 0.13108, + "grad_norm": 2.296875, + "grad_norm_var": 0.01744562784830729, + "learning_rate": 0.0001, + "loss": 4.4528, + "loss/crossentropy": 1.9651963114738464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21699358522891998, + "step": 6554 + }, + { + "epoch": 0.13112, + "grad_norm": 2.21875, + "grad_norm_var": 0.014598592122395834, + "learning_rate": 0.0001, + "loss": 4.436, + "loss/crossentropy": 2.405247449874878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24624405801296234, + "step": 6556 + }, + { + "epoch": 0.13116, + "grad_norm": 2.125, + "grad_norm_var": 0.0154296875, + "learning_rate": 0.0001, + "loss": 4.3984, + "loss/crossentropy": 2.215611457824707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24835016578435898, + "step": 6558 + }, + { + "epoch": 0.1312, + "grad_norm": 4.125, + "grad_norm_var": 0.2436920166015625, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 2.1731653809547424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24633550643920898, + "step": 6560 + }, + { + "epoch": 0.13124, + "grad_norm": 2.234375, + "grad_norm_var": 0.24501953125, + "learning_rate": 0.0001, + "loss": 4.5101, + "loss/crossentropy": 2.0507587790489197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25702129304409027, + "step": 6562 + }, + { + "epoch": 0.13128, + "grad_norm": 2.375, + "grad_norm_var": 0.2433502197265625, + "learning_rate": 0.0001, + "loss": 4.3464, + "loss/crossentropy": 2.2088446617126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22610870003700256, + "step": 6564 + }, + { + "epoch": 0.13132, + "grad_norm": 2.34375, + "grad_norm_var": 0.23931884765625, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 1.6911352276802063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188590243458748, + "step": 6566 + }, + { + "epoch": 0.13136, + "grad_norm": 2.09375, + "grad_norm_var": 0.24976806640625, + "learning_rate": 0.0001, + "loss": 3.9795, + "loss/crossentropy": 1.807699978351593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21844930946826935, + "step": 6568 + }, + { + "epoch": 0.1314, + "grad_norm": 2.28125, + "grad_norm_var": 0.24514872233072918, + "learning_rate": 0.0001, + "loss": 4.4293, + "loss/crossentropy": 2.292602300643921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242776520550251, + "step": 6570 + }, + { + "epoch": 0.13144, + "grad_norm": 2.03125, + "grad_norm_var": 0.24806315104166668, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 1.5262329578399658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674257293343544, + "step": 6572 + }, + { + "epoch": 0.13148, + "grad_norm": 2.15625, + "grad_norm_var": 0.24504801432291667, + "learning_rate": 0.0001, + "loss": 4.4084, + "loss/crossentropy": 2.180319309234619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260192409157753, + "step": 6574 + }, + { + "epoch": 0.13152, + "grad_norm": 2.265625, + "grad_norm_var": 0.0216705322265625, + "learning_rate": 0.0001, + "loss": 4.544, + "loss/crossentropy": 2.188440203666687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23389383405447006, + "step": 6576 + }, + { + "epoch": 0.13156, + "grad_norm": 2.125, + "grad_norm_var": 0.03427327473958333, + "learning_rate": 0.0001, + "loss": 4.7595, + "loss/crossentropy": 2.20253586769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3217846751213074, + "step": 6578 + }, + { + "epoch": 0.1316, + "grad_norm": 2.359375, + "grad_norm_var": 0.034764607747395836, + "learning_rate": 0.0001, + "loss": 4.2771, + "loss/crossentropy": 2.1901479959487915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24521923065185547, + "step": 6580 + }, + { + "epoch": 0.13164, + "grad_norm": 2.6875, + "grad_norm_var": 0.04170633951822917, + "learning_rate": 0.0001, + "loss": 4.7105, + "loss/crossentropy": 2.0719348192214966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2640562355518341, + "step": 6582 + }, + { + "epoch": 0.13168, + "grad_norm": 2.1875, + "grad_norm_var": 0.040339152018229164, + "learning_rate": 0.0001, + "loss": 4.1487, + "loss/crossentropy": 2.331762194633484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24890532344579697, + "step": 6584 + }, + { + "epoch": 0.13172, + "grad_norm": 2.34375, + "grad_norm_var": 0.04429931640625, + "learning_rate": 0.0001, + "loss": 4.6013, + "loss/crossentropy": 1.5143779516220093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957392692565918, + "step": 6586 + }, + { + "epoch": 0.13176, + "grad_norm": 2.171875, + "grad_norm_var": 0.04088134765625, + "learning_rate": 0.0001, + "loss": 4.4311, + "loss/crossentropy": 2.301910698413849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27353671938180923, + "step": 6588 + }, + { + "epoch": 0.1318, + "grad_norm": 2.84375, + "grad_norm_var": 0.11796773274739583, + "learning_rate": 0.0001, + "loss": 4.5035, + "loss/crossentropy": 2.018579602241516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24261966347694397, + "step": 6590 + }, + { + "epoch": 0.13184, + "grad_norm": 2.203125, + "grad_norm_var": 0.11685282389322917, + "learning_rate": 0.0001, + "loss": 4.6931, + "loss/crossentropy": 1.9749983549118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727626234292984, + "step": 6592 + }, + { + "epoch": 0.13188, + "grad_norm": 2.0, + "grad_norm_var": 0.11741536458333333, + "learning_rate": 0.0001, + "loss": 4.1929, + "loss/crossentropy": 2.040414035320282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22879169881343842, + "step": 6594 + }, + { + "epoch": 0.13192, + "grad_norm": 2.453125, + "grad_norm_var": 0.11551106770833333, + "learning_rate": 0.0001, + "loss": 4.3331, + "loss/crossentropy": 1.941766619682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591909021139145, + "step": 6596 + }, + { + "epoch": 0.13196, + "grad_norm": 2.34375, + "grad_norm_var": 0.10946858723958333, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 1.8487200140953064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23654203116893768, + "step": 6598 + }, + { + "epoch": 0.132, + "grad_norm": 2.109375, + "grad_norm_var": 0.10340067545572916, + "learning_rate": 0.0001, + "loss": 4.225, + "loss/crossentropy": 1.9297338724136353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016594037413597, + "step": 6600 + }, + { + "epoch": 0.13204, + "grad_norm": 2.046875, + "grad_norm_var": 0.11005757649739584, + "learning_rate": 0.0001, + "loss": 4.3643, + "loss/crossentropy": 2.048487663269043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20044995844364166, + "step": 6602 + }, + { + "epoch": 0.13208, + "grad_norm": 2.0625, + "grad_norm_var": 0.11295572916666667, + "learning_rate": 0.0001, + "loss": 4.425, + "loss/crossentropy": 2.3620439767837524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24750277400016785, + "step": 6604 + }, + { + "epoch": 0.13212, + "grad_norm": 2.328125, + "grad_norm_var": 0.0158203125, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.1324113607406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23109012842178345, + "step": 6606 + }, + { + "epoch": 0.13216, + "grad_norm": 2.1875, + "grad_norm_var": 0.015620930989583334, + "learning_rate": 0.0001, + "loss": 4.4775, + "loss/crossentropy": 2.021477997303009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152409330010414, + "step": 6608 + }, + { + "epoch": 0.1322, + "grad_norm": 2.1875, + "grad_norm_var": 0.012669881184895834, + "learning_rate": 0.0001, + "loss": 4.6221, + "loss/crossentropy": 2.1693036556243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234656922519207, + "step": 6610 + }, + { + "epoch": 0.13224, + "grad_norm": 2.203125, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.1469, + "loss/crossentropy": 2.1138017177581787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239084094762802, + "step": 6612 + }, + { + "epoch": 0.13228, + "grad_norm": 2.28125, + "grad_norm_var": 0.008772786458333333, + "learning_rate": 0.0001, + "loss": 4.4862, + "loss/crossentropy": 2.2540252208709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2614182382822037, + "step": 6614 + }, + { + "epoch": 0.13232, + "grad_norm": 2.078125, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.2131, + "loss/crossentropy": 2.033502757549286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177339717745781, + "step": 6616 + }, + { + "epoch": 0.13236, + "grad_norm": 2.078125, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 4.2648, + "loss/crossentropy": 2.2490646839141846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22369390726089478, + "step": 6618 + }, + { + "epoch": 0.1324, + "grad_norm": 2.21875, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.14319908618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24394190311431885, + "step": 6620 + }, + { + "epoch": 0.13244, + "grad_norm": 2.390625, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.5136, + "loss/crossentropy": 2.045474410057068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22738997638225555, + "step": 6622 + }, + { + "epoch": 0.13248, + "grad_norm": 2.203125, + "grad_norm_var": 0.015816243489583333, + "learning_rate": 0.0001, + "loss": 4.5773, + "loss/crossentropy": 2.1853290796279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2565506473183632, + "step": 6624 + }, + { + "epoch": 0.13252, + "grad_norm": 2.078125, + "grad_norm_var": 0.017015584309895835, + "learning_rate": 0.0001, + "loss": 4.275, + "loss/crossentropy": 2.1161083579063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949891537427902, + "step": 6626 + }, + { + "epoch": 0.13256, + "grad_norm": 2.1875, + "grad_norm_var": 0.0167633056640625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.167048454284668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23087909072637558, + "step": 6628 + }, + { + "epoch": 0.1326, + "grad_norm": 2.21875, + "grad_norm_var": 0.0156158447265625, + "learning_rate": 0.0001, + "loss": 4.2777, + "loss/crossentropy": 2.1544610261917114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25153525173664093, + "step": 6630 + }, + { + "epoch": 0.13264, + "grad_norm": 2.484375, + "grad_norm_var": 0.018244425455729168, + "learning_rate": 0.0001, + "loss": 4.4778, + "loss/crossentropy": 2.0319228768348694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067170590162277, + "step": 6632 + }, + { + "epoch": 0.13268, + "grad_norm": 2.140625, + "grad_norm_var": 0.0162261962890625, + "learning_rate": 0.0001, + "loss": 4.4398, + "loss/crossentropy": 1.9723476767539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380966901779175, + "step": 6634 + }, + { + "epoch": 0.13272, + "grad_norm": 2.796875, + "grad_norm_var": 0.031590779622395836, + "learning_rate": 0.0001, + "loss": 4.3474, + "loss/crossentropy": 2.2833333015441895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23245477676391602, + "step": 6636 + }, + { + "epoch": 0.13276, + "grad_norm": 2.28125, + "grad_norm_var": 0.03132222493489583, + "learning_rate": 0.0001, + "loss": 4.4522, + "loss/crossentropy": 2.13793683052063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22665268182754517, + "step": 6638 + }, + { + "epoch": 0.1328, + "grad_norm": 2.734375, + "grad_norm_var": 0.04592692057291667, + "learning_rate": 0.0001, + "loss": 4.6386, + "loss/crossentropy": 2.188693881034851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2471916377544403, + "step": 6640 + }, + { + "epoch": 0.13284, + "grad_norm": 2.234375, + "grad_norm_var": 0.0438629150390625, + "learning_rate": 0.0001, + "loss": 4.6394, + "loss/crossentropy": 2.169856071472168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23427975177764893, + "step": 6642 + }, + { + "epoch": 0.13288, + "grad_norm": 2.15625, + "grad_norm_var": 0.04365132649739583, + "learning_rate": 0.0001, + "loss": 4.6498, + "loss/crossentropy": 2.1600061655044556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23366540670394897, + "step": 6644 + }, + { + "epoch": 0.13292, + "grad_norm": 2.359375, + "grad_norm_var": 0.044188435872395834, + "learning_rate": 0.0001, + "loss": 4.4521, + "loss/crossentropy": 1.822945475578308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203340008854866, + "step": 6646 + }, + { + "epoch": 0.13296, + "grad_norm": 2.15625, + "grad_norm_var": 0.045735677083333336, + "learning_rate": 0.0001, + "loss": 4.3263, + "loss/crossentropy": 1.8908653259277344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2320382297039032, + "step": 6648 + }, + { + "epoch": 0.133, + "grad_norm": 2.265625, + "grad_norm_var": 0.04363606770833333, + "learning_rate": 0.0001, + "loss": 4.5065, + "loss/crossentropy": 2.126000165939331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24388836324214935, + "step": 6650 + }, + { + "epoch": 0.13304, + "grad_norm": 2.328125, + "grad_norm_var": 0.0242828369140625, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.434928297996521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24083568900823593, + "step": 6652 + }, + { + "epoch": 0.13308, + "grad_norm": 2.203125, + "grad_norm_var": 0.023824055989583332, + "learning_rate": 0.0001, + "loss": 4.5538, + "loss/crossentropy": 2.2186567783355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2220132276415825, + "step": 6654 + }, + { + "epoch": 0.13312, + "grad_norm": 2.25, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.4934, + "loss/crossentropy": 1.849799931049347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21551364660263062, + "step": 6656 + }, + { + "epoch": 0.13316, + "grad_norm": 2.109375, + "grad_norm_var": 0.0072662353515625, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.082044243812561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21274320781230927, + "step": 6658 + }, + { + "epoch": 0.1332, + "grad_norm": 2.125, + "grad_norm_var": 0.010838826497395834, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.1957098245620728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23078418523073196, + "step": 6660 + }, + { + "epoch": 0.13324, + "grad_norm": 2.28125, + "grad_norm_var": 0.16033528645833334, + "learning_rate": 0.0001, + "loss": 4.519, + "loss/crossentropy": 2.228309690952301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27178408950567245, + "step": 6662 + }, + { + "epoch": 0.13328, + "grad_norm": 2.40625, + "grad_norm_var": 0.156298828125, + "learning_rate": 0.0001, + "loss": 4.5987, + "loss/crossentropy": 1.8185940384864807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145048901438713, + "step": 6664 + }, + { + "epoch": 0.13332, + "grad_norm": 2.171875, + "grad_norm_var": 0.1566558837890625, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.198649048805237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262621968984604, + "step": 6666 + }, + { + "epoch": 0.13336, + "grad_norm": 2.15625, + "grad_norm_var": 0.159130859375, + "learning_rate": 0.0001, + "loss": 4.5729, + "loss/crossentropy": 2.2075835466384888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23531068861484528, + "step": 6668 + }, + { + "epoch": 0.1334, + "grad_norm": 2.140625, + "grad_norm_var": 0.16243387858072916, + "learning_rate": 0.0001, + "loss": 4.2913, + "loss/crossentropy": 1.9719768166542053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206977903842926, + "step": 6670 + }, + { + "epoch": 0.13344, + "grad_norm": 2.03125, + "grad_norm_var": 0.17021484375, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 2.304553985595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23418369889259338, + "step": 6672 + }, + { + "epoch": 0.13348, + "grad_norm": 2.25, + "grad_norm_var": 0.1682281494140625, + "learning_rate": 0.0001, + "loss": 4.4485, + "loss/crossentropy": 2.212409734725952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23888318240642548, + "step": 6674 + }, + { + "epoch": 0.13352, + "grad_norm": 2.0625, + "grad_norm_var": 0.19371337890625, + "learning_rate": 0.0001, + "loss": 4.176, + "loss/crossentropy": 2.001897156238556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147163525223732, + "step": 6676 + }, + { + "epoch": 0.13356, + "grad_norm": 2.203125, + "grad_norm_var": 0.043680826822916664, + "learning_rate": 0.0001, + "loss": 4.4245, + "loss/crossentropy": 2.216760039329529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24913202226161957, + "step": 6678 + }, + { + "epoch": 0.1336, + "grad_norm": 2.09375, + "grad_norm_var": 0.04597142537434896, + "learning_rate": 0.0001, + "loss": 4.1862, + "loss/crossentropy": 1.8190750479698181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200186125934124, + "step": 6680 + }, + { + "epoch": 0.13364, + "grad_norm": 2.140625, + "grad_norm_var": 0.047548166910807294, + "learning_rate": 0.0001, + "loss": 4.7271, + "loss/crossentropy": 2.311274528503418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22400956600904465, + "step": 6682 + }, + { + "epoch": 0.13368, + "grad_norm": 2.34375, + "grad_norm_var": 0.5293841044108073, + "learning_rate": 0.0001, + "loss": 4.3852, + "loss/crossentropy": 2.0381893515586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23101551085710526, + "step": 6684 + }, + { + "epoch": 0.13372, + "grad_norm": 2.21875, + "grad_norm_var": 0.5331776936848959, + "learning_rate": 0.0001, + "loss": 4.1185, + "loss/crossentropy": 1.7441503405570984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20208899676799774, + "step": 6686 + }, + { + "epoch": 0.13376, + "grad_norm": 2.171875, + "grad_norm_var": 0.5252593994140625, + "learning_rate": 0.0001, + "loss": 4.3835, + "loss/crossentropy": 1.8874938488006592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2255900353193283, + "step": 6688 + }, + { + "epoch": 0.1338, + "grad_norm": 2.3125, + "grad_norm_var": 0.5284006754557292, + "learning_rate": 0.0001, + "loss": 4.2563, + "loss/crossentropy": 2.2768125534057617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21636968851089478, + "step": 6690 + }, + { + "epoch": 0.13384, + "grad_norm": 2.09375, + "grad_norm_var": 0.5063954671223958, + "learning_rate": 0.0001, + "loss": 4.278, + "loss/crossentropy": 2.0253931283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28937554359436035, + "step": 6692 + }, + { + "epoch": 0.13388, + "grad_norm": 2.28125, + "grad_norm_var": 0.5079661051432292, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.1940718293190002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23384064435958862, + "step": 6694 + }, + { + "epoch": 0.13392, + "grad_norm": 2.078125, + "grad_norm_var": 0.4987993876139323, + "learning_rate": 0.0001, + "loss": 4.5066, + "loss/crossentropy": 2.071534514427185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146512120962143, + "step": 6696 + }, + { + "epoch": 0.13396, + "grad_norm": 2.28125, + "grad_norm_var": 0.5017534891764323, + "learning_rate": 0.0001, + "loss": 4.348, + "loss/crossentropy": 1.8530714511871338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21837462484836578, + "step": 6698 + }, + { + "epoch": 0.134, + "grad_norm": 2.28125, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 0.0001, + "loss": 4.521, + "loss/crossentropy": 2.210664451122284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199932038784027, + "step": 6700 + }, + { + "epoch": 0.13404, + "grad_norm": 2.171875, + "grad_norm_var": 0.011506144205729167, + "learning_rate": 0.0001, + "loss": 4.4226, + "loss/crossentropy": 1.8530223965644836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209881991147995, + "step": 6702 + }, + { + "epoch": 0.13408, + "grad_norm": 2.234375, + "grad_norm_var": 0.011839803059895833, + "learning_rate": 0.0001, + "loss": 4.4302, + "loss/crossentropy": 1.8609183430671692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096765041351318, + "step": 6704 + }, + { + "epoch": 0.13412, + "grad_norm": 2.59375, + "grad_norm_var": 0.021214803059895832, + "learning_rate": 0.0001, + "loss": 4.8429, + "loss/crossentropy": 2.33315110206604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26414938271045685, + "step": 6706 + }, + { + "epoch": 0.13416, + "grad_norm": 2.359375, + "grad_norm_var": 0.04798075358072917, + "learning_rate": 0.0001, + "loss": 4.6054, + "loss/crossentropy": 2.2656116485595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2769838646054268, + "step": 6708 + }, + { + "epoch": 0.1342, + "grad_norm": 2.1875, + "grad_norm_var": 0.0466705322265625, + "learning_rate": 0.0001, + "loss": 4.4875, + "loss/crossentropy": 2.2131590843200684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23640615493059158, + "step": 6710 + }, + { + "epoch": 0.13424, + "grad_norm": 2.25, + "grad_norm_var": 0.044831339518229166, + "learning_rate": 0.0001, + "loss": 4.1554, + "loss/crossentropy": 1.8667671084403992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212738998234272, + "step": 6712 + }, + { + "epoch": 0.13428, + "grad_norm": 2.203125, + "grad_norm_var": 0.04480692545572917, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 1.9699830412864685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23507894575595856, + "step": 6714 + }, + { + "epoch": 0.13432, + "grad_norm": 2.078125, + "grad_norm_var": 0.04632161458333333, + "learning_rate": 0.0001, + "loss": 4.1571, + "loss/crossentropy": 1.8108918070793152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192004919052124, + "step": 6716 + }, + { + "epoch": 0.13436, + "grad_norm": 2.203125, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.0528674125671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22221814841032028, + "step": 6718 + }, + { + "epoch": 0.1344, + "grad_norm": 2.046875, + "grad_norm_var": 0.046305338541666664, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 1.7881956696510315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330178529024124, + "step": 6720 + }, + { + "epoch": 0.13444, + "grad_norm": 2.3125, + "grad_norm_var": 0.039159138997395836, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.2016018629074097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22042546421289444, + "step": 6722 + }, + { + "epoch": 0.13448, + "grad_norm": 2.171875, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 4.3159, + "loss/crossentropy": 1.9661846160888672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542846411466599, + "step": 6724 + }, + { + "epoch": 0.13452, + "grad_norm": 2.328125, + "grad_norm_var": 0.031037394205729166, + "learning_rate": 0.0001, + "loss": 4.4473, + "loss/crossentropy": 2.1073482036590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23370730876922607, + "step": 6726 + }, + { + "epoch": 0.13456, + "grad_norm": 2.25, + "grad_norm_var": 0.030826822916666666, + "learning_rate": 0.0001, + "loss": 4.6069, + "loss/crossentropy": 2.1937917470932007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22414422780275345, + "step": 6728 + }, + { + "epoch": 0.1346, + "grad_norm": 2.109375, + "grad_norm_var": 0.030436197916666668, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 2.078732967376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21227504312992096, + "step": 6730 + }, + { + "epoch": 0.13464, + "grad_norm": 2.34375, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 4.3043, + "loss/crossentropy": 1.9002525806427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21004119515419006, + "step": 6732 + }, + { + "epoch": 0.13468, + "grad_norm": 2.15625, + "grad_norm_var": 0.030794270833333335, + "learning_rate": 0.0001, + "loss": 4.5156, + "loss/crossentropy": 2.055518925189972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610142290592194, + "step": 6734 + }, + { + "epoch": 0.13472, + "grad_norm": 2.1875, + "grad_norm_var": 0.028473917643229166, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.0521084666252136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23528365790843964, + "step": 6736 + }, + { + "epoch": 0.13476, + "grad_norm": 2.203125, + "grad_norm_var": 0.02730712890625, + "learning_rate": 0.0001, + "loss": 4.5051, + "loss/crossentropy": 2.2536301612854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21544227004051208, + "step": 6738 + }, + { + "epoch": 0.1348, + "grad_norm": 2.1875, + "grad_norm_var": 0.027179972330729166, + "learning_rate": 0.0001, + "loss": 4.5164, + "loss/crossentropy": 2.2610143423080444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2532905787229538, + "step": 6740 + }, + { + "epoch": 0.13484, + "grad_norm": 2.109375, + "grad_norm_var": 0.00836181640625, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 1.8840081095695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21514790505170822, + "step": 6742 + }, + { + "epoch": 0.13488, + "grad_norm": 2.109375, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.2532, + "loss/crossentropy": 2.0841002464294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328692153096199, + "step": 6744 + }, + { + "epoch": 0.13492, + "grad_norm": 2.390625, + "grad_norm_var": 0.010798136393229166, + "learning_rate": 0.0001, + "loss": 4.6335, + "loss/crossentropy": 2.507196068763733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2736222892999649, + "step": 6746 + }, + { + "epoch": 0.13496, + "grad_norm": 2.421875, + "grad_norm_var": 0.016527303059895835, + "learning_rate": 0.0001, + "loss": 4.189, + "loss/crossentropy": 2.0180357098579407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328195720911026, + "step": 6748 + }, + { + "epoch": 0.135, + "grad_norm": 2.15625, + "grad_norm_var": 0.016097005208333334, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.2457560300827026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26572495698928833, + "step": 6750 + }, + { + "epoch": 0.13504, + "grad_norm": 2.6875, + "grad_norm_var": 0.031493123372395834, + "learning_rate": 0.0001, + "loss": 4.885, + "loss/crossentropy": 2.1280853748321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22716474533081055, + "step": 6752 + }, + { + "epoch": 0.13508, + "grad_norm": 2.25, + "grad_norm_var": 0.03216145833333333, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.1843650341033936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24630828201770782, + "step": 6754 + }, + { + "epoch": 0.13512, + "grad_norm": 2.21875, + "grad_norm_var": 0.03390299479166667, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 1.7955012917518616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614011585712433, + "step": 6756 + }, + { + "epoch": 0.13516, + "grad_norm": 2.1875, + "grad_norm_var": 0.028563435872395834, + "learning_rate": 0.0001, + "loss": 4.3546, + "loss/crossentropy": 1.9315852522850037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22918011993169785, + "step": 6758 + }, + { + "epoch": 0.1352, + "grad_norm": 2.140625, + "grad_norm_var": 0.028055826822916668, + "learning_rate": 0.0001, + "loss": 4.2659, + "loss/crossentropy": 1.982887327671051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21635843813419342, + "step": 6760 + }, + { + "epoch": 0.13524, + "grad_norm": 2.0625, + "grad_norm_var": 0.029215494791666668, + "learning_rate": 0.0001, + "loss": 4.2828, + "loss/crossentropy": 2.25021892786026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22931701689958572, + "step": 6762 + }, + { + "epoch": 0.13528, + "grad_norm": 2.34375, + "grad_norm_var": 0.02252197265625, + "learning_rate": 0.0001, + "loss": 4.5991, + "loss/crossentropy": 2.5220746994018555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406606376171112, + "step": 6764 + }, + { + "epoch": 0.13532, + "grad_norm": 2.375, + "grad_norm_var": 0.022386678059895835, + "learning_rate": 0.0001, + "loss": 4.5143, + "loss/crossentropy": 1.8115127086639404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220379076898098, + "step": 6766 + }, + { + "epoch": 0.13536, + "grad_norm": 2.0625, + "grad_norm_var": 0.010856119791666667, + "learning_rate": 0.0001, + "loss": 4.5113, + "loss/crossentropy": 1.8998088240623474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22993376106023788, + "step": 6768 + }, + { + "epoch": 0.1354, + "grad_norm": 2.09375, + "grad_norm_var": 0.01051025390625, + "learning_rate": 0.0001, + "loss": 4.28, + "loss/crossentropy": 2.0183660984039307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21991200000047684, + "step": 6770 + }, + { + "epoch": 0.13544, + "grad_norm": 2.296875, + "grad_norm_var": 0.009847005208333334, + "learning_rate": 0.0001, + "loss": 4.6224, + "loss/crossentropy": 2.1927448511123657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536699026823044, + "step": 6772 + }, + { + "epoch": 0.13548, + "grad_norm": 2.234375, + "grad_norm_var": 0.014404296875, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.92184317111969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20716708898544312, + "step": 6774 + }, + { + "epoch": 0.13552, + "grad_norm": 2.296875, + "grad_norm_var": 0.016813151041666665, + "learning_rate": 0.0001, + "loss": 4.7779, + "loss/crossentropy": 2.2437468767166138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26381388306617737, + "step": 6776 + }, + { + "epoch": 0.13556, + "grad_norm": 2.0625, + "grad_norm_var": 0.016722615559895834, + "learning_rate": 0.0001, + "loss": 4.2907, + "loss/crossentropy": 2.087414026260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21566492319107056, + "step": 6778 + }, + { + "epoch": 0.1356, + "grad_norm": 2.140625, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 4.4273, + "loss/crossentropy": 2.1936367750167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22492723166942596, + "step": 6780 + }, + { + "epoch": 0.13564, + "grad_norm": 2.109375, + "grad_norm_var": 0.014046223958333333, + "learning_rate": 0.0001, + "loss": 4.2992, + "loss/crossentropy": 1.7642306685447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20992937684059143, + "step": 6782 + }, + { + "epoch": 0.13568, + "grad_norm": 2.140625, + "grad_norm_var": 0.014371744791666667, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 2.01781964302063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22552715986967087, + "step": 6784 + }, + { + "epoch": 0.13572, + "grad_norm": 2.140625, + "grad_norm_var": 0.016402180989583334, + "learning_rate": 0.0001, + "loss": 4.4708, + "loss/crossentropy": 2.0788158774375916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24181769788265228, + "step": 6786 + }, + { + "epoch": 0.13576, + "grad_norm": 2.171875, + "grad_norm_var": 0.01539306640625, + "learning_rate": 0.0001, + "loss": 4.2163, + "loss/crossentropy": 2.0424017310142517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705424785614014, + "step": 6788 + }, + { + "epoch": 0.1358, + "grad_norm": 2.46875, + "grad_norm_var": 0.019025675455729165, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 1.6175345182418823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899409219622612, + "step": 6790 + }, + { + "epoch": 0.13584, + "grad_norm": 2.3125, + "grad_norm_var": 0.016852823893229167, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.004386007785797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22395263612270355, + "step": 6792 + }, + { + "epoch": 0.13588, + "grad_norm": 2.109375, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 1.9224132895469666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326791137456894, + "step": 6794 + }, + { + "epoch": 0.13592, + "grad_norm": 2.1875, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 4.4768, + "loss/crossentropy": 1.8331453204154968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21586360037326813, + "step": 6796 + }, + { + "epoch": 0.13596, + "grad_norm": 2.1875, + "grad_norm_var": 0.0197906494140625, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.535244107246399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24744001775979996, + "step": 6798 + }, + { + "epoch": 0.136, + "grad_norm": 2.25, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 4.4444, + "loss/crossentropy": 2.0433249473571777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502296343445778, + "step": 6800 + }, + { + "epoch": 0.13604, + "grad_norm": 2.09375, + "grad_norm_var": 0.0221588134765625, + "learning_rate": 0.0001, + "loss": 4.4619, + "loss/crossentropy": 2.35608172416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2536633685231209, + "step": 6802 + }, + { + "epoch": 0.13608, + "grad_norm": 2.1875, + "grad_norm_var": 0.022362263997395833, + "learning_rate": 0.0001, + "loss": 4.4493, + "loss/crossentropy": 2.1230265498161316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276441976428032, + "step": 6804 + }, + { + "epoch": 0.13612, + "grad_norm": 2.15625, + "grad_norm_var": 0.015925089518229168, + "learning_rate": 0.0001, + "loss": 4.2125, + "loss/crossentropy": 2.0186346769332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22158686816692352, + "step": 6806 + }, + { + "epoch": 0.13616, + "grad_norm": 2.28125, + "grad_norm_var": 0.0151763916015625, + "learning_rate": 0.0001, + "loss": 4.6065, + "loss/crossentropy": 2.4136343002319336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22668009996414185, + "step": 6808 + }, + { + "epoch": 0.1362, + "grad_norm": 2.109375, + "grad_norm_var": 0.012495930989583333, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.1241788268089294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394256830215454, + "step": 6810 + }, + { + "epoch": 0.13624, + "grad_norm": 2.4375, + "grad_norm_var": 0.017145792643229168, + "learning_rate": 0.0001, + "loss": 4.6063, + "loss/crossentropy": 1.9051874279975891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560805454850197, + "step": 6812 + }, + { + "epoch": 0.13628, + "grad_norm": 2.234375, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 4.2419, + "loss/crossentropy": 1.9248363375663757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22457116842269897, + "step": 6814 + }, + { + "epoch": 0.13632, + "grad_norm": 1.921875, + "grad_norm_var": 0.022264607747395835, + "learning_rate": 0.0001, + "loss": 4.5953, + "loss/crossentropy": 2.3065048456192017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.245137557387352, + "step": 6816 + }, + { + "epoch": 0.13636, + "grad_norm": 2.234375, + "grad_norm_var": 0.0152496337890625, + "learning_rate": 0.0001, + "loss": 4.6096, + "loss/crossentropy": 2.152611255645752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22794383764266968, + "step": 6818 + }, + { + "epoch": 0.1364, + "grad_norm": 2.09375, + "grad_norm_var": 0.016242472330729167, + "learning_rate": 0.0001, + "loss": 4.129, + "loss/crossentropy": 1.9548735618591309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20764236897230148, + "step": 6820 + }, + { + "epoch": 0.13644, + "grad_norm": 2.1875, + "grad_norm_var": 0.016136678059895833, + "learning_rate": 0.0001, + "loss": 4.3501, + "loss/crossentropy": 2.11979341506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793393582105637, + "step": 6822 + }, + { + "epoch": 0.13648, + "grad_norm": 2.15625, + "grad_norm_var": 0.016748046875, + "learning_rate": 0.0001, + "loss": 4.3338, + "loss/crossentropy": 2.351949691772461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366446554660797, + "step": 6824 + }, + { + "epoch": 0.13652, + "grad_norm": 2.0625, + "grad_norm_var": 0.018843587239583334, + "learning_rate": 0.0001, + "loss": 4.3764, + "loss/crossentropy": 2.1197460889816284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24759536981582642, + "step": 6826 + }, + { + "epoch": 0.13656, + "grad_norm": 2.0625, + "grad_norm_var": 0.013264973958333334, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 1.8491687178611755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777105540037155, + "step": 6828 + }, + { + "epoch": 0.1366, + "grad_norm": 2.25, + "grad_norm_var": 0.013459269205729167, + "learning_rate": 0.0001, + "loss": 4.4867, + "loss/crossentropy": 1.964136004447937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368694394826889, + "step": 6830 + }, + { + "epoch": 0.13664, + "grad_norm": 2.140625, + "grad_norm_var": 0.006859334309895834, + "learning_rate": 0.0001, + "loss": 4.469, + "loss/crossentropy": 1.8988104462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548720866441727, + "step": 6832 + }, + { + "epoch": 0.13668, + "grad_norm": 2.125, + "grad_norm_var": 0.0059855143229166664, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 1.757002353668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21120281517505646, + "step": 6834 + }, + { + "epoch": 0.13672, + "grad_norm": 2.109375, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 4.4239, + "loss/crossentropy": 1.9414420127868652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22160177677869797, + "step": 6836 + }, + { + "epoch": 0.13676, + "grad_norm": 2.0625, + "grad_norm_var": 0.0072174072265625, + "learning_rate": 0.0001, + "loss": 4.4294, + "loss/crossentropy": 2.280028223991394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24696747958660126, + "step": 6838 + }, + { + "epoch": 0.1368, + "grad_norm": 2.203125, + "grad_norm_var": 0.006787109375, + "learning_rate": 0.0001, + "loss": 4.523, + "loss/crossentropy": 2.106986403465271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24022039771080017, + "step": 6840 + }, + { + "epoch": 0.13684, + "grad_norm": 2.15625, + "grad_norm_var": 0.0349517822265625, + "learning_rate": 0.0001, + "loss": 4.2609, + "loss/crossentropy": 1.9540700912475586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968660056591034, + "step": 6842 + }, + { + "epoch": 0.13688, + "grad_norm": 2.0625, + "grad_norm_var": 0.03394775390625, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.6771780252456665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20230162143707275, + "step": 6844 + }, + { + "epoch": 0.13692, + "grad_norm": 2.5625, + "grad_norm_var": 0.04419657389322917, + "learning_rate": 0.0001, + "loss": 4.7567, + "loss/crossentropy": 2.059873402118683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.276339128613472, + "step": 6846 + }, + { + "epoch": 0.13696, + "grad_norm": 2.0625, + "grad_norm_var": 0.04684244791666667, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 1.7943353056907654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893974468111992, + "step": 6848 + }, + { + "epoch": 0.137, + "grad_norm": 2.265625, + "grad_norm_var": 0.046223958333333336, + "learning_rate": 0.0001, + "loss": 4.687, + "loss/crossentropy": 2.314136028289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24898302555084229, + "step": 6850 + }, + { + "epoch": 0.13704, + "grad_norm": 2.328125, + "grad_norm_var": 0.04383036295572917, + "learning_rate": 0.0001, + "loss": 4.4257, + "loss/crossentropy": 2.0062466263771057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169535398483276, + "step": 6852 + }, + { + "epoch": 0.13708, + "grad_norm": 2.09375, + "grad_norm_var": 0.044873046875, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.3600821495056152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25484780967235565, + "step": 6854 + }, + { + "epoch": 0.13712, + "grad_norm": 2.28125, + "grad_norm_var": 0.043675740559895836, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 1.885023295879364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.255520723760128, + "step": 6856 + }, + { + "epoch": 0.13716, + "grad_norm": 2.234375, + "grad_norm_var": 0.019806925455729166, + "learning_rate": 0.0001, + "loss": 4.6493, + "loss/crossentropy": 2.2864162921905518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469403594732285, + "step": 6858 + }, + { + "epoch": 0.1372, + "grad_norm": 2.09375, + "grad_norm_var": 0.018961588541666668, + "learning_rate": 0.0001, + "loss": 4.4017, + "loss/crossentropy": 1.908643126487732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016740590333939, + "step": 6860 + }, + { + "epoch": 0.13724, + "grad_norm": 2.3125, + "grad_norm_var": 0.010282389322916667, + "learning_rate": 0.0001, + "loss": 4.7255, + "loss/crossentropy": 2.1028786301612854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632919788360596, + "step": 6862 + }, + { + "epoch": 0.13728, + "grad_norm": 2.546875, + "grad_norm_var": 0.01539306640625, + "learning_rate": 0.0001, + "loss": 4.4043, + "loss/crossentropy": 2.0363592505455017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22131157666444778, + "step": 6864 + }, + { + "epoch": 0.13732, + "grad_norm": 2.171875, + "grad_norm_var": 0.0158843994140625, + "learning_rate": 0.0001, + "loss": 4.4357, + "loss/crossentropy": 2.030495524406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233994759619236, + "step": 6866 + }, + { + "epoch": 0.13736, + "grad_norm": 2.046875, + "grad_norm_var": 0.018033854166666665, + "learning_rate": 0.0001, + "loss": 4.3036, + "loss/crossentropy": 1.6365603804588318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909056007862091, + "step": 6868 + }, + { + "epoch": 0.1374, + "grad_norm": 2.0625, + "grad_norm_var": 0.015217081705729166, + "learning_rate": 0.0001, + "loss": 4.655, + "loss/crossentropy": 2.205111026763916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267644703388214, + "step": 6870 + }, + { + "epoch": 0.13744, + "grad_norm": 2.0625, + "grad_norm_var": 0.016437784830729166, + "learning_rate": 0.0001, + "loss": 4.4207, + "loss/crossentropy": 2.0179646015167236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307143434882164, + "step": 6872 + }, + { + "epoch": 0.13748, + "grad_norm": 2.09375, + "grad_norm_var": 0.018290201822916668, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 1.9697463512420654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207925096154213, + "step": 6874 + }, + { + "epoch": 0.13752, + "grad_norm": 2.265625, + "grad_norm_var": 0.019189453125, + "learning_rate": 0.0001, + "loss": 4.5691, + "loss/crossentropy": 2.186875820159912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25762687623500824, + "step": 6876 + }, + { + "epoch": 0.13756, + "grad_norm": 2.109375, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 2.3203837871551514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23470903187990189, + "step": 6878 + }, + { + "epoch": 0.1376, + "grad_norm": 2.1875, + "grad_norm_var": 0.0066640218098958336, + "learning_rate": 0.0001, + "loss": 4.4154, + "loss/crossentropy": 2.2642472982406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454553246498108, + "step": 6880 + }, + { + "epoch": 0.13764, + "grad_norm": 2.1875, + "grad_norm_var": 0.008687337239583334, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 1.9313859343528748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21797804534435272, + "step": 6882 + }, + { + "epoch": 0.13768, + "grad_norm": 3.4375, + "grad_norm_var": 0.11204020182291667, + "learning_rate": 0.0001, + "loss": 4.7846, + "loss/crossentropy": 2.5469977855682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2651800066232681, + "step": 6884 + }, + { + "epoch": 0.13772, + "grad_norm": 2.296875, + "grad_norm_var": 0.11030171712239584, + "learning_rate": 0.0001, + "loss": 4.4893, + "loss/crossentropy": 2.549328088760376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2566085457801819, + "step": 6886 + }, + { + "epoch": 0.13776, + "grad_norm": 2.03125, + "grad_norm_var": 0.1126617431640625, + "learning_rate": 0.0001, + "loss": 4.4927, + "loss/crossentropy": 2.3094369769096375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24709751456975937, + "step": 6888 + }, + { + "epoch": 0.1378, + "grad_norm": 2.109375, + "grad_norm_var": 0.1097808837890625, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 1.5071046948432922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776180386543274, + "step": 6890 + }, + { + "epoch": 0.13784, + "grad_norm": 2.078125, + "grad_norm_var": 0.1101226806640625, + "learning_rate": 0.0001, + "loss": 4.3705, + "loss/crossentropy": 2.0064221620559692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20836234837770462, + "step": 6892 + }, + { + "epoch": 0.13788, + "grad_norm": 2.078125, + "grad_norm_var": 0.10816650390625, + "learning_rate": 0.0001, + "loss": 4.3608, + "loss/crossentropy": 2.1216301321983337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22151901572942734, + "step": 6894 + }, + { + "epoch": 0.13792, + "grad_norm": 2.25, + "grad_norm_var": 0.1073150634765625, + "learning_rate": 0.0001, + "loss": 4.4168, + "loss/crossentropy": 1.8417679071426392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21999332308769226, + "step": 6896 + }, + { + "epoch": 0.13796, + "grad_norm": 2.328125, + "grad_norm_var": 0.10695699055989584, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 2.4651769399642944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27029043436050415, + "step": 6898 + }, + { + "epoch": 0.138, + "grad_norm": 2.21875, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 1.7225988507270813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20930497348308563, + "step": 6900 + }, + { + "epoch": 0.13804, + "grad_norm": 2.34375, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.1156, + "loss/crossentropy": 2.1218297481536865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21276423335075378, + "step": 6902 + }, + { + "epoch": 0.13808, + "grad_norm": 2.25, + "grad_norm_var": 0.010856119791666667, + "learning_rate": 0.0001, + "loss": 4.2706, + "loss/crossentropy": 2.040019452571869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2438269630074501, + "step": 6904 + }, + { + "epoch": 0.13812, + "grad_norm": 15.8125, + "grad_norm_var": 11.600536092122395, + "learning_rate": 0.0001, + "loss": 4.5041, + "loss/crossentropy": 1.8229625225067139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22197778522968292, + "step": 6906 + }, + { + "epoch": 0.13816, + "grad_norm": 2.25, + "grad_norm_var": 11.543973795572917, + "learning_rate": 0.0001, + "loss": 4.7087, + "loss/crossentropy": 2.453408360481262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24046239256858826, + "step": 6908 + }, + { + "epoch": 0.1382, + "grad_norm": 2.046875, + "grad_norm_var": 11.55152587890625, + "learning_rate": 0.0001, + "loss": 4.4806, + "loss/crossentropy": 2.2724320888519287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23739346861839294, + "step": 6910 + }, + { + "epoch": 0.13824, + "grad_norm": 2.125, + "grad_norm_var": 11.562272135416666, + "learning_rate": 0.0001, + "loss": 4.2747, + "loss/crossentropy": 2.2382686138153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22692064195871353, + "step": 6912 + }, + { + "epoch": 0.13828, + "grad_norm": 2.3125, + "grad_norm_var": 11.54869384765625, + "learning_rate": 0.0001, + "loss": 4.6867, + "loss/crossentropy": 2.021562337875366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24746088683605194, + "step": 6914 + }, + { + "epoch": 0.13832, + "grad_norm": 2.21875, + "grad_norm_var": 11.546556599934895, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.1071943044662476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144002616405487, + "step": 6916 + }, + { + "epoch": 0.13836, + "grad_norm": 2.28125, + "grad_norm_var": 11.550846354166667, + "learning_rate": 0.0001, + "loss": 4.2686, + "loss/crossentropy": 1.9641517400741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22584721446037292, + "step": 6918 + }, + { + "epoch": 0.1384, + "grad_norm": 2.078125, + "grad_norm_var": 11.559130859375, + "learning_rate": 0.0001, + "loss": 4.3194, + "loss/crossentropy": 2.4430564641952515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24351171404123306, + "step": 6920 + }, + { + "epoch": 0.13844, + "grad_norm": 2.125, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.2237725257873535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248261496424675, + "step": 6922 + }, + { + "epoch": 0.13848, + "grad_norm": 2.09375, + "grad_norm_var": 0.015412394205729167, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.7291913628578186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805100560188293, + "step": 6924 + }, + { + "epoch": 0.13852, + "grad_norm": 2.15625, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.3972, + "loss/crossentropy": 1.807108223438263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386626571416855, + "step": 6926 + }, + { + "epoch": 0.13856, + "grad_norm": 2.28125, + "grad_norm_var": 0.01422119140625, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.510676622390747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24666880816221237, + "step": 6928 + }, + { + "epoch": 0.1386, + "grad_norm": 2.015625, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.2006, + "loss/crossentropy": 1.9420115947723389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214776575565338, + "step": 6930 + }, + { + "epoch": 0.13864, + "grad_norm": 2.3125, + "grad_norm_var": 0.010837554931640625, + "learning_rate": 0.0001, + "loss": 4.4445, + "loss/crossentropy": 2.2288190722465515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23719585686922073, + "step": 6932 + }, + { + "epoch": 0.13868, + "grad_norm": 2.09375, + "grad_norm_var": 0.011043294270833334, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.04274183511734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21653369069099426, + "step": 6934 + }, + { + "epoch": 0.13872, + "grad_norm": 2.296875, + "grad_norm_var": 0.0121002197265625, + "learning_rate": 0.0001, + "loss": 4.4041, + "loss/crossentropy": 1.9149779081344604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19941364973783493, + "step": 6936 + }, + { + "epoch": 0.13876, + "grad_norm": 2.15625, + "grad_norm_var": 0.011554972330729166, + "learning_rate": 0.0001, + "loss": 4.2577, + "loss/crossentropy": 1.7983179092407227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979590386152267, + "step": 6938 + }, + { + "epoch": 0.1388, + "grad_norm": 2.296875, + "grad_norm_var": 0.023164876302083335, + "learning_rate": 0.0001, + "loss": 4.3314, + "loss/crossentropy": 2.1919915080070496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22652295976877213, + "step": 6940 + }, + { + "epoch": 0.13884, + "grad_norm": 2.21875, + "grad_norm_var": 0.023152669270833332, + "learning_rate": 0.0001, + "loss": 4.494, + "loss/crossentropy": 2.0362821221351624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24296525120735168, + "step": 6942 + }, + { + "epoch": 0.13888, + "grad_norm": 2.21875, + "grad_norm_var": 0.023653157552083335, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 2.0371538400650024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217850610613823, + "step": 6944 + }, + { + "epoch": 0.13892, + "grad_norm": 2.5, + "grad_norm_var": 0.0268310546875, + "learning_rate": 0.0001, + "loss": 4.3371, + "loss/crossentropy": 1.9137988686561584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218162938952446, + "step": 6946 + }, + { + "epoch": 0.13896, + "grad_norm": 2.46875, + "grad_norm_var": 0.03438898722330729, + "learning_rate": 0.0001, + "loss": 4.6521, + "loss/crossentropy": 2.3215843439102173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576482892036438, + "step": 6948 + }, + { + "epoch": 0.139, + "grad_norm": 2.171875, + "grad_norm_var": 0.026590983072916668, + "learning_rate": 0.0001, + "loss": 4.6004, + "loss/crossentropy": 2.169154405593872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24629274010658264, + "step": 6950 + }, + { + "epoch": 0.13904, + "grad_norm": 2.203125, + "grad_norm_var": 0.03173421223958333, + "learning_rate": 0.0001, + "loss": 4.3549, + "loss/crossentropy": 2.1355135440826416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316850870847702, + "step": 6952 + }, + { + "epoch": 0.13908, + "grad_norm": 2.140625, + "grad_norm_var": 0.03183186848958333, + "learning_rate": 0.0001, + "loss": 4.3462, + "loss/crossentropy": 2.264985144138336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23467915505170822, + "step": 6954 + }, + { + "epoch": 0.13912, + "grad_norm": 2.203125, + "grad_norm_var": 0.025926717122395835, + "learning_rate": 0.0001, + "loss": 4.6559, + "loss/crossentropy": 2.1007654666900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376948595046997, + "step": 6956 + }, + { + "epoch": 0.13916, + "grad_norm": 2.28125, + "grad_norm_var": 0.02603759765625, + "learning_rate": 0.0001, + "loss": 4.4871, + "loss/crossentropy": 2.284608840942383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24811238050460815, + "step": 6958 + }, + { + "epoch": 0.1392, + "grad_norm": 2.125, + "grad_norm_var": 0.025911458333333335, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 1.657732367515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19541934877634048, + "step": 6960 + }, + { + "epoch": 0.13924, + "grad_norm": 2.171875, + "grad_norm_var": 0.022391764322916667, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 1.9607325792312622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2269211858510971, + "step": 6962 + }, + { + "epoch": 0.13928, + "grad_norm": 2.09375, + "grad_norm_var": 0.006571451822916667, + "learning_rate": 0.0001, + "loss": 4.3759, + "loss/crossentropy": 1.7454752326011658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21780573576688766, + "step": 6964 + }, + { + "epoch": 0.13932, + "grad_norm": 2.421875, + "grad_norm_var": 0.010350545247395834, + "learning_rate": 0.0001, + "loss": 4.7664, + "loss/crossentropy": 2.001866638660431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929454922676086, + "step": 6966 + }, + { + "epoch": 0.13936, + "grad_norm": 2.21875, + "grad_norm_var": 0.008006795247395834, + "learning_rate": 0.0001, + "loss": 4.4181, + "loss/crossentropy": 1.9167855978012085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22114858031272888, + "step": 6968 + }, + { + "epoch": 0.1394, + "grad_norm": 2.15625, + "grad_norm_var": 0.008003743489583333, + "learning_rate": 0.0001, + "loss": 4.2284, + "loss/crossentropy": 2.0324739813804626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271320760250092, + "step": 6970 + }, + { + "epoch": 0.13944, + "grad_norm": 2.25, + "grad_norm_var": 0.016307576497395834, + "learning_rate": 0.0001, + "loss": 4.5375, + "loss/crossentropy": 2.162013590335846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832238674163818, + "step": 6972 + }, + { + "epoch": 0.13948, + "grad_norm": 2.203125, + "grad_norm_var": 0.018944295247395833, + "learning_rate": 0.0001, + "loss": 4.1406, + "loss/crossentropy": 2.074672818183899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2402098923921585, + "step": 6974 + }, + { + "epoch": 0.13952, + "grad_norm": 2.0625, + "grad_norm_var": 0.019758097330729165, + "learning_rate": 0.0001, + "loss": 4.4221, + "loss/crossentropy": 1.9982299208641052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21126113086938858, + "step": 6976 + }, + { + "epoch": 0.13956, + "grad_norm": 2.171875, + "grad_norm_var": 0.021751912434895833, + "learning_rate": 0.0001, + "loss": 4.3391, + "loss/crossentropy": 1.944049894809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296794354915619, + "step": 6978 + }, + { + "epoch": 0.1396, + "grad_norm": 2.21875, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.308506488800049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2589666247367859, + "step": 6980 + }, + { + "epoch": 0.13964, + "grad_norm": 2.296875, + "grad_norm_var": 0.017724609375, + "learning_rate": 0.0001, + "loss": 4.2867, + "loss/crossentropy": 2.129163682460785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21905581653118134, + "step": 6982 + }, + { + "epoch": 0.13968, + "grad_norm": 2.234375, + "grad_norm_var": 0.017902628580729166, + "learning_rate": 0.0001, + "loss": 4.3023, + "loss/crossentropy": 1.8560669422149658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119361013174057, + "step": 6984 + }, + { + "epoch": 0.13972, + "grad_norm": 2.140625, + "grad_norm_var": 0.018602498372395835, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 1.8194095492362976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20872193574905396, + "step": 6986 + }, + { + "epoch": 0.13976, + "grad_norm": 2.15625, + "grad_norm_var": 0.005501302083333334, + "learning_rate": 0.0001, + "loss": 4.2612, + "loss/crossentropy": 2.0200153589248657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20160746574401855, + "step": 6988 + }, + { + "epoch": 0.1398, + "grad_norm": 2.359375, + "grad_norm_var": 0.007298787434895833, + "learning_rate": 0.0001, + "loss": 4.2757, + "loss/crossentropy": 1.982479751110077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23396103084087372, + "step": 6990 + }, + { + "epoch": 0.13984, + "grad_norm": 2.171875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 4.5075, + "loss/crossentropy": 2.17054283618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23827539384365082, + "step": 6992 + }, + { + "epoch": 0.13988, + "grad_norm": 2.171875, + "grad_norm_var": 0.0056640625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 1.619499921798706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917721927165985, + "step": 6994 + }, + { + "epoch": 0.13992, + "grad_norm": 2.15625, + "grad_norm_var": 0.005353800455729167, + "learning_rate": 0.0001, + "loss": 4.3833, + "loss/crossentropy": 2.1082500219345093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23768241703510284, + "step": 6996 + }, + { + "epoch": 0.13996, + "grad_norm": 2.1875, + "grad_norm_var": 0.004076131184895833, + "learning_rate": 0.0001, + "loss": 4.6731, + "loss/crossentropy": 1.8480825424194336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21046122163534164, + "step": 6998 + }, + { + "epoch": 0.14, + "grad_norm": 2.4375, + "grad_norm_var": 0.008463541666666666, + "learning_rate": 0.0001, + "loss": 4.5285, + "loss/crossentropy": 2.0547631978988647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22980494797229767, + "step": 7000 + }, + { + "epoch": 0.14004, + "grad_norm": 2.15625, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.0695141553878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100546732544899, + "step": 7002 + }, + { + "epoch": 0.14008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012277984619140625, + "learning_rate": 0.0001, + "loss": 4.3716, + "loss/crossentropy": 2.105263113975525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24552470445632935, + "step": 7004 + }, + { + "epoch": 0.14012, + "grad_norm": 2.375, + "grad_norm_var": 0.014427693684895833, + "learning_rate": 0.0001, + "loss": 4.3566, + "loss/crossentropy": 2.03000670671463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24780434370040894, + "step": 7006 + }, + { + "epoch": 0.14016, + "grad_norm": 2.40625, + "grad_norm_var": 0.0173980712890625, + "learning_rate": 0.0001, + "loss": 4.4758, + "loss/crossentropy": 2.288944959640503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2760937511920929, + "step": 7008 + }, + { + "epoch": 0.1402, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06544570922851563, + "learning_rate": 0.0001, + "loss": 3.9326, + "loss/crossentropy": 1.790147304534912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905786320567131, + "step": 7010 + }, + { + "epoch": 0.14024, + "grad_norm": 2.609375, + "grad_norm_var": 0.07765884399414062, + "learning_rate": 0.0001, + "loss": 4.6364, + "loss/crossentropy": 2.008346378803253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23138166218996048, + "step": 7012 + }, + { + "epoch": 0.14028, + "grad_norm": 2.1875, + "grad_norm_var": 0.07974014282226563, + "learning_rate": 0.0001, + "loss": 4.2304, + "loss/crossentropy": 1.9694496393203735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21347863972187042, + "step": 7014 + }, + { + "epoch": 0.14032, + "grad_norm": 2.203125, + "grad_norm_var": 0.07948989868164062, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.0907286405563354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349857330322266, + "step": 7016 + }, + { + "epoch": 0.14036, + "grad_norm": 2.40625, + "grad_norm_var": 0.07850316365559896, + "learning_rate": 0.0001, + "loss": 4.6454, + "loss/crossentropy": 2.161414623260498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23841089010238647, + "step": 7018 + }, + { + "epoch": 0.1404, + "grad_norm": 2.25, + "grad_norm_var": 0.07315266927083333, + "learning_rate": 0.0001, + "loss": 4.4868, + "loss/crossentropy": 2.1402887105941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25493840873241425, + "step": 7020 + }, + { + "epoch": 0.14044, + "grad_norm": 2.234375, + "grad_norm_var": 0.06809666951497396, + "learning_rate": 0.0001, + "loss": 4.1248, + "loss/crossentropy": 2.0703811049461365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24300381541252136, + "step": 7022 + }, + { + "epoch": 0.14048, + "grad_norm": 2.53125, + "grad_norm_var": 0.0716875712076823, + "learning_rate": 0.0001, + "loss": 4.6165, + "loss/crossentropy": 2.152569532394409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23347805440425873, + "step": 7024 + }, + { + "epoch": 0.14052, + "grad_norm": 2.171875, + "grad_norm_var": 0.03453369140625, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 2.0831095576286316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198955938220024, + "step": 7026 + }, + { + "epoch": 0.14056, + "grad_norm": 2.234375, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4929, + "loss/crossentropy": 2.1631508469581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23971489816904068, + "step": 7028 + }, + { + "epoch": 0.1406, + "grad_norm": 2.421875, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 4.4687, + "loss/crossentropy": 2.1683043241500854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2628757208585739, + "step": 7030 + }, + { + "epoch": 0.14064, + "grad_norm": 2.1875, + "grad_norm_var": 0.07377827962239583, + "learning_rate": 0.0001, + "loss": 4.522, + "loss/crossentropy": 2.021001398563385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22922180593013763, + "step": 7032 + }, + { + "epoch": 0.14068, + "grad_norm": 2.265625, + "grad_norm_var": 0.0725494384765625, + "learning_rate": 0.0001, + "loss": 4.7729, + "loss/crossentropy": 2.267430543899536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27525072544813156, + "step": 7034 + }, + { + "epoch": 0.14072, + "grad_norm": 2.0625, + "grad_norm_var": 0.0788726806640625, + "learning_rate": 0.0001, + "loss": 4.3118, + "loss/crossentropy": 2.066729426383972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959475427865982, + "step": 7036 + }, + { + "epoch": 0.14076, + "grad_norm": 2.0625, + "grad_norm_var": 0.08162333170572916, + "learning_rate": 0.0001, + "loss": 4.3203, + "loss/crossentropy": 1.7972697019577026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773608028888702, + "step": 7038 + }, + { + "epoch": 0.1408, + "grad_norm": 2.171875, + "grad_norm_var": 0.08068745930989583, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.751904845237732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19929176568984985, + "step": 7040 + }, + { + "epoch": 0.14084, + "grad_norm": 2.171875, + "grad_norm_var": 0.07649332682291667, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 1.8432873487472534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235650032758713, + "step": 7042 + }, + { + "epoch": 0.14088, + "grad_norm": 2.15625, + "grad_norm_var": 0.07618815104166667, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 1.9589285850524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21506928652524948, + "step": 7044 + }, + { + "epoch": 0.14092, + "grad_norm": 2.046875, + "grad_norm_var": 0.0788726806640625, + "learning_rate": 0.0001, + "loss": 4.229, + "loss/crossentropy": 2.3658028841018677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23405643552541733, + "step": 7046 + }, + { + "epoch": 0.14096, + "grad_norm": 2.140625, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.3922, + "loss/crossentropy": 2.088135540485382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22823208570480347, + "step": 7048 + }, + { + "epoch": 0.141, + "grad_norm": 2.09375, + "grad_norm_var": 0.006810506184895833, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.9309821724891663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409911662340164, + "step": 7050 + }, + { + "epoch": 0.14104, + "grad_norm": 2.1875, + "grad_norm_var": 0.006636555989583333, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 2.5411492586135864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25017087161540985, + "step": 7052 + }, + { + "epoch": 0.14108, + "grad_norm": 2.03125, + "grad_norm_var": 0.007352701822916667, + "learning_rate": 0.0001, + "loss": 4.4998, + "loss/crossentropy": 2.3210322856903076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967693746089935, + "step": 7054 + }, + { + "epoch": 0.14112, + "grad_norm": 2.046875, + "grad_norm_var": 0.009370930989583333, + "learning_rate": 0.0001, + "loss": 4.4607, + "loss/crossentropy": 2.054674744606018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23270255327224731, + "step": 7056 + }, + { + "epoch": 0.14116, + "grad_norm": 2.125, + "grad_norm_var": 0.008968098958333334, + "learning_rate": 0.0001, + "loss": 4.5423, + "loss/crossentropy": 2.545789122581482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25560182332992554, + "step": 7058 + }, + { + "epoch": 0.1412, + "grad_norm": 2.171875, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.2682, + "loss/crossentropy": 2.262348175048828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24763159453868866, + "step": 7060 + }, + { + "epoch": 0.14124, + "grad_norm": 2.1875, + "grad_norm_var": 0.008447265625, + "learning_rate": 0.0001, + "loss": 4.618, + "loss/crossentropy": 2.1045475602149963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2472379505634308, + "step": 7062 + }, + { + "epoch": 0.14128, + "grad_norm": 2.046875, + "grad_norm_var": 0.0093170166015625, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 1.5632115006446838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19010942429304123, + "step": 7064 + }, + { + "epoch": 0.14132, + "grad_norm": 2.171875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.2638, + "loss/crossentropy": 2.0847875475883484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21371345967054367, + "step": 7066 + }, + { + "epoch": 0.14136, + "grad_norm": 2.25, + "grad_norm_var": 0.018016560872395834, + "learning_rate": 0.0001, + "loss": 4.5171, + "loss/crossentropy": 2.2243804931640625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23065787553787231, + "step": 7068 + }, + { + "epoch": 0.1414, + "grad_norm": 2.171875, + "grad_norm_var": 0.016927083333333332, + "learning_rate": 0.0001, + "loss": 4.2812, + "loss/crossentropy": 1.9477753639221191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616356819868088, + "step": 7070 + }, + { + "epoch": 0.14144, + "grad_norm": 2.5625, + "grad_norm_var": 0.024169921875, + "learning_rate": 0.0001, + "loss": 4.7005, + "loss/crossentropy": 2.2598072290420532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25795431435108185, + "step": 7072 + }, + { + "epoch": 0.14148, + "grad_norm": 2.171875, + "grad_norm_var": 0.022652180989583333, + "learning_rate": 0.0001, + "loss": 4.4345, + "loss/crossentropy": 1.8817242980003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21974974125623703, + "step": 7074 + }, + { + "epoch": 0.14152, + "grad_norm": 2.0, + "grad_norm_var": 0.0256744384765625, + "learning_rate": 0.0001, + "loss": 4.5688, + "loss/crossentropy": 2.5275847911834717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24608048796653748, + "step": 7076 + }, + { + "epoch": 0.14156, + "grad_norm": 2.125, + "grad_norm_var": 0.0265045166015625, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.400865852832794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22960034757852554, + "step": 7078 + }, + { + "epoch": 0.1416, + "grad_norm": 2.140625, + "grad_norm_var": 0.0237457275390625, + "learning_rate": 0.0001, + "loss": 4.4005, + "loss/crossentropy": 1.908901333808899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271078824996948, + "step": 7080 + }, + { + "epoch": 0.14164, + "grad_norm": 2.171875, + "grad_norm_var": 0.02213134765625, + "learning_rate": 0.0001, + "loss": 4.1384, + "loss/crossentropy": 2.3330780267715454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23944097012281418, + "step": 7082 + }, + { + "epoch": 0.14168, + "grad_norm": 2.140625, + "grad_norm_var": 0.018941243489583332, + "learning_rate": 0.0001, + "loss": 4.3516, + "loss/crossentropy": 2.332213521003723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23576530069112778, + "step": 7084 + }, + { + "epoch": 0.14172, + "grad_norm": 2.125, + "grad_norm_var": 0.022435506184895832, + "learning_rate": 0.0001, + "loss": 4.4654, + "loss/crossentropy": 2.2269067764282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25341375917196274, + "step": 7086 + }, + { + "epoch": 0.14176, + "grad_norm": 2.203125, + "grad_norm_var": 0.01470947265625, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 2.461983561515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24491792172193527, + "step": 7088 + }, + { + "epoch": 0.1418, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.2348, + "loss/crossentropy": 2.428719997406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2394469603896141, + "step": 7090 + }, + { + "epoch": 0.14184, + "grad_norm": 2.109375, + "grad_norm_var": 0.011433664957682292, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 2.32351291179657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23441501706838608, + "step": 7092 + }, + { + "epoch": 0.14188, + "grad_norm": 2.171875, + "grad_norm_var": 0.011482493082682291, + "learning_rate": 0.0001, + "loss": 4.4155, + "loss/crossentropy": 2.1165764331817627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350979596376419, + "step": 7094 + }, + { + "epoch": 0.14192, + "grad_norm": 2.3125, + "grad_norm_var": 0.014288075764973958, + "learning_rate": 0.0001, + "loss": 4.4952, + "loss/crossentropy": 2.150681734085083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2475891336798668, + "step": 7096 + }, + { + "epoch": 0.14196, + "grad_norm": 2.046875, + "grad_norm_var": 0.013079579671223958, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.038177013397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2293965071439743, + "step": 7098 + }, + { + "epoch": 0.142, + "grad_norm": 2.0625, + "grad_norm_var": 0.014062245686848959, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 1.899521827697754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20856370776891708, + "step": 7100 + }, + { + "epoch": 0.14204, + "grad_norm": 2.296875, + "grad_norm_var": 0.010593414306640625, + "learning_rate": 0.0001, + "loss": 4.3449, + "loss/crossentropy": 1.9807924032211304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22756796330213547, + "step": 7102 + }, + { + "epoch": 0.14208, + "grad_norm": 2.0625, + "grad_norm_var": 0.009421539306640626, + "learning_rate": 0.0001, + "loss": 4.2893, + "loss/crossentropy": 2.158667206764221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22594892233610153, + "step": 7104 + }, + { + "epoch": 0.14212, + "grad_norm": 2.15625, + "grad_norm_var": 0.007811482747395833, + "learning_rate": 0.0001, + "loss": 4.3751, + "loss/crossentropy": 2.3133270144462585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22925584018230438, + "step": 7106 + }, + { + "epoch": 0.14216, + "grad_norm": 2.015625, + "grad_norm_var": 0.010107421875, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.9796301126480103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21650104224681854, + "step": 7108 + }, + { + "epoch": 0.1422, + "grad_norm": 2.09375, + "grad_norm_var": 0.010054524739583333, + "learning_rate": 0.0001, + "loss": 4.2778, + "loss/crossentropy": 2.092659056186676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407263308763504, + "step": 7110 + }, + { + "epoch": 0.14224, + "grad_norm": 2.125, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.6078, + "loss/crossentropy": 1.970819890499115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262839823961258, + "step": 7112 + }, + { + "epoch": 0.14228, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011923980712890626, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 1.5877107381820679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182430237531662, + "step": 7114 + }, + { + "epoch": 0.14232, + "grad_norm": 2.21875, + "grad_norm_var": 0.020336659749348958, + "learning_rate": 0.0001, + "loss": 4.6997, + "loss/crossentropy": 2.3208755254745483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25018931180238724, + "step": 7116 + }, + { + "epoch": 0.14236, + "grad_norm": 2.203125, + "grad_norm_var": 0.01904271443684896, + "learning_rate": 0.0001, + "loss": 4.4483, + "loss/crossentropy": 2.1348973512649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23516173660755157, + "step": 7118 + }, + { + "epoch": 0.1424, + "grad_norm": 2.46875, + "grad_norm_var": 0.02533543904622396, + "learning_rate": 0.0001, + "loss": 4.3443, + "loss/crossentropy": 2.1442995071411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23094037175178528, + "step": 7120 + }, + { + "epoch": 0.14244, + "grad_norm": 2.1875, + "grad_norm_var": 0.025608062744140625, + "learning_rate": 0.0001, + "loss": 4.3297, + "loss/crossentropy": 2.24001145362854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479296177625656, + "step": 7122 + }, + { + "epoch": 0.14248, + "grad_norm": 2.734375, + "grad_norm_var": 0.038917795817057295, + "learning_rate": 0.0001, + "loss": 4.4122, + "loss/crossentropy": 1.8284733891487122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23670874536037445, + "step": 7124 + }, + { + "epoch": 0.14252, + "grad_norm": 2.375, + "grad_norm_var": 0.04146499633789062, + "learning_rate": 0.0001, + "loss": 4.6223, + "loss/crossentropy": 2.1003533601760864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24235840141773224, + "step": 7126 + }, + { + "epoch": 0.14256, + "grad_norm": 2.078125, + "grad_norm_var": 0.04201024373372396, + "learning_rate": 0.0001, + "loss": 4.2591, + "loss/crossentropy": 2.3661316633224487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.258208692073822, + "step": 7128 + }, + { + "epoch": 0.1426, + "grad_norm": 2.109375, + "grad_norm_var": 0.030989583333333334, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 2.2374593019485474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2543400973081589, + "step": 7130 + }, + { + "epoch": 0.14264, + "grad_norm": 2.15625, + "grad_norm_var": 0.02945556640625, + "learning_rate": 0.0001, + "loss": 4.5939, + "loss/crossentropy": 1.9141342639923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23257911205291748, + "step": 7132 + }, + { + "epoch": 0.14268, + "grad_norm": 2.4375, + "grad_norm_var": 0.031224568684895832, + "learning_rate": 0.0001, + "loss": 4.1893, + "loss/crossentropy": 1.992666780948639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298717424273491, + "step": 7134 + }, + { + "epoch": 0.14272, + "grad_norm": 2.03125, + "grad_norm_var": 0.033503214518229164, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 1.9794283509254456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112378552556038, + "step": 7136 + }, + { + "epoch": 0.14276, + "grad_norm": 2.5, + "grad_norm_var": 0.0382232666015625, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.1011139154434204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27278490364551544, + "step": 7138 + }, + { + "epoch": 0.1428, + "grad_norm": 2.234375, + "grad_norm_var": 0.02340087890625, + "learning_rate": 0.0001, + "loss": 4.6989, + "loss/crossentropy": 2.3489880561828613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24445360153913498, + "step": 7140 + }, + { + "epoch": 0.14284, + "grad_norm": 2.28125, + "grad_norm_var": 0.0173736572265625, + "learning_rate": 0.0001, + "loss": 4.3418, + "loss/crossentropy": 2.011172831058502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210151307284832, + "step": 7142 + }, + { + "epoch": 0.14288, + "grad_norm": 2.25, + "grad_norm_var": 0.017366536458333335, + "learning_rate": 0.0001, + "loss": 4.3488, + "loss/crossentropy": 1.963642418384552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22939135879278183, + "step": 7144 + }, + { + "epoch": 0.14292, + "grad_norm": 2.21875, + "grad_norm_var": 0.016194661458333332, + "learning_rate": 0.0001, + "loss": 4.5166, + "loss/crossentropy": 2.2739341259002686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23235367238521576, + "step": 7146 + }, + { + "epoch": 0.14296, + "grad_norm": 2.21875, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.5977, + "loss/crossentropy": 2.282576322555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24144183099269867, + "step": 7148 + }, + { + "epoch": 0.143, + "grad_norm": 2.078125, + "grad_norm_var": 0.02329279581705729, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 1.7847901582717896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19184929877519608, + "step": 7150 + }, + { + "epoch": 0.14304, + "grad_norm": 2.15625, + "grad_norm_var": 0.021740468343098958, + "learning_rate": 0.0001, + "loss": 4.3379, + "loss/crossentropy": 2.165170907974243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223430335521698, + "step": 7152 + }, + { + "epoch": 0.14308, + "grad_norm": 2.15625, + "grad_norm_var": 0.018873850504557293, + "learning_rate": 0.0001, + "loss": 4.3407, + "loss/crossentropy": 2.0395787954330444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662698686122894, + "step": 7154 + }, + { + "epoch": 0.14312, + "grad_norm": 2.125, + "grad_norm_var": 0.01907323201497396, + "learning_rate": 0.0001, + "loss": 4.4936, + "loss/crossentropy": 2.014316141605377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23184175789356232, + "step": 7156 + }, + { + "epoch": 0.14316, + "grad_norm": 2.296875, + "grad_norm_var": 0.019419097900390626, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 2.2581117153167725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24055174738168716, + "step": 7158 + }, + { + "epoch": 0.1432, + "grad_norm": 2.28125, + "grad_norm_var": 0.021022288004557292, + "learning_rate": 0.0001, + "loss": 4.3853, + "loss/crossentropy": 2.0905630588531494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2719188630580902, + "step": 7160 + }, + { + "epoch": 0.14324, + "grad_norm": 2.25, + "grad_norm_var": 0.025233713785807292, + "learning_rate": 0.0001, + "loss": 4.6485, + "loss/crossentropy": 2.414529800415039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24188701063394547, + "step": 7162 + }, + { + "epoch": 0.14328, + "grad_norm": 2.328125, + "grad_norm_var": 0.02240778605143229, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.028432607650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300800457596779, + "step": 7164 + }, + { + "epoch": 0.14332, + "grad_norm": 2.515625, + "grad_norm_var": 0.03299153645833333, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.096144199371338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23311930894851685, + "step": 7166 + }, + { + "epoch": 0.14336, + "grad_norm": 2.25, + "grad_norm_var": 0.029622395833333332, + "learning_rate": 0.0001, + "loss": 4.4375, + "loss/crossentropy": 2.259281277656555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25558799505233765, + "step": 7168 + }, + { + "epoch": 0.1434, + "grad_norm": 2.25, + "grad_norm_var": 0.0219390869140625, + "learning_rate": 0.0001, + "loss": 4.4364, + "loss/crossentropy": 2.0766254663467407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22725434601306915, + "step": 7170 + }, + { + "epoch": 0.14344, + "grad_norm": 2.15625, + "grad_norm_var": 0.024312337239583332, + "learning_rate": 0.0001, + "loss": 4.4695, + "loss/crossentropy": 2.26702618598938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22584324330091476, + "step": 7172 + }, + { + "epoch": 0.14348, + "grad_norm": 2.203125, + "grad_norm_var": 0.027534993489583333, + "learning_rate": 0.0001, + "loss": 4.3417, + "loss/crossentropy": 2.1933096647262573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23658160120248795, + "step": 7174 + }, + { + "epoch": 0.14352, + "grad_norm": 2.1875, + "grad_norm_var": 0.027372233072916665, + "learning_rate": 0.0001, + "loss": 4.351, + "loss/crossentropy": 2.2003660202026367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22349942475557327, + "step": 7176 + }, + { + "epoch": 0.14356, + "grad_norm": 2.28125, + "grad_norm_var": 0.025145467122395834, + "learning_rate": 0.0001, + "loss": 4.684, + "loss/crossentropy": 2.4630067348480225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253003865480423, + "step": 7178 + }, + { + "epoch": 0.1436, + "grad_norm": 2.109375, + "grad_norm_var": 0.027082316080729165, + "learning_rate": 0.0001, + "loss": 4.6875, + "loss/crossentropy": 2.264480948448181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27604997158050537, + "step": 7180 + }, + { + "epoch": 0.14364, + "grad_norm": 2.203125, + "grad_norm_var": 0.00712890625, + "learning_rate": 0.0001, + "loss": 4.3064, + "loss/crossentropy": 2.1641955375671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22587595880031586, + "step": 7182 + }, + { + "epoch": 0.14368, + "grad_norm": 2.296875, + "grad_norm_var": 0.00943603515625, + "learning_rate": 0.0001, + "loss": 4.6137, + "loss/crossentropy": 2.1432350873947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24324018508195877, + "step": 7184 + }, + { + "epoch": 0.14372, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014611562093098959, + "learning_rate": 0.0001, + "loss": 4.3394, + "loss/crossentropy": 1.7448238134384155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751260682940483, + "step": 7186 + }, + { + "epoch": 0.14376, + "grad_norm": 2.1875, + "grad_norm_var": 0.015295155843098958, + "learning_rate": 0.0001, + "loss": 4.3602, + "loss/crossentropy": 2.3202184438705444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314094603061676, + "step": 7188 + }, + { + "epoch": 0.1438, + "grad_norm": 1.984375, + "grad_norm_var": 0.019681549072265624, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 1.970094919204712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24998464435338974, + "step": 7190 + }, + { + "epoch": 0.14384, + "grad_norm": 2.125, + "grad_norm_var": 0.02075780232747396, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 2.1331114768981934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008385732769966, + "step": 7192 + }, + { + "epoch": 0.14388, + "grad_norm": 2.03125, + "grad_norm_var": 0.021345774332682293, + "learning_rate": 0.0001, + "loss": 4.3976, + "loss/crossentropy": 2.1659106016159058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314336150884628, + "step": 7194 + }, + { + "epoch": 0.14392, + "grad_norm": 2.1875, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.6324, + "loss/crossentropy": 2.3382883071899414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26393643021583557, + "step": 7196 + }, + { + "epoch": 0.14396, + "grad_norm": 2.34375, + "grad_norm_var": 0.020499420166015626, + "learning_rate": 0.0001, + "loss": 4.7988, + "loss/crossentropy": 2.1325554847717285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457222416996956, + "step": 7198 + }, + { + "epoch": 0.144, + "grad_norm": 2.078125, + "grad_norm_var": 0.03144709269205729, + "learning_rate": 0.0001, + "loss": 4.3175, + "loss/crossentropy": 1.7927106022834778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284093916416168, + "step": 7200 + }, + { + "epoch": 0.14404, + "grad_norm": 2.296875, + "grad_norm_var": 0.03050715128580729, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 1.9799031615257263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475942015647888, + "step": 7202 + }, + { + "epoch": 0.14408, + "grad_norm": 2.234375, + "grad_norm_var": 0.02939020792643229, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.0590370893478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160736471414566, + "step": 7204 + }, + { + "epoch": 0.14412, + "grad_norm": 2.34375, + "grad_norm_var": 0.02800267537434896, + "learning_rate": 0.0001, + "loss": 4.4397, + "loss/crossentropy": 1.9866149425506592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23413674533367157, + "step": 7206 + }, + { + "epoch": 0.14416, + "grad_norm": 2.5625, + "grad_norm_var": 0.03551610310872396, + "learning_rate": 0.0001, + "loss": 4.526, + "loss/crossentropy": 2.1320748925209045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24469739198684692, + "step": 7208 + }, + { + "epoch": 0.1442, + "grad_norm": 2.375, + "grad_norm_var": 0.034708404541015626, + "learning_rate": 0.0001, + "loss": 4.5834, + "loss/crossentropy": 2.225857973098755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23838083446025848, + "step": 7210 + }, + { + "epoch": 0.14424, + "grad_norm": 2.078125, + "grad_norm_var": 0.03794733683268229, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 1.9118528962135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064301297068596, + "step": 7212 + }, + { + "epoch": 0.14428, + "grad_norm": 2.21875, + "grad_norm_var": 0.03806940714518229, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 1.8142234086990356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172461450099945, + "step": 7214 + }, + { + "epoch": 0.14432, + "grad_norm": 2.203125, + "grad_norm_var": 0.02540868123372396, + "learning_rate": 0.0001, + "loss": 4.4446, + "loss/crossentropy": 1.9308255910873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2236044555902481, + "step": 7216 + }, + { + "epoch": 0.14436, + "grad_norm": 2.3125, + "grad_norm_var": 0.022391764322916667, + "learning_rate": 0.0001, + "loss": 4.4597, + "loss/crossentropy": 1.9821211695671082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110422909259796, + "step": 7218 + }, + { + "epoch": 0.1444, + "grad_norm": 2.328125, + "grad_norm_var": 0.023176066080729165, + "learning_rate": 0.0001, + "loss": 4.6732, + "loss/crossentropy": 2.216045379638672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23433538526296616, + "step": 7220 + }, + { + "epoch": 0.14444, + "grad_norm": 2.5, + "grad_norm_var": 0.024442545572916665, + "learning_rate": 0.0001, + "loss": 4.6089, + "loss/crossentropy": 2.2303662300109863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2390453889966011, + "step": 7222 + }, + { + "epoch": 0.14448, + "grad_norm": 2.078125, + "grad_norm_var": 0.017020670572916667, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.2152082920074463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22196897864341736, + "step": 7224 + }, + { + "epoch": 0.14452, + "grad_norm": 2.09375, + "grad_norm_var": 0.014860026041666667, + "learning_rate": 0.0001, + "loss": 4.2843, + "loss/crossentropy": 2.134513795375824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964208781719208, + "step": 7226 + }, + { + "epoch": 0.14456, + "grad_norm": 2.0625, + "grad_norm_var": 0.015262858072916666, + "learning_rate": 0.0001, + "loss": 4.0289, + "loss/crossentropy": 1.6803861260414124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18217483162879944, + "step": 7228 + }, + { + "epoch": 0.1446, + "grad_norm": 2.078125, + "grad_norm_var": 0.0152252197265625, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 1.6597792506217957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20317095518112183, + "step": 7230 + }, + { + "epoch": 0.14464, + "grad_norm": 2.015625, + "grad_norm_var": 0.017704264322916666, + "learning_rate": 0.0001, + "loss": 4.2978, + "loss/crossentropy": 1.776586651802063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21835225820541382, + "step": 7232 + }, + { + "epoch": 0.14468, + "grad_norm": 2.03125, + "grad_norm_var": 0.017154947916666666, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 1.7347259521484375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964067220687866, + "step": 7234 + }, + { + "epoch": 0.14472, + "grad_norm": 2.046875, + "grad_norm_var": 0.015380859375, + "learning_rate": 0.0001, + "loss": 4.2069, + "loss/crossentropy": 1.79097181558609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776803582906723, + "step": 7236 + }, + { + "epoch": 0.14476, + "grad_norm": 1.984375, + "grad_norm_var": 0.0069244384765625, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.051329553127289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21669812500476837, + "step": 7238 + }, + { + "epoch": 0.1448, + "grad_norm": 2.15625, + "grad_norm_var": 0.0070220947265625, + "learning_rate": 0.0001, + "loss": 4.3206, + "loss/crossentropy": 1.965324580669403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2283879667520523, + "step": 7240 + }, + { + "epoch": 0.14484, + "grad_norm": 2.296875, + "grad_norm_var": 0.010692342122395834, + "learning_rate": 0.0001, + "loss": 4.5952, + "loss/crossentropy": 2.248784363269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24906039983034134, + "step": 7242 + }, + { + "epoch": 0.14488, + "grad_norm": 2.125, + "grad_norm_var": 0.010529581705729167, + "learning_rate": 0.0001, + "loss": 4.3321, + "loss/crossentropy": 1.9946890473365784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22583268582820892, + "step": 7244 + }, + { + "epoch": 0.14492, + "grad_norm": 2.0625, + "grad_norm_var": 0.010660807291666666, + "learning_rate": 0.0001, + "loss": 4.1436, + "loss/crossentropy": 2.2306413650512695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22068945318460464, + "step": 7246 + }, + { + "epoch": 0.14496, + "grad_norm": 2.1875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.110253095626831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22045104205608368, + "step": 7248 + }, + { + "epoch": 0.145, + "grad_norm": 2.21875, + "grad_norm_var": 0.008210245768229167, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.928157925605774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043258175253868, + "step": 7250 + }, + { + "epoch": 0.14504, + "grad_norm": 2.0, + "grad_norm_var": 0.009110514322916667, + "learning_rate": 0.0001, + "loss": 4.2458, + "loss/crossentropy": 2.2674691677093506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258862644433975, + "step": 7252 + }, + { + "epoch": 0.14508, + "grad_norm": 2.765625, + "grad_norm_var": 0.03052978515625, + "learning_rate": 0.0001, + "loss": 4.5107, + "loss/crossentropy": 2.2825024127960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.263367660343647, + "step": 7254 + }, + { + "epoch": 0.14512, + "grad_norm": 2.265625, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 4.2394, + "loss/crossentropy": 2.1546168327331543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23073332011699677, + "step": 7256 + }, + { + "epoch": 0.14516, + "grad_norm": 2.34375, + "grad_norm_var": 0.040816243489583334, + "learning_rate": 0.0001, + "loss": 4.5504, + "loss/crossentropy": 2.0490044951438904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161820113658905, + "step": 7258 + }, + { + "epoch": 0.1452, + "grad_norm": 2.234375, + "grad_norm_var": 0.04038798014322917, + "learning_rate": 0.0001, + "loss": 4.6468, + "loss/crossentropy": 2.115446150302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2537624090909958, + "step": 7260 + }, + { + "epoch": 0.14524, + "grad_norm": 2.203125, + "grad_norm_var": 0.22388407389322917, + "learning_rate": 0.0001, + "loss": 4.1457, + "loss/crossentropy": 2.0302165746688843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21026766300201416, + "step": 7262 + }, + { + "epoch": 0.14528, + "grad_norm": 2.171875, + "grad_norm_var": 0.2228179931640625, + "learning_rate": 0.0001, + "loss": 4.4077, + "loss/crossentropy": 2.102527379989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354147508740425, + "step": 7264 + }, + { + "epoch": 0.14532, + "grad_norm": 2.171875, + "grad_norm_var": 0.21614176432291668, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 2.0095282793045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22679834067821503, + "step": 7266 + }, + { + "epoch": 0.14536, + "grad_norm": 2.265625, + "grad_norm_var": 0.20706278483072918, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 1.8988603353500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238050609827042, + "step": 7268 + }, + { + "epoch": 0.1454, + "grad_norm": 2.203125, + "grad_norm_var": 0.19975484212239583, + "learning_rate": 0.0001, + "loss": 4.6614, + "loss/crossentropy": 2.186660885810852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24453039467334747, + "step": 7270 + }, + { + "epoch": 0.14544, + "grad_norm": 2.0625, + "grad_norm_var": 0.20244852701822916, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 1.8927155137062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147793024778366, + "step": 7272 + }, + { + "epoch": 0.14548, + "grad_norm": 2.046875, + "grad_norm_var": 0.20608317057291667, + "learning_rate": 0.0001, + "loss": 4.1375, + "loss/crossentropy": 1.8969642519950867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21910040825605392, + "step": 7274 + }, + { + "epoch": 0.14552, + "grad_norm": 2.21875, + "grad_norm_var": 0.20545247395833333, + "learning_rate": 0.0001, + "loss": 4.3325, + "loss/crossentropy": 2.090053617954254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576116025447845, + "step": 7276 + }, + { + "epoch": 0.14556, + "grad_norm": 2.171875, + "grad_norm_var": 0.010480753580729167, + "learning_rate": 0.0001, + "loss": 4.7263, + "loss/crossentropy": 2.1606650352478027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22937766462564468, + "step": 7278 + }, + { + "epoch": 0.1456, + "grad_norm": 2.015625, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.0924, + "loss/crossentropy": 1.9946333765983582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20145532488822937, + "step": 7280 + }, + { + "epoch": 0.14564, + "grad_norm": 2.296875, + "grad_norm_var": 0.013834635416666666, + "learning_rate": 0.0001, + "loss": 4.6519, + "loss/crossentropy": 2.0958545207977295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22669509798288345, + "step": 7282 + }, + { + "epoch": 0.14568, + "grad_norm": 2.328125, + "grad_norm_var": 0.016109212239583334, + "learning_rate": 0.0001, + "loss": 4.4224, + "loss/crossentropy": 2.0515894889831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2520884945988655, + "step": 7284 + }, + { + "epoch": 0.14572, + "grad_norm": 2.171875, + "grad_norm_var": 0.016063435872395834, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 2.034587264060974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523287147283554, + "step": 7286 + }, + { + "epoch": 0.14576, + "grad_norm": 2.140625, + "grad_norm_var": 0.016080729166666665, + "learning_rate": 0.0001, + "loss": 3.8649, + "loss/crossentropy": 1.6578314900398254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603776931762695, + "step": 7288 + }, + { + "epoch": 0.1458, + "grad_norm": 2.109375, + "grad_norm_var": 0.014167277018229167, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.0019100308418274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22880420833826065, + "step": 7290 + }, + { + "epoch": 0.14584, + "grad_norm": 2.046875, + "grad_norm_var": 0.018723297119140624, + "learning_rate": 0.0001, + "loss": 3.981, + "loss/crossentropy": 2.068517565727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675085693597794, + "step": 7292 + }, + { + "epoch": 0.14588, + "grad_norm": 2.203125, + "grad_norm_var": 0.013734690348307292, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 2.000797212123871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20464950054883957, + "step": 7294 + }, + { + "epoch": 0.14592, + "grad_norm": 2.125, + "grad_norm_var": 0.013734690348307292, + "learning_rate": 0.0001, + "loss": 4.4132, + "loss/crossentropy": 2.1914668679237366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406519278883934, + "step": 7296 + }, + { + "epoch": 0.14596, + "grad_norm": 2.390625, + "grad_norm_var": 0.01685358683268229, + "learning_rate": 0.0001, + "loss": 4.4737, + "loss/crossentropy": 2.123211979866028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250032439827919, + "step": 7298 + }, + { + "epoch": 0.146, + "grad_norm": 2.625, + "grad_norm_var": 0.028507232666015625, + "learning_rate": 0.0001, + "loss": 4.718, + "loss/crossentropy": 2.0686148405075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22581970691680908, + "step": 7300 + }, + { + "epoch": 0.14604, + "grad_norm": 2.25, + "grad_norm_var": 0.029288482666015626, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.141907751560211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503636240959167, + "step": 7302 + }, + { + "epoch": 0.14608, + "grad_norm": 2.09375, + "grad_norm_var": 0.028436024983723957, + "learning_rate": 0.0001, + "loss": 4.1493, + "loss/crossentropy": 1.6741206645965576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20490698516368866, + "step": 7304 + }, + { + "epoch": 0.14612, + "grad_norm": 2.09375, + "grad_norm_var": 0.027854156494140626, + "learning_rate": 0.0001, + "loss": 4.1259, + "loss/crossentropy": 1.8094561696052551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21328043192625046, + "step": 7306 + }, + { + "epoch": 0.14616, + "grad_norm": 2.109375, + "grad_norm_var": 0.020970662434895832, + "learning_rate": 0.0001, + "loss": 4.3715, + "loss/crossentropy": 2.204083800315857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24204879999160767, + "step": 7308 + }, + { + "epoch": 0.1462, + "grad_norm": 2.234375, + "grad_norm_var": 0.021126302083333333, + "learning_rate": 0.0001, + "loss": 4.2614, + "loss/crossentropy": 2.0166819095611572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219880372285843, + "step": 7310 + }, + { + "epoch": 0.14624, + "grad_norm": 2.25, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 4.3768, + "loss/crossentropy": 2.4667757749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2880199924111366, + "step": 7312 + }, + { + "epoch": 0.14628, + "grad_norm": 2.1875, + "grad_norm_var": 0.019287109375, + "learning_rate": 0.0001, + "loss": 4.2005, + "loss/crossentropy": 2.1497310400009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792854368686676, + "step": 7314 + }, + { + "epoch": 0.14632, + "grad_norm": 2.25, + "grad_norm_var": 0.0062896728515625, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.179584264755249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23200812935829163, + "step": 7316 + }, + { + "epoch": 0.14636, + "grad_norm": 6.0625, + "grad_norm_var": 0.9582590738932292, + "learning_rate": 0.0001, + "loss": 4.1545, + "loss/crossentropy": 1.4067250490188599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204126738011837, + "step": 7318 + }, + { + "epoch": 0.1464, + "grad_norm": 2.390625, + "grad_norm_var": 0.9473592122395833, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.9879435896873474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.247541606426239, + "step": 7320 + }, + { + "epoch": 0.14644, + "grad_norm": 2.28125, + "grad_norm_var": 0.9377919514973958, + "learning_rate": 0.0001, + "loss": 4.1795, + "loss/crossentropy": 1.62649005651474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054424211382866, + "step": 7322 + }, + { + "epoch": 0.14648, + "grad_norm": 2.15625, + "grad_norm_var": 0.9344228108723959, + "learning_rate": 0.0001, + "loss": 4.1479, + "loss/crossentropy": 1.9012435674667358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240126132965088, + "step": 7324 + }, + { + "epoch": 0.14652, + "grad_norm": 2.03125, + "grad_norm_var": 0.948193359375, + "learning_rate": 0.0001, + "loss": 3.6841, + "loss/crossentropy": 1.8239200115203857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19691675901412964, + "step": 7326 + }, + { + "epoch": 0.14656, + "grad_norm": 2.171875, + "grad_norm_var": 0.94068603515625, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 1.9689037799835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301231533288956, + "step": 7328 + }, + { + "epoch": 0.1466, + "grad_norm": 2.09375, + "grad_norm_var": 0.9363433837890625, + "learning_rate": 0.0001, + "loss": 4.3733, + "loss/crossentropy": 2.1064560413360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23990632593631744, + "step": 7330 + }, + { + "epoch": 0.14664, + "grad_norm": 2.109375, + "grad_norm_var": 0.9397125244140625, + "learning_rate": 0.0001, + "loss": 4.2765, + "loss/crossentropy": 2.0871987342834473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23108436167240143, + "step": 7332 + }, + { + "epoch": 0.14668, + "grad_norm": 2.265625, + "grad_norm_var": 0.0209869384765625, + "learning_rate": 0.0001, + "loss": 4.4517, + "loss/crossentropy": 2.3061007857322693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24406791478395462, + "step": 7334 + }, + { + "epoch": 0.14672, + "grad_norm": 2.296875, + "grad_norm_var": 0.016943359375, + "learning_rate": 0.0001, + "loss": 4.4278, + "loss/crossentropy": 1.9471244812011719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20859277993440628, + "step": 7336 + }, + { + "epoch": 0.14676, + "grad_norm": 2.28125, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 2.0628740191459656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25501881539821625, + "step": 7338 + }, + { + "epoch": 0.1468, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014481353759765624, + "learning_rate": 0.0001, + "loss": 4.4329, + "loss/crossentropy": 1.8065250515937805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20222238451242447, + "step": 7340 + }, + { + "epoch": 0.14684, + "grad_norm": 2.109375, + "grad_norm_var": 0.012910715738932292, + "learning_rate": 0.0001, + "loss": 4.5431, + "loss/crossentropy": 2.134244918823242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22319403290748596, + "step": 7342 + }, + { + "epoch": 0.14688, + "grad_norm": 2.109375, + "grad_norm_var": 0.012359364827473959, + "learning_rate": 0.0001, + "loss": 4.3098, + "loss/crossentropy": 2.5807924270629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26865454018116, + "step": 7344 + }, + { + "epoch": 0.14692, + "grad_norm": 2.203125, + "grad_norm_var": 0.015421295166015625, + "learning_rate": 0.0001, + "loss": 4.684, + "loss/crossentropy": 2.4128278493881226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25771988928318024, + "step": 7346 + }, + { + "epoch": 0.14696, + "grad_norm": 2.046875, + "grad_norm_var": 0.017561594645182293, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 1.7323983907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146565169095993, + "step": 7348 + }, + { + "epoch": 0.147, + "grad_norm": 2.125, + "grad_norm_var": 0.015553538004557292, + "learning_rate": 0.0001, + "loss": 4.5511, + "loss/crossentropy": 2.036192536354065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692586690187454, + "step": 7350 + }, + { + "epoch": 0.14704, + "grad_norm": 2.09375, + "grad_norm_var": 0.012320709228515626, + "learning_rate": 0.0001, + "loss": 4.4173, + "loss/crossentropy": 1.9586528539657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543999761343002, + "step": 7352 + }, + { + "epoch": 0.14708, + "grad_norm": 2.234375, + "grad_norm_var": 0.011805979410807292, + "learning_rate": 0.0001, + "loss": 4.4506, + "loss/crossentropy": 2.3444113731384277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24080512672662735, + "step": 7354 + }, + { + "epoch": 0.14712, + "grad_norm": 2.15625, + "grad_norm_var": 0.0097076416015625, + "learning_rate": 0.0001, + "loss": 4.4865, + "loss/crossentropy": 2.3060439825057983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24567800760269165, + "step": 7356 + }, + { + "epoch": 0.14716, + "grad_norm": 2.359375, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 4.5975, + "loss/crossentropy": 2.2267106771469116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345375493168831, + "step": 7358 + }, + { + "epoch": 0.1472, + "grad_norm": 2.359375, + "grad_norm_var": 0.014481608072916667, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.2470518350601196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2417411357164383, + "step": 7360 + }, + { + "epoch": 0.14724, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012953440348307291, + "learning_rate": 0.0001, + "loss": 4.3299, + "loss/crossentropy": 2.200543165206909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24434109032154083, + "step": 7362 + }, + { + "epoch": 0.14728, + "grad_norm": 2.203125, + "grad_norm_var": 0.010227203369140625, + "learning_rate": 0.0001, + "loss": 4.3282, + "loss/crossentropy": 1.995844304561615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591491788625717, + "step": 7364 + }, + { + "epoch": 0.14732, + "grad_norm": 2.046875, + "grad_norm_var": 0.011580149332682291, + "learning_rate": 0.0001, + "loss": 4.3354, + "loss/crossentropy": 1.9180658459663391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22776676714420319, + "step": 7366 + }, + { + "epoch": 0.14736, + "grad_norm": 2.078125, + "grad_norm_var": 0.011840565999348959, + "learning_rate": 0.0001, + "loss": 4.3662, + "loss/crossentropy": 2.473931312561035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26284685730934143, + "step": 7368 + }, + { + "epoch": 0.1474, + "grad_norm": 2.109375, + "grad_norm_var": 0.011744944254557292, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.0392738580703735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22323766350746155, + "step": 7370 + }, + { + "epoch": 0.14744, + "grad_norm": 2.203125, + "grad_norm_var": 0.012094879150390625, + "learning_rate": 0.0001, + "loss": 4.3032, + "loss/crossentropy": 1.9847410917282104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324485331773758, + "step": 7372 + }, + { + "epoch": 0.14748, + "grad_norm": 2.171875, + "grad_norm_var": 0.008459218343098958, + "learning_rate": 0.0001, + "loss": 4.2794, + "loss/crossentropy": 1.930326521396637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24155349284410477, + "step": 7374 + }, + { + "epoch": 0.14752, + "grad_norm": 2.09375, + "grad_norm_var": 0.0061724344889322914, + "learning_rate": 0.0001, + "loss": 4.0648, + "loss/crossentropy": 1.825449824333191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087552770972252, + "step": 7376 + }, + { + "epoch": 0.14756, + "grad_norm": 2.15625, + "grad_norm_var": 0.004325358072916666, + "learning_rate": 0.0001, + "loss": 4.2405, + "loss/crossentropy": 2.156645655632019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519562169909477, + "step": 7378 + }, + { + "epoch": 0.1476, + "grad_norm": 2.40625, + "grad_norm_var": 0.008333333333333333, + "learning_rate": 0.0001, + "loss": 4.5316, + "loss/crossentropy": 2.255813479423523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23737946152687073, + "step": 7380 + }, + { + "epoch": 0.14764, + "grad_norm": 2.109375, + "grad_norm_var": 0.00797119140625, + "learning_rate": 0.0001, + "loss": 4.5795, + "loss/crossentropy": 2.47933566570282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23890959471464157, + "step": 7382 + }, + { + "epoch": 0.14768, + "grad_norm": 2.09375, + "grad_norm_var": 0.007710774739583333, + "learning_rate": 0.0001, + "loss": 4.5328, + "loss/crossentropy": 2.139566659927368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22334300726652145, + "step": 7384 + }, + { + "epoch": 0.14772, + "grad_norm": 2.125, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 1.6182149052619934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19345169514417648, + "step": 7386 + }, + { + "epoch": 0.14776, + "grad_norm": 2.40625, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.4571, + "loss/crossentropy": 1.9984004497528076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24823245406150818, + "step": 7388 + }, + { + "epoch": 0.1478, + "grad_norm": 2.171875, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 4.3218, + "loss/crossentropy": 2.5892586708068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2489030361175537, + "step": 7390 + }, + { + "epoch": 0.14784, + "grad_norm": 2.140625, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.6077, + "loss/crossentropy": 2.3723479509353638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23511765897274017, + "step": 7392 + }, + { + "epoch": 0.14788, + "grad_norm": 2.484375, + "grad_norm_var": 0.0145660400390625, + "learning_rate": 0.0001, + "loss": 4.3902, + "loss/crossentropy": 2.001940071582794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256013005971909, + "step": 7394 + }, + { + "epoch": 0.14792, + "grad_norm": 2.0625, + "grad_norm_var": 0.013850911458333334, + "learning_rate": 0.0001, + "loss": 4.4422, + "loss/crossentropy": 2.2255555391311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23905682563781738, + "step": 7396 + }, + { + "epoch": 0.14796, + "grad_norm": 2.28125, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 4.6217, + "loss/crossentropy": 2.5670583248138428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25223178416490555, + "step": 7398 + }, + { + "epoch": 0.148, + "grad_norm": 2.203125, + "grad_norm_var": 2.3002278645833334, + "learning_rate": 0.0001, + "loss": 4.6365, + "loss/crossentropy": 2.16109037399292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2601509317755699, + "step": 7400 + }, + { + "epoch": 0.14804, + "grad_norm": 2.09375, + "grad_norm_var": 2.3012847900390625, + "learning_rate": 0.0001, + "loss": 4.4169, + "loss/crossentropy": 2.1484315395355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474219858646393, + "step": 7402 + }, + { + "epoch": 0.14808, + "grad_norm": 4.3125, + "grad_norm_var": 2.48385009765625, + "learning_rate": 0.0001, + "loss": 4.7452, + "loss/crossentropy": 2.183099091053009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25882233679294586, + "step": 7404 + }, + { + "epoch": 0.14812, + "grad_norm": 2.0625, + "grad_norm_var": 2.491097005208333, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 1.795831561088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20870305597782135, + "step": 7406 + }, + { + "epoch": 0.14816, + "grad_norm": 2.09375, + "grad_norm_var": 2.5058553059895834, + "learning_rate": 0.0001, + "loss": 4.3433, + "loss/crossentropy": 2.202280640602112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150556966662407, + "step": 7408 + }, + { + "epoch": 0.1482, + "grad_norm": 2.03125, + "grad_norm_var": 2.5274251302083335, + "learning_rate": 0.0001, + "loss": 4.3063, + "loss/crossentropy": 2.2616937160491943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24934212118387222, + "step": 7410 + }, + { + "epoch": 0.14824, + "grad_norm": 2.171875, + "grad_norm_var": 2.51627197265625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 1.909091055393219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23191271722316742, + "step": 7412 + }, + { + "epoch": 0.14828, + "grad_norm": 2.09375, + "grad_norm_var": 2.5269765218098956, + "learning_rate": 0.0001, + "loss": 4.4224, + "loss/crossentropy": 2.1383588314056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22893162816762924, + "step": 7414 + }, + { + "epoch": 0.14832, + "grad_norm": 2.234375, + "grad_norm_var": 0.2983062744140625, + "learning_rate": 0.0001, + "loss": 4.3664, + "loss/crossentropy": 2.0628907680511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23075110465288162, + "step": 7416 + }, + { + "epoch": 0.14836, + "grad_norm": 2.15625, + "grad_norm_var": 0.29704488118489586, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 1.8996745347976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171879768371582, + "step": 7418 + }, + { + "epoch": 0.1484, + "grad_norm": 2.046875, + "grad_norm_var": 0.0032867431640625, + "learning_rate": 0.0001, + "loss": 4.4241, + "loss/crossentropy": 2.2367645502090454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2563689202070236, + "step": 7420 + }, + { + "epoch": 0.14844, + "grad_norm": 2.265625, + "grad_norm_var": 0.003934733072916667, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 2.274489164352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2462785318493843, + "step": 7422 + }, + { + "epoch": 0.14848, + "grad_norm": 2.328125, + "grad_norm_var": 0.00562744140625, + "learning_rate": 0.0001, + "loss": 4.5288, + "loss/crossentropy": 2.227620482444763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24729500710964203, + "step": 7424 + }, + { + "epoch": 0.14852, + "grad_norm": 2.0, + "grad_norm_var": 0.006864420572916667, + "learning_rate": 0.0001, + "loss": 4.0913, + "loss/crossentropy": 1.9911785125732422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216292105615139, + "step": 7426 + }, + { + "epoch": 0.14856, + "grad_norm": 2.15625, + "grad_norm_var": 0.008199055989583334, + "learning_rate": 0.0001, + "loss": 4.4071, + "loss/crossentropy": 1.8715736865997314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2289058193564415, + "step": 7428 + }, + { + "epoch": 0.1486, + "grad_norm": 2.125, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.4671, + "loss/crossentropy": 2.0192378759384155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327374964952469, + "step": 7430 + }, + { + "epoch": 0.14864, + "grad_norm": 2.046875, + "grad_norm_var": 0.008226521809895833, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 1.9407023191452026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22586089372634888, + "step": 7432 + }, + { + "epoch": 0.14868, + "grad_norm": 2.203125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.7816, + "loss/crossentropy": 2.6349592208862305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2605717331171036, + "step": 7434 + }, + { + "epoch": 0.14872, + "grad_norm": 2.28125, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.5511, + "loss/crossentropy": 2.3015077114105225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292959839105606, + "step": 7436 + }, + { + "epoch": 0.14876, + "grad_norm": 2.1875, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 4.1663, + "loss/crossentropy": 2.081148624420166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21435046195983887, + "step": 7438 + }, + { + "epoch": 0.1488, + "grad_norm": 2.109375, + "grad_norm_var": 0.011735026041666667, + "learning_rate": 0.0001, + "loss": 4.4342, + "loss/crossentropy": 2.235422372817993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23794984817504883, + "step": 7440 + }, + { + "epoch": 0.14884, + "grad_norm": 2.171875, + "grad_norm_var": 0.011295572916666666, + "learning_rate": 0.0001, + "loss": 4.4595, + "loss/crossentropy": 2.1423263549804688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200494110584259, + "step": 7442 + }, + { + "epoch": 0.14888, + "grad_norm": 2.421875, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 4.514, + "loss/crossentropy": 2.3156672716140747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25282321125268936, + "step": 7444 + }, + { + "epoch": 0.14892, + "grad_norm": 2.15625, + "grad_norm_var": 0.020051066080729166, + "learning_rate": 0.0001, + "loss": 4.3107, + "loss/crossentropy": 1.7777396440505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104710191488266, + "step": 7446 + }, + { + "epoch": 0.14896, + "grad_norm": 2.09375, + "grad_norm_var": 0.023341623942057292, + "learning_rate": 0.0001, + "loss": 3.874, + "loss/crossentropy": 1.8015541434288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976182758808136, + "step": 7448 + }, + { + "epoch": 0.149, + "grad_norm": 2.578125, + "grad_norm_var": 0.04948298136393229, + "learning_rate": 0.0001, + "loss": 4.5647, + "loss/crossentropy": 2.36995792388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23767317831516266, + "step": 7450 + }, + { + "epoch": 0.14904, + "grad_norm": 2.1875, + "grad_norm_var": 0.050142161051432294, + "learning_rate": 0.0001, + "loss": 4.7568, + "loss/crossentropy": 2.2867971062660217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2666979879140854, + "step": 7452 + }, + { + "epoch": 0.14908, + "grad_norm": 2.328125, + "grad_norm_var": 0.043794504801432294, + "learning_rate": 0.0001, + "loss": 4.773, + "loss/crossentropy": 2.247913956642151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122594192624092, + "step": 7454 + }, + { + "epoch": 0.14912, + "grad_norm": 2.109375, + "grad_norm_var": 0.04197362263997396, + "learning_rate": 0.0001, + "loss": 4.273, + "loss/crossentropy": 2.0020886063575745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22448039799928665, + "step": 7456 + }, + { + "epoch": 0.14916, + "grad_norm": 2.15625, + "grad_norm_var": 0.043702952067057294, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.2290679216384888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21489766240119934, + "step": 7458 + }, + { + "epoch": 0.1492, + "grad_norm": 2.609375, + "grad_norm_var": 0.049344635009765624, + "learning_rate": 0.0001, + "loss": 4.5919, + "loss/crossentropy": 2.0447877049446106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640856355428696, + "step": 7460 + }, + { + "epoch": 0.14924, + "grad_norm": 2.3125, + "grad_norm_var": 0.045904286702473956, + "learning_rate": 0.0001, + "loss": 4.4614, + "loss/crossentropy": 2.395468831062317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395479902625084, + "step": 7462 + }, + { + "epoch": 0.14928, + "grad_norm": 2.0625, + "grad_norm_var": 0.04019775390625, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 2.117182433605194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302603721618652, + "step": 7464 + }, + { + "epoch": 0.14932, + "grad_norm": 2.125, + "grad_norm_var": 0.019498697916666665, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 2.206045985221863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548774778842926, + "step": 7466 + }, + { + "epoch": 0.14936, + "grad_norm": 2.046875, + "grad_norm_var": 0.0194244384765625, + "learning_rate": 0.0001, + "loss": 4.1531, + "loss/crossentropy": 1.9933450818061829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22455794364213943, + "step": 7468 + }, + { + "epoch": 0.1494, + "grad_norm": 2.125, + "grad_norm_var": 0.017731730143229166, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.216577649116516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24432705342769623, + "step": 7470 + }, + { + "epoch": 0.14944, + "grad_norm": 2.25, + "grad_norm_var": 0.018163045247395832, + "learning_rate": 0.0001, + "loss": 4.6072, + "loss/crossentropy": 2.4282405376434326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2599884122610092, + "step": 7472 + }, + { + "epoch": 0.14948, + "grad_norm": 2.140625, + "grad_norm_var": 0.018089803059895833, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.2677053213119507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24509359151124954, + "step": 7474 + }, + { + "epoch": 0.14952, + "grad_norm": 2.046875, + "grad_norm_var": 0.007877604166666666, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 1.9610475897789001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867657870054245, + "step": 7476 + }, + { + "epoch": 0.14956, + "grad_norm": 2.4375, + "grad_norm_var": 0.012565104166666667, + "learning_rate": 0.0001, + "loss": 4.6286, + "loss/crossentropy": 2.1379209756851196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24773475527763367, + "step": 7478 + }, + { + "epoch": 0.1496, + "grad_norm": 2.0625, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 4.3593, + "loss/crossentropy": 2.295411467552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2417762354016304, + "step": 7480 + }, + { + "epoch": 0.14964, + "grad_norm": 2.28125, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.4839, + "loss/crossentropy": 2.0983279943466187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22220823168754578, + "step": 7482 + }, + { + "epoch": 0.14968, + "grad_norm": 2.34375, + "grad_norm_var": 0.015086873372395834, + "learning_rate": 0.0001, + "loss": 4.3977, + "loss/crossentropy": 2.130508065223694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23387905955314636, + "step": 7484 + }, + { + "epoch": 0.14972, + "grad_norm": 2.265625, + "grad_norm_var": 0.015869140625, + "learning_rate": 0.0001, + "loss": 4.545, + "loss/crossentropy": 2.0727924704551697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23198049515485764, + "step": 7486 + }, + { + "epoch": 0.14976, + "grad_norm": 2.28125, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.3722, + "loss/crossentropy": 2.1407171487808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349495232105255, + "step": 7488 + }, + { + "epoch": 0.1498, + "grad_norm": 2.0625, + "grad_norm_var": 0.016063435872395834, + "learning_rate": 0.0001, + "loss": 4.3483, + "loss/crossentropy": 2.123879909515381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2230711579322815, + "step": 7490 + }, + { + "epoch": 0.14984, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02194188435872396, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 2.1362847685813904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22217128425836563, + "step": 7492 + }, + { + "epoch": 0.14988, + "grad_norm": 2.234375, + "grad_norm_var": 0.01625544230143229, + "learning_rate": 0.0001, + "loss": 4.4753, + "loss/crossentropy": 2.2171897292137146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237472265958786, + "step": 7494 + }, + { + "epoch": 0.14992, + "grad_norm": 2.203125, + "grad_norm_var": 0.016841379801432292, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.9622138142585754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700734317302704, + "step": 7496 + }, + { + "epoch": 0.14996, + "grad_norm": 2.078125, + "grad_norm_var": 0.01666234334309896, + "learning_rate": 0.0001, + "loss": 4.1296, + "loss/crossentropy": 2.300232410430908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26260019838809967, + "step": 7498 + }, + { + "epoch": 0.15, + "grad_norm": 2.21875, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 4.4466, + "loss/crossentropy": 1.9829946756362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21770135313272476, + "step": 7500 + }, + { + "epoch": 0.15004, + "grad_norm": 2.109375, + "grad_norm_var": 0.010935211181640625, + "learning_rate": 0.0001, + "loss": 4.017, + "loss/crossentropy": 1.8421878218650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20965785533189774, + "step": 7502 + }, + { + "epoch": 0.15008, + "grad_norm": 2.203125, + "grad_norm_var": 0.010267893473307291, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 2.0559566020965576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21752064675092697, + "step": 7504 + }, + { + "epoch": 0.15012, + "grad_norm": 2.25, + "grad_norm_var": 0.009834543863932291, + "learning_rate": 0.0001, + "loss": 4.5526, + "loss/crossentropy": 1.862777590751648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026660442352295, + "step": 7506 + }, + { + "epoch": 0.15016, + "grad_norm": 2.046875, + "grad_norm_var": 0.0064198811848958336, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.340041399002075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23489895462989807, + "step": 7508 + }, + { + "epoch": 0.1502, + "grad_norm": 2.21875, + "grad_norm_var": 0.006224568684895833, + "learning_rate": 0.0001, + "loss": 4.5434, + "loss/crossentropy": 2.3596150875091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23775358498096466, + "step": 7510 + }, + { + "epoch": 0.15024, + "grad_norm": 2.015625, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 1.883503019809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22191710770130157, + "step": 7512 + }, + { + "epoch": 0.15028, + "grad_norm": 2.09375, + "grad_norm_var": 0.006723785400390625, + "learning_rate": 0.0001, + "loss": 4.1485, + "loss/crossentropy": 2.12862491607666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177818939089775, + "step": 7514 + }, + { + "epoch": 0.15032, + "grad_norm": 2.078125, + "grad_norm_var": 0.005936431884765625, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 1.9326343536376953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20991001278162003, + "step": 7516 + }, + { + "epoch": 0.15036, + "grad_norm": 2.046875, + "grad_norm_var": 0.008070627848307291, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.1532761454582214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308126464486122, + "step": 7518 + }, + { + "epoch": 0.1504, + "grad_norm": 2.21875, + "grad_norm_var": 0.007867177327473959, + "learning_rate": 0.0001, + "loss": 4.42, + "loss/crossentropy": 1.876515507698059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940685734152794, + "step": 7520 + }, + { + "epoch": 0.15044, + "grad_norm": 2.28125, + "grad_norm_var": 0.008937327067057292, + "learning_rate": 0.0001, + "loss": 4.3515, + "loss/crossentropy": 2.3677611351013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24787116795778275, + "step": 7522 + }, + { + "epoch": 0.15048, + "grad_norm": 2.296875, + "grad_norm_var": 0.010892486572265625, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 1.8500076532363892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271107017993927, + "step": 7524 + }, + { + "epoch": 0.15052, + "grad_norm": 2.171875, + "grad_norm_var": 0.010432688395182292, + "learning_rate": 0.0001, + "loss": 4.3298, + "loss/crossentropy": 1.8013980984687805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181655913591385, + "step": 7526 + }, + { + "epoch": 0.15056, + "grad_norm": 2.09375, + "grad_norm_var": 0.010361480712890624, + "learning_rate": 0.0001, + "loss": 4.4958, + "loss/crossentropy": 2.6469568014144897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2441466525197029, + "step": 7528 + }, + { + "epoch": 0.1506, + "grad_norm": 2.203125, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 1.7992960214614868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209397256374359, + "step": 7530 + }, + { + "epoch": 0.15064, + "grad_norm": 2.625, + "grad_norm_var": 0.023258463541666666, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.1914783120155334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23764144629240036, + "step": 7532 + }, + { + "epoch": 0.15068, + "grad_norm": 2.140625, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.0035120844841003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483591943979263, + "step": 7534 + }, + { + "epoch": 0.15072, + "grad_norm": 1.921875, + "grad_norm_var": 0.03235677083333333, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.0218639969825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20919294655323029, + "step": 7536 + }, + { + "epoch": 0.15076, + "grad_norm": 2.015625, + "grad_norm_var": 0.03219401041666667, + "learning_rate": 0.0001, + "loss": 4.1091, + "loss/crossentropy": 2.006688416004181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21247616410255432, + "step": 7538 + }, + { + "epoch": 0.1508, + "grad_norm": 2.484375, + "grad_norm_var": 0.03542378743489583, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.1565613746643066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30109211802482605, + "step": 7540 + }, + { + "epoch": 0.15084, + "grad_norm": 2.359375, + "grad_norm_var": 0.03603413899739583, + "learning_rate": 0.0001, + "loss": 4.3621, + "loss/crossentropy": 1.9676685333251953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23605409264564514, + "step": 7542 + }, + { + "epoch": 0.15088, + "grad_norm": 2.34375, + "grad_norm_var": 0.051493326822916664, + "learning_rate": 0.0001, + "loss": 4.4178, + "loss/crossentropy": 2.1150137186050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22903620451688766, + "step": 7544 + }, + { + "epoch": 0.15092, + "grad_norm": 2.34375, + "grad_norm_var": 0.051253255208333334, + "learning_rate": 0.0001, + "loss": 4.3186, + "loss/crossentropy": 2.1894554495811462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362927675247192, + "step": 7546 + }, + { + "epoch": 0.15096, + "grad_norm": 2.21875, + "grad_norm_var": 0.0410308837890625, + "learning_rate": 0.0001, + "loss": 4.4529, + "loss/crossentropy": 1.9624019861221313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22628726810216904, + "step": 7548 + }, + { + "epoch": 0.151, + "grad_norm": 2.0, + "grad_norm_var": 0.04338277180989583, + "learning_rate": 0.0001, + "loss": 4.2795, + "loss/crossentropy": 2.1547625064849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295984849333763, + "step": 7550 + }, + { + "epoch": 0.15104, + "grad_norm": 2.125, + "grad_norm_var": 0.03857014973958333, + "learning_rate": 0.0001, + "loss": 4.4638, + "loss/crossentropy": 2.229305863380432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2427791878581047, + "step": 7552 + }, + { + "epoch": 0.15108, + "grad_norm": 2.4375, + "grad_norm_var": 0.037093098958333334, + "learning_rate": 0.0001, + "loss": 4.7319, + "loss/crossentropy": 2.4998362064361572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23718395829200745, + "step": 7554 + }, + { + "epoch": 0.15112, + "grad_norm": 2.15625, + "grad_norm_var": 0.033722941080729166, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.319428563117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26024487614631653, + "step": 7556 + }, + { + "epoch": 0.15116, + "grad_norm": 2.015625, + "grad_norm_var": 0.0357818603515625, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 1.861966609954834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121218591928482, + "step": 7558 + }, + { + "epoch": 0.1512, + "grad_norm": 2.265625, + "grad_norm_var": 0.015751139322916666, + "learning_rate": 0.0001, + "loss": 4.5963, + "loss/crossentropy": 2.1688510179519653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22341617196798325, + "step": 7560 + }, + { + "epoch": 0.15124, + "grad_norm": 2.140625, + "grad_norm_var": 0.013932291666666667, + "learning_rate": 0.0001, + "loss": 4.4808, + "loss/crossentropy": 2.2833406925201416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648481279611588, + "step": 7562 + }, + { + "epoch": 0.15128, + "grad_norm": 2.203125, + "grad_norm_var": 0.0138671875, + "learning_rate": 0.0001, + "loss": 4.4126, + "loss/crossentropy": 2.224393129348755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23414459079504013, + "step": 7564 + }, + { + "epoch": 0.15132, + "grad_norm": 2.25, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.253, + "loss/crossentropy": 2.003828763961792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166791632771492, + "step": 7566 + }, + { + "epoch": 0.15136, + "grad_norm": 2.3125, + "grad_norm_var": 0.01168212890625, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 2.1813005208969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28873007744550705, + "step": 7568 + }, + { + "epoch": 0.1514, + "grad_norm": 1.9921875, + "grad_norm_var": 0.028696441650390626, + "learning_rate": 0.0001, + "loss": 4.3824, + "loss/crossentropy": 2.3818061351776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24916332960128784, + "step": 7570 + }, + { + "epoch": 0.15144, + "grad_norm": 2.03125, + "grad_norm_var": 0.030460357666015625, + "learning_rate": 0.0001, + "loss": 4.1825, + "loss/crossentropy": 2.041518449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22352956235408783, + "step": 7572 + }, + { + "epoch": 0.15148, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0326904296875, + "learning_rate": 0.0001, + "loss": 3.8531, + "loss/crossentropy": 1.856759488582611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20227209478616714, + "step": 7574 + }, + { + "epoch": 0.15152, + "grad_norm": 2.09375, + "grad_norm_var": 0.03157450358072917, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 1.9612281918525696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22188737243413925, + "step": 7576 + }, + { + "epoch": 0.15156, + "grad_norm": 2.015625, + "grad_norm_var": 0.0337554931640625, + "learning_rate": 0.0001, + "loss": 4.0405, + "loss/crossentropy": 1.9610649943351746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282957062125206, + "step": 7578 + }, + { + "epoch": 0.1516, + "grad_norm": 2.078125, + "grad_norm_var": 0.03400472005208333, + "learning_rate": 0.0001, + "loss": 4.1253, + "loss/crossentropy": 1.9239189624786377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22926658391952515, + "step": 7580 + }, + { + "epoch": 0.15164, + "grad_norm": 2.171875, + "grad_norm_var": 0.033299763997395836, + "learning_rate": 0.0001, + "loss": 4.548, + "loss/crossentropy": 2.425737738609314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24260805547237396, + "step": 7582 + }, + { + "epoch": 0.15168, + "grad_norm": 2.28125, + "grad_norm_var": 0.03242085774739583, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 1.9602521061897278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23231150209903717, + "step": 7584 + }, + { + "epoch": 0.15172, + "grad_norm": 2.03125, + "grad_norm_var": 0.007954661051432292, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 2.3671375513076782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24326395988464355, + "step": 7586 + }, + { + "epoch": 0.15176, + "grad_norm": 2.015625, + "grad_norm_var": 0.007675933837890625, + "learning_rate": 0.0001, + "loss": 4.1297, + "loss/crossentropy": 2.0128119587898254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20082567632198334, + "step": 7588 + }, + { + "epoch": 0.1518, + "grad_norm": 2.265625, + "grad_norm_var": 0.0085845947265625, + "learning_rate": 0.0001, + "loss": 4.5813, + "loss/crossentropy": 2.3719125986099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.243422269821167, + "step": 7590 + }, + { + "epoch": 0.15184, + "grad_norm": 2.28125, + "grad_norm_var": 0.010575358072916667, + "learning_rate": 0.0001, + "loss": 4.6177, + "loss/crossentropy": 2.088695764541626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23578013479709625, + "step": 7592 + }, + { + "epoch": 0.15188, + "grad_norm": 2.203125, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.3719, + "loss/crossentropy": 2.151872456073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2496410757303238, + "step": 7594 + }, + { + "epoch": 0.15192, + "grad_norm": 2.265625, + "grad_norm_var": 0.017281087239583333, + "learning_rate": 0.0001, + "loss": 4.6041, + "loss/crossentropy": 2.020963430404663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25003430247306824, + "step": 7596 + }, + { + "epoch": 0.15196, + "grad_norm": 2.34375, + "grad_norm_var": 0.0199615478515625, + "learning_rate": 0.0001, + "loss": 4.4439, + "loss/crossentropy": 1.9290395379066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20771078765392303, + "step": 7598 + }, + { + "epoch": 0.152, + "grad_norm": 2.09375, + "grad_norm_var": 0.020536295572916665, + "learning_rate": 0.0001, + "loss": 4.2461, + "loss/crossentropy": 2.0420188307762146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2375432327389717, + "step": 7600 + }, + { + "epoch": 0.15204, + "grad_norm": 2.21875, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.4291, + "loss/crossentropy": 2.474969744682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535504847764969, + "step": 7602 + }, + { + "epoch": 0.15208, + "grad_norm": 2.3125, + "grad_norm_var": 0.015550740559895833, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 2.177125334739685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24919381737709045, + "step": 7604 + }, + { + "epoch": 0.15212, + "grad_norm": 2.3125, + "grad_norm_var": 0.015510050455729167, + "learning_rate": 0.0001, + "loss": 4.4719, + "loss/crossentropy": 2.348747491836548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22336618602275848, + "step": 7606 + }, + { + "epoch": 0.15216, + "grad_norm": 2.28125, + "grad_norm_var": 0.014676920572916667, + "learning_rate": 0.0001, + "loss": 4.4502, + "loss/crossentropy": 1.718321442604065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19830843806266785, + "step": 7608 + }, + { + "epoch": 0.1522, + "grad_norm": 2.171875, + "grad_norm_var": 0.015901692708333335, + "learning_rate": 0.0001, + "loss": 4.4505, + "loss/crossentropy": 2.2954800128936768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23203244805335999, + "step": 7610 + }, + { + "epoch": 0.15224, + "grad_norm": 2.0625, + "grad_norm_var": 0.013570149739583334, + "learning_rate": 0.0001, + "loss": 4.1141, + "loss/crossentropy": 1.6918454766273499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18312305957078934, + "step": 7612 + }, + { + "epoch": 0.15228, + "grad_norm": 2.015625, + "grad_norm_var": 0.012727864583333333, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 1.887774109840393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151220142841339, + "step": 7614 + }, + { + "epoch": 0.15232, + "grad_norm": 2.125, + "grad_norm_var": 0.01373291015625, + "learning_rate": 0.0001, + "loss": 4.1274, + "loss/crossentropy": 2.0903998613357544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22486191242933273, + "step": 7616 + }, + { + "epoch": 0.15236, + "grad_norm": 2.0625, + "grad_norm_var": 0.016310373942057293, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 1.929358720779419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20021560043096542, + "step": 7618 + }, + { + "epoch": 0.1524, + "grad_norm": 2.703125, + "grad_norm_var": 0.03484064737955729, + "learning_rate": 0.0001, + "loss": 4.5726, + "loss/crossentropy": 1.9812661409378052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21395482122898102, + "step": 7620 + }, + { + "epoch": 0.15244, + "grad_norm": 2.390625, + "grad_norm_var": 0.03358942667643229, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 2.148552179336548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22805052995681763, + "step": 7622 + }, + { + "epoch": 0.15248, + "grad_norm": 2.078125, + "grad_norm_var": 0.03416926066080729, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 2.012804687023163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248745858669281, + "step": 7624 + }, + { + "epoch": 0.15252, + "grad_norm": 2.046875, + "grad_norm_var": 0.03463312784830729, + "learning_rate": 0.0001, + "loss": 4.3948, + "loss/crossentropy": 2.3378156423568726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457970678806305, + "step": 7626 + }, + { + "epoch": 0.15256, + "grad_norm": 2.296875, + "grad_norm_var": 0.036382802327473956, + "learning_rate": 0.0001, + "loss": 4.5414, + "loss/crossentropy": 2.0815274119377136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22083784639835358, + "step": 7628 + }, + { + "epoch": 0.1526, + "grad_norm": 1.984375, + "grad_norm_var": 0.039589182535807295, + "learning_rate": 0.0001, + "loss": 4.2609, + "loss/crossentropy": 2.172307014465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25988608598709106, + "step": 7630 + }, + { + "epoch": 0.15264, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03921305338541667, + "learning_rate": 0.0001, + "loss": 4.3711, + "loss/crossentropy": 1.973683476448059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20379862189292908, + "step": 7632 + }, + { + "epoch": 0.15268, + "grad_norm": 1.921875, + "grad_norm_var": 0.039406077067057295, + "learning_rate": 0.0001, + "loss": 4.3394, + "loss/crossentropy": 2.175020456314087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22480874508619308, + "step": 7634 + }, + { + "epoch": 0.15272, + "grad_norm": 2.1875, + "grad_norm_var": 0.022304026285807292, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.063227415084839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2324352264404297, + "step": 7636 + }, + { + "epoch": 0.15276, + "grad_norm": 2.296875, + "grad_norm_var": 0.019769032796223957, + "learning_rate": 0.0001, + "loss": 4.4654, + "loss/crossentropy": 1.9297555088996887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171614021062851, + "step": 7638 + }, + { + "epoch": 0.1528, + "grad_norm": 2.296875, + "grad_norm_var": 0.019421132405598958, + "learning_rate": 0.0001, + "loss": 4.4698, + "loss/crossentropy": 1.9808542132377625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21765583008527756, + "step": 7640 + }, + { + "epoch": 0.15284, + "grad_norm": 2.09375, + "grad_norm_var": 0.018790435791015626, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.009860336780548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20963389426469803, + "step": 7642 + }, + { + "epoch": 0.15288, + "grad_norm": 2.015625, + "grad_norm_var": 0.017319488525390624, + "learning_rate": 0.0001, + "loss": 4.2649, + "loss/crossentropy": 1.912703514099121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23796956986188889, + "step": 7644 + }, + { + "epoch": 0.15292, + "grad_norm": 2.25, + "grad_norm_var": 0.012341054280598958, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.2945642471313477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2621624022722244, + "step": 7646 + }, + { + "epoch": 0.15296, + "grad_norm": 2.03125, + "grad_norm_var": 0.011067708333333334, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.185902237892151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23890018463134766, + "step": 7648 + }, + { + "epoch": 0.153, + "grad_norm": 2.546875, + "grad_norm_var": 0.018648274739583335, + "learning_rate": 0.0001, + "loss": 4.5815, + "loss/crossentropy": 1.8625503778457642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206135168671608, + "step": 7650 + }, + { + "epoch": 0.15304, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020684560139973957, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 1.8815893530845642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166741847991943, + "step": 7652 + }, + { + "epoch": 0.15308, + "grad_norm": 2.203125, + "grad_norm_var": 0.020979563395182293, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.156697630882263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22994951903820038, + "step": 7654 + }, + { + "epoch": 0.15312, + "grad_norm": 2.25, + "grad_norm_var": 0.02005182902018229, + "learning_rate": 0.0001, + "loss": 4.366, + "loss/crossentropy": 1.6800576448440552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21905134618282318, + "step": 7656 + }, + { + "epoch": 0.15316, + "grad_norm": 2.046875, + "grad_norm_var": 0.02061945597330729, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.9215145707130432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034405767917633, + "step": 7658 + }, + { + "epoch": 0.1532, + "grad_norm": 2.390625, + "grad_norm_var": 0.02380956013997396, + "learning_rate": 0.0001, + "loss": 4.6475, + "loss/crossentropy": 2.153718650341034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25605182349681854, + "step": 7660 + }, + { + "epoch": 0.15324, + "grad_norm": 2.203125, + "grad_norm_var": 0.02466608683268229, + "learning_rate": 0.0001, + "loss": 4.2676, + "loss/crossentropy": 1.8782889246940613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574549585580826, + "step": 7662 + }, + { + "epoch": 0.15328, + "grad_norm": 2.265625, + "grad_norm_var": 0.031040191650390625, + "learning_rate": 0.0001, + "loss": 4.2171, + "loss/crossentropy": 2.003354489803314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24152260273694992, + "step": 7664 + }, + { + "epoch": 0.15332, + "grad_norm": 2.203125, + "grad_norm_var": 0.022299957275390626, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 2.217758059501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24700726568698883, + "step": 7666 + }, + { + "epoch": 0.15336, + "grad_norm": 2.1875, + "grad_norm_var": 0.017757161458333334, + "learning_rate": 0.0001, + "loss": 4.3418, + "loss/crossentropy": 1.934537410736084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163502648472786, + "step": 7668 + }, + { + "epoch": 0.1534, + "grad_norm": 2.03125, + "grad_norm_var": 0.020340983072916666, + "learning_rate": 0.0001, + "loss": 4.3, + "loss/crossentropy": 2.007661819458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769292652606964, + "step": 7670 + }, + { + "epoch": 0.15344, + "grad_norm": 2.09375, + "grad_norm_var": 0.020653279622395833, + "learning_rate": 0.0001, + "loss": 4.3082, + "loss/crossentropy": 2.1586949825286865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453690618276596, + "step": 7672 + }, + { + "epoch": 0.15348, + "grad_norm": 2.109375, + "grad_norm_var": 0.0197174072265625, + "learning_rate": 0.0001, + "loss": 4.4433, + "loss/crossentropy": 2.2201706171035767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639702260494232, + "step": 7674 + }, + { + "epoch": 0.15352, + "grad_norm": 2.15625, + "grad_norm_var": 0.015104166666666667, + "learning_rate": 0.0001, + "loss": 4.2337, + "loss/crossentropy": 2.059146285057068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411910444498062, + "step": 7676 + }, + { + "epoch": 0.15356, + "grad_norm": 2.0625, + "grad_norm_var": 0.015397135416666667, + "learning_rate": 0.0001, + "loss": 4.2318, + "loss/crossentropy": 1.9768954515457153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798508614301682, + "step": 7678 + }, + { + "epoch": 0.1536, + "grad_norm": 2.109375, + "grad_norm_var": 0.003413899739583333, + "learning_rate": 0.0001, + "loss": 4.4823, + "loss/crossentropy": 1.8555094003677368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19453337788581848, + "step": 7680 + }, + { + "epoch": 0.15364, + "grad_norm": 2.03125, + "grad_norm_var": 0.0036783854166666666, + "learning_rate": 0.0001, + "loss": 4.0379, + "loss/crossentropy": 1.5948917865753174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17746463418006897, + "step": 7682 + }, + { + "epoch": 0.15368, + "grad_norm": 2.1875, + "grad_norm_var": 0.0034464518229166668, + "learning_rate": 0.0001, + "loss": 4.2901, + "loss/crossentropy": 1.8917757868766785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103574424982071, + "step": 7684 + }, + { + "epoch": 0.15372, + "grad_norm": 2.140625, + "grad_norm_var": 0.0030670166015625, + "learning_rate": 0.0001, + "loss": 4.3326, + "loss/crossentropy": 2.090232729911804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199823334813118, + "step": 7686 + }, + { + "epoch": 0.15376, + "grad_norm": 2.03125, + "grad_norm_var": 0.004979451497395833, + "learning_rate": 0.0001, + "loss": 4.3695, + "loss/crossentropy": 1.8155178427696228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044857800006866, + "step": 7688 + }, + { + "epoch": 0.1538, + "grad_norm": 2.09375, + "grad_norm_var": 0.0063629150390625, + "learning_rate": 0.0001, + "loss": 4.5003, + "loss/crossentropy": 2.31532621383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437051385641098, + "step": 7690 + }, + { + "epoch": 0.15384, + "grad_norm": 2.078125, + "grad_norm_var": 0.0065419514973958336, + "learning_rate": 0.0001, + "loss": 4.3815, + "loss/crossentropy": 1.9688079357147217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22686263918876648, + "step": 7692 + }, + { + "epoch": 0.15388, + "grad_norm": 2.046875, + "grad_norm_var": 0.010001373291015626, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 1.9029017686843872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20635761320590973, + "step": 7694 + }, + { + "epoch": 0.15392, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010587565104166667, + "learning_rate": 0.0001, + "loss": 3.91, + "loss/crossentropy": 1.9817028641700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080235257744789, + "step": 7696 + }, + { + "epoch": 0.15396, + "grad_norm": 2.171875, + "grad_norm_var": 0.017463175455729167, + "learning_rate": 0.0001, + "loss": 4.3301, + "loss/crossentropy": 2.3392014503479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26226382702589035, + "step": 7698 + }, + { + "epoch": 0.154, + "grad_norm": 1.9375, + "grad_norm_var": 0.021320597330729166, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 1.7265403866767883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21790936589241028, + "step": 7700 + }, + { + "epoch": 0.15404, + "grad_norm": 2.203125, + "grad_norm_var": 0.02276585896809896, + "learning_rate": 0.0001, + "loss": 4.1725, + "loss/crossentropy": 2.024384081363678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987473219633102, + "step": 7702 + }, + { + "epoch": 0.15408, + "grad_norm": 2.5, + "grad_norm_var": 0.03117650349934896, + "learning_rate": 0.0001, + "loss": 4.6702, + "loss/crossentropy": 2.1840893030166626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24231631308794022, + "step": 7704 + }, + { + "epoch": 0.15412, + "grad_norm": 2.171875, + "grad_norm_var": 0.03001683553059896, + "learning_rate": 0.0001, + "loss": 4.2197, + "loss/crossentropy": 2.1950928568840027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808429062366486, + "step": 7706 + }, + { + "epoch": 0.15416, + "grad_norm": 2.140625, + "grad_norm_var": 0.03029352823893229, + "learning_rate": 0.0001, + "loss": 4.3215, + "loss/crossentropy": 1.9541537165641785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22109197825193405, + "step": 7708 + }, + { + "epoch": 0.1542, + "grad_norm": 2.03125, + "grad_norm_var": 0.0257476806640625, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.0232877135276794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214762382209301, + "step": 7710 + }, + { + "epoch": 0.15424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.023859659830729168, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 2.003768503665924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21820105612277985, + "step": 7712 + }, + { + "epoch": 0.15428, + "grad_norm": 2.21875, + "grad_norm_var": 0.020026652018229167, + "learning_rate": 0.0001, + "loss": 4.2471, + "loss/crossentropy": 2.007221221923828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203196883201599, + "step": 7714 + }, + { + "epoch": 0.15432, + "grad_norm": 2.09375, + "grad_norm_var": 0.015900675455729166, + "learning_rate": 0.0001, + "loss": 4.3201, + "loss/crossentropy": 2.0134615898132324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916545927524567, + "step": 7716 + }, + { + "epoch": 0.15436, + "grad_norm": 2.328125, + "grad_norm_var": 0.016013336181640626, + "learning_rate": 0.0001, + "loss": 4.3821, + "loss/crossentropy": 1.9012999534606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758640468120575, + "step": 7718 + }, + { + "epoch": 0.1544, + "grad_norm": 2.25, + "grad_norm_var": 0.008727773030598959, + "learning_rate": 0.0001, + "loss": 4.1555, + "loss/crossentropy": 2.074169874191284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22667942196130753, + "step": 7720 + }, + { + "epoch": 0.15444, + "grad_norm": 2.21875, + "grad_norm_var": 0.009059397379557292, + "learning_rate": 0.0001, + "loss": 4.4355, + "loss/crossentropy": 2.070925295352936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353248715400696, + "step": 7722 + }, + { + "epoch": 0.15448, + "grad_norm": 2.359375, + "grad_norm_var": 0.011163075764973959, + "learning_rate": 0.0001, + "loss": 4.5676, + "loss/crossentropy": 2.289568066596985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23175117373466492, + "step": 7724 + }, + { + "epoch": 0.15452, + "grad_norm": 2.15625, + "grad_norm_var": 0.012827301025390625, + "learning_rate": 0.0001, + "loss": 4.6152, + "loss/crossentropy": 2.21374249458313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23585008084774017, + "step": 7726 + }, + { + "epoch": 0.15456, + "grad_norm": 2.328125, + "grad_norm_var": 0.010791015625, + "learning_rate": 0.0001, + "loss": 4.5575, + "loss/crossentropy": 2.15897136926651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24143048375844955, + "step": 7728 + }, + { + "epoch": 0.1546, + "grad_norm": 2.359375, + "grad_norm_var": 0.01142578125, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.134206771850586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344487488269806, + "step": 7730 + }, + { + "epoch": 0.15464, + "grad_norm": 2.359375, + "grad_norm_var": 0.011180623372395834, + "learning_rate": 0.0001, + "loss": 4.5827, + "loss/crossentropy": 2.3832513093948364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24878299236297607, + "step": 7732 + }, + { + "epoch": 0.15468, + "grad_norm": 2.203125, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.0722, + "loss/crossentropy": 1.917544960975647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134451985359192, + "step": 7734 + }, + { + "epoch": 0.15472, + "grad_norm": 2.40625, + "grad_norm_var": 0.011767578125, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.1081286668777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23577219247817993, + "step": 7736 + }, + { + "epoch": 0.15476, + "grad_norm": 2.4375, + "grad_norm_var": 0.0137603759765625, + "learning_rate": 0.0001, + "loss": 4.7942, + "loss/crossentropy": 2.214662790298462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23744845390319824, + "step": 7738 + }, + { + "epoch": 0.1548, + "grad_norm": 2.359375, + "grad_norm_var": 0.013895670572916666, + "learning_rate": 0.0001, + "loss": 4.4325, + "loss/crossentropy": 1.995256781578064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21037640422582626, + "step": 7740 + }, + { + "epoch": 0.15484, + "grad_norm": 1.890625, + "grad_norm_var": 0.024006144205729166, + "learning_rate": 0.0001, + "loss": 4.0701, + "loss/crossentropy": 2.2877765893936157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23090071976184845, + "step": 7742 + }, + { + "epoch": 0.15488, + "grad_norm": 2.125, + "grad_norm_var": 0.022652180989583333, + "learning_rate": 0.0001, + "loss": 4.6167, + "loss/crossentropy": 2.23935329914093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24617131054401398, + "step": 7744 + }, + { + "epoch": 0.15492, + "grad_norm": 2.125, + "grad_norm_var": 0.023005167643229168, + "learning_rate": 0.0001, + "loss": 4.4886, + "loss/crossentropy": 2.15006422996521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2235095053911209, + "step": 7746 + }, + { + "epoch": 0.15496, + "grad_norm": 2.015625, + "grad_norm_var": 0.024144490559895832, + "learning_rate": 0.0001, + "loss": 4.1873, + "loss/crossentropy": 1.9917905926704407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606986224651337, + "step": 7748 + }, + { + "epoch": 0.155, + "grad_norm": 2.203125, + "grad_norm_var": 0.025804646809895835, + "learning_rate": 0.0001, + "loss": 4.6347, + "loss/crossentropy": 2.303179979324341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2541813999414444, + "step": 7750 + }, + { + "epoch": 0.15504, + "grad_norm": 2.109375, + "grad_norm_var": 0.022786458333333332, + "learning_rate": 0.0001, + "loss": 4.3647, + "loss/crossentropy": 2.231510281562805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2479737550020218, + "step": 7752 + }, + { + "epoch": 0.15508, + "grad_norm": 2.25, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.1714, + "loss/crossentropy": 2.0530437231063843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220057666301727, + "step": 7754 + }, + { + "epoch": 0.15512, + "grad_norm": 2.171875, + "grad_norm_var": 0.015909830729166668, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.0889222025871277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22389977425336838, + "step": 7756 + }, + { + "epoch": 0.15516, + "grad_norm": 2.28125, + "grad_norm_var": 0.0131256103515625, + "learning_rate": 0.0001, + "loss": 4.4748, + "loss/crossentropy": 2.3807711601257324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2397037297487259, + "step": 7758 + }, + { + "epoch": 0.1552, + "grad_norm": 2.171875, + "grad_norm_var": 0.01297607421875, + "learning_rate": 0.0001, + "loss": 4.3382, + "loss/crossentropy": 1.8144067525863647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19052604585886002, + "step": 7760 + }, + { + "epoch": 0.15524, + "grad_norm": 2.09375, + "grad_norm_var": 0.013240559895833334, + "learning_rate": 0.0001, + "loss": 4.5358, + "loss/crossentropy": 2.295349955558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2168404459953308, + "step": 7762 + }, + { + "epoch": 0.15528, + "grad_norm": 11.8125, + "grad_norm_var": 5.868936920166016, + "learning_rate": 0.0001, + "loss": 4.1706, + "loss/crossentropy": 1.7281805276870728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230300635099411, + "step": 7764 + }, + { + "epoch": 0.15532, + "grad_norm": 2.375, + "grad_norm_var": 5.861083730061849, + "learning_rate": 0.0001, + "loss": 4.3867, + "loss/crossentropy": 2.2153135538101196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24419061839580536, + "step": 7766 + }, + { + "epoch": 0.15536, + "grad_norm": 2.03125, + "grad_norm_var": 5.890169270833334, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.0259060859680176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119704708456993, + "step": 7768 + }, + { + "epoch": 0.1554, + "grad_norm": 2.21875, + "grad_norm_var": 5.871726226806641, + "learning_rate": 0.0001, + "loss": 4.3405, + "loss/crossentropy": 2.2399297952651978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23252833634614944, + "step": 7770 + }, + { + "epoch": 0.15544, + "grad_norm": 2.109375, + "grad_norm_var": 5.86380615234375, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 2.0974661111831665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085232511162758, + "step": 7772 + }, + { + "epoch": 0.15548, + "grad_norm": 2.21875, + "grad_norm_var": 5.861717732747396, + "learning_rate": 0.0001, + "loss": 4.4141, + "loss/crossentropy": 2.0121108293533325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24354346096515656, + "step": 7774 + }, + { + "epoch": 0.15552, + "grad_norm": 2.171875, + "grad_norm_var": 5.843431599934896, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.1463273763656616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22797952592372894, + "step": 7776 + }, + { + "epoch": 0.15556, + "grad_norm": 1.9140625, + "grad_norm_var": 5.875705718994141, + "learning_rate": 0.0001, + "loss": 3.9814, + "loss/crossentropy": 1.6362827122211456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18291430547833443, + "step": 7778 + }, + { + "epoch": 0.1556, + "grad_norm": 2.21875, + "grad_norm_var": 0.016283162434895835, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.4894620180130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24631594866514206, + "step": 7780 + }, + { + "epoch": 0.15564, + "grad_norm": 2.15625, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 4.5799, + "loss/crossentropy": 2.346967577934265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964853465557098, + "step": 7782 + }, + { + "epoch": 0.15568, + "grad_norm": 2.03125, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 4.5036, + "loss/crossentropy": 2.0165189504623413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22410035878419876, + "step": 7784 + }, + { + "epoch": 0.15572, + "grad_norm": 2.140625, + "grad_norm_var": 0.009417470296223958, + "learning_rate": 0.0001, + "loss": 4.0735, + "loss/crossentropy": 1.7486848831176758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20792805403470993, + "step": 7786 + }, + { + "epoch": 0.15576, + "grad_norm": 2.03125, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 1.9615037441253662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22578728944063187, + "step": 7788 + }, + { + "epoch": 0.1558, + "grad_norm": 2.09375, + "grad_norm_var": 0.009069569905598958, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.8386783003807068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21898606419563293, + "step": 7790 + }, + { + "epoch": 0.15584, + "grad_norm": 2.15625, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 4.6429, + "loss/crossentropy": 2.1383039951324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2322411835193634, + "step": 7792 + }, + { + "epoch": 0.15588, + "grad_norm": 2.21875, + "grad_norm_var": 0.008918253580729167, + "learning_rate": 0.0001, + "loss": 4.3568, + "loss/crossentropy": 2.510488271713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24672146886587143, + "step": 7794 + }, + { + "epoch": 0.15592, + "grad_norm": 2.125, + "grad_norm_var": 0.009309895833333333, + "learning_rate": 0.0001, + "loss": 4.4328, + "loss/crossentropy": 2.03993421792984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23064473271369934, + "step": 7796 + }, + { + "epoch": 0.15596, + "grad_norm": 2.09375, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 4.4833, + "loss/crossentropy": 2.305809736251831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916814893484116, + "step": 7798 + }, + { + "epoch": 0.156, + "grad_norm": 2.03125, + "grad_norm_var": 0.007796223958333333, + "learning_rate": 0.0001, + "loss": 4.1218, + "loss/crossentropy": 1.8330454230308533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20832987129688263, + "step": 7800 + }, + { + "epoch": 0.15604, + "grad_norm": 2.140625, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 4.2579, + "loss/crossentropy": 1.9194663166999817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163349390029907, + "step": 7802 + }, + { + "epoch": 0.15608, + "grad_norm": 2.078125, + "grad_norm_var": 0.006917317708333333, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 1.798878252506256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19532181322574615, + "step": 7804 + }, + { + "epoch": 0.15612, + "grad_norm": 2.140625, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2949, + "loss/crossentropy": 1.730432152748108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20092248916625977, + "step": 7806 + }, + { + "epoch": 0.15616, + "grad_norm": 2.515625, + "grad_norm_var": 0.01304931640625, + "learning_rate": 0.0001, + "loss": 4.3109, + "loss/crossentropy": 2.1426968574523926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22185371816158295, + "step": 7808 + }, + { + "epoch": 0.1562, + "grad_norm": 2.078125, + "grad_norm_var": 0.012848917643229167, + "learning_rate": 0.0001, + "loss": 4.4308, + "loss/crossentropy": 1.983969271183014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22203146666288376, + "step": 7810 + }, + { + "epoch": 0.15624, + "grad_norm": 2.0625, + "grad_norm_var": 0.016532389322916667, + "learning_rate": 0.0001, + "loss": 4.2114, + "loss/crossentropy": 2.2948192954063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23453453928232193, + "step": 7812 + }, + { + "epoch": 0.15628, + "grad_norm": 2.09375, + "grad_norm_var": 0.0175933837890625, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 1.9190048575401306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22457829862833023, + "step": 7814 + }, + { + "epoch": 0.15632, + "grad_norm": 2.171875, + "grad_norm_var": 0.01529541015625, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.0268847346305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21628276258707047, + "step": 7816 + }, + { + "epoch": 0.15636, + "grad_norm": 2.203125, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 4.3553, + "loss/crossentropy": 2.0050706267356873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108875632286072, + "step": 7818 + }, + { + "epoch": 0.1564, + "grad_norm": 2.453125, + "grad_norm_var": 0.024470774332682292, + "learning_rate": 0.0001, + "loss": 4.2059, + "loss/crossentropy": 2.096635937690735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22623063623905182, + "step": 7820 + }, + { + "epoch": 0.15644, + "grad_norm": 2.21875, + "grad_norm_var": 0.024580637613932293, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 1.9188768863677979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088901162147522, + "step": 7822 + }, + { + "epoch": 0.15648, + "grad_norm": 2.203125, + "grad_norm_var": 0.018304189046223957, + "learning_rate": 0.0001, + "loss": 4.4062, + "loss/crossentropy": 2.3639817237854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2608294039964676, + "step": 7824 + }, + { + "epoch": 0.15652, + "grad_norm": 2.1875, + "grad_norm_var": 0.017765045166015625, + "learning_rate": 0.0001, + "loss": 4.2123, + "loss/crossentropy": 1.9716956615447998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23327408730983734, + "step": 7826 + }, + { + "epoch": 0.15656, + "grad_norm": 2.265625, + "grad_norm_var": 0.08131688435872396, + "learning_rate": 0.0001, + "loss": 4.1377, + "loss/crossentropy": 2.004276990890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22930586338043213, + "step": 7828 + }, + { + "epoch": 0.1566, + "grad_norm": 2.078125, + "grad_norm_var": 0.08247858683268229, + "learning_rate": 0.0001, + "loss": 4.4703, + "loss/crossentropy": 1.7996181845664978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21030305325984955, + "step": 7830 + }, + { + "epoch": 0.15664, + "grad_norm": 2.34375, + "grad_norm_var": 0.09555435180664062, + "learning_rate": 0.0001, + "loss": 4.7938, + "loss/crossentropy": 2.192178189754486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232302725315094, + "step": 7832 + }, + { + "epoch": 0.15668, + "grad_norm": 2.109375, + "grad_norm_var": 0.09538345336914063, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 1.7093925476074219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19697313755750656, + "step": 7834 + }, + { + "epoch": 0.15672, + "grad_norm": 2.234375, + "grad_norm_var": 0.09130859375, + "learning_rate": 0.0001, + "loss": 4.2534, + "loss/crossentropy": 1.915247917175293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003933250904083, + "step": 7836 + }, + { + "epoch": 0.15676, + "grad_norm": 2.375, + "grad_norm_var": 0.091162109375, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 2.188641667366028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23260314762592316, + "step": 7838 + }, + { + "epoch": 0.1568, + "grad_norm": 2.109375, + "grad_norm_var": 0.09501953125, + "learning_rate": 0.0001, + "loss": 4.7329, + "loss/crossentropy": 2.3316495418548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24601806700229645, + "step": 7840 + }, + { + "epoch": 0.15684, + "grad_norm": 2.28125, + "grad_norm_var": 0.0918121337890625, + "learning_rate": 0.0001, + "loss": 4.2934, + "loss/crossentropy": 2.140946924686432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23532958328723907, + "step": 7842 + }, + { + "epoch": 0.15688, + "grad_norm": 2.21875, + "grad_norm_var": 0.031525675455729166, + "learning_rate": 0.0001, + "loss": 4.2738, + "loss/crossentropy": 2.372095465660095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2615740895271301, + "step": 7844 + }, + { + "epoch": 0.15692, + "grad_norm": 2.25, + "grad_norm_var": 0.028727213541666668, + "learning_rate": 0.0001, + "loss": 4.1954, + "loss/crossentropy": 1.8433185815811157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160269021987915, + "step": 7846 + }, + { + "epoch": 0.15696, + "grad_norm": 2.140625, + "grad_norm_var": 0.016071573893229166, + "learning_rate": 0.0001, + "loss": 4.2489, + "loss/crossentropy": 2.012324333190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964300200343132, + "step": 7848 + }, + { + "epoch": 0.157, + "grad_norm": 2.203125, + "grad_norm_var": 0.014090983072916667, + "learning_rate": 0.0001, + "loss": 4.486, + "loss/crossentropy": 2.0325884222984314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080926597118378, + "step": 7850 + }, + { + "epoch": 0.15704, + "grad_norm": 2.296875, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 4.3784, + "loss/crossentropy": 2.3786104917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24448946118354797, + "step": 7852 + }, + { + "epoch": 0.15708, + "grad_norm": 2.21875, + "grad_norm_var": 0.011693318684895834, + "learning_rate": 0.0001, + "loss": 4.5173, + "loss/crossentropy": 2.304913640022278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24708375334739685, + "step": 7854 + }, + { + "epoch": 0.15712, + "grad_norm": 2.1875, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.5711, + "loss/crossentropy": 1.8640215396881104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824489206075668, + "step": 7856 + }, + { + "epoch": 0.15716, + "grad_norm": 2.0625, + "grad_norm_var": 0.00625, + "learning_rate": 0.0001, + "loss": 4.6174, + "loss/crossentropy": 2.444548487663269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2542492523789406, + "step": 7858 + }, + { + "epoch": 0.1572, + "grad_norm": 2.078125, + "grad_norm_var": 0.00592041015625, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.991346299648285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159758359193802, + "step": 7860 + }, + { + "epoch": 0.15724, + "grad_norm": 2.15625, + "grad_norm_var": 0.005101521809895833, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.3172048926353455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2506742626428604, + "step": 7862 + }, + { + "epoch": 0.15728, + "grad_norm": 2.265625, + "grad_norm_var": 0.0050201416015625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 1.9302632212638855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104542776942253, + "step": 7864 + }, + { + "epoch": 0.15732, + "grad_norm": 2.125, + "grad_norm_var": 0.020726521809895832, + "learning_rate": 0.0001, + "loss": 4.7831, + "loss/crossentropy": 2.457883358001709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538483679294586, + "step": 7866 + }, + { + "epoch": 0.15736, + "grad_norm": 2.125, + "grad_norm_var": 0.019677734375, + "learning_rate": 0.0001, + "loss": 4.1912, + "loss/crossentropy": 1.9857566952705383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131117358803749, + "step": 7868 + }, + { + "epoch": 0.1574, + "grad_norm": 2.203125, + "grad_norm_var": 0.019140625, + "learning_rate": 0.0001, + "loss": 4.4264, + "loss/crossentropy": 2.0062127113342285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22806233912706375, + "step": 7870 + }, + { + "epoch": 0.15744, + "grad_norm": 2.15625, + "grad_norm_var": 0.019489542643229166, + "learning_rate": 0.0001, + "loss": 4.6181, + "loss/crossentropy": 2.283127784729004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23883548378944397, + "step": 7872 + }, + { + "epoch": 0.15748, + "grad_norm": 2.234375, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 4.4442, + "loss/crossentropy": 2.110979437828064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209333136677742, + "step": 7874 + }, + { + "epoch": 0.15752, + "grad_norm": 2.140625, + "grad_norm_var": 0.019856770833333332, + "learning_rate": 0.0001, + "loss": 4.2107, + "loss/crossentropy": 1.8705166578292847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791998505592346, + "step": 7876 + }, + { + "epoch": 0.15756, + "grad_norm": 2.140625, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 4.2956, + "loss/crossentropy": 2.133803129196167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219672590494156, + "step": 7878 + }, + { + "epoch": 0.1576, + "grad_norm": 2.21875, + "grad_norm_var": 0.019627888997395832, + "learning_rate": 0.0001, + "loss": 4.5028, + "loss/crossentropy": 2.1062549352645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266012579202652, + "step": 7880 + }, + { + "epoch": 0.15764, + "grad_norm": 2.28125, + "grad_norm_var": 0.0058553059895833336, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.0088155269622803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079932913184166, + "step": 7882 + }, + { + "epoch": 0.15768, + "grad_norm": 2.234375, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 3.9204, + "loss/crossentropy": 1.6621176600456238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19370710104703903, + "step": 7884 + }, + { + "epoch": 0.15772, + "grad_norm": 2.09375, + "grad_norm_var": 0.007136027018229167, + "learning_rate": 0.0001, + "loss": 4.3462, + "loss/crossentropy": 1.7729167938232422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18562481552362442, + "step": 7886 + }, + { + "epoch": 0.15776, + "grad_norm": 2.109375, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 2.132485508918762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23432063311338425, + "step": 7888 + }, + { + "epoch": 0.1578, + "grad_norm": 2.328125, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3731, + "loss/crossentropy": 2.122144937515259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23042195290327072, + "step": 7890 + }, + { + "epoch": 0.15784, + "grad_norm": 2.8125, + "grad_norm_var": 0.034956868489583334, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 1.855578601360321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20124691724777222, + "step": 7892 + }, + { + "epoch": 0.15788, + "grad_norm": 2.15625, + "grad_norm_var": 0.035054524739583336, + "learning_rate": 0.0001, + "loss": 4.172, + "loss/crossentropy": 2.02128005027771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20766886323690414, + "step": 7894 + }, + { + "epoch": 0.15792, + "grad_norm": 1.984375, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 4.3096, + "loss/crossentropy": 2.112824857234955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22250613570213318, + "step": 7896 + }, + { + "epoch": 0.15796, + "grad_norm": 2.3125, + "grad_norm_var": 0.03662821451822917, + "learning_rate": 0.0001, + "loss": 4.5595, + "loss/crossentropy": 2.2290207147598267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2531846910715103, + "step": 7898 + }, + { + "epoch": 0.158, + "grad_norm": 1.96875, + "grad_norm_var": 0.03882548014322917, + "learning_rate": 0.0001, + "loss": 4.3586, + "loss/crossentropy": 2.135373592376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2266940325498581, + "step": 7900 + }, + { + "epoch": 0.15804, + "grad_norm": 2.125, + "grad_norm_var": 0.04052734375, + "learning_rate": 0.0001, + "loss": 4.1719, + "loss/crossentropy": 1.7298616170883179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986929401755333, + "step": 7902 + }, + { + "epoch": 0.15808, + "grad_norm": 2.3125, + "grad_norm_var": 0.04173075358072917, + "learning_rate": 0.0001, + "loss": 4.403, + "loss/crossentropy": 2.18759286403656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21622422337532043, + "step": 7904 + }, + { + "epoch": 0.15812, + "grad_norm": 2.140625, + "grad_norm_var": 0.03889058430989583, + "learning_rate": 0.0001, + "loss": 4.4765, + "loss/crossentropy": 2.0889216661453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21440055221319199, + "step": 7906 + }, + { + "epoch": 0.15816, + "grad_norm": 2.1875, + "grad_norm_var": 0.010204060872395834, + "learning_rate": 0.0001, + "loss": 4.2231, + "loss/crossentropy": 1.7791658639907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967781037092209, + "step": 7908 + }, + { + "epoch": 0.1582, + "grad_norm": 1.953125, + "grad_norm_var": 0.012386067708333334, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.2144845724105835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152486443519592, + "step": 7910 + }, + { + "epoch": 0.15824, + "grad_norm": 2.078125, + "grad_norm_var": 0.012723795572916667, + "learning_rate": 0.0001, + "loss": 4.6236, + "loss/crossentropy": 2.316117286682129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24549901485443115, + "step": 7912 + }, + { + "epoch": 0.15828, + "grad_norm": 2.046875, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 4.0867, + "loss/crossentropy": 1.8642511367797852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22441796958446503, + "step": 7914 + }, + { + "epoch": 0.15832, + "grad_norm": 2.078125, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 4.1365, + "loss/crossentropy": 1.807969868183136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22138798981904984, + "step": 7916 + }, + { + "epoch": 0.15836, + "grad_norm": 2.390625, + "grad_norm_var": 0.013866170247395834, + "learning_rate": 0.0001, + "loss": 4.5625, + "loss/crossentropy": 2.266697645187378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22544697672128677, + "step": 7918 + }, + { + "epoch": 0.1584, + "grad_norm": 2.203125, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 4.5632, + "loss/crossentropy": 2.0871587991714478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21997228264808655, + "step": 7920 + }, + { + "epoch": 0.15844, + "grad_norm": 2.21875, + "grad_norm_var": 0.0126861572265625, + "learning_rate": 0.0001, + "loss": 4.5567, + "loss/crossentropy": 2.1993759870529175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2416691780090332, + "step": 7922 + }, + { + "epoch": 0.15848, + "grad_norm": 2.5, + "grad_norm_var": 0.020340983072916666, + "learning_rate": 0.0001, + "loss": 4.2997, + "loss/crossentropy": 1.8039653897285461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21721573173999786, + "step": 7924 + }, + { + "epoch": 0.15852, + "grad_norm": 2.3125, + "grad_norm_var": 0.0225982666015625, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.188117265701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231883242726326, + "step": 7926 + }, + { + "epoch": 0.15856, + "grad_norm": 2.171875, + "grad_norm_var": 0.021923828125, + "learning_rate": 0.0001, + "loss": 4.6694, + "loss/crossentropy": 2.3920425176620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24655026197433472, + "step": 7928 + }, + { + "epoch": 0.1586, + "grad_norm": 2.125, + "grad_norm_var": 0.017561848958333334, + "learning_rate": 0.0001, + "loss": 4.1001, + "loss/crossentropy": 2.286831498146057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398208618164062, + "step": 7930 + }, + { + "epoch": 0.15864, + "grad_norm": 2.0625, + "grad_norm_var": 0.016825358072916668, + "learning_rate": 0.0001, + "loss": 4.0007, + "loss/crossentropy": 2.075824797153473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22423933446407318, + "step": 7932 + }, + { + "epoch": 0.15868, + "grad_norm": 2.25, + "grad_norm_var": 0.0155426025390625, + "learning_rate": 0.0001, + "loss": 4.5697, + "loss/crossentropy": 2.197165012359619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22215355187654495, + "step": 7934 + }, + { + "epoch": 0.15872, + "grad_norm": 2.078125, + "grad_norm_var": 0.017854817708333335, + "learning_rate": 0.0001, + "loss": 4.2899, + "loss/crossentropy": 1.8253535032272339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20457974076271057, + "step": 7936 + }, + { + "epoch": 0.15876, + "grad_norm": 2.21875, + "grad_norm_var": 0.01783447265625, + "learning_rate": 0.0001, + "loss": 4.4774, + "loss/crossentropy": 1.842383086681366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20742817968130112, + "step": 7938 + }, + { + "epoch": 0.1588, + "grad_norm": 2.0625, + "grad_norm_var": 0.014208984375, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.2086315155029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086438685655594, + "step": 7940 + }, + { + "epoch": 0.15884, + "grad_norm": 2.109375, + "grad_norm_var": 0.0067047119140625, + "learning_rate": 0.0001, + "loss": 4.3718, + "loss/crossentropy": 2.3381282091140747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2546956539154053, + "step": 7942 + }, + { + "epoch": 0.15888, + "grad_norm": 2.265625, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.3439, + "loss/crossentropy": 2.1159931421279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23917824029922485, + "step": 7944 + }, + { + "epoch": 0.15892, + "grad_norm": 2.078125, + "grad_norm_var": 0.0069081624348958336, + "learning_rate": 0.0001, + "loss": 4.2056, + "loss/crossentropy": 1.7507159113883972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1870383694767952, + "step": 7946 + }, + { + "epoch": 0.15896, + "grad_norm": 2.140625, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.213807225227356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211921215057373, + "step": 7948 + }, + { + "epoch": 0.159, + "grad_norm": 2.03125, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 4.4316, + "loss/crossentropy": 2.0570366978645325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2662791311740875, + "step": 7950 + }, + { + "epoch": 0.15904, + "grad_norm": 2.171875, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 1.8775206208229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22045740485191345, + "step": 7952 + }, + { + "epoch": 0.15908, + "grad_norm": 2.046875, + "grad_norm_var": 0.009601847330729166, + "learning_rate": 0.0001, + "loss": 4.2575, + "loss/crossentropy": 2.3483108282089233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626785188913345, + "step": 7954 + }, + { + "epoch": 0.15912, + "grad_norm": 2.125, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.193, + "loss/crossentropy": 1.9531084895133972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22194421291351318, + "step": 7956 + }, + { + "epoch": 0.15916, + "grad_norm": 2.328125, + "grad_norm_var": 0.011213175455729167, + "learning_rate": 0.0001, + "loss": 4.4819, + "loss/crossentropy": 2.057813823223114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21116723865270615, + "step": 7958 + }, + { + "epoch": 0.1592, + "grad_norm": 2.28125, + "grad_norm_var": 0.01148681640625, + "learning_rate": 0.0001, + "loss": 4.3478, + "loss/crossentropy": 1.9398415088653564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257898524403572, + "step": 7960 + }, + { + "epoch": 0.15924, + "grad_norm": 2.015625, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.1942, + "loss/crossentropy": 1.9200270175933838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119666188955307, + "step": 7962 + }, + { + "epoch": 0.15928, + "grad_norm": 2.203125, + "grad_norm_var": 0.010640462239583334, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.157149076461792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271384835243225, + "step": 7964 + }, + { + "epoch": 0.15932, + "grad_norm": 2.25, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.0308582186698914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23934553563594818, + "step": 7966 + }, + { + "epoch": 0.15936, + "grad_norm": 2.234375, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 4.3977, + "loss/crossentropy": 2.1855711936950684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.236568883061409, + "step": 7968 + }, + { + "epoch": 0.1594, + "grad_norm": 2.375, + "grad_norm_var": 0.07629292805989583, + "learning_rate": 0.0001, + "loss": 4.719, + "loss/crossentropy": 2.4479551315307617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24693088978528976, + "step": 7970 + }, + { + "epoch": 0.15944, + "grad_norm": 2.21875, + "grad_norm_var": 0.07333882649739583, + "learning_rate": 0.0001, + "loss": 4.5221, + "loss/crossentropy": 2.439974784851074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2502682954072952, + "step": 7972 + }, + { + "epoch": 0.15948, + "grad_norm": 2.375, + "grad_norm_var": 0.07625325520833333, + "learning_rate": 0.0001, + "loss": 4.4024, + "loss/crossentropy": 1.9899010062217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23031124472618103, + "step": 7974 + }, + { + "epoch": 0.15952, + "grad_norm": 2.109375, + "grad_norm_var": 0.07517903645833333, + "learning_rate": 0.0001, + "loss": 4.2522, + "loss/crossentropy": 1.830255150794983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984459012746811, + "step": 7976 + }, + { + "epoch": 0.15956, + "grad_norm": 2.21875, + "grad_norm_var": 0.07088114420572916, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 1.9675705432891846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336429566144943, + "step": 7978 + }, + { + "epoch": 0.1596, + "grad_norm": 2.1875, + "grad_norm_var": 0.06965738932291667, + "learning_rate": 0.0001, + "loss": 4.4784, + "loss/crossentropy": 2.030815005302429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102503478527069, + "step": 7980 + }, + { + "epoch": 0.15964, + "grad_norm": 2.109375, + "grad_norm_var": 0.07285054524739583, + "learning_rate": 0.0001, + "loss": 4.492, + "loss/crossentropy": 2.4932440519332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246867336332798, + "step": 7982 + }, + { + "epoch": 0.15968, + "grad_norm": 2.078125, + "grad_norm_var": 0.07330322265625, + "learning_rate": 0.0001, + "loss": 4.2085, + "loss/crossentropy": 1.5839802622795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17231802642345428, + "step": 7984 + }, + { + "epoch": 0.15972, + "grad_norm": 2.359375, + "grad_norm_var": 0.0091217041015625, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 1.821477472782135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239791452884674, + "step": 7986 + }, + { + "epoch": 0.15976, + "grad_norm": 2.109375, + "grad_norm_var": 0.0093414306640625, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 2.013838052749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335103377699852, + "step": 7988 + }, + { + "epoch": 0.1598, + "grad_norm": 2.03125, + "grad_norm_var": 0.006376139322916667, + "learning_rate": 0.0001, + "loss": 4.4989, + "loss/crossentropy": 1.9412779211997986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20336110144853592, + "step": 7990 + }, + { + "epoch": 0.15984, + "grad_norm": 2.0, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 1.6653677225112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025909200310707, + "step": 7992 + }, + { + "epoch": 0.15988, + "grad_norm": 2.125, + "grad_norm_var": 0.00865478515625, + "learning_rate": 0.0001, + "loss": 4.4299, + "loss/crossentropy": 2.0069726705551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21300900727510452, + "step": 7994 + }, + { + "epoch": 0.15992, + "grad_norm": 2.046875, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 4.4395, + "loss/crossentropy": 2.1118472814559937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25853876769542694, + "step": 7996 + }, + { + "epoch": 0.15996, + "grad_norm": 2.34375, + "grad_norm_var": 0.013622029622395834, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 1.608262836933136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958206593990326, + "step": 7998 + }, + { + "epoch": 0.16, + "grad_norm": 2.203125, + "grad_norm_var": 0.0138580322265625, + "learning_rate": 0.0001, + "loss": 4.2896, + "loss/crossentropy": 1.6572073101997375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102878868579865, + "step": 8000 + }, + { + "epoch": 0.16004, + "grad_norm": 2.015625, + "grad_norm_var": 0.0113677978515625, + "learning_rate": 0.0001, + "loss": 4.0811, + "loss/crossentropy": 1.9421688318252563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628942012786865, + "step": 8002 + }, + { + "epoch": 0.16008, + "grad_norm": 2.265625, + "grad_norm_var": 0.0134918212890625, + "learning_rate": 0.0001, + "loss": 4.5344, + "loss/crossentropy": 2.2197489738464355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.248014435172081, + "step": 8004 + }, + { + "epoch": 0.16012, + "grad_norm": 2.203125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.200868308544159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2476908192038536, + "step": 8006 + }, + { + "epoch": 0.16016, + "grad_norm": 2.34375, + "grad_norm_var": 0.01162109375, + "learning_rate": 0.0001, + "loss": 4.8151, + "loss/crossentropy": 2.3793649673461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356039509177208, + "step": 8008 + }, + { + "epoch": 0.1602, + "grad_norm": 2.140625, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 4.1849, + "loss/crossentropy": 2.130257308483124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224358968436718, + "step": 8010 + }, + { + "epoch": 0.16024, + "grad_norm": 2.15625, + "grad_norm_var": 0.009566243489583333, + "learning_rate": 0.0001, + "loss": 4.5948, + "loss/crossentropy": 2.370365023612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23811528086662292, + "step": 8012 + }, + { + "epoch": 0.16028, + "grad_norm": 2.0, + "grad_norm_var": 0.008812459309895833, + "learning_rate": 0.0001, + "loss": 4.4326, + "loss/crossentropy": 1.985486626625061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19639353454113007, + "step": 8014 + }, + { + "epoch": 0.16032, + "grad_norm": 2.40625, + "grad_norm_var": 0.01129150390625, + "learning_rate": 0.0001, + "loss": 4.4335, + "loss/crossentropy": 2.2128632068634033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2340361252427101, + "step": 8016 + }, + { + "epoch": 0.16036, + "grad_norm": 2.0625, + "grad_norm_var": 0.009956868489583333, + "learning_rate": 0.0001, + "loss": 4.3688, + "loss/crossentropy": 1.830498456954956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19760622829198837, + "step": 8018 + }, + { + "epoch": 0.1604, + "grad_norm": 4.375, + "grad_norm_var": 0.3061757405598958, + "learning_rate": 0.0001, + "loss": 4.6443, + "loss/crossentropy": 1.9595977067947388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138657420873642, + "step": 8020 + }, + { + "epoch": 0.16044, + "grad_norm": 2.1875, + "grad_norm_var": 0.30684305826822916, + "learning_rate": 0.0001, + "loss": 4.3101, + "loss/crossentropy": 2.400893449783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2615668326616287, + "step": 8022 + }, + { + "epoch": 0.16048, + "grad_norm": 2.140625, + "grad_norm_var": 0.3080963134765625, + "learning_rate": 0.0001, + "loss": 4.6359, + "loss/crossentropy": 2.5079843997955322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251235693693161, + "step": 8024 + }, + { + "epoch": 0.16052, + "grad_norm": 2.265625, + "grad_norm_var": 0.30686848958333335, + "learning_rate": 0.0001, + "loss": 4.4973, + "loss/crossentropy": 2.573891043663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2646481841802597, + "step": 8026 + }, + { + "epoch": 0.16056, + "grad_norm": 2.1875, + "grad_norm_var": 0.3064605712890625, + "learning_rate": 0.0001, + "loss": 4.4177, + "loss/crossentropy": 2.0088363885879517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22888045758008957, + "step": 8028 + }, + { + "epoch": 0.1606, + "grad_norm": 2.109375, + "grad_norm_var": 0.31115697224934896, + "learning_rate": 0.0001, + "loss": 4.214, + "loss/crossentropy": 2.4596647024154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2507014721632004, + "step": 8030 + }, + { + "epoch": 0.16064, + "grad_norm": 2.1875, + "grad_norm_var": 0.3093462626139323, + "learning_rate": 0.0001, + "loss": 4.6826, + "loss/crossentropy": 2.204255223274231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22913406044244766, + "step": 8032 + }, + { + "epoch": 0.16068, + "grad_norm": 2.125, + "grad_norm_var": 0.30677261352539065, + "learning_rate": 0.0001, + "loss": 4.6603, + "loss/crossentropy": 2.285408139228821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23185817897319794, + "step": 8034 + }, + { + "epoch": 0.16072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012520345052083333, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 2.0788660645484924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20811481028795242, + "step": 8036 + }, + { + "epoch": 0.16076, + "grad_norm": 2.140625, + "grad_norm_var": 0.01307373046875, + "learning_rate": 0.0001, + "loss": 4.5368, + "loss/crossentropy": 2.398258686065674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2411409318447113, + "step": 8038 + }, + { + "epoch": 0.1608, + "grad_norm": 2.140625, + "grad_norm_var": 0.015579986572265624, + "learning_rate": 0.0001, + "loss": 4.0415, + "loss/crossentropy": 2.3101454973220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22493668645620346, + "step": 8040 + }, + { + "epoch": 0.16084, + "grad_norm": 2.203125, + "grad_norm_var": 0.015134429931640625, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 1.9015105962753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22360052913427353, + "step": 8042 + }, + { + "epoch": 0.16088, + "grad_norm": 2.34375, + "grad_norm_var": 0.017704010009765625, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 2.0112481117248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24062782526016235, + "step": 8044 + }, + { + "epoch": 0.16092, + "grad_norm": 2.171875, + "grad_norm_var": 0.0161773681640625, + "learning_rate": 0.0001, + "loss": 4.3638, + "loss/crossentropy": 2.2024285793304443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959003806114197, + "step": 8046 + }, + { + "epoch": 0.16096, + "grad_norm": 2.125, + "grad_norm_var": 0.014412434895833333, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 2.1069058775901794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21980682760477066, + "step": 8048 + }, + { + "epoch": 0.161, + "grad_norm": 2.3125, + "grad_norm_var": 0.012540690104166667, + "learning_rate": 0.0001, + "loss": 4.2106, + "loss/crossentropy": 1.8380340337753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201671525835991, + "step": 8050 + }, + { + "epoch": 0.16104, + "grad_norm": 2.046875, + "grad_norm_var": 0.010965728759765625, + "learning_rate": 0.0001, + "loss": 4.3778, + "loss/crossentropy": 2.276741087436676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23787930607795715, + "step": 8052 + }, + { + "epoch": 0.16108, + "grad_norm": 2.140625, + "grad_norm_var": 0.010680898030598959, + "learning_rate": 0.0001, + "loss": 4.0784, + "loss/crossentropy": 1.631809651851654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19654212146997452, + "step": 8054 + }, + { + "epoch": 0.16112, + "grad_norm": 2.234375, + "grad_norm_var": 0.007453409830729166, + "learning_rate": 0.0001, + "loss": 4.5927, + "loss/crossentropy": 2.067444145679474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420229971408844, + "step": 8056 + }, + { + "epoch": 0.16116, + "grad_norm": 2.21875, + "grad_norm_var": 0.011767578125, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.384890556335449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312106341123581, + "step": 8058 + }, + { + "epoch": 0.1612, + "grad_norm": 2.265625, + "grad_norm_var": 0.010986328125, + "learning_rate": 0.0001, + "loss": 4.363, + "loss/crossentropy": 2.3683160543441772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2527216002345085, + "step": 8060 + }, + { + "epoch": 0.16124, + "grad_norm": 2.171875, + "grad_norm_var": 0.011253865559895833, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.1545952558517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23928315192461014, + "step": 8062 + }, + { + "epoch": 0.16128, + "grad_norm": 1.96875, + "grad_norm_var": 0.013695271809895833, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 2.0792208313941956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2226722463965416, + "step": 8064 + }, + { + "epoch": 0.16132, + "grad_norm": 2.09375, + "grad_norm_var": 0.013068644205729167, + "learning_rate": 0.0001, + "loss": 3.8788, + "loss/crossentropy": 2.181519627571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193506434559822, + "step": 8066 + }, + { + "epoch": 0.16136, + "grad_norm": 2.375, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 4.4445, + "loss/crossentropy": 1.7798657417297363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23413337767124176, + "step": 8068 + }, + { + "epoch": 0.1614, + "grad_norm": 2.125, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.0731, + "loss/crossentropy": 2.1232666969299316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23351240158081055, + "step": 8070 + }, + { + "epoch": 0.16144, + "grad_norm": 2.203125, + "grad_norm_var": 0.017609659830729166, + "learning_rate": 0.0001, + "loss": 4.6975, + "loss/crossentropy": 2.34002685546875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2533388137817383, + "step": 8072 + }, + { + "epoch": 0.16148, + "grad_norm": 2.21875, + "grad_norm_var": 0.011637369791666666, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 2.066399872303009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21156150847673416, + "step": 8074 + }, + { + "epoch": 0.16152, + "grad_norm": 2.171875, + "grad_norm_var": 0.010677083333333334, + "learning_rate": 0.0001, + "loss": 4.3646, + "loss/crossentropy": 2.2298463582992554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2307809516787529, + "step": 8076 + }, + { + "epoch": 0.16156, + "grad_norm": 2.0625, + "grad_norm_var": 0.0105865478515625, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 1.9858508110046387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499747782945633, + "step": 8078 + }, + { + "epoch": 0.1616, + "grad_norm": 2.0625, + "grad_norm_var": 0.011149088541666666, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.5669215321540833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889330968260765, + "step": 8080 + }, + { + "epoch": 0.16164, + "grad_norm": 2.25, + "grad_norm_var": 0.011881510416666666, + "learning_rate": 0.0001, + "loss": 4.4743, + "loss/crossentropy": 2.296878218650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405889928340912, + "step": 8082 + }, + { + "epoch": 0.16168, + "grad_norm": 2.203125, + "grad_norm_var": 0.007307942708333333, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 1.954626441001892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809379428625107, + "step": 8084 + }, + { + "epoch": 0.16172, + "grad_norm": 2.3125, + "grad_norm_var": 0.008687337239583334, + "learning_rate": 0.0001, + "loss": 4.5923, + "loss/crossentropy": 1.8640353083610535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226090669631958, + "step": 8086 + }, + { + "epoch": 0.16176, + "grad_norm": 2.28125, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 4.3363, + "loss/crossentropy": 2.1914591789245605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24432511627674103, + "step": 8088 + }, + { + "epoch": 0.1618, + "grad_norm": 2.21875, + "grad_norm_var": 0.009798177083333333, + "learning_rate": 0.0001, + "loss": 4.7291, + "loss/crossentropy": 2.192594051361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23682628571987152, + "step": 8090 + }, + { + "epoch": 0.16184, + "grad_norm": 2.09375, + "grad_norm_var": 0.010993448893229167, + "learning_rate": 0.0001, + "loss": 4.4592, + "loss/crossentropy": 2.210235595703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230379119515419, + "step": 8092 + }, + { + "epoch": 0.16188, + "grad_norm": 2.140625, + "grad_norm_var": 0.011115519205729167, + "learning_rate": 0.0001, + "loss": 4.4111, + "loss/crossentropy": 2.214667320251465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26643867790699005, + "step": 8094 + }, + { + "epoch": 0.16192, + "grad_norm": 2.09375, + "grad_norm_var": 0.009586588541666666, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 1.9808599948883057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679324120283127, + "step": 8096 + }, + { + "epoch": 0.16196, + "grad_norm": 2.109375, + "grad_norm_var": 0.008333333333333333, + "learning_rate": 0.0001, + "loss": 4.4683, + "loss/crossentropy": 2.076589345932007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23430196940898895, + "step": 8098 + }, + { + "epoch": 0.162, + "grad_norm": 2.15625, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 4.5645, + "loss/crossentropy": 2.364492177963257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23023054748773575, + "step": 8100 + }, + { + "epoch": 0.16204, + "grad_norm": 2.140625, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.6595, + "loss/crossentropy": 2.2908111214637756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239468455314636, + "step": 8102 + }, + { + "epoch": 0.16208, + "grad_norm": 2.40625, + "grad_norm_var": 0.008958943684895833, + "learning_rate": 0.0001, + "loss": 4.5503, + "loss/crossentropy": 1.8306183218955994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21511316299438477, + "step": 8104 + }, + { + "epoch": 0.16212, + "grad_norm": 2.0625, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.1582, + "loss/crossentropy": 2.1121758222579956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23471853882074356, + "step": 8106 + }, + { + "epoch": 0.16216, + "grad_norm": 2.140625, + "grad_norm_var": 0.012355295817057292, + "learning_rate": 0.0001, + "loss": 4.1464, + "loss/crossentropy": 1.7613067030906677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025228664278984, + "step": 8108 + }, + { + "epoch": 0.1622, + "grad_norm": 2.0, + "grad_norm_var": 0.013844553629557292, + "learning_rate": 0.0001, + "loss": 4.0797, + "loss/crossentropy": 2.1413058042526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817607432603836, + "step": 8110 + }, + { + "epoch": 0.16224, + "grad_norm": 2.265625, + "grad_norm_var": 0.013641103108723959, + "learning_rate": 0.0001, + "loss": 4.5739, + "loss/crossentropy": 2.375948429107666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30982857942581177, + "step": 8112 + }, + { + "epoch": 0.16228, + "grad_norm": 2.1875, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 0.0001, + "loss": 4.076, + "loss/crossentropy": 1.9669193029403687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21873307973146439, + "step": 8114 + }, + { + "epoch": 0.16232, + "grad_norm": 2.21875, + "grad_norm_var": 0.015592193603515625, + "learning_rate": 0.0001, + "loss": 4.4024, + "loss/crossentropy": 2.2028547525405884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229970782995224, + "step": 8116 + }, + { + "epoch": 0.16236, + "grad_norm": 2.078125, + "grad_norm_var": 0.01622289021809896, + "learning_rate": 0.0001, + "loss": 4.3359, + "loss/crossentropy": 2.082156002521515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262335941195488, + "step": 8118 + }, + { + "epoch": 0.1624, + "grad_norm": 2.078125, + "grad_norm_var": 0.012379709879557292, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.0538666248321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22941745817661285, + "step": 8120 + }, + { + "epoch": 0.16244, + "grad_norm": 2.4375, + "grad_norm_var": 0.015750885009765625, + "learning_rate": 0.0001, + "loss": 4.3615, + "loss/crossentropy": 2.338989734649658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2434876710176468, + "step": 8122 + }, + { + "epoch": 0.16248, + "grad_norm": 2.375, + "grad_norm_var": 2.602311197916667, + "learning_rate": 0.0001, + "loss": 4.5472, + "loss/crossentropy": 2.277916193008423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29285988211631775, + "step": 8124 + }, + { + "epoch": 0.16252, + "grad_norm": 2.46875, + "grad_norm_var": 2.567577107747396, + "learning_rate": 0.0001, + "loss": 4.3437, + "loss/crossentropy": 2.1196334958076477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728030800819397, + "step": 8126 + }, + { + "epoch": 0.16256, + "grad_norm": 2.15625, + "grad_norm_var": 2.5655558268229166, + "learning_rate": 0.0001, + "loss": 4.4216, + "loss/crossentropy": 2.0216450095176697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229092076420784, + "step": 8128 + }, + { + "epoch": 0.1626, + "grad_norm": 2.171875, + "grad_norm_var": 2.5546834309895834, + "learning_rate": 0.0001, + "loss": 4.5163, + "loss/crossentropy": 2.098921537399292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22539281845092773, + "step": 8130 + }, + { + "epoch": 0.16264, + "grad_norm": 2.03125, + "grad_norm_var": 2.580052693684896, + "learning_rate": 0.0001, + "loss": 4.1066, + "loss/crossentropy": 2.0186068415641785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286529511213303, + "step": 8132 + }, + { + "epoch": 0.16268, + "grad_norm": 2.28125, + "grad_norm_var": 2.5584706624348956, + "learning_rate": 0.0001, + "loss": 4.7847, + "loss/crossentropy": 2.157357335090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23342902958393097, + "step": 8134 + }, + { + "epoch": 0.16272, + "grad_norm": 2.140625, + "grad_norm_var": 2.5516998291015627, + "learning_rate": 0.0001, + "loss": 4.3489, + "loss/crossentropy": 2.1885476112365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314801812171936, + "step": 8136 + }, + { + "epoch": 0.16276, + "grad_norm": 2.21875, + "grad_norm_var": 2.5796160380045574, + "learning_rate": 0.0001, + "loss": 4.3403, + "loss/crossentropy": 2.2208757400512695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295266091823578, + "step": 8138 + }, + { + "epoch": 0.1628, + "grad_norm": 2.015625, + "grad_norm_var": 0.027522532145182292, + "learning_rate": 0.0001, + "loss": 4.0473, + "loss/crossentropy": 1.7885233163833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008970081806183, + "step": 8140 + }, + { + "epoch": 0.16284, + "grad_norm": 2.078125, + "grad_norm_var": 0.022989654541015626, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 2.152435064315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2311881259083748, + "step": 8142 + }, + { + "epoch": 0.16288, + "grad_norm": 2.296875, + "grad_norm_var": 0.016001129150390626, + "learning_rate": 0.0001, + "loss": 4.3384, + "loss/crossentropy": 2.0818406343460083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24117180705070496, + "step": 8144 + }, + { + "epoch": 0.16292, + "grad_norm": 2.140625, + "grad_norm_var": 0.02513402303059896, + "learning_rate": 0.0001, + "loss": 4.4246, + "loss/crossentropy": 2.1775856614112854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23246955126523972, + "step": 8146 + }, + { + "epoch": 0.16296, + "grad_norm": 2.578125, + "grad_norm_var": 0.033607737223307295, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 1.7029682397842407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19564303010702133, + "step": 8148 + }, + { + "epoch": 0.163, + "grad_norm": 2.453125, + "grad_norm_var": 0.03468195597330729, + "learning_rate": 0.0001, + "loss": 4.5631, + "loss/crossentropy": 2.070194900035858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2382609099149704, + "step": 8150 + }, + { + "epoch": 0.16304, + "grad_norm": 2.046875, + "grad_norm_var": 0.036043039957682294, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 1.9214876890182495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894134789705276, + "step": 8152 + }, + { + "epoch": 0.16308, + "grad_norm": 2.21875, + "grad_norm_var": 0.03243815104166667, + "learning_rate": 0.0001, + "loss": 4.3622, + "loss/crossentropy": 1.7311474084854126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962232068181038, + "step": 8154 + }, + { + "epoch": 0.16312, + "grad_norm": 2.0, + "grad_norm_var": 0.0323150634765625, + "learning_rate": 0.0001, + "loss": 4.4623, + "loss/crossentropy": 2.2161877155303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22318138182163239, + "step": 8156 + }, + { + "epoch": 0.16316, + "grad_norm": 2.078125, + "grad_norm_var": 0.03186747233072917, + "learning_rate": 0.0001, + "loss": 4.7343, + "loss/crossentropy": 2.21865177154541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.234280064702034, + "step": 8158 + }, + { + "epoch": 0.1632, + "grad_norm": 2.09375, + "grad_norm_var": 0.032835896809895834, + "learning_rate": 0.0001, + "loss": 4.3277, + "loss/crossentropy": 1.9517142176628113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964063122868538, + "step": 8160 + }, + { + "epoch": 0.16324, + "grad_norm": 2.25, + "grad_norm_var": 0.023713175455729166, + "learning_rate": 0.0001, + "loss": 4.3878, + "loss/crossentropy": 2.261234760284424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24549759924411774, + "step": 8162 + }, + { + "epoch": 0.16328, + "grad_norm": 2.109375, + "grad_norm_var": 0.012751261393229166, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 1.9791623950004578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018037736415863, + "step": 8164 + }, + { + "epoch": 0.16332, + "grad_norm": 2.1875, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.6231, + "loss/crossentropy": 2.3916029930114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24446460604667664, + "step": 8166 + }, + { + "epoch": 0.16336, + "grad_norm": 2.328125, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.3251, + "loss/crossentropy": 2.204437553882599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22334939241409302, + "step": 8168 + }, + { + "epoch": 0.1634, + "grad_norm": 2.0625, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.3236, + "loss/crossentropy": 2.2013272047042847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068365067243576, + "step": 8170 + }, + { + "epoch": 0.16344, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 1.8648701310157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19690127670764923, + "step": 8172 + }, + { + "epoch": 0.16348, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008857981363932291, + "learning_rate": 0.0001, + "loss": 4.1324, + "loss/crossentropy": 2.221195936203003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900760903954506, + "step": 8174 + }, + { + "epoch": 0.16352, + "grad_norm": 2.203125, + "grad_norm_var": 0.009445953369140624, + "learning_rate": 0.0001, + "loss": 4.4874, + "loss/crossentropy": 1.8648499846458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982845515012741, + "step": 8176 + }, + { + "epoch": 0.16356, + "grad_norm": 2.125, + "grad_norm_var": 0.010762278238932292, + "learning_rate": 0.0001, + "loss": 4.588, + "loss/crossentropy": 2.3032894134521484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25082532316446304, + "step": 8178 + }, + { + "epoch": 0.1636, + "grad_norm": 2.203125, + "grad_norm_var": 0.010931142171223958, + "learning_rate": 0.0001, + "loss": 4.3375, + "loss/crossentropy": 2.0634626150131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546822369098663, + "step": 8180 + }, + { + "epoch": 0.16364, + "grad_norm": 2.28125, + "grad_norm_var": 0.012737782796223958, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 1.7406468391418457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20627902448177338, + "step": 8182 + }, + { + "epoch": 0.16368, + "grad_norm": 2.125, + "grad_norm_var": 0.012668609619140625, + "learning_rate": 0.0001, + "loss": 4.4103, + "loss/crossentropy": 2.3812272548675537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25258868932724, + "step": 8184 + }, + { + "epoch": 0.16372, + "grad_norm": 2.1875, + "grad_norm_var": 0.012499745686848958, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 2.20754611492157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21015550196170807, + "step": 8186 + }, + { + "epoch": 0.16376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0140777587890625, + "learning_rate": 0.0001, + "loss": 4.1902, + "loss/crossentropy": 2.081672966480255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19573034346103668, + "step": 8188 + }, + { + "epoch": 0.1638, + "grad_norm": 2.140625, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.5267, + "loss/crossentropy": 2.184974491596222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2477174997329712, + "step": 8190 + }, + { + "epoch": 0.16384, + "grad_norm": 2.125, + "grad_norm_var": 0.010322825113932291, + "learning_rate": 0.0001, + "loss": 4.3804, + "loss/crossentropy": 2.1048192977905273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22900478541851044, + "step": 8192 + }, + { + "epoch": 0.16388, + "grad_norm": 2.046875, + "grad_norm_var": 0.010135650634765625, + "learning_rate": 0.0001, + "loss": 4.1068, + "loss/crossentropy": 1.960956335067749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23504969477653503, + "step": 8194 + }, + { + "epoch": 0.16392, + "grad_norm": 2.484375, + "grad_norm_var": 0.017114003499348957, + "learning_rate": 0.0001, + "loss": 4.7698, + "loss/crossentropy": 2.158856213092804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200375646352768, + "step": 8196 + }, + { + "epoch": 0.16396, + "grad_norm": 2.015625, + "grad_norm_var": 0.017286936442057293, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 2.0954058170318604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729048877954483, + "step": 8198 + }, + { + "epoch": 0.164, + "grad_norm": 2.078125, + "grad_norm_var": 0.01599299112955729, + "learning_rate": 0.0001, + "loss": 4.3492, + "loss/crossentropy": 2.1452964544296265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21412881463766098, + "step": 8200 + }, + { + "epoch": 0.16404, + "grad_norm": 1.8359375, + "grad_norm_var": 0.02072321573893229, + "learning_rate": 0.0001, + "loss": 4.0204, + "loss/crossentropy": 1.9737866520881653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19449186325073242, + "step": 8202 + }, + { + "epoch": 0.16408, + "grad_norm": 2.109375, + "grad_norm_var": 0.020344034830729166, + "learning_rate": 0.0001, + "loss": 4.5119, + "loss/crossentropy": 2.213072657585144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333887815475464, + "step": 8204 + }, + { + "epoch": 0.16412, + "grad_norm": 2.09375, + "grad_norm_var": 0.0203277587890625, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 1.8702161312103271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368274092674255, + "step": 8206 + }, + { + "epoch": 0.16416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02102635701497396, + "learning_rate": 0.0001, + "loss": 4.242, + "loss/crossentropy": 2.177275776863098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248809248209, + "step": 8208 + }, + { + "epoch": 0.1642, + "grad_norm": 1.9375, + "grad_norm_var": 0.021945953369140625, + "learning_rate": 0.0001, + "loss": 4.3151, + "loss/crossentropy": 2.3422038555145264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288341298699379, + "step": 8210 + }, + { + "epoch": 0.16424, + "grad_norm": 2.265625, + "grad_norm_var": 0.014212799072265626, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.094432234764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224871665239334, + "step": 8212 + }, + { + "epoch": 0.16428, + "grad_norm": 2.125, + "grad_norm_var": 0.020072174072265626, + "learning_rate": 0.0001, + "loss": 4.5045, + "loss/crossentropy": 2.3371682167053223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2710718363523483, + "step": 8214 + }, + { + "epoch": 0.16432, + "grad_norm": 2.25, + "grad_norm_var": 0.034242502848307294, + "learning_rate": 0.0001, + "loss": 4.6926, + "loss/crossentropy": 1.8700988292694092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795634388923645, + "step": 8216 + }, + { + "epoch": 0.16436, + "grad_norm": 2.046875, + "grad_norm_var": 0.02575658162434896, + "learning_rate": 0.0001, + "loss": 4.3266, + "loss/crossentropy": 2.080985188484192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23497651517391205, + "step": 8218 + }, + { + "epoch": 0.1644, + "grad_norm": 2.046875, + "grad_norm_var": 0.026364898681640624, + "learning_rate": 0.0001, + "loss": 4.3873, + "loss/crossentropy": 2.3486984968185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25393396615982056, + "step": 8220 + }, + { + "epoch": 0.16444, + "grad_norm": 2.15625, + "grad_norm_var": 0.028148396809895834, + "learning_rate": 0.0001, + "loss": 4.3446, + "loss/crossentropy": 2.0084245800971985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21738936007022858, + "step": 8222 + }, + { + "epoch": 0.16448, + "grad_norm": 2.03125, + "grad_norm_var": 0.02846247355143229, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 1.536482572555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17787320166826248, + "step": 8224 + }, + { + "epoch": 0.16452, + "grad_norm": 2.1875, + "grad_norm_var": 0.024857330322265624, + "learning_rate": 0.0001, + "loss": 4.494, + "loss/crossentropy": 2.1727080941200256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22967493534088135, + "step": 8226 + }, + { + "epoch": 0.16456, + "grad_norm": 2.015625, + "grad_norm_var": 0.02490208943684896, + "learning_rate": 0.0001, + "loss": 4.1895, + "loss/crossentropy": 1.6915069222450256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19113170355558395, + "step": 8228 + }, + { + "epoch": 0.1646, + "grad_norm": 2.046875, + "grad_norm_var": 0.022739410400390625, + "learning_rate": 0.0001, + "loss": 4.4378, + "loss/crossentropy": 2.1645957231521606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431444585323334, + "step": 8230 + }, + { + "epoch": 0.16464, + "grad_norm": 2.09375, + "grad_norm_var": 0.00858154296875, + "learning_rate": 0.0001, + "loss": 3.8088, + "loss/crossentropy": 1.8797736763954163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344569742679596, + "step": 8232 + }, + { + "epoch": 0.16468, + "grad_norm": 2.1875, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 4.3119, + "loss/crossentropy": 2.092893421649933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476523578166962, + "step": 8234 + }, + { + "epoch": 0.16472, + "grad_norm": 2.125, + "grad_norm_var": 0.0143310546875, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.336071252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25474119186401367, + "step": 8236 + }, + { + "epoch": 0.16476, + "grad_norm": 2.03125, + "grad_norm_var": 0.014410146077473958, + "learning_rate": 0.0001, + "loss": 4.3054, + "loss/crossentropy": 2.008872926235199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22672003507614136, + "step": 8238 + }, + { + "epoch": 0.1648, + "grad_norm": 2.171875, + "grad_norm_var": 0.011822255452473958, + "learning_rate": 0.0001, + "loss": 4.4997, + "loss/crossentropy": 2.2229605317115784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392737865447998, + "step": 8240 + }, + { + "epoch": 0.16484, + "grad_norm": 2.171875, + "grad_norm_var": 0.011525217692057292, + "learning_rate": 0.0001, + "loss": 4.5894, + "loss/crossentropy": 2.1929808855056763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832375764846802, + "step": 8242 + }, + { + "epoch": 0.16488, + "grad_norm": 2.171875, + "grad_norm_var": 0.010625966389973958, + "learning_rate": 0.0001, + "loss": 4.642, + "loss/crossentropy": 2.1557860374450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24640469253063202, + "step": 8244 + }, + { + "epoch": 0.16492, + "grad_norm": 2.109375, + "grad_norm_var": 0.009582265218098959, + "learning_rate": 0.0001, + "loss": 4.2508, + "loss/crossentropy": 2.2462236881256104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23639583587646484, + "step": 8246 + }, + { + "epoch": 0.16496, + "grad_norm": 2.03125, + "grad_norm_var": 0.009501139322916666, + "learning_rate": 0.0001, + "loss": 4.1476, + "loss/crossentropy": 2.1238350868225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066531121730804, + "step": 8248 + }, + { + "epoch": 0.165, + "grad_norm": 2.078125, + "grad_norm_var": 0.0049479166666666664, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 1.6866248846054077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20677632093429565, + "step": 8250 + }, + { + "epoch": 0.16504, + "grad_norm": 2.03125, + "grad_norm_var": 0.005338541666666667, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 2.0640709400177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148914858698845, + "step": 8252 + }, + { + "epoch": 0.16508, + "grad_norm": 2.234375, + "grad_norm_var": 0.00513916015625, + "learning_rate": 0.0001, + "loss": 4.4138, + "loss/crossentropy": 2.003119468688965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817681193351746, + "step": 8254 + }, + { + "epoch": 0.16512, + "grad_norm": 2.328125, + "grad_norm_var": 0.007763671875, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.053581953048706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22257455438375473, + "step": 8256 + }, + { + "epoch": 0.16516, + "grad_norm": 2.046875, + "grad_norm_var": 0.008381144205729166, + "learning_rate": 0.0001, + "loss": 4.1314, + "loss/crossentropy": 1.788454830646515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21449437737464905, + "step": 8258 + }, + { + "epoch": 0.1652, + "grad_norm": 2.265625, + "grad_norm_var": 0.009032185872395833, + "learning_rate": 0.0001, + "loss": 4.3986, + "loss/crossentropy": 1.8791787028312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877290904521942, + "step": 8260 + }, + { + "epoch": 0.16524, + "grad_norm": 2.140625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 4.4326, + "loss/crossentropy": 2.2346811294555664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221164733171463, + "step": 8262 + }, + { + "epoch": 0.16528, + "grad_norm": 2.03125, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.3296, + "loss/crossentropy": 2.1032413244247437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21829386800527573, + "step": 8264 + }, + { + "epoch": 0.16532, + "grad_norm": 2.09375, + "grad_norm_var": 0.008690388997395833, + "learning_rate": 0.0001, + "loss": 4.3188, + "loss/crossentropy": 2.1452749967575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24733971804380417, + "step": 8266 + }, + { + "epoch": 0.16536, + "grad_norm": 2.171875, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 1.8318313956260681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22139593213796616, + "step": 8268 + }, + { + "epoch": 0.1654, + "grad_norm": 2.09375, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 1.9865980744361877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22354473173618317, + "step": 8270 + }, + { + "epoch": 0.16544, + "grad_norm": 2.21875, + "grad_norm_var": 0.005464680989583333, + "learning_rate": 0.0001, + "loss": 4.318, + "loss/crossentropy": 2.140509843826294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832269966602325, + "step": 8272 + }, + { + "epoch": 0.16548, + "grad_norm": 2.15625, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 4.1632, + "loss/crossentropy": 2.3763319849967957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22866757214069366, + "step": 8274 + }, + { + "epoch": 0.16552, + "grad_norm": 2.03125, + "grad_norm_var": 0.004198201497395833, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 1.785762071609497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19961480796337128, + "step": 8276 + }, + { + "epoch": 0.16556, + "grad_norm": 7.34375, + "grad_norm_var": 1.7181477864583334, + "learning_rate": 0.0001, + "loss": 4.5317, + "loss/crossentropy": 2.0500977635383606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2972872704267502, + "step": 8278 + }, + { + "epoch": 0.1656, + "grad_norm": 2.265625, + "grad_norm_var": 1.6987589518229167, + "learning_rate": 0.0001, + "loss": 4.4247, + "loss/crossentropy": 2.3665153980255127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24115745723247528, + "step": 8280 + }, + { + "epoch": 0.16564, + "grad_norm": 2.25, + "grad_norm_var": 1.6973592122395833, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.9636226892471313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048879787325859, + "step": 8282 + }, + { + "epoch": 0.16568, + "grad_norm": 2.171875, + "grad_norm_var": 1.6948720296223958, + "learning_rate": 0.0001, + "loss": 4.4303, + "loss/crossentropy": 2.2624993324279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22186043858528137, + "step": 8284 + }, + { + "epoch": 0.16572, + "grad_norm": 2.046875, + "grad_norm_var": 1.7048886617024739, + "learning_rate": 0.0001, + "loss": 4.0816, + "loss/crossentropy": 2.098397970199585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219336099922657, + "step": 8286 + }, + { + "epoch": 0.16576, + "grad_norm": 2.140625, + "grad_norm_var": 1.6997393290201823, + "learning_rate": 0.0001, + "loss": 4.4705, + "loss/crossentropy": 1.9795190691947937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145320549607277, + "step": 8288 + }, + { + "epoch": 0.1658, + "grad_norm": 2.203125, + "grad_norm_var": 1.6936927795410157, + "learning_rate": 0.0001, + "loss": 4.4625, + "loss/crossentropy": 2.1028788089752197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22656689584255219, + "step": 8290 + }, + { + "epoch": 0.16584, + "grad_norm": 1.984375, + "grad_norm_var": 1.7015398661295573, + "learning_rate": 0.0001, + "loss": 4.1901, + "loss/crossentropy": 2.2936136722564697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23688847571611404, + "step": 8292 + }, + { + "epoch": 0.16588, + "grad_norm": 2.234375, + "grad_norm_var": 0.01862360636393229, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.870754897594452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20914533734321594, + "step": 8294 + }, + { + "epoch": 0.16592, + "grad_norm": 2.109375, + "grad_norm_var": 0.007999674479166666, + "learning_rate": 0.0001, + "loss": 3.9884, + "loss/crossentropy": 2.034530520439148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22234197705984116, + "step": 8296 + }, + { + "epoch": 0.16596, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073811848958333336, + "learning_rate": 0.0001, + "loss": 4.3674, + "loss/crossentropy": 1.974400520324707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20479100942611694, + "step": 8298 + }, + { + "epoch": 0.166, + "grad_norm": 2.21875, + "grad_norm_var": 0.0078765869140625, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 2.180828809738159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229994386434555, + "step": 8300 + }, + { + "epoch": 0.16604, + "grad_norm": 2.078125, + "grad_norm_var": 0.007624308268229167, + "learning_rate": 0.0001, + "loss": 4.0704, + "loss/crossentropy": 2.098487079143524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19709115475416183, + "step": 8302 + }, + { + "epoch": 0.16608, + "grad_norm": 2.0625, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 1.999358892440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891200006008148, + "step": 8304 + }, + { + "epoch": 0.16612, + "grad_norm": 2.1875, + "grad_norm_var": 0.009022776285807292, + "learning_rate": 0.0001, + "loss": 4.4263, + "loss/crossentropy": 2.40866219997406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22355159372091293, + "step": 8306 + }, + { + "epoch": 0.16616, + "grad_norm": 2.171875, + "grad_norm_var": 0.008156077067057291, + "learning_rate": 0.0001, + "loss": 4.0532, + "loss/crossentropy": 1.774325966835022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058887928724289, + "step": 8308 + }, + { + "epoch": 0.1662, + "grad_norm": 2.125, + "grad_norm_var": 0.007352447509765625, + "learning_rate": 0.0001, + "loss": 4.4951, + "loss/crossentropy": 1.9870773553848267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368569880723953, + "step": 8310 + }, + { + "epoch": 0.16624, + "grad_norm": 2.40625, + "grad_norm_var": 0.01121826171875, + "learning_rate": 0.0001, + "loss": 4.3496, + "loss/crossentropy": 1.9224175810813904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279355376958847, + "step": 8312 + }, + { + "epoch": 0.16628, + "grad_norm": 2.171875, + "grad_norm_var": 0.012230428059895833, + "learning_rate": 0.0001, + "loss": 4.4074, + "loss/crossentropy": 2.011419177055359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2363004833459854, + "step": 8314 + }, + { + "epoch": 0.16632, + "grad_norm": 2.328125, + "grad_norm_var": 0.01444091796875, + "learning_rate": 0.0001, + "loss": 4.6627, + "loss/crossentropy": 1.9777795672416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830055862665176, + "step": 8316 + }, + { + "epoch": 0.16636, + "grad_norm": 2.046875, + "grad_norm_var": 0.013508860270182292, + "learning_rate": 0.0001, + "loss": 4.3798, + "loss/crossentropy": 2.3334981203079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21089734882116318, + "step": 8318 + }, + { + "epoch": 0.1664, + "grad_norm": 2.28125, + "grad_norm_var": 0.015148671468098958, + "learning_rate": 0.0001, + "loss": 4.3792, + "loss/crossentropy": 1.9137234687805176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21954041719436646, + "step": 8320 + }, + { + "epoch": 0.16644, + "grad_norm": 2.15625, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.3826, + "loss/crossentropy": 2.1358155608177185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21321023255586624, + "step": 8322 + }, + { + "epoch": 0.16648, + "grad_norm": 2.203125, + "grad_norm_var": 0.013688151041666667, + "learning_rate": 0.0001, + "loss": 4.3454, + "loss/crossentropy": 2.1747822165489197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23682481050491333, + "step": 8324 + }, + { + "epoch": 0.16652, + "grad_norm": 2.078125, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 4.2685, + "loss/crossentropy": 2.2818257808685303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24226247519254684, + "step": 8326 + }, + { + "epoch": 0.16656, + "grad_norm": 2.1875, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 4.4806, + "loss/crossentropy": 1.9066791534423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20610930025577545, + "step": 8328 + }, + { + "epoch": 0.1666, + "grad_norm": 2.078125, + "grad_norm_var": 0.008284505208333333, + "learning_rate": 0.0001, + "loss": 4.4799, + "loss/crossentropy": 2.2431830763816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22407780587673187, + "step": 8330 + }, + { + "epoch": 0.16664, + "grad_norm": 2.125, + "grad_norm_var": 0.0067708333333333336, + "learning_rate": 0.0001, + "loss": 4.5894, + "loss/crossentropy": 2.569726347923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2647472620010376, + "step": 8332 + }, + { + "epoch": 0.16668, + "grad_norm": 2.140625, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 1.8364137411117554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113686427474022, + "step": 8334 + }, + { + "epoch": 0.16672, + "grad_norm": 2.078125, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.0369, + "loss/crossentropy": 2.0510441064834595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268069088459015, + "step": 8336 + }, + { + "epoch": 0.16676, + "grad_norm": 2.046875, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.2528, + "loss/crossentropy": 1.7306728959083557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20341812074184418, + "step": 8338 + }, + { + "epoch": 0.1668, + "grad_norm": 2.578125, + "grad_norm_var": 0.023361968994140624, + "learning_rate": 0.0001, + "loss": 4.8146, + "loss/crossentropy": 2.398088574409485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426001876592636, + "step": 8340 + }, + { + "epoch": 0.16684, + "grad_norm": 2.109375, + "grad_norm_var": 0.023128000895182292, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 2.1259487867355347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666773200035095, + "step": 8342 + }, + { + "epoch": 0.16688, + "grad_norm": 2.109375, + "grad_norm_var": 0.022141265869140624, + "learning_rate": 0.0001, + "loss": 4.2857, + "loss/crossentropy": 1.9744665026664734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2887374758720398, + "step": 8344 + }, + { + "epoch": 0.16692, + "grad_norm": 2.09375, + "grad_norm_var": 0.02316869099934896, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 1.9841225743293762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008478343486786, + "step": 8346 + }, + { + "epoch": 0.16696, + "grad_norm": 2.125, + "grad_norm_var": 0.1913469950358073, + "learning_rate": 0.0001, + "loss": 4.5208, + "loss/crossentropy": 2.060486137866974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23954987525939941, + "step": 8348 + }, + { + "epoch": 0.167, + "grad_norm": 2.03125, + "grad_norm_var": 0.19230931599934895, + "learning_rate": 0.0001, + "loss": 4.2185, + "loss/crossentropy": 1.9356245398521423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987425833940506, + "step": 8350 + }, + { + "epoch": 0.16704, + "grad_norm": 2.203125, + "grad_norm_var": 0.1835845947265625, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 1.9968677163124084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21396416425704956, + "step": 8352 + }, + { + "epoch": 0.16708, + "grad_norm": 2.109375, + "grad_norm_var": 0.1820465087890625, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.1678614616394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22860293090343475, + "step": 8354 + }, + { + "epoch": 0.16712, + "grad_norm": 2.078125, + "grad_norm_var": 0.17534891764322916, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 1.7752264142036438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20555391907691956, + "step": 8356 + }, + { + "epoch": 0.16716, + "grad_norm": 2.046875, + "grad_norm_var": 0.1767578125, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 1.7073925137519836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19325412809848785, + "step": 8358 + }, + { + "epoch": 0.1672, + "grad_norm": 2.4375, + "grad_norm_var": 0.17827860514322916, + "learning_rate": 0.0001, + "loss": 4.7185, + "loss/crossentropy": 2.2200660705566406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834757924079895, + "step": 8360 + }, + { + "epoch": 0.16724, + "grad_norm": 2.03125, + "grad_norm_var": 0.17594401041666666, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.3598183393478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25592371821403503, + "step": 8362 + }, + { + "epoch": 0.16728, + "grad_norm": 2.03125, + "grad_norm_var": 0.014644368489583334, + "learning_rate": 0.0001, + "loss": 4.3429, + "loss/crossentropy": 2.342926025390625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2480776235461235, + "step": 8364 + }, + { + "epoch": 0.16732, + "grad_norm": 2.046875, + "grad_norm_var": 0.014411417643229167, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 1.8505961894989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376194059848785, + "step": 8366 + }, + { + "epoch": 0.16736, + "grad_norm": 2.078125, + "grad_norm_var": 0.0146636962890625, + "learning_rate": 0.0001, + "loss": 4.4199, + "loss/crossentropy": 2.2064108848571777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334534883499146, + "step": 8368 + }, + { + "epoch": 0.1674, + "grad_norm": 2.6875, + "grad_norm_var": 0.0338043212890625, + "learning_rate": 0.0001, + "loss": 4.5344, + "loss/crossentropy": 2.1280709505081177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24501197040081024, + "step": 8370 + }, + { + "epoch": 0.16744, + "grad_norm": 2.203125, + "grad_norm_var": 0.03312886555989583, + "learning_rate": 0.0001, + "loss": 4.3382, + "loss/crossentropy": 2.2134695053100586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2368277609348297, + "step": 8372 + }, + { + "epoch": 0.16748, + "grad_norm": 2.03125, + "grad_norm_var": 0.03319905598958333, + "learning_rate": 0.0001, + "loss": 4.1197, + "loss/crossentropy": 1.6942040920257568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21226602047681808, + "step": 8374 + }, + { + "epoch": 0.16752, + "grad_norm": 2.09375, + "grad_norm_var": 0.025755818684895834, + "learning_rate": 0.0001, + "loss": 4.5004, + "loss/crossentropy": 2.1554355025291443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2655039578676224, + "step": 8376 + }, + { + "epoch": 0.16756, + "grad_norm": 2.078125, + "grad_norm_var": 0.024800618489583332, + "learning_rate": 0.0001, + "loss": 4.2446, + "loss/crossentropy": 1.8107115030288696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756090670824051, + "step": 8378 + }, + { + "epoch": 0.1676, + "grad_norm": 1.921875, + "grad_norm_var": 0.026496378580729167, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.655519425868988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1760415881872177, + "step": 8380 + }, + { + "epoch": 0.16764, + "grad_norm": 2.140625, + "grad_norm_var": 0.026460774739583335, + "learning_rate": 0.0001, + "loss": 4.2369, + "loss/crossentropy": 2.1618025302886963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21886380016803741, + "step": 8382 + }, + { + "epoch": 0.16768, + "grad_norm": 2.109375, + "grad_norm_var": 0.028270467122395834, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.043319880962372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21368451416492462, + "step": 8384 + }, + { + "epoch": 0.16772, + "grad_norm": 2.1875, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.4035, + "loss/crossentropy": 2.3043102025985718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25452760607004166, + "step": 8386 + }, + { + "epoch": 0.16776, + "grad_norm": 2.21875, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 2.3627192974090576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2438303530216217, + "step": 8388 + }, + { + "epoch": 0.1678, + "grad_norm": 2.109375, + "grad_norm_var": 0.011335245768229167, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.0960012674331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813053101301193, + "step": 8390 + }, + { + "epoch": 0.16784, + "grad_norm": 1.921875, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 1.9132550358772278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19716795533895493, + "step": 8392 + }, + { + "epoch": 0.16788, + "grad_norm": 2.078125, + "grad_norm_var": 0.014742024739583333, + "learning_rate": 0.0001, + "loss": 4.2371, + "loss/crossentropy": 2.1912059783935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233934685587883, + "step": 8394 + }, + { + "epoch": 0.16792, + "grad_norm": 2.0625, + "grad_norm_var": 0.012691243489583334, + "learning_rate": 0.0001, + "loss": 4.4501, + "loss/crossentropy": 2.165616512298584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475389927625656, + "step": 8396 + }, + { + "epoch": 0.16796, + "grad_norm": 2.078125, + "grad_norm_var": 0.01265869140625, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.209625542163849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24280209839344025, + "step": 8398 + }, + { + "epoch": 0.168, + "grad_norm": 2.125, + "grad_norm_var": 0.013606516520182292, + "learning_rate": 0.0001, + "loss": 3.8959, + "loss/crossentropy": 1.972103476524353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072778418660164, + "step": 8400 + }, + { + "epoch": 0.16804, + "grad_norm": 2.171875, + "grad_norm_var": 0.009663645426432292, + "learning_rate": 0.0001, + "loss": 4.4646, + "loss/crossentropy": 2.402593731880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23028723895549774, + "step": 8402 + }, + { + "epoch": 0.16808, + "grad_norm": 2.125, + "grad_norm_var": 0.008754221598307292, + "learning_rate": 0.0001, + "loss": 4.392, + "loss/crossentropy": 2.218156576156616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104444682598114, + "step": 8404 + }, + { + "epoch": 0.16812, + "grad_norm": 2.09375, + "grad_norm_var": 0.008722941080729166, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 1.9802079796791077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134247124195099, + "step": 8406 + }, + { + "epoch": 0.16816, + "grad_norm": 1.953125, + "grad_norm_var": 0.008577219645182292, + "learning_rate": 0.0001, + "loss": 3.9765, + "loss/crossentropy": 2.0322983264923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21680974960327148, + "step": 8408 + }, + { + "epoch": 0.1682, + "grad_norm": 2.03125, + "grad_norm_var": 0.008194732666015624, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 2.11838436126709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319432571530342, + "step": 8410 + }, + { + "epoch": 0.16824, + "grad_norm": 2.125, + "grad_norm_var": 0.013398996988932292, + "learning_rate": 0.0001, + "loss": 4.2204, + "loss/crossentropy": 2.3801279067993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23981253802776337, + "step": 8412 + }, + { + "epoch": 0.16828, + "grad_norm": 2.078125, + "grad_norm_var": 0.013042958577473958, + "learning_rate": 0.0001, + "loss": 4.2766, + "loss/crossentropy": 2.0953307151794434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22835668921470642, + "step": 8414 + }, + { + "epoch": 0.16832, + "grad_norm": 2.15625, + "grad_norm_var": 0.011774698893229166, + "learning_rate": 0.0001, + "loss": 4.528, + "loss/crossentropy": 2.0654338598251343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100541964173317, + "step": 8416 + }, + { + "epoch": 0.16836, + "grad_norm": 2.0625, + "grad_norm_var": 0.018184407552083334, + "learning_rate": 0.0001, + "loss": 4.5734, + "loss/crossentropy": 2.383318305015564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246080219745636, + "step": 8418 + }, + { + "epoch": 0.1684, + "grad_norm": 2.078125, + "grad_norm_var": 0.018318684895833333, + "learning_rate": 0.0001, + "loss": 4.2764, + "loss/crossentropy": 2.103544294834137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20659767091274261, + "step": 8420 + }, + { + "epoch": 0.16844, + "grad_norm": 2.09375, + "grad_norm_var": 0.017437489827473958, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.064394950866699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23458171635866165, + "step": 8422 + }, + { + "epoch": 0.16848, + "grad_norm": 2.25, + "grad_norm_var": 0.014872233072916666, + "learning_rate": 0.0001, + "loss": 4.4133, + "loss/crossentropy": 2.087044835090637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23527196049690247, + "step": 8424 + }, + { + "epoch": 0.16852, + "grad_norm": 2.15625, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 4.5867, + "loss/crossentropy": 2.41584312915802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2425323873758316, + "step": 8426 + }, + { + "epoch": 0.16856, + "grad_norm": 2.09375, + "grad_norm_var": 0.010400390625, + "learning_rate": 0.0001, + "loss": 4.3292, + "loss/crossentropy": 2.2542352080345154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2330578714609146, + "step": 8428 + }, + { + "epoch": 0.1686, + "grad_norm": 2.09375, + "grad_norm_var": 0.010724894205729167, + "learning_rate": 0.0001, + "loss": 4.6628, + "loss/crossentropy": 2.453263282775879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.280864879488945, + "step": 8430 + }, + { + "epoch": 0.16864, + "grad_norm": 2.140625, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.0443, + "loss/crossentropy": 1.9185429811477661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230744242668152, + "step": 8432 + }, + { + "epoch": 0.16868, + "grad_norm": 2.140625, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.1683, + "loss/crossentropy": 2.020436644554138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21756108105182648, + "step": 8434 + }, + { + "epoch": 0.16872, + "grad_norm": 2.328125, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.7727, + "loss/crossentropy": 2.3050636053085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24950820207595825, + "step": 8436 + }, + { + "epoch": 0.16876, + "grad_norm": 2.0625, + "grad_norm_var": 0.01510009765625, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 2.2787232398986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267014116048813, + "step": 8438 + }, + { + "epoch": 0.1688, + "grad_norm": 2.078125, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 4.2927, + "loss/crossentropy": 2.185176372528076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157401591539383, + "step": 8440 + }, + { + "epoch": 0.16884, + "grad_norm": 2.265625, + "grad_norm_var": 0.01558837890625, + "learning_rate": 0.0001, + "loss": 4.2087, + "loss/crossentropy": 2.0673694610595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23145683109760284, + "step": 8442 + }, + { + "epoch": 0.16888, + "grad_norm": 2.171875, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 1.9011740684509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21629629284143448, + "step": 8444 + }, + { + "epoch": 0.16892, + "grad_norm": 2.34375, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 4.4821, + "loss/crossentropy": 2.055977463722229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275683432817459, + "step": 8446 + }, + { + "epoch": 0.16896, + "grad_norm": 2.140625, + "grad_norm_var": 0.0171295166015625, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 1.752756416797638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19437911361455917, + "step": 8448 + }, + { + "epoch": 0.169, + "grad_norm": 2.09375, + "grad_norm_var": 0.015706380208333332, + "learning_rate": 0.0001, + "loss": 4.3631, + "loss/crossentropy": 2.479012131690979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23593680560588837, + "step": 8450 + }, + { + "epoch": 0.16904, + "grad_norm": 2.1875, + "grad_norm_var": 0.015523274739583334, + "learning_rate": 0.0001, + "loss": 4.3245, + "loss/crossentropy": 2.065472185611725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148493528366089, + "step": 8452 + }, + { + "epoch": 0.16908, + "grad_norm": 2.078125, + "grad_norm_var": 0.015901692708333335, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.214062213897705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25444991141557693, + "step": 8454 + }, + { + "epoch": 0.16912, + "grad_norm": 2.296875, + "grad_norm_var": 0.0155426025390625, + "learning_rate": 0.0001, + "loss": 4.2059, + "loss/crossentropy": 1.7410383224487305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21134207397699356, + "step": 8456 + }, + { + "epoch": 0.16916, + "grad_norm": 2.234375, + "grad_norm_var": 0.020542144775390625, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.9303107857704163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18110749125480652, + "step": 8458 + }, + { + "epoch": 0.1692, + "grad_norm": 2.203125, + "grad_norm_var": 0.01810480753580729, + "learning_rate": 0.0001, + "loss": 4.3916, + "loss/crossentropy": 1.9091919660568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21320972591638565, + "step": 8460 + }, + { + "epoch": 0.16924, + "grad_norm": 2.0625, + "grad_norm_var": 0.025099436442057293, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 1.8021087050437927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294282376766205, + "step": 8462 + }, + { + "epoch": 0.16928, + "grad_norm": 2.0, + "grad_norm_var": 0.027186838785807292, + "learning_rate": 0.0001, + "loss": 3.97, + "loss/crossentropy": 2.018254518508911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21781788766384125, + "step": 8464 + }, + { + "epoch": 0.16932, + "grad_norm": 2.0625, + "grad_norm_var": 0.02906061808268229, + "learning_rate": 0.0001, + "loss": 4.5665, + "loss/crossentropy": 2.0593737959861755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24223209917545319, + "step": 8466 + }, + { + "epoch": 0.16936, + "grad_norm": 2.125, + "grad_norm_var": 0.029504140218098957, + "learning_rate": 0.0001, + "loss": 4.3116, + "loss/crossentropy": 2.218974232673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22679369151592255, + "step": 8468 + }, + { + "epoch": 0.1694, + "grad_norm": 2.078125, + "grad_norm_var": 0.022332509358723957, + "learning_rate": 0.0001, + "loss": 4.3313, + "loss/crossentropy": 2.182355046272278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23690057545900345, + "step": 8470 + }, + { + "epoch": 0.16944, + "grad_norm": 2.375, + "grad_norm_var": 0.024930572509765624, + "learning_rate": 0.0001, + "loss": 4.6241, + "loss/crossentropy": 2.1669063568115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2458687722682953, + "step": 8472 + }, + { + "epoch": 0.16948, + "grad_norm": 2.171875, + "grad_norm_var": 0.021100870768229165, + "learning_rate": 0.0001, + "loss": 4.3181, + "loss/crossentropy": 2.0656558871269226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20421817898750305, + "step": 8474 + }, + { + "epoch": 0.16952, + "grad_norm": 2.03125, + "grad_norm_var": 0.024625651041666665, + "learning_rate": 0.0001, + "loss": 4.4916, + "loss/crossentropy": 2.1470741033554077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798998326063156, + "step": 8476 + }, + { + "epoch": 0.16956, + "grad_norm": 2.1875, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.2616, + "loss/crossentropy": 2.206741452217102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327529340982437, + "step": 8478 + }, + { + "epoch": 0.1696, + "grad_norm": 2.171875, + "grad_norm_var": 0.012596638997395833, + "learning_rate": 0.0001, + "loss": 4.3338, + "loss/crossentropy": 2.4421777725219727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385794073343277, + "step": 8480 + }, + { + "epoch": 0.16964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0115234375, + "learning_rate": 0.0001, + "loss": 3.963, + "loss/crossentropy": 1.8545736074447632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18871797621250153, + "step": 8482 + }, + { + "epoch": 0.16968, + "grad_norm": 2.078125, + "grad_norm_var": 0.0118072509765625, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 1.807646930217743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625309854745865, + "step": 8484 + }, + { + "epoch": 0.16972, + "grad_norm": 2.1875, + "grad_norm_var": 0.012202962239583334, + "learning_rate": 0.0001, + "loss": 4.5029, + "loss/crossentropy": 1.923595905303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233208492398262, + "step": 8486 + }, + { + "epoch": 0.16976, + "grad_norm": 2.0625, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.3251, + "loss/crossentropy": 2.1018277406692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2556355446577072, + "step": 8488 + }, + { + "epoch": 0.1698, + "grad_norm": 1.90625, + "grad_norm_var": 0.01099853515625, + "learning_rate": 0.0001, + "loss": 4.1127, + "loss/crossentropy": 1.9638542532920837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101784572005272, + "step": 8490 + }, + { + "epoch": 0.16984, + "grad_norm": 2.109375, + "grad_norm_var": 0.008703358968098958, + "learning_rate": 0.0001, + "loss": 3.9463, + "loss/crossentropy": 1.7770507335662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004244327545166, + "step": 8492 + }, + { + "epoch": 0.16988, + "grad_norm": 2.171875, + "grad_norm_var": 0.008573150634765625, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 2.4064877033233643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23509501665830612, + "step": 8494 + }, + { + "epoch": 0.16992, + "grad_norm": 2.0, + "grad_norm_var": 0.008713531494140624, + "learning_rate": 0.0001, + "loss": 4.3197, + "loss/crossentropy": 2.183193802833557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22985967248678207, + "step": 8496 + }, + { + "epoch": 0.16996, + "grad_norm": 2.203125, + "grad_norm_var": 0.009126536051432292, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.241714060306549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22633583843708038, + "step": 8498 + }, + { + "epoch": 0.17, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0097808837890625, + "learning_rate": 0.0001, + "loss": 4.32, + "loss/crossentropy": 2.212075114250183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20900271832942963, + "step": 8500 + }, + { + "epoch": 0.17004, + "grad_norm": 2.09375, + "grad_norm_var": 0.0071685791015625, + "learning_rate": 0.0001, + "loss": 4.1885, + "loss/crossentropy": 2.1283940076828003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21419794112443924, + "step": 8502 + }, + { + "epoch": 0.17008, + "grad_norm": 2.15625, + "grad_norm_var": 0.00731201171875, + "learning_rate": 0.0001, + "loss": 4.3498, + "loss/crossentropy": 2.1958925127983093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23375380039215088, + "step": 8504 + }, + { + "epoch": 0.17012, + "grad_norm": 2.171875, + "grad_norm_var": 0.0065673828125, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 1.7883376479148865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097182646393776, + "step": 8506 + }, + { + "epoch": 0.17016, + "grad_norm": 2.296875, + "grad_norm_var": 0.006811269124348958, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.114001750946045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059268057346344, + "step": 8508 + }, + { + "epoch": 0.1702, + "grad_norm": 2.15625, + "grad_norm_var": 0.006929270426432292, + "learning_rate": 0.0001, + "loss": 4.3178, + "loss/crossentropy": 2.2413129806518555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2517316862940788, + "step": 8510 + }, + { + "epoch": 0.17024, + "grad_norm": 1.921875, + "grad_norm_var": 0.015083567301432291, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.0968031883239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2454070746898651, + "step": 8512 + }, + { + "epoch": 0.17028, + "grad_norm": 1.875, + "grad_norm_var": 0.01945978800455729, + "learning_rate": 0.0001, + "loss": 4.407, + "loss/crossentropy": 1.9813454151153564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238256990909576, + "step": 8514 + }, + { + "epoch": 0.17032, + "grad_norm": 2.28125, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 4.4724, + "loss/crossentropy": 2.261451005935669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2393329069018364, + "step": 8516 + }, + { + "epoch": 0.17036, + "grad_norm": 2.171875, + "grad_norm_var": 0.020654296875, + "learning_rate": 0.0001, + "loss": 4.5377, + "loss/crossentropy": 1.9661999344825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088441476225853, + "step": 8518 + }, + { + "epoch": 0.1704, + "grad_norm": 2.046875, + "grad_norm_var": 0.022163899739583333, + "learning_rate": 0.0001, + "loss": 3.9288, + "loss/crossentropy": 1.8275291323661804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20091407746076584, + "step": 8520 + }, + { + "epoch": 0.17044, + "grad_norm": 2.046875, + "grad_norm_var": 0.023346964518229166, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.266944646835327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2514675632119179, + "step": 8522 + }, + { + "epoch": 0.17048, + "grad_norm": 2.28125, + "grad_norm_var": 0.02271728515625, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.244120240211487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25534868240356445, + "step": 8524 + }, + { + "epoch": 0.17052, + "grad_norm": 2.015625, + "grad_norm_var": 0.025048828125, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 1.8628552556037903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675898134708405, + "step": 8526 + }, + { + "epoch": 0.17056, + "grad_norm": 2.109375, + "grad_norm_var": 0.01597900390625, + "learning_rate": 0.0001, + "loss": 4.3883, + "loss/crossentropy": 1.971808135509491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143295481801033, + "step": 8528 + }, + { + "epoch": 0.1706, + "grad_norm": 2.0625, + "grad_norm_var": 0.0114410400390625, + "learning_rate": 0.0001, + "loss": 4.3481, + "loss/crossentropy": 1.959191381931305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24014096707105637, + "step": 8530 + }, + { + "epoch": 0.17064, + "grad_norm": 3.0, + "grad_norm_var": 0.05729878743489583, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 1.836454451084137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20957449078559875, + "step": 8532 + }, + { + "epoch": 0.17068, + "grad_norm": 2.140625, + "grad_norm_var": 0.06005452473958333, + "learning_rate": 0.0001, + "loss": 4.1254, + "loss/crossentropy": 2.003947675228119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538918048143387, + "step": 8534 + }, + { + "epoch": 0.17072, + "grad_norm": 2.375, + "grad_norm_var": 0.05788472493489583, + "learning_rate": 0.0001, + "loss": 4.4412, + "loss/crossentropy": 2.0154194831848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278473973274231, + "step": 8536 + }, + { + "epoch": 0.17076, + "grad_norm": 2.015625, + "grad_norm_var": 0.05548502604166667, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.1560275554656982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21601636707782745, + "step": 8538 + }, + { + "epoch": 0.1708, + "grad_norm": 2.078125, + "grad_norm_var": 0.05689697265625, + "learning_rate": 0.0001, + "loss": 4.2258, + "loss/crossentropy": 2.1494773626327515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22883395850658417, + "step": 8540 + }, + { + "epoch": 0.17084, + "grad_norm": 2.234375, + "grad_norm_var": 0.05607808430989583, + "learning_rate": 0.0001, + "loss": 4.5201, + "loss/crossentropy": 2.20908784866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26363062113523483, + "step": 8542 + }, + { + "epoch": 0.17088, + "grad_norm": 2.0625, + "grad_norm_var": 0.0566802978515625, + "learning_rate": 0.0001, + "loss": 4.2477, + "loss/crossentropy": 1.7932087182998657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18538028001785278, + "step": 8544 + }, + { + "epoch": 0.17092, + "grad_norm": 2.078125, + "grad_norm_var": 0.05625, + "learning_rate": 0.0001, + "loss": 4.471, + "loss/crossentropy": 2.069899260997772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22498925775289536, + "step": 8546 + }, + { + "epoch": 0.17096, + "grad_norm": 2.46875, + "grad_norm_var": 0.019169108072916666, + "learning_rate": 0.0001, + "loss": 4.474, + "loss/crossentropy": 2.008604884147644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895557641983032, + "step": 8548 + }, + { + "epoch": 0.171, + "grad_norm": 2.171875, + "grad_norm_var": 0.018798828125, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.0652626156806946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23341115564107895, + "step": 8550 + }, + { + "epoch": 0.17104, + "grad_norm": 2.234375, + "grad_norm_var": 0.015607706705729167, + "learning_rate": 0.0001, + "loss": 4.383, + "loss/crossentropy": 1.9780349135398865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22891747951507568, + "step": 8552 + }, + { + "epoch": 0.17108, + "grad_norm": 2.5, + "grad_norm_var": 0.022980753580729166, + "learning_rate": 0.0001, + "loss": 4.4424, + "loss/crossentropy": 2.163089871406555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2447328343987465, + "step": 8554 + }, + { + "epoch": 0.17112, + "grad_norm": 2.21875, + "grad_norm_var": 0.022945149739583334, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.2946064472198486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24353116750717163, + "step": 8556 + }, + { + "epoch": 0.17116, + "grad_norm": 2.078125, + "grad_norm_var": 0.022459920247395834, + "learning_rate": 0.0001, + "loss": 4.3049, + "loss/crossentropy": 2.0238161087036133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153262495994568, + "step": 8558 + }, + { + "epoch": 0.1712, + "grad_norm": 2.09375, + "grad_norm_var": 0.02604955037434896, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 1.5683120489120483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17940818518400192, + "step": 8560 + }, + { + "epoch": 0.17124, + "grad_norm": 2.296875, + "grad_norm_var": 0.027337392171223957, + "learning_rate": 0.0001, + "loss": 4.3867, + "loss/crossentropy": 1.9956589937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22829821705818176, + "step": 8562 + }, + { + "epoch": 0.17128, + "grad_norm": 2.640625, + "grad_norm_var": 0.036834462483723955, + "learning_rate": 0.0001, + "loss": 4.7055, + "loss/crossentropy": 2.2242285013198853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23567892611026764, + "step": 8564 + }, + { + "epoch": 0.17132, + "grad_norm": 2.046875, + "grad_norm_var": 0.033719635009765624, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2143776416778564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2457498088479042, + "step": 8566 + }, + { + "epoch": 0.17136, + "grad_norm": 2.15625, + "grad_norm_var": 0.033782704671223955, + "learning_rate": 0.0001, + "loss": 4.354, + "loss/crossentropy": 1.852292537689209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004538357257843, + "step": 8568 + }, + { + "epoch": 0.1714, + "grad_norm": 2.21875, + "grad_norm_var": 0.02676976521809896, + "learning_rate": 0.0001, + "loss": 4.4429, + "loss/crossentropy": 2.311089515686035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059186547994614, + "step": 8570 + }, + { + "epoch": 0.17144, + "grad_norm": 2.0625, + "grad_norm_var": 0.027675120035807292, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 1.9324169754981995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21051711589097977, + "step": 8572 + }, + { + "epoch": 0.17148, + "grad_norm": 2.171875, + "grad_norm_var": 0.02904052734375, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 1.982999861240387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084646075963974, + "step": 8574 + }, + { + "epoch": 0.17152, + "grad_norm": 2.078125, + "grad_norm_var": 0.02505671183268229, + "learning_rate": 0.0001, + "loss": 4.2503, + "loss/crossentropy": 2.1837204694747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22953644394874573, + "step": 8576 + }, + { + "epoch": 0.17156, + "grad_norm": 2.25, + "grad_norm_var": 0.024102528889973957, + "learning_rate": 0.0001, + "loss": 4.4448, + "loss/crossentropy": 2.2588841319084167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22851599752902985, + "step": 8578 + }, + { + "epoch": 0.1716, + "grad_norm": 2.390625, + "grad_norm_var": 0.011502838134765625, + "learning_rate": 0.0001, + "loss": 4.2033, + "loss/crossentropy": 1.982733964920044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21284686028957367, + "step": 8580 + }, + { + "epoch": 0.17164, + "grad_norm": 2.046875, + "grad_norm_var": 0.012143707275390625, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.443873167037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23966002464294434, + "step": 8582 + }, + { + "epoch": 0.17168, + "grad_norm": 5.09375, + "grad_norm_var": 0.5580645243326823, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 2.462417483329773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341725453734398, + "step": 8584 + }, + { + "epoch": 0.17172, + "grad_norm": 2.453125, + "grad_norm_var": 0.5521705627441407, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 1.8569464683532715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224809430539608, + "step": 8586 + }, + { + "epoch": 0.17176, + "grad_norm": 2.609375, + "grad_norm_var": 0.55804443359375, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 2.017254650592804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259984165430069, + "step": 8588 + }, + { + "epoch": 0.1718, + "grad_norm": 2.1875, + "grad_norm_var": 0.550066884358724, + "learning_rate": 0.0001, + "loss": 4.463, + "loss/crossentropy": 1.9804525971412659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844170451164246, + "step": 8590 + }, + { + "epoch": 0.17184, + "grad_norm": 2.015625, + "grad_norm_var": 0.544781239827474, + "learning_rate": 0.0001, + "loss": 4.2676, + "loss/crossentropy": 2.2506592869758606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22404606640338898, + "step": 8592 + }, + { + "epoch": 0.17188, + "grad_norm": 2.0625, + "grad_norm_var": 0.5579335530598958, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9515153765678406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216490276157856, + "step": 8594 + }, + { + "epoch": 0.17192, + "grad_norm": 2.109375, + "grad_norm_var": 0.5598052978515625, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 2.090883791446686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22496677190065384, + "step": 8596 + }, + { + "epoch": 0.17196, + "grad_norm": 2.828125, + "grad_norm_var": 0.5523844401041667, + "learning_rate": 0.0001, + "loss": 4.8429, + "loss/crossentropy": 2.4344359636306763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2748369127511978, + "step": 8598 + }, + { + "epoch": 0.172, + "grad_norm": 2.15625, + "grad_norm_var": 0.05840250651041667, + "learning_rate": 0.0001, + "loss": 4.4359, + "loss/crossentropy": 1.9280555844306946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20140594244003296, + "step": 8600 + }, + { + "epoch": 0.17204, + "grad_norm": 2.15625, + "grad_norm_var": 0.05537007649739583, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 2.045006573200226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21741003543138504, + "step": 8602 + }, + { + "epoch": 0.17208, + "grad_norm": 2.0625, + "grad_norm_var": 0.04461034138997396, + "learning_rate": 0.0001, + "loss": 4.1622, + "loss/crossentropy": 2.092265546321869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20971956849098206, + "step": 8604 + }, + { + "epoch": 0.17212, + "grad_norm": 1.9375, + "grad_norm_var": 0.048130035400390625, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 1.794768512248993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680297911167145, + "step": 8606 + }, + { + "epoch": 0.17216, + "grad_norm": 2.109375, + "grad_norm_var": 0.05690078735351563, + "learning_rate": 0.0001, + "loss": 4.2213, + "loss/crossentropy": 1.9316805601119995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163936346769333, + "step": 8608 + }, + { + "epoch": 0.1722, + "grad_norm": 2.359375, + "grad_norm_var": 0.05872294108072917, + "learning_rate": 0.0001, + "loss": 4.5981, + "loss/crossentropy": 2.2786675691604614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2411205694079399, + "step": 8610 + }, + { + "epoch": 0.17224, + "grad_norm": 2.234375, + "grad_norm_var": 0.05852457682291667, + "learning_rate": 0.0001, + "loss": 4.5688, + "loss/crossentropy": 1.9211469888687134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23195043951272964, + "step": 8612 + }, + { + "epoch": 0.17228, + "grad_norm": 1.9921875, + "grad_norm_var": 0.037393951416015626, + "learning_rate": 0.0001, + "loss": 4.1776, + "loss/crossentropy": 1.900360643863678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339140772819519, + "step": 8614 + }, + { + "epoch": 0.17232, + "grad_norm": 2.125, + "grad_norm_var": 0.03743464152018229, + "learning_rate": 0.0001, + "loss": 4.4408, + "loss/crossentropy": 1.976994514465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218951515853405, + "step": 8616 + }, + { + "epoch": 0.17236, + "grad_norm": 2.09375, + "grad_norm_var": 0.035982004801432294, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.045244038105011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20886047929525375, + "step": 8618 + }, + { + "epoch": 0.1724, + "grad_norm": 2.25, + "grad_norm_var": 0.03240534464518229, + "learning_rate": 0.0001, + "loss": 4.5082, + "loss/crossentropy": 2.21256685256958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809116005897522, + "step": 8620 + }, + { + "epoch": 0.17244, + "grad_norm": 2.171875, + "grad_norm_var": 0.02641779581705729, + "learning_rate": 0.0001, + "loss": 4.6127, + "loss/crossentropy": 2.300337314605713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22060109674930573, + "step": 8622 + }, + { + "epoch": 0.17248, + "grad_norm": 2.015625, + "grad_norm_var": 0.016932932535807292, + "learning_rate": 0.0001, + "loss": 4.2411, + "loss/crossentropy": 1.8734883666038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20403072237968445, + "step": 8624 + }, + { + "epoch": 0.17252, + "grad_norm": 2.5625, + "grad_norm_var": 0.01962865193684896, + "learning_rate": 0.0001, + "loss": 4.6907, + "loss/crossentropy": 2.1382813453674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27397096157073975, + "step": 8626 + }, + { + "epoch": 0.17256, + "grad_norm": 2.0625, + "grad_norm_var": 0.019760894775390624, + "learning_rate": 0.0001, + "loss": 4.0917, + "loss/crossentropy": 1.9718505144119263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658842474222183, + "step": 8628 + }, + { + "epoch": 0.1726, + "grad_norm": 2.125, + "grad_norm_var": 0.016706339518229165, + "learning_rate": 0.0001, + "loss": 4.5233, + "loss/crossentropy": 2.0957319736480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187560573220253, + "step": 8630 + }, + { + "epoch": 0.17264, + "grad_norm": 1.984375, + "grad_norm_var": 0.019954427083333334, + "learning_rate": 0.0001, + "loss": 4.0986, + "loss/crossentropy": 2.0504234433174133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22376062721014023, + "step": 8632 + }, + { + "epoch": 0.17268, + "grad_norm": 2.21875, + "grad_norm_var": 0.05233968098958333, + "learning_rate": 0.0001, + "loss": 4.4513, + "loss/crossentropy": 2.057171046733856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23209689557552338, + "step": 8634 + }, + { + "epoch": 0.17272, + "grad_norm": 2.109375, + "grad_norm_var": 0.052611287434895834, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 1.9635317921638489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21854296326637268, + "step": 8636 + }, + { + "epoch": 0.17276, + "grad_norm": 2.359375, + "grad_norm_var": 0.0546539306640625, + "learning_rate": 0.0001, + "loss": 4.3124, + "loss/crossentropy": 1.8973188400268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24171262234449387, + "step": 8638 + }, + { + "epoch": 0.1728, + "grad_norm": 2.109375, + "grad_norm_var": 0.05347900390625, + "learning_rate": 0.0001, + "loss": 4.2068, + "loss/crossentropy": 1.6730469465255737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17975886166095734, + "step": 8640 + }, + { + "epoch": 0.17284, + "grad_norm": 2.265625, + "grad_norm_var": 0.04702123006184896, + "learning_rate": 0.0001, + "loss": 4.4222, + "loss/crossentropy": 2.2531689405441284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170827016234398, + "step": 8642 + }, + { + "epoch": 0.17288, + "grad_norm": 2.171875, + "grad_norm_var": 0.045873769124348956, + "learning_rate": 0.0001, + "loss": 4.0249, + "loss/crossentropy": 2.0913639068603516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22586838155984879, + "step": 8644 + }, + { + "epoch": 0.17292, + "grad_norm": 2.140625, + "grad_norm_var": 0.04533869425455729, + "learning_rate": 0.0001, + "loss": 4.45, + "loss/crossentropy": 2.163489580154419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2290281057357788, + "step": 8646 + }, + { + "epoch": 0.17296, + "grad_norm": 2.15625, + "grad_norm_var": 0.04267552693684896, + "learning_rate": 0.0001, + "loss": 4.3198, + "loss/crossentropy": 2.0669034719467163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411519289016724, + "step": 8648 + }, + { + "epoch": 0.173, + "grad_norm": 2.046875, + "grad_norm_var": 0.008213043212890625, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 1.9942336678504944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21262076497077942, + "step": 8650 + }, + { + "epoch": 0.17304, + "grad_norm": 2.1875, + "grad_norm_var": 0.009474436442057291, + "learning_rate": 0.0001, + "loss": 4.2701, + "loss/crossentropy": 2.046514868736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376151606440544, + "step": 8652 + }, + { + "epoch": 0.17308, + "grad_norm": 2.046875, + "grad_norm_var": 0.005995432535807292, + "learning_rate": 0.0001, + "loss": 4.3308, + "loss/crossentropy": 1.8385429382324219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19642101973295212, + "step": 8654 + }, + { + "epoch": 0.17312, + "grad_norm": 2.125, + "grad_norm_var": 0.007252756754557292, + "learning_rate": 0.0001, + "loss": 4.446, + "loss/crossentropy": 2.259633481502533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24319174885749817, + "step": 8656 + }, + { + "epoch": 0.17316, + "grad_norm": 2.15625, + "grad_norm_var": 0.005301920572916666, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 1.9811018109321594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801364421844482, + "step": 8658 + }, + { + "epoch": 0.1732, + "grad_norm": 2.203125, + "grad_norm_var": 0.0059722900390625, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 2.1726362705230713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23193368315696716, + "step": 8660 + }, + { + "epoch": 0.17324, + "grad_norm": 2.1875, + "grad_norm_var": 0.006086222330729167, + "learning_rate": 0.0001, + "loss": 4.2587, + "loss/crossentropy": 1.9915854930877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111910656094551, + "step": 8662 + }, + { + "epoch": 0.17328, + "grad_norm": 2.671875, + "grad_norm_var": 0.026851399739583334, + "learning_rate": 0.0001, + "loss": 4.5001, + "loss/crossentropy": 1.9651137590408325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242890365421772, + "step": 8664 + }, + { + "epoch": 0.17332, + "grad_norm": 2.15625, + "grad_norm_var": 0.026008097330729167, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 1.865262508392334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21065659821033478, + "step": 8666 + }, + { + "epoch": 0.17336, + "grad_norm": 2.0625, + "grad_norm_var": 0.02535400390625, + "learning_rate": 0.0001, + "loss": 4.3402, + "loss/crossentropy": 1.9073076248168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959603875875473, + "step": 8668 + }, + { + "epoch": 0.1734, + "grad_norm": 2.078125, + "grad_norm_var": 0.024332682291666668, + "learning_rate": 0.0001, + "loss": 4.2965, + "loss/crossentropy": 2.167983889579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176912426948547, + "step": 8670 + }, + { + "epoch": 0.17344, + "grad_norm": 2.34375, + "grad_norm_var": 0.5460896809895833, + "learning_rate": 0.0001, + "loss": 4.4551, + "loss/crossentropy": 1.7029761672019958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222617506980896, + "step": 8672 + }, + { + "epoch": 0.17348, + "grad_norm": 2.046875, + "grad_norm_var": 0.5439849853515625, + "learning_rate": 0.0001, + "loss": 4.3414, + "loss/crossentropy": 2.053748309612274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23398682475090027, + "step": 8674 + }, + { + "epoch": 0.17352, + "grad_norm": 2.703125, + "grad_norm_var": 0.5408274332682291, + "learning_rate": 0.0001, + "loss": 4.7882, + "loss/crossentropy": 2.309812903404236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2961876690387726, + "step": 8676 + }, + { + "epoch": 0.17356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.5421953837076823, + "learning_rate": 0.0001, + "loss": 4.4416, + "loss/crossentropy": 2.1045809984207153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22072184830904007, + "step": 8678 + }, + { + "epoch": 0.1736, + "grad_norm": 2.015625, + "grad_norm_var": 0.5490435282389323, + "learning_rate": 0.0001, + "loss": 4.4437, + "loss/crossentropy": 2.2114070653915405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208957076072693, + "step": 8680 + }, + { + "epoch": 0.17364, + "grad_norm": 2.21875, + "grad_norm_var": 0.5490435282389323, + "learning_rate": 0.0001, + "loss": 4.5931, + "loss/crossentropy": 2.1773669719696045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639526426792145, + "step": 8682 + }, + { + "epoch": 0.17368, + "grad_norm": 2.234375, + "grad_norm_var": 0.5398272196451823, + "learning_rate": 0.0001, + "loss": 3.9397, + "loss/crossentropy": 1.4213417768478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17653951048851013, + "step": 8684 + }, + { + "epoch": 0.17372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.5425374348958333, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 1.968630075454712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007492408156395, + "step": 8686 + }, + { + "epoch": 0.17376, + "grad_norm": 2.21875, + "grad_norm_var": 0.030304972330729166, + "learning_rate": 0.0001, + "loss": 4.2297, + "loss/crossentropy": 2.0826632976531982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895240992307663, + "step": 8688 + }, + { + "epoch": 0.1738, + "grad_norm": 2.21875, + "grad_norm_var": 0.029002888997395834, + "learning_rate": 0.0001, + "loss": 4.4188, + "loss/crossentropy": 2.2756701707839966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22170037031173706, + "step": 8690 + }, + { + "epoch": 0.17384, + "grad_norm": 2.0625, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.51, + "loss/crossentropy": 2.329536557197571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24717354029417038, + "step": 8692 + }, + { + "epoch": 0.17388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010033162434895833, + "learning_rate": 0.0001, + "loss": 4.0776, + "loss/crossentropy": 2.077241063117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215146966278553, + "step": 8694 + }, + { + "epoch": 0.17392, + "grad_norm": 2.109375, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.3482, + "loss/crossentropy": 2.2363221645355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428452014923096, + "step": 8696 + }, + { + "epoch": 0.17396, + "grad_norm": 2.0, + "grad_norm_var": 0.01024169921875, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 2.055815279483795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22674524784088135, + "step": 8698 + }, + { + "epoch": 0.174, + "grad_norm": 2.015625, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 4.2789, + "loss/crossentropy": 2.205570936203003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23787499964237213, + "step": 8700 + }, + { + "epoch": 0.17404, + "grad_norm": 2.0625, + "grad_norm_var": 0.009895579020182291, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.262540578842163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23921719938516617, + "step": 8702 + }, + { + "epoch": 0.17408, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010114542643229167, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.5464816093444824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23302219063043594, + "step": 8704 + }, + { + "epoch": 0.17412, + "grad_norm": 2.09375, + "grad_norm_var": 0.00897216796875, + "learning_rate": 0.0001, + "loss": 4.2355, + "loss/crossentropy": 2.050383508205414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23393510282039642, + "step": 8706 + }, + { + "epoch": 0.17416, + "grad_norm": 2.046875, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 1.9034642577171326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20801686495542526, + "step": 8708 + }, + { + "epoch": 0.1742, + "grad_norm": 2.09375, + "grad_norm_var": 0.007045237223307291, + "learning_rate": 0.0001, + "loss": 4.2801, + "loss/crossentropy": 2.313044309616089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24881915748119354, + "step": 8710 + }, + { + "epoch": 0.17424, + "grad_norm": 2.015625, + "grad_norm_var": 0.009388987223307292, + "learning_rate": 0.0001, + "loss": 4.3754, + "loss/crossentropy": 1.973829746246338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21400053054094315, + "step": 8712 + }, + { + "epoch": 0.17428, + "grad_norm": 2.25, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.2936, + "loss/crossentropy": 1.831783950328827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855721831321716, + "step": 8714 + }, + { + "epoch": 0.17432, + "grad_norm": 2.15625, + "grad_norm_var": 0.009683990478515625, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.0173734426498413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22365443408489227, + "step": 8716 + }, + { + "epoch": 0.17436, + "grad_norm": 2.125, + "grad_norm_var": 0.007155100504557292, + "learning_rate": 0.0001, + "loss": 4.4177, + "loss/crossentropy": 1.6534234285354614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20528900623321533, + "step": 8718 + }, + { + "epoch": 0.1744, + "grad_norm": 2.125, + "grad_norm_var": 0.006396484375, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 1.8113531470298767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21433213353157043, + "step": 8720 + }, + { + "epoch": 0.17444, + "grad_norm": 2.125, + "grad_norm_var": 0.0064280192057291664, + "learning_rate": 0.0001, + "loss": 4.5135, + "loss/crossentropy": 2.0750836730003357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22202756255865097, + "step": 8722 + }, + { + "epoch": 0.17448, + "grad_norm": 2.046875, + "grad_norm_var": 0.0056955973307291664, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 2.1388206481933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24137140065431595, + "step": 8724 + }, + { + "epoch": 0.17452, + "grad_norm": 2.0625, + "grad_norm_var": 0.006745402018229167, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 2.068696141242981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215234711766243, + "step": 8726 + }, + { + "epoch": 0.17456, + "grad_norm": 1.984375, + "grad_norm_var": 0.005280558268229167, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 1.6970900893211365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21108710020780563, + "step": 8728 + }, + { + "epoch": 0.1746, + "grad_norm": 2.15625, + "grad_norm_var": 0.005582682291666667, + "learning_rate": 0.0001, + "loss": 3.8555, + "loss/crossentropy": 1.8847576975822449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151619866490364, + "step": 8730 + }, + { + "epoch": 0.17464, + "grad_norm": 2.328125, + "grad_norm_var": 0.009403483072916666, + "learning_rate": 0.0001, + "loss": 4.4088, + "loss/crossentropy": 2.4103721380233765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2589537426829338, + "step": 8732 + }, + { + "epoch": 0.17468, + "grad_norm": 2.203125, + "grad_norm_var": 0.01011962890625, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 1.8340824842453003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22282177209854126, + "step": 8734 + }, + { + "epoch": 0.17472, + "grad_norm": 2.078125, + "grad_norm_var": 0.010302734375, + "learning_rate": 0.0001, + "loss": 4.2472, + "loss/crossentropy": 1.88236665725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193661779165268, + "step": 8736 + }, + { + "epoch": 0.17476, + "grad_norm": 2.28125, + "grad_norm_var": 0.05056050618489583, + "learning_rate": 0.0001, + "loss": 4.441, + "loss/crossentropy": 1.8121293783187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21855003386735916, + "step": 8738 + }, + { + "epoch": 0.1748, + "grad_norm": 2.203125, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 2.215694308280945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405528798699379, + "step": 8740 + }, + { + "epoch": 0.17484, + "grad_norm": 2.0625, + "grad_norm_var": 0.05090738932291667, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 2.124837279319763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22565175592899323, + "step": 8742 + }, + { + "epoch": 0.17488, + "grad_norm": 2.203125, + "grad_norm_var": 0.0476959228515625, + "learning_rate": 0.0001, + "loss": 4.6783, + "loss/crossentropy": 2.2531429529190063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.259593665599823, + "step": 8744 + }, + { + "epoch": 0.17492, + "grad_norm": 2.046875, + "grad_norm_var": 0.04537760416666667, + "learning_rate": 0.0001, + "loss": 4.3546, + "loss/crossentropy": 2.403178572654724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2475409209728241, + "step": 8746 + }, + { + "epoch": 0.17496, + "grad_norm": 2.328125, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 4.5698, + "loss/crossentropy": 1.7886858582496643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21677076816558838, + "step": 8748 + }, + { + "epoch": 0.175, + "grad_norm": 2.109375, + "grad_norm_var": 0.04397379557291667, + "learning_rate": 0.0001, + "loss": 4.1374, + "loss/crossentropy": 1.9257569313049316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098483294248581, + "step": 8750 + }, + { + "epoch": 0.17504, + "grad_norm": 2.078125, + "grad_norm_var": 0.04396870930989583, + "learning_rate": 0.0001, + "loss": 4.407, + "loss/crossentropy": 2.1609140634536743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846446931362152, + "step": 8752 + }, + { + "epoch": 0.17508, + "grad_norm": 2.0625, + "grad_norm_var": 0.0081939697265625, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.529700756072998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2515157088637352, + "step": 8754 + }, + { + "epoch": 0.17512, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008548736572265625, + "learning_rate": 0.0001, + "loss": 4.4965, + "loss/crossentropy": 2.1920565366744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24070476740598679, + "step": 8756 + }, + { + "epoch": 0.17516, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0097900390625, + "learning_rate": 0.0001, + "loss": 3.8416, + "loss/crossentropy": 1.7714558839797974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787928193807602, + "step": 8758 + }, + { + "epoch": 0.1752, + "grad_norm": 2.171875, + "grad_norm_var": 0.009666951497395833, + "learning_rate": 0.0001, + "loss": 4.4457, + "loss/crossentropy": 1.986818790435791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736730098724365, + "step": 8760 + }, + { + "epoch": 0.17524, + "grad_norm": 2.015625, + "grad_norm_var": 0.010453287760416667, + "learning_rate": 0.0001, + "loss": 4.2731, + "loss/crossentropy": 1.8152282238006592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22459527850151062, + "step": 8762 + }, + { + "epoch": 0.17528, + "grad_norm": 2.109375, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 4.4058, + "loss/crossentropy": 2.2312777042388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23017627000808716, + "step": 8764 + }, + { + "epoch": 0.17532, + "grad_norm": 2.375, + "grad_norm_var": 0.013252766927083333, + "learning_rate": 0.0001, + "loss": 4.4714, + "loss/crossentropy": 2.107849955558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224269270896912, + "step": 8766 + }, + { + "epoch": 0.17536, + "grad_norm": 2.15625, + "grad_norm_var": 0.013206990559895833, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 2.1588711738586426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200954109430313, + "step": 8768 + }, + { + "epoch": 0.1754, + "grad_norm": 2.1875, + "grad_norm_var": 0.0130126953125, + "learning_rate": 0.0001, + "loss": 4.4056, + "loss/crossentropy": 2.1355313062667847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048195779323578, + "step": 8770 + }, + { + "epoch": 0.17544, + "grad_norm": 2.125, + "grad_norm_var": 0.011557769775390626, + "learning_rate": 0.0001, + "loss": 4.3505, + "loss/crossentropy": 2.477591037750244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24459562450647354, + "step": 8772 + }, + { + "epoch": 0.17548, + "grad_norm": 2.046875, + "grad_norm_var": 0.00845947265625, + "learning_rate": 0.0001, + "loss": 4.4184, + "loss/crossentropy": 2.2577285766601562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23038798570632935, + "step": 8774 + }, + { + "epoch": 0.17552, + "grad_norm": 2.15625, + "grad_norm_var": 0.009813435872395833, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.239536762237549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22290733456611633, + "step": 8776 + }, + { + "epoch": 0.17556, + "grad_norm": 2.125, + "grad_norm_var": 0.007373046875, + "learning_rate": 0.0001, + "loss": 4.4351, + "loss/crossentropy": 1.9139958024024963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175864800810814, + "step": 8778 + }, + { + "epoch": 0.1756, + "grad_norm": 2.0625, + "grad_norm_var": 0.00888671875, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.0622661113739014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223067432641983, + "step": 8780 + }, + { + "epoch": 0.17564, + "grad_norm": 2.15625, + "grad_norm_var": 0.004524739583333334, + "learning_rate": 0.0001, + "loss": 4.4552, + "loss/crossentropy": 2.222475051879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181221306324005, + "step": 8782 + }, + { + "epoch": 0.17568, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007002512613932292, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.2612074613571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24079158157110214, + "step": 8784 + }, + { + "epoch": 0.17572, + "grad_norm": 2.078125, + "grad_norm_var": 0.005863189697265625, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 2.1629387736320496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21437199413776398, + "step": 8786 + }, + { + "epoch": 0.17576, + "grad_norm": 2.328125, + "grad_norm_var": 0.009492746988932292, + "learning_rate": 0.0001, + "loss": 4.6583, + "loss/crossentropy": 2.145151972770691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797442018985748, + "step": 8788 + }, + { + "epoch": 0.1758, + "grad_norm": 2.453125, + "grad_norm_var": 0.015457916259765624, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.0366984605789185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2191104218363762, + "step": 8790 + }, + { + "epoch": 0.17584, + "grad_norm": 2.203125, + "grad_norm_var": 0.014422353108723958, + "learning_rate": 0.0001, + "loss": 4.1604, + "loss/crossentropy": 2.1049715280532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230033777654171, + "step": 8792 + }, + { + "epoch": 0.17588, + "grad_norm": 2.15625, + "grad_norm_var": 0.014338938395182292, + "learning_rate": 0.0001, + "loss": 4.4327, + "loss/crossentropy": 2.2549991607666016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23084092140197754, + "step": 8794 + }, + { + "epoch": 0.17592, + "grad_norm": 2.125, + "grad_norm_var": 0.015547688802083333, + "learning_rate": 0.0001, + "loss": 4.4653, + "loss/crossentropy": 1.9873813390731812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879335910081863, + "step": 8796 + }, + { + "epoch": 0.17596, + "grad_norm": 2.140625, + "grad_norm_var": 0.015677897135416667, + "learning_rate": 0.0001, + "loss": 4.6371, + "loss/crossentropy": 2.0723283886909485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538055896759033, + "step": 8798 + }, + { + "epoch": 0.176, + "grad_norm": 2.125, + "grad_norm_var": 0.013099924723307291, + "learning_rate": 0.0001, + "loss": 4.0313, + "loss/crossentropy": 2.090642750263214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329270914196968, + "step": 8800 + }, + { + "epoch": 0.17604, + "grad_norm": 2.1875, + "grad_norm_var": 0.012672678629557291, + "learning_rate": 0.0001, + "loss": 4.4626, + "loss/crossentropy": 2.3432271480560303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2456662431359291, + "step": 8802 + }, + { + "epoch": 0.17608, + "grad_norm": 2.21875, + "grad_norm_var": 0.010465240478515625, + "learning_rate": 0.0001, + "loss": 4.5133, + "loss/crossentropy": 2.1210837364196777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22580894827842712, + "step": 8804 + }, + { + "epoch": 0.17612, + "grad_norm": 2.0, + "grad_norm_var": 0.005147043863932292, + "learning_rate": 0.0001, + "loss": 4.024, + "loss/crossentropy": 2.142494797706604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426588833332062, + "step": 8806 + }, + { + "epoch": 0.17616, + "grad_norm": 2.0625, + "grad_norm_var": 0.0049435933430989586, + "learning_rate": 0.0001, + "loss": 4.2877, + "loss/crossentropy": 1.9163227677345276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21529845893383026, + "step": 8808 + }, + { + "epoch": 0.1762, + "grad_norm": 2.140625, + "grad_norm_var": 0.004937489827473958, + "learning_rate": 0.0001, + "loss": 4.4776, + "loss/crossentropy": 2.1478612422943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354428842663765, + "step": 8810 + }, + { + "epoch": 0.17624, + "grad_norm": 2.15625, + "grad_norm_var": 0.0033854166666666668, + "learning_rate": 0.0001, + "loss": 4.435, + "loss/crossentropy": 2.1546601057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553270310163498, + "step": 8812 + }, + { + "epoch": 0.17628, + "grad_norm": 2.140625, + "grad_norm_var": 0.004233551025390625, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 2.0559862852096558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714127480983734, + "step": 8814 + }, + { + "epoch": 0.17632, + "grad_norm": 2.140625, + "grad_norm_var": 0.004078928629557292, + "learning_rate": 0.0001, + "loss": 4.3543, + "loss/crossentropy": 2.1340363025665283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22422882914543152, + "step": 8816 + }, + { + "epoch": 0.17636, + "grad_norm": 2.078125, + "grad_norm_var": 0.0038937886555989584, + "learning_rate": 0.0001, + "loss": 4.4464, + "loss/crossentropy": 2.265942335128784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23249086737632751, + "step": 8818 + }, + { + "epoch": 0.1764, + "grad_norm": 2.0625, + "grad_norm_var": 0.0031939188639322916, + "learning_rate": 0.0001, + "loss": 4.3187, + "loss/crossentropy": 2.245513081550598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22869569063186646, + "step": 8820 + }, + { + "epoch": 0.17644, + "grad_norm": 2.0, + "grad_norm_var": 0.006461334228515625, + "learning_rate": 0.0001, + "loss": 4.179, + "loss/crossentropy": 1.851025104522705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21989689767360687, + "step": 8822 + }, + { + "epoch": 0.17648, + "grad_norm": 2.09375, + "grad_norm_var": 0.006266021728515625, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.2972241640090942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793132066726685, + "step": 8824 + }, + { + "epoch": 0.17652, + "grad_norm": 2.078125, + "grad_norm_var": 0.006276194254557292, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.248735189437866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133597657084465, + "step": 8826 + }, + { + "epoch": 0.17656, + "grad_norm": 2.15625, + "grad_norm_var": 0.008314768473307291, + "learning_rate": 0.0001, + "loss": 4.4423, + "loss/crossentropy": 2.4173099994659424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23985996842384338, + "step": 8828 + }, + { + "epoch": 0.1766, + "grad_norm": 2.28125, + "grad_norm_var": 0.008698527018229167, + "learning_rate": 0.0001, + "loss": 4.5425, + "loss/crossentropy": 2.5017653703689575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26119648665189743, + "step": 8830 + }, + { + "epoch": 0.17664, + "grad_norm": 2.09375, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 1.9006813764572144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19938994944095612, + "step": 8832 + }, + { + "epoch": 0.17668, + "grad_norm": 2.046875, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1643, + "loss/crossentropy": 2.101746916770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20544035732746124, + "step": 8834 + }, + { + "epoch": 0.17672, + "grad_norm": 2.15625, + "grad_norm_var": 0.010542805989583333, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.1605160236358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23061934113502502, + "step": 8836 + }, + { + "epoch": 0.17676, + "grad_norm": 2.296875, + "grad_norm_var": 0.008967081705729166, + "learning_rate": 0.0001, + "loss": 4.4357, + "loss/crossentropy": 1.963772177696228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736530423164368, + "step": 8838 + }, + { + "epoch": 0.1768, + "grad_norm": 2.1875, + "grad_norm_var": 0.009733072916666667, + "learning_rate": 0.0001, + "loss": 4.3572, + "loss/crossentropy": 2.154300093650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22221273183822632, + "step": 8840 + }, + { + "epoch": 0.17684, + "grad_norm": 2.109375, + "grad_norm_var": 0.009501139322916666, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.1576497554779053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834726631641388, + "step": 8842 + }, + { + "epoch": 0.17688, + "grad_norm": 1.9375, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.1316112279891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166184037923813, + "step": 8844 + }, + { + "epoch": 0.17692, + "grad_norm": 2.15625, + "grad_norm_var": 0.0098052978515625, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.068525493144989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183748871088028, + "step": 8846 + }, + { + "epoch": 0.17696, + "grad_norm": 2.203125, + "grad_norm_var": 0.01051025390625, + "learning_rate": 0.0001, + "loss": 4.5128, + "loss/crossentropy": 2.185767650604248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25881427526474, + "step": 8848 + }, + { + "epoch": 0.177, + "grad_norm": 2.125, + "grad_norm_var": 0.010619099934895833, + "learning_rate": 0.0001, + "loss": 4.3073, + "loss/crossentropy": 1.979454517364502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20669714361429214, + "step": 8850 + }, + { + "epoch": 0.17704, + "grad_norm": 2.109375, + "grad_norm_var": 0.009373982747395834, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 2.3473092317581177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23232270777225494, + "step": 8852 + }, + { + "epoch": 0.17708, + "grad_norm": 2.046875, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.3732, + "loss/crossentropy": 2.461324691772461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.242417573928833, + "step": 8854 + }, + { + "epoch": 0.17712, + "grad_norm": 2.125, + "grad_norm_var": 0.008385976155598959, + "learning_rate": 0.0001, + "loss": 4.1107, + "loss/crossentropy": 1.5953214168548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17768454551696777, + "step": 8856 + }, + { + "epoch": 0.17716, + "grad_norm": 2.15625, + "grad_norm_var": 0.008377838134765624, + "learning_rate": 0.0001, + "loss": 4.4289, + "loss/crossentropy": 2.1969146728515625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21466109156608582, + "step": 8858 + }, + { + "epoch": 0.1772, + "grad_norm": 1.921875, + "grad_norm_var": 0.008459218343098958, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.243234634399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729554027318954, + "step": 8860 + }, + { + "epoch": 0.17724, + "grad_norm": 2.296875, + "grad_norm_var": 0.010628000895182291, + "learning_rate": 0.0001, + "loss": 4.2062, + "loss/crossentropy": 2.1855397820472717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24905066192150116, + "step": 8862 + }, + { + "epoch": 0.17728, + "grad_norm": 2.203125, + "grad_norm_var": 0.010628000895182291, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.320886254310608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22135943174362183, + "step": 8864 + }, + { + "epoch": 0.17732, + "grad_norm": 1.96875, + "grad_norm_var": 0.011736806233723958, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.140891909599304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109990492463112, + "step": 8866 + }, + { + "epoch": 0.17736, + "grad_norm": 1.921875, + "grad_norm_var": 0.013038889567057291, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.147824764251709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131204530596733, + "step": 8868 + }, + { + "epoch": 0.1774, + "grad_norm": 2.71875, + "grad_norm_var": 0.039249420166015625, + "learning_rate": 0.0001, + "loss": 4.7136, + "loss/crossentropy": 2.187807321548462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335718423128128, + "step": 8870 + }, + { + "epoch": 0.17744, + "grad_norm": 2.1875, + "grad_norm_var": 0.037287394205729164, + "learning_rate": 0.0001, + "loss": 4.5265, + "loss/crossentropy": 2.3127458095550537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385788857936859, + "step": 8872 + }, + { + "epoch": 0.17748, + "grad_norm": 2.046875, + "grad_norm_var": 0.0369537353515625, + "learning_rate": 0.0001, + "loss": 4.1364, + "loss/crossentropy": 1.859586775302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204575777053833, + "step": 8874 + }, + { + "epoch": 0.17752, + "grad_norm": 2.0, + "grad_norm_var": 0.03585611979166667, + "learning_rate": 0.0001, + "loss": 4.1993, + "loss/crossentropy": 1.9626107215881348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016598492860794, + "step": 8876 + }, + { + "epoch": 0.17756, + "grad_norm": 2.046875, + "grad_norm_var": 0.034077962239583336, + "learning_rate": 0.0001, + "loss": 4.462, + "loss/crossentropy": 2.1130539774894714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21995095163583755, + "step": 8878 + }, + { + "epoch": 0.1776, + "grad_norm": 2.203125, + "grad_norm_var": 0.035374959309895836, + "learning_rate": 0.0001, + "loss": 4.4677, + "loss/crossentropy": 1.8914743065834045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437531173229218, + "step": 8880 + }, + { + "epoch": 0.17764, + "grad_norm": 2.125, + "grad_norm_var": 0.032763671875, + "learning_rate": 0.0001, + "loss": 4.4975, + "loss/crossentropy": 2.2135708332061768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2478664517402649, + "step": 8882 + }, + { + "epoch": 0.17768, + "grad_norm": 2.203125, + "grad_norm_var": 0.028922526041666667, + "learning_rate": 0.0001, + "loss": 4.3763, + "loss/crossentropy": 2.194110333919525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366471290588379, + "step": 8884 + }, + { + "epoch": 0.17772, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009822336832682292, + "learning_rate": 0.0001, + "loss": 4.2844, + "loss/crossentropy": 2.4365748167037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25842973589897156, + "step": 8886 + }, + { + "epoch": 0.17776, + "grad_norm": 2.078125, + "grad_norm_var": 0.010001373291015626, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 1.9800177216529846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21982619166374207, + "step": 8888 + }, + { + "epoch": 0.1778, + "grad_norm": 2.15625, + "grad_norm_var": 0.009956614176432291, + "learning_rate": 0.0001, + "loss": 4.3465, + "loss/crossentropy": 2.1437748670578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21494000405073166, + "step": 8890 + }, + { + "epoch": 0.17784, + "grad_norm": 2.1875, + "grad_norm_var": 0.008957672119140624, + "learning_rate": 0.0001, + "loss": 4.4051, + "loss/crossentropy": 2.0610267519950867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473593801259995, + "step": 8892 + }, + { + "epoch": 0.17788, + "grad_norm": 1.953125, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.4637, + "loss/crossentropy": 2.261958599090576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250354364514351, + "step": 8894 + }, + { + "epoch": 0.17792, + "grad_norm": 2.109375, + "grad_norm_var": 0.012237294514973959, + "learning_rate": 0.0001, + "loss": 4.3508, + "loss/crossentropy": 2.3689773082733154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22185539454221725, + "step": 8896 + }, + { + "epoch": 0.17796, + "grad_norm": 2.15625, + "grad_norm_var": 0.013034820556640625, + "learning_rate": 0.0001, + "loss": 4.7432, + "loss/crossentropy": 2.612341523170471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2561237961053848, + "step": 8898 + }, + { + "epoch": 0.178, + "grad_norm": 2.0625, + "grad_norm_var": 0.010990142822265625, + "learning_rate": 0.0001, + "loss": 4.5876, + "loss/crossentropy": 2.1230576038360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23115848749876022, + "step": 8900 + }, + { + "epoch": 0.17804, + "grad_norm": 2.109375, + "grad_norm_var": 0.009626261393229167, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.200004458427429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002413868904114, + "step": 8902 + }, + { + "epoch": 0.17808, + "grad_norm": 2.03125, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.323713779449463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22923698276281357, + "step": 8904 + }, + { + "epoch": 0.17812, + "grad_norm": 2.140625, + "grad_norm_var": 0.00982666015625, + "learning_rate": 0.0001, + "loss": 4.3079, + "loss/crossentropy": 2.038426458835602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465667337179184, + "step": 8906 + }, + { + "epoch": 0.17816, + "grad_norm": 2.109375, + "grad_norm_var": 0.01217041015625, + "learning_rate": 0.0001, + "loss": 4.4498, + "loss/crossentropy": 2.3639097213745117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2378860041499138, + "step": 8908 + }, + { + "epoch": 0.1782, + "grad_norm": 2.046875, + "grad_norm_var": 0.00699462890625, + "learning_rate": 0.0001, + "loss": 4.261, + "loss/crossentropy": 1.8291080594062805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894887506961823, + "step": 8910 + }, + { + "epoch": 0.17824, + "grad_norm": 2.265625, + "grad_norm_var": 0.011263020833333333, + "learning_rate": 0.0001, + "loss": 4.5606, + "loss/crossentropy": 2.3113714456558228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24526391178369522, + "step": 8912 + }, + { + "epoch": 0.17828, + "grad_norm": 2.109375, + "grad_norm_var": 0.011449178059895834, + "learning_rate": 0.0001, + "loss": 4.3016, + "loss/crossentropy": 2.114617943763733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305133402347565, + "step": 8914 + }, + { + "epoch": 0.17832, + "grad_norm": 2.15625, + "grad_norm_var": 0.014975738525390626, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 1.6893808841705322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195494800806046, + "step": 8916 + }, + { + "epoch": 0.17836, + "grad_norm": 2.234375, + "grad_norm_var": 0.016721343994140624, + "learning_rate": 0.0001, + "loss": 4.2501, + "loss/crossentropy": 1.829396367073059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181780844926834, + "step": 8918 + }, + { + "epoch": 0.1784, + "grad_norm": 2.03125, + "grad_norm_var": 0.01869481404622396, + "learning_rate": 0.0001, + "loss": 4.1542, + "loss/crossentropy": 1.8910154104232788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975399017333984, + "step": 8920 + }, + { + "epoch": 0.17844, + "grad_norm": 2.140625, + "grad_norm_var": 0.01953709920247396, + "learning_rate": 0.0001, + "loss": 4.3327, + "loss/crossentropy": 2.0501255989074707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22336408495903015, + "step": 8922 + }, + { + "epoch": 0.17848, + "grad_norm": 2.109375, + "grad_norm_var": 0.01740086873372396, + "learning_rate": 0.0001, + "loss": 4.3035, + "loss/crossentropy": 2.3023892641067505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23251917958259583, + "step": 8924 + }, + { + "epoch": 0.17852, + "grad_norm": 2.015625, + "grad_norm_var": 0.01822077433268229, + "learning_rate": 0.0001, + "loss": 4.0592, + "loss/crossentropy": 2.0030421018600464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997069239616394, + "step": 8926 + }, + { + "epoch": 0.17856, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013741048177083333, + "learning_rate": 0.0001, + "loss": 4.2568, + "loss/crossentropy": 2.2309017181396484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22145257890224457, + "step": 8928 + }, + { + "epoch": 0.1786, + "grad_norm": 2.171875, + "grad_norm_var": 0.013728841145833334, + "learning_rate": 0.0001, + "loss": 4.4366, + "loss/crossentropy": 2.1135157346725464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21580957621335983, + "step": 8930 + }, + { + "epoch": 0.17864, + "grad_norm": 2.015625, + "grad_norm_var": 0.011146799723307291, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 2.098900556564331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469021379947662, + "step": 8932 + }, + { + "epoch": 0.17868, + "grad_norm": 2.21875, + "grad_norm_var": 0.0103179931640625, + "learning_rate": 0.0001, + "loss": 4.2702, + "loss/crossentropy": 2.1558337211608887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24765773862600327, + "step": 8934 + }, + { + "epoch": 0.17872, + "grad_norm": 2.09375, + "grad_norm_var": 0.006004842122395834, + "learning_rate": 0.0001, + "loss": 4.3318, + "loss/crossentropy": 2.141040623188019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490675747394562, + "step": 8936 + }, + { + "epoch": 0.17876, + "grad_norm": 2.171875, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.4126, + "loss/crossentropy": 2.1036806106567383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23260055482387543, + "step": 8938 + }, + { + "epoch": 0.1788, + "grad_norm": 1.984375, + "grad_norm_var": 0.013036092122395834, + "learning_rate": 0.0001, + "loss": 4.4151, + "loss/crossentropy": 1.9403663277626038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21941428631544113, + "step": 8940 + }, + { + "epoch": 0.17884, + "grad_norm": 2.03125, + "grad_norm_var": 0.014717610677083333, + "learning_rate": 0.0001, + "loss": 4.2, + "loss/crossentropy": 1.8589079976081848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21533852070569992, + "step": 8942 + }, + { + "epoch": 0.17888, + "grad_norm": 2.21875, + "grad_norm_var": 0.013816070556640626, + "learning_rate": 0.0001, + "loss": 4.2476, + "loss/crossentropy": 2.3136903643608093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23139026761054993, + "step": 8944 + }, + { + "epoch": 0.17892, + "grad_norm": 2.109375, + "grad_norm_var": 0.012835439046223958, + "learning_rate": 0.0001, + "loss": 4.2669, + "loss/crossentropy": 2.305663585662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263737350702286, + "step": 8946 + }, + { + "epoch": 0.17896, + "grad_norm": 2.5625, + "grad_norm_var": 0.02195002237955729, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.0450612902641296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124529778957367, + "step": 8948 + }, + { + "epoch": 0.179, + "grad_norm": 2.3125, + "grad_norm_var": 0.020393880208333333, + "learning_rate": 0.0001, + "loss": 4.4229, + "loss/crossentropy": 2.435065984725952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24603386223316193, + "step": 8950 + }, + { + "epoch": 0.17904, + "grad_norm": 2.234375, + "grad_norm_var": 0.02295099894205729, + "learning_rate": 0.0001, + "loss": 4.2604, + "loss/crossentropy": 2.092818021774292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20905248820781708, + "step": 8952 + }, + { + "epoch": 0.17908, + "grad_norm": 2.109375, + "grad_norm_var": 0.024621327718098957, + "learning_rate": 0.0001, + "loss": 4.2539, + "loss/crossentropy": 2.08588969707489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142588049173355, + "step": 8954 + }, + { + "epoch": 0.17912, + "grad_norm": 2.25, + "grad_norm_var": 0.021144358317057292, + "learning_rate": 0.0001, + "loss": 4.3958, + "loss/crossentropy": 1.9161878824234009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20626582205295563, + "step": 8956 + }, + { + "epoch": 0.17916, + "grad_norm": 2.078125, + "grad_norm_var": 0.020182037353515626, + "learning_rate": 0.0001, + "loss": 4.3726, + "loss/crossentropy": 2.3072937726974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22028075903654099, + "step": 8958 + }, + { + "epoch": 0.1792, + "grad_norm": 2.15625, + "grad_norm_var": 0.021109771728515626, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.3053938150405884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22212930023670197, + "step": 8960 + }, + { + "epoch": 0.17924, + "grad_norm": 2.0625, + "grad_norm_var": 0.02269261678059896, + "learning_rate": 0.0001, + "loss": 4.5137, + "loss/crossentropy": 1.9130414128303528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036083459854126, + "step": 8962 + }, + { + "epoch": 0.17928, + "grad_norm": 2.0, + "grad_norm_var": 0.013242340087890625, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.3808066844940186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116570845246315, + "step": 8964 + }, + { + "epoch": 0.17932, + "grad_norm": 2.125, + "grad_norm_var": 0.009549713134765625, + "learning_rate": 0.0001, + "loss": 4.296, + "loss/crossentropy": 2.180716395378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2275484874844551, + "step": 8966 + }, + { + "epoch": 0.17936, + "grad_norm": 1.984375, + "grad_norm_var": 0.008837890625, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 2.027850866317749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011229619383812, + "step": 8968 + }, + { + "epoch": 0.1794, + "grad_norm": 2.171875, + "grad_norm_var": 0.017447916666666667, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.077622890472412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21361806988716125, + "step": 8970 + }, + { + "epoch": 0.17944, + "grad_norm": 2.03125, + "grad_norm_var": 0.017513020833333334, + "learning_rate": 0.0001, + "loss": 4.1972, + "loss/crossentropy": 2.004905104637146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110886335372925, + "step": 8972 + }, + { + "epoch": 0.17948, + "grad_norm": 2.21875, + "grad_norm_var": 0.019466145833333334, + "learning_rate": 0.0001, + "loss": 4.5026, + "loss/crossentropy": 2.1859925389289856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23011507838964462, + "step": 8974 + }, + { + "epoch": 0.17952, + "grad_norm": 2.21875, + "grad_norm_var": 0.019840494791666666, + "learning_rate": 0.0001, + "loss": 4.2908, + "loss/crossentropy": 1.8372295498847961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171105071902275, + "step": 8976 + }, + { + "epoch": 0.17956, + "grad_norm": 2.328125, + "grad_norm_var": 0.020197550455729168, + "learning_rate": 0.0001, + "loss": 4.3887, + "loss/crossentropy": 2.127313494682312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22017831355333328, + "step": 8978 + }, + { + "epoch": 0.1796, + "grad_norm": 2.421875, + "grad_norm_var": 0.022098795572916666, + "learning_rate": 0.0001, + "loss": 4.3389, + "loss/crossentropy": 2.121580421924591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24898843467235565, + "step": 8980 + }, + { + "epoch": 0.17964, + "grad_norm": 2.109375, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 4.1127, + "loss/crossentropy": 2.0973563194274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21355029940605164, + "step": 8982 + }, + { + "epoch": 0.17968, + "grad_norm": 2.125, + "grad_norm_var": 0.018896484375, + "learning_rate": 0.0001, + "loss": 4.3898, + "loss/crossentropy": 2.1109927892684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096879929304123, + "step": 8984 + }, + { + "epoch": 0.17972, + "grad_norm": 2.078125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.1264, + "loss/crossentropy": 1.954129159450531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20745816081762314, + "step": 8986 + }, + { + "epoch": 0.17976, + "grad_norm": 2.390625, + "grad_norm_var": 0.013483683268229166, + "learning_rate": 0.0001, + "loss": 4.5357, + "loss/crossentropy": 1.875806748867035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19150983542203903, + "step": 8988 + }, + { + "epoch": 0.1798, + "grad_norm": 2.109375, + "grad_norm_var": 0.01451416015625, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.3101617097854614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2393202781677246, + "step": 8990 + }, + { + "epoch": 0.17984, + "grad_norm": 2.234375, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.4954, + "loss/crossentropy": 2.3156551122665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23346291482448578, + "step": 8992 + }, + { + "epoch": 0.17988, + "grad_norm": 2.1875, + "grad_norm_var": 0.013966623942057292, + "learning_rate": 0.0001, + "loss": 4.3675, + "loss/crossentropy": 2.4437999725341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172817811369896, + "step": 8994 + }, + { + "epoch": 0.17992, + "grad_norm": 2.046875, + "grad_norm_var": 0.009437815348307291, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.0451250076293945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22556670010089874, + "step": 8996 + }, + { + "epoch": 0.17996, + "grad_norm": 2.21875, + "grad_norm_var": 0.009852854410807292, + "learning_rate": 0.0001, + "loss": 4.5654, + "loss/crossentropy": 2.3135393857955933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313593551516533, + "step": 8998 + }, + { + "epoch": 0.18, + "grad_norm": 2.125, + "grad_norm_var": 0.010530344645182292, + "learning_rate": 0.0001, + "loss": 4.3001, + "loss/crossentropy": 1.9934805035591125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815817266702652, + "step": 9000 + }, + { + "epoch": 0.18004, + "grad_norm": 2.125, + "grad_norm_var": 0.011982981363932292, + "learning_rate": 0.0001, + "loss": 4.6193, + "loss/crossentropy": 2.1770662665367126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23489046096801758, + "step": 9002 + }, + { + "epoch": 0.18008, + "grad_norm": 2.3125, + "grad_norm_var": 0.009779612223307291, + "learning_rate": 0.0001, + "loss": 4.3708, + "loss/crossentropy": 1.9791364073753357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139936238527298, + "step": 9004 + }, + { + "epoch": 0.18012, + "grad_norm": 2.015625, + "grad_norm_var": 0.011445871988932292, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 2.2092931270599365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22620443254709244, + "step": 9006 + }, + { + "epoch": 0.18016, + "grad_norm": 2.171875, + "grad_norm_var": 0.011034901936848958, + "learning_rate": 0.0001, + "loss": 4.3114, + "loss/crossentropy": 2.123443365097046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21943332999944687, + "step": 9008 + }, + { + "epoch": 0.1802, + "grad_norm": 2.09375, + "grad_norm_var": 0.009212239583333334, + "learning_rate": 0.0001, + "loss": 4.4666, + "loss/crossentropy": 2.243329405784607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108020961284637, + "step": 9010 + }, + { + "epoch": 0.18024, + "grad_norm": 2.1875, + "grad_norm_var": 0.008426920572916666, + "learning_rate": 0.0001, + "loss": 4.3687, + "loss/crossentropy": 2.366227388381958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21806316077709198, + "step": 9012 + }, + { + "epoch": 0.18028, + "grad_norm": 2.03125, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 3.8255, + "loss/crossentropy": 1.768812358379364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861925944685936, + "step": 9014 + }, + { + "epoch": 0.18032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012225087483723958, + "learning_rate": 0.0001, + "loss": 4.1236, + "loss/crossentropy": 1.9376537799835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18691913783550262, + "step": 9016 + }, + { + "epoch": 0.18036, + "grad_norm": 2.140625, + "grad_norm_var": 0.010709381103515625, + "learning_rate": 0.0001, + "loss": 4.2748, + "loss/crossentropy": 2.3026299476623535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23757526278495789, + "step": 9018 + }, + { + "epoch": 0.1804, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010057576497395833, + "learning_rate": 0.0001, + "loss": 4.0026, + "loss/crossentropy": 1.9697216153144836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21453910320997238, + "step": 9020 + }, + { + "epoch": 0.18044, + "grad_norm": 2.0625, + "grad_norm_var": 0.011058553059895834, + "learning_rate": 0.0001, + "loss": 4.5457, + "loss/crossentropy": 2.257638931274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24236120283603668, + "step": 9022 + }, + { + "epoch": 0.18048, + "grad_norm": 2.0625, + "grad_norm_var": 0.0115142822265625, + "learning_rate": 0.0001, + "loss": 4.3323, + "loss/crossentropy": 2.244320869445801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25429578125476837, + "step": 9024 + }, + { + "epoch": 0.18052, + "grad_norm": 2.21875, + "grad_norm_var": 0.012214152018229167, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.1105872988700867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23081901669502258, + "step": 9026 + }, + { + "epoch": 0.18056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012839508056640626, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 1.9657647609710693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22111424803733826, + "step": 9028 + }, + { + "epoch": 0.1806, + "grad_norm": 2.125, + "grad_norm_var": 0.012308502197265625, + "learning_rate": 0.0001, + "loss": 4.4019, + "loss/crossentropy": 2.0759438276290894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22763221710920334, + "step": 9030 + }, + { + "epoch": 0.18064, + "grad_norm": 2.125, + "grad_norm_var": 0.010888417561848959, + "learning_rate": 0.0001, + "loss": 3.9999, + "loss/crossentropy": 2.0250572562217712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279352843761444, + "step": 9032 + }, + { + "epoch": 0.18068, + "grad_norm": 2.09375, + "grad_norm_var": 0.010534413655598958, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 2.270031213760376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23242096602916718, + "step": 9034 + }, + { + "epoch": 0.18072, + "grad_norm": 2.25, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.10041344165802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23192601650953293, + "step": 9036 + }, + { + "epoch": 0.18076, + "grad_norm": 2.15625, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1041, + "loss/crossentropy": 2.0255953073501587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23707614094018936, + "step": 9038 + }, + { + "epoch": 0.1808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010406239827473959, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8162729740142822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19466694444417953, + "step": 9040 + }, + { + "epoch": 0.18084, + "grad_norm": 2.203125, + "grad_norm_var": 0.010170237223307291, + "learning_rate": 0.0001, + "loss": 4.5543, + "loss/crossentropy": 2.1271599531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22750889509916306, + "step": 9042 + }, + { + "epoch": 0.18088, + "grad_norm": 2.171875, + "grad_norm_var": 0.009447224934895833, + "learning_rate": 0.0001, + "loss": 4.5377, + "loss/crossentropy": 2.3638603687286377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2456393539905548, + "step": 9044 + }, + { + "epoch": 0.18092, + "grad_norm": 2.125, + "grad_norm_var": 0.019437662760416665, + "learning_rate": 0.0001, + "loss": 4.3413, + "loss/crossentropy": 1.5851669907569885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994101464748383, + "step": 9046 + }, + { + "epoch": 0.18096, + "grad_norm": 2.1875, + "grad_norm_var": 0.017789459228515624, + "learning_rate": 0.0001, + "loss": 4.2284, + "loss/crossentropy": 1.7990906834602356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18919725716114044, + "step": 9048 + }, + { + "epoch": 0.181, + "grad_norm": 2.28125, + "grad_norm_var": 0.017490386962890625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.0975595712661743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23837832361459732, + "step": 9050 + }, + { + "epoch": 0.18104, + "grad_norm": 2.0625, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 4.1304, + "loss/crossentropy": 2.1970856189727783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23219536244869232, + "step": 9052 + }, + { + "epoch": 0.18108, + "grad_norm": 2.15625, + "grad_norm_var": 0.017292277018229166, + "learning_rate": 0.0001, + "loss": 4.211, + "loss/crossentropy": 2.0846009850502014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775136351585388, + "step": 9054 + }, + { + "epoch": 0.18112, + "grad_norm": 2.0625, + "grad_norm_var": 0.015075429280598959, + "learning_rate": 0.0001, + "loss": 4.0415, + "loss/crossentropy": 1.662496030330658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223516643047333, + "step": 9056 + }, + { + "epoch": 0.18116, + "grad_norm": 2.046875, + "grad_norm_var": 0.016078440348307292, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 2.126902401447296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193778082728386, + "step": 9058 + }, + { + "epoch": 0.1812, + "grad_norm": 2.984375, + "grad_norm_var": 0.062459309895833336, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.9529814720153809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19026879966259003, + "step": 9060 + }, + { + "epoch": 0.18124, + "grad_norm": 1.953125, + "grad_norm_var": 0.05981852213541667, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.140450179576874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23864319920539856, + "step": 9062 + }, + { + "epoch": 0.18128, + "grad_norm": 2.015625, + "grad_norm_var": 0.06083882649739583, + "learning_rate": 0.0001, + "loss": 4.3442, + "loss/crossentropy": 2.4139195680618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360195592045784, + "step": 9064 + }, + { + "epoch": 0.18132, + "grad_norm": 2.046875, + "grad_norm_var": 0.06083882649739583, + "learning_rate": 0.0001, + "loss": 4.5122, + "loss/crossentropy": 2.190356135368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22671552002429962, + "step": 9066 + }, + { + "epoch": 0.18136, + "grad_norm": 2.078125, + "grad_norm_var": 0.05916315714518229, + "learning_rate": 0.0001, + "loss": 4.281, + "loss/crossentropy": 1.9428812861442566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22318705916404724, + "step": 9068 + }, + { + "epoch": 0.1814, + "grad_norm": 2.046875, + "grad_norm_var": 0.059242502848307295, + "learning_rate": 0.0001, + "loss": 4.4628, + "loss/crossentropy": 2.296473503112793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22005227208137512, + "step": 9070 + }, + { + "epoch": 0.18144, + "grad_norm": 2.1875, + "grad_norm_var": 0.05919774373372396, + "learning_rate": 0.0001, + "loss": 4.5848, + "loss/crossentropy": 2.2118901014328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22189343720674515, + "step": 9072 + }, + { + "epoch": 0.18148, + "grad_norm": 2.15625, + "grad_norm_var": 0.05810114542643229, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 1.8919037580490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986190229654312, + "step": 9074 + }, + { + "epoch": 0.18152, + "grad_norm": 2.171875, + "grad_norm_var": 0.011161295572916667, + "learning_rate": 0.0001, + "loss": 4.3046, + "loss/crossentropy": 1.975312054157257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20403321832418442, + "step": 9076 + }, + { + "epoch": 0.18156, + "grad_norm": 2.265625, + "grad_norm_var": 0.0075032552083333336, + "learning_rate": 0.0001, + "loss": 4.2175, + "loss/crossentropy": 1.8076966404914856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19708115607500076, + "step": 9078 + }, + { + "epoch": 0.1816, + "grad_norm": 2.265625, + "grad_norm_var": 0.007112630208333333, + "learning_rate": 0.0001, + "loss": 4.2166, + "loss/crossentropy": 2.101171374320984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22123181074857712, + "step": 9080 + }, + { + "epoch": 0.18164, + "grad_norm": 2.1875, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 2.09942090511322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21761803328990936, + "step": 9082 + }, + { + "epoch": 0.18168, + "grad_norm": 2.1875, + "grad_norm_var": 0.008072916666666667, + "learning_rate": 0.0001, + "loss": 4.4802, + "loss/crossentropy": 2.4418424367904663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24908769130706787, + "step": 9084 + }, + { + "epoch": 0.18172, + "grad_norm": 2.109375, + "grad_norm_var": 0.0087890625, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 1.944950520992279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515284687280655, + "step": 9086 + }, + { + "epoch": 0.18176, + "grad_norm": 2.21875, + "grad_norm_var": 0.007991536458333334, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 1.870418667793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427027344703674, + "step": 9088 + }, + { + "epoch": 0.1818, + "grad_norm": 2.171875, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.1323, + "loss/crossentropy": 1.9338520169258118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19928501546382904, + "step": 9090 + }, + { + "epoch": 0.18184, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008876291910807292, + "learning_rate": 0.0001, + "loss": 3.9734, + "loss/crossentropy": 1.7826221585273743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19778436422348022, + "step": 9092 + }, + { + "epoch": 0.18188, + "grad_norm": 2.078125, + "grad_norm_var": 0.007458241780598959, + "learning_rate": 0.0001, + "loss": 4.3537, + "loss/crossentropy": 2.2922680377960205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23613491654396057, + "step": 9094 + }, + { + "epoch": 0.18192, + "grad_norm": 2.296875, + "grad_norm_var": 0.15102513631184897, + "learning_rate": 0.0001, + "loss": 4.6789, + "loss/crossentropy": 2.1802788972854614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969536542892456, + "step": 9096 + }, + { + "epoch": 0.18196, + "grad_norm": 2.15625, + "grad_norm_var": 0.1498308817545573, + "learning_rate": 0.0001, + "loss": 4.1715, + "loss/crossentropy": 1.9129992723464966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942882016301155, + "step": 9098 + }, + { + "epoch": 0.182, + "grad_norm": 2.203125, + "grad_norm_var": 0.14738337198893228, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 1.8288249969482422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879616051912308, + "step": 9100 + }, + { + "epoch": 0.18204, + "grad_norm": 2.3125, + "grad_norm_var": 0.14580663045247397, + "learning_rate": 0.0001, + "loss": 4.2824, + "loss/crossentropy": 2.039812684059143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374420076608658, + "step": 9102 + }, + { + "epoch": 0.18208, + "grad_norm": 1.9609375, + "grad_norm_var": 0.15449193318684895, + "learning_rate": 0.0001, + "loss": 4.1862, + "loss/crossentropy": 2.1177414059638977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193453460931778, + "step": 9104 + }, + { + "epoch": 0.18212, + "grad_norm": 2.28125, + "grad_norm_var": 0.16704076131184895, + "learning_rate": 0.0001, + "loss": 4.4869, + "loss/crossentropy": 2.171097159385681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21211445331573486, + "step": 9106 + }, + { + "epoch": 0.18216, + "grad_norm": 2.359375, + "grad_norm_var": 0.16413548787434895, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.9781638383865356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21307373046875, + "step": 9108 + }, + { + "epoch": 0.1822, + "grad_norm": 1.984375, + "grad_norm_var": 0.17157363891601562, + "learning_rate": 0.0001, + "loss": 4.2454, + "loss/crossentropy": 2.0868560075759888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633937418460846, + "step": 9110 + }, + { + "epoch": 0.18224, + "grad_norm": 2.03125, + "grad_norm_var": 0.04592463175455729, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.079172134399414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20709815621376038, + "step": 9112 + }, + { + "epoch": 0.18228, + "grad_norm": 2.359375, + "grad_norm_var": 0.04835383097330729, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 2.100913643836975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831409633159637, + "step": 9114 + }, + { + "epoch": 0.18232, + "grad_norm": 2.203125, + "grad_norm_var": 0.049478912353515626, + "learning_rate": 0.0001, + "loss": 4.4087, + "loss/crossentropy": 2.018395781517029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22545243054628372, + "step": 9116 + }, + { + "epoch": 0.18236, + "grad_norm": 2.203125, + "grad_norm_var": 0.04278132120768229, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 1.9177632331848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23346271365880966, + "step": 9118 + }, + { + "epoch": 0.1824, + "grad_norm": 2.0625, + "grad_norm_var": 0.038358306884765624, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.1122357845306396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22593770176172256, + "step": 9120 + }, + { + "epoch": 0.18244, + "grad_norm": 2.03125, + "grad_norm_var": 0.014021555582682291, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.7920495867729187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127150148153305, + "step": 9122 + }, + { + "epoch": 0.18248, + "grad_norm": 2.1875, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.2061, + "loss/crossentropy": 2.2180920839309692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25088224560022354, + "step": 9124 + }, + { + "epoch": 0.18252, + "grad_norm": 2.1875, + "grad_norm_var": 0.008185831705729167, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.8999969959259033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616832166910172, + "step": 9126 + }, + { + "epoch": 0.18256, + "grad_norm": 2.125, + "grad_norm_var": 0.0081451416015625, + "learning_rate": 0.0001, + "loss": 4.5351, + "loss/crossentropy": 1.9424527287483215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016742825508118, + "step": 9128 + }, + { + "epoch": 0.1826, + "grad_norm": 2.0625, + "grad_norm_var": 0.0049468994140625, + "learning_rate": 0.0001, + "loss": 4.4432, + "loss/crossentropy": 2.0539366006851196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26818516850471497, + "step": 9130 + }, + { + "epoch": 0.18264, + "grad_norm": 2.0625, + "grad_norm_var": 0.00458984375, + "learning_rate": 0.0001, + "loss": 4.1634, + "loss/crossentropy": 1.9180519580841064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067112922668457, + "step": 9132 + }, + { + "epoch": 0.18268, + "grad_norm": 2.078125, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 3.9913, + "loss/crossentropy": 1.7417545318603516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970825269818306, + "step": 9134 + }, + { + "epoch": 0.18272, + "grad_norm": 2.09375, + "grad_norm_var": 0.004833984375, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 1.9498217701911926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281737089157104, + "step": 9136 + }, + { + "epoch": 0.18276, + "grad_norm": 2.21875, + "grad_norm_var": 0.005615234375, + "learning_rate": 0.0001, + "loss": 4.3842, + "loss/crossentropy": 2.140692949295044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24291902035474777, + "step": 9138 + }, + { + "epoch": 0.1828, + "grad_norm": 1.984375, + "grad_norm_var": 0.0069976806640625, + "learning_rate": 0.0001, + "loss": 4.1785, + "loss/crossentropy": 2.3510342836380005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23326712846755981, + "step": 9140 + }, + { + "epoch": 0.18284, + "grad_norm": 2.140625, + "grad_norm_var": 0.0072743733723958336, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 2.049591898918152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21118033677339554, + "step": 9142 + }, + { + "epoch": 0.18288, + "grad_norm": 2.109375, + "grad_norm_var": 0.005631510416666667, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 1.949703335762024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161198407411575, + "step": 9144 + }, + { + "epoch": 0.18292, + "grad_norm": 2.109375, + "grad_norm_var": 0.005399576822916667, + "learning_rate": 0.0001, + "loss": 4.5602, + "loss/crossentropy": 2.1442413330078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281076222658157, + "step": 9146 + }, + { + "epoch": 0.18296, + "grad_norm": 2.078125, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.0848, + "loss/crossentropy": 2.103494882583618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892268419265747, + "step": 9148 + }, + { + "epoch": 0.183, + "grad_norm": 2.25, + "grad_norm_var": 0.01060791015625, + "learning_rate": 0.0001, + "loss": 4.133, + "loss/crossentropy": 2.1544495224952698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008143737912178, + "step": 9150 + }, + { + "epoch": 0.18304, + "grad_norm": 1.984375, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.021821677684784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975126326084137, + "step": 9152 + }, + { + "epoch": 0.18308, + "grad_norm": 1.921875, + "grad_norm_var": 0.0102203369140625, + "learning_rate": 0.0001, + "loss": 4.2858, + "loss/crossentropy": 2.109215199947357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19855067878961563, + "step": 9154 + }, + { + "epoch": 0.18312, + "grad_norm": 2.09375, + "grad_norm_var": 0.011165364583333334, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.7631941437721252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20584283769130707, + "step": 9156 + }, + { + "epoch": 0.18316, + "grad_norm": 2.171875, + "grad_norm_var": 0.0116851806640625, + "learning_rate": 0.0001, + "loss": 4.4005, + "loss/crossentropy": 2.131524443626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151477411389351, + "step": 9158 + }, + { + "epoch": 0.1832, + "grad_norm": 2.015625, + "grad_norm_var": 0.01334228515625, + "learning_rate": 0.0001, + "loss": 4.2729, + "loss/crossentropy": 2.018375277519226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21982873231172562, + "step": 9160 + }, + { + "epoch": 0.18324, + "grad_norm": 2.09375, + "grad_norm_var": 0.0154205322265625, + "learning_rate": 0.0001, + "loss": 4.3302, + "loss/crossentropy": 2.217617154121399, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257251739501953, + "step": 9162 + }, + { + "epoch": 0.18328, + "grad_norm": 2.15625, + "grad_norm_var": 0.014606730143229166, + "learning_rate": 0.0001, + "loss": 4.227, + "loss/crossentropy": 1.8632460832595825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19728046655654907, + "step": 9164 + }, + { + "epoch": 0.18332, + "grad_norm": 2.0625, + "grad_norm_var": 0.012137858072916667, + "learning_rate": 0.0001, + "loss": 4.5335, + "loss/crossentropy": 2.2818111181259155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22397568821907043, + "step": 9166 + }, + { + "epoch": 0.18336, + "grad_norm": 2.109375, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.1209938526153564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490206360816956, + "step": 9168 + }, + { + "epoch": 0.1834, + "grad_norm": 2.046875, + "grad_norm_var": 0.00924072265625, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.141623795032501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710850298404694, + "step": 9170 + }, + { + "epoch": 0.18344, + "grad_norm": 2.125, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 4.2063, + "loss/crossentropy": 2.165239691734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.235738106071949, + "step": 9172 + }, + { + "epoch": 0.18348, + "grad_norm": 2.015625, + "grad_norm_var": 0.008103179931640624, + "learning_rate": 0.0001, + "loss": 4.0186, + "loss/crossentropy": 1.8649475574493408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088451236486435, + "step": 9174 + }, + { + "epoch": 0.18352, + "grad_norm": 2.15625, + "grad_norm_var": 0.007012685139973958, + "learning_rate": 0.0001, + "loss": 4.2853, + "loss/crossentropy": 2.312218189239502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.236283540725708, + "step": 9176 + }, + { + "epoch": 0.18356, + "grad_norm": 2.109375, + "grad_norm_var": 0.008699544270833333, + "learning_rate": 0.0001, + "loss": 3.9626, + "loss/crossentropy": 2.0149282217025757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434535831212997, + "step": 9178 + }, + { + "epoch": 0.1836, + "grad_norm": 2.125, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 4.5081, + "loss/crossentropy": 2.499966621398926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26318275928497314, + "step": 9180 + }, + { + "epoch": 0.18364, + "grad_norm": 2.140625, + "grad_norm_var": 0.0084136962890625, + "learning_rate": 0.0001, + "loss": 4.4498, + "loss/crossentropy": 2.0454984307289124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202616035938263, + "step": 9182 + }, + { + "epoch": 0.18368, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010835520426432292, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.1708725094795227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958629250526428, + "step": 9184 + }, + { + "epoch": 0.18372, + "grad_norm": 2.140625, + "grad_norm_var": 0.010792795817057292, + "learning_rate": 0.0001, + "loss": 4.2104, + "loss/crossentropy": 1.8261350989341736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952563151717186, + "step": 9186 + }, + { + "epoch": 0.18376, + "grad_norm": 2.046875, + "grad_norm_var": 0.010009511311848959, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 2.036627769470215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22490298002958298, + "step": 9188 + }, + { + "epoch": 0.1838, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010501861572265625, + "learning_rate": 0.0001, + "loss": 4.0088, + "loss/crossentropy": 1.7977086305618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282807409763336, + "step": 9190 + }, + { + "epoch": 0.18384, + "grad_norm": 2.21875, + "grad_norm_var": 0.011211903889973958, + "learning_rate": 0.0001, + "loss": 4.1774, + "loss/crossentropy": 2.170135021209717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23149186372756958, + "step": 9192 + }, + { + "epoch": 0.18388, + "grad_norm": 2.328125, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.4155, + "loss/crossentropy": 2.1453020572662354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25120319426059723, + "step": 9194 + }, + { + "epoch": 0.18392, + "grad_norm": 2.125, + "grad_norm_var": 0.01224365234375, + "learning_rate": 0.0001, + "loss": 4.3628, + "loss/crossentropy": 1.8794063925743103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20883548259735107, + "step": 9196 + }, + { + "epoch": 0.18396, + "grad_norm": 2.3125, + "grad_norm_var": 0.0141998291015625, + "learning_rate": 0.0001, + "loss": 4.3384, + "loss/crossentropy": 2.021254241466522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939975768327713, + "step": 9198 + }, + { + "epoch": 0.184, + "grad_norm": 2.09375, + "grad_norm_var": 0.009557851155598958, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 2.195580303668976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108924463391304, + "step": 9200 + }, + { + "epoch": 0.18404, + "grad_norm": 2.21875, + "grad_norm_var": 0.014085896809895833, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 1.6260902881622314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428159832954407, + "step": 9202 + }, + { + "epoch": 0.18408, + "grad_norm": 1.96875, + "grad_norm_var": 0.015363566080729167, + "learning_rate": 0.0001, + "loss": 4.2525, + "loss/crossentropy": 2.138678550720215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21641074120998383, + "step": 9204 + }, + { + "epoch": 0.18412, + "grad_norm": 2.125, + "grad_norm_var": 0.01825129191080729, + "learning_rate": 0.0001, + "loss": 4.4706, + "loss/crossentropy": 2.047194480895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891964435577393, + "step": 9206 + }, + { + "epoch": 0.18416, + "grad_norm": 2.171875, + "grad_norm_var": 0.019606272379557293, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 2.189277768135071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150397077202797, + "step": 9208 + }, + { + "epoch": 0.1842, + "grad_norm": 2.140625, + "grad_norm_var": 0.017618560791015626, + "learning_rate": 0.0001, + "loss": 4.2792, + "loss/crossentropy": 2.184122920036316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278063535690308, + "step": 9210 + }, + { + "epoch": 0.18424, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019139607747395832, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 2.413718104362488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24333150684833527, + "step": 9212 + }, + { + "epoch": 0.18428, + "grad_norm": 2.078125, + "grad_norm_var": 0.019017537434895832, + "learning_rate": 0.0001, + "loss": 3.8486, + "loss/crossentropy": 1.8086814880371094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976364076137543, + "step": 9214 + }, + { + "epoch": 0.18432, + "grad_norm": 2.09375, + "grad_norm_var": 0.019978841145833332, + "learning_rate": 0.0001, + "loss": 4.5222, + "loss/crossentropy": 2.2418206930160522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229690782725811, + "step": 9216 + }, + { + "epoch": 0.18436, + "grad_norm": 2.015625, + "grad_norm_var": 0.016355133056640624, + "learning_rate": 0.0001, + "loss": 4.0817, + "loss/crossentropy": 1.8083258867263794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20999443531036377, + "step": 9218 + }, + { + "epoch": 0.1844, + "grad_norm": 1.953125, + "grad_norm_var": 0.01693115234375, + "learning_rate": 0.0001, + "loss": 3.772, + "loss/crossentropy": 1.8117709755897522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436270534992218, + "step": 9220 + }, + { + "epoch": 0.18444, + "grad_norm": 2.15625, + "grad_norm_var": 0.010550944010416667, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.024270534515381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097267627716064, + "step": 9222 + }, + { + "epoch": 0.18448, + "grad_norm": 2.25, + "grad_norm_var": 0.011449178059895834, + "learning_rate": 0.0001, + "loss": 4.4756, + "loss/crossentropy": 2.2385981678962708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21131044626235962, + "step": 9224 + }, + { + "epoch": 0.18452, + "grad_norm": 2.140625, + "grad_norm_var": 0.011271158854166666, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 2.127749502658844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22855369001626968, + "step": 9226 + }, + { + "epoch": 0.18456, + "grad_norm": 2.15625, + "grad_norm_var": 0.011237589518229167, + "learning_rate": 0.0001, + "loss": 4.2645, + "loss/crossentropy": 2.1877033710479736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339663878083229, + "step": 9228 + }, + { + "epoch": 0.1846, + "grad_norm": 2.25, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 4.2642, + "loss/crossentropy": 1.931507408618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912021398544312, + "step": 9230 + }, + { + "epoch": 0.18464, + "grad_norm": 2.234375, + "grad_norm_var": 0.011579386393229167, + "learning_rate": 0.0001, + "loss": 4.3309, + "loss/crossentropy": 1.8101251125335693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23724676668643951, + "step": 9232 + }, + { + "epoch": 0.18468, + "grad_norm": 2.125, + "grad_norm_var": 0.0111480712890625, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 2.015208065509796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23482007533311844, + "step": 9234 + }, + { + "epoch": 0.18472, + "grad_norm": 2.1875, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 4.5357, + "loss/crossentropy": 2.2742738723754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632052540779114, + "step": 9236 + }, + { + "epoch": 0.18476, + "grad_norm": 2.46875, + "grad_norm_var": 0.016806793212890626, + "learning_rate": 0.0001, + "loss": 4.3574, + "loss/crossentropy": 1.7254774570465088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104022428393364, + "step": 9238 + }, + { + "epoch": 0.1848, + "grad_norm": 2.09375, + "grad_norm_var": 0.016585032145182293, + "learning_rate": 0.0001, + "loss": 4.3477, + "loss/crossentropy": 2.181770443916321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22411521524190903, + "step": 9240 + }, + { + "epoch": 0.18484, + "grad_norm": 1.953125, + "grad_norm_var": 0.020157877604166666, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 1.8449691534042358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091139778494835, + "step": 9242 + }, + { + "epoch": 0.18488, + "grad_norm": 2.1875, + "grad_norm_var": 0.017195383707682293, + "learning_rate": 0.0001, + "loss": 4.3389, + "loss/crossentropy": 2.0917609333992004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2644127458333969, + "step": 9244 + }, + { + "epoch": 0.18492, + "grad_norm": 2.28125, + "grad_norm_var": 0.01793390909830729, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 2.060012102127075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294714510440826, + "step": 9246 + }, + { + "epoch": 0.18496, + "grad_norm": 2.078125, + "grad_norm_var": 0.018387603759765624, + "learning_rate": 0.0001, + "loss": 4.6598, + "loss/crossentropy": 2.059940278530121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648683190345764, + "step": 9248 + }, + { + "epoch": 0.185, + "grad_norm": 2.078125, + "grad_norm_var": 0.01862360636393229, + "learning_rate": 0.0001, + "loss": 4.4696, + "loss/crossentropy": 1.8423291444778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22509171068668365, + "step": 9250 + }, + { + "epoch": 0.18504, + "grad_norm": 2.125, + "grad_norm_var": 0.016410064697265626, + "learning_rate": 0.0001, + "loss": 4.4629, + "loss/crossentropy": 2.2559698820114136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23896963894367218, + "step": 9252 + }, + { + "epoch": 0.18508, + "grad_norm": 2.03125, + "grad_norm_var": 0.01579767862955729, + "learning_rate": 0.0001, + "loss": 4.6027, + "loss/crossentropy": 2.0583658814430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22763275355100632, + "step": 9254 + }, + { + "epoch": 0.18512, + "grad_norm": 2.015625, + "grad_norm_var": 0.016658274332682292, + "learning_rate": 0.0001, + "loss": 4.2654, + "loss/crossentropy": 1.816649854183197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18608735501766205, + "step": 9256 + }, + { + "epoch": 0.18516, + "grad_norm": 4.65625, + "grad_norm_var": 0.4073150634765625, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.0920958518981934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22859029471874237, + "step": 9258 + }, + { + "epoch": 0.1852, + "grad_norm": 2.125, + "grad_norm_var": 0.4108306884765625, + "learning_rate": 0.0001, + "loss": 4.2517, + "loss/crossentropy": 2.1475982666015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21678777784109116, + "step": 9260 + }, + { + "epoch": 0.18524, + "grad_norm": 2.21875, + "grad_norm_var": 0.4100901285807292, + "learning_rate": 0.0001, + "loss": 4.4258, + "loss/crossentropy": 2.22346031665802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2333592176437378, + "step": 9262 + }, + { + "epoch": 0.18528, + "grad_norm": 2.09375, + "grad_norm_var": 0.41646703084309894, + "learning_rate": 0.0001, + "loss": 4.2134, + "loss/crossentropy": 1.7652028799057007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18461769074201584, + "step": 9264 + }, + { + "epoch": 0.18532, + "grad_norm": 2.109375, + "grad_norm_var": 0.4225006103515625, + "learning_rate": 0.0001, + "loss": 4.2362, + "loss/crossentropy": 2.1549625396728516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22428707033395767, + "step": 9266 + }, + { + "epoch": 0.18536, + "grad_norm": 2.0, + "grad_norm_var": 0.4281972249348958, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.1677842140197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887174785137177, + "step": 9268 + }, + { + "epoch": 0.1854, + "grad_norm": 2.09375, + "grad_norm_var": 0.42428792317708336, + "learning_rate": 0.0001, + "loss": 4.1517, + "loss/crossentropy": 1.8352131247520447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628287106752396, + "step": 9270 + }, + { + "epoch": 0.18544, + "grad_norm": 2.109375, + "grad_norm_var": 0.43038304646809894, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 2.104279100894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20481227338314056, + "step": 9272 + }, + { + "epoch": 0.18548, + "grad_norm": 2.21875, + "grad_norm_var": 0.009795888264973959, + "learning_rate": 0.0001, + "loss": 4.2011, + "loss/crossentropy": 1.8447301387786865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227916270494461, + "step": 9274 + }, + { + "epoch": 0.18552, + "grad_norm": 2.296875, + "grad_norm_var": 0.013651275634765625, + "learning_rate": 0.0001, + "loss": 4.3544, + "loss/crossentropy": 2.303207755088806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24895529448986053, + "step": 9276 + }, + { + "epoch": 0.18556, + "grad_norm": 2.171875, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 4.699, + "loss/crossentropy": 2.248077630996704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23077847063541412, + "step": 9278 + }, + { + "epoch": 0.1856, + "grad_norm": 2.34375, + "grad_norm_var": 0.017438761393229165, + "learning_rate": 0.0001, + "loss": 4.4943, + "loss/crossentropy": 2.261389970779419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23669905960559845, + "step": 9280 + }, + { + "epoch": 0.18564, + "grad_norm": 2.234375, + "grad_norm_var": 0.016056060791015625, + "learning_rate": 0.0001, + "loss": 4.3805, + "loss/crossentropy": 2.402338147163391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2453952580690384, + "step": 9282 + }, + { + "epoch": 0.18568, + "grad_norm": 2.265625, + "grad_norm_var": 0.01587702433268229, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.011174201965332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22673364728689194, + "step": 9284 + }, + { + "epoch": 0.18572, + "grad_norm": 2.234375, + "grad_norm_var": 0.016810862223307292, + "learning_rate": 0.0001, + "loss": 4.3897, + "loss/crossentropy": 1.9700093269348145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22711393237113953, + "step": 9286 + }, + { + "epoch": 0.18576, + "grad_norm": 2.140625, + "grad_norm_var": 0.014922841389973959, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.0253939032554626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902555227279663, + "step": 9288 + }, + { + "epoch": 0.1858, + "grad_norm": 2.296875, + "grad_norm_var": 0.016961415608723957, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 1.741984784603119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879509538412094, + "step": 9290 + }, + { + "epoch": 0.18584, + "grad_norm": 2.296875, + "grad_norm_var": 0.022564442952473958, + "learning_rate": 0.0001, + "loss": 4.7983, + "loss/crossentropy": 2.3156943321228027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2687100023031235, + "step": 9292 + }, + { + "epoch": 0.18588, + "grad_norm": 2.015625, + "grad_norm_var": 0.025833892822265624, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 2.0826371908187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21234872937202454, + "step": 9294 + }, + { + "epoch": 0.18592, + "grad_norm": 2.078125, + "grad_norm_var": 0.025921376546223958, + "learning_rate": 0.0001, + "loss": 4.6473, + "loss/crossentropy": 2.4080610275268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2670409381389618, + "step": 9296 + }, + { + "epoch": 0.18596, + "grad_norm": 2.046875, + "grad_norm_var": 0.02958958943684896, + "learning_rate": 0.0001, + "loss": 4.1997, + "loss/crossentropy": 1.722270905971527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20094333589076996, + "step": 9298 + }, + { + "epoch": 0.186, + "grad_norm": 2.03125, + "grad_norm_var": 0.028527577718098957, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.0514711141586304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209539495408535, + "step": 9300 + }, + { + "epoch": 0.18604, + "grad_norm": 2.640625, + "grad_norm_var": 2.5507850646972656, + "learning_rate": 0.0001, + "loss": 4.9819, + "loss/crossentropy": 2.4808409214019775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2367517650127411, + "step": 9302 + }, + { + "epoch": 0.18608, + "grad_norm": 2.078125, + "grad_norm_var": 2.536018880208333, + "learning_rate": 0.0001, + "loss": 4.1105, + "loss/crossentropy": 2.2539944648742676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22834083437919617, + "step": 9304 + }, + { + "epoch": 0.18612, + "grad_norm": 2.078125, + "grad_norm_var": 2.5449544270833333, + "learning_rate": 0.0001, + "loss": 4.2383, + "loss/crossentropy": 1.9335210919380188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20264607667922974, + "step": 9306 + }, + { + "epoch": 0.18616, + "grad_norm": 2.109375, + "grad_norm_var": 2.5707194010416665, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 1.90863037109375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882528275251389, + "step": 9308 + }, + { + "epoch": 0.1862, + "grad_norm": 2.1875, + "grad_norm_var": 2.5589996337890626, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.1449084281921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329169511795044, + "step": 9310 + }, + { + "epoch": 0.18624, + "grad_norm": 2.015625, + "grad_norm_var": 2.567341105143229, + "learning_rate": 0.0001, + "loss": 4.1806, + "loss/crossentropy": 1.9571366906166077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148708775639534, + "step": 9312 + }, + { + "epoch": 0.18628, + "grad_norm": 2.25, + "grad_norm_var": 2.542252604166667, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.174731135368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280728593468666, + "step": 9314 + }, + { + "epoch": 0.18632, + "grad_norm": 1.96875, + "grad_norm_var": 2.5416575113932294, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 1.9569833874702454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855706185102463, + "step": 9316 + }, + { + "epoch": 0.18636, + "grad_norm": 2.1875, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.3162, + "loss/crossentropy": 2.153563976287842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22807130962610245, + "step": 9318 + }, + { + "epoch": 0.1864, + "grad_norm": 2.203125, + "grad_norm_var": 0.010741933186848959, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 1.9219058752059937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295902967453003, + "step": 9320 + }, + { + "epoch": 0.18644, + "grad_norm": 1.96875, + "grad_norm_var": 0.011926015218098959, + "learning_rate": 0.0001, + "loss": 4.29, + "loss/crossentropy": 2.1993675231933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223759263753891, + "step": 9322 + }, + { + "epoch": 0.18648, + "grad_norm": 2.078125, + "grad_norm_var": 0.011730702718098958, + "learning_rate": 0.0001, + "loss": 4.2254, + "loss/crossentropy": 2.173740863800049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23405525833368301, + "step": 9324 + }, + { + "epoch": 0.18652, + "grad_norm": 2.25, + "grad_norm_var": 0.012564849853515626, + "learning_rate": 0.0001, + "loss": 4.3717, + "loss/crossentropy": 2.026577115058899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265823632478714, + "step": 9326 + }, + { + "epoch": 0.18656, + "grad_norm": 2.671875, + "grad_norm_var": 0.030326080322265626, + "learning_rate": 0.0001, + "loss": 4.4664, + "loss/crossentropy": 2.3629637956619263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23659023642539978, + "step": 9328 + }, + { + "epoch": 0.1866, + "grad_norm": 2.234375, + "grad_norm_var": 0.03050715128580729, + "learning_rate": 0.0001, + "loss": 4.3245, + "loss/crossentropy": 2.100727915763855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468425542116165, + "step": 9330 + }, + { + "epoch": 0.18664, + "grad_norm": 2.109375, + "grad_norm_var": 0.025923411051432293, + "learning_rate": 0.0001, + "loss": 4.5291, + "loss/crossentropy": 2.163568615913391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23402437567710876, + "step": 9332 + }, + { + "epoch": 0.18668, + "grad_norm": 1.9375, + "grad_norm_var": 0.02934748331705729, + "learning_rate": 0.0001, + "loss": 3.9687, + "loss/crossentropy": 1.9579994082450867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713342934846878, + "step": 9334 + }, + { + "epoch": 0.18672, + "grad_norm": 1.9609375, + "grad_norm_var": 0.029412587483723957, + "learning_rate": 0.0001, + "loss": 4.2586, + "loss/crossentropy": 1.805375874042511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22647518664598465, + "step": 9336 + }, + { + "epoch": 0.18676, + "grad_norm": 2.171875, + "grad_norm_var": 0.02797215779622396, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.0764458775520325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2270444855093956, + "step": 9338 + }, + { + "epoch": 0.1868, + "grad_norm": 2.03125, + "grad_norm_var": 0.030987294514973958, + "learning_rate": 0.0001, + "loss": 3.8731, + "loss/crossentropy": 1.5654467940330505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15944529324769974, + "step": 9340 + }, + { + "epoch": 0.18684, + "grad_norm": 2.15625, + "grad_norm_var": 0.02976048787434896, + "learning_rate": 0.0001, + "loss": 4.3831, + "loss/crossentropy": 2.007612407207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061111181974411, + "step": 9342 + }, + { + "epoch": 0.18688, + "grad_norm": 1.875, + "grad_norm_var": 0.010910797119140624, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 1.981432855129242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339826703071594, + "step": 9344 + }, + { + "epoch": 0.18692, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00965576171875, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 1.804275631904602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968545839190483, + "step": 9346 + }, + { + "epoch": 0.18696, + "grad_norm": 2.078125, + "grad_norm_var": 0.008763631184895834, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 1.9718617796897888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21482623368501663, + "step": 9348 + }, + { + "epoch": 0.187, + "grad_norm": 2.15625, + "grad_norm_var": 0.008397420247395834, + "learning_rate": 0.0001, + "loss": 4.1672, + "loss/crossentropy": 1.8358338475227356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20042669028043747, + "step": 9350 + }, + { + "epoch": 0.18704, + "grad_norm": 2.15625, + "grad_norm_var": 0.009089914957682292, + "learning_rate": 0.0001, + "loss": 4.2045, + "loss/crossentropy": 1.9447709321975708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21207460761070251, + "step": 9352 + }, + { + "epoch": 0.18708, + "grad_norm": 2.03125, + "grad_norm_var": 0.011356353759765625, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 1.8921163082122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945629686117172, + "step": 9354 + }, + { + "epoch": 0.18712, + "grad_norm": 2.171875, + "grad_norm_var": 0.009580230712890625, + "learning_rate": 0.0001, + "loss": 4.2018, + "loss/crossentropy": 2.1323947310447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2258753925561905, + "step": 9356 + }, + { + "epoch": 0.18716, + "grad_norm": 2.109375, + "grad_norm_var": 0.009277089436848959, + "learning_rate": 0.0001, + "loss": 4.4278, + "loss/crossentropy": 2.064914345741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23450962454080582, + "step": 9358 + }, + { + "epoch": 0.1872, + "grad_norm": 2.046875, + "grad_norm_var": 0.005783843994140625, + "learning_rate": 0.0001, + "loss": 4.0173, + "loss/crossentropy": 2.1264703273773193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169734537601471, + "step": 9360 + }, + { + "epoch": 0.18724, + "grad_norm": 2.09375, + "grad_norm_var": 0.004784138997395834, + "learning_rate": 0.0001, + "loss": 4.2926, + "loss/crossentropy": 1.7044150233268738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19169463962316513, + "step": 9362 + }, + { + "epoch": 0.18728, + "grad_norm": 2.0625, + "grad_norm_var": 0.004541015625, + "learning_rate": 0.0001, + "loss": 4.5367, + "loss/crossentropy": 1.9475398659706116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22891414165496826, + "step": 9364 + }, + { + "epoch": 0.18732, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008017730712890626, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 2.134114623069763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241017445921898, + "step": 9366 + }, + { + "epoch": 0.18736, + "grad_norm": 2.1875, + "grad_norm_var": 0.007342274983723958, + "learning_rate": 0.0001, + "loss": 4.2185, + "loss/crossentropy": 1.8317620158195496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20189791917800903, + "step": 9368 + }, + { + "epoch": 0.1874, + "grad_norm": 2.140625, + "grad_norm_var": 0.005041249593098958, + "learning_rate": 0.0001, + "loss": 4.3518, + "loss/crossentropy": 2.257538855075836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2343401461839676, + "step": 9370 + }, + { + "epoch": 0.18744, + "grad_norm": 2.09375, + "grad_norm_var": 0.005228424072265625, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 2.3722634315490723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23045828938484192, + "step": 9372 + }, + { + "epoch": 0.18748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005222320556640625, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.1880545020103455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345728725194931, + "step": 9374 + }, + { + "epoch": 0.18752, + "grad_norm": 2.3125, + "grad_norm_var": 0.008255767822265624, + "learning_rate": 0.0001, + "loss": 4.5037, + "loss/crossentropy": 1.8883287906646729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25232937932014465, + "step": 9376 + }, + { + "epoch": 0.18756, + "grad_norm": 2.21875, + "grad_norm_var": 0.009124501546223959, + "learning_rate": 0.0001, + "loss": 4.2963, + "loss/crossentropy": 1.9253730773925781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20324261486530304, + "step": 9378 + }, + { + "epoch": 0.1876, + "grad_norm": 2.078125, + "grad_norm_var": 0.009211985270182292, + "learning_rate": 0.0001, + "loss": 4.2745, + "loss/crossentropy": 1.961540937423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22623597085475922, + "step": 9380 + }, + { + "epoch": 0.18764, + "grad_norm": 2.203125, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.347, + "loss/crossentropy": 2.190120279788971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24154195934534073, + "step": 9382 + }, + { + "epoch": 0.18768, + "grad_norm": 2.140625, + "grad_norm_var": 0.007445271809895833, + "learning_rate": 0.0001, + "loss": 4.6159, + "loss/crossentropy": 2.0888350009918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22750811278820038, + "step": 9384 + }, + { + "epoch": 0.18772, + "grad_norm": 2.1875, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 4.3171, + "loss/crossentropy": 1.724816918373108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21245518326759338, + "step": 9386 + }, + { + "epoch": 0.18776, + "grad_norm": 2.0, + "grad_norm_var": 0.009307607014973959, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 1.6542762517929077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961463838815689, + "step": 9388 + }, + { + "epoch": 0.1878, + "grad_norm": 2.078125, + "grad_norm_var": 0.009714508056640625, + "learning_rate": 0.0001, + "loss": 4.4748, + "loss/crossentropy": 2.3528761863708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23583710938692093, + "step": 9390 + }, + { + "epoch": 0.18784, + "grad_norm": 2.03125, + "grad_norm_var": 0.009012603759765625, + "learning_rate": 0.0001, + "loss": 4.394, + "loss/crossentropy": 2.181188702583313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24201467633247375, + "step": 9392 + }, + { + "epoch": 0.18788, + "grad_norm": 2.0625, + "grad_norm_var": 0.008414459228515626, + "learning_rate": 0.0001, + "loss": 4.3476, + "loss/crossentropy": 2.280683398246765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065220028162003, + "step": 9394 + }, + { + "epoch": 0.18792, + "grad_norm": 2.21875, + "grad_norm_var": 0.009275054931640625, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 1.9839438199996948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22708184272050858, + "step": 9396 + }, + { + "epoch": 0.18796, + "grad_norm": 2.125, + "grad_norm_var": 0.0070879618326822914, + "learning_rate": 0.0001, + "loss": 4.4322, + "loss/crossentropy": 2.311874270439148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22249652445316315, + "step": 9398 + }, + { + "epoch": 0.188, + "grad_norm": 2.140625, + "grad_norm_var": 0.008503214518229166, + "learning_rate": 0.0001, + "loss": 4.3808, + "loss/crossentropy": 2.430112838745117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544297367334366, + "step": 9400 + }, + { + "epoch": 0.18804, + "grad_norm": 1.96875, + "grad_norm_var": 0.00892333984375, + "learning_rate": 0.0001, + "loss": 4.2393, + "loss/crossentropy": 2.146397888660431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23211465775966644, + "step": 9402 + }, + { + "epoch": 0.18808, + "grad_norm": 2.203125, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.5071, + "loss/crossentropy": 2.4402170181274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25594406574964523, + "step": 9404 + }, + { + "epoch": 0.18812, + "grad_norm": 2.046875, + "grad_norm_var": 0.008294423421223959, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.141621232032776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156461626291275, + "step": 9406 + }, + { + "epoch": 0.18816, + "grad_norm": 2.0625, + "grad_norm_var": 0.006290435791015625, + "learning_rate": 0.0001, + "loss": 4.2511, + "loss/crossentropy": 2.0266456604003906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194754749536514, + "step": 9408 + }, + { + "epoch": 0.1882, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007062784830729167, + "learning_rate": 0.0001, + "loss": 4.2831, + "loss/crossentropy": 2.304496645927429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345459461212158, + "step": 9410 + }, + { + "epoch": 0.18824, + "grad_norm": 2.25, + "grad_norm_var": 0.007523600260416667, + "learning_rate": 0.0001, + "loss": 4.4085, + "loss/crossentropy": 1.8486470580101013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20493299514055252, + "step": 9412 + }, + { + "epoch": 0.18828, + "grad_norm": 2.09375, + "grad_norm_var": 0.0075927734375, + "learning_rate": 0.0001, + "loss": 4.4562, + "loss/crossentropy": 2.0593990683555603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22942744940519333, + "step": 9414 + }, + { + "epoch": 0.18832, + "grad_norm": 2.15625, + "grad_norm_var": 0.006528472900390625, + "learning_rate": 0.0001, + "loss": 4.1551, + "loss/crossentropy": 1.7936111688613892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1776513010263443, + "step": 9416 + }, + { + "epoch": 0.18836, + "grad_norm": 2.0, + "grad_norm_var": 0.005997467041015625, + "learning_rate": 0.0001, + "loss": 4.4484, + "loss/crossentropy": 2.0676616430282593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21748381853103638, + "step": 9418 + }, + { + "epoch": 0.1884, + "grad_norm": 2.1875, + "grad_norm_var": 0.005236562093098958, + "learning_rate": 0.0001, + "loss": 4.3415, + "loss/crossentropy": 2.0317665934562683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215682566165924, + "step": 9420 + }, + { + "epoch": 0.18844, + "grad_norm": 2.234375, + "grad_norm_var": 0.006359608968098959, + "learning_rate": 0.0001, + "loss": 4.5623, + "loss/crossentropy": 2.3345483541488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24850556254386902, + "step": 9422 + }, + { + "epoch": 0.18848, + "grad_norm": 2.03125, + "grad_norm_var": 0.007043202718098958, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 2.0603779554367065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997442305088043, + "step": 9424 + }, + { + "epoch": 0.18852, + "grad_norm": 2.15625, + "grad_norm_var": 0.006591796875, + "learning_rate": 0.0001, + "loss": 4.4944, + "loss/crossentropy": 1.9450209140777588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465806663036346, + "step": 9426 + }, + { + "epoch": 0.18856, + "grad_norm": 2.046875, + "grad_norm_var": 0.0065266927083333336, + "learning_rate": 0.0001, + "loss": 4.2452, + "loss/crossentropy": 2.3568087816238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24619507789611816, + "step": 9428 + }, + { + "epoch": 0.1886, + "grad_norm": 2.25, + "grad_norm_var": 0.007828776041666667, + "learning_rate": 0.0001, + "loss": 4.1976, + "loss/crossentropy": 2.2047020196914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2519628629088402, + "step": 9430 + }, + { + "epoch": 0.18864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 4.3168, + "loss/crossentropy": 2.1594117879867554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21274243295192719, + "step": 9432 + }, + { + "epoch": 0.18868, + "grad_norm": 2.25, + "grad_norm_var": 0.009936269124348958, + "learning_rate": 0.0001, + "loss": 4.3847, + "loss/crossentropy": 2.120336890220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2092270776629448, + "step": 9434 + }, + { + "epoch": 0.18872, + "grad_norm": 2.015625, + "grad_norm_var": 0.011120351155598958, + "learning_rate": 0.0001, + "loss": 4.2725, + "loss/crossentropy": 2.13198459148407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.238224595785141, + "step": 9436 + }, + { + "epoch": 0.18876, + "grad_norm": 2.09375, + "grad_norm_var": 0.010009511311848959, + "learning_rate": 0.0001, + "loss": 4.3706, + "loss/crossentropy": 1.9252395629882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22254322469234467, + "step": 9438 + }, + { + "epoch": 0.1888, + "grad_norm": 2.09375, + "grad_norm_var": 0.0123291015625, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 2.123266577720642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22276867926120758, + "step": 9440 + }, + { + "epoch": 0.18884, + "grad_norm": 2.140625, + "grad_norm_var": 0.010741170247395833, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 1.9219747185707092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18884174525737762, + "step": 9442 + }, + { + "epoch": 0.18888, + "grad_norm": 2.125, + "grad_norm_var": 0.010587565104166667, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.0122207403182983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20913395285606384, + "step": 9444 + }, + { + "epoch": 0.18892, + "grad_norm": 1.953125, + "grad_norm_var": 0.010632069905598958, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.0255361199378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982431635260582, + "step": 9446 + }, + { + "epoch": 0.18896, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008770497639973958, + "learning_rate": 0.0001, + "loss": 4.4085, + "loss/crossentropy": 1.8060500025749207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19787351042032242, + "step": 9448 + }, + { + "epoch": 0.189, + "grad_norm": 2.171875, + "grad_norm_var": 0.007389068603515625, + "learning_rate": 0.0001, + "loss": 4.3799, + "loss/crossentropy": 2.340656042098999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22321298718452454, + "step": 9450 + }, + { + "epoch": 0.18904, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00966796875, + "learning_rate": 0.0001, + "loss": 4.3973, + "loss/crossentropy": 2.35786235332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356308028101921, + "step": 9452 + }, + { + "epoch": 0.18908, + "grad_norm": 2.0625, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 2.344806671142578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21194154769182205, + "step": 9454 + }, + { + "epoch": 0.18912, + "grad_norm": 2.046875, + "grad_norm_var": 0.008786773681640625, + "learning_rate": 0.0001, + "loss": 4.4861, + "loss/crossentropy": 2.2449493408203125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21038557589054108, + "step": 9456 + }, + { + "epoch": 0.18916, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014518229166666667, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.977162778377533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20586547255516052, + "step": 9458 + }, + { + "epoch": 0.1892, + "grad_norm": 2.203125, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.3453, + "loss/crossentropy": 2.0941065549850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2197253629565239, + "step": 9460 + }, + { + "epoch": 0.18924, + "grad_norm": 2.03125, + "grad_norm_var": 0.0143463134765625, + "learning_rate": 0.0001, + "loss": 4.3479, + "loss/crossentropy": 2.5145565271377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25138507783412933, + "step": 9462 + }, + { + "epoch": 0.18928, + "grad_norm": 2.0, + "grad_norm_var": 0.014289347330729167, + "learning_rate": 0.0001, + "loss": 4.4076, + "loss/crossentropy": 2.2870718240737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2298167496919632, + "step": 9464 + }, + { + "epoch": 0.18932, + "grad_norm": 2.109375, + "grad_norm_var": 0.013678995768229167, + "learning_rate": 0.0001, + "loss": 4.3588, + "loss/crossentropy": 2.2095978260040283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217079259455204, + "step": 9466 + }, + { + "epoch": 0.18936, + "grad_norm": 2.140625, + "grad_norm_var": 0.011726633707682291, + "learning_rate": 0.0001, + "loss": 4.3735, + "loss/crossentropy": 1.9591819047927856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21825183182954788, + "step": 9468 + }, + { + "epoch": 0.1894, + "grad_norm": 2.1875, + "grad_norm_var": 0.011352284749348959, + "learning_rate": 0.0001, + "loss": 4.5072, + "loss/crossentropy": 2.266845226287842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24172072112560272, + "step": 9470 + }, + { + "epoch": 0.18944, + "grad_norm": 2.203125, + "grad_norm_var": 0.011437733968098959, + "learning_rate": 0.0001, + "loss": 4.3448, + "loss/crossentropy": 2.0732688903808594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23074156790971756, + "step": 9472 + }, + { + "epoch": 0.18948, + "grad_norm": 2.34375, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 4.6697, + "loss/crossentropy": 1.935340702533722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194545865058899, + "step": 9474 + }, + { + "epoch": 0.18952, + "grad_norm": 2.125, + "grad_norm_var": 0.007594553629557291, + "learning_rate": 0.0001, + "loss": 4.1518, + "loss/crossentropy": 1.795669674873352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18794939666986465, + "step": 9476 + }, + { + "epoch": 0.18956, + "grad_norm": 2.109375, + "grad_norm_var": 0.008841705322265626, + "learning_rate": 0.0001, + "loss": 3.7745, + "loss/crossentropy": 1.9292446970939636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147182047367096, + "step": 9478 + }, + { + "epoch": 0.1896, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067779541015625, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 2.0110061168670654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096811756491661, + "step": 9480 + }, + { + "epoch": 0.18964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008034006754557291, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 2.1543468236923218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142205834388733, + "step": 9482 + }, + { + "epoch": 0.18968, + "grad_norm": 2.046875, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.0126, + "loss/crossentropy": 2.0860520601272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21716032177209854, + "step": 9484 + }, + { + "epoch": 0.18972, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010241444905598958, + "learning_rate": 0.0001, + "loss": 3.9412, + "loss/crossentropy": 1.7190409302711487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19397014379501343, + "step": 9486 + }, + { + "epoch": 0.18976, + "grad_norm": 2.234375, + "grad_norm_var": 0.017116038004557292, + "learning_rate": 0.0001, + "loss": 4.4711, + "loss/crossentropy": 2.178081512451172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280048429965973, + "step": 9488 + }, + { + "epoch": 0.1898, + "grad_norm": 2.109375, + "grad_norm_var": 0.013063303629557292, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.0749863982200623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755203396081924, + "step": 9490 + }, + { + "epoch": 0.18984, + "grad_norm": 2.09375, + "grad_norm_var": 0.014098866780598959, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.214204430580139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23071825504302979, + "step": 9492 + }, + { + "epoch": 0.18988, + "grad_norm": 2.09375, + "grad_norm_var": 0.015794881184895835, + "learning_rate": 0.0001, + "loss": 4.0041, + "loss/crossentropy": 1.6625414490699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857389286160469, + "step": 9494 + }, + { + "epoch": 0.18992, + "grad_norm": 2.375, + "grad_norm_var": 0.020539347330729166, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 1.8537201285362244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21718977391719818, + "step": 9496 + }, + { + "epoch": 0.18996, + "grad_norm": 1.84375, + "grad_norm_var": 0.024074045817057292, + "learning_rate": 0.0001, + "loss": 4.0488, + "loss/crossentropy": 1.716725468635559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544895946979523, + "step": 9498 + }, + { + "epoch": 0.19, + "grad_norm": 1.9453125, + "grad_norm_var": 0.024192047119140626, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 1.94467431306839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210828959941864, + "step": 9500 + }, + { + "epoch": 0.19004, + "grad_norm": 2.015625, + "grad_norm_var": 0.023298136393229165, + "learning_rate": 0.0001, + "loss": 4.2383, + "loss/crossentropy": 1.9377062320709229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141354531049728, + "step": 9502 + }, + { + "epoch": 0.19008, + "grad_norm": 2.0, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 4.3717, + "loss/crossentropy": 2.2015358209609985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22363336384296417, + "step": 9504 + }, + { + "epoch": 0.19012, + "grad_norm": 2.109375, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.2332, + "loss/crossentropy": 2.373024582862854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2402234748005867, + "step": 9506 + }, + { + "epoch": 0.19016, + "grad_norm": 2.140625, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.0519, + "loss/crossentropy": 2.1573110222816467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24207139760255814, + "step": 9508 + }, + { + "epoch": 0.1902, + "grad_norm": 2.203125, + "grad_norm_var": 0.014788564046223958, + "learning_rate": 0.0001, + "loss": 4.4041, + "loss/crossentropy": 2.324455976486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24618541449308395, + "step": 9510 + }, + { + "epoch": 0.19024, + "grad_norm": 2.078125, + "grad_norm_var": 0.010603586832682291, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 1.7480111718177795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049119919538498, + "step": 9512 + }, + { + "epoch": 0.19028, + "grad_norm": 2.125, + "grad_norm_var": 0.006300608317057292, + "learning_rate": 0.0001, + "loss": 4.3644, + "loss/crossentropy": 1.9925439953804016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314591035246849, + "step": 9514 + }, + { + "epoch": 0.19032, + "grad_norm": 2.140625, + "grad_norm_var": 0.007094065348307292, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.133601188659668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2329796403646469, + "step": 9516 + }, + { + "epoch": 0.19036, + "grad_norm": 1.984375, + "grad_norm_var": 0.010198720296223958, + "learning_rate": 0.0001, + "loss": 4.246, + "loss/crossentropy": 2.093464970588684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22636859863996506, + "step": 9518 + }, + { + "epoch": 0.1904, + "grad_norm": 2.0625, + "grad_norm_var": 0.009757232666015626, + "learning_rate": 0.0001, + "loss": 4.4766, + "loss/crossentropy": 2.6137614250183105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27037859708070755, + "step": 9520 + }, + { + "epoch": 0.19044, + "grad_norm": 2.125, + "grad_norm_var": 0.012300364176432292, + "learning_rate": 0.0001, + "loss": 4.5499, + "loss/crossentropy": 2.008640229701996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647297590970993, + "step": 9522 + }, + { + "epoch": 0.19048, + "grad_norm": 2.078125, + "grad_norm_var": 0.012286122639973958, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 2.12644362449646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23370585590600967, + "step": 9524 + }, + { + "epoch": 0.19052, + "grad_norm": 2.109375, + "grad_norm_var": 0.011425526936848958, + "learning_rate": 0.0001, + "loss": 4.5409, + "loss/crossentropy": 2.2338638305664062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23637598007917404, + "step": 9526 + }, + { + "epoch": 0.19056, + "grad_norm": 2.0, + "grad_norm_var": 0.011785634358723958, + "learning_rate": 0.0001, + "loss": 4.1405, + "loss/crossentropy": 2.0632832646369934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682457089424133, + "step": 9528 + }, + { + "epoch": 0.1906, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012726847330729167, + "learning_rate": 0.0001, + "loss": 4.4058, + "loss/crossentropy": 2.219120740890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22551076859235764, + "step": 9530 + }, + { + "epoch": 0.19064, + "grad_norm": 1.953125, + "grad_norm_var": 0.011860911051432292, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.4633371829986572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24491076171398163, + "step": 9532 + }, + { + "epoch": 0.19068, + "grad_norm": 2.03125, + "grad_norm_var": 0.008211008707682292, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.2408339977264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22265981882810593, + "step": 9534 + }, + { + "epoch": 0.19072, + "grad_norm": 2.234375, + "grad_norm_var": 0.009544881184895833, + "learning_rate": 0.0001, + "loss": 4.326, + "loss/crossentropy": 1.9779353141784668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159332111477852, + "step": 9536 + }, + { + "epoch": 0.19076, + "grad_norm": 2.265625, + "grad_norm_var": 0.0082275390625, + "learning_rate": 0.0001, + "loss": 4.3522, + "loss/crossentropy": 2.0315812826156616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20710154622793198, + "step": 9538 + }, + { + "epoch": 0.1908, + "grad_norm": 2.078125, + "grad_norm_var": 0.011787923177083333, + "learning_rate": 0.0001, + "loss": 4.18, + "loss/crossentropy": 2.085337817668915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22174393385648727, + "step": 9540 + }, + { + "epoch": 0.19084, + "grad_norm": 2.03125, + "grad_norm_var": 0.016507975260416665, + "learning_rate": 0.0001, + "loss": 4.4386, + "loss/crossentropy": 2.3449169397354126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3135230466723442, + "step": 9542 + }, + { + "epoch": 0.19088, + "grad_norm": 2.046875, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.3943, + "loss/crossentropy": 2.1083431243896484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21395261585712433, + "step": 9544 + }, + { + "epoch": 0.19092, + "grad_norm": 2.03125, + "grad_norm_var": 0.015730539957682293, + "learning_rate": 0.0001, + "loss": 4.2091, + "loss/crossentropy": 2.0386710166931152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22487390786409378, + "step": 9546 + }, + { + "epoch": 0.19096, + "grad_norm": 2.21875, + "grad_norm_var": 0.015240224202473958, + "learning_rate": 0.0001, + "loss": 4.2377, + "loss/crossentropy": 1.9533087611198425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223353162407875, + "step": 9548 + }, + { + "epoch": 0.191, + "grad_norm": 2.21875, + "grad_norm_var": 0.015317535400390625, + "learning_rate": 0.0001, + "loss": 4.4183, + "loss/crossentropy": 2.35861599445343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26162558794021606, + "step": 9550 + }, + { + "epoch": 0.19104, + "grad_norm": 2.015625, + "grad_norm_var": 0.014388020833333333, + "learning_rate": 0.0001, + "loss": 4.2507, + "loss/crossentropy": 1.9529705047607422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125585675239563, + "step": 9552 + }, + { + "epoch": 0.19108, + "grad_norm": 2.109375, + "grad_norm_var": 0.011844889322916666, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 2.0364453196525574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21163037419319153, + "step": 9554 + }, + { + "epoch": 0.19112, + "grad_norm": 2.078125, + "grad_norm_var": 0.009749348958333333, + "learning_rate": 0.0001, + "loss": 4.2758, + "loss/crossentropy": 2.1321409940719604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24250106513500214, + "step": 9556 + }, + { + "epoch": 0.19116, + "grad_norm": 2.125, + "grad_norm_var": 0.0073931376139322914, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.024011969566345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20132286846637726, + "step": 9558 + }, + { + "epoch": 0.1912, + "grad_norm": 2.109375, + "grad_norm_var": 0.006980133056640625, + "learning_rate": 0.0001, + "loss": 4.3726, + "loss/crossentropy": 2.106776535511017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23114337772130966, + "step": 9560 + }, + { + "epoch": 0.19124, + "grad_norm": 2.265625, + "grad_norm_var": 0.007645416259765625, + "learning_rate": 0.0001, + "loss": 4.4642, + "loss/crossentropy": 1.9228236079216003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426098585128784, + "step": 9562 + }, + { + "epoch": 0.19128, + "grad_norm": 2.171875, + "grad_norm_var": 0.008090972900390625, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.3033370971679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113499462604523, + "step": 9564 + }, + { + "epoch": 0.19132, + "grad_norm": 2.125, + "grad_norm_var": 0.008973948160807292, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 1.6983963251113892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19003060460090637, + "step": 9566 + }, + { + "epoch": 0.19136, + "grad_norm": 2.34375, + "grad_norm_var": 0.011295318603515625, + "learning_rate": 0.0001, + "loss": 4.5283, + "loss/crossentropy": 2.502004861831665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24949797987937927, + "step": 9568 + }, + { + "epoch": 0.1914, + "grad_norm": 2.03125, + "grad_norm_var": 0.011793772379557291, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 1.9981504678726196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152494639158249, + "step": 9570 + }, + { + "epoch": 0.19144, + "grad_norm": 1.96875, + "grad_norm_var": 0.012931060791015626, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.1489784717559814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105325013399124, + "step": 9572 + }, + { + "epoch": 0.19148, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013016510009765624, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 1.974421203136444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23960395902395248, + "step": 9574 + }, + { + "epoch": 0.19152, + "grad_norm": 2.09375, + "grad_norm_var": 0.012670644124348958, + "learning_rate": 0.0001, + "loss": 4.4119, + "loss/crossentropy": 2.379599928855896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26731471717357635, + "step": 9576 + }, + { + "epoch": 0.19156, + "grad_norm": 2.015625, + "grad_norm_var": 0.012444814046223959, + "learning_rate": 0.0001, + "loss": 4.3718, + "loss/crossentropy": 2.26211154460907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22957566380500793, + "step": 9578 + }, + { + "epoch": 0.1916, + "grad_norm": 2.125, + "grad_norm_var": 0.011871083577473959, + "learning_rate": 0.0001, + "loss": 4.4594, + "loss/crossentropy": 2.282869577407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2404445931315422, + "step": 9580 + }, + { + "epoch": 0.19164, + "grad_norm": 2.03125, + "grad_norm_var": 0.010457102457682292, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 1.9183810949325562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18672315031290054, + "step": 9582 + }, + { + "epoch": 0.19168, + "grad_norm": 2.046875, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 2.2722173929214478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23262400180101395, + "step": 9584 + }, + { + "epoch": 0.19172, + "grad_norm": 2.0625, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.4032, + "loss/crossentropy": 2.426178455352783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23461100459098816, + "step": 9586 + }, + { + "epoch": 0.19176, + "grad_norm": 2.09375, + "grad_norm_var": 0.00687255859375, + "learning_rate": 0.0001, + "loss": 4.344, + "loss/crossentropy": 2.2266165018081665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22530251741409302, + "step": 9588 + }, + { + "epoch": 0.1918, + "grad_norm": 2.078125, + "grad_norm_var": 0.005541737874348958, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.786275327205658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19257958233356476, + "step": 9590 + }, + { + "epoch": 0.19184, + "grad_norm": 2.0, + "grad_norm_var": 0.008868153889973958, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.8497431874275208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21965742111206055, + "step": 9592 + }, + { + "epoch": 0.19188, + "grad_norm": 2.078125, + "grad_norm_var": 0.008143870035807292, + "learning_rate": 0.0001, + "loss": 4.5612, + "loss/crossentropy": 2.492846131324768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851783990859985, + "step": 9594 + }, + { + "epoch": 0.19192, + "grad_norm": 2.28125, + "grad_norm_var": 0.010027821858723958, + "learning_rate": 0.0001, + "loss": 4.6838, + "loss/crossentropy": 2.3447986841201782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26567359268665314, + "step": 9596 + }, + { + "epoch": 0.19196, + "grad_norm": 1.984375, + "grad_norm_var": 0.010654449462890625, + "learning_rate": 0.0001, + "loss": 4.1513, + "loss/crossentropy": 1.656063199043274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903422325849533, + "step": 9598 + }, + { + "epoch": 0.192, + "grad_norm": 2.125, + "grad_norm_var": 0.008329264322916667, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.1816134452819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23314762860536575, + "step": 9600 + }, + { + "epoch": 0.19204, + "grad_norm": 2.015625, + "grad_norm_var": 0.010007476806640625, + "learning_rate": 0.0001, + "loss": 3.8537, + "loss/crossentropy": 1.787261426448822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20938758552074432, + "step": 9602 + }, + { + "epoch": 0.19208, + "grad_norm": 2.015625, + "grad_norm_var": 0.010526275634765625, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 1.9722678065299988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23085469752550125, + "step": 9604 + }, + { + "epoch": 0.19212, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012113444010416667, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.04559987783432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647220849990845, + "step": 9606 + }, + { + "epoch": 0.19216, + "grad_norm": 2.140625, + "grad_norm_var": 0.008934529622395833, + "learning_rate": 0.0001, + "loss": 4.5397, + "loss/crossentropy": 2.5426105260849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23680832237005234, + "step": 9608 + }, + { + "epoch": 0.1922, + "grad_norm": 2.0, + "grad_norm_var": 0.008890787760416666, + "learning_rate": 0.0001, + "loss": 4.245, + "loss/crossentropy": 2.1339075565338135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22501112520694733, + "step": 9610 + }, + { + "epoch": 0.19224, + "grad_norm": 2.03125, + "grad_norm_var": 0.006550852457682292, + "learning_rate": 0.0001, + "loss": 4.0924, + "loss/crossentropy": 1.9554831981658936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20187357813119888, + "step": 9612 + }, + { + "epoch": 0.19228, + "grad_norm": 2.203125, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.5578, + "loss/crossentropy": 2.2996249198913574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24954287707805634, + "step": 9614 + }, + { + "epoch": 0.19232, + "grad_norm": 2.125, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.3753, + "loss/crossentropy": 2.2439414262771606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833245247602463, + "step": 9616 + }, + { + "epoch": 0.19236, + "grad_norm": 2.046875, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.2297, + "loss/crossentropy": 1.8900890946388245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21077623218297958, + "step": 9618 + }, + { + "epoch": 0.1924, + "grad_norm": 2.03125, + "grad_norm_var": 0.007804361979166666, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 2.110726058483124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21897974610328674, + "step": 9620 + }, + { + "epoch": 0.19244, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0080474853515625, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 1.7363090515136719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19939633458852768, + "step": 9622 + }, + { + "epoch": 0.19248, + "grad_norm": 2.109375, + "grad_norm_var": 0.01014404296875, + "learning_rate": 0.0001, + "loss": 4.3643, + "loss/crossentropy": 1.8148014545440674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21637701243162155, + "step": 9624 + }, + { + "epoch": 0.19252, + "grad_norm": 2.03125, + "grad_norm_var": 0.0092041015625, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 2.2895134687423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297188639640808, + "step": 9626 + }, + { + "epoch": 0.19256, + "grad_norm": 2.15625, + "grad_norm_var": 0.006296539306640625, + "learning_rate": 0.0001, + "loss": 4.4778, + "loss/crossentropy": 2.117924213409424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057473242282867, + "step": 9628 + }, + { + "epoch": 0.1926, + "grad_norm": 2.140625, + "grad_norm_var": 0.005421702067057292, + "learning_rate": 0.0001, + "loss": 4.2791, + "loss/crossentropy": 1.8709319829940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106107696890831, + "step": 9630 + }, + { + "epoch": 0.19264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0079742431640625, + "learning_rate": 0.0001, + "loss": 4.0174, + "loss/crossentropy": 1.5156871676445007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17487400770187378, + "step": 9632 + }, + { + "epoch": 0.19268, + "grad_norm": 2.1875, + "grad_norm_var": 0.0183837890625, + "learning_rate": 0.0001, + "loss": 4.6281, + "loss/crossentropy": 2.153126537799835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22009101510047913, + "step": 9634 + }, + { + "epoch": 0.19272, + "grad_norm": 2.09375, + "grad_norm_var": 0.017853800455729166, + "learning_rate": 0.0001, + "loss": 4.443, + "loss/crossentropy": 2.0890414714813232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233077436685562, + "step": 9636 + }, + { + "epoch": 0.19276, + "grad_norm": 2.015625, + "grad_norm_var": 0.01858495076497396, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.1218496561050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576671838760376, + "step": 9638 + }, + { + "epoch": 0.1928, + "grad_norm": 1.921875, + "grad_norm_var": 0.020918528238932293, + "learning_rate": 0.0001, + "loss": 4.2522, + "loss/crossentropy": 2.1131649017333984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21846124529838562, + "step": 9640 + }, + { + "epoch": 0.19284, + "grad_norm": 2.1875, + "grad_norm_var": 0.02067845662434896, + "learning_rate": 0.0001, + "loss": 4.6643, + "loss/crossentropy": 2.3941714763641357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.246211439371109, + "step": 9642 + }, + { + "epoch": 0.19288, + "grad_norm": 1.890625, + "grad_norm_var": 0.02687352498372396, + "learning_rate": 0.0001, + "loss": 4.0641, + "loss/crossentropy": 1.9579638838768005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198220357298851, + "step": 9644 + }, + { + "epoch": 0.19292, + "grad_norm": 2.140625, + "grad_norm_var": 0.02754491170247396, + "learning_rate": 0.0001, + "loss": 4.564, + "loss/crossentropy": 2.388027787208557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23349716514348984, + "step": 9646 + }, + { + "epoch": 0.19296, + "grad_norm": 2.15625, + "grad_norm_var": 0.02520726521809896, + "learning_rate": 0.0001, + "loss": 4.3678, + "loss/crossentropy": 1.8203087449073792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18923642486333847, + "step": 9648 + }, + { + "epoch": 0.193, + "grad_norm": 2.03125, + "grad_norm_var": 0.015476226806640625, + "learning_rate": 0.0001, + "loss": 4.2119, + "loss/crossentropy": 1.996269702911377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096453383564949, + "step": 9650 + }, + { + "epoch": 0.19304, + "grad_norm": 2.09375, + "grad_norm_var": 0.01572240193684896, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 1.806606113910675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824761897325516, + "step": 9652 + }, + { + "epoch": 0.19308, + "grad_norm": 2.28125, + "grad_norm_var": 0.01651178995768229, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.190830111503601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22319861501455307, + "step": 9654 + }, + { + "epoch": 0.19312, + "grad_norm": 2.15625, + "grad_norm_var": 0.014048004150390625, + "learning_rate": 0.0001, + "loss": 4.4579, + "loss/crossentropy": 1.9721493124961853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011292800307274, + "step": 9656 + }, + { + "epoch": 0.19316, + "grad_norm": 2.234375, + "grad_norm_var": 0.014277903238932292, + "learning_rate": 0.0001, + "loss": 4.5673, + "loss/crossentropy": 2.1256929636001587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2728194147348404, + "step": 9658 + }, + { + "epoch": 0.1932, + "grad_norm": 2.125, + "grad_norm_var": 0.008345286051432291, + "learning_rate": 0.0001, + "loss": 4.3206, + "loss/crossentropy": 2.090156316757202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240355908870697, + "step": 9660 + }, + { + "epoch": 0.19324, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.2930272817611694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23188824206590652, + "step": 9662 + }, + { + "epoch": 0.19328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 4.0075, + "loss/crossentropy": 1.9754068851470947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698863685131073, + "step": 9664 + }, + { + "epoch": 0.19332, + "grad_norm": 2.09375, + "grad_norm_var": 0.0118560791015625, + "learning_rate": 0.0001, + "loss": 4.1668, + "loss/crossentropy": 1.8987788558006287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531752288341522, + "step": 9666 + }, + { + "epoch": 0.19336, + "grad_norm": 2.0625, + "grad_norm_var": 0.011678059895833334, + "learning_rate": 0.0001, + "loss": 4.3185, + "loss/crossentropy": 2.2699583768844604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2572309076786041, + "step": 9668 + }, + { + "epoch": 0.1934, + "grad_norm": 2.0625, + "grad_norm_var": 0.0072509765625, + "learning_rate": 0.0001, + "loss": 4.3267, + "loss/crossentropy": 1.6649349927902222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23388104140758514, + "step": 9670 + }, + { + "epoch": 0.19344, + "grad_norm": 2.171875, + "grad_norm_var": 0.007258097330729167, + "learning_rate": 0.0001, + "loss": 4.3846, + "loss/crossentropy": 2.173617362976074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20848755538463593, + "step": 9672 + }, + { + "epoch": 0.19348, + "grad_norm": 2.03125, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.2690787315368652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23493453860282898, + "step": 9674 + }, + { + "epoch": 0.19352, + "grad_norm": 1.9375, + "grad_norm_var": 0.007079060872395833, + "learning_rate": 0.0001, + "loss": 4.3621, + "loss/crossentropy": 2.00560861825943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22538188099861145, + "step": 9676 + }, + { + "epoch": 0.19356, + "grad_norm": 2.0625, + "grad_norm_var": 0.004369862874348958, + "learning_rate": 0.0001, + "loss": 4.1264, + "loss/crossentropy": 1.960120975971222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21056914329528809, + "step": 9678 + }, + { + "epoch": 0.1936, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004689280192057292, + "learning_rate": 0.0001, + "loss": 3.9868, + "loss/crossentropy": 1.8921862840652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22250327467918396, + "step": 9680 + }, + { + "epoch": 0.19364, + "grad_norm": 2.0625, + "grad_norm_var": 0.004839833577473958, + "learning_rate": 0.0001, + "loss": 4.296, + "loss/crossentropy": 1.9474233984947205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19719959795475006, + "step": 9682 + }, + { + "epoch": 0.19368, + "grad_norm": 2.21875, + "grad_norm_var": 0.0072100321451822914, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.3391844034194946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263485193252563, + "step": 9684 + }, + { + "epoch": 0.19372, + "grad_norm": 2.21875, + "grad_norm_var": 0.008715565999348958, + "learning_rate": 0.0001, + "loss": 4.3641, + "loss/crossentropy": 2.190012037754059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531607002019882, + "step": 9686 + }, + { + "epoch": 0.19376, + "grad_norm": 2.15625, + "grad_norm_var": 0.008283487955729167, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 1.9935640096664429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438511461019516, + "step": 9688 + }, + { + "epoch": 0.1938, + "grad_norm": 2.15625, + "grad_norm_var": 0.009186808268229167, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 1.7482191324234009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972990244627, + "step": 9690 + }, + { + "epoch": 0.19384, + "grad_norm": 2.125, + "grad_norm_var": 0.008353678385416667, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 2.2413275241851807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24103231728076935, + "step": 9692 + }, + { + "epoch": 0.19388, + "grad_norm": 2.140625, + "grad_norm_var": 0.008519490559895834, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 2.2846572399139404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2420315518975258, + "step": 9694 + }, + { + "epoch": 0.19392, + "grad_norm": 2.265625, + "grad_norm_var": 0.007968902587890625, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.7977504134178162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20145538449287415, + "step": 9696 + }, + { + "epoch": 0.19396, + "grad_norm": 2.0625, + "grad_norm_var": 0.007535552978515625, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.0343902111053467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21595563739538193, + "step": 9698 + }, + { + "epoch": 0.194, + "grad_norm": 2.125, + "grad_norm_var": 0.0052487691243489586, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 1.8828233480453491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624938070774078, + "step": 9700 + }, + { + "epoch": 0.19404, + "grad_norm": 1.984375, + "grad_norm_var": 0.005602773030598958, + "learning_rate": 0.0001, + "loss": 4.1569, + "loss/crossentropy": 1.9177573323249817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19467243552207947, + "step": 9702 + }, + { + "epoch": 0.19408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006613922119140625, + "learning_rate": 0.0001, + "loss": 3.8881, + "loss/crossentropy": 1.8025588393211365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18757501989603043, + "step": 9704 + }, + { + "epoch": 0.19412, + "grad_norm": 2.140625, + "grad_norm_var": 0.006723785400390625, + "learning_rate": 0.0001, + "loss": 4.3195, + "loss/crossentropy": 2.025633454322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623078733682632, + "step": 9706 + }, + { + "epoch": 0.19416, + "grad_norm": 2.109375, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 3.8244, + "loss/crossentropy": 1.9421055316925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563311874866486, + "step": 9708 + }, + { + "epoch": 0.1942, + "grad_norm": 2.09375, + "grad_norm_var": 0.009191640218098958, + "learning_rate": 0.0001, + "loss": 4.0977, + "loss/crossentropy": 1.9820671081542969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029083967208862, + "step": 9710 + }, + { + "epoch": 0.19424, + "grad_norm": 2.328125, + "grad_norm_var": 0.010341135660807292, + "learning_rate": 0.0001, + "loss": 4.2037, + "loss/crossentropy": 1.9491158723831177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559209793806076, + "step": 9712 + }, + { + "epoch": 0.19428, + "grad_norm": 2.0625, + "grad_norm_var": 0.010416412353515625, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 1.8653306365013123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18629660457372665, + "step": 9714 + }, + { + "epoch": 0.19432, + "grad_norm": 1.953125, + "grad_norm_var": 0.01950251261393229, + "learning_rate": 0.0001, + "loss": 4.2567, + "loss/crossentropy": 1.6897491812705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948539912700653, + "step": 9716 + }, + { + "epoch": 0.19436, + "grad_norm": 2.203125, + "grad_norm_var": 0.020776112874348957, + "learning_rate": 0.0001, + "loss": 4.4387, + "loss/crossentropy": 2.07690966129303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147115021944046, + "step": 9718 + }, + { + "epoch": 0.1944, + "grad_norm": 2.03125, + "grad_norm_var": 0.020189412434895835, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.0356597304344177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20626354217529297, + "step": 9720 + }, + { + "epoch": 0.19444, + "grad_norm": 2.0625, + "grad_norm_var": 0.0204254150390625, + "learning_rate": 0.0001, + "loss": 4.0548, + "loss/crossentropy": 1.7709991931915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18573015183210373, + "step": 9722 + }, + { + "epoch": 0.19448, + "grad_norm": 1.984375, + "grad_norm_var": 0.01830012003580729, + "learning_rate": 0.0001, + "loss": 4.117, + "loss/crossentropy": 2.0255925059318542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22114800661802292, + "step": 9724 + }, + { + "epoch": 0.19452, + "grad_norm": 2.25, + "grad_norm_var": 0.01871337890625, + "learning_rate": 0.0001, + "loss": 4.4655, + "loss/crossentropy": 2.0046772956848145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22649522870779037, + "step": 9726 + }, + { + "epoch": 0.19456, + "grad_norm": 2.15625, + "grad_norm_var": 0.015827433268229166, + "learning_rate": 0.0001, + "loss": 4.179, + "loss/crossentropy": 2.2746634483337402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23017344623804092, + "step": 9728 + }, + { + "epoch": 0.1946, + "grad_norm": 2.078125, + "grad_norm_var": 0.015184529622395833, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 2.0191025137901306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20670472085475922, + "step": 9730 + }, + { + "epoch": 0.19464, + "grad_norm": 2.015625, + "grad_norm_var": 0.008177693684895833, + "learning_rate": 0.0001, + "loss": 4.1657, + "loss/crossentropy": 2.3198455572128296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24985665082931519, + "step": 9732 + }, + { + "epoch": 0.19468, + "grad_norm": 2.09375, + "grad_norm_var": 0.0073150634765625, + "learning_rate": 0.0001, + "loss": 4.3886, + "loss/crossentropy": 2.5229711532592773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22920701652765274, + "step": 9734 + }, + { + "epoch": 0.19472, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072662353515625, + "learning_rate": 0.0001, + "loss": 3.9991, + "loss/crossentropy": 2.081319808959961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542686223983765, + "step": 9736 + }, + { + "epoch": 0.19476, + "grad_norm": 2.046875, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 4.0698, + "loss/crossentropy": 2.2170007824897766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21852095425128937, + "step": 9738 + }, + { + "epoch": 0.1948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008265940348307292, + "learning_rate": 0.0001, + "loss": 4.1528, + "loss/crossentropy": 1.830683708190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910252571105957, + "step": 9740 + }, + { + "epoch": 0.19484, + "grad_norm": 2.25, + "grad_norm_var": 0.008420562744140625, + "learning_rate": 0.0001, + "loss": 4.3543, + "loss/crossentropy": 2.1303864121437073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228670135140419, + "step": 9742 + }, + { + "epoch": 0.19488, + "grad_norm": 2.015625, + "grad_norm_var": 0.007486724853515625, + "learning_rate": 0.0001, + "loss": 4.1504, + "loss/crossentropy": 2.2621915340423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027206420898438, + "step": 9744 + }, + { + "epoch": 0.19492, + "grad_norm": 2.0, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 3.9564, + "loss/crossentropy": 2.044555902481079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20469766855239868, + "step": 9746 + }, + { + "epoch": 0.19496, + "grad_norm": 2.109375, + "grad_norm_var": 0.0115142822265625, + "learning_rate": 0.0001, + "loss": 4.1677, + "loss/crossentropy": 1.911176860332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22036674618721008, + "step": 9748 + }, + { + "epoch": 0.195, + "grad_norm": 1.96875, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 1.8650219440460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19878911972045898, + "step": 9750 + }, + { + "epoch": 0.19504, + "grad_norm": 2.125, + "grad_norm_var": 0.02088623046875, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.196391463279724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20845064520835876, + "step": 9752 + }, + { + "epoch": 0.19508, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021201324462890626, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 2.222484588623047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144618257880211, + "step": 9754 + }, + { + "epoch": 0.19512, + "grad_norm": 2.15625, + "grad_norm_var": 0.020685831705729168, + "learning_rate": 0.0001, + "loss": 4.4078, + "loss/crossentropy": 2.416514754295349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2370949387550354, + "step": 9756 + }, + { + "epoch": 0.19516, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020566558837890624, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.22190260887146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23128122836351395, + "step": 9758 + }, + { + "epoch": 0.1952, + "grad_norm": 2.15625, + "grad_norm_var": 0.024008941650390626, + "learning_rate": 0.0001, + "loss": 4.4253, + "loss/crossentropy": 2.446492910385132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23962965607643127, + "step": 9760 + }, + { + "epoch": 0.19524, + "grad_norm": 2.015625, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.1458136439323425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19657295942306519, + "step": 9762 + }, + { + "epoch": 0.19528, + "grad_norm": 2.203125, + "grad_norm_var": 0.0197418212890625, + "learning_rate": 0.0001, + "loss": 4.6029, + "loss/crossentropy": 1.9773340225219727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22887050360441208, + "step": 9764 + }, + { + "epoch": 0.19532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.020334625244140626, + "learning_rate": 0.0001, + "loss": 4.1583, + "loss/crossentropy": 2.0938061475753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19823112338781357, + "step": 9766 + }, + { + "epoch": 0.19536, + "grad_norm": 2.3125, + "grad_norm_var": 0.015636952718098958, + "learning_rate": 0.0001, + "loss": 4.2628, + "loss/crossentropy": 1.9068174958229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19924984872341156, + "step": 9768 + }, + { + "epoch": 0.1954, + "grad_norm": 2.109375, + "grad_norm_var": 0.012422688802083333, + "learning_rate": 0.0001, + "loss": 4.3567, + "loss/crossentropy": 2.045413613319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19175175577402115, + "step": 9770 + }, + { + "epoch": 0.19544, + "grad_norm": 2.09375, + "grad_norm_var": 0.013106282552083333, + "learning_rate": 0.0001, + "loss": 4.1972, + "loss/crossentropy": 2.0262961983680725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22137057781219482, + "step": 9772 + }, + { + "epoch": 0.19548, + "grad_norm": 2.0625, + "grad_norm_var": 0.012992350260416667, + "learning_rate": 0.0001, + "loss": 4.0624, + "loss/crossentropy": 2.146475672721863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2284562587738037, + "step": 9774 + }, + { + "epoch": 0.19552, + "grad_norm": 2.015625, + "grad_norm_var": 0.011885579427083333, + "learning_rate": 0.0001, + "loss": 4.154, + "loss/crossentropy": 2.0316100120544434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26275022327899933, + "step": 9776 + }, + { + "epoch": 0.19556, + "grad_norm": 2.140625, + "grad_norm_var": 0.011359659830729167, + "learning_rate": 0.0001, + "loss": 4.5424, + "loss/crossentropy": 2.1386696100234985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22695952653884888, + "step": 9778 + }, + { + "epoch": 0.1956, + "grad_norm": 2.578125, + "grad_norm_var": 0.024405924479166667, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.3000820875167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24045731872320175, + "step": 9780 + }, + { + "epoch": 0.19564, + "grad_norm": 2.3125, + "grad_norm_var": 0.02415949503580729, + "learning_rate": 0.0001, + "loss": 4.7778, + "loss/crossentropy": 2.1272310614585876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2318389192223549, + "step": 9782 + }, + { + "epoch": 0.19568, + "grad_norm": 1.9453125, + "grad_norm_var": 0.025655110677083332, + "learning_rate": 0.0001, + "loss": 4.1232, + "loss/crossentropy": 1.8953965306282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213637076318264, + "step": 9784 + }, + { + "epoch": 0.19572, + "grad_norm": 2.0625, + "grad_norm_var": 0.0261871337890625, + "learning_rate": 0.0001, + "loss": 4.3459, + "loss/crossentropy": 2.0738734006881714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100028172135353, + "step": 9786 + }, + { + "epoch": 0.19576, + "grad_norm": 2.0, + "grad_norm_var": 0.027378082275390625, + "learning_rate": 0.0001, + "loss": 4.2599, + "loss/crossentropy": 1.9454593658447266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557409197092056, + "step": 9788 + }, + { + "epoch": 0.1958, + "grad_norm": 2.015625, + "grad_norm_var": 0.026569620768229166, + "learning_rate": 0.0001, + "loss": 4.1655, + "loss/crossentropy": 2.101949095726013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22479471564292908, + "step": 9790 + }, + { + "epoch": 0.19584, + "grad_norm": 2.25, + "grad_norm_var": 0.025935872395833334, + "learning_rate": 0.0001, + "loss": 4.3841, + "loss/crossentropy": 1.9524416327476501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574256777763367, + "step": 9792 + }, + { + "epoch": 0.19588, + "grad_norm": 2.296875, + "grad_norm_var": 0.06084391276041667, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 1.9490719437599182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19411193579435349, + "step": 9794 + }, + { + "epoch": 0.19592, + "grad_norm": 2.046875, + "grad_norm_var": 0.050675455729166666, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.095793664455414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078093349933624, + "step": 9796 + }, + { + "epoch": 0.19596, + "grad_norm": 2.015625, + "grad_norm_var": 0.0508056640625, + "learning_rate": 0.0001, + "loss": 4.0828, + "loss/crossentropy": 2.032066822052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181444078683853, + "step": 9798 + }, + { + "epoch": 0.196, + "grad_norm": 2.140625, + "grad_norm_var": 0.048378245035807295, + "learning_rate": 0.0001, + "loss": 4.0019, + "loss/crossentropy": 2.1378380060195923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224056214094162, + "step": 9800 + }, + { + "epoch": 0.19604, + "grad_norm": 2.1875, + "grad_norm_var": 0.04784520467122396, + "learning_rate": 0.0001, + "loss": 4.2932, + "loss/crossentropy": 2.120614767074585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104548141360283, + "step": 9802 + }, + { + "epoch": 0.19608, + "grad_norm": 1.8984375, + "grad_norm_var": 0.048954010009765625, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 1.8837141394615173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19755827635526657, + "step": 9804 + }, + { + "epoch": 0.19612, + "grad_norm": 2.03125, + "grad_norm_var": 0.04793675740559896, + "learning_rate": 0.0001, + "loss": 4.1645, + "loss/crossentropy": 1.9325945973396301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005266472697258, + "step": 9806 + }, + { + "epoch": 0.19616, + "grad_norm": 2.046875, + "grad_norm_var": 0.04839045206705729, + "learning_rate": 0.0001, + "loss": 4.2211, + "loss/crossentropy": 1.789370834827423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20060084015130997, + "step": 9808 + }, + { + "epoch": 0.1962, + "grad_norm": 1.96875, + "grad_norm_var": 0.01709162394205729, + "learning_rate": 0.0001, + "loss": 4.2463, + "loss/crossentropy": 1.9807876348495483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805762499570847, + "step": 9810 + }, + { + "epoch": 0.19624, + "grad_norm": 1.9375, + "grad_norm_var": 0.018888092041015624, + "learning_rate": 0.0001, + "loss": 4.1339, + "loss/crossentropy": 2.210070848464966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23082506656646729, + "step": 9812 + }, + { + "epoch": 0.19628, + "grad_norm": 2.0625, + "grad_norm_var": 0.01693903605143229, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.8644117712974548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20300551503896713, + "step": 9814 + }, + { + "epoch": 0.19632, + "grad_norm": 2.125, + "grad_norm_var": 0.01634496053059896, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 1.6804233193397522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20438820868730545, + "step": 9816 + }, + { + "epoch": 0.19636, + "grad_norm": 2.09375, + "grad_norm_var": 0.01757990519205729, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.2513452768325806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24517202377319336, + "step": 9818 + }, + { + "epoch": 0.1964, + "grad_norm": 2.203125, + "grad_norm_var": 0.01683527628580729, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.3133161067962646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22146832942962646, + "step": 9820 + }, + { + "epoch": 0.19644, + "grad_norm": 2.1875, + "grad_norm_var": 0.016949208577473958, + "learning_rate": 0.0001, + "loss": 4.5003, + "loss/crossentropy": 2.3226611614227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2327018678188324, + "step": 9822 + }, + { + "epoch": 0.19648, + "grad_norm": 2.078125, + "grad_norm_var": 0.016228993733723957, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.234626054763794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23657850921154022, + "step": 9824 + }, + { + "epoch": 0.19652, + "grad_norm": 2.25, + "grad_norm_var": 0.008941396077473959, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.3478844165802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24796272069215775, + "step": 9826 + }, + { + "epoch": 0.19656, + "grad_norm": 2.171875, + "grad_norm_var": 0.006982167561848958, + "learning_rate": 0.0001, + "loss": 4.3678, + "loss/crossentropy": 2.094748795032501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21118396520614624, + "step": 9828 + }, + { + "epoch": 0.1966, + "grad_norm": 2.140625, + "grad_norm_var": 0.007289377848307291, + "learning_rate": 0.0001, + "loss": 4.2916, + "loss/crossentropy": 2.166300058364868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773284673690796, + "step": 9830 + }, + { + "epoch": 0.19664, + "grad_norm": 2.09375, + "grad_norm_var": 0.007212066650390625, + "learning_rate": 0.0001, + "loss": 4.5253, + "loss/crossentropy": 2.198317289352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22636590898036957, + "step": 9832 + }, + { + "epoch": 0.19668, + "grad_norm": 2.1875, + "grad_norm_var": 0.006278228759765625, + "learning_rate": 0.0001, + "loss": 4.3405, + "loss/crossentropy": 2.3081597089767456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231675922870636, + "step": 9834 + }, + { + "epoch": 0.19672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005206044514973958, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.048017203807831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089948922395706, + "step": 9836 + }, + { + "epoch": 0.19676, + "grad_norm": 2.125, + "grad_norm_var": 0.01768773396809896, + "learning_rate": 0.0001, + "loss": 4.0515, + "loss/crossentropy": 1.886117160320282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21702590584754944, + "step": 9838 + }, + { + "epoch": 0.1968, + "grad_norm": 2.0625, + "grad_norm_var": 0.017895253499348958, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 1.902605414390564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976041465997696, + "step": 9840 + }, + { + "epoch": 0.19684, + "grad_norm": 4.28125, + "grad_norm_var": 0.309179433186849, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 1.7959995865821838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023170217871666, + "step": 9842 + }, + { + "epoch": 0.19688, + "grad_norm": 2.15625, + "grad_norm_var": 0.3066993713378906, + "learning_rate": 0.0001, + "loss": 4.5535, + "loss/crossentropy": 2.1481738090515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133169323205948, + "step": 9844 + }, + { + "epoch": 0.19692, + "grad_norm": 2.03125, + "grad_norm_var": 0.30752741495768227, + "learning_rate": 0.0001, + "loss": 4.0269, + "loss/crossentropy": 1.9328197240829468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20346572250127792, + "step": 9846 + }, + { + "epoch": 0.19696, + "grad_norm": 2.015625, + "grad_norm_var": 0.31202367146809895, + "learning_rate": 0.0001, + "loss": 4.0299, + "loss/crossentropy": 2.096457004547119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21341010928153992, + "step": 9848 + }, + { + "epoch": 0.197, + "grad_norm": 2.03125, + "grad_norm_var": 0.3146522521972656, + "learning_rate": 0.0001, + "loss": 4.3105, + "loss/crossentropy": 1.9698969721794128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855439990758896, + "step": 9850 + }, + { + "epoch": 0.19704, + "grad_norm": 2.234375, + "grad_norm_var": 0.31038004557291665, + "learning_rate": 0.0001, + "loss": 4.2165, + "loss/crossentropy": 1.987346351146698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23657751083374023, + "step": 9852 + }, + { + "epoch": 0.19708, + "grad_norm": 1.8984375, + "grad_norm_var": 0.3109169006347656, + "learning_rate": 0.0001, + "loss": 4.0346, + "loss/crossentropy": 1.9535572528839111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21131790429353714, + "step": 9854 + }, + { + "epoch": 0.19712, + "grad_norm": 2.09375, + "grad_norm_var": 0.3090349833170573, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 2.08541601896286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2290833741426468, + "step": 9856 + }, + { + "epoch": 0.19716, + "grad_norm": 2.28125, + "grad_norm_var": 0.016155751546223958, + "learning_rate": 0.0001, + "loss": 4.2774, + "loss/crossentropy": 2.1930192708969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23072391748428345, + "step": 9858 + }, + { + "epoch": 0.1972, + "grad_norm": 2.359375, + "grad_norm_var": 0.016658274332682292, + "learning_rate": 0.0001, + "loss": 4.3923, + "loss/crossentropy": 1.8369358777999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20687467604875565, + "step": 9860 + }, + { + "epoch": 0.19724, + "grad_norm": 2.0, + "grad_norm_var": 0.017380523681640624, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.6707186102867126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1808091625571251, + "step": 9862 + }, + { + "epoch": 0.19728, + "grad_norm": 1.984375, + "grad_norm_var": 0.017651112874348958, + "learning_rate": 0.0001, + "loss": 4.1184, + "loss/crossentropy": 1.8352991342544556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19169726222753525, + "step": 9864 + }, + { + "epoch": 0.19732, + "grad_norm": 2.734375, + "grad_norm_var": 0.042909495035807294, + "learning_rate": 0.0001, + "loss": 4.2857, + "loss/crossentropy": 1.9427489638328552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20076656341552734, + "step": 9866 + }, + { + "epoch": 0.19736, + "grad_norm": 2.078125, + "grad_norm_var": 0.04237035115559896, + "learning_rate": 0.0001, + "loss": 4.3725, + "loss/crossentropy": 2.2190250158309937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24385111033916473, + "step": 9868 + }, + { + "epoch": 0.1974, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04182510375976563, + "learning_rate": 0.0001, + "loss": 4.2192, + "loss/crossentropy": 2.0958147644996643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23264098167419434, + "step": 9870 + }, + { + "epoch": 0.19744, + "grad_norm": 2.25, + "grad_norm_var": 0.04228897094726562, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.542472720146179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23458892852067947, + "step": 9872 + }, + { + "epoch": 0.19748, + "grad_norm": 2.21875, + "grad_norm_var": 0.03728408813476562, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 2.1562893390655518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243279591202736, + "step": 9874 + }, + { + "epoch": 0.19752, + "grad_norm": 2.21875, + "grad_norm_var": 0.034395090738932294, + "learning_rate": 0.0001, + "loss": 4.0114, + "loss/crossentropy": 1.9851951599121094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2369420975446701, + "step": 9876 + }, + { + "epoch": 0.19756, + "grad_norm": 2.03125, + "grad_norm_var": 0.032714589436848955, + "learning_rate": 0.0001, + "loss": 4.3017, + "loss/crossentropy": 1.8751549124717712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777788758277893, + "step": 9878 + }, + { + "epoch": 0.1976, + "grad_norm": 2.125, + "grad_norm_var": 0.0318267822265625, + "learning_rate": 0.0001, + "loss": 4.1328, + "loss/crossentropy": 2.010055720806122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21204733848571777, + "step": 9880 + }, + { + "epoch": 0.19764, + "grad_norm": 2.0625, + "grad_norm_var": 0.008942667643229167, + "learning_rate": 0.0001, + "loss": 4.1122, + "loss/crossentropy": 2.04589307308197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21330490708351135, + "step": 9882 + }, + { + "epoch": 0.19768, + "grad_norm": 2.125, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.1663, + "loss/crossentropy": 1.9441133737564087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22589778900146484, + "step": 9884 + }, + { + "epoch": 0.19772, + "grad_norm": 2.25, + "grad_norm_var": 0.008296457926432292, + "learning_rate": 0.0001, + "loss": 4.4239, + "loss/crossentropy": 2.305395483970642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24267761409282684, + "step": 9886 + }, + { + "epoch": 0.19776, + "grad_norm": 2.03125, + "grad_norm_var": 0.007269032796223958, + "learning_rate": 0.0001, + "loss": 4.386, + "loss/crossentropy": 2.096014082431793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21358592063188553, + "step": 9888 + }, + { + "epoch": 0.1978, + "grad_norm": 2.25, + "grad_norm_var": 0.008149973551432292, + "learning_rate": 0.0001, + "loss": 4.2315, + "loss/crossentropy": 2.1115033626556396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22994756698608398, + "step": 9890 + }, + { + "epoch": 0.19784, + "grad_norm": 1.9375, + "grad_norm_var": 0.008654530843098958, + "learning_rate": 0.0001, + "loss": 4.1419, + "loss/crossentropy": 1.6122692227363586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897209882736206, + "step": 9892 + }, + { + "epoch": 0.19788, + "grad_norm": 2.0625, + "grad_norm_var": 0.012933095296223959, + "learning_rate": 0.0001, + "loss": 4.1679, + "loss/crossentropy": 1.4790136218070984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17421242594718933, + "step": 9894 + }, + { + "epoch": 0.19792, + "grad_norm": 2.09375, + "grad_norm_var": 0.013700358072916667, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 1.8394885063171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20840360969305038, + "step": 9896 + }, + { + "epoch": 0.19796, + "grad_norm": 2.03125, + "grad_norm_var": 0.0135894775390625, + "learning_rate": 0.0001, + "loss": 4.3993, + "loss/crossentropy": 2.152444541454315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23834071308374405, + "step": 9898 + }, + { + "epoch": 0.198, + "grad_norm": 2.1875, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.5268, + "loss/crossentropy": 2.367082357406616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409543931484222, + "step": 9900 + }, + { + "epoch": 0.19804, + "grad_norm": 2.125, + "grad_norm_var": 0.011799112955729166, + "learning_rate": 0.0001, + "loss": 4.4599, + "loss/crossentropy": 2.136048436164856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21797242760658264, + "step": 9902 + }, + { + "epoch": 0.19808, + "grad_norm": 2.03125, + "grad_norm_var": 0.011945597330729167, + "learning_rate": 0.0001, + "loss": 4.2262, + "loss/crossentropy": 2.035883128643036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215388223528862, + "step": 9904 + }, + { + "epoch": 0.19812, + "grad_norm": 2.203125, + "grad_norm_var": 0.011970774332682291, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 2.0339369773864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085876688361168, + "step": 9906 + }, + { + "epoch": 0.19816, + "grad_norm": 2.0625, + "grad_norm_var": 0.009655507405598958, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 1.829429566860199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20085029304027557, + "step": 9908 + }, + { + "epoch": 0.1982, + "grad_norm": 1.984375, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 2.1721774339675903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118668630719185, + "step": 9910 + }, + { + "epoch": 0.19824, + "grad_norm": 2.265625, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.3511, + "loss/crossentropy": 1.958261251449585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22304313629865646, + "step": 9912 + }, + { + "epoch": 0.19828, + "grad_norm": 2.21875, + "grad_norm_var": 0.009870402018229167, + "learning_rate": 0.0001, + "loss": 4.5819, + "loss/crossentropy": 2.4651763439178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435118407011032, + "step": 9914 + }, + { + "epoch": 0.19832, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012532297770182292, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 2.173910617828369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21142201125621796, + "step": 9916 + }, + { + "epoch": 0.19836, + "grad_norm": 2.078125, + "grad_norm_var": 0.013239542643229166, + "learning_rate": 0.0001, + "loss": 3.9283, + "loss/crossentropy": 1.9559763073921204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19051063805818558, + "step": 9918 + }, + { + "epoch": 0.1984, + "grad_norm": 2.265625, + "grad_norm_var": 0.015895334879557292, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.238003969192505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2338731735944748, + "step": 9920 + }, + { + "epoch": 0.19844, + "grad_norm": 2.140625, + "grad_norm_var": 0.015233357747395834, + "learning_rate": 0.0001, + "loss": 4.4624, + "loss/crossentropy": 2.269726276397705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430368393659592, + "step": 9922 + }, + { + "epoch": 0.19848, + "grad_norm": 2.109375, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 4.436, + "loss/crossentropy": 2.351833701133728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24469508230686188, + "step": 9924 + }, + { + "epoch": 0.19852, + "grad_norm": 2.15625, + "grad_norm_var": 0.013398996988932292, + "learning_rate": 0.0001, + "loss": 4.3833, + "loss/crossentropy": 2.0600146055221558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375566720962524, + "step": 9926 + }, + { + "epoch": 0.19856, + "grad_norm": 2.015625, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.2957, + "loss/crossentropy": 2.1543694734573364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916736871004105, + "step": 9928 + }, + { + "epoch": 0.1986, + "grad_norm": 2.015625, + "grad_norm_var": 0.010782623291015625, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 1.8952317833900452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216978445649147, + "step": 9930 + }, + { + "epoch": 0.19864, + "grad_norm": 2.03125, + "grad_norm_var": 0.008695475260416667, + "learning_rate": 0.0001, + "loss": 4.0542, + "loss/crossentropy": 2.102243661880493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22314947098493576, + "step": 9932 + }, + { + "epoch": 0.19868, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0072021484375, + "learning_rate": 0.0001, + "loss": 4.1653, + "loss/crossentropy": 1.9036884307861328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19729873538017273, + "step": 9934 + }, + { + "epoch": 0.19872, + "grad_norm": 2.0, + "grad_norm_var": 0.006459299723307292, + "learning_rate": 0.0001, + "loss": 4.17, + "loss/crossentropy": 1.823796033859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18089265376329422, + "step": 9936 + }, + { + "epoch": 0.19876, + "grad_norm": 2.0, + "grad_norm_var": 0.006302642822265625, + "learning_rate": 0.0001, + "loss": 3.8742, + "loss/crossentropy": 2.055707633495331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20620816200971603, + "step": 9938 + }, + { + "epoch": 0.1988, + "grad_norm": 2.109375, + "grad_norm_var": 0.0055010477701822914, + "learning_rate": 0.0001, + "loss": 4.256, + "loss/crossentropy": 2.0490049719810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21709006279706955, + "step": 9940 + }, + { + "epoch": 0.19884, + "grad_norm": 2.0625, + "grad_norm_var": 0.005891672770182292, + "learning_rate": 0.0001, + "loss": 4.2733, + "loss/crossentropy": 2.164198637008667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2335330694913864, + "step": 9942 + }, + { + "epoch": 0.19888, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006941731770833333, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 1.9218478202819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945241093635559, + "step": 9944 + }, + { + "epoch": 0.19892, + "grad_norm": 2.109375, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 3.8591, + "loss/crossentropy": 1.9456552267074585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599794387817383, + "step": 9946 + }, + { + "epoch": 0.19896, + "grad_norm": 1.96875, + "grad_norm_var": 0.007749176025390625, + "learning_rate": 0.0001, + "loss": 4.0385, + "loss/crossentropy": 2.0472273230552673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920252054929733, + "step": 9948 + }, + { + "epoch": 0.199, + "grad_norm": 1.96875, + "grad_norm_var": 0.007860310872395833, + "learning_rate": 0.0001, + "loss": 4.308, + "loss/crossentropy": 2.2252047061920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360471710562706, + "step": 9950 + }, + { + "epoch": 0.19904, + "grad_norm": 2.09375, + "grad_norm_var": 0.005537923177083333, + "learning_rate": 0.0001, + "loss": 4.0828, + "loss/crossentropy": 1.646431565284729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1692601889371872, + "step": 9952 + }, + { + "epoch": 0.19908, + "grad_norm": 2.046875, + "grad_norm_var": 0.0054443359375, + "learning_rate": 0.0001, + "loss": 3.8818, + "loss/crossentropy": 2.0114784836769104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070793479681015, + "step": 9954 + }, + { + "epoch": 0.19912, + "grad_norm": 1.953125, + "grad_norm_var": 0.00677490234375, + "learning_rate": 0.0001, + "loss": 4.324, + "loss/crossentropy": 2.001778781414032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21009314060211182, + "step": 9956 + }, + { + "epoch": 0.19916, + "grad_norm": 2.0, + "grad_norm_var": 0.006037394205729167, + "learning_rate": 0.0001, + "loss": 4.3493, + "loss/crossentropy": 2.1330565214157104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22442802786827087, + "step": 9958 + }, + { + "epoch": 0.1992, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005635579427083333, + "learning_rate": 0.0001, + "loss": 4.193, + "loss/crossentropy": 1.9146793484687805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21145610511302948, + "step": 9960 + }, + { + "epoch": 0.19924, + "grad_norm": 2.015625, + "grad_norm_var": 0.004705556233723958, + "learning_rate": 0.0001, + "loss": 4.3262, + "loss/crossentropy": 2.5224483013153076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23354032635688782, + "step": 9962 + }, + { + "epoch": 0.19928, + "grad_norm": 2.109375, + "grad_norm_var": 0.004552968343098958, + "learning_rate": 0.0001, + "loss": 4.3663, + "loss/crossentropy": 2.245160937309265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21848157793283463, + "step": 9964 + }, + { + "epoch": 0.19932, + "grad_norm": 2.15625, + "grad_norm_var": 0.004622141520182292, + "learning_rate": 0.0001, + "loss": 4.3506, + "loss/crossentropy": 2.122299015522003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21853189170360565, + "step": 9966 + }, + { + "epoch": 0.19936, + "grad_norm": 2.171875, + "grad_norm_var": 0.005716705322265625, + "learning_rate": 0.0001, + "loss": 4.4524, + "loss/crossentropy": 2.31084668636322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23691194504499435, + "step": 9968 + }, + { + "epoch": 0.1994, + "grad_norm": 2.25, + "grad_norm_var": 0.007834625244140626, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2242307662963867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21957046538591385, + "step": 9970 + }, + { + "epoch": 0.19944, + "grad_norm": 2.078125, + "grad_norm_var": 0.007063547770182292, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 1.792852759361267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19996580481529236, + "step": 9972 + }, + { + "epoch": 0.19948, + "grad_norm": 2.09375, + "grad_norm_var": 0.0063250223795572914, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 1.9593411087989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21858739852905273, + "step": 9974 + }, + { + "epoch": 0.19952, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0067827860514322914, + "learning_rate": 0.0001, + "loss": 4.3067, + "loss/crossentropy": 2.259618401527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22094043344259262, + "step": 9976 + }, + { + "epoch": 0.19956, + "grad_norm": 2.140625, + "grad_norm_var": 0.0063168843587239586, + "learning_rate": 0.0001, + "loss": 4.373, + "loss/crossentropy": 2.31876802444458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025877982378006, + "step": 9978 + }, + { + "epoch": 0.1996, + "grad_norm": 2.0625, + "grad_norm_var": 0.006109364827473958, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 1.7625555396080017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21313250809907913, + "step": 9980 + }, + { + "epoch": 0.19964, + "grad_norm": 2.265625, + "grad_norm_var": 0.007342274983723958, + "learning_rate": 0.0001, + "loss": 4.4628, + "loss/crossentropy": 2.19295072555542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23694515973329544, + "step": 9982 + }, + { + "epoch": 0.19968, + "grad_norm": 2.0625, + "grad_norm_var": 0.008739217122395834, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 2.4073877334594727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22179137915372849, + "step": 9984 + }, + { + "epoch": 0.19972, + "grad_norm": 2.015625, + "grad_norm_var": 0.008125813802083333, + "learning_rate": 0.0001, + "loss": 3.9804, + "loss/crossentropy": 1.8942558765411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20588286221027374, + "step": 9986 + }, + { + "epoch": 0.19976, + "grad_norm": 1.9375, + "grad_norm_var": 0.012188466389973958, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 2.220746397972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144280970096588, + "step": 9988 + }, + { + "epoch": 0.1998, + "grad_norm": 2.125, + "grad_norm_var": 0.01260986328125, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 1.9511706233024597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998429372906685, + "step": 9990 + }, + { + "epoch": 0.19984, + "grad_norm": 2.125, + "grad_norm_var": 0.012247467041015625, + "learning_rate": 0.0001, + "loss": 4.143, + "loss/crossentropy": 2.249726891517639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24011528491973877, + "step": 9992 + }, + { + "epoch": 0.19988, + "grad_norm": 2.6875, + "grad_norm_var": 0.03792292277018229, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 1.957375943660736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24464774131774902, + "step": 9994 + }, + { + "epoch": 0.19992, + "grad_norm": 2.21875, + "grad_norm_var": 0.039033762613932294, + "learning_rate": 0.0001, + "loss": 4.5492, + "loss/crossentropy": 2.265984058380127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24291887879371643, + "step": 9996 + }, + { + "epoch": 0.19996, + "grad_norm": 2.046875, + "grad_norm_var": 0.03920873006184896, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 1.8064668774604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21571090072393417, + "step": 9998 + }, + { + "epoch": 0.2, + "grad_norm": 2.234375, + "grad_norm_var": 0.03875732421875, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 1.9072380661964417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19980185478925705, + "step": 10000 + }, + { + "epoch": 0.20004, + "grad_norm": 2.15625, + "grad_norm_var": 0.036641438802083336, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 2.074933707714081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161328792572021, + "step": 10002 + }, + { + "epoch": 0.20008, + "grad_norm": 2.015625, + "grad_norm_var": 0.027581532796223957, + "learning_rate": 0.0001, + "loss": 4.0447, + "loss/crossentropy": 1.9344687461853027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825423538684845, + "step": 10004 + }, + { + "epoch": 0.20012, + "grad_norm": 3.640625, + "grad_norm_var": 0.162158203125, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 1.866003930568695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22012518346309662, + "step": 10006 + }, + { + "epoch": 0.20016, + "grad_norm": 2.0, + "grad_norm_var": 0.162158203125, + "learning_rate": 0.0001, + "loss": 4.3517, + "loss/crossentropy": 2.145058751106262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22737383097410202, + "step": 10008 + }, + { + "epoch": 0.2002, + "grad_norm": 2.09375, + "grad_norm_var": 0.15038960774739582, + "learning_rate": 0.0001, + "loss": 3.9755, + "loss/crossentropy": 1.7828176617622375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22082456946372986, + "step": 10010 + }, + { + "epoch": 0.20024, + "grad_norm": 2.109375, + "grad_norm_var": 0.1537994384765625, + "learning_rate": 0.0001, + "loss": 4.3143, + "loss/crossentropy": 2.1222537755966187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176828756928444, + "step": 10012 + }, + { + "epoch": 0.20028, + "grad_norm": 2.203125, + "grad_norm_var": 0.15230712890625, + "learning_rate": 0.0001, + "loss": 4.53, + "loss/crossentropy": 2.119267463684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24002012610435486, + "step": 10014 + }, + { + "epoch": 0.20032, + "grad_norm": 2.078125, + "grad_norm_var": 0.15458577473958332, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.198129415512085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23767977952957153, + "step": 10016 + }, + { + "epoch": 0.20036, + "grad_norm": 2.125, + "grad_norm_var": 0.15608317057291668, + "learning_rate": 0.0001, + "loss": 4.0228, + "loss/crossentropy": 1.7466872334480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19916100800037384, + "step": 10018 + }, + { + "epoch": 0.2004, + "grad_norm": 1.9296875, + "grad_norm_var": 0.15812352498372395, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 1.9834936261177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891954466700554, + "step": 10020 + }, + { + "epoch": 0.20044, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005840810139973959, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 1.8997412323951721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624030590057373, + "step": 10022 + }, + { + "epoch": 0.20048, + "grad_norm": 2.1875, + "grad_norm_var": 0.0065915425618489586, + "learning_rate": 0.0001, + "loss": 4.2531, + "loss/crossentropy": 2.0051563382148743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069488987326622, + "step": 10024 + }, + { + "epoch": 0.20052, + "grad_norm": 2.265625, + "grad_norm_var": 0.008107248942057292, + "learning_rate": 0.0001, + "loss": 4.4215, + "loss/crossentropy": 1.8662462830543518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053019255399704, + "step": 10026 + }, + { + "epoch": 0.20056, + "grad_norm": 2.171875, + "grad_norm_var": 0.02585627237955729, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.1043936014175415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22167576104402542, + "step": 10028 + }, + { + "epoch": 0.2006, + "grad_norm": 5.78125, + "grad_norm_var": 0.859185536702474, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.1248152256011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22400003671646118, + "step": 10030 + }, + { + "epoch": 0.20064, + "grad_norm": 2.078125, + "grad_norm_var": 0.852441151936849, + "learning_rate": 0.0001, + "loss": 4.1971, + "loss/crossentropy": 2.326894521713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24597454071044922, + "step": 10032 + }, + { + "epoch": 0.20068, + "grad_norm": 2.203125, + "grad_norm_var": 0.8413164774576823, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 1.8590435981750488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20270948112010956, + "step": 10034 + }, + { + "epoch": 0.20072, + "grad_norm": 2.0, + "grad_norm_var": 0.82972412109375, + "learning_rate": 0.0001, + "loss": 4.5098, + "loss/crossentropy": 2.0383604168891907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21504472196102142, + "step": 10036 + }, + { + "epoch": 0.20076, + "grad_norm": 2.078125, + "grad_norm_var": 0.81461181640625, + "learning_rate": 0.0001, + "loss": 4.599, + "loss/crossentropy": 2.42622447013855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143569439649582, + "step": 10038 + }, + { + "epoch": 0.2008, + "grad_norm": 2.109375, + "grad_norm_var": 0.825640614827474, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.0825703144073486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21215181052684784, + "step": 10040 + }, + { + "epoch": 0.20084, + "grad_norm": 2.140625, + "grad_norm_var": 0.8326515197753906, + "learning_rate": 0.0001, + "loss": 3.9615, + "loss/crossentropy": 2.1682112216949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538567870855331, + "step": 10042 + }, + { + "epoch": 0.20088, + "grad_norm": 1.984375, + "grad_norm_var": 0.8466957092285157, + "learning_rate": 0.0001, + "loss": 4.2244, + "loss/crossentropy": 2.0161439180374146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078374981880188, + "step": 10044 + }, + { + "epoch": 0.20092, + "grad_norm": 2.078125, + "grad_norm_var": 0.018143463134765624, + "learning_rate": 0.0001, + "loss": 4.5491, + "loss/crossentropy": 2.2262184619903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2855340391397476, + "step": 10046 + }, + { + "epoch": 0.20096, + "grad_norm": 2.203125, + "grad_norm_var": 0.011321767171223959, + "learning_rate": 0.0001, + "loss": 4.5078, + "loss/crossentropy": 2.454360246658325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287754938006401, + "step": 10048 + }, + { + "epoch": 0.201, + "grad_norm": 2.015625, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 4.1146, + "loss/crossentropy": 1.970844566822052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197579264640808, + "step": 10050 + }, + { + "epoch": 0.20104, + "grad_norm": 2.03125, + "grad_norm_var": 0.006211090087890625, + "learning_rate": 0.0001, + "loss": 4.3573, + "loss/crossentropy": 2.0942559242248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21927295625209808, + "step": 10052 + }, + { + "epoch": 0.20108, + "grad_norm": 2.0, + "grad_norm_var": 0.005771636962890625, + "learning_rate": 0.0001, + "loss": 4.036, + "loss/crossentropy": 1.868508517742157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838754191994667, + "step": 10054 + }, + { + "epoch": 0.20112, + "grad_norm": 2.15625, + "grad_norm_var": 0.0058095296223958336, + "learning_rate": 0.0001, + "loss": 4.2101, + "loss/crossentropy": 2.028561532497406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21768346428871155, + "step": 10056 + }, + { + "epoch": 0.20116, + "grad_norm": 1.9375, + "grad_norm_var": 0.006669108072916667, + "learning_rate": 0.0001, + "loss": 3.9803, + "loss/crossentropy": 2.3005030155181885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23332027345895767, + "step": 10058 + }, + { + "epoch": 0.2012, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 1.7793474793434143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18733646720647812, + "step": 10060 + }, + { + "epoch": 0.20124, + "grad_norm": 2.015625, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.0197, + "loss/crossentropy": 2.0612844228744507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21674886345863342, + "step": 10062 + }, + { + "epoch": 0.20128, + "grad_norm": 2.078125, + "grad_norm_var": 0.004500071207682292, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 2.3219568729400635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20989079773426056, + "step": 10064 + }, + { + "epoch": 0.20132, + "grad_norm": 2.078125, + "grad_norm_var": 0.004659016927083333, + "learning_rate": 0.0001, + "loss": 4.0959, + "loss/crossentropy": 2.0941001176834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19992788136005402, + "step": 10066 + }, + { + "epoch": 0.20136, + "grad_norm": 2.09375, + "grad_norm_var": 0.0038330078125, + "learning_rate": 0.0001, + "loss": 4.053, + "loss/crossentropy": 1.7448341250419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108384072780609, + "step": 10068 + }, + { + "epoch": 0.2014, + "grad_norm": 2.0625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 2.0475903749465942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875742495059967, + "step": 10070 + }, + { + "epoch": 0.20144, + "grad_norm": 2.140625, + "grad_norm_var": 0.0499755859375, + "learning_rate": 0.0001, + "loss": 4.1014, + "loss/crossentropy": 2.0624433755874634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20577051490545273, + "step": 10072 + }, + { + "epoch": 0.20148, + "grad_norm": 2.03125, + "grad_norm_var": 0.04807840983072917, + "learning_rate": 0.0001, + "loss": 4.3612, + "loss/crossentropy": 2.194978952407837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964048266410828, + "step": 10074 + }, + { + "epoch": 0.20152, + "grad_norm": 2.0625, + "grad_norm_var": 0.04691162109375, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.011984169483185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22259068489074707, + "step": 10076 + }, + { + "epoch": 0.20156, + "grad_norm": 2.203125, + "grad_norm_var": 0.0458740234375, + "learning_rate": 0.0001, + "loss": 4.3539, + "loss/crossentropy": 2.098285675048828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232050970196724, + "step": 10078 + }, + { + "epoch": 0.2016, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04423726399739583, + "learning_rate": 0.0001, + "loss": 4.2772, + "loss/crossentropy": 2.313677191734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224883735179901, + "step": 10080 + }, + { + "epoch": 0.20164, + "grad_norm": 1.953125, + "grad_norm_var": 0.04454523722330729, + "learning_rate": 0.0001, + "loss": 4.1523, + "loss/crossentropy": 2.3712470531463623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23282987624406815, + "step": 10082 + }, + { + "epoch": 0.20168, + "grad_norm": 2.125, + "grad_norm_var": 0.043342844645182295, + "learning_rate": 0.0001, + "loss": 4.3173, + "loss/crossentropy": 2.204525947570801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2384341061115265, + "step": 10084 + }, + { + "epoch": 0.20172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 3.9497, + "loss/crossentropy": 1.8102558851242065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17472535371780396, + "step": 10086 + }, + { + "epoch": 0.20176, + "grad_norm": 2.046875, + "grad_norm_var": 0.005711873372395833, + "learning_rate": 0.0001, + "loss": 4.2119, + "loss/crossentropy": 2.041890263557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201669424772263, + "step": 10088 + }, + { + "epoch": 0.2018, + "grad_norm": 2.125, + "grad_norm_var": 0.006026204427083333, + "learning_rate": 0.0001, + "loss": 4.3869, + "loss/crossentropy": 1.9904854893684387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23453570157289505, + "step": 10090 + }, + { + "epoch": 0.20184, + "grad_norm": 2.0625, + "grad_norm_var": 0.00716552734375, + "learning_rate": 0.0001, + "loss": 4.4324, + "loss/crossentropy": 2.199320912361145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087062150239944, + "step": 10092 + }, + { + "epoch": 0.20188, + "grad_norm": 2.203125, + "grad_norm_var": 0.0073150634765625, + "learning_rate": 0.0001, + "loss": 4.392, + "loss/crossentropy": 2.321953535079956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2463374137878418, + "step": 10094 + }, + { + "epoch": 0.20192, + "grad_norm": 2.046875, + "grad_norm_var": 0.006076812744140625, + "learning_rate": 0.0001, + "loss": 4.3276, + "loss/crossentropy": 2.1109840869903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23713566362857819, + "step": 10096 + }, + { + "epoch": 0.20196, + "grad_norm": 1.9375, + "grad_norm_var": 0.006322987874348958, + "learning_rate": 0.0001, + "loss": 3.9339, + "loss/crossentropy": 1.9126858711242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20392102003097534, + "step": 10098 + }, + { + "epoch": 0.202, + "grad_norm": 2.09375, + "grad_norm_var": 0.006268056233723959, + "learning_rate": 0.0001, + "loss": 4.2902, + "loss/crossentropy": 2.073318660259247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20465004444122314, + "step": 10100 + }, + { + "epoch": 0.20204, + "grad_norm": 2.390625, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 4.1883, + "loss/crossentropy": 1.7532709836959839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591036558151245, + "step": 10102 + }, + { + "epoch": 0.20208, + "grad_norm": 2.21875, + "grad_norm_var": 0.010660807291666666, + "learning_rate": 0.0001, + "loss": 4.3088, + "loss/crossentropy": 2.1874141693115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227812297642231, + "step": 10104 + }, + { + "epoch": 0.20212, + "grad_norm": 2.015625, + "grad_norm_var": 0.014095052083333334, + "learning_rate": 0.0001, + "loss": 4.4738, + "loss/crossentropy": 2.522923469543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104881286621094, + "step": 10106 + }, + { + "epoch": 0.20216, + "grad_norm": 2.21875, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 4.2823, + "loss/crossentropy": 1.9359918236732483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19563085585832596, + "step": 10108 + }, + { + "epoch": 0.2022, + "grad_norm": 2.015625, + "grad_norm_var": 0.015315755208333334, + "learning_rate": 0.0001, + "loss": 4.3582, + "loss/crossentropy": 2.390757203102112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24748887866735458, + "step": 10110 + }, + { + "epoch": 0.20224, + "grad_norm": 2.0625, + "grad_norm_var": 0.015013631184895833, + "learning_rate": 0.0001, + "loss": 4.5576, + "loss/crossentropy": 2.419153571128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2228071466088295, + "step": 10112 + }, + { + "epoch": 0.20228, + "grad_norm": 2.03125, + "grad_norm_var": 0.015404256184895833, + "learning_rate": 0.0001, + "loss": 4.0235, + "loss/crossentropy": 1.7460771799087524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19431117177009583, + "step": 10114 + }, + { + "epoch": 0.20232, + "grad_norm": 2.140625, + "grad_norm_var": 0.01754150390625, + "learning_rate": 0.0001, + "loss": 4.601, + "loss/crossentropy": 2.308094024658203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24104833602905273, + "step": 10116 + }, + { + "epoch": 0.20236, + "grad_norm": 2.015625, + "grad_norm_var": 0.013374837239583333, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.0273314118385315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2325449138879776, + "step": 10118 + }, + { + "epoch": 0.2024, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013792928059895833, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.1261476278305054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481933444738388, + "step": 10120 + }, + { + "epoch": 0.20244, + "grad_norm": 2.03125, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 4.478, + "loss/crossentropy": 2.5006214380264282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25569023191928864, + "step": 10122 + }, + { + "epoch": 0.20248, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011775461832682292, + "learning_rate": 0.0001, + "loss": 4.3208, + "loss/crossentropy": 2.2033116817474365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2601129561662674, + "step": 10124 + }, + { + "epoch": 0.20252, + "grad_norm": 2.09375, + "grad_norm_var": 0.011572011311848958, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.2208765745162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958464592695236, + "step": 10126 + }, + { + "epoch": 0.20256, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011848958333333333, + "learning_rate": 0.0001, + "loss": 3.943, + "loss/crossentropy": 1.7752392888069153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697371244430542, + "step": 10128 + }, + { + "epoch": 0.2026, + "grad_norm": 2.0625, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.0529, + "loss/crossentropy": 1.6444379687309265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721824884414673, + "step": 10130 + }, + { + "epoch": 0.20264, + "grad_norm": 2.046875, + "grad_norm_var": 0.007271321614583334, + "learning_rate": 0.0001, + "loss": 4.1796, + "loss/crossentropy": 1.7681297659873962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24296899884939194, + "step": 10132 + }, + { + "epoch": 0.20268, + "grad_norm": 2.015625, + "grad_norm_var": 0.007614898681640625, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 1.8710024952888489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891346201300621, + "step": 10134 + }, + { + "epoch": 0.20272, + "grad_norm": 2.125, + "grad_norm_var": 0.007515462239583334, + "learning_rate": 0.0001, + "loss": 4.0822, + "loss/crossentropy": 2.0258530974388123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22128118574619293, + "step": 10136 + }, + { + "epoch": 0.20276, + "grad_norm": 2.015625, + "grad_norm_var": 1.7479237874348958, + "learning_rate": 0.0001, + "loss": 4.3091, + "loss/crossentropy": 2.3366141319274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2612725794315338, + "step": 10138 + }, + { + "epoch": 0.2028, + "grad_norm": 2.0625, + "grad_norm_var": 1.739208730061849, + "learning_rate": 0.0001, + "loss": 4.357, + "loss/crossentropy": 2.152353823184967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239176332950592, + "step": 10140 + }, + { + "epoch": 0.20284, + "grad_norm": 2.15625, + "grad_norm_var": 1.7346433003743489, + "learning_rate": 0.0001, + "loss": 4.2552, + "loss/crossentropy": 1.8189843893051147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21529949456453323, + "step": 10142 + }, + { + "epoch": 0.20288, + "grad_norm": 2.03125, + "grad_norm_var": 1.7219970703125, + "learning_rate": 0.0001, + "loss": 4.3179, + "loss/crossentropy": 2.308253049850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21748851984739304, + "step": 10144 + }, + { + "epoch": 0.20292, + "grad_norm": 2.015625, + "grad_norm_var": 1.7289377848307292, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 1.953293800354004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948333978652954, + "step": 10146 + }, + { + "epoch": 0.20296, + "grad_norm": 1.953125, + "grad_norm_var": 1.726512654622396, + "learning_rate": 0.0001, + "loss": 3.9573, + "loss/crossentropy": 2.1122325658798218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085581198334694, + "step": 10148 + }, + { + "epoch": 0.203, + "grad_norm": 2.0625, + "grad_norm_var": 1.716387685139974, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.450512409210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23670841753482819, + "step": 10150 + }, + { + "epoch": 0.20304, + "grad_norm": 1.984375, + "grad_norm_var": 1.7187327067057292, + "learning_rate": 0.0001, + "loss": 4.1131, + "loss/crossentropy": 2.3858957290649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23381955921649933, + "step": 10152 + }, + { + "epoch": 0.20308, + "grad_norm": 2.0625, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.2154, + "loss/crossentropy": 2.1214572191238403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22399096935987473, + "step": 10154 + }, + { + "epoch": 0.20312, + "grad_norm": 1.90625, + "grad_norm_var": 0.0082916259765625, + "learning_rate": 0.0001, + "loss": 4.0487, + "loss/crossentropy": 1.9299064874649048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19886164367198944, + "step": 10156 + }, + { + "epoch": 0.20316, + "grad_norm": 2.234375, + "grad_norm_var": 0.009877268473307292, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 1.9582993388175964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859385445713997, + "step": 10158 + }, + { + "epoch": 0.2032, + "grad_norm": 2.15625, + "grad_norm_var": 0.008790842692057292, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.0555814504623413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25231435894966125, + "step": 10160 + }, + { + "epoch": 0.20324, + "grad_norm": 2.109375, + "grad_norm_var": 0.009090169270833334, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.9849395155906677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20529592037200928, + "step": 10162 + }, + { + "epoch": 0.20328, + "grad_norm": 2.125, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.5619, + "loss/crossentropy": 1.9529971480369568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997845619916916, + "step": 10164 + }, + { + "epoch": 0.20332, + "grad_norm": 1.953125, + "grad_norm_var": 0.009330240885416667, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 1.9885541200637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19847019761800766, + "step": 10166 + }, + { + "epoch": 0.20336, + "grad_norm": 2.21875, + "grad_norm_var": 0.010013834635416666, + "learning_rate": 0.0001, + "loss": 4.3431, + "loss/crossentropy": 1.8039852380752563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19332807511091232, + "step": 10168 + }, + { + "epoch": 0.2034, + "grad_norm": 2.125, + "grad_norm_var": 0.01004638671875, + "learning_rate": 0.0001, + "loss": 4.4287, + "loss/crossentropy": 2.1633352041244507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22904948145151138, + "step": 10170 + }, + { + "epoch": 0.20344, + "grad_norm": 2.046875, + "grad_norm_var": 0.0070953369140625, + "learning_rate": 0.0001, + "loss": 4.1148, + "loss/crossentropy": 2.2646039724349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22390951961278915, + "step": 10172 + }, + { + "epoch": 0.20348, + "grad_norm": 2.015625, + "grad_norm_var": 0.005741119384765625, + "learning_rate": 0.0001, + "loss": 4.4114, + "loss/crossentropy": 2.604608416557312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25521157681941986, + "step": 10174 + }, + { + "epoch": 0.20352, + "grad_norm": 2.125, + "grad_norm_var": 0.016257476806640626, + "learning_rate": 0.0001, + "loss": 4.3319, + "loss/crossentropy": 2.286113977432251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23809745162725449, + "step": 10176 + }, + { + "epoch": 0.20356, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01777928670247396, + "learning_rate": 0.0001, + "loss": 3.9951, + "loss/crossentropy": 2.0746694207191467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21103999018669128, + "step": 10178 + }, + { + "epoch": 0.2036, + "grad_norm": 2.0, + "grad_norm_var": 0.018381500244140626, + "learning_rate": 0.0001, + "loss": 3.9892, + "loss/crossentropy": 1.9076440930366516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048335075378418, + "step": 10180 + }, + { + "epoch": 0.20364, + "grad_norm": 2.0, + "grad_norm_var": 0.0178863525390625, + "learning_rate": 0.0001, + "loss": 4.1768, + "loss/crossentropy": 1.8896904587745667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20643991231918335, + "step": 10182 + }, + { + "epoch": 0.20368, + "grad_norm": 2.0625, + "grad_norm_var": 0.016747029622395833, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 2.156657338142395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22853681445121765, + "step": 10184 + }, + { + "epoch": 0.20372, + "grad_norm": 2.203125, + "grad_norm_var": 0.017772420247395834, + "learning_rate": 0.0001, + "loss": 4.6307, + "loss/crossentropy": 2.3767203092575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2422076091170311, + "step": 10186 + }, + { + "epoch": 0.20376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01834894816080729, + "learning_rate": 0.0001, + "loss": 4.3402, + "loss/crossentropy": 2.5728834867477417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912005335092545, + "step": 10188 + }, + { + "epoch": 0.2038, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02005182902018229, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.1542125940322876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19575025141239166, + "step": 10190 + }, + { + "epoch": 0.20384, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004610188802083333, + "learning_rate": 0.0001, + "loss": 3.9114, + "loss/crossentropy": 1.5845852494239807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845107451081276, + "step": 10192 + }, + { + "epoch": 0.20388, + "grad_norm": 2.34375, + "grad_norm_var": 0.010957590738932292, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.1166247129440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2352868989109993, + "step": 10194 + }, + { + "epoch": 0.20392, + "grad_norm": 2.0, + "grad_norm_var": 0.010941314697265624, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 2.0129401683807373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22231722623109818, + "step": 10196 + }, + { + "epoch": 0.20396, + "grad_norm": 2.125, + "grad_norm_var": 0.010595703125, + "learning_rate": 0.0001, + "loss": 3.9709, + "loss/crossentropy": 1.7132073044776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18660317361354828, + "step": 10198 + }, + { + "epoch": 0.204, + "grad_norm": 2.21875, + "grad_norm_var": 0.016743977864583332, + "learning_rate": 0.0001, + "loss": 4.3959, + "loss/crossentropy": 2.0064845085144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20879874378442764, + "step": 10200 + }, + { + "epoch": 0.20404, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01803766886393229, + "learning_rate": 0.0001, + "loss": 4.3558, + "loss/crossentropy": 2.4205944538116455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24201631546020508, + "step": 10202 + }, + { + "epoch": 0.20408, + "grad_norm": 1.9375, + "grad_norm_var": 0.019694010416666668, + "learning_rate": 0.0001, + "loss": 3.8977, + "loss/crossentropy": 2.0392255187034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21578273177146912, + "step": 10204 + }, + { + "epoch": 0.20412, + "grad_norm": 2.25, + "grad_norm_var": 0.019230143229166666, + "learning_rate": 0.0001, + "loss": 4.1414, + "loss/crossentropy": 1.8652849197387695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896674558520317, + "step": 10206 + }, + { + "epoch": 0.20416, + "grad_norm": 2.40625, + "grad_norm_var": 0.02332331339518229, + "learning_rate": 0.0001, + "loss": 4.7698, + "loss/crossentropy": 2.016683042049408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23144973814487457, + "step": 10208 + }, + { + "epoch": 0.2042, + "grad_norm": 2.046875, + "grad_norm_var": 0.020401763916015624, + "learning_rate": 0.0001, + "loss": 4.3691, + "loss/crossentropy": 1.9395010471343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717292487621307, + "step": 10210 + }, + { + "epoch": 0.20424, + "grad_norm": 2.234375, + "grad_norm_var": 0.0210845947265625, + "learning_rate": 0.0001, + "loss": 4.0907, + "loss/crossentropy": 1.7626919150352478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200415201485157, + "step": 10212 + }, + { + "epoch": 0.20428, + "grad_norm": 2.171875, + "grad_norm_var": 0.021955362955729165, + "learning_rate": 0.0001, + "loss": 4.3411, + "loss/crossentropy": 2.3014339208602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324562072753906, + "step": 10214 + }, + { + "epoch": 0.20432, + "grad_norm": 1.921875, + "grad_norm_var": 0.019636027018229165, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 1.906779408454895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223847657442093, + "step": 10216 + }, + { + "epoch": 0.20436, + "grad_norm": 2.046875, + "grad_norm_var": 0.01789118448893229, + "learning_rate": 0.0001, + "loss": 4.4555, + "loss/crossentropy": 2.085246205329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039249747991562, + "step": 10218 + }, + { + "epoch": 0.2044, + "grad_norm": 2.171875, + "grad_norm_var": 0.014806874593098958, + "learning_rate": 0.0001, + "loss": 4.4477, + "loss/crossentropy": 2.213107645511627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22880114614963531, + "step": 10220 + }, + { + "epoch": 0.20444, + "grad_norm": 2.078125, + "grad_norm_var": 0.013392893473307292, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 1.9510034322738647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20935232937335968, + "step": 10222 + }, + { + "epoch": 0.20448, + "grad_norm": 1.921875, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 1.8595823645591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20829308032989502, + "step": 10224 + }, + { + "epoch": 0.20452, + "grad_norm": 2.015625, + "grad_norm_var": 0.008990224202473958, + "learning_rate": 0.0001, + "loss": 4.1287, + "loss/crossentropy": 1.8250519037246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18982955813407898, + "step": 10226 + }, + { + "epoch": 0.20456, + "grad_norm": 2.15625, + "grad_norm_var": 0.0074859619140625, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.410372495651245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22655458748340607, + "step": 10228 + }, + { + "epoch": 0.2046, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007533518473307291, + "learning_rate": 0.0001, + "loss": 4.2581, + "loss/crossentropy": 2.321051836013794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280098795890808, + "step": 10230 + }, + { + "epoch": 0.20464, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007380167643229167, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 2.266388177871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341095432639122, + "step": 10232 + }, + { + "epoch": 0.20468, + "grad_norm": 2.15625, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.5263, + "loss/crossentropy": 2.390430450439453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23684432357549667, + "step": 10234 + }, + { + "epoch": 0.20472, + "grad_norm": 3.390625, + "grad_norm_var": 0.11719563802083334, + "learning_rate": 0.0001, + "loss": 4.4269, + "loss/crossentropy": 2.07179594039917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902770549058914, + "step": 10236 + }, + { + "epoch": 0.20476, + "grad_norm": 2.1875, + "grad_norm_var": 0.12704264322916667, + "learning_rate": 0.0001, + "loss": 4.3473, + "loss/crossentropy": 1.686942458152771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007195323705673, + "step": 10238 + }, + { + "epoch": 0.2048, + "grad_norm": 2.0, + "grad_norm_var": 0.26913248697916664, + "learning_rate": 0.0001, + "loss": 4.2375, + "loss/crossentropy": 2.0421791076660156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187313288450241, + "step": 10240 + }, + { + "epoch": 0.20484, + "grad_norm": 2.28125, + "grad_norm_var": 0.26201985677083334, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 1.9503712058067322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879397749900818, + "step": 10242 + }, + { + "epoch": 0.20488, + "grad_norm": 3.21875, + "grad_norm_var": 0.82620849609375, + "learning_rate": 0.0001, + "loss": 4.413, + "loss/crossentropy": 2.1453936100006104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22767861932516098, + "step": 10244 + }, + { + "epoch": 0.20492, + "grad_norm": 2.171875, + "grad_norm_var": 0.808221181233724, + "learning_rate": 0.0001, + "loss": 4.1924, + "loss/crossentropy": 1.9007731080055237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20069654285907745, + "step": 10246 + }, + { + "epoch": 0.20496, + "grad_norm": 1.875, + "grad_norm_var": 0.8184832255045573, + "learning_rate": 0.0001, + "loss": 4.0685, + "loss/crossentropy": 2.0545393228530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983654722571373, + "step": 10248 + }, + { + "epoch": 0.205, + "grad_norm": 2.1875, + "grad_norm_var": 0.804272206624349, + "learning_rate": 0.0001, + "loss": 4.3391, + "loss/crossentropy": 2.158636450767517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24239712953567505, + "step": 10250 + }, + { + "epoch": 0.20504, + "grad_norm": 2.15625, + "grad_norm_var": 0.7762794494628906, + "learning_rate": 0.0001, + "loss": 4.1608, + "loss/crossentropy": 2.1119120121002197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136205956339836, + "step": 10252 + }, + { + "epoch": 0.20508, + "grad_norm": 2.21875, + "grad_norm_var": 0.7838417053222656, + "learning_rate": 0.0001, + "loss": 4.3184, + "loss/crossentropy": 2.0690027475357056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616184175014496, + "step": 10254 + }, + { + "epoch": 0.20512, + "grad_norm": 2.0625, + "grad_norm_var": 0.6926798502604167, + "learning_rate": 0.0001, + "loss": 4.0635, + "loss/crossentropy": 2.178507924079895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21580064296722412, + "step": 10256 + }, + { + "epoch": 0.20516, + "grad_norm": 2.140625, + "grad_norm_var": 0.6889719645182292, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 2.292190670967102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163032740354538, + "step": 10258 + }, + { + "epoch": 0.2052, + "grad_norm": 2.015625, + "grad_norm_var": 0.03568115234375, + "learning_rate": 0.0001, + "loss": 4.0138, + "loss/crossentropy": 2.066649317741394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081037238240242, + "step": 10260 + }, + { + "epoch": 0.20524, + "grad_norm": 2.296875, + "grad_norm_var": 0.05111490885416667, + "learning_rate": 0.0001, + "loss": 4.4315, + "loss/crossentropy": 1.9017595052719116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19284649938344955, + "step": 10262 + }, + { + "epoch": 0.20528, + "grad_norm": 2.125, + "grad_norm_var": 0.04326960245768229, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 2.1215697526931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244422286748886, + "step": 10264 + }, + { + "epoch": 0.20532, + "grad_norm": 2.03125, + "grad_norm_var": 0.02539647420247396, + "learning_rate": 0.0001, + "loss": 4.3231, + "loss/crossentropy": 2.170191764831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308938354253769, + "step": 10266 + }, + { + "epoch": 0.20536, + "grad_norm": 2.140625, + "grad_norm_var": 0.02535985310872396, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.0430655479431152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435612976551056, + "step": 10268 + }, + { + "epoch": 0.2054, + "grad_norm": 2.203125, + "grad_norm_var": 0.025394439697265625, + "learning_rate": 0.0001, + "loss": 4.536, + "loss/crossentropy": 2.3141634464263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2234780564904213, + "step": 10270 + }, + { + "epoch": 0.20544, + "grad_norm": 2.09375, + "grad_norm_var": 0.023371378580729168, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.310709834098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505229711532593, + "step": 10272 + }, + { + "epoch": 0.20548, + "grad_norm": 2.078125, + "grad_norm_var": 0.02434056599934896, + "learning_rate": 0.0001, + "loss": 4.0664, + "loss/crossentropy": 1.9158611297607422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062816470861435, + "step": 10274 + }, + { + "epoch": 0.20552, + "grad_norm": 2.015625, + "grad_norm_var": 0.02697728474934896, + "learning_rate": 0.0001, + "loss": 4.0545, + "loss/crossentropy": 2.0835859179496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21051569283008575, + "step": 10276 + }, + { + "epoch": 0.20556, + "grad_norm": 2.125, + "grad_norm_var": 0.006951649983723958, + "learning_rate": 0.0001, + "loss": 4.4047, + "loss/crossentropy": 1.9533037543296814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007294416427612, + "step": 10278 + }, + { + "epoch": 0.2056, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010701497395833334, + "learning_rate": 0.0001, + "loss": 4.4499, + "loss/crossentropy": 2.3090076446533203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295464426279068, + "step": 10280 + }, + { + "epoch": 0.20564, + "grad_norm": 2.171875, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.2708, + "loss/crossentropy": 2.2951393127441406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22297964990139008, + "step": 10282 + }, + { + "epoch": 0.20568, + "grad_norm": 2.09375, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.311514675617218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287898138165474, + "step": 10284 + }, + { + "epoch": 0.20572, + "grad_norm": 1.921875, + "grad_norm_var": 0.014989217122395834, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.277890205383301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21169421076774597, + "step": 10286 + }, + { + "epoch": 0.20576, + "grad_norm": 2.109375, + "grad_norm_var": 0.015143839518229167, + "learning_rate": 0.0001, + "loss": 4.4448, + "loss/crossentropy": 2.070693612098694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21682647615671158, + "step": 10288 + }, + { + "epoch": 0.2058, + "grad_norm": 1.984375, + "grad_norm_var": 0.014788564046223958, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.223360061645508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24135399609804153, + "step": 10290 + }, + { + "epoch": 0.20584, + "grad_norm": 2.40625, + "grad_norm_var": 0.018155670166015624, + "learning_rate": 0.0001, + "loss": 4.3304, + "loss/crossentropy": 2.430101752281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21579021215438843, + "step": 10292 + }, + { + "epoch": 0.20588, + "grad_norm": 2.03125, + "grad_norm_var": 0.018173980712890624, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 1.960309624671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22619223594665527, + "step": 10294 + }, + { + "epoch": 0.20592, + "grad_norm": 2.515625, + "grad_norm_var": 0.024217732747395835, + "learning_rate": 0.0001, + "loss": 4.8862, + "loss/crossentropy": 2.0035970211029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20148956030607224, + "step": 10296 + }, + { + "epoch": 0.20596, + "grad_norm": 2.09375, + "grad_norm_var": 0.021968587239583334, + "learning_rate": 0.0001, + "loss": 4.175, + "loss/crossentropy": 1.976987361907959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21035870164632797, + "step": 10298 + }, + { + "epoch": 0.206, + "grad_norm": 2.140625, + "grad_norm_var": 0.021675618489583333, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.0631470680236816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21725613623857498, + "step": 10300 + }, + { + "epoch": 0.20604, + "grad_norm": 2.421875, + "grad_norm_var": 0.02693456013997396, + "learning_rate": 0.0001, + "loss": 4.4734, + "loss/crossentropy": 2.2747987508773804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295055389404297, + "step": 10302 + }, + { + "epoch": 0.20608, + "grad_norm": 1.9609375, + "grad_norm_var": 0.029255167643229166, + "learning_rate": 0.0001, + "loss": 4.1255, + "loss/crossentropy": 2.0811264514923096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21890189498662949, + "step": 10304 + }, + { + "epoch": 0.20612, + "grad_norm": 2.203125, + "grad_norm_var": 0.028319295247395834, + "learning_rate": 0.0001, + "loss": 4.4391, + "loss/crossentropy": 2.2474766969680786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2430667206645012, + "step": 10306 + }, + { + "epoch": 0.20616, + "grad_norm": 2.03125, + "grad_norm_var": 0.024933878580729166, + "learning_rate": 0.0001, + "loss": 4.3016, + "loss/crossentropy": 1.899698257446289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21101202815771103, + "step": 10308 + }, + { + "epoch": 0.2062, + "grad_norm": 2.078125, + "grad_norm_var": 0.023851521809895835, + "learning_rate": 0.0001, + "loss": 3.9769, + "loss/crossentropy": 1.6432967782020569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004638493061066, + "step": 10310 + }, + { + "epoch": 0.20624, + "grad_norm": 2.046875, + "grad_norm_var": 0.0149658203125, + "learning_rate": 0.0001, + "loss": 4.3901, + "loss/crossentropy": 1.9606398940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24546240270137787, + "step": 10312 + }, + { + "epoch": 0.20628, + "grad_norm": 2.15625, + "grad_norm_var": 0.015021769205729167, + "learning_rate": 0.0001, + "loss": 4.0988, + "loss/crossentropy": 2.140414595603943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428836077451706, + "step": 10314 + }, + { + "epoch": 0.20632, + "grad_norm": 2.3125, + "grad_norm_var": 0.018393707275390626, + "learning_rate": 0.0001, + "loss": 4.3768, + "loss/crossentropy": 2.0886260271072388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201090231537819, + "step": 10316 + }, + { + "epoch": 0.20636, + "grad_norm": 2.234375, + "grad_norm_var": 0.1039947509765625, + "learning_rate": 0.0001, + "loss": 4.5696, + "loss/crossentropy": 2.3409098386764526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24670831114053726, + "step": 10318 + }, + { + "epoch": 0.2064, + "grad_norm": 2.09375, + "grad_norm_var": 0.10114313761393229, + "learning_rate": 0.0001, + "loss": 4.4153, + "loss/crossentropy": 2.233125150203705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21548831462860107, + "step": 10320 + }, + { + "epoch": 0.20644, + "grad_norm": 2.0625, + "grad_norm_var": 0.1031206766764323, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 2.131038188934326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20244313776493073, + "step": 10322 + }, + { + "epoch": 0.20648, + "grad_norm": 2.03125, + "grad_norm_var": 0.10423965454101562, + "learning_rate": 0.0001, + "loss": 4.3428, + "loss/crossentropy": 2.1683152318000793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21852879226207733, + "step": 10324 + }, + { + "epoch": 0.20652, + "grad_norm": 2.09375, + "grad_norm_var": 0.10465672810872396, + "learning_rate": 0.0001, + "loss": 4.4117, + "loss/crossentropy": 2.2986634969711304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24344030022621155, + "step": 10326 + }, + { + "epoch": 0.20656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.10876057942708334, + "learning_rate": 0.0001, + "loss": 4.063, + "loss/crossentropy": 2.2356297969818115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21090354025363922, + "step": 10328 + }, + { + "epoch": 0.2066, + "grad_norm": 2.28125, + "grad_norm_var": 0.10851949055989583, + "learning_rate": 0.0001, + "loss": 4.4235, + "loss/crossentropy": 2.6431000232696533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26503315567970276, + "step": 10330 + }, + { + "epoch": 0.20664, + "grad_norm": 2.125, + "grad_norm_var": 0.10668919881184896, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.0261669754981995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528497338294983, + "step": 10332 + }, + { + "epoch": 0.20668, + "grad_norm": 2.15625, + "grad_norm_var": 0.007755279541015625, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 1.5593605041503906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18069667369127274, + "step": 10334 + }, + { + "epoch": 0.20672, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0087646484375, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.953243374824524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19982123374938965, + "step": 10336 + }, + { + "epoch": 0.20676, + "grad_norm": 2.09375, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 4.4369, + "loss/crossentropy": 2.067806303501129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209654077887535, + "step": 10338 + }, + { + "epoch": 0.2068, + "grad_norm": 2.0625, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 4.2458, + "loss/crossentropy": 1.9948397874832153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21432363241910934, + "step": 10340 + }, + { + "epoch": 0.20684, + "grad_norm": 2.046875, + "grad_norm_var": 0.010231272379557291, + "learning_rate": 0.0001, + "loss": 3.7643, + "loss/crossentropy": 1.7932568788528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19161275029182434, + "step": 10342 + }, + { + "epoch": 0.20688, + "grad_norm": 2.3125, + "grad_norm_var": 0.0162017822265625, + "learning_rate": 0.0001, + "loss": 4.5325, + "loss/crossentropy": 2.021036922931671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150880306959152, + "step": 10344 + }, + { + "epoch": 0.20692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014623769124348958, + "learning_rate": 0.0001, + "loss": 3.9351, + "loss/crossentropy": 2.0004186630249023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18971875309944153, + "step": 10346 + }, + { + "epoch": 0.20696, + "grad_norm": 1.984375, + "grad_norm_var": 0.014898427327473958, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 2.2949434518814087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356676608324051, + "step": 10348 + }, + { + "epoch": 0.207, + "grad_norm": 2.15625, + "grad_norm_var": 0.013108062744140624, + "learning_rate": 0.0001, + "loss": 4.1755, + "loss/crossentropy": 2.161319613456726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24161628633737564, + "step": 10350 + }, + { + "epoch": 0.20704, + "grad_norm": 1.921875, + "grad_norm_var": 0.013285319010416666, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.027602195739746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196761399507523, + "step": 10352 + }, + { + "epoch": 0.20708, + "grad_norm": 2.109375, + "grad_norm_var": 0.013703409830729167, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.163583278656006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044812351465225, + "step": 10354 + }, + { + "epoch": 0.20712, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014045969645182291, + "learning_rate": 0.0001, + "loss": 4.0419, + "loss/crossentropy": 2.055150866508484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21587203443050385, + "step": 10356 + }, + { + "epoch": 0.20716, + "grad_norm": 2.4375, + "grad_norm_var": 0.0192047119140625, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.368631362915039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406565323472023, + "step": 10358 + }, + { + "epoch": 0.2072, + "grad_norm": 2.234375, + "grad_norm_var": 0.016486612955729167, + "learning_rate": 0.0001, + "loss": 4.6473, + "loss/crossentropy": 2.5399086475372314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2554238885641098, + "step": 10360 + }, + { + "epoch": 0.20724, + "grad_norm": 2.046875, + "grad_norm_var": 0.015366363525390624, + "learning_rate": 0.0001, + "loss": 4.2346, + "loss/crossentropy": 2.0829185843467712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278829663991928, + "step": 10362 + }, + { + "epoch": 0.20728, + "grad_norm": 2.078125, + "grad_norm_var": 0.014371490478515625, + "learning_rate": 0.0001, + "loss": 4.2583, + "loss/crossentropy": 1.9829052090644836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19677383452653885, + "step": 10364 + }, + { + "epoch": 0.20732, + "grad_norm": 2.0625, + "grad_norm_var": 0.014385732014973958, + "learning_rate": 0.0001, + "loss": 4.3801, + "loss/crossentropy": 2.2335458993911743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23407185822725296, + "step": 10366 + }, + { + "epoch": 0.20736, + "grad_norm": 2.046875, + "grad_norm_var": 0.012149810791015625, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 1.824280858039856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203867107629776, + "step": 10368 + }, + { + "epoch": 0.2074, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013898722330729167, + "learning_rate": 0.0001, + "loss": 4.0666, + "loss/crossentropy": 2.1007773876190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312541723251343, + "step": 10370 + }, + { + "epoch": 0.20744, + "grad_norm": 2.109375, + "grad_norm_var": 0.013350168863932291, + "learning_rate": 0.0001, + "loss": 4.3692, + "loss/crossentropy": 1.953888475894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20697196573019028, + "step": 10372 + }, + { + "epoch": 0.20748, + "grad_norm": 2.28125, + "grad_norm_var": 0.03509089152018229, + "learning_rate": 0.0001, + "loss": 4.2786, + "loss/crossentropy": 2.186478853225708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24410755187273026, + "step": 10374 + }, + { + "epoch": 0.20752, + "grad_norm": 2.34375, + "grad_norm_var": 0.03706232706705729, + "learning_rate": 0.0001, + "loss": 3.9974, + "loss/crossentropy": 1.9011998772621155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983819529414177, + "step": 10376 + }, + { + "epoch": 0.20756, + "grad_norm": 2.03125, + "grad_norm_var": 0.03695246378580729, + "learning_rate": 0.0001, + "loss": 4.5884, + "loss/crossentropy": 2.599787950515747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25155550241470337, + "step": 10378 + }, + { + "epoch": 0.2076, + "grad_norm": 2.078125, + "grad_norm_var": 0.037021636962890625, + "learning_rate": 0.0001, + "loss": 4.2702, + "loss/crossentropy": 2.0120421648025513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281170845031738, + "step": 10380 + }, + { + "epoch": 0.20764, + "grad_norm": 2.09375, + "grad_norm_var": 0.036710357666015624, + "learning_rate": 0.0001, + "loss": 4.4915, + "loss/crossentropy": 2.0685967803001404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23026303946971893, + "step": 10382 + }, + { + "epoch": 0.20768, + "grad_norm": 2.125, + "grad_norm_var": 0.0445068359375, + "learning_rate": 0.0001, + "loss": 4.1171, + "loss/crossentropy": 1.8392394185066223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19240978360176086, + "step": 10384 + }, + { + "epoch": 0.20772, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04523518880208333, + "learning_rate": 0.0001, + "loss": 4.2428, + "loss/crossentropy": 1.786954402923584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20547360181808472, + "step": 10386 + }, + { + "epoch": 0.20776, + "grad_norm": 2.234375, + "grad_norm_var": 0.0452301025390625, + "learning_rate": 0.0001, + "loss": 4.2084, + "loss/crossentropy": 1.6960806250572205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748766005039215, + "step": 10388 + }, + { + "epoch": 0.2078, + "grad_norm": 2.3125, + "grad_norm_var": 0.018693033854166666, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 2.271879196166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25173740088939667, + "step": 10390 + }, + { + "epoch": 0.20784, + "grad_norm": 2.109375, + "grad_norm_var": 0.015778605143229166, + "learning_rate": 0.0001, + "loss": 4.5199, + "loss/crossentropy": 2.2860567569732666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291928321123123, + "step": 10392 + }, + { + "epoch": 0.20788, + "grad_norm": 2.046875, + "grad_norm_var": 0.0148193359375, + "learning_rate": 0.0001, + "loss": 4.1871, + "loss/crossentropy": 1.9925037026405334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212161086499691, + "step": 10394 + }, + { + "epoch": 0.20792, + "grad_norm": 2.140625, + "grad_norm_var": 0.016306304931640626, + "learning_rate": 0.0001, + "loss": 4.3728, + "loss/crossentropy": 2.1189831495285034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19486143440008163, + "step": 10396 + }, + { + "epoch": 0.20796, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0168121337890625, + "learning_rate": 0.0001, + "loss": 4.2325, + "loss/crossentropy": 2.170132279396057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083829134702682, + "step": 10398 + }, + { + "epoch": 0.208, + "grad_norm": 2.109375, + "grad_norm_var": 0.012129465738932291, + "learning_rate": 0.0001, + "loss": 4.0773, + "loss/crossentropy": 1.8486035466194153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999971449375153, + "step": 10400 + }, + { + "epoch": 0.20804, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011240386962890625, + "learning_rate": 0.0001, + "loss": 4.138, + "loss/crossentropy": 1.727788269519806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234652817249298, + "step": 10402 + }, + { + "epoch": 0.20808, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01046142578125, + "learning_rate": 0.0001, + "loss": 4.4102, + "loss/crossentropy": 2.196265935897827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21071433275938034, + "step": 10404 + }, + { + "epoch": 0.20812, + "grad_norm": 2.15625, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.3378, + "loss/crossentropy": 1.7230273485183716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19570616632699966, + "step": 10406 + }, + { + "epoch": 0.20816, + "grad_norm": 2.109375, + "grad_norm_var": 0.00567626953125, + "learning_rate": 0.0001, + "loss": 3.9693, + "loss/crossentropy": 1.8815646767616272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055530995130539, + "step": 10408 + }, + { + "epoch": 0.2082, + "grad_norm": 2.015625, + "grad_norm_var": 0.006400299072265625, + "learning_rate": 0.0001, + "loss": 4.1974, + "loss/crossentropy": 2.2131329774856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22710958123207092, + "step": 10410 + }, + { + "epoch": 0.20824, + "grad_norm": 2.046875, + "grad_norm_var": 0.006103515625, + "learning_rate": 0.0001, + "loss": 4.1904, + "loss/crossentropy": 1.5199981927871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16881641000509262, + "step": 10412 + }, + { + "epoch": 0.20828, + "grad_norm": 2.0625, + "grad_norm_var": 0.005356597900390625, + "learning_rate": 0.0001, + "loss": 4.2456, + "loss/crossentropy": 2.305867075920105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2523300349712372, + "step": 10414 + }, + { + "epoch": 0.20832, + "grad_norm": 2.0625, + "grad_norm_var": 0.004571278889973958, + "learning_rate": 0.0001, + "loss": 4.2727, + "loss/crossentropy": 1.989980161190033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19124356657266617, + "step": 10416 + }, + { + "epoch": 0.20836, + "grad_norm": 2.046875, + "grad_norm_var": 0.004255167643229167, + "learning_rate": 0.0001, + "loss": 4.4745, + "loss/crossentropy": 2.165328025817871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23673278093338013, + "step": 10418 + }, + { + "epoch": 0.2084, + "grad_norm": 2.328125, + "grad_norm_var": 0.008074696858723958, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 2.2451056241989136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23130114376544952, + "step": 10420 + }, + { + "epoch": 0.20844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0106689453125, + "learning_rate": 0.0001, + "loss": 3.9259, + "loss/crossentropy": 2.1694064140319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424317359924316, + "step": 10422 + }, + { + "epoch": 0.20848, + "grad_norm": 2.1875, + "grad_norm_var": 0.012059529622395834, + "learning_rate": 0.0001, + "loss": 4.3558, + "loss/crossentropy": 2.140601873397827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452331334352493, + "step": 10424 + }, + { + "epoch": 0.20852, + "grad_norm": 2.125, + "grad_norm_var": 0.011476389567057292, + "learning_rate": 0.0001, + "loss": 4.3058, + "loss/crossentropy": 2.2076770067214966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21608063578605652, + "step": 10426 + }, + { + "epoch": 0.20856, + "grad_norm": 2.125, + "grad_norm_var": 0.011579386393229167, + "learning_rate": 0.0001, + "loss": 4.151, + "loss/crossentropy": 2.1738568544387817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288571298122406, + "step": 10428 + }, + { + "epoch": 0.2086, + "grad_norm": 2.015625, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 4.2576, + "loss/crossentropy": 2.2586612701416016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107571363449097, + "step": 10430 + }, + { + "epoch": 0.20864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014511871337890624, + "learning_rate": 0.0001, + "loss": 4.5441, + "loss/crossentropy": 2.336306095123291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23462744057178497, + "step": 10432 + }, + { + "epoch": 0.20868, + "grad_norm": 2.046875, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 4.1625, + "loss/crossentropy": 2.3164994716644287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21541497856378555, + "step": 10434 + }, + { + "epoch": 0.20872, + "grad_norm": 2.046875, + "grad_norm_var": 0.0121734619140625, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 1.848636507987976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20993927866220474, + "step": 10436 + }, + { + "epoch": 0.20876, + "grad_norm": 2.25, + "grad_norm_var": 0.010509999593098958, + "learning_rate": 0.0001, + "loss": 4.4985, + "loss/crossentropy": 2.234964370727539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675845235586166, + "step": 10438 + }, + { + "epoch": 0.2088, + "grad_norm": 2.125, + "grad_norm_var": 0.009159088134765625, + "learning_rate": 0.0001, + "loss": 4.2389, + "loss/crossentropy": 1.9301238656044006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21708600223064423, + "step": 10440 + }, + { + "epoch": 0.20884, + "grad_norm": 2.171875, + "grad_norm_var": 0.010721842447916666, + "learning_rate": 0.0001, + "loss": 4.1706, + "loss/crossentropy": 2.231620192527771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156532645225525, + "step": 10442 + }, + { + "epoch": 0.20888, + "grad_norm": 2.015625, + "grad_norm_var": 0.010237375895182291, + "learning_rate": 0.0001, + "loss": 4.18, + "loss/crossentropy": 1.8612747192382812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914522647857666, + "step": 10444 + }, + { + "epoch": 0.20892, + "grad_norm": 2.109375, + "grad_norm_var": 0.009907786051432292, + "learning_rate": 0.0001, + "loss": 4.1713, + "loss/crossentropy": 2.2229456305503845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23199696838855743, + "step": 10446 + }, + { + "epoch": 0.20896, + "grad_norm": 2.046875, + "grad_norm_var": 0.009708658854166666, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.9055940508842468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158679962158203, + "step": 10448 + }, + { + "epoch": 0.209, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011774698893229166, + "learning_rate": 0.0001, + "loss": 3.8876, + "loss/crossentropy": 1.5537404417991638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17176809161901474, + "step": 10450 + }, + { + "epoch": 0.20904, + "grad_norm": 1.890625, + "grad_norm_var": 0.0123931884765625, + "learning_rate": 0.0001, + "loss": 3.9222, + "loss/crossentropy": 2.319318413734436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001250088214874, + "step": 10452 + }, + { + "epoch": 0.20908, + "grad_norm": 2.140625, + "grad_norm_var": 0.010383097330729167, + "learning_rate": 0.0001, + "loss": 4.421, + "loss/crossentropy": 2.2334396839141846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222968190908432, + "step": 10454 + }, + { + "epoch": 0.20912, + "grad_norm": 2.109375, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.5172, + "loss/crossentropy": 2.5762773752212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26605312526226044, + "step": 10456 + }, + { + "epoch": 0.20916, + "grad_norm": 2.0, + "grad_norm_var": 0.010503896077473958, + "learning_rate": 0.0001, + "loss": 4.1109, + "loss/crossentropy": 1.7634761333465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641388416290283, + "step": 10458 + }, + { + "epoch": 0.2092, + "grad_norm": 1.875, + "grad_norm_var": 0.013224029541015625, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.28191876411438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916647791862488, + "step": 10460 + }, + { + "epoch": 0.20924, + "grad_norm": 2.125, + "grad_norm_var": 0.013903554280598958, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 1.9712265729904175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2293689101934433, + "step": 10462 + }, + { + "epoch": 0.20928, + "grad_norm": 2.109375, + "grad_norm_var": 0.012292226155598959, + "learning_rate": 0.0001, + "loss": 4.3826, + "loss/crossentropy": 2.425857424736023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.255269430577755, + "step": 10464 + }, + { + "epoch": 0.20932, + "grad_norm": 2.109375, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.0697131752967834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22233501076698303, + "step": 10466 + }, + { + "epoch": 0.20936, + "grad_norm": 2.140625, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.4017, + "loss/crossentropy": 1.8367178440093994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063748985528946, + "step": 10468 + }, + { + "epoch": 0.2094, + "grad_norm": 2.046875, + "grad_norm_var": 0.018192545572916666, + "learning_rate": 0.0001, + "loss": 4.1864, + "loss/crossentropy": 1.977232813835144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20568673312664032, + "step": 10470 + }, + { + "epoch": 0.20944, + "grad_norm": 2.0, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 4.1622, + "loss/crossentropy": 2.1934465169906616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175101339817047, + "step": 10472 + }, + { + "epoch": 0.20948, + "grad_norm": 2.09375, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 4.3005, + "loss/crossentropy": 1.7997339367866516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20166774094104767, + "step": 10474 + }, + { + "epoch": 0.20952, + "grad_norm": 2.140625, + "grad_norm_var": 0.014671834309895833, + "learning_rate": 0.0001, + "loss": 4.1167, + "loss/crossentropy": 1.9334582090377808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190441817045212, + "step": 10476 + }, + { + "epoch": 0.20956, + "grad_norm": 2.109375, + "grad_norm_var": 0.015148671468098958, + "learning_rate": 0.0001, + "loss": 3.9434, + "loss/crossentropy": 1.7263885140419006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17968116700649261, + "step": 10478 + }, + { + "epoch": 0.2096, + "grad_norm": 1.9375, + "grad_norm_var": 0.01661961873372396, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.1710296869277954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717055559158325, + "step": 10480 + }, + { + "epoch": 0.20964, + "grad_norm": 2.171875, + "grad_norm_var": 0.016947174072265626, + "learning_rate": 0.0001, + "loss": 4.2089, + "loss/crossentropy": 2.043896973133087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21403591334819794, + "step": 10482 + }, + { + "epoch": 0.20968, + "grad_norm": 2.203125, + "grad_norm_var": 0.02459691365559896, + "learning_rate": 0.0001, + "loss": 3.9237, + "loss/crossentropy": 1.7813313603401184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20273292809724808, + "step": 10484 + }, + { + "epoch": 0.20972, + "grad_norm": 2.0625, + "grad_norm_var": 0.013155110677083333, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.8628470301628113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945454627275467, + "step": 10486 + }, + { + "epoch": 0.20976, + "grad_norm": 2.03125, + "grad_norm_var": 0.013044230143229167, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 2.2047882080078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23798923939466476, + "step": 10488 + }, + { + "epoch": 0.2098, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012697092692057292, + "learning_rate": 0.0001, + "loss": 4.1327, + "loss/crossentropy": 1.8838441967964172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224583625793457, + "step": 10490 + }, + { + "epoch": 0.20984, + "grad_norm": 2.015625, + "grad_norm_var": 0.011201731363932292, + "learning_rate": 0.0001, + "loss": 4.1985, + "loss/crossentropy": 1.9326539039611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116800993680954, + "step": 10492 + }, + { + "epoch": 0.20988, + "grad_norm": 2.375, + "grad_norm_var": 0.017829386393229167, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 1.7767577171325684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20496132969856262, + "step": 10494 + }, + { + "epoch": 0.20992, + "grad_norm": 2.015625, + "grad_norm_var": 0.016996256510416665, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.996176838874817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979241669178009, + "step": 10496 + }, + { + "epoch": 0.20996, + "grad_norm": 2.015625, + "grad_norm_var": 0.016136678059895833, + "learning_rate": 0.0001, + "loss": 4.1467, + "loss/crossentropy": 2.5282262563705444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23003733158111572, + "step": 10498 + }, + { + "epoch": 0.21, + "grad_norm": 2.109375, + "grad_norm_var": 0.008760579427083333, + "learning_rate": 0.0001, + "loss": 4.1981, + "loss/crossentropy": 2.3120675086975098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22543538361787796, + "step": 10500 + }, + { + "epoch": 0.21004, + "grad_norm": 2.15625, + "grad_norm_var": 0.008766428629557291, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 2.2110280990600586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066550493240356, + "step": 10502 + }, + { + "epoch": 0.21008, + "grad_norm": 2.015625, + "grad_norm_var": 0.009242502848307292, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 1.6792908906936646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20689093321561813, + "step": 10504 + }, + { + "epoch": 0.21012, + "grad_norm": 2.296875, + "grad_norm_var": 0.011116536458333333, + "learning_rate": 0.0001, + "loss": 4.137, + "loss/crossentropy": 1.9797767400741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185518741607666, + "step": 10506 + }, + { + "epoch": 0.21016, + "grad_norm": 2.046875, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 2.0001984238624573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22058366239070892, + "step": 10508 + }, + { + "epoch": 0.2102, + "grad_norm": 2.125, + "grad_norm_var": 0.006414540608723958, + "learning_rate": 0.0001, + "loss": 4.3707, + "loss/crossentropy": 2.1622806787490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21733585745096207, + "step": 10510 + }, + { + "epoch": 0.21024, + "grad_norm": 2.03125, + "grad_norm_var": 0.00740966796875, + "learning_rate": 0.0001, + "loss": 4.0392, + "loss/crossentropy": 2.0224735736846924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20596059411764145, + "step": 10512 + }, + { + "epoch": 0.21028, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011244455973307291, + "learning_rate": 0.0001, + "loss": 4.0712, + "loss/crossentropy": 2.099945902824402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19777391105890274, + "step": 10514 + }, + { + "epoch": 0.21032, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011823527018229167, + "learning_rate": 0.0001, + "loss": 4.1246, + "loss/crossentropy": 1.9806578159332275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19959519058465958, + "step": 10516 + }, + { + "epoch": 0.21036, + "grad_norm": 2.140625, + "grad_norm_var": 0.012239329020182292, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 2.0056468844413757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21810457110404968, + "step": 10518 + }, + { + "epoch": 0.2104, + "grad_norm": 2.171875, + "grad_norm_var": 0.01785456339518229, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 1.9525137543678284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195258617401123, + "step": 10520 + }, + { + "epoch": 0.21044, + "grad_norm": 2.15625, + "grad_norm_var": 0.01580988566080729, + "learning_rate": 0.0001, + "loss": 4.4648, + "loss/crossentropy": 2.2391252517700195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23115740716457367, + "step": 10522 + }, + { + "epoch": 0.21048, + "grad_norm": 2.203125, + "grad_norm_var": 0.016727701822916666, + "learning_rate": 0.0001, + "loss": 4.6467, + "loss/crossentropy": 2.550819158554077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2846238762140274, + "step": 10524 + }, + { + "epoch": 0.21052, + "grad_norm": 2.234375, + "grad_norm_var": 0.0176177978515625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 2.181081712245941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23952369391918182, + "step": 10526 + }, + { + "epoch": 0.21056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.1627, + "loss/crossentropy": 2.3070446848869324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255015581846237, + "step": 10528 + }, + { + "epoch": 0.2106, + "grad_norm": 2.140625, + "grad_norm_var": 0.014235178629557291, + "learning_rate": 0.0001, + "loss": 4.3184, + "loss/crossentropy": 1.9560331106185913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652864336967468, + "step": 10530 + }, + { + "epoch": 0.21064, + "grad_norm": 2.203125, + "grad_norm_var": 0.0127593994140625, + "learning_rate": 0.0001, + "loss": 4.5728, + "loss/crossentropy": 2.2470709085464478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25076867640018463, + "step": 10532 + }, + { + "epoch": 0.21068, + "grad_norm": 2.03125, + "grad_norm_var": 0.011767323811848958, + "learning_rate": 0.0001, + "loss": 4.2652, + "loss/crossentropy": 2.0689820051193237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815705299377441, + "step": 10534 + }, + { + "epoch": 0.21072, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013499959309895834, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 2.0633797645568848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865381509065628, + "step": 10536 + }, + { + "epoch": 0.21076, + "grad_norm": 1.96875, + "grad_norm_var": 0.014029947916666667, + "learning_rate": 0.0001, + "loss": 4.2644, + "loss/crossentropy": 2.3332748413085938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21530094742774963, + "step": 10538 + }, + { + "epoch": 0.2108, + "grad_norm": 2.078125, + "grad_norm_var": 0.011942545572916666, + "learning_rate": 0.0001, + "loss": 4.1452, + "loss/crossentropy": 2.093048572540283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568043112754822, + "step": 10540 + }, + { + "epoch": 0.21084, + "grad_norm": 2.125, + "grad_norm_var": 0.009992472330729167, + "learning_rate": 0.0001, + "loss": 4.5464, + "loss/crossentropy": 2.0059397220611572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.253838449716568, + "step": 10542 + }, + { + "epoch": 0.21088, + "grad_norm": 2.0625, + "grad_norm_var": 0.009284464518229167, + "learning_rate": 0.0001, + "loss": 4.1417, + "loss/crossentropy": 2.278248429298401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23320979624986649, + "step": 10544 + }, + { + "epoch": 0.21092, + "grad_norm": 2.0625, + "grad_norm_var": 0.008540598551432292, + "learning_rate": 0.0001, + "loss": 4.0951, + "loss/crossentropy": 2.063527822494507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21290308982133865, + "step": 10546 + }, + { + "epoch": 0.21096, + "grad_norm": 1.953125, + "grad_norm_var": 0.007511138916015625, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.092045545578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21721098572015762, + "step": 10548 + }, + { + "epoch": 0.211, + "grad_norm": 2.078125, + "grad_norm_var": 0.06824111938476562, + "learning_rate": 0.0001, + "loss": 4.0586, + "loss/crossentropy": 1.9876770973205566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21775592118501663, + "step": 10550 + }, + { + "epoch": 0.21104, + "grad_norm": 2.0625, + "grad_norm_var": 0.06347249348958334, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.292428970336914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24829304218292236, + "step": 10552 + }, + { + "epoch": 0.21108, + "grad_norm": 2.015625, + "grad_norm_var": 0.06552632649739583, + "learning_rate": 0.0001, + "loss": 4.3412, + "loss/crossentropy": 2.257239043712616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195405438542366, + "step": 10554 + }, + { + "epoch": 0.21112, + "grad_norm": 2.171875, + "grad_norm_var": 0.06616923014322916, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.1424754858016968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21044345945119858, + "step": 10556 + }, + { + "epoch": 0.21116, + "grad_norm": 2.15625, + "grad_norm_var": 0.06642964680989584, + "learning_rate": 0.0001, + "loss": 4.2812, + "loss/crossentropy": 1.777747094631195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19684316962957382, + "step": 10558 + }, + { + "epoch": 0.2112, + "grad_norm": 2.015625, + "grad_norm_var": 0.06655171712239584, + "learning_rate": 0.0001, + "loss": 4.1239, + "loss/crossentropy": 2.0860772728919983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560266077518463, + "step": 10560 + }, + { + "epoch": 0.21124, + "grad_norm": 1.96875, + "grad_norm_var": 0.06787007649739583, + "learning_rate": 0.0001, + "loss": 4.2409, + "loss/crossentropy": 1.9594369530677795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015877440571785, + "step": 10562 + }, + { + "epoch": 0.21128, + "grad_norm": 2.078125, + "grad_norm_var": 0.06467692057291667, + "learning_rate": 0.0001, + "loss": 4.2539, + "loss/crossentropy": 2.069046676158905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409741044044495, + "step": 10564 + }, + { + "epoch": 0.21132, + "grad_norm": 1.921875, + "grad_norm_var": 0.009464263916015625, + "learning_rate": 0.0001, + "loss": 3.822, + "loss/crossentropy": 1.7300589084625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18024658411741257, + "step": 10566 + }, + { + "epoch": 0.21136, + "grad_norm": 2.125, + "grad_norm_var": 0.0077288309733072914, + "learning_rate": 0.0001, + "loss": 4.2608, + "loss/crossentropy": 2.048615336418152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228649340569973, + "step": 10568 + }, + { + "epoch": 0.2114, + "grad_norm": 2.171875, + "grad_norm_var": 0.007085927327473958, + "learning_rate": 0.0001, + "loss": 4.4622, + "loss/crossentropy": 1.9678268432617188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20117972791194916, + "step": 10570 + }, + { + "epoch": 0.21144, + "grad_norm": 2.1875, + "grad_norm_var": 0.006534576416015625, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 2.595113754272461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24565115571022034, + "step": 10572 + }, + { + "epoch": 0.21148, + "grad_norm": 2.25, + "grad_norm_var": 0.008658599853515626, + "learning_rate": 0.0001, + "loss": 4.0873, + "loss/crossentropy": 2.096100628376007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20913005620241165, + "step": 10574 + }, + { + "epoch": 0.21152, + "grad_norm": 2.25, + "grad_norm_var": 0.012223052978515624, + "learning_rate": 0.0001, + "loss": 4.181, + "loss/crossentropy": 2.0096259713172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21125900745391846, + "step": 10576 + }, + { + "epoch": 0.21156, + "grad_norm": 2.046875, + "grad_norm_var": 0.011572011311848958, + "learning_rate": 0.0001, + "loss": 3.9738, + "loss/crossentropy": 1.9990533590316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20894166082143784, + "step": 10578 + }, + { + "epoch": 0.2116, + "grad_norm": 2.0625, + "grad_norm_var": 0.013606516520182292, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 2.1385116577148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19753456860780716, + "step": 10580 + }, + { + "epoch": 0.21164, + "grad_norm": 2.328125, + "grad_norm_var": 0.012482706705729167, + "learning_rate": 0.0001, + "loss": 4.5056, + "loss/crossentropy": 2.2081239819526672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155531287193298, + "step": 10582 + }, + { + "epoch": 0.21168, + "grad_norm": 2.125, + "grad_norm_var": 0.014339192708333334, + "learning_rate": 0.0001, + "loss": 4.161, + "loss/crossentropy": 1.6915860772132874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18060292303562164, + "step": 10584 + }, + { + "epoch": 0.21172, + "grad_norm": 1.875, + "grad_norm_var": 0.019245402018229166, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 1.5825872421264648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19387374818325043, + "step": 10586 + }, + { + "epoch": 0.21176, + "grad_norm": 2.109375, + "grad_norm_var": 0.0253082275390625, + "learning_rate": 0.0001, + "loss": 4.6102, + "loss/crossentropy": 2.386876940727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23594971746206284, + "step": 10588 + }, + { + "epoch": 0.2118, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02759577433268229, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 2.202653169631958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666631639003754, + "step": 10590 + }, + { + "epoch": 0.21184, + "grad_norm": 2.046875, + "grad_norm_var": 0.026041412353515626, + "learning_rate": 0.0001, + "loss": 4.2285, + "loss/crossentropy": 2.013135075569153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155165672302246, + "step": 10592 + }, + { + "epoch": 0.21188, + "grad_norm": 2.25, + "grad_norm_var": 2.762861887613932, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 1.6701499223709106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808167546987534, + "step": 10594 + }, + { + "epoch": 0.21192, + "grad_norm": 2.015625, + "grad_norm_var": 2.7677996317545572, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 2.074462592601776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22254129499197006, + "step": 10596 + }, + { + "epoch": 0.21196, + "grad_norm": 2.203125, + "grad_norm_var": 2.787275950113932, + "learning_rate": 0.0001, + "loss": 4.2329, + "loss/crossentropy": 2.182308316230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201200038194656, + "step": 10598 + }, + { + "epoch": 0.212, + "grad_norm": 2.0625, + "grad_norm_var": 2.784148915608724, + "learning_rate": 0.0001, + "loss": 4.0688, + "loss/crossentropy": 1.9777602553367615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21454624831676483, + "step": 10600 + }, + { + "epoch": 0.21204, + "grad_norm": 2.03125, + "grad_norm_var": 2.778930409749349, + "learning_rate": 0.0001, + "loss": 4.2707, + "loss/crossentropy": 2.272592306137085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22290532290935516, + "step": 10602 + }, + { + "epoch": 0.21208, + "grad_norm": 2.078125, + "grad_norm_var": 2.784010569254557, + "learning_rate": 0.0001, + "loss": 4.3406, + "loss/crossentropy": 2.110643744468689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22552277147769928, + "step": 10604 + }, + { + "epoch": 0.21212, + "grad_norm": 2.125, + "grad_norm_var": 2.7753326416015627, + "learning_rate": 0.0001, + "loss": 4.3485, + "loss/crossentropy": 2.279823422431946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22356732934713364, + "step": 10606 + }, + { + "epoch": 0.21216, + "grad_norm": 2.109375, + "grad_norm_var": 2.768701171875, + "learning_rate": 0.0001, + "loss": 4.3203, + "loss/crossentropy": 2.0985517501831055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22894078493118286, + "step": 10608 + }, + { + "epoch": 0.2122, + "grad_norm": 2.046875, + "grad_norm_var": 0.006884765625, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.251029133796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24473578482866287, + "step": 10610 + }, + { + "epoch": 0.21224, + "grad_norm": 2.140625, + "grad_norm_var": 0.005785115559895833, + "learning_rate": 0.0001, + "loss": 4.2451, + "loss/crossentropy": 2.1706892251968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21725767105817795, + "step": 10612 + }, + { + "epoch": 0.21228, + "grad_norm": 2.046875, + "grad_norm_var": 0.004378255208333333, + "learning_rate": 0.0001, + "loss": 4.3319, + "loss/crossentropy": 2.0709590315818787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771050035953522, + "step": 10614 + }, + { + "epoch": 0.21232, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005163319905598958, + "learning_rate": 0.0001, + "loss": 4.2507, + "loss/crossentropy": 2.0437510013580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055979147553444, + "step": 10616 + }, + { + "epoch": 0.21236, + "grad_norm": 2.140625, + "grad_norm_var": 0.005204010009765625, + "learning_rate": 0.0001, + "loss": 4.2492, + "loss/crossentropy": 2.023369252681732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23950359225273132, + "step": 10618 + }, + { + "epoch": 0.2124, + "grad_norm": 2.140625, + "grad_norm_var": 0.004325103759765625, + "learning_rate": 0.0001, + "loss": 4.3522, + "loss/crossentropy": 2.051850199699402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24145089089870453, + "step": 10620 + }, + { + "epoch": 0.21244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0044247945149739586, + "learning_rate": 0.0001, + "loss": 4.1743, + "loss/crossentropy": 1.9617546796798706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095404863357544, + "step": 10622 + }, + { + "epoch": 0.21248, + "grad_norm": 2.140625, + "grad_norm_var": 0.004662831624348958, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 1.9792284965515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21156969666481018, + "step": 10624 + }, + { + "epoch": 0.21252, + "grad_norm": 2.046875, + "grad_norm_var": 0.004662831624348958, + "learning_rate": 0.0001, + "loss": 4.4402, + "loss/crossentropy": 1.936375379562378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360263973474503, + "step": 10626 + }, + { + "epoch": 0.21256, + "grad_norm": 1.859375, + "grad_norm_var": 0.0071408589680989586, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.11286723613739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20090097934007645, + "step": 10628 + }, + { + "epoch": 0.2126, + "grad_norm": 2.015625, + "grad_norm_var": 0.007120513916015625, + "learning_rate": 0.0001, + "loss": 4.2555, + "loss/crossentropy": 2.1020379066467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21139420568943024, + "step": 10630 + }, + { + "epoch": 0.21264, + "grad_norm": 2.0625, + "grad_norm_var": 0.0064453125, + "learning_rate": 0.0001, + "loss": 4.2027, + "loss/crossentropy": 2.220509111881256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22286564111709595, + "step": 10632 + }, + { + "epoch": 0.21268, + "grad_norm": 1.890625, + "grad_norm_var": 0.0080078125, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 2.058075189590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22242067009210587, + "step": 10634 + }, + { + "epoch": 0.21272, + "grad_norm": 2.09375, + "grad_norm_var": 0.005269368489583333, + "learning_rate": 0.0001, + "loss": 4.1825, + "loss/crossentropy": 1.9641217589378357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20702090859413147, + "step": 10636 + }, + { + "epoch": 0.21276, + "grad_norm": 2.046875, + "grad_norm_var": 0.005231730143229167, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 1.9234120845794678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20697829127311707, + "step": 10638 + }, + { + "epoch": 0.2128, + "grad_norm": 1.9375, + "grad_norm_var": 0.0067047119140625, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.1050453782081604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21321691572666168, + "step": 10640 + }, + { + "epoch": 0.21284, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009323883056640624, + "learning_rate": 0.0001, + "loss": 4.0852, + "loss/crossentropy": 2.21867573261261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468330919742584, + "step": 10642 + }, + { + "epoch": 0.21288, + "grad_norm": 2.09375, + "grad_norm_var": 0.007319895426432291, + "learning_rate": 0.0001, + "loss": 4.4209, + "loss/crossentropy": 2.213089942932129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172764152288437, + "step": 10644 + }, + { + "epoch": 0.21292, + "grad_norm": 2.109375, + "grad_norm_var": 0.012556711832682291, + "learning_rate": 0.0001, + "loss": 4.3261, + "loss/crossentropy": 2.0370752811431885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23427169024944305, + "step": 10646 + }, + { + "epoch": 0.21296, + "grad_norm": 2.109375, + "grad_norm_var": 0.016035715738932293, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 2.080373227596283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148713618516922, + "step": 10648 + }, + { + "epoch": 0.213, + "grad_norm": 2.078125, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 4.5408, + "loss/crossentropy": 2.3023130893707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22768481075763702, + "step": 10650 + }, + { + "epoch": 0.21304, + "grad_norm": 2.203125, + "grad_norm_var": 0.016185506184895834, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.14642870426178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22463608533143997, + "step": 10652 + }, + { + "epoch": 0.21308, + "grad_norm": 2.03125, + "grad_norm_var": 0.016377766927083332, + "learning_rate": 0.0001, + "loss": 4.2164, + "loss/crossentropy": 2.2469639778137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505262568593025, + "step": 10654 + }, + { + "epoch": 0.21312, + "grad_norm": 2.0625, + "grad_norm_var": 0.015915679931640624, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.0431448221206665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21013544499874115, + "step": 10656 + }, + { + "epoch": 0.21316, + "grad_norm": 1.9375, + "grad_norm_var": 0.013631184895833334, + "learning_rate": 0.0001, + "loss": 3.9872, + "loss/crossentropy": 1.6768526434898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18652133643627167, + "step": 10658 + }, + { + "epoch": 0.2132, + "grad_norm": 2.0, + "grad_norm_var": 0.017175038655598957, + "learning_rate": 0.0001, + "loss": 3.9442, + "loss/crossentropy": 1.3748261332511902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15284860879182816, + "step": 10660 + }, + { + "epoch": 0.21324, + "grad_norm": 2.03125, + "grad_norm_var": 0.012282053629557291, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 1.9126732349395752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21141140908002853, + "step": 10662 + }, + { + "epoch": 0.21328, + "grad_norm": 2.1875, + "grad_norm_var": 0.010908762613932291, + "learning_rate": 0.0001, + "loss": 4.2392, + "loss/crossentropy": 1.97357976436615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21326088905334473, + "step": 10664 + }, + { + "epoch": 0.21332, + "grad_norm": 2.109375, + "grad_norm_var": 0.011201985677083333, + "learning_rate": 0.0001, + "loss": 4.2825, + "loss/crossentropy": 2.1782814860343933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183249354362488, + "step": 10666 + }, + { + "epoch": 0.21336, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.1251, + "loss/crossentropy": 2.1700649857521057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310282066464424, + "step": 10668 + }, + { + "epoch": 0.2134, + "grad_norm": 2.109375, + "grad_norm_var": 0.0087554931640625, + "learning_rate": 0.0001, + "loss": 4.479, + "loss/crossentropy": 2.249666213989258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21025578677654266, + "step": 10670 + }, + { + "epoch": 0.21344, + "grad_norm": 2.15625, + "grad_norm_var": 0.009287261962890625, + "learning_rate": 0.0001, + "loss": 4.3445, + "loss/crossentropy": 1.993752121925354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20829375833272934, + "step": 10672 + }, + { + "epoch": 0.21348, + "grad_norm": 2.046875, + "grad_norm_var": 0.009244791666666667, + "learning_rate": 0.0001, + "loss": 4.0814, + "loss/crossentropy": 2.0472013354301453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456149220466614, + "step": 10674 + }, + { + "epoch": 0.21352, + "grad_norm": 2.109375, + "grad_norm_var": 0.007222239176432292, + "learning_rate": 0.0001, + "loss": 3.9497, + "loss/crossentropy": 2.0347819328308105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133965790271759, + "step": 10676 + }, + { + "epoch": 0.21356, + "grad_norm": 1.984375, + "grad_norm_var": 0.008168284098307292, + "learning_rate": 0.0001, + "loss": 4.1528, + "loss/crossentropy": 1.86410254240036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22139018028974533, + "step": 10678 + }, + { + "epoch": 0.2136, + "grad_norm": 2.015625, + "grad_norm_var": 0.005641428629557291, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 2.3244482278823853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24854417145252228, + "step": 10680 + }, + { + "epoch": 0.21364, + "grad_norm": 2.0, + "grad_norm_var": 0.006304677327473958, + "learning_rate": 0.0001, + "loss": 4.7182, + "loss/crossentropy": 2.518718123435974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.274397537112236, + "step": 10682 + }, + { + "epoch": 0.21368, + "grad_norm": 2.09375, + "grad_norm_var": 0.005928548177083334, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 1.8111079931259155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039540931582451, + "step": 10684 + }, + { + "epoch": 0.21372, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008129628499348958, + "learning_rate": 0.0001, + "loss": 4.3446, + "loss/crossentropy": 2.186485230922699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22497782111167908, + "step": 10686 + }, + { + "epoch": 0.21376, + "grad_norm": 2.21875, + "grad_norm_var": 0.009877268473307292, + "learning_rate": 0.0001, + "loss": 4.4708, + "loss/crossentropy": 2.1850993633270264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22541093826293945, + "step": 10688 + }, + { + "epoch": 0.2138, + "grad_norm": 2.078125, + "grad_norm_var": 0.0090576171875, + "learning_rate": 0.0001, + "loss": 4.3991, + "loss/crossentropy": 1.9756001830101013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23400776088237762, + "step": 10690 + }, + { + "epoch": 0.21384, + "grad_norm": 2.125, + "grad_norm_var": 0.010652669270833333, + "learning_rate": 0.0001, + "loss": 4.0283, + "loss/crossentropy": 1.7454423904418945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263398110866547, + "step": 10692 + }, + { + "epoch": 0.21388, + "grad_norm": 2.015625, + "grad_norm_var": 0.010066731770833334, + "learning_rate": 0.0001, + "loss": 3.9364, + "loss/crossentropy": 1.5824024081230164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881092965602875, + "step": 10694 + }, + { + "epoch": 0.21392, + "grad_norm": 2.15625, + "grad_norm_var": 0.009383138020833333, + "learning_rate": 0.0001, + "loss": 4.3587, + "loss/crossentropy": 2.1171644926071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173115462064743, + "step": 10696 + }, + { + "epoch": 0.21396, + "grad_norm": 2.0625, + "grad_norm_var": 0.008698527018229167, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 1.9327389001846313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025491297245026, + "step": 10698 + }, + { + "epoch": 0.214, + "grad_norm": 1.90625, + "grad_norm_var": 0.009952799479166666, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 1.9806901216506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2089216560125351, + "step": 10700 + }, + { + "epoch": 0.21404, + "grad_norm": 2.109375, + "grad_norm_var": 0.007503000895182291, + "learning_rate": 0.0001, + "loss": 4.2503, + "loss/crossentropy": 2.216805338859558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268506348133087, + "step": 10702 + }, + { + "epoch": 0.21408, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0055084228515625, + "learning_rate": 0.0001, + "loss": 4.5121, + "loss/crossentropy": 2.3998383283615112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26092807948589325, + "step": 10704 + }, + { + "epoch": 0.21412, + "grad_norm": 2.171875, + "grad_norm_var": 0.007869466145833334, + "learning_rate": 0.0001, + "loss": 4.4273, + "loss/crossentropy": 2.0581844449043274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2316882163286209, + "step": 10706 + }, + { + "epoch": 0.21416, + "grad_norm": 2.09375, + "grad_norm_var": 0.006258138020833333, + "learning_rate": 0.0001, + "loss": 4.2684, + "loss/crossentropy": 2.3091371059417725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538943380117416, + "step": 10708 + }, + { + "epoch": 0.2142, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 1.7134324312210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1941806674003601, + "step": 10710 + }, + { + "epoch": 0.21424, + "grad_norm": 2.078125, + "grad_norm_var": 0.0060699462890625, + "learning_rate": 0.0001, + "loss": 4.3165, + "loss/crossentropy": 2.2040648460388184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20669078081846237, + "step": 10712 + }, + { + "epoch": 0.21428, + "grad_norm": 2.25, + "grad_norm_var": 0.020612589518229165, + "learning_rate": 0.0001, + "loss": 4.3207, + "loss/crossentropy": 2.3040376901626587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20685256272554398, + "step": 10714 + }, + { + "epoch": 0.21432, + "grad_norm": 2.328125, + "grad_norm_var": 0.020318349202473957, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.189309239387512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499691903591156, + "step": 10716 + }, + { + "epoch": 0.21436, + "grad_norm": 2.1875, + "grad_norm_var": 0.020216623942057293, + "learning_rate": 0.0001, + "loss": 4.4144, + "loss/crossentropy": 2.1955957412719727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932748913764954, + "step": 10718 + }, + { + "epoch": 0.2144, + "grad_norm": 2.140625, + "grad_norm_var": 0.020271809895833333, + "learning_rate": 0.0001, + "loss": 4.2135, + "loss/crossentropy": 1.7800896763801575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939619928598404, + "step": 10720 + }, + { + "epoch": 0.21444, + "grad_norm": 2.15625, + "grad_norm_var": 0.020335896809895834, + "learning_rate": 0.0001, + "loss": 4.2757, + "loss/crossentropy": 1.8974847197532654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434937089681625, + "step": 10722 + }, + { + "epoch": 0.21448, + "grad_norm": 2.171875, + "grad_norm_var": 0.021455891927083335, + "learning_rate": 0.0001, + "loss": 4.2688, + "loss/crossentropy": 2.1631242632865906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21765826642513275, + "step": 10724 + }, + { + "epoch": 0.21452, + "grad_norm": 2.03125, + "grad_norm_var": 0.0203521728515625, + "learning_rate": 0.0001, + "loss": 4.6421, + "loss/crossentropy": 2.241260290145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334459200501442, + "step": 10726 + }, + { + "epoch": 0.21456, + "grad_norm": 2.0625, + "grad_norm_var": 0.0226470947265625, + "learning_rate": 0.0001, + "loss": 3.9478, + "loss/crossentropy": 1.9063156247138977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22118454426527023, + "step": 10728 + }, + { + "epoch": 0.2146, + "grad_norm": 2.03125, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 1.8109349012374878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581955134868622, + "step": 10730 + }, + { + "epoch": 0.21464, + "grad_norm": 2.265625, + "grad_norm_var": 0.009749348958333333, + "learning_rate": 0.0001, + "loss": 4.3262, + "loss/crossentropy": 2.1113163232803345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21339301764965057, + "step": 10732 + }, + { + "epoch": 0.21468, + "grad_norm": 2.0625, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 4.5308, + "loss/crossentropy": 2.1731717586517334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22267260402441025, + "step": 10734 + }, + { + "epoch": 0.21472, + "grad_norm": 2.078125, + "grad_norm_var": 0.010087076822916667, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.322808027267456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375543624162674, + "step": 10736 + }, + { + "epoch": 0.21476, + "grad_norm": 2.0, + "grad_norm_var": 0.010358683268229167, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 1.8200489282608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19796901941299438, + "step": 10738 + }, + { + "epoch": 0.2148, + "grad_norm": 2.0625, + "grad_norm_var": 0.009455362955729166, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.353461265563965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22755976021289825, + "step": 10740 + }, + { + "epoch": 0.21484, + "grad_norm": 2.015625, + "grad_norm_var": 0.0071451822916666664, + "learning_rate": 0.0001, + "loss": 4.2925, + "loss/crossentropy": 2.3200663328170776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23450587689876556, + "step": 10742 + }, + { + "epoch": 0.21488, + "grad_norm": 2.0, + "grad_norm_var": 0.006403605143229167, + "learning_rate": 0.0001, + "loss": 4.187, + "loss/crossentropy": 1.7293912768363953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17938270419836044, + "step": 10744 + }, + { + "epoch": 0.21492, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008656565348307292, + "learning_rate": 0.0001, + "loss": 4.3213, + "loss/crossentropy": 1.9759944081306458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631389319896698, + "step": 10746 + }, + { + "epoch": 0.21496, + "grad_norm": 2.1875, + "grad_norm_var": 0.006870269775390625, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.2232764959335327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21919700503349304, + "step": 10748 + }, + { + "epoch": 0.215, + "grad_norm": 1.953125, + "grad_norm_var": 0.004898834228515625, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 2.0827722549438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20569747686386108, + "step": 10750 + }, + { + "epoch": 0.21504, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005509440104166667, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.0712032318115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20504066348075867, + "step": 10752 + }, + { + "epoch": 0.21508, + "grad_norm": 2.234375, + "grad_norm_var": 0.007963053385416667, + "learning_rate": 0.0001, + "loss": 4.3466, + "loss/crossentropy": 2.2717082500457764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22957370430231094, + "step": 10754 + }, + { + "epoch": 0.21512, + "grad_norm": 2.359375, + "grad_norm_var": 0.0133056640625, + "learning_rate": 0.0001, + "loss": 4.2821, + "loss/crossentropy": 2.263810157775879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2291572391986847, + "step": 10756 + }, + { + "epoch": 0.21516, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013724517822265626, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.103231191635132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21839886158704758, + "step": 10758 + }, + { + "epoch": 0.2152, + "grad_norm": 2.203125, + "grad_norm_var": 0.013962554931640624, + "learning_rate": 0.0001, + "loss": 4.4772, + "loss/crossentropy": 2.1272148489952087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280830293893814, + "step": 10760 + }, + { + "epoch": 0.21524, + "grad_norm": 2.078125, + "grad_norm_var": 0.012962849934895833, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.133938789367676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162996605038643, + "step": 10762 + }, + { + "epoch": 0.21528, + "grad_norm": 2.828125, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 4.5351, + "loss/crossentropy": 2.305683732032776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797836154699326, + "step": 10764 + }, + { + "epoch": 0.21532, + "grad_norm": 2.078125, + "grad_norm_var": 0.04370829264322917, + "learning_rate": 0.0001, + "loss": 3.9886, + "loss/crossentropy": 1.7248413562774658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17226764559745789, + "step": 10766 + }, + { + "epoch": 0.21536, + "grad_norm": 2.1875, + "grad_norm_var": 0.041715240478515624, + "learning_rate": 0.0001, + "loss": 4.6422, + "loss/crossentropy": 2.275644540786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2433001920580864, + "step": 10768 + }, + { + "epoch": 0.2154, + "grad_norm": 2.1875, + "grad_norm_var": 0.043338775634765625, + "learning_rate": 0.0001, + "loss": 4.2836, + "loss/crossentropy": 1.908457100391388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19758973270654678, + "step": 10770 + }, + { + "epoch": 0.21544, + "grad_norm": 1.8984375, + "grad_norm_var": 0.07423909505208333, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 1.9401238560676575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177566587924957, + "step": 10772 + }, + { + "epoch": 0.21548, + "grad_norm": 2.046875, + "grad_norm_var": 0.07476170857747395, + "learning_rate": 0.0001, + "loss": 4.295, + "loss/crossentropy": 2.247257351875305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21560464799404144, + "step": 10774 + }, + { + "epoch": 0.21552, + "grad_norm": 2.015625, + "grad_norm_var": 0.07644424438476563, + "learning_rate": 0.0001, + "loss": 4.1482, + "loss/crossentropy": 1.9191248416900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214834064245224, + "step": 10776 + }, + { + "epoch": 0.21556, + "grad_norm": 2.0625, + "grad_norm_var": 0.0785888671875, + "learning_rate": 0.0001, + "loss": 4.0644, + "loss/crossentropy": 1.8963102102279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20552606880664825, + "step": 10778 + }, + { + "epoch": 0.2156, + "grad_norm": 2.015625, + "grad_norm_var": 0.047078450520833336, + "learning_rate": 0.0001, + "loss": 4.265, + "loss/crossentropy": 2.0517951250076294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963544860482216, + "step": 10780 + }, + { + "epoch": 0.21564, + "grad_norm": 2.171875, + "grad_norm_var": 0.046727498372395836, + "learning_rate": 0.0001, + "loss": 4.3571, + "loss/crossentropy": 2.1484888792037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23963741213083267, + "step": 10782 + }, + { + "epoch": 0.21568, + "grad_norm": 2.671875, + "grad_norm_var": 0.06716206868489584, + "learning_rate": 0.0001, + "loss": 4.6827, + "loss/crossentropy": 2.012593388557434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115195021033287, + "step": 10784 + }, + { + "epoch": 0.21572, + "grad_norm": 2.03125, + "grad_norm_var": 0.06651102701822917, + "learning_rate": 0.0001, + "loss": 4.018, + "loss/crossentropy": 2.139856696128845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23609659075737, + "step": 10786 + }, + { + "epoch": 0.21576, + "grad_norm": 2.25, + "grad_norm_var": 0.03216120402018229, + "learning_rate": 0.0001, + "loss": 4.5007, + "loss/crossentropy": 2.0139951705932617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22799161076545715, + "step": 10788 + }, + { + "epoch": 0.2158, + "grad_norm": 2.125, + "grad_norm_var": 0.035676829020182294, + "learning_rate": 0.0001, + "loss": 4.227, + "loss/crossentropy": 1.8325074315071106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156912177801132, + "step": 10790 + }, + { + "epoch": 0.21584, + "grad_norm": 2.109375, + "grad_norm_var": 0.04155654907226562, + "learning_rate": 0.0001, + "loss": 3.8042, + "loss/crossentropy": 1.7531892657279968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18498936295509338, + "step": 10792 + }, + { + "epoch": 0.21588, + "grad_norm": 2.25, + "grad_norm_var": 0.03681233723958333, + "learning_rate": 0.0001, + "loss": 4.3955, + "loss/crossentropy": 2.09742671251297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543975174427032, + "step": 10794 + }, + { + "epoch": 0.21592, + "grad_norm": 2.09375, + "grad_norm_var": 0.03532613118489583, + "learning_rate": 0.0001, + "loss": 4.4752, + "loss/crossentropy": 2.4303117990493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24365779757499695, + "step": 10796 + }, + { + "epoch": 0.21596, + "grad_norm": 2.1875, + "grad_norm_var": 0.0371002197265625, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.007950007915497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470578223466873, + "step": 10798 + }, + { + "epoch": 0.216, + "grad_norm": 2.5, + "grad_norm_var": 0.026949055989583335, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 1.895507276058197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005787119269371, + "step": 10800 + }, + { + "epoch": 0.21604, + "grad_norm": 2.25, + "grad_norm_var": 0.026691691080729166, + "learning_rate": 0.0001, + "loss": 4.2543, + "loss/crossentropy": 2.174374043941498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282470539212227, + "step": 10802 + }, + { + "epoch": 0.21608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02789484659830729, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 1.8878389596939087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19730794429779053, + "step": 10804 + }, + { + "epoch": 0.21612, + "grad_norm": 2.203125, + "grad_norm_var": 0.023884073893229166, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 1.9874022006988525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21521702408790588, + "step": 10806 + }, + { + "epoch": 0.21616, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01962458292643229, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.0173474550247192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21552105993032455, + "step": 10808 + }, + { + "epoch": 0.2162, + "grad_norm": 2.140625, + "grad_norm_var": 0.018344879150390625, + "learning_rate": 0.0001, + "loss": 4.0583, + "loss/crossentropy": 2.175139367580414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279064774513245, + "step": 10810 + }, + { + "epoch": 0.21624, + "grad_norm": 2.390625, + "grad_norm_var": 0.022849273681640626, + "learning_rate": 0.0001, + "loss": 4.5125, + "loss/crossentropy": 2.6407864093780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2939087897539139, + "step": 10812 + }, + { + "epoch": 0.21628, + "grad_norm": 2.09375, + "grad_norm_var": 0.024930826822916665, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 1.947974681854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102196291089058, + "step": 10814 + }, + { + "epoch": 0.21632, + "grad_norm": 9.875, + "grad_norm_var": 3.7874745686848956, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 1.9040276408195496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579400658607483, + "step": 10816 + }, + { + "epoch": 0.21636, + "grad_norm": 2.21875, + "grad_norm_var": 3.7853190104166665, + "learning_rate": 0.0001, + "loss": 3.7119, + "loss/crossentropy": 1.7941421270370483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792088121175766, + "step": 10818 + }, + { + "epoch": 0.2164, + "grad_norm": 1.953125, + "grad_norm_var": 3.7839088439941406, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.2164441347122192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2903618812561035, + "step": 10820 + }, + { + "epoch": 0.21644, + "grad_norm": 2.109375, + "grad_norm_var": 3.77445068359375, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 1.9888933897018433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20800678431987762, + "step": 10822 + }, + { + "epoch": 0.21648, + "grad_norm": 1.875, + "grad_norm_var": 3.79569091796875, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.9026559591293335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18179579824209213, + "step": 10824 + }, + { + "epoch": 0.21652, + "grad_norm": 2.078125, + "grad_norm_var": 3.7974202473958334, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 1.9020920991897583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20755057036876678, + "step": 10826 + }, + { + "epoch": 0.21656, + "grad_norm": 2.03125, + "grad_norm_var": 3.8177286783854165, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 2.3464537858963013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23209182918071747, + "step": 10828 + }, + { + "epoch": 0.2166, + "grad_norm": 2.3125, + "grad_norm_var": 3.7860877990722654, + "learning_rate": 0.0001, + "loss": 4.4486, + "loss/crossentropy": 2.305312991142273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2548810988664627, + "step": 10830 + }, + { + "epoch": 0.21664, + "grad_norm": 2.515625, + "grad_norm_var": 0.026775868733723958, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 1.4548576474189758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1718049794435501, + "step": 10832 + }, + { + "epoch": 0.21668, + "grad_norm": 2.1875, + "grad_norm_var": 0.02504247029622396, + "learning_rate": 0.0001, + "loss": 4.4909, + "loss/crossentropy": 2.2954181432724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22977370768785477, + "step": 10834 + }, + { + "epoch": 0.21672, + "grad_norm": 2.09375, + "grad_norm_var": 0.022930653889973958, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 2.1974023580551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474600583314896, + "step": 10836 + }, + { + "epoch": 0.21676, + "grad_norm": 2.109375, + "grad_norm_var": 0.02304865519205729, + "learning_rate": 0.0001, + "loss": 4.5905, + "loss/crossentropy": 2.1211158633232117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19471213221549988, + "step": 10838 + }, + { + "epoch": 0.2168, + "grad_norm": 1.984375, + "grad_norm_var": 0.0204010009765625, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 1.9942908883094788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899253949522972, + "step": 10840 + }, + { + "epoch": 0.21684, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023656209309895832, + "learning_rate": 0.0001, + "loss": 4.0787, + "loss/crossentropy": 1.9848283529281616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777423620223999, + "step": 10842 + }, + { + "epoch": 0.21688, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02545140584309896, + "learning_rate": 0.0001, + "loss": 4.1365, + "loss/crossentropy": 1.84994775056839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193057715892792, + "step": 10844 + }, + { + "epoch": 0.21692, + "grad_norm": 2.09375, + "grad_norm_var": 0.02163670857747396, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 1.8300130367279053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19747256487607956, + "step": 10846 + }, + { + "epoch": 0.21696, + "grad_norm": 2.234375, + "grad_norm_var": 0.010676829020182292, + "learning_rate": 0.0001, + "loss": 4.1624, + "loss/crossentropy": 1.9316250681877136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22893256694078445, + "step": 10848 + }, + { + "epoch": 0.217, + "grad_norm": 2.109375, + "grad_norm_var": 0.009085845947265626, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 1.9862067103385925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22470611333847046, + "step": 10850 + }, + { + "epoch": 0.21704, + "grad_norm": 2.140625, + "grad_norm_var": 0.009372711181640625, + "learning_rate": 0.0001, + "loss": 4.3979, + "loss/crossentropy": 2.196234107017517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22592387348413467, + "step": 10852 + }, + { + "epoch": 0.21708, + "grad_norm": 2.328125, + "grad_norm_var": 0.2918291727701823, + "learning_rate": 0.0001, + "loss": 4.5516, + "loss/crossentropy": 2.3097496032714844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29694322496652603, + "step": 10854 + }, + { + "epoch": 0.21712, + "grad_norm": 1.984375, + "grad_norm_var": 0.2865191141764323, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 1.883777916431427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21657781302928925, + "step": 10856 + }, + { + "epoch": 0.21716, + "grad_norm": 2.25, + "grad_norm_var": 0.2916338602701823, + "learning_rate": 0.0001, + "loss": 4.1915, + "loss/crossentropy": 2.0483964681625366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22432015091180801, + "step": 10858 + }, + { + "epoch": 0.2172, + "grad_norm": 2.140625, + "grad_norm_var": 0.2870839436848958, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.2700769901275635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760191768407822, + "step": 10860 + }, + { + "epoch": 0.21724, + "grad_norm": 2.03125, + "grad_norm_var": 0.2878214518229167, + "learning_rate": 0.0001, + "loss": 4.574, + "loss/crossentropy": 2.3758704662323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792180836200714, + "step": 10862 + }, + { + "epoch": 0.21728, + "grad_norm": 2.03125, + "grad_norm_var": 0.2934641520182292, + "learning_rate": 0.0001, + "loss": 4.2895, + "loss/crossentropy": 2.1972473859786987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22460343688726425, + "step": 10864 + }, + { + "epoch": 0.21732, + "grad_norm": 2.171875, + "grad_norm_var": 0.29321187337239585, + "learning_rate": 0.0001, + "loss": 4.3693, + "loss/crossentropy": 1.9811919331550598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23008134216070175, + "step": 10866 + }, + { + "epoch": 0.21736, + "grad_norm": 2.03125, + "grad_norm_var": 0.2992327372233073, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.032214403152466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958027482032776, + "step": 10868 + }, + { + "epoch": 0.2174, + "grad_norm": 2.09375, + "grad_norm_var": 0.035162099202473956, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.08488667011261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21948565542697906, + "step": 10870 + }, + { + "epoch": 0.21744, + "grad_norm": 2.109375, + "grad_norm_var": 0.03401260375976563, + "learning_rate": 0.0001, + "loss": 4.2923, + "loss/crossentropy": 2.304922103881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977907121181488, + "step": 10872 + }, + { + "epoch": 0.21748, + "grad_norm": 2.03125, + "grad_norm_var": 0.005582427978515625, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 1.991280436515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196018323302269, + "step": 10874 + }, + { + "epoch": 0.21752, + "grad_norm": 2.09375, + "grad_norm_var": 0.004965972900390625, + "learning_rate": 0.0001, + "loss": 4.4116, + "loss/crossentropy": 1.955183207988739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21059879660606384, + "step": 10876 + }, + { + "epoch": 0.21756, + "grad_norm": 2.140625, + "grad_norm_var": 0.0045562744140625, + "learning_rate": 0.0001, + "loss": 4.2105, + "loss/crossentropy": 2.1443604230880737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23160522431135178, + "step": 10878 + }, + { + "epoch": 0.2176, + "grad_norm": 2.140625, + "grad_norm_var": 0.004889933268229166, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 2.0859211683273315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296902909874916, + "step": 10880 + }, + { + "epoch": 0.21764, + "grad_norm": 2.0, + "grad_norm_var": 0.0059234619140625, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.220987915992737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22863281518220901, + "step": 10882 + }, + { + "epoch": 0.21768, + "grad_norm": 1.96875, + "grad_norm_var": 0.005625152587890625, + "learning_rate": 0.0001, + "loss": 4.2195, + "loss/crossentropy": 2.1172574758529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197882778942585, + "step": 10884 + }, + { + "epoch": 0.21772, + "grad_norm": 2.25, + "grad_norm_var": 0.02047704060872396, + "learning_rate": 0.0001, + "loss": 4.3907, + "loss/crossentropy": 1.7861940264701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984492540359497, + "step": 10886 + }, + { + "epoch": 0.21776, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0218414306640625, + "learning_rate": 0.0001, + "loss": 4.0445, + "loss/crossentropy": 2.1119033098220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427606999874115, + "step": 10888 + }, + { + "epoch": 0.2178, + "grad_norm": 2.15625, + "grad_norm_var": 0.02235692342122396, + "learning_rate": 0.0001, + "loss": 3.923, + "loss/crossentropy": 1.7849717140197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20817570388317108, + "step": 10890 + }, + { + "epoch": 0.21784, + "grad_norm": 2.03125, + "grad_norm_var": 0.025131988525390624, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.0543535351753235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579595863819122, + "step": 10892 + }, + { + "epoch": 0.21788, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02643000284830729, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 2.085465431213379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23541826009750366, + "step": 10894 + }, + { + "epoch": 0.21792, + "grad_norm": 2.40625, + "grad_norm_var": 0.03343073527018229, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 1.9521069526672363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21404746174812317, + "step": 10896 + }, + { + "epoch": 0.21796, + "grad_norm": 1.96875, + "grad_norm_var": 0.03172378540039063, + "learning_rate": 0.0001, + "loss": 4.039, + "loss/crossentropy": 1.8710272908210754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20500759035348892, + "step": 10898 + }, + { + "epoch": 0.218, + "grad_norm": 2.0625, + "grad_norm_var": 0.032083892822265626, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.275243639945984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24073244631290436, + "step": 10900 + }, + { + "epoch": 0.21804, + "grad_norm": 2.109375, + "grad_norm_var": 0.0181060791015625, + "learning_rate": 0.0001, + "loss": 4.3508, + "loss/crossentropy": 2.2093106508255005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23254899680614471, + "step": 10902 + }, + { + "epoch": 0.21808, + "grad_norm": 2.0, + "grad_norm_var": 0.01702855428059896, + "learning_rate": 0.0001, + "loss": 4.2805, + "loss/crossentropy": 2.0315810441970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614907145500183, + "step": 10904 + }, + { + "epoch": 0.21812, + "grad_norm": 2.015625, + "grad_norm_var": 0.016478474934895834, + "learning_rate": 0.0001, + "loss": 4.3461, + "loss/crossentropy": 2.0397735834121704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093086987733841, + "step": 10906 + }, + { + "epoch": 0.21816, + "grad_norm": 2.03125, + "grad_norm_var": 0.013508097330729166, + "learning_rate": 0.0001, + "loss": 4.0286, + "loss/crossentropy": 1.9616519808769226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433944672346115, + "step": 10908 + }, + { + "epoch": 0.2182, + "grad_norm": 2.25, + "grad_norm_var": 0.01749445597330729, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.1907248497009277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23218485713005066, + "step": 10910 + }, + { + "epoch": 0.21824, + "grad_norm": 2.0625, + "grad_norm_var": 0.009981282552083333, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.2785152196884155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21416624635457993, + "step": 10912 + }, + { + "epoch": 0.21828, + "grad_norm": 2.03125, + "grad_norm_var": 0.00955810546875, + "learning_rate": 0.0001, + "loss": 4.2528, + "loss/crossentropy": 2.151498794555664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23632780462503433, + "step": 10914 + }, + { + "epoch": 0.21832, + "grad_norm": 2.03125, + "grad_norm_var": 0.008829752604166666, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.816411852836609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243700549006462, + "step": 10916 + }, + { + "epoch": 0.21836, + "grad_norm": 2.359375, + "grad_norm_var": 0.014662424723307291, + "learning_rate": 0.0001, + "loss": 4.4091, + "loss/crossentropy": 2.2203346490859985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302204370498657, + "step": 10918 + }, + { + "epoch": 0.2184, + "grad_norm": 2.0, + "grad_norm_var": 0.014662424723307291, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.0416316390037537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798437297344208, + "step": 10920 + }, + { + "epoch": 0.21844, + "grad_norm": 2.078125, + "grad_norm_var": 0.014426422119140626, + "learning_rate": 0.0001, + "loss": 4.1014, + "loss/crossentropy": 2.2251007556915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149331197142601, + "step": 10922 + }, + { + "epoch": 0.21848, + "grad_norm": 2.0, + "grad_norm_var": 0.014631907145182291, + "learning_rate": 0.0001, + "loss": 4.2346, + "loss/crossentropy": 1.976640522480011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21704353392124176, + "step": 10924 + }, + { + "epoch": 0.21852, + "grad_norm": 2.0625, + "grad_norm_var": 0.011006418863932292, + "learning_rate": 0.0001, + "loss": 4.3284, + "loss/crossentropy": 2.041864037513733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19621634483337402, + "step": 10926 + }, + { + "epoch": 0.21856, + "grad_norm": 2.125, + "grad_norm_var": 0.009696451822916667, + "learning_rate": 0.0001, + "loss": 4.241, + "loss/crossentropy": 1.7188060879707336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992729976773262, + "step": 10928 + }, + { + "epoch": 0.2186, + "grad_norm": 2.140625, + "grad_norm_var": 0.010512034098307291, + "learning_rate": 0.0001, + "loss": 4.0333, + "loss/crossentropy": 1.664458692073822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20099520683288574, + "step": 10930 + }, + { + "epoch": 0.21864, + "grad_norm": 2.015625, + "grad_norm_var": 0.011161041259765626, + "learning_rate": 0.0001, + "loss": 4.1952, + "loss/crossentropy": 2.199326276779175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23000852018594742, + "step": 10932 + }, + { + "epoch": 0.21868, + "grad_norm": 2.015625, + "grad_norm_var": 0.005694325764973958, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.980049967765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211422473192215, + "step": 10934 + }, + { + "epoch": 0.21872, + "grad_norm": 2.09375, + "grad_norm_var": 0.005411529541015625, + "learning_rate": 0.0001, + "loss": 4.0876, + "loss/crossentropy": 1.7884072661399841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203270323574543, + "step": 10936 + }, + { + "epoch": 0.21876, + "grad_norm": 2.0, + "grad_norm_var": 0.005527496337890625, + "learning_rate": 0.0001, + "loss": 4.1923, + "loss/crossentropy": 2.1758522987365723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21960537880659103, + "step": 10938 + }, + { + "epoch": 0.2188, + "grad_norm": 2.171875, + "grad_norm_var": 0.004416656494140625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.255508303642273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171120047569275, + "step": 10940 + }, + { + "epoch": 0.21884, + "grad_norm": 2.0, + "grad_norm_var": 0.004308827718098958, + "learning_rate": 0.0001, + "loss": 4.1294, + "loss/crossentropy": 1.9758012890815735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945884495973587, + "step": 10942 + }, + { + "epoch": 0.21888, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0055582682291666664, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.2980172634124756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20864219218492508, + "step": 10944 + }, + { + "epoch": 0.21892, + "grad_norm": 2.09375, + "grad_norm_var": 0.006666819254557292, + "learning_rate": 0.0001, + "loss": 3.919, + "loss/crossentropy": 1.5307916402816772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799665167927742, + "step": 10946 + }, + { + "epoch": 0.21896, + "grad_norm": 2.0625, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 4.3796, + "loss/crossentropy": 2.3332602977752686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22964681684970856, + "step": 10948 + }, + { + "epoch": 0.219, + "grad_norm": 2.078125, + "grad_norm_var": 0.007380930582682291, + "learning_rate": 0.0001, + "loss": 4.1207, + "loss/crossentropy": 2.2509007453918457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141733020544052, + "step": 10950 + }, + { + "epoch": 0.21904, + "grad_norm": 2.1875, + "grad_norm_var": 0.008949534098307291, + "learning_rate": 0.0001, + "loss": 4.4879, + "loss/crossentropy": 2.3677018880844116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25676336884498596, + "step": 10952 + }, + { + "epoch": 0.21908, + "grad_norm": 2.21875, + "grad_norm_var": 0.009934234619140624, + "learning_rate": 0.0001, + "loss": 4.4283, + "loss/crossentropy": 2.2807843685150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21486903727054596, + "step": 10954 + }, + { + "epoch": 0.21912, + "grad_norm": 2.078125, + "grad_norm_var": 0.009232330322265624, + "learning_rate": 0.0001, + "loss": 4.3126, + "loss/crossentropy": 2.3883347511291504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21362561732530594, + "step": 10956 + }, + { + "epoch": 0.21916, + "grad_norm": 2.0625, + "grad_norm_var": 0.009405263264973958, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.8598107695579529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19806977361440659, + "step": 10958 + }, + { + "epoch": 0.2192, + "grad_norm": 2.140625, + "grad_norm_var": 0.007738240559895833, + "learning_rate": 0.0001, + "loss": 4.3259, + "loss/crossentropy": 2.1962249875068665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640095323324203, + "step": 10960 + }, + { + "epoch": 0.21924, + "grad_norm": 2.359375, + "grad_norm_var": 0.007958984375, + "learning_rate": 0.0001, + "loss": 4.1774, + "loss/crossentropy": 2.061887502670288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22463876008987427, + "step": 10962 + }, + { + "epoch": 0.21928, + "grad_norm": 2.0625, + "grad_norm_var": 0.010237375895182291, + "learning_rate": 0.0001, + "loss": 3.9749, + "loss/crossentropy": 1.9112628102302551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327937066555023, + "step": 10964 + }, + { + "epoch": 0.21932, + "grad_norm": 2.03125, + "grad_norm_var": 0.010640207926432292, + "learning_rate": 0.0001, + "loss": 4.3078, + "loss/crossentropy": 2.1655001640319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729264944791794, + "step": 10966 + }, + { + "epoch": 0.21936, + "grad_norm": 2.171875, + "grad_norm_var": 0.010400136311848959, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 2.199634552001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23565081506967545, + "step": 10968 + }, + { + "epoch": 0.2194, + "grad_norm": 2.03125, + "grad_norm_var": 0.010155995686848959, + "learning_rate": 0.0001, + "loss": 4.4298, + "loss/crossentropy": 2.4142041206359863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23020224273204803, + "step": 10970 + }, + { + "epoch": 0.21944, + "grad_norm": 2.03125, + "grad_norm_var": 0.011193593343098959, + "learning_rate": 0.0001, + "loss": 4.3856, + "loss/crossentropy": 2.2480571269989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224863201379776, + "step": 10972 + }, + { + "epoch": 0.21948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012238566080729167, + "learning_rate": 0.0001, + "loss": 4.099, + "loss/crossentropy": 2.1786953806877136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21182173490524292, + "step": 10974 + }, + { + "epoch": 0.21952, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 4.2826, + "loss/crossentropy": 2.3767744302749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22259585559368134, + "step": 10976 + }, + { + "epoch": 0.21956, + "grad_norm": 2.25, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.405, + "loss/crossentropy": 1.9838140606880188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830464899539948, + "step": 10978 + }, + { + "epoch": 0.2196, + "grad_norm": 2.046875, + "grad_norm_var": 0.0092193603515625, + "learning_rate": 0.0001, + "loss": 4.3811, + "loss/crossentropy": 2.0167009234428406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239794790744781, + "step": 10980 + }, + { + "epoch": 0.21964, + "grad_norm": 2.5, + "grad_norm_var": 0.019269816080729165, + "learning_rate": 0.0001, + "loss": 4.3117, + "loss/crossentropy": 2.001616358757019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801268309354782, + "step": 10982 + }, + { + "epoch": 0.21968, + "grad_norm": 2.1875, + "grad_norm_var": 0.019432576497395833, + "learning_rate": 0.0001, + "loss": 4.6266, + "loss/crossentropy": 2.259532332420349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20131191611289978, + "step": 10984 + }, + { + "epoch": 0.21972, + "grad_norm": 2.09375, + "grad_norm_var": 0.0215728759765625, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 1.9223415851593018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203956007957458, + "step": 10986 + }, + { + "epoch": 0.21976, + "grad_norm": 2.0, + "grad_norm_var": 0.020295206705729166, + "learning_rate": 0.0001, + "loss": 4.0177, + "loss/crossentropy": 2.3779542446136475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23436614871025085, + "step": 10988 + }, + { + "epoch": 0.2198, + "grad_norm": 2.046875, + "grad_norm_var": 0.02195002237955729, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 1.8552000522613525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20212603360414505, + "step": 10990 + }, + { + "epoch": 0.21984, + "grad_norm": 2.03125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 1.79928320646286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19676420837640762, + "step": 10992 + }, + { + "epoch": 0.21988, + "grad_norm": 2.078125, + "grad_norm_var": 0.018619791666666666, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 1.886910319328308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21270643919706345, + "step": 10994 + }, + { + "epoch": 0.21992, + "grad_norm": 2.078125, + "grad_norm_var": 0.019066365559895833, + "learning_rate": 0.0001, + "loss": 4.0775, + "loss/crossentropy": 1.5682110786437988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19159042835235596, + "step": 10996 + }, + { + "epoch": 0.21996, + "grad_norm": 2.21875, + "grad_norm_var": 0.008104451497395833, + "learning_rate": 0.0001, + "loss": 4.2933, + "loss/crossentropy": 2.3335143327713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23225059360265732, + "step": 10998 + }, + { + "epoch": 0.22, + "grad_norm": 1.953125, + "grad_norm_var": 0.007222493489583333, + "learning_rate": 0.0001, + "loss": 4.3146, + "loss/crossentropy": 2.2234359979629517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21465980261564255, + "step": 11000 + }, + { + "epoch": 0.22004, + "grad_norm": 2.09375, + "grad_norm_var": 0.00601806640625, + "learning_rate": 0.0001, + "loss": 4.6023, + "loss/crossentropy": 2.3676271438598633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2674448639154434, + "step": 11002 + }, + { + "epoch": 0.22008, + "grad_norm": 2.0, + "grad_norm_var": 0.006180826822916667, + "learning_rate": 0.0001, + "loss": 4.1064, + "loss/crossentropy": 1.9216612577438354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036839798092842, + "step": 11004 + }, + { + "epoch": 0.22012, + "grad_norm": 2.078125, + "grad_norm_var": 0.004369099934895833, + "learning_rate": 0.0001, + "loss": 4.0427, + "loss/crossentropy": 1.6732578873634338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18962469696998596, + "step": 11006 + }, + { + "epoch": 0.22016, + "grad_norm": 2.265625, + "grad_norm_var": 0.009761555989583334, + "learning_rate": 0.0001, + "loss": 4.2704, + "loss/crossentropy": 2.0608668327331543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23521222919225693, + "step": 11008 + }, + { + "epoch": 0.2202, + "grad_norm": 2.015625, + "grad_norm_var": 0.010872395833333333, + "learning_rate": 0.0001, + "loss": 4.1808, + "loss/crossentropy": 1.8038227558135986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1811496466398239, + "step": 11010 + }, + { + "epoch": 0.22024, + "grad_norm": 2.15625, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 4.3407, + "loss/crossentropy": 1.8687627911567688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20196181535720825, + "step": 11012 + }, + { + "epoch": 0.22028, + "grad_norm": 2.1875, + "grad_norm_var": 0.01138916015625, + "learning_rate": 0.0001, + "loss": 4.0608, + "loss/crossentropy": 1.8011687397956848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18384280055761337, + "step": 11014 + }, + { + "epoch": 0.22032, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012727610270182292, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 1.8108493089675903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042345136404037, + "step": 11016 + }, + { + "epoch": 0.22036, + "grad_norm": 2.09375, + "grad_norm_var": 0.012611643473307291, + "learning_rate": 0.0001, + "loss": 4.2755, + "loss/crossentropy": 2.4056872129440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949732840061188, + "step": 11018 + }, + { + "epoch": 0.2204, + "grad_norm": 2.0625, + "grad_norm_var": 0.011451975504557291, + "learning_rate": 0.0001, + "loss": 4.2768, + "loss/crossentropy": 2.0417627692222595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21793337166309357, + "step": 11020 + }, + { + "epoch": 0.22044, + "grad_norm": 2.125, + "grad_norm_var": 0.011549631754557291, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 1.8312503099441528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038879871368408, + "step": 11022 + }, + { + "epoch": 0.22048, + "grad_norm": 2.484375, + "grad_norm_var": 0.01941095987955729, + "learning_rate": 0.0001, + "loss": 4.1846, + "loss/crossentropy": 2.0615866780281067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224861241877079, + "step": 11024 + }, + { + "epoch": 0.22052, + "grad_norm": 1.984375, + "grad_norm_var": 0.02702611287434896, + "learning_rate": 0.0001, + "loss": 4.0216, + "loss/crossentropy": 1.8578996062278748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20426958799362183, + "step": 11026 + }, + { + "epoch": 0.22056, + "grad_norm": 1.9375, + "grad_norm_var": 0.028364817301432293, + "learning_rate": 0.0001, + "loss": 3.9303, + "loss/crossentropy": 2.01333224773407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140725627541542, + "step": 11028 + }, + { + "epoch": 0.2206, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0296142578125, + "learning_rate": 0.0001, + "loss": 3.8142, + "loss/crossentropy": 1.9607431292533875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605513662099838, + "step": 11030 + }, + { + "epoch": 0.22064, + "grad_norm": 2.078125, + "grad_norm_var": 0.026712799072265626, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 1.8091335892677307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18755877763032913, + "step": 11032 + }, + { + "epoch": 0.22068, + "grad_norm": 2.09375, + "grad_norm_var": 0.0282958984375, + "learning_rate": 0.0001, + "loss": 4.0206, + "loss/crossentropy": 1.5547168254852295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18891388177871704, + "step": 11034 + }, + { + "epoch": 0.22072, + "grad_norm": 1.9453125, + "grad_norm_var": 0.029605865478515625, + "learning_rate": 0.0001, + "loss": 4.1658, + "loss/crossentropy": 2.0881760120391846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21140296012163162, + "step": 11036 + }, + { + "epoch": 0.22076, + "grad_norm": 2.953125, + "grad_norm_var": 0.07485936482747396, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.0874351263046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202945575118065, + "step": 11038 + }, + { + "epoch": 0.2208, + "grad_norm": 2.109375, + "grad_norm_var": 0.06611302693684896, + "learning_rate": 0.0001, + "loss": 4.3998, + "loss/crossentropy": 2.2349069118499756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247784063220024, + "step": 11040 + }, + { + "epoch": 0.22084, + "grad_norm": 2.328125, + "grad_norm_var": 0.060373687744140626, + "learning_rate": 0.0001, + "loss": 4.4679, + "loss/crossentropy": 2.0404593348503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23377884924411774, + "step": 11042 + }, + { + "epoch": 0.22088, + "grad_norm": 2.046875, + "grad_norm_var": 0.05997314453125, + "learning_rate": 0.0001, + "loss": 4.115, + "loss/crossentropy": 2.418761968612671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2288268655538559, + "step": 11044 + }, + { + "epoch": 0.22092, + "grad_norm": 2.25, + "grad_norm_var": 0.05608495076497396, + "learning_rate": 0.0001, + "loss": 4.514, + "loss/crossentropy": 2.3263691663742065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23381420969963074, + "step": 11046 + }, + { + "epoch": 0.22096, + "grad_norm": 2.34375, + "grad_norm_var": 0.058166249593098955, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 2.0020187497138977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993579939007759, + "step": 11048 + }, + { + "epoch": 0.221, + "grad_norm": 1.984375, + "grad_norm_var": 0.05726318359375, + "learning_rate": 0.0001, + "loss": 4.1869, + "loss/crossentropy": 2.3417880535125732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232012659311295, + "step": 11050 + }, + { + "epoch": 0.22104, + "grad_norm": 2.375, + "grad_norm_var": 0.06346817016601562, + "learning_rate": 0.0001, + "loss": 4.9882, + "loss/crossentropy": 2.2952964305877686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096505016088486, + "step": 11052 + }, + { + "epoch": 0.22108, + "grad_norm": 2.015625, + "grad_norm_var": 0.028562164306640624, + "learning_rate": 0.0001, + "loss": 4.2431, + "loss/crossentropy": 1.7963152527809143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18996400386095047, + "step": 11054 + }, + { + "epoch": 0.22112, + "grad_norm": 2.109375, + "grad_norm_var": 0.028433990478515626, + "learning_rate": 0.0001, + "loss": 4.4276, + "loss/crossentropy": 2.1328593492507935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21772069483995438, + "step": 11056 + }, + { + "epoch": 0.22116, + "grad_norm": 2.3125, + "grad_norm_var": 0.0318267822265625, + "learning_rate": 0.0001, + "loss": 4.317, + "loss/crossentropy": 1.8048993349075317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17359213531017303, + "step": 11058 + }, + { + "epoch": 0.2212, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0309234619140625, + "learning_rate": 0.0001, + "loss": 4.0173, + "loss/crossentropy": 1.9539333581924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828361809253693, + "step": 11060 + }, + { + "epoch": 0.22124, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032291412353515625, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 1.8231340050697327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20730753242969513, + "step": 11062 + }, + { + "epoch": 0.22128, + "grad_norm": 2.25, + "grad_norm_var": 0.033878326416015625, + "learning_rate": 0.0001, + "loss": 4.2756, + "loss/crossentropy": 1.9315263032913208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077265799045563, + "step": 11064 + }, + { + "epoch": 0.22132, + "grad_norm": 1.9375, + "grad_norm_var": 0.034795888264973956, + "learning_rate": 0.0001, + "loss": 4.2137, + "loss/crossentropy": 1.9278368949890137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203744500875473, + "step": 11066 + }, + { + "epoch": 0.22136, + "grad_norm": 2.21875, + "grad_norm_var": 0.016812896728515624, + "learning_rate": 0.0001, + "loss": 4.38, + "loss/crossentropy": 1.9314138889312744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024018093943596, + "step": 11068 + }, + { + "epoch": 0.2214, + "grad_norm": 1.9609375, + "grad_norm_var": 0.019618479410807292, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 1.822297751903534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530518889427185, + "step": 11070 + }, + { + "epoch": 0.22144, + "grad_norm": 2.09375, + "grad_norm_var": 0.02112401326497396, + "learning_rate": 0.0001, + "loss": 4.1412, + "loss/crossentropy": 2.0021358132362366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963522955775261, + "step": 11072 + }, + { + "epoch": 0.22148, + "grad_norm": 2.15625, + "grad_norm_var": 0.018293253580729165, + "learning_rate": 0.0001, + "loss": 4.5595, + "loss/crossentropy": 2.3780601024627686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24999547004699707, + "step": 11074 + }, + { + "epoch": 0.22152, + "grad_norm": 2.109375, + "grad_norm_var": 0.01727472941080729, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 1.8128371238708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940363049507141, + "step": 11076 + }, + { + "epoch": 0.22156, + "grad_norm": 2.09375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 4.1388, + "loss/crossentropy": 1.836561381816864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289480894804, + "step": 11078 + }, + { + "epoch": 0.2216, + "grad_norm": 2.03125, + "grad_norm_var": 0.0125640869140625, + "learning_rate": 0.0001, + "loss": 3.9891, + "loss/crossentropy": 2.203549385070801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23240803182125092, + "step": 11080 + }, + { + "epoch": 0.22164, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0147857666015625, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 2.126620829105377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21564993262290955, + "step": 11082 + }, + { + "epoch": 0.22168, + "grad_norm": 2.203125, + "grad_norm_var": 0.015672810872395835, + "learning_rate": 0.0001, + "loss": 4.6468, + "loss/crossentropy": 2.5914783477783203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23293393850326538, + "step": 11084 + }, + { + "epoch": 0.22172, + "grad_norm": 2.046875, + "grad_norm_var": 0.016752115885416665, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 2.097515106201172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21570277214050293, + "step": 11086 + }, + { + "epoch": 0.22176, + "grad_norm": 2.203125, + "grad_norm_var": 0.01565526326497396, + "learning_rate": 0.0001, + "loss": 4.2469, + "loss/crossentropy": 1.9408356547355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177009284496307, + "step": 11088 + }, + { + "epoch": 0.2218, + "grad_norm": 2.171875, + "grad_norm_var": 0.014713287353515625, + "learning_rate": 0.0001, + "loss": 4.378, + "loss/crossentropy": 1.9851223826408386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538930594921112, + "step": 11090 + }, + { + "epoch": 0.22184, + "grad_norm": 2.03125, + "grad_norm_var": 0.014694976806640624, + "learning_rate": 0.0001, + "loss": 4.2424, + "loss/crossentropy": 2.261754631996155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294979751110077, + "step": 11092 + }, + { + "epoch": 0.22188, + "grad_norm": 2.015625, + "grad_norm_var": 0.014924875895182292, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.978752851486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21184594929218292, + "step": 11094 + }, + { + "epoch": 0.22192, + "grad_norm": 2.21875, + "grad_norm_var": 0.014918772379557292, + "learning_rate": 0.0001, + "loss": 4.515, + "loss/crossentropy": 2.1992926597595215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2428576499223709, + "step": 11096 + }, + { + "epoch": 0.22196, + "grad_norm": 2.078125, + "grad_norm_var": 0.010131581624348959, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 1.9407767057418823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25351718813180923, + "step": 11098 + }, + { + "epoch": 0.222, + "grad_norm": 2.234375, + "grad_norm_var": 0.009993235270182291, + "learning_rate": 0.0001, + "loss": 4.2485, + "loss/crossentropy": 2.2973347902297974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211253046989441, + "step": 11100 + }, + { + "epoch": 0.22204, + "grad_norm": 2.109375, + "grad_norm_var": 0.007380930582682291, + "learning_rate": 0.0001, + "loss": 4.466, + "loss/crossentropy": 2.323284387588501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23907601833343506, + "step": 11102 + }, + { + "epoch": 0.22208, + "grad_norm": 2.21875, + "grad_norm_var": 0.00631103515625, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 1.9557610750198364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206698775291443, + "step": 11104 + }, + { + "epoch": 0.22212, + "grad_norm": 2.203125, + "grad_norm_var": 0.006083170572916667, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.130094528198242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22658853232860565, + "step": 11106 + }, + { + "epoch": 0.22216, + "grad_norm": 2.140625, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 4.0207, + "loss/crossentropy": 1.6461073160171509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183306485414505, + "step": 11108 + }, + { + "epoch": 0.2222, + "grad_norm": 2.265625, + "grad_norm_var": 0.008310699462890625, + "learning_rate": 0.0001, + "loss": 4.2901, + "loss/crossentropy": 1.9869291186332703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20724371075630188, + "step": 11110 + }, + { + "epoch": 0.22224, + "grad_norm": 2.15625, + "grad_norm_var": 0.007696278889973958, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 2.314267873764038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22680803388357162, + "step": 11112 + }, + { + "epoch": 0.22228, + "grad_norm": 2.109375, + "grad_norm_var": 0.0104888916015625, + "learning_rate": 0.0001, + "loss": 4.37, + "loss/crossentropy": 2.061935067176819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931627094745636, + "step": 11114 + }, + { + "epoch": 0.22232, + "grad_norm": 2.03125, + "grad_norm_var": 0.009601847330729166, + "learning_rate": 0.0001, + "loss": 4.1174, + "loss/crossentropy": 2.157355546951294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328970953822136, + "step": 11116 + }, + { + "epoch": 0.22236, + "grad_norm": 2.1875, + "grad_norm_var": 0.012214914957682291, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.8968737125396729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19509851932525635, + "step": 11118 + }, + { + "epoch": 0.2224, + "grad_norm": 2.15625, + "grad_norm_var": 0.013516998291015625, + "learning_rate": 0.0001, + "loss": 4.3433, + "loss/crossentropy": 2.1147825717926025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639968246221542, + "step": 11120 + }, + { + "epoch": 0.22244, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01519775390625, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 1.9084222316741943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16847409307956696, + "step": 11122 + }, + { + "epoch": 0.22248, + "grad_norm": 2.0625, + "grad_norm_var": 0.015028635660807291, + "learning_rate": 0.0001, + "loss": 4.0188, + "loss/crossentropy": 1.8558747172355652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013881430029869, + "step": 11124 + }, + { + "epoch": 0.22252, + "grad_norm": 2.203125, + "grad_norm_var": 0.015755208333333333, + "learning_rate": 0.0001, + "loss": 4.3236, + "loss/crossentropy": 2.2021052837371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2129388153553009, + "step": 11126 + }, + { + "epoch": 0.22256, + "grad_norm": 1.8125, + "grad_norm_var": 0.019212849934895835, + "learning_rate": 0.0001, + "loss": 4.2892, + "loss/crossentropy": 2.267111897468567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087457031011581, + "step": 11128 + }, + { + "epoch": 0.2226, + "grad_norm": 2.0625, + "grad_norm_var": 0.02021052042643229, + "learning_rate": 0.0001, + "loss": 4.3781, + "loss/crossentropy": 1.860244870185852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21286892145872116, + "step": 11130 + }, + { + "epoch": 0.22264, + "grad_norm": 2.046875, + "grad_norm_var": 0.021142578125, + "learning_rate": 0.0001, + "loss": 3.7832, + "loss/crossentropy": 1.8312670588493347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20548796653747559, + "step": 11132 + }, + { + "epoch": 0.22268, + "grad_norm": 2.015625, + "grad_norm_var": 0.019087473551432293, + "learning_rate": 0.0001, + "loss": 4.1916, + "loss/crossentropy": 2.2309051752090454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244933694601059, + "step": 11134 + }, + { + "epoch": 0.22272, + "grad_norm": 2.09375, + "grad_norm_var": 0.01761449178059896, + "learning_rate": 0.0001, + "loss": 4.1125, + "loss/crossentropy": 2.1003236770629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222542017698288, + "step": 11136 + }, + { + "epoch": 0.22276, + "grad_norm": 1.953125, + "grad_norm_var": 0.015965779622395832, + "learning_rate": 0.0001, + "loss": 3.9391, + "loss/crossentropy": 1.95048588514328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614230632781982, + "step": 11138 + }, + { + "epoch": 0.2228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 4.1608, + "loss/crossentropy": 2.2856240272521973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748096823692322, + "step": 11140 + }, + { + "epoch": 0.22284, + "grad_norm": 2.03125, + "grad_norm_var": 0.012455240885416666, + "learning_rate": 0.0001, + "loss": 4.2187, + "loss/crossentropy": 2.3873090744018555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23743586987257004, + "step": 11142 + }, + { + "epoch": 0.22288, + "grad_norm": 2.09375, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 4.1283, + "loss/crossentropy": 2.248707115650177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22035827487707138, + "step": 11144 + }, + { + "epoch": 0.22292, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067860921223958336, + "learning_rate": 0.0001, + "loss": 4.3561, + "loss/crossentropy": 1.9792630672454834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102261707186699, + "step": 11146 + }, + { + "epoch": 0.22296, + "grad_norm": 2.078125, + "grad_norm_var": 0.006156158447265625, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 2.284587323665619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20941253006458282, + "step": 11148 + }, + { + "epoch": 0.223, + "grad_norm": 2.03125, + "grad_norm_var": 0.006361643473307292, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 1.9800288677215576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009202510118484, + "step": 11150 + }, + { + "epoch": 0.22304, + "grad_norm": 2.046875, + "grad_norm_var": 0.0064165751139322914, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 1.925826370716095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18616148829460144, + "step": 11152 + }, + { + "epoch": 0.22308, + "grad_norm": 2.046875, + "grad_norm_var": 0.0053708394368489586, + "learning_rate": 0.0001, + "loss": 3.9281, + "loss/crossentropy": 2.158196806907654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21486472338438034, + "step": 11154 + }, + { + "epoch": 0.22312, + "grad_norm": 2.078125, + "grad_norm_var": 0.004393513997395833, + "learning_rate": 0.0001, + "loss": 4.3342, + "loss/crossentropy": 2.0873841047286987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22101643681526184, + "step": 11156 + }, + { + "epoch": 0.22316, + "grad_norm": 2.0625, + "grad_norm_var": 0.0044748942057291664, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 1.9485042691230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19757962226867676, + "step": 11158 + }, + { + "epoch": 0.2232, + "grad_norm": 2.0625, + "grad_norm_var": 0.004279581705729166, + "learning_rate": 0.0001, + "loss": 4.331, + "loss/crossentropy": 2.055552661418915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271192699670792, + "step": 11160 + }, + { + "epoch": 0.22324, + "grad_norm": 2.046875, + "grad_norm_var": 0.0033599853515625, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 1.9115247130393982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922249048948288, + "step": 11162 + }, + { + "epoch": 0.22328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0043609619140625, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.1461042761802673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185581848025322, + "step": 11164 + }, + { + "epoch": 0.22332, + "grad_norm": 2.109375, + "grad_norm_var": 0.004808553059895833, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 2.1625128984451294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505116879940033, + "step": 11166 + }, + { + "epoch": 0.22336, + "grad_norm": 2.109375, + "grad_norm_var": 0.003692372639973958, + "learning_rate": 0.0001, + "loss": 4.0317, + "loss/crossentropy": 1.9978403449058533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015106976032257, + "step": 11168 + }, + { + "epoch": 0.2234, + "grad_norm": 2.109375, + "grad_norm_var": 0.003794097900390625, + "learning_rate": 0.0001, + "loss": 4.0108, + "loss/crossentropy": 1.9462909698486328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814144611358643, + "step": 11170 + }, + { + "epoch": 0.22344, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004423014322916667, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 2.262490153312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692577749490738, + "step": 11172 + }, + { + "epoch": 0.22348, + "grad_norm": 2.140625, + "grad_norm_var": 0.0052734375, + "learning_rate": 0.0001, + "loss": 4.4079, + "loss/crossentropy": 2.092001974582672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22943469882011414, + "step": 11174 + }, + { + "epoch": 0.22352, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052734375, + "learning_rate": 0.0001, + "loss": 4.273, + "loss/crossentropy": 2.302329421043396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21188674122095108, + "step": 11176 + }, + { + "epoch": 0.22356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008701324462890625, + "learning_rate": 0.0001, + "loss": 4.0762, + "loss/crossentropy": 1.7181463837623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19460663199424744, + "step": 11178 + }, + { + "epoch": 0.2236, + "grad_norm": 2.046875, + "grad_norm_var": 0.007895660400390626, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 1.8402328491210938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928708627820015, + "step": 11180 + }, + { + "epoch": 0.22364, + "grad_norm": 2.21875, + "grad_norm_var": 0.008103179931640624, + "learning_rate": 0.0001, + "loss": 4.2968, + "loss/crossentropy": 1.9490735530853271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2492925077676773, + "step": 11182 + }, + { + "epoch": 0.22368, + "grad_norm": 2.015625, + "grad_norm_var": 0.00867919921875, + "learning_rate": 0.0001, + "loss": 4.113, + "loss/crossentropy": 2.367197036743164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25882700830698013, + "step": 11184 + }, + { + "epoch": 0.22372, + "grad_norm": 2.015625, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.1192296743392944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2208312824368477, + "step": 11186 + }, + { + "epoch": 0.22376, + "grad_norm": 2.296875, + "grad_norm_var": 0.010772450764973959, + "learning_rate": 0.0001, + "loss": 4.7239, + "loss/crossentropy": 2.2150460481643677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21440500020980835, + "step": 11188 + }, + { + "epoch": 0.2238, + "grad_norm": 2.203125, + "grad_norm_var": 0.0121490478515625, + "learning_rate": 0.0001, + "loss": 4.4245, + "loss/crossentropy": 2.0837016105651855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2777148336172104, + "step": 11190 + }, + { + "epoch": 0.22384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 3.918, + "loss/crossentropy": 1.7995057106018066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18295922130346298, + "step": 11192 + }, + { + "epoch": 0.22388, + "grad_norm": 2.1875, + "grad_norm_var": 0.014357248942057291, + "learning_rate": 0.0001, + "loss": 4.3361, + "loss/crossentropy": 2.2150347232818604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24010418355464935, + "step": 11194 + }, + { + "epoch": 0.22392, + "grad_norm": 2.140625, + "grad_norm_var": 0.014357248942057291, + "learning_rate": 0.0001, + "loss": 4.1711, + "loss/crossentropy": 2.1161770820617676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124250829219818, + "step": 11196 + }, + { + "epoch": 0.22396, + "grad_norm": 2.0625, + "grad_norm_var": 0.013578033447265625, + "learning_rate": 0.0001, + "loss": 4.4875, + "loss/crossentropy": 2.224100112915039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23966734111309052, + "step": 11198 + }, + { + "epoch": 0.224, + "grad_norm": 2.09375, + "grad_norm_var": 0.011921946207682292, + "learning_rate": 0.0001, + "loss": 4.377, + "loss/crossentropy": 2.196989417076111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21709956228733063, + "step": 11200 + }, + { + "epoch": 0.22404, + "grad_norm": 2.03125, + "grad_norm_var": 0.011574045817057291, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 1.6592280864715576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827787384390831, + "step": 11202 + }, + { + "epoch": 0.22408, + "grad_norm": 2.109375, + "grad_norm_var": 0.008790842692057292, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 1.9689467549324036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20387520641088486, + "step": 11204 + }, + { + "epoch": 0.22412, + "grad_norm": 2.8125, + "grad_norm_var": 0.03997802734375, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.1846379041671753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068176046013832, + "step": 11206 + }, + { + "epoch": 0.22416, + "grad_norm": 2.0, + "grad_norm_var": 0.037353515625, + "learning_rate": 0.0001, + "loss": 4.3948, + "loss/crossentropy": 1.8361602425575256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22452392429113388, + "step": 11208 + }, + { + "epoch": 0.2242, + "grad_norm": 1.9375, + "grad_norm_var": 0.0390625, + "learning_rate": 0.0001, + "loss": 4.3597, + "loss/crossentropy": 2.5658172369003296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2602368891239166, + "step": 11210 + }, + { + "epoch": 0.22424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04119440714518229, + "learning_rate": 0.0001, + "loss": 4.0555, + "loss/crossentropy": 1.8167916536331177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330622047185898, + "step": 11212 + }, + { + "epoch": 0.22428, + "grad_norm": 2.140625, + "grad_norm_var": 0.04070612589518229, + "learning_rate": 0.0001, + "loss": 4.2374, + "loss/crossentropy": 1.9151161313056946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695462822914124, + "step": 11214 + }, + { + "epoch": 0.22432, + "grad_norm": 1.8515625, + "grad_norm_var": 0.04528401692708333, + "learning_rate": 0.0001, + "loss": 3.9807, + "loss/crossentropy": 1.9590752720832825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945583075284958, + "step": 11216 + }, + { + "epoch": 0.22436, + "grad_norm": 2.0625, + "grad_norm_var": 0.04522298177083333, + "learning_rate": 0.0001, + "loss": 4.2803, + "loss/crossentropy": 1.9777710437774658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20104076713323593, + "step": 11218 + }, + { + "epoch": 0.2244, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04744847615559896, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 1.725940465927124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038556516170502, + "step": 11220 + }, + { + "epoch": 0.22444, + "grad_norm": 2.203125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.4067, + "loss/crossentropy": 2.505728602409363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2434321641921997, + "step": 11222 + }, + { + "epoch": 0.22448, + "grad_norm": 2.03125, + "grad_norm_var": 0.010518137613932292, + "learning_rate": 0.0001, + "loss": 4.2902, + "loss/crossentropy": 2.0733951330184937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21223794668912888, + "step": 11224 + }, + { + "epoch": 0.22452, + "grad_norm": 2.03125, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 4.1548, + "loss/crossentropy": 2.105073928833008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22444891929626465, + "step": 11226 + }, + { + "epoch": 0.22456, + "grad_norm": 2.109375, + "grad_norm_var": 0.009745025634765625, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 1.9164994359016418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20893585681915283, + "step": 11228 + }, + { + "epoch": 0.2246, + "grad_norm": 2.328125, + "grad_norm_var": 0.014902496337890625, + "learning_rate": 0.0001, + "loss": 4.3753, + "loss/crossentropy": 2.0922536849975586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22757402062416077, + "step": 11230 + }, + { + "epoch": 0.22464, + "grad_norm": 2.015625, + "grad_norm_var": 0.01190185546875, + "learning_rate": 0.0001, + "loss": 3.9791, + "loss/crossentropy": 1.886056363582611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054021805524826, + "step": 11232 + }, + { + "epoch": 0.22468, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014300282796223958, + "learning_rate": 0.0001, + "loss": 3.9194, + "loss/crossentropy": 2.028389871120453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109459862112999, + "step": 11234 + }, + { + "epoch": 0.22472, + "grad_norm": 2.078125, + "grad_norm_var": 0.012813313802083334, + "learning_rate": 0.0001, + "loss": 4.2084, + "loss/crossentropy": 1.9590765833854675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19774210453033447, + "step": 11236 + }, + { + "epoch": 0.22476, + "grad_norm": 2.03125, + "grad_norm_var": 0.012800852457682291, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 1.8284756541252136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17485490441322327, + "step": 11238 + }, + { + "epoch": 0.2248, + "grad_norm": 2.203125, + "grad_norm_var": 0.014625803629557291, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.1119225025177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22235971689224243, + "step": 11240 + }, + { + "epoch": 0.22484, + "grad_norm": 2.1875, + "grad_norm_var": 0.015209706624348958, + "learning_rate": 0.0001, + "loss": 4.3303, + "loss/crossentropy": 2.3050538301467896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2249491587281227, + "step": 11242 + }, + { + "epoch": 0.22488, + "grad_norm": 2.0625, + "grad_norm_var": 0.015360514322916666, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 1.9806398153305054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20141054689884186, + "step": 11244 + }, + { + "epoch": 0.22492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007879384358723958, + "learning_rate": 0.0001, + "loss": 4.0793, + "loss/crossentropy": 2.0702012181282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980852484703064, + "step": 11246 + }, + { + "epoch": 0.22496, + "grad_norm": 2.0625, + "grad_norm_var": 0.008113606770833334, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 2.205365300178528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107956558465958, + "step": 11248 + }, + { + "epoch": 0.225, + "grad_norm": 2.890625, + "grad_norm_var": 0.05272598266601562, + "learning_rate": 0.0001, + "loss": 4.0748, + "loss/crossentropy": 2.196021556854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199070006608963, + "step": 11250 + }, + { + "epoch": 0.22504, + "grad_norm": 2.140625, + "grad_norm_var": 0.05241673787434896, + "learning_rate": 0.0001, + "loss": 4.2062, + "loss/crossentropy": 2.0661864280700684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22117173671722412, + "step": 11252 + }, + { + "epoch": 0.22508, + "grad_norm": 2.15625, + "grad_norm_var": 0.050687408447265624, + "learning_rate": 0.0001, + "loss": 3.9176, + "loss/crossentropy": 1.825901210308075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.174439437687397, + "step": 11254 + }, + { + "epoch": 0.22512, + "grad_norm": 2.3125, + "grad_norm_var": 0.061470286051432295, + "learning_rate": 0.0001, + "loss": 4.7052, + "loss/crossentropy": 2.3710498809814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2357504665851593, + "step": 11256 + }, + { + "epoch": 0.22516, + "grad_norm": 1.859375, + "grad_norm_var": 0.06520182291666667, + "learning_rate": 0.0001, + "loss": 3.9419, + "loss/crossentropy": 2.1523157358169556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179568186402321, + "step": 11258 + }, + { + "epoch": 0.2252, + "grad_norm": 2.0, + "grad_norm_var": 0.06559015909830729, + "learning_rate": 0.0001, + "loss": 4.1953, + "loss/crossentropy": 2.0974292755126953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980036199092865, + "step": 11260 + }, + { + "epoch": 0.22524, + "grad_norm": 2.234375, + "grad_norm_var": 0.06339925130208333, + "learning_rate": 0.0001, + "loss": 4.4409, + "loss/crossentropy": 2.2679883241653442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085320204496384, + "step": 11262 + }, + { + "epoch": 0.22528, + "grad_norm": 2.109375, + "grad_norm_var": 0.06108373006184896, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 2.0697352290153503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21976519376039505, + "step": 11264 + }, + { + "epoch": 0.22532, + "grad_norm": 2.03125, + "grad_norm_var": 0.025465647379557293, + "learning_rate": 0.0001, + "loss": 4.4149, + "loss/crossentropy": 2.1807767748832703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304857730865479, + "step": 11266 + }, + { + "epoch": 0.22536, + "grad_norm": 2.140625, + "grad_norm_var": 0.02541071573893229, + "learning_rate": 0.0001, + "loss": 4.4395, + "loss/crossentropy": 2.178891122341156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22129195928573608, + "step": 11268 + }, + { + "epoch": 0.2254, + "grad_norm": 2.109375, + "grad_norm_var": 0.023607381184895835, + "learning_rate": 0.0001, + "loss": 4.2908, + "loss/crossentropy": 2.1025387048721313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20581847429275513, + "step": 11270 + }, + { + "epoch": 0.22544, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013055165608723959, + "learning_rate": 0.0001, + "loss": 4.2192, + "loss/crossentropy": 1.8710424900054932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900918111205101, + "step": 11272 + }, + { + "epoch": 0.22548, + "grad_norm": 2.296875, + "grad_norm_var": 0.012373606363932291, + "learning_rate": 0.0001, + "loss": 4.498, + "loss/crossentropy": 1.8837561011314392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019018441438675, + "step": 11274 + }, + { + "epoch": 0.22552, + "grad_norm": 1.984375, + "grad_norm_var": 0.013132476806640625, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.2795380353927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23858627676963806, + "step": 11276 + }, + { + "epoch": 0.22556, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0158111572265625, + "learning_rate": 0.0001, + "loss": 4.0532, + "loss/crossentropy": 2.0241262316703796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1990864798426628, + "step": 11278 + }, + { + "epoch": 0.2256, + "grad_norm": 2.40625, + "grad_norm_var": 0.022541300455729166, + "learning_rate": 0.0001, + "loss": 4.2935, + "loss/crossentropy": 1.9307058453559875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100597321987152, + "step": 11280 + }, + { + "epoch": 0.22564, + "grad_norm": 2.171875, + "grad_norm_var": 0.020414225260416665, + "learning_rate": 0.0001, + "loss": 4.2523, + "loss/crossentropy": 2.08352792263031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21930356323719025, + "step": 11282 + }, + { + "epoch": 0.22568, + "grad_norm": 2.171875, + "grad_norm_var": 0.021361287434895834, + "learning_rate": 0.0001, + "loss": 4.4277, + "loss/crossentropy": 2.393290877342224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2560981214046478, + "step": 11284 + }, + { + "epoch": 0.22572, + "grad_norm": 2.0, + "grad_norm_var": 0.023738606770833334, + "learning_rate": 0.0001, + "loss": 4.1612, + "loss/crossentropy": 2.20097017288208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20318082720041275, + "step": 11286 + }, + { + "epoch": 0.22576, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0221099853515625, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 2.031981647014618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20313747972249985, + "step": 11288 + }, + { + "epoch": 0.2258, + "grad_norm": 2.078125, + "grad_norm_var": 0.0187255859375, + "learning_rate": 0.0001, + "loss": 4.3513, + "loss/crossentropy": 2.197006046772003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301759570837021, + "step": 11290 + }, + { + "epoch": 0.22584, + "grad_norm": 2.0625, + "grad_norm_var": 0.0197021484375, + "learning_rate": 0.0001, + "loss": 4.2903, + "loss/crossentropy": 2.294014096260071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21280069649219513, + "step": 11292 + }, + { + "epoch": 0.22588, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018089803059895833, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 2.0551719665527344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20910422503948212, + "step": 11294 + }, + { + "epoch": 0.22592, + "grad_norm": 2.109375, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.2109, + "loss/crossentropy": 2.1044358015060425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463456213474274, + "step": 11296 + }, + { + "epoch": 0.22596, + "grad_norm": 2.265625, + "grad_norm_var": 0.012631988525390625, + "learning_rate": 0.0001, + "loss": 4.198, + "loss/crossentropy": 2.1013529300689697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202504634857178, + "step": 11298 + }, + { + "epoch": 0.226, + "grad_norm": 2.046875, + "grad_norm_var": 0.009329986572265626, + "learning_rate": 0.0001, + "loss": 4.4227, + "loss/crossentropy": 2.1887649297714233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048492729663849, + "step": 11300 + }, + { + "epoch": 0.22604, + "grad_norm": 2.015625, + "grad_norm_var": 0.009525299072265625, + "learning_rate": 0.0001, + "loss": 4.0952, + "loss/crossentropy": 1.8243364691734314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013952136039734, + "step": 11302 + }, + { + "epoch": 0.22608, + "grad_norm": 2.09375, + "grad_norm_var": 0.0137451171875, + "learning_rate": 0.0001, + "loss": 4.4636, + "loss/crossentropy": 2.2550541162490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2493698000907898, + "step": 11304 + }, + { + "epoch": 0.22612, + "grad_norm": 1.953125, + "grad_norm_var": 0.0149169921875, + "learning_rate": 0.0001, + "loss": 3.9407, + "loss/crossentropy": 1.8744492530822754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957631081342697, + "step": 11306 + }, + { + "epoch": 0.22616, + "grad_norm": 2.015625, + "grad_norm_var": 0.013681793212890625, + "learning_rate": 0.0001, + "loss": 4.0717, + "loss/crossentropy": 2.026526629924774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20474696904420853, + "step": 11308 + }, + { + "epoch": 0.2262, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015881093343098958, + "learning_rate": 0.0001, + "loss": 4.3898, + "loss/crossentropy": 2.3802725076675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.247590571641922, + "step": 11310 + }, + { + "epoch": 0.22624, + "grad_norm": 2.015625, + "grad_norm_var": 0.016123199462890626, + "learning_rate": 0.0001, + "loss": 4.171, + "loss/crossentropy": 1.9359605312347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207523413002491, + "step": 11312 + }, + { + "epoch": 0.22628, + "grad_norm": 2.046875, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 4.1634, + "loss/crossentropy": 2.3035311698913574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815945208072662, + "step": 11314 + }, + { + "epoch": 0.22632, + "grad_norm": 2.234375, + "grad_norm_var": 0.0225341796875, + "learning_rate": 0.0001, + "loss": 4.3189, + "loss/crossentropy": 1.8083779215812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20705801248550415, + "step": 11316 + }, + { + "epoch": 0.22636, + "grad_norm": 2.203125, + "grad_norm_var": 0.0205718994140625, + "learning_rate": 0.0001, + "loss": 4.2912, + "loss/crossentropy": 2.306682825088501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22666794806718826, + "step": 11318 + }, + { + "epoch": 0.2264, + "grad_norm": 2.0, + "grad_norm_var": 0.01737060546875, + "learning_rate": 0.0001, + "loss": 4.0519, + "loss/crossentropy": 1.809365153312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20744331926107407, + "step": 11320 + }, + { + "epoch": 0.22644, + "grad_norm": 2.265625, + "grad_norm_var": 0.017894490559895834, + "learning_rate": 0.0001, + "loss": 4.2428, + "loss/crossentropy": 2.289853572845459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23791835457086563, + "step": 11322 + }, + { + "epoch": 0.22648, + "grad_norm": 2.53125, + "grad_norm_var": 0.027341461181640624, + "learning_rate": 0.0001, + "loss": 4.5668, + "loss/crossentropy": 2.0609280467033386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166810780763626, + "step": 11324 + }, + { + "epoch": 0.22652, + "grad_norm": 2.0625, + "grad_norm_var": 0.02535400390625, + "learning_rate": 0.0001, + "loss": 4.0567, + "loss/crossentropy": 2.1174912452697754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515341311693192, + "step": 11326 + }, + { + "epoch": 0.22656, + "grad_norm": 1.890625, + "grad_norm_var": 0.029319000244140626, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 1.99192476272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20127686113119125, + "step": 11328 + }, + { + "epoch": 0.2266, + "grad_norm": 1.9921875, + "grad_norm_var": 0.031060536702473957, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.7805609107017517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17500489950180054, + "step": 11330 + }, + { + "epoch": 0.22664, + "grad_norm": 2.171875, + "grad_norm_var": 0.02398656209309896, + "learning_rate": 0.0001, + "loss": 4.1417, + "loss/crossentropy": 2.024085283279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20659280568361282, + "step": 11332 + }, + { + "epoch": 0.22668, + "grad_norm": 1.9609375, + "grad_norm_var": 0.023631795247395834, + "learning_rate": 0.0001, + "loss": 4.2674, + "loss/crossentropy": 2.174618899822235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23250284045934677, + "step": 11334 + }, + { + "epoch": 0.22672, + "grad_norm": 1.9375, + "grad_norm_var": 0.02451171875, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 1.886943757534027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855806440114975, + "step": 11336 + }, + { + "epoch": 0.22676, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02182184855143229, + "learning_rate": 0.0001, + "loss": 4.1281, + "loss/crossentropy": 2.060371160507202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951771154999733, + "step": 11338 + }, + { + "epoch": 0.2268, + "grad_norm": 2.046875, + "grad_norm_var": 0.005576324462890625, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 2.15705668926239, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23738030344247818, + "step": 11340 + }, + { + "epoch": 0.22684, + "grad_norm": 2.015625, + "grad_norm_var": 0.005576324462890625, + "learning_rate": 0.0001, + "loss": 4.2096, + "loss/crossentropy": 2.1922959089279175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24581187963485718, + "step": 11342 + }, + { + "epoch": 0.22688, + "grad_norm": 2.0, + "grad_norm_var": 0.007303873697916667, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.1590365171432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20936457812786102, + "step": 11344 + }, + { + "epoch": 0.22692, + "grad_norm": 2.53125, + "grad_norm_var": 0.020783487955729166, + "learning_rate": 0.0001, + "loss": 4.2382, + "loss/crossentropy": 1.8087702989578247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18874070048332214, + "step": 11346 + }, + { + "epoch": 0.22696, + "grad_norm": 2.203125, + "grad_norm_var": 0.021732584635416666, + "learning_rate": 0.0001, + "loss": 4.4981, + "loss/crossentropy": 2.5205971002578735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057865887880325, + "step": 11348 + }, + { + "epoch": 0.227, + "grad_norm": 2.078125, + "grad_norm_var": 0.02061945597330729, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 2.114508092403412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631257444620132, + "step": 11350 + }, + { + "epoch": 0.22704, + "grad_norm": 2.21875, + "grad_norm_var": 0.020881144205729167, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.385537028312683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.233476921916008, + "step": 11352 + }, + { + "epoch": 0.22708, + "grad_norm": 2.21875, + "grad_norm_var": 0.0217926025390625, + "learning_rate": 0.0001, + "loss": 4.3335, + "loss/crossentropy": 2.4251039028167725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24502182751893997, + "step": 11354 + }, + { + "epoch": 0.22712, + "grad_norm": 2.078125, + "grad_norm_var": 0.025690714518229168, + "learning_rate": 0.0001, + "loss": 4.345, + "loss/crossentropy": 2.278030514717102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22328510880470276, + "step": 11356 + }, + { + "epoch": 0.22716, + "grad_norm": 2.03125, + "grad_norm_var": 0.025031534830729167, + "learning_rate": 0.0001, + "loss": 4.2835, + "loss/crossentropy": 1.7373265027999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17635879665613174, + "step": 11358 + }, + { + "epoch": 0.2272, + "grad_norm": 2.25, + "grad_norm_var": 0.024461873372395835, + "learning_rate": 0.0001, + "loss": 4.3644, + "loss/crossentropy": 2.1546722650527954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22285740077495575, + "step": 11360 + }, + { + "epoch": 0.22724, + "grad_norm": 2.140625, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.5415, + "loss/crossentropy": 2.341711401939392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23693612217903137, + "step": 11362 + }, + { + "epoch": 0.22728, + "grad_norm": 2.65625, + "grad_norm_var": 0.0461090087890625, + "learning_rate": 0.0001, + "loss": 4.3131, + "loss/crossentropy": 1.8764930367469788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19202134013175964, + "step": 11364 + }, + { + "epoch": 0.22732, + "grad_norm": 2.0625, + "grad_norm_var": 0.0450439453125, + "learning_rate": 0.0001, + "loss": 4.1172, + "loss/crossentropy": 2.035600185394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2188200280070305, + "step": 11366 + }, + { + "epoch": 0.22736, + "grad_norm": 2.015625, + "grad_norm_var": 0.045481109619140626, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 2.120850682258606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22664117813110352, + "step": 11368 + }, + { + "epoch": 0.2274, + "grad_norm": 2.03125, + "grad_norm_var": 0.050455729166666664, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.9753262996673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000861585140228, + "step": 11370 + }, + { + "epoch": 0.22744, + "grad_norm": 2.703125, + "grad_norm_var": 0.06503499348958333, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 2.0264564156532288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026129513978958, + "step": 11372 + }, + { + "epoch": 0.22748, + "grad_norm": 1.890625, + "grad_norm_var": 0.07055562337239583, + "learning_rate": 0.0001, + "loss": 3.8546, + "loss/crossentropy": 1.9269734025001526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22436296939849854, + "step": 11374 + }, + { + "epoch": 0.22752, + "grad_norm": 2.09375, + "grad_norm_var": 0.07017822265625, + "learning_rate": 0.0001, + "loss": 4.4164, + "loss/crossentropy": 2.1881991624832153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483092546463013, + "step": 11376 + }, + { + "epoch": 0.22756, + "grad_norm": 2.015625, + "grad_norm_var": 0.07446187337239583, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 2.0142401456832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21913451701402664, + "step": 11378 + }, + { + "epoch": 0.2276, + "grad_norm": 2.09375, + "grad_norm_var": 0.05676676432291667, + "learning_rate": 0.0001, + "loss": 4.3374, + "loss/crossentropy": 2.0229859352111816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.238722562789917, + "step": 11380 + }, + { + "epoch": 0.22764, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05724054972330729, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 2.043630540370941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21258120238780975, + "step": 11382 + }, + { + "epoch": 0.22768, + "grad_norm": 2.109375, + "grad_norm_var": 0.06020685831705729, + "learning_rate": 0.0001, + "loss": 4.0752, + "loss/crossentropy": 2.702946662902832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24362115561962128, + "step": 11384 + }, + { + "epoch": 0.22772, + "grad_norm": 2.046875, + "grad_norm_var": 0.05449600219726562, + "learning_rate": 0.0001, + "loss": 3.9806, + "loss/crossentropy": 2.243411898612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256402671337128, + "step": 11386 + }, + { + "epoch": 0.22776, + "grad_norm": 2.078125, + "grad_norm_var": 0.033699289957682295, + "learning_rate": 0.0001, + "loss": 3.7808, + "loss/crossentropy": 2.1451956033706665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21471337974071503, + "step": 11388 + }, + { + "epoch": 0.2278, + "grad_norm": 2.140625, + "grad_norm_var": 0.030775705973307293, + "learning_rate": 0.0001, + "loss": 4.2865, + "loss/crossentropy": 2.426279664039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22846391052007675, + "step": 11390 + }, + { + "epoch": 0.22784, + "grad_norm": 2.15625, + "grad_norm_var": 0.033455149332682295, + "learning_rate": 0.0001, + "loss": 4.2878, + "loss/crossentropy": 2.0890655517578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23008010536432266, + "step": 11392 + }, + { + "epoch": 0.22788, + "grad_norm": 2.046875, + "grad_norm_var": 0.030452219645182292, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 2.3496296405792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23718956112861633, + "step": 11394 + }, + { + "epoch": 0.22792, + "grad_norm": 2.171875, + "grad_norm_var": 0.013152821858723959, + "learning_rate": 0.0001, + "loss": 4.0025, + "loss/crossentropy": 2.02141535282135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21102941036224365, + "step": 11396 + }, + { + "epoch": 0.22796, + "grad_norm": 2.078125, + "grad_norm_var": 0.01275634765625, + "learning_rate": 0.0001, + "loss": 4.4817, + "loss/crossentropy": 2.213461995124817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21838735044002533, + "step": 11398 + }, + { + "epoch": 0.228, + "grad_norm": 2.0625, + "grad_norm_var": 0.11743062337239583, + "learning_rate": 0.0001, + "loss": 4.2313, + "loss/crossentropy": 2.161481499671936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24635545909404755, + "step": 11400 + }, + { + "epoch": 0.22804, + "grad_norm": 2.125, + "grad_norm_var": 0.1158843994140625, + "learning_rate": 0.0001, + "loss": 4.5372, + "loss/crossentropy": 2.0998951196670532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205812931060791, + "step": 11402 + }, + { + "epoch": 0.22808, + "grad_norm": 2.0625, + "grad_norm_var": 0.11588109334309896, + "learning_rate": 0.0001, + "loss": 4.1606, + "loss/crossentropy": 1.8785207867622375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330313593149185, + "step": 11404 + }, + { + "epoch": 0.22812, + "grad_norm": 2.03125, + "grad_norm_var": 0.11588109334309896, + "learning_rate": 0.0001, + "loss": 4.2408, + "loss/crossentropy": 2.254656672477722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220963753759861, + "step": 11406 + }, + { + "epoch": 0.22816, + "grad_norm": 2.078125, + "grad_norm_var": 0.11553726196289063, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 1.7936404347419739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878006011247635, + "step": 11408 + }, + { + "epoch": 0.2282, + "grad_norm": 2.453125, + "grad_norm_var": 0.12290445963541667, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 1.8537682890892029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893099844455719, + "step": 11410 + }, + { + "epoch": 0.22824, + "grad_norm": 3.234375, + "grad_norm_var": 0.1904205322265625, + "learning_rate": 0.0001, + "loss": 4.1628, + "loss/crossentropy": 2.0987170338630676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072230949997902, + "step": 11412 + }, + { + "epoch": 0.22828, + "grad_norm": 2.078125, + "grad_norm_var": 0.1907958984375, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 2.049329698085785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142597660422325, + "step": 11414 + }, + { + "epoch": 0.22832, + "grad_norm": 2.03125, + "grad_norm_var": 0.09533284505208334, + "learning_rate": 0.0001, + "loss": 4.283, + "loss/crossentropy": 2.066833019256592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22086824476718903, + "step": 11416 + }, + { + "epoch": 0.22836, + "grad_norm": 2.15625, + "grad_norm_var": 0.09519755045572917, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 2.01213002204895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23191991448402405, + "step": 11418 + }, + { + "epoch": 0.2284, + "grad_norm": 5.6875, + "grad_norm_var": 0.8556495666503906, + "learning_rate": 0.0001, + "loss": 4.1977, + "loss/crossentropy": 2.1349334716796875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22979671508073807, + "step": 11420 + }, + { + "epoch": 0.22844, + "grad_norm": 2.15625, + "grad_norm_var": 0.8487709045410157, + "learning_rate": 0.0001, + "loss": 4.2101, + "loss/crossentropy": 1.9201850295066833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21903745830059052, + "step": 11422 + }, + { + "epoch": 0.22848, + "grad_norm": 2.109375, + "grad_norm_var": 0.8546376546223958, + "learning_rate": 0.0001, + "loss": 4.1927, + "loss/crossentropy": 2.011150360107422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059009075164795, + "step": 11424 + }, + { + "epoch": 0.22852, + "grad_norm": 2.171875, + "grad_norm_var": 0.8533322652180989, + "learning_rate": 0.0001, + "loss": 4.3762, + "loss/crossentropy": 2.1529648303985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23028475046157837, + "step": 11426 + }, + { + "epoch": 0.22856, + "grad_norm": 1.8828125, + "grad_norm_var": 0.8176177978515625, + "learning_rate": 0.0001, + "loss": 3.9657, + "loss/crossentropy": 2.1602566838264465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2262946516275406, + "step": 11428 + }, + { + "epoch": 0.2286, + "grad_norm": 1.953125, + "grad_norm_var": 0.829766591389974, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 1.7230631113052368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19715693593025208, + "step": 11430 + }, + { + "epoch": 0.22864, + "grad_norm": 1.953125, + "grad_norm_var": 0.8287737528483073, + "learning_rate": 0.0001, + "loss": 4.3791, + "loss/crossentropy": 2.4396276473999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24283458292484283, + "step": 11432 + }, + { + "epoch": 0.22868, + "grad_norm": 2.171875, + "grad_norm_var": 0.8311480204264323, + "learning_rate": 0.0001, + "loss": 4.2692, + "loss/crossentropy": 1.8615645170211792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288780122995377, + "step": 11434 + }, + { + "epoch": 0.22872, + "grad_norm": 2.09375, + "grad_norm_var": 0.012941233317057292, + "learning_rate": 0.0001, + "loss": 4.394, + "loss/crossentropy": 1.8972707390785217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18510619550943375, + "step": 11436 + }, + { + "epoch": 0.22876, + "grad_norm": 2.0, + "grad_norm_var": 0.011980946858723958, + "learning_rate": 0.0001, + "loss": 4.1459, + "loss/crossentropy": 2.217754364013672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217356622219086, + "step": 11438 + }, + { + "epoch": 0.2288, + "grad_norm": 2.09375, + "grad_norm_var": 0.014818318684895833, + "learning_rate": 0.0001, + "loss": 3.7955, + "loss/crossentropy": 1.6481398940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17804963141679764, + "step": 11440 + }, + { + "epoch": 0.22884, + "grad_norm": 2.0625, + "grad_norm_var": 0.01375732421875, + "learning_rate": 0.0001, + "loss": 4.3291, + "loss/crossentropy": 2.026508390903473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739784091711044, + "step": 11442 + }, + { + "epoch": 0.22888, + "grad_norm": 2.09375, + "grad_norm_var": 0.012334950764973958, + "learning_rate": 0.0001, + "loss": 4.2086, + "loss/crossentropy": 2.05685293674469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22639908641576767, + "step": 11444 + }, + { + "epoch": 0.22892, + "grad_norm": 2.03125, + "grad_norm_var": 0.01197509765625, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 2.073192059993744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21727359294891357, + "step": 11446 + }, + { + "epoch": 0.22896, + "grad_norm": 2.171875, + "grad_norm_var": 0.0092193603515625, + "learning_rate": 0.0001, + "loss": 4.4312, + "loss/crossentropy": 2.1311055421829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19499187916517258, + "step": 11448 + }, + { + "epoch": 0.229, + "grad_norm": 2.1875, + "grad_norm_var": 0.01021728515625, + "learning_rate": 0.0001, + "loss": 4.2043, + "loss/crossentropy": 2.169051766395569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22266974300146103, + "step": 11450 + }, + { + "epoch": 0.22904, + "grad_norm": 1.875, + "grad_norm_var": 0.013727823893229166, + "learning_rate": 0.0001, + "loss": 4.1784, + "loss/crossentropy": 2.022417426109314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20541484653949738, + "step": 11452 + }, + { + "epoch": 0.22908, + "grad_norm": 2.140625, + "grad_norm_var": 0.015458170572916667, + "learning_rate": 0.0001, + "loss": 4.2921, + "loss/crossentropy": 2.2652071714401245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21639510244131088, + "step": 11454 + }, + { + "epoch": 0.22912, + "grad_norm": 2.203125, + "grad_norm_var": 0.01343994140625, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 1.912036418914795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204061597585678, + "step": 11456 + }, + { + "epoch": 0.22916, + "grad_norm": 2.0625, + "grad_norm_var": 0.014188639322916667, + "learning_rate": 0.0001, + "loss": 4.4497, + "loss/crossentropy": 2.029780328273773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032444253563881, + "step": 11458 + }, + { + "epoch": 0.2292, + "grad_norm": 2.09375, + "grad_norm_var": 0.016039021809895835, + "learning_rate": 0.0001, + "loss": 4.0627, + "loss/crossentropy": 1.954946756362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2028508484363556, + "step": 11460 + }, + { + "epoch": 0.22924, + "grad_norm": 2.8125, + "grad_norm_var": 0.049494425455729164, + "learning_rate": 0.0001, + "loss": 4.5787, + "loss/crossentropy": 2.3707855939865112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460392713546753, + "step": 11462 + }, + { + "epoch": 0.22928, + "grad_norm": 2.078125, + "grad_norm_var": 0.047972615559895834, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 2.410157322883606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24578960239887238, + "step": 11464 + }, + { + "epoch": 0.22932, + "grad_norm": 1.953125, + "grad_norm_var": 0.049117024739583334, + "learning_rate": 0.0001, + "loss": 4.0935, + "loss/crossentropy": 2.1403380036354065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563968271017075, + "step": 11466 + }, + { + "epoch": 0.22936, + "grad_norm": 1.953125, + "grad_norm_var": 0.0471588134765625, + "learning_rate": 0.0001, + "loss": 4.0991, + "loss/crossentropy": 2.1596190333366394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24405791610479355, + "step": 11468 + }, + { + "epoch": 0.2294, + "grad_norm": 2.421875, + "grad_norm_var": 0.0506744384765625, + "learning_rate": 0.0001, + "loss": 4.3973, + "loss/crossentropy": 1.9427857398986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22412577271461487, + "step": 11470 + }, + { + "epoch": 0.22944, + "grad_norm": 1.8984375, + "grad_norm_var": 0.05306574503580729, + "learning_rate": 0.0001, + "loss": 4.1093, + "loss/crossentropy": 2.078735053539276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844466239213943, + "step": 11472 + }, + { + "epoch": 0.22948, + "grad_norm": 2.0, + "grad_norm_var": 0.054323069254557294, + "learning_rate": 0.0001, + "loss": 4.0749, + "loss/crossentropy": 2.351656198501587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24430027604103088, + "step": 11474 + }, + { + "epoch": 0.22952, + "grad_norm": 2.09375, + "grad_norm_var": 0.05269139607747396, + "learning_rate": 0.0001, + "loss": 3.9647, + "loss/crossentropy": 2.057854652404785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224451810121536, + "step": 11476 + }, + { + "epoch": 0.22956, + "grad_norm": 2.03125, + "grad_norm_var": 0.018534088134765626, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 2.207979917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22855235636234283, + "step": 11478 + }, + { + "epoch": 0.2296, + "grad_norm": 2.125, + "grad_norm_var": 0.01870905558268229, + "learning_rate": 0.0001, + "loss": 4.3748, + "loss/crossentropy": 2.08840012550354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21988992393016815, + "step": 11480 + }, + { + "epoch": 0.22964, + "grad_norm": 2.125, + "grad_norm_var": 0.017144521077473957, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.1339274644851685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278972491621971, + "step": 11482 + }, + { + "epoch": 0.22968, + "grad_norm": 2.15625, + "grad_norm_var": 0.015386708577473958, + "learning_rate": 0.0001, + "loss": 4.2013, + "loss/crossentropy": 1.970679223537445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286859571933746, + "step": 11484 + }, + { + "epoch": 0.22972, + "grad_norm": 1.984375, + "grad_norm_var": 0.005295562744140625, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.1023008823394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21906304359436035, + "step": 11486 + }, + { + "epoch": 0.22976, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004870351155598958, + "learning_rate": 0.0001, + "loss": 4.2774, + "loss/crossentropy": 2.1415608525276184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21857701241970062, + "step": 11488 + }, + { + "epoch": 0.2298, + "grad_norm": 2.03125, + "grad_norm_var": 0.005191802978515625, + "learning_rate": 0.0001, + "loss": 4.0697, + "loss/crossentropy": 1.9523325562477112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19859597831964493, + "step": 11490 + }, + { + "epoch": 0.22984, + "grad_norm": 2.0, + "grad_norm_var": 0.0046770731608072914, + "learning_rate": 0.0001, + "loss": 4.2433, + "loss/crossentropy": 2.1532927751541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239024043083191, + "step": 11492 + }, + { + "epoch": 0.22988, + "grad_norm": 2.1875, + "grad_norm_var": 0.006705474853515625, + "learning_rate": 0.0001, + "loss": 4.3556, + "loss/crossentropy": 2.160528779029846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23207154870033264, + "step": 11494 + }, + { + "epoch": 0.22992, + "grad_norm": 2.078125, + "grad_norm_var": 0.006528472900390625, + "learning_rate": 0.0001, + "loss": 4.277, + "loss/crossentropy": 2.07854962348938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189328968524933, + "step": 11496 + }, + { + "epoch": 0.22996, + "grad_norm": 2.296875, + "grad_norm_var": 0.009069569905598958, + "learning_rate": 0.0001, + "loss": 4.4237, + "loss/crossentropy": 2.2270501852035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23237968981266022, + "step": 11498 + }, + { + "epoch": 0.23, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 1.9641701579093933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063433900475502, + "step": 11500 + }, + { + "epoch": 0.23004, + "grad_norm": 2.0625, + "grad_norm_var": 0.016866048177083332, + "learning_rate": 0.0001, + "loss": 4.3002, + "loss/crossentropy": 1.9243032932281494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478814095258713, + "step": 11502 + }, + { + "epoch": 0.23008, + "grad_norm": 2.140625, + "grad_norm_var": 0.01587092081705729, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 2.295292854309082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23184886574745178, + "step": 11504 + }, + { + "epoch": 0.23012, + "grad_norm": 2.046875, + "grad_norm_var": 0.01654052734375, + "learning_rate": 0.0001, + "loss": 4.0697, + "loss/crossentropy": 2.134859561920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923956274986267, + "step": 11506 + }, + { + "epoch": 0.23016, + "grad_norm": 2.109375, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 4.2523, + "loss/crossentropy": 1.905085265636444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22443564236164093, + "step": 11508 + }, + { + "epoch": 0.2302, + "grad_norm": 2.03125, + "grad_norm_var": 0.014892323811848959, + "learning_rate": 0.0001, + "loss": 4.0296, + "loss/crossentropy": 1.6967324614524841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518497586250305, + "step": 11510 + }, + { + "epoch": 0.23024, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015364583333333333, + "learning_rate": 0.0001, + "loss": 4.225, + "loss/crossentropy": 1.638957679271698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19098830223083496, + "step": 11512 + }, + { + "epoch": 0.23028, + "grad_norm": 2.078125, + "grad_norm_var": 0.013732655843098959, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 1.8760477900505066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202397421002388, + "step": 11514 + }, + { + "epoch": 0.23032, + "grad_norm": 2.03125, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.2265, + "loss/crossentropy": 2.201690912246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22841450572013855, + "step": 11516 + }, + { + "epoch": 0.23036, + "grad_norm": 2.203125, + "grad_norm_var": 0.007987467447916667, + "learning_rate": 0.0001, + "loss": 4.5451, + "loss/crossentropy": 2.5022183656692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25180216133594513, + "step": 11518 + }, + { + "epoch": 0.2304, + "grad_norm": 2.0, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.2848, + "loss/crossentropy": 2.4634610414505005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23911123722791672, + "step": 11520 + }, + { + "epoch": 0.23044, + "grad_norm": 2.046875, + "grad_norm_var": 0.007503000895182291, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 2.1176512241363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835164189338684, + "step": 11522 + }, + { + "epoch": 0.23048, + "grad_norm": 2.0625, + "grad_norm_var": 0.007252756754557292, + "learning_rate": 0.0001, + "loss": 4.0803, + "loss/crossentropy": 1.7708171606063843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18085652589797974, + "step": 11524 + }, + { + "epoch": 0.23052, + "grad_norm": 2.09375, + "grad_norm_var": 0.008504231770833334, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 1.6740695238113403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17464587092399597, + "step": 11526 + }, + { + "epoch": 0.23056, + "grad_norm": 1.984375, + "grad_norm_var": 0.008135732014973958, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 1.8153178691864014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19107046723365784, + "step": 11528 + }, + { + "epoch": 0.2306, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006231435139973958, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 2.2833873629570007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19900934398174286, + "step": 11530 + }, + { + "epoch": 0.23064, + "grad_norm": 2.03125, + "grad_norm_var": 0.008593495686848958, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 1.9106029272079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996043100953102, + "step": 11532 + }, + { + "epoch": 0.23068, + "grad_norm": 2.109375, + "grad_norm_var": 0.006030019124348958, + "learning_rate": 0.0001, + "loss": 4.3849, + "loss/crossentropy": 2.049258530139923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438845992088318, + "step": 11534 + }, + { + "epoch": 0.23072, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006029256184895833, + "learning_rate": 0.0001, + "loss": 3.9236, + "loss/crossentropy": 1.9124351739883423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20111311972141266, + "step": 11536 + }, + { + "epoch": 0.23076, + "grad_norm": 2.015625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 1.8045400381088257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846854895353317, + "step": 11538 + }, + { + "epoch": 0.2308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007297515869140625, + "learning_rate": 0.0001, + "loss": 4.1804, + "loss/crossentropy": 1.9970109462738037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165999233722687, + "step": 11540 + }, + { + "epoch": 0.23084, + "grad_norm": 2.09375, + "grad_norm_var": 0.013065338134765625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.0300097465515137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24561113119125366, + "step": 11542 + }, + { + "epoch": 0.23088, + "grad_norm": 2.5, + "grad_norm_var": 0.025402577718098958, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 2.194391131401062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24075081944465637, + "step": 11544 + }, + { + "epoch": 0.23092, + "grad_norm": 2.03125, + "grad_norm_var": 0.0239898681640625, + "learning_rate": 0.0001, + "loss": 3.9177, + "loss/crossentropy": 2.0229761600494385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21907109022140503, + "step": 11546 + }, + { + "epoch": 0.23096, + "grad_norm": 2.171875, + "grad_norm_var": 0.022606404622395833, + "learning_rate": 0.0001, + "loss": 4.3089, + "loss/crossentropy": 1.831793487071991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943443894386292, + "step": 11548 + }, + { + "epoch": 0.231, + "grad_norm": 1.984375, + "grad_norm_var": 0.023395792643229166, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 2.4099985361099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22612392157316208, + "step": 11550 + }, + { + "epoch": 0.23104, + "grad_norm": 2.109375, + "grad_norm_var": 0.022946929931640624, + "learning_rate": 0.0001, + "loss": 4.2512, + "loss/crossentropy": 2.0496281385421753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928414165973663, + "step": 11552 + }, + { + "epoch": 0.23108, + "grad_norm": 1.875, + "grad_norm_var": 0.025986480712890624, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.9366755485534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19255827367305756, + "step": 11554 + }, + { + "epoch": 0.23112, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02516454060872396, + "learning_rate": 0.0001, + "loss": 4.3758, + "loss/crossentropy": 1.9077317714691162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20340882241725922, + "step": 11556 + }, + { + "epoch": 0.23116, + "grad_norm": 2.046875, + "grad_norm_var": 0.02020848592122396, + "learning_rate": 0.0001, + "loss": 4.091, + "loss/crossentropy": 1.9167283773422241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21566946804523468, + "step": 11558 + }, + { + "epoch": 0.2312, + "grad_norm": 2.125, + "grad_norm_var": 0.0059506734212239586, + "learning_rate": 0.0001, + "loss": 4.5758, + "loss/crossentropy": 2.4604564905166626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22104239463806152, + "step": 11560 + }, + { + "epoch": 0.23124, + "grad_norm": 1.828125, + "grad_norm_var": 0.009124501546223959, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 2.359586775302887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064266949892044, + "step": 11562 + }, + { + "epoch": 0.23128, + "grad_norm": 1.875, + "grad_norm_var": 0.009456125895182292, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 1.9757064580917358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19224581122398376, + "step": 11564 + }, + { + "epoch": 0.23132, + "grad_norm": 2.125, + "grad_norm_var": 0.03843561808268229, + "learning_rate": 0.0001, + "loss": 4.5544, + "loss/crossentropy": 1.9888432025909424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20799735933542252, + "step": 11566 + }, + { + "epoch": 0.23136, + "grad_norm": 2.0625, + "grad_norm_var": 0.03802057902018229, + "learning_rate": 0.0001, + "loss": 4.2281, + "loss/crossentropy": 2.0829046964645386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345586121082306, + "step": 11568 + }, + { + "epoch": 0.2314, + "grad_norm": 2.203125, + "grad_norm_var": 0.036649322509765624, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 1.964626431465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21068184822797775, + "step": 11570 + }, + { + "epoch": 0.23144, + "grad_norm": 2.125, + "grad_norm_var": 0.034993489583333336, + "learning_rate": 0.0001, + "loss": 4.4206, + "loss/crossentropy": 2.313928008079529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291154205799103, + "step": 11572 + }, + { + "epoch": 0.23148, + "grad_norm": 1.890625, + "grad_norm_var": 0.038386027018229164, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.221343159675598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674917846918106, + "step": 11574 + }, + { + "epoch": 0.23152, + "grad_norm": 2.0625, + "grad_norm_var": 0.03853759765625, + "learning_rate": 0.0001, + "loss": 4.4097, + "loss/crossentropy": 2.070296823978424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22503886371850967, + "step": 11576 + }, + { + "epoch": 0.23156, + "grad_norm": 2.09375, + "grad_norm_var": 0.03483784993489583, + "learning_rate": 0.0001, + "loss": 3.9417, + "loss/crossentropy": 2.21540367603302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2334480583667755, + "step": 11578 + }, + { + "epoch": 0.2316, + "grad_norm": 1.8359375, + "grad_norm_var": 0.03614679972330729, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 1.9438464641571045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312584936618805, + "step": 11580 + }, + { + "epoch": 0.23164, + "grad_norm": 1.90625, + "grad_norm_var": 0.010994211832682291, + "learning_rate": 0.0001, + "loss": 4.0622, + "loss/crossentropy": 1.4970324039459229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17415452748537064, + "step": 11582 + }, + { + "epoch": 0.23168, + "grad_norm": 2.515625, + "grad_norm_var": 0.026244099934895834, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.2618579864501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066900670528412, + "step": 11584 + }, + { + "epoch": 0.23172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02603123982747396, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 2.2342761754989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20567959547042847, + "step": 11586 + }, + { + "epoch": 0.23176, + "grad_norm": 2.03125, + "grad_norm_var": 0.026151275634765624, + "learning_rate": 0.0001, + "loss": 4.0417, + "loss/crossentropy": 2.144785463809967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19473369419574738, + "step": 11588 + }, + { + "epoch": 0.2318, + "grad_norm": 2.015625, + "grad_norm_var": 0.02851130167643229, + "learning_rate": 0.0001, + "loss": 4.1996, + "loss/crossentropy": 1.9032491445541382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956048086285591, + "step": 11590 + }, + { + "epoch": 0.23184, + "grad_norm": 2.234375, + "grad_norm_var": 0.03227717081705729, + "learning_rate": 0.0001, + "loss": 4.4689, + "loss/crossentropy": 1.934233546257019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426530718803406, + "step": 11592 + }, + { + "epoch": 0.23188, + "grad_norm": 2.0, + "grad_norm_var": 0.03166071573893229, + "learning_rate": 0.0001, + "loss": 4.2321, + "loss/crossentropy": 2.1792030930519104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22469744831323624, + "step": 11594 + }, + { + "epoch": 0.23192, + "grad_norm": 2.109375, + "grad_norm_var": 0.027705891927083334, + "learning_rate": 0.0001, + "loss": 4.3779, + "loss/crossentropy": 2.3242534399032593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23015478998422623, + "step": 11596 + }, + { + "epoch": 0.23196, + "grad_norm": 2.5, + "grad_norm_var": 0.036622873942057294, + "learning_rate": 0.0001, + "loss": 4.3553, + "loss/crossentropy": 2.2343804836273193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2392263486981392, + "step": 11598 + }, + { + "epoch": 0.232, + "grad_norm": 2.171875, + "grad_norm_var": 0.024559529622395833, + "learning_rate": 0.0001, + "loss": 4.3019, + "loss/crossentropy": 1.9622855186462402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550886124372482, + "step": 11600 + }, + { + "epoch": 0.23204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.023981730143229168, + "learning_rate": 0.0001, + "loss": 4.4955, + "loss/crossentropy": 2.2274144887924194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21090717613697052, + "step": 11602 + }, + { + "epoch": 0.23208, + "grad_norm": 2.078125, + "grad_norm_var": 0.021683756510416666, + "learning_rate": 0.0001, + "loss": 4.3011, + "loss/crossentropy": 2.092648506164551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21386967599391937, + "step": 11604 + }, + { + "epoch": 0.23212, + "grad_norm": 1.9609375, + "grad_norm_var": 0.020643870035807293, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 2.3358936309814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22385042905807495, + "step": 11606 + }, + { + "epoch": 0.23216, + "grad_norm": 2.046875, + "grad_norm_var": 0.01883112589518229, + "learning_rate": 0.0001, + "loss": 4.1968, + "loss/crossentropy": 1.9800407886505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614364743232727, + "step": 11608 + }, + { + "epoch": 0.2322, + "grad_norm": 2.140625, + "grad_norm_var": 0.018536122639973958, + "learning_rate": 0.0001, + "loss": 4.2385, + "loss/crossentropy": 1.9646947979927063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21902073919773102, + "step": 11610 + }, + { + "epoch": 0.23224, + "grad_norm": 2.09375, + "grad_norm_var": 0.018930816650390626, + "learning_rate": 0.0001, + "loss": 4.1564, + "loss/crossentropy": 2.0113691687583923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21262314170598984, + "step": 11612 + }, + { + "epoch": 0.23228, + "grad_norm": 2.328125, + "grad_norm_var": 0.0254791259765625, + "learning_rate": 0.0001, + "loss": 4.3179, + "loss/crossentropy": 1.8359833359718323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125518098473549, + "step": 11614 + }, + { + "epoch": 0.23232, + "grad_norm": 2.15625, + "grad_norm_var": 0.02577489217122396, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 1.934333622455597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148296758532524, + "step": 11616 + }, + { + "epoch": 0.23236, + "grad_norm": 2.359375, + "grad_norm_var": 0.026192220052083333, + "learning_rate": 0.0001, + "loss": 4.3556, + "loss/crossentropy": 1.9486380815505981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201569102704525, + "step": 11618 + }, + { + "epoch": 0.2324, + "grad_norm": 2.140625, + "grad_norm_var": 0.0251708984375, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 1.9379103183746338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109408900141716, + "step": 11620 + }, + { + "epoch": 0.23244, + "grad_norm": 2.203125, + "grad_norm_var": 0.02165705362955729, + "learning_rate": 0.0001, + "loss": 4.3957, + "loss/crossentropy": 2.1635884046554565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23330900818109512, + "step": 11622 + }, + { + "epoch": 0.23248, + "grad_norm": 1.96875, + "grad_norm_var": 0.02851130167643229, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 2.3183244466781616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685285866260529, + "step": 11624 + }, + { + "epoch": 0.23252, + "grad_norm": 2.0, + "grad_norm_var": 0.03022028605143229, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 2.2932451367378235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22147410362958908, + "step": 11626 + }, + { + "epoch": 0.23256, + "grad_norm": 1.953125, + "grad_norm_var": 0.03144505818684896, + "learning_rate": 0.0001, + "loss": 4.0598, + "loss/crossentropy": 2.1384140253067017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141631692647934, + "step": 11628 + }, + { + "epoch": 0.2326, + "grad_norm": 2.0, + "grad_norm_var": 0.0146728515625, + "learning_rate": 0.0001, + "loss": 3.9313, + "loss/crossentropy": 1.7338963747024536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18959754705429077, + "step": 11630 + }, + { + "epoch": 0.23264, + "grad_norm": 2.046875, + "grad_norm_var": 0.014943186442057292, + "learning_rate": 0.0001, + "loss": 4.3293, + "loss/crossentropy": 2.186310887336731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19735413044691086, + "step": 11632 + }, + { + "epoch": 0.23268, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01092529296875, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.0566734075546265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21930715441703796, + "step": 11634 + }, + { + "epoch": 0.23272, + "grad_norm": 1.984375, + "grad_norm_var": 0.011631011962890625, + "learning_rate": 0.0001, + "loss": 3.8382, + "loss/crossentropy": 1.9993655681610107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999667465686798, + "step": 11636 + }, + { + "epoch": 0.23276, + "grad_norm": 2.015625, + "grad_norm_var": 0.008316802978515624, + "learning_rate": 0.0001, + "loss": 4.2309, + "loss/crossentropy": 2.2788418531417847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202587336301804, + "step": 11638 + }, + { + "epoch": 0.2328, + "grad_norm": 1.8828125, + "grad_norm_var": 0.00792236328125, + "learning_rate": 0.0001, + "loss": 3.7837, + "loss/crossentropy": 1.7248046398162842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21424376964569092, + "step": 11640 + }, + { + "epoch": 0.23284, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008235422770182292, + "learning_rate": 0.0001, + "loss": 4.0023, + "loss/crossentropy": 1.9053270816802979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046200066804886, + "step": 11642 + }, + { + "epoch": 0.23288, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0118408203125, + "learning_rate": 0.0001, + "loss": 4.3561, + "loss/crossentropy": 2.3588117361068726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23569310456514359, + "step": 11644 + }, + { + "epoch": 0.23292, + "grad_norm": 2.078125, + "grad_norm_var": 0.012271881103515625, + "learning_rate": 0.0001, + "loss": 4.4722, + "loss/crossentropy": 2.15751576423645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107885777950287, + "step": 11646 + }, + { + "epoch": 0.23296, + "grad_norm": 2.03125, + "grad_norm_var": 0.009720611572265624, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.094591200351715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20681703090667725, + "step": 11648 + }, + { + "epoch": 0.233, + "grad_norm": 2.03125, + "grad_norm_var": 0.009523264567057292, + "learning_rate": 0.0001, + "loss": 3.7832, + "loss/crossentropy": 1.7536470890045166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18886109441518784, + "step": 11650 + }, + { + "epoch": 0.23304, + "grad_norm": 2.0625, + "grad_norm_var": 0.0082427978515625, + "learning_rate": 0.0001, + "loss": 3.9905, + "loss/crossentropy": 1.8172362446784973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18359588831663132, + "step": 11652 + }, + { + "epoch": 0.23308, + "grad_norm": 2.0625, + "grad_norm_var": 0.00841064453125, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.2346678376197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154158428311348, + "step": 11654 + }, + { + "epoch": 0.23312, + "grad_norm": 2.03125, + "grad_norm_var": 0.008040110270182291, + "learning_rate": 0.0001, + "loss": 4.1559, + "loss/crossentropy": 2.0573307275772095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160855233669281, + "step": 11656 + }, + { + "epoch": 0.23316, + "grad_norm": 2.03125, + "grad_norm_var": 0.007621256510416666, + "learning_rate": 0.0001, + "loss": 4.0669, + "loss/crossentropy": 2.4358904361724854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22637036442756653, + "step": 11658 + }, + { + "epoch": 0.2332, + "grad_norm": 1.96875, + "grad_norm_var": 0.004644521077473958, + "learning_rate": 0.0001, + "loss": 3.9938, + "loss/crossentropy": 2.387966513633728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2385111078619957, + "step": 11660 + }, + { + "epoch": 0.23324, + "grad_norm": 2.046875, + "grad_norm_var": 0.004709625244140625, + "learning_rate": 0.0001, + "loss": 4.3371, + "loss/crossentropy": 2.30223548412323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149546965956688, + "step": 11662 + }, + { + "epoch": 0.23328, + "grad_norm": 2.109375, + "grad_norm_var": 0.005494944254557292, + "learning_rate": 0.0001, + "loss": 4.3242, + "loss/crossentropy": 2.001839280128479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001148834824562, + "step": 11664 + }, + { + "epoch": 0.23332, + "grad_norm": 2.25, + "grad_norm_var": 0.00693359375, + "learning_rate": 0.0001, + "loss": 4.4932, + "loss/crossentropy": 2.387674927711487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22714952379465103, + "step": 11666 + }, + { + "epoch": 0.23336, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007470448811848958, + "learning_rate": 0.0001, + "loss": 4.1333, + "loss/crossentropy": 2.133235454559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263716161251068, + "step": 11668 + }, + { + "epoch": 0.2334, + "grad_norm": 1.90625, + "grad_norm_var": 0.008874257405598959, + "learning_rate": 0.0001, + "loss": 4.1392, + "loss/crossentropy": 2.1041141748428345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20474642515182495, + "step": 11670 + }, + { + "epoch": 0.23344, + "grad_norm": 2.25, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 4.4075, + "loss/crossentropy": 2.0425861477851868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23144961893558502, + "step": 11672 + }, + { + "epoch": 0.23348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 2.0170212388038635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134404480457306, + "step": 11674 + }, + { + "epoch": 0.23352, + "grad_norm": 2.015625, + "grad_norm_var": 0.010489908854166667, + "learning_rate": 0.0001, + "loss": 4.2275, + "loss/crossentropy": 2.205981433391571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220575213432312, + "step": 11676 + }, + { + "epoch": 0.23356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011429595947265624, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 1.9121403694152832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23312295228242874, + "step": 11678 + }, + { + "epoch": 0.2336, + "grad_norm": 2.296875, + "grad_norm_var": 0.0156890869140625, + "learning_rate": 0.0001, + "loss": 4.1112, + "loss/crossentropy": 1.6793898940086365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19598360359668732, + "step": 11680 + }, + { + "epoch": 0.23364, + "grad_norm": 2.359375, + "grad_norm_var": 0.0222564697265625, + "learning_rate": 0.0001, + "loss": 4.4814, + "loss/crossentropy": 2.1657907962799072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2281472533941269, + "step": 11682 + }, + { + "epoch": 0.23368, + "grad_norm": 2.015625, + "grad_norm_var": 0.022027333577473957, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 2.1918715238571167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22837525606155396, + "step": 11684 + }, + { + "epoch": 0.23372, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020699055989583333, + "learning_rate": 0.0001, + "loss": 4.27, + "loss/crossentropy": 2.296495795249939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23491691797971725, + "step": 11686 + }, + { + "epoch": 0.23376, + "grad_norm": 2.1875, + "grad_norm_var": 0.019559733072916665, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.9278987646102905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960090771317482, + "step": 11688 + }, + { + "epoch": 0.2338, + "grad_norm": 2.0625, + "grad_norm_var": 0.019461822509765626, + "learning_rate": 0.0001, + "loss": 4.3393, + "loss/crossentropy": 2.0491825938224792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21345915645360947, + "step": 11690 + }, + { + "epoch": 0.23384, + "grad_norm": 2.015625, + "grad_norm_var": 0.02127863566080729, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 1.7594041228294373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004464149475098, + "step": 11692 + }, + { + "epoch": 0.23388, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020961252848307292, + "learning_rate": 0.0001, + "loss": 4.3645, + "loss/crossentropy": 2.0313411951065063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20986144989728928, + "step": 11694 + }, + { + "epoch": 0.23392, + "grad_norm": 2.109375, + "grad_norm_var": 0.016068522135416666, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 2.3580493927001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253532111644745, + "step": 11696 + }, + { + "epoch": 0.23396, + "grad_norm": 2.203125, + "grad_norm_var": 0.008548990885416666, + "learning_rate": 0.0001, + "loss": 4.3954, + "loss/crossentropy": 2.2036253213882446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22294757515192032, + "step": 11698 + }, + { + "epoch": 0.234, + "grad_norm": 2.03125, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 1.8575093150138855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19865535199642181, + "step": 11700 + }, + { + "epoch": 0.23404, + "grad_norm": 1.96875, + "grad_norm_var": 0.008571116129557292, + "learning_rate": 0.0001, + "loss": 4.2619, + "loss/crossentropy": 2.2169028520584106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22326484322547913, + "step": 11702 + }, + { + "epoch": 0.23408, + "grad_norm": 2.015625, + "grad_norm_var": 0.007500966389973958, + "learning_rate": 0.0001, + "loss": 4.4045, + "loss/crossentropy": 2.1897542476654053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097000628709793, + "step": 11704 + }, + { + "epoch": 0.23412, + "grad_norm": 2.015625, + "grad_norm_var": 0.0063168843587239586, + "learning_rate": 0.0001, + "loss": 4.271, + "loss/crossentropy": 2.214607834815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24112706631422043, + "step": 11706 + }, + { + "epoch": 0.23416, + "grad_norm": 2.046875, + "grad_norm_var": 0.004780832926432292, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 2.124355912208557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222435548901558, + "step": 11708 + }, + { + "epoch": 0.2342, + "grad_norm": 2.171875, + "grad_norm_var": 0.0051025390625, + "learning_rate": 0.0001, + "loss": 4.3118, + "loss/crossentropy": 2.35421621799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26120802760124207, + "step": 11710 + }, + { + "epoch": 0.23424, + "grad_norm": 2.0, + "grad_norm_var": 0.00521240234375, + "learning_rate": 0.0001, + "loss": 4.1781, + "loss/crossentropy": 1.9134620428085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830002963542938, + "step": 11712 + }, + { + "epoch": 0.23428, + "grad_norm": 2.078125, + "grad_norm_var": 0.003763580322265625, + "learning_rate": 0.0001, + "loss": 4.2445, + "loss/crossentropy": 1.9336887001991272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025599479675293, + "step": 11714 + }, + { + "epoch": 0.23432, + "grad_norm": 2.015625, + "grad_norm_var": 0.004133097330729167, + "learning_rate": 0.0001, + "loss": 4.0775, + "loss/crossentropy": 1.9964489936828613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020488828420639, + "step": 11716 + }, + { + "epoch": 0.23436, + "grad_norm": 2.078125, + "grad_norm_var": 0.00423583984375, + "learning_rate": 0.0001, + "loss": 4.264, + "loss/crossentropy": 2.108368992805481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20985107123851776, + "step": 11718 + }, + { + "epoch": 0.2344, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005008697509765625, + "learning_rate": 0.0001, + "loss": 3.9458, + "loss/crossentropy": 2.0314669013023376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20942936092615128, + "step": 11720 + }, + { + "epoch": 0.23444, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0054107666015625, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.343130350112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.237278014421463, + "step": 11722 + }, + { + "epoch": 0.23448, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010658518473307291, + "learning_rate": 0.0001, + "loss": 4.2497, + "loss/crossentropy": 2.1542173624038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22762110829353333, + "step": 11724 + }, + { + "epoch": 0.23452, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009395090738932292, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 1.638447105884552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1707247570157051, + "step": 11726 + }, + { + "epoch": 0.23456, + "grad_norm": 2.09375, + "grad_norm_var": 0.010573069254557291, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.1509228944778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091827318072319, + "step": 11728 + }, + { + "epoch": 0.2346, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011677805582682292, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.193585455417633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2697590962052345, + "step": 11730 + }, + { + "epoch": 0.23464, + "grad_norm": 2.109375, + "grad_norm_var": 0.012648264567057291, + "learning_rate": 0.0001, + "loss": 4.2239, + "loss/crossentropy": 2.2058298587799072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21958298981189728, + "step": 11732 + }, + { + "epoch": 0.23468, + "grad_norm": 2.078125, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 2.0816246271133423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916947722434998, + "step": 11734 + }, + { + "epoch": 0.23472, + "grad_norm": 2.03125, + "grad_norm_var": 0.0150543212890625, + "learning_rate": 0.0001, + "loss": 4.1495, + "loss/crossentropy": 1.838355541229248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20002590864896774, + "step": 11736 + }, + { + "epoch": 0.23476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016795857747395834, + "learning_rate": 0.0001, + "loss": 4.0559, + "loss/crossentropy": 2.1602721214294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21018436551094055, + "step": 11738 + }, + { + "epoch": 0.2348, + "grad_norm": 2.03125, + "grad_norm_var": 0.013181304931640625, + "learning_rate": 0.0001, + "loss": 4.5376, + "loss/crossentropy": 2.6967735290527344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23233920335769653, + "step": 11740 + }, + { + "epoch": 0.23484, + "grad_norm": 2.03125, + "grad_norm_var": 0.012715403238932292, + "learning_rate": 0.0001, + "loss": 3.9772, + "loss/crossentropy": 2.033313810825348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21240226924419403, + "step": 11742 + }, + { + "epoch": 0.23488, + "grad_norm": 1.96875, + "grad_norm_var": 0.012245432535807291, + "learning_rate": 0.0001, + "loss": 4.21, + "loss/crossentropy": 2.123607873916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207147017121315, + "step": 11744 + }, + { + "epoch": 0.23492, + "grad_norm": 1.8125, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.1121, + "loss/crossentropy": 2.1173813343048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21387682110071182, + "step": 11746 + }, + { + "epoch": 0.23496, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011456044514973958, + "learning_rate": 0.0001, + "loss": 4.1219, + "loss/crossentropy": 2.27209734916687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2254154533147812, + "step": 11748 + }, + { + "epoch": 0.235, + "grad_norm": 2.0625, + "grad_norm_var": 0.011156209309895833, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 1.844546616077423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740722581744194, + "step": 11750 + }, + { + "epoch": 0.23504, + "grad_norm": 2.078125, + "grad_norm_var": 0.009346516927083333, + "learning_rate": 0.0001, + "loss": 4.4615, + "loss/crossentropy": 2.341967821121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23638595640659332, + "step": 11752 + }, + { + "epoch": 0.23508, + "grad_norm": 2.09375, + "grad_norm_var": 0.009354400634765624, + "learning_rate": 0.0001, + "loss": 4.4113, + "loss/crossentropy": 2.2561213970184326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23169096559286118, + "step": 11754 + }, + { + "epoch": 0.23512, + "grad_norm": 1.96875, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 1.8693158030509949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997497648000717, + "step": 11756 + }, + { + "epoch": 0.23516, + "grad_norm": 1.953125, + "grad_norm_var": 0.0085845947265625, + "learning_rate": 0.0001, + "loss": 3.9721, + "loss/crossentropy": 2.2228434085845947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19779722392559052, + "step": 11758 + }, + { + "epoch": 0.2352, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007899729410807292, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.177064299583435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091558039188385, + "step": 11760 + }, + { + "epoch": 0.23524, + "grad_norm": 1.984375, + "grad_norm_var": 0.005020904541015625, + "learning_rate": 0.0001, + "loss": 4.398, + "loss/crossentropy": 2.2667617797851562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120228409767151, + "step": 11762 + }, + { + "epoch": 0.23528, + "grad_norm": 2.109375, + "grad_norm_var": 0.004713694254557292, + "learning_rate": 0.0001, + "loss": 4.198, + "loss/crossentropy": 2.0310307145118713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194480448961258, + "step": 11764 + }, + { + "epoch": 0.23532, + "grad_norm": 1.953125, + "grad_norm_var": 0.005033111572265625, + "learning_rate": 0.0001, + "loss": 3.9906, + "loss/crossentropy": 1.8481100797653198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181456953287125, + "step": 11766 + }, + { + "epoch": 0.23536, + "grad_norm": 2.140625, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.254691958427429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26685942709445953, + "step": 11768 + }, + { + "epoch": 0.2354, + "grad_norm": 1.921875, + "grad_norm_var": 0.007993316650390625, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 2.112728714942932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724663347005844, + "step": 11770 + }, + { + "epoch": 0.23544, + "grad_norm": 2.265625, + "grad_norm_var": 0.011195627848307292, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 2.0724143981933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20518633723258972, + "step": 11772 + }, + { + "epoch": 0.23548, + "grad_norm": 2.015625, + "grad_norm_var": 0.010064442952473959, + "learning_rate": 0.0001, + "loss": 4.1882, + "loss/crossentropy": 2.0743810534477234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181471511721611, + "step": 11774 + }, + { + "epoch": 0.23552, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011740875244140626, + "learning_rate": 0.0001, + "loss": 4.1121, + "loss/crossentropy": 2.02871835231781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22081361711025238, + "step": 11776 + }, + { + "epoch": 0.23556, + "grad_norm": 2.15625, + "grad_norm_var": 0.013185373942057292, + "learning_rate": 0.0001, + "loss": 4.5607, + "loss/crossentropy": 2.1898428201675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22084421664476395, + "step": 11778 + }, + { + "epoch": 0.2356, + "grad_norm": 2.0625, + "grad_norm_var": 0.012748209635416667, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.0893847346305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616562455892563, + "step": 11780 + }, + { + "epoch": 0.23564, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011769358317057292, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 1.8590916991233826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18936381489038467, + "step": 11782 + }, + { + "epoch": 0.23568, + "grad_norm": 2.296875, + "grad_norm_var": 0.012741851806640624, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.105339765548706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22275932878255844, + "step": 11784 + }, + { + "epoch": 0.23572, + "grad_norm": 2.140625, + "grad_norm_var": 0.011987050374348959, + "learning_rate": 0.0001, + "loss": 4.2585, + "loss/crossentropy": 2.1720080375671387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734921097755432, + "step": 11786 + }, + { + "epoch": 0.23576, + "grad_norm": 1.984375, + "grad_norm_var": 0.012640126546223958, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 2.2301958799362183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21865415573120117, + "step": 11788 + }, + { + "epoch": 0.2358, + "grad_norm": 2.1875, + "grad_norm_var": 0.013626861572265624, + "learning_rate": 0.0001, + "loss": 3.7518, + "loss/crossentropy": 1.588155210018158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19334131479263306, + "step": 11790 + }, + { + "epoch": 0.23584, + "grad_norm": 2.046875, + "grad_norm_var": 0.025248209635416668, + "learning_rate": 0.0001, + "loss": 4.4052, + "loss/crossentropy": 2.208239734172821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21055688709020615, + "step": 11792 + }, + { + "epoch": 0.23588, + "grad_norm": 1.9140625, + "grad_norm_var": 0.030368804931640625, + "learning_rate": 0.0001, + "loss": 4.1996, + "loss/crossentropy": 2.46909761428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225817009806633, + "step": 11794 + }, + { + "epoch": 0.23592, + "grad_norm": 1.984375, + "grad_norm_var": 0.03367691040039063, + "learning_rate": 0.0001, + "loss": 4.2792, + "loss/crossentropy": 2.022938549518585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232923865318298, + "step": 11796 + }, + { + "epoch": 0.23596, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03394953409830729, + "learning_rate": 0.0001, + "loss": 4.3949, + "loss/crossentropy": 2.333058714866638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23842789232730865, + "step": 11798 + }, + { + "epoch": 0.236, + "grad_norm": 2.015625, + "grad_norm_var": 0.0366363525390625, + "learning_rate": 0.0001, + "loss": 3.8698, + "loss/crossentropy": 1.9570570588111877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660093426704407, + "step": 11800 + }, + { + "epoch": 0.23604, + "grad_norm": 2.0, + "grad_norm_var": 0.0335601806640625, + "learning_rate": 0.0001, + "loss": 4.3649, + "loss/crossentropy": 2.2074697017669678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23766764998435974, + "step": 11802 + }, + { + "epoch": 0.23608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03026301066080729, + "learning_rate": 0.0001, + "loss": 4.4127, + "loss/crossentropy": 2.327863335609436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22967173904180527, + "step": 11804 + }, + { + "epoch": 0.23612, + "grad_norm": 2.03125, + "grad_norm_var": 0.02802734375, + "learning_rate": 0.0001, + "loss": 4.3367, + "loss/crossentropy": 2.135041356086731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21708428114652634, + "step": 11806 + }, + { + "epoch": 0.23616, + "grad_norm": 2.078125, + "grad_norm_var": 0.00665283203125, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 1.9111011624336243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20122328400611877, + "step": 11808 + }, + { + "epoch": 0.2362, + "grad_norm": 1.984375, + "grad_norm_var": 0.006436920166015625, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 2.0823878049850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23593349009752274, + "step": 11810 + }, + { + "epoch": 0.23624, + "grad_norm": 2.078125, + "grad_norm_var": 0.0069000244140625, + "learning_rate": 0.0001, + "loss": 3.9554, + "loss/crossentropy": 2.298948645591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22076356410980225, + "step": 11812 + }, + { + "epoch": 0.23628, + "grad_norm": 1.96875, + "grad_norm_var": 0.006483713785807292, + "learning_rate": 0.0001, + "loss": 4.1119, + "loss/crossentropy": 2.383033037185669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850034803152084, + "step": 11814 + }, + { + "epoch": 0.23632, + "grad_norm": 2.109375, + "grad_norm_var": 0.10857645670572917, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.004193425178528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27853623032569885, + "step": 11816 + }, + { + "epoch": 0.23636, + "grad_norm": 2.515625, + "grad_norm_var": 0.11877339680989583, + "learning_rate": 0.0001, + "loss": 4.4454, + "loss/crossentropy": 1.9110660552978516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773407608270645, + "step": 11818 + }, + { + "epoch": 0.2364, + "grad_norm": 2.140625, + "grad_norm_var": 0.11894505818684896, + "learning_rate": 0.0001, + "loss": 4.5428, + "loss/crossentropy": 2.300672471523285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23710736632347107, + "step": 11820 + }, + { + "epoch": 0.23644, + "grad_norm": 1.984375, + "grad_norm_var": 0.11943333943684896, + "learning_rate": 0.0001, + "loss": 4.1413, + "loss/crossentropy": 2.270769238471985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2182588130235672, + "step": 11822 + }, + { + "epoch": 0.23648, + "grad_norm": 2.078125, + "grad_norm_var": 0.11885960896809895, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.085016667842865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494660943746567, + "step": 11824 + }, + { + "epoch": 0.23652, + "grad_norm": 2.078125, + "grad_norm_var": 0.11644261678059896, + "learning_rate": 0.0001, + "loss": 4.5554, + "loss/crossentropy": 2.384607672691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22075054794549942, + "step": 11826 + }, + { + "epoch": 0.23656, + "grad_norm": 1.8984375, + "grad_norm_var": 0.11926167805989583, + "learning_rate": 0.0001, + "loss": 3.9076, + "loss/crossentropy": 2.1217936277389526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126227766275406, + "step": 11828 + }, + { + "epoch": 0.2366, + "grad_norm": 1.875, + "grad_norm_var": 0.12241185506184896, + "learning_rate": 0.0001, + "loss": 3.8285, + "loss/crossentropy": 2.1690168380737305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450890809297562, + "step": 11830 + }, + { + "epoch": 0.23664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.031998697916666666, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.2614429593086243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22817185521125793, + "step": 11832 + }, + { + "epoch": 0.23668, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023545074462890624, + "learning_rate": 0.0001, + "loss": 4.3074, + "loss/crossentropy": 2.177332043647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226592555642128, + "step": 11834 + }, + { + "epoch": 0.23672, + "grad_norm": 1.859375, + "grad_norm_var": 0.01765925089518229, + "learning_rate": 0.0001, + "loss": 4.0471, + "loss/crossentropy": 1.947661578655243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762884557247162, + "step": 11836 + }, + { + "epoch": 0.23676, + "grad_norm": 2.015625, + "grad_norm_var": 0.018184153238932292, + "learning_rate": 0.0001, + "loss": 3.9014, + "loss/crossentropy": 1.7112661004066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18696465343236923, + "step": 11838 + }, + { + "epoch": 0.2368, + "grad_norm": 2.078125, + "grad_norm_var": 0.017439524332682293, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 2.0478790402412415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21436551213264465, + "step": 11840 + }, + { + "epoch": 0.23684, + "grad_norm": 2.078125, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 4.1702, + "loss/crossentropy": 2.0541738867759705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23121927678585052, + "step": 11842 + }, + { + "epoch": 0.23688, + "grad_norm": 2.109375, + "grad_norm_var": 0.01876805623372396, + "learning_rate": 0.0001, + "loss": 4.208, + "loss/crossentropy": 2.04353004693985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22237974405288696, + "step": 11844 + }, + { + "epoch": 0.23692, + "grad_norm": 2.1875, + "grad_norm_var": 0.017488606770833335, + "learning_rate": 0.0001, + "loss": 4.4641, + "loss/crossentropy": 2.110148549079895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23324476927518845, + "step": 11846 + }, + { + "epoch": 0.23696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018623860677083333, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 2.207589864730835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209608756005764, + "step": 11848 + }, + { + "epoch": 0.237, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012180328369140625, + "learning_rate": 0.0001, + "loss": 4.1896, + "loss/crossentropy": 2.2447429895401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016080766916275, + "step": 11850 + }, + { + "epoch": 0.23704, + "grad_norm": 2.0625, + "grad_norm_var": 0.009771474202473958, + "learning_rate": 0.0001, + "loss": 4.3133, + "loss/crossentropy": 2.3349474668502808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105249762535095, + "step": 11852 + }, + { + "epoch": 0.23708, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008024088541666667, + "learning_rate": 0.0001, + "loss": 4.1091, + "loss/crossentropy": 1.8444748520851135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993526816368103, + "step": 11854 + }, + { + "epoch": 0.23712, + "grad_norm": 2.109375, + "grad_norm_var": 0.009323883056640624, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 1.9181615710258484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698236346244812, + "step": 11856 + }, + { + "epoch": 0.23716, + "grad_norm": 2.15625, + "grad_norm_var": 0.009627024332682291, + "learning_rate": 0.0001, + "loss": 4.1454, + "loss/crossentropy": 2.1930960416793823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122444212436676, + "step": 11858 + }, + { + "epoch": 0.2372, + "grad_norm": 2.171875, + "grad_norm_var": 0.009439849853515625, + "learning_rate": 0.0001, + "loss": 4.3177, + "loss/crossentropy": 1.7835432887077332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20900961011648178, + "step": 11860 + }, + { + "epoch": 0.23724, + "grad_norm": 2.15625, + "grad_norm_var": 0.011775461832682292, + "learning_rate": 0.0001, + "loss": 4.6362, + "loss/crossentropy": 2.1839439868927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21574077755212784, + "step": 11862 + }, + { + "epoch": 0.23728, + "grad_norm": 1.96875, + "grad_norm_var": 0.010791015625, + "learning_rate": 0.0001, + "loss": 3.8916, + "loss/crossentropy": 1.9817007184028625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773921698331833, + "step": 11864 + }, + { + "epoch": 0.23732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008923085530598958, + "learning_rate": 0.0001, + "loss": 4.2065, + "loss/crossentropy": 2.017501652240753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247873619198799, + "step": 11866 + }, + { + "epoch": 0.23736, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010992177327473958, + "learning_rate": 0.0001, + "loss": 3.8813, + "loss/crossentropy": 2.0779114961624146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21136770397424698, + "step": 11868 + }, + { + "epoch": 0.2374, + "grad_norm": 2.109375, + "grad_norm_var": 0.010497029622395833, + "learning_rate": 0.0001, + "loss": 4.341, + "loss/crossentropy": 2.3987231254577637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326417714357376, + "step": 11870 + }, + { + "epoch": 0.23744, + "grad_norm": 2.0625, + "grad_norm_var": 0.008975982666015625, + "learning_rate": 0.0001, + "loss": 4.0466, + "loss/crossentropy": 1.7145346999168396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222228974103928, + "step": 11872 + }, + { + "epoch": 0.23748, + "grad_norm": 2.015625, + "grad_norm_var": 0.008829498291015625, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.378560423851013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374844759702682, + "step": 11874 + }, + { + "epoch": 0.23752, + "grad_norm": 2.0625, + "grad_norm_var": 0.0073626200358072914, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 1.951455295085907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2217455804347992, + "step": 11876 + }, + { + "epoch": 0.23756, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00513916015625, + "learning_rate": 0.0001, + "loss": 3.8867, + "loss/crossentropy": 1.9309074878692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100483626127243, + "step": 11878 + }, + { + "epoch": 0.2376, + "grad_norm": 2.015625, + "grad_norm_var": 0.00445556640625, + "learning_rate": 0.0001, + "loss": 4.102, + "loss/crossentropy": 2.2295292615890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196313813328743, + "step": 11880 + }, + { + "epoch": 0.23764, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004964192708333333, + "learning_rate": 0.0001, + "loss": 4.0514, + "loss/crossentropy": 2.0665449500083923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20752248913049698, + "step": 11882 + }, + { + "epoch": 0.23768, + "grad_norm": 2.1875, + "grad_norm_var": 0.00560302734375, + "learning_rate": 0.0001, + "loss": 4.4022, + "loss/crossentropy": 1.9645958542823792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391641557216644, + "step": 11884 + }, + { + "epoch": 0.23772, + "grad_norm": 1.984375, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.1698, + "loss/crossentropy": 2.1280174255371094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21947231888771057, + "step": 11886 + }, + { + "epoch": 0.23776, + "grad_norm": 2.03125, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.2043, + "loss/crossentropy": 1.9462851285934448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961178332567215, + "step": 11888 + }, + { + "epoch": 0.2378, + "grad_norm": 2.203125, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 4.4846, + "loss/crossentropy": 2.249086618423462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22473402321338654, + "step": 11890 + }, + { + "epoch": 0.23784, + "grad_norm": 1.9375, + "grad_norm_var": 0.010472615559895834, + "learning_rate": 0.0001, + "loss": 4.1368, + "loss/crossentropy": 2.1972378492355347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164871245622635, + "step": 11892 + }, + { + "epoch": 0.23788, + "grad_norm": 2.15625, + "grad_norm_var": 0.009409332275390625, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 2.0067209601402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20195162296295166, + "step": 11894 + }, + { + "epoch": 0.23792, + "grad_norm": 2.03125, + "grad_norm_var": 0.009673817952473959, + "learning_rate": 0.0001, + "loss": 4.1767, + "loss/crossentropy": 1.9103696942329407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19616805016994476, + "step": 11896 + }, + { + "epoch": 0.23796, + "grad_norm": 1.984375, + "grad_norm_var": 0.011595662434895833, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 2.1524049639701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107815951108932, + "step": 11898 + }, + { + "epoch": 0.238, + "grad_norm": 2.0, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 2.2112287878990173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099093720316887, + "step": 11900 + }, + { + "epoch": 0.23804, + "grad_norm": 2.390625, + "grad_norm_var": 0.8914347330729167, + "learning_rate": 0.0001, + "loss": 4.647, + "loss/crossentropy": 2.2945204973220825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2713918164372444, + "step": 11902 + }, + { + "epoch": 0.23808, + "grad_norm": 2.078125, + "grad_norm_var": 0.8781575520833333, + "learning_rate": 0.0001, + "loss": 4.4699, + "loss/crossentropy": 2.1838968992233276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640438377857208, + "step": 11904 + }, + { + "epoch": 0.23812, + "grad_norm": 2.0, + "grad_norm_var": 0.89010009765625, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.9089699983596802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19553899765014648, + "step": 11906 + }, + { + "epoch": 0.23816, + "grad_norm": 2.03125, + "grad_norm_var": 0.8873443603515625, + "learning_rate": 0.0001, + "loss": 4.2573, + "loss/crossentropy": 2.105385661125183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528839319944382, + "step": 11908 + }, + { + "epoch": 0.2382, + "grad_norm": 2.015625, + "grad_norm_var": 0.8921946207682292, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 1.565223515033722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765602469444275, + "step": 11910 + }, + { + "epoch": 0.23824, + "grad_norm": 2.078125, + "grad_norm_var": 0.8919016520182291, + "learning_rate": 0.0001, + "loss": 4.3014, + "loss/crossentropy": 2.220748543739319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21904101967811584, + "step": 11912 + }, + { + "epoch": 0.23828, + "grad_norm": 2.15625, + "grad_norm_var": 0.889306640625, + "learning_rate": 0.0001, + "loss": 4.2965, + "loss/crossentropy": 2.294031500816345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21700909733772278, + "step": 11914 + }, + { + "epoch": 0.23832, + "grad_norm": 2.171875, + "grad_norm_var": 0.8795237223307292, + "learning_rate": 0.0001, + "loss": 4.3439, + "loss/crossentropy": 2.072917103767395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462248474359512, + "step": 11916 + }, + { + "epoch": 0.23836, + "grad_norm": 2.03125, + "grad_norm_var": 0.00533447265625, + "learning_rate": 0.0001, + "loss": 4.1843, + "loss/crossentropy": 2.0541720390319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127074897289276, + "step": 11918 + }, + { + "epoch": 0.2384, + "grad_norm": 2.109375, + "grad_norm_var": 0.0046539306640625, + "learning_rate": 0.0001, + "loss": 3.8922, + "loss/crossentropy": 1.9873629808425903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19215020537376404, + "step": 11920 + }, + { + "epoch": 0.23844, + "grad_norm": 2.09375, + "grad_norm_var": 0.0032135009765625, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.447067141532898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2318919375538826, + "step": 11922 + }, + { + "epoch": 0.23848, + "grad_norm": 2.0, + "grad_norm_var": 0.0038157145182291666, + "learning_rate": 0.0001, + "loss": 3.8756, + "loss/crossentropy": 1.9918025732040405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865648984909058, + "step": 11924 + }, + { + "epoch": 0.23852, + "grad_norm": 2.046875, + "grad_norm_var": 0.0026041666666666665, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.0803070068359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21753622591495514, + "step": 11926 + }, + { + "epoch": 0.23856, + "grad_norm": 1.9921875, + "grad_norm_var": 0.002976226806640625, + "learning_rate": 0.0001, + "loss": 4.1828, + "loss/crossentropy": 2.118411421775818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21741919964551926, + "step": 11928 + }, + { + "epoch": 0.2386, + "grad_norm": 1.9375, + "grad_norm_var": 0.0034075419108072916, + "learning_rate": 0.0001, + "loss": 4.0056, + "loss/crossentropy": 1.8924900889396667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609601587057114, + "step": 11930 + }, + { + "epoch": 0.23864, + "grad_norm": 2.046875, + "grad_norm_var": 0.002418772379557292, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 1.743731439113617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19873473048210144, + "step": 11932 + }, + { + "epoch": 0.23868, + "grad_norm": 2.015625, + "grad_norm_var": 0.002929433186848958, + "learning_rate": 0.0001, + "loss": 4.2564, + "loss/crossentropy": 2.291381061077118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248165518045425, + "step": 11934 + }, + { + "epoch": 0.23872, + "grad_norm": 2.109375, + "grad_norm_var": 0.007452138264973958, + "learning_rate": 0.0001, + "loss": 4.3938, + "loss/crossentropy": 1.7672501802444458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19201595336198807, + "step": 11936 + }, + { + "epoch": 0.23876, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008162434895833333, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.8719280362129211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960783526301384, + "step": 11938 + }, + { + "epoch": 0.2388, + "grad_norm": 2.28125, + "grad_norm_var": 0.010978190104166667, + "learning_rate": 0.0001, + "loss": 4.3345, + "loss/crossentropy": 1.8103876113891602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964937001466751, + "step": 11940 + }, + { + "epoch": 0.23884, + "grad_norm": 2.03125, + "grad_norm_var": 0.011901601155598959, + "learning_rate": 0.0001, + "loss": 4.2134, + "loss/crossentropy": 1.8743855953216553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19404225796461105, + "step": 11942 + }, + { + "epoch": 0.23888, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.2379, + "loss/crossentropy": 2.3096803426742554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23256508260965347, + "step": 11944 + }, + { + "epoch": 0.23892, + "grad_norm": 2.109375, + "grad_norm_var": 0.012123362223307291, + "learning_rate": 0.0001, + "loss": 4.5089, + "loss/crossentropy": 2.207027554512024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20778407156467438, + "step": 11946 + }, + { + "epoch": 0.23896, + "grad_norm": 2.015625, + "grad_norm_var": 0.011671702067057291, + "learning_rate": 0.0001, + "loss": 4.0423, + "loss/crossentropy": 1.845999002456665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381015002727509, + "step": 11948 + }, + { + "epoch": 0.239, + "grad_norm": 2.03125, + "grad_norm_var": 0.011628977457682292, + "learning_rate": 0.0001, + "loss": 4.1054, + "loss/crossentropy": 1.677983045578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17493421584367752, + "step": 11950 + }, + { + "epoch": 0.23904, + "grad_norm": 2.140625, + "grad_norm_var": 0.008107248942057292, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 2.1798466444015503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22395183145999908, + "step": 11952 + }, + { + "epoch": 0.23908, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013525390625, + "learning_rate": 0.0001, + "loss": 4.0162, + "loss/crossentropy": 2.008498191833496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069375425577164, + "step": 11954 + }, + { + "epoch": 0.23912, + "grad_norm": 2.140625, + "grad_norm_var": 0.014546712239583334, + "learning_rate": 0.0001, + "loss": 4.3736, + "loss/crossentropy": 2.1712071895599365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21842175722122192, + "step": 11956 + }, + { + "epoch": 0.23916, + "grad_norm": 2.578125, + "grad_norm_var": 0.03242365519205729, + "learning_rate": 0.0001, + "loss": 4.1338, + "loss/crossentropy": 2.2769562005996704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22064895182847977, + "step": 11958 + }, + { + "epoch": 0.2392, + "grad_norm": 2.09375, + "grad_norm_var": 0.032572428385416664, + "learning_rate": 0.0001, + "loss": 4.3401, + "loss/crossentropy": 2.127632260322571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139139398932457, + "step": 11960 + }, + { + "epoch": 0.23924, + "grad_norm": 2.046875, + "grad_norm_var": 0.03223368326822917, + "learning_rate": 0.0001, + "loss": 4.3547, + "loss/crossentropy": 2.2687970399856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23024404793977737, + "step": 11962 + }, + { + "epoch": 0.23928, + "grad_norm": 2.0625, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.3532, + "loss/crossentropy": 2.2271196246147156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22528529912233353, + "step": 11964 + }, + { + "epoch": 0.23932, + "grad_norm": 2.03125, + "grad_norm_var": 0.032136027018229166, + "learning_rate": 0.0001, + "loss": 4.1898, + "loss/crossentropy": 2.3209941387176514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150042116641998, + "step": 11966 + }, + { + "epoch": 0.23936, + "grad_norm": 2.046875, + "grad_norm_var": 0.03208719889322917, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9936136603355408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22582116723060608, + "step": 11968 + }, + { + "epoch": 0.2394, + "grad_norm": 2.03125, + "grad_norm_var": 0.024055989583333333, + "learning_rate": 0.0001, + "loss": 3.9045, + "loss/crossentropy": 1.7509311437606812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19028881192207336, + "step": 11970 + }, + { + "epoch": 0.23944, + "grad_norm": 2.140625, + "grad_norm_var": 0.02213134765625, + "learning_rate": 0.0001, + "loss": 4.2365, + "loss/crossentropy": 2.2120620012283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23357221484184265, + "step": 11972 + }, + { + "epoch": 0.23948, + "grad_norm": 2.109375, + "grad_norm_var": 0.005686187744140625, + "learning_rate": 0.0001, + "loss": 4.0129, + "loss/crossentropy": 1.794329285621643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17914240062236786, + "step": 11974 + }, + { + "epoch": 0.23952, + "grad_norm": 2.03125, + "grad_norm_var": 0.010550689697265626, + "learning_rate": 0.0001, + "loss": 4.1347, + "loss/crossentropy": 2.1647136211395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21009670197963715, + "step": 11976 + }, + { + "epoch": 0.23956, + "grad_norm": 2.046875, + "grad_norm_var": 0.010660552978515625, + "learning_rate": 0.0001, + "loss": 4.2365, + "loss/crossentropy": 1.674091637134552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19617585837841034, + "step": 11978 + }, + { + "epoch": 0.2396, + "grad_norm": 2.015625, + "grad_norm_var": 0.011739095052083334, + "learning_rate": 0.0001, + "loss": 4.2474, + "loss/crossentropy": 1.9591755867004395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18309858441352844, + "step": 11980 + }, + { + "epoch": 0.23964, + "grad_norm": 2.34375, + "grad_norm_var": 0.017862955729166668, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 2.0550093054771423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22483228147029877, + "step": 11982 + }, + { + "epoch": 0.23968, + "grad_norm": 2.203125, + "grad_norm_var": 0.019291178385416666, + "learning_rate": 0.0001, + "loss": 4.3937, + "loss/crossentropy": 2.264032781124115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101321816444397, + "step": 11984 + }, + { + "epoch": 0.23972, + "grad_norm": 2.015625, + "grad_norm_var": 0.020213826497395834, + "learning_rate": 0.0001, + "loss": 4.4341, + "loss/crossentropy": 2.06082820892334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090815082192421, + "step": 11986 + }, + { + "epoch": 0.23976, + "grad_norm": 2.0, + "grad_norm_var": 0.0197662353515625, + "learning_rate": 0.0001, + "loss": 4.4159, + "loss/crossentropy": 2.248009443283081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22642739117145538, + "step": 11988 + }, + { + "epoch": 0.2398, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02211278279622396, + "learning_rate": 0.0001, + "loss": 3.9601, + "loss/crossentropy": 1.797426462173462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18600185215473175, + "step": 11990 + }, + { + "epoch": 0.23984, + "grad_norm": 2.015625, + "grad_norm_var": 0.01573460896809896, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.5529314279556274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23957456648349762, + "step": 11992 + }, + { + "epoch": 0.23988, + "grad_norm": 2.125, + "grad_norm_var": 0.015933990478515625, + "learning_rate": 0.0001, + "loss": 4.2971, + "loss/crossentropy": 1.9974916577339172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23130090534687042, + "step": 11994 + }, + { + "epoch": 0.23992, + "grad_norm": 2.046875, + "grad_norm_var": 0.014679972330729167, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.9714577794075012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969093456864357, + "step": 11996 + }, + { + "epoch": 0.23996, + "grad_norm": 2.25, + "grad_norm_var": 0.011055501302083333, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 1.8047285079956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191773235797882, + "step": 11998 + }, + { + "epoch": 0.24, + "grad_norm": 2.09375, + "grad_norm_var": 0.009187825520833333, + "learning_rate": 0.0001, + "loss": 4.4078, + "loss/crossentropy": 1.9240365028381348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19021467864513397, + "step": 12000 + }, + { + "epoch": 0.24004, + "grad_norm": 2.125, + "grad_norm_var": 0.009041086832682291, + "learning_rate": 0.0001, + "loss": 4.1657, + "loss/crossentropy": 1.6503748297691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18838192522525787, + "step": 12002 + }, + { + "epoch": 0.24008, + "grad_norm": 2.109375, + "grad_norm_var": 0.009368642171223959, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.238506555557251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139800265431404, + "step": 12004 + }, + { + "epoch": 0.24012, + "grad_norm": 1.890625, + "grad_norm_var": 0.008695220947265625, + "learning_rate": 0.0001, + "loss": 4.0978, + "loss/crossentropy": 1.8623422384262085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19039735198020935, + "step": 12006 + }, + { + "epoch": 0.24016, + "grad_norm": 2.109375, + "grad_norm_var": 0.0096588134765625, + "learning_rate": 0.0001, + "loss": 3.9741, + "loss/crossentropy": 1.975899577140808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20808710902929306, + "step": 12008 + }, + { + "epoch": 0.2402, + "grad_norm": 2.15625, + "grad_norm_var": 0.0108642578125, + "learning_rate": 0.0001, + "loss": 4.4332, + "loss/crossentropy": 2.2959831953048706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23725779354572296, + "step": 12010 + }, + { + "epoch": 0.24024, + "grad_norm": 1.8515625, + "grad_norm_var": 0.015130360921223959, + "learning_rate": 0.0001, + "loss": 3.9117, + "loss/crossentropy": 2.1626380681991577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068287953734398, + "step": 12012 + }, + { + "epoch": 0.24028, + "grad_norm": 2.15625, + "grad_norm_var": 0.013474273681640624, + "learning_rate": 0.0001, + "loss": 4.1614, + "loss/crossentropy": 1.925924837589264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097126841545105, + "step": 12014 + }, + { + "epoch": 0.24032, + "grad_norm": 2.046875, + "grad_norm_var": 0.018302154541015626, + "learning_rate": 0.0001, + "loss": 4.5545, + "loss/crossentropy": 2.147459626197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2345646619796753, + "step": 12016 + }, + { + "epoch": 0.24036, + "grad_norm": 1.8359375, + "grad_norm_var": 0.021418253580729168, + "learning_rate": 0.0001, + "loss": 4.0219, + "loss/crossentropy": 2.052124857902527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058626338839531, + "step": 12018 + }, + { + "epoch": 0.2404, + "grad_norm": 1.8828125, + "grad_norm_var": 0.021329752604166665, + "learning_rate": 0.0001, + "loss": 3.8619, + "loss/crossentropy": 2.377007842063904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124926745891571, + "step": 12020 + }, + { + "epoch": 0.24044, + "grad_norm": 2.0, + "grad_norm_var": 0.020173136393229166, + "learning_rate": 0.0001, + "loss": 3.9542, + "loss/crossentropy": 1.9117819666862488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976870447397232, + "step": 12022 + }, + { + "epoch": 0.24048, + "grad_norm": 2.09375, + "grad_norm_var": 0.021478017171223957, + "learning_rate": 0.0001, + "loss": 4.5771, + "loss/crossentropy": 2.439339756965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178436815738678, + "step": 12024 + }, + { + "epoch": 0.24052, + "grad_norm": 2.140625, + "grad_norm_var": 0.02029596964518229, + "learning_rate": 0.0001, + "loss": 4.1182, + "loss/crossentropy": 1.6923209428787231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19288206100463867, + "step": 12026 + }, + { + "epoch": 0.24056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016434478759765624, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 2.071919083595276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21092405915260315, + "step": 12028 + }, + { + "epoch": 0.2406, + "grad_norm": 1.984375, + "grad_norm_var": 0.016993967692057292, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 1.994953691959381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19953829050064087, + "step": 12030 + }, + { + "epoch": 0.24064, + "grad_norm": 2.109375, + "grad_norm_var": 0.011201731363932292, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.0545560121536255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20861412584781647, + "step": 12032 + }, + { + "epoch": 0.24068, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0103424072265625, + "learning_rate": 0.0001, + "loss": 4.0859, + "loss/crossentropy": 2.0211291909217834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146567404270172, + "step": 12034 + }, + { + "epoch": 0.24072, + "grad_norm": 2.0625, + "grad_norm_var": 0.012312825520833333, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.165773868560791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522508561611176, + "step": 12036 + }, + { + "epoch": 0.24076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013216145833333333, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.0272024273872375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012138038873672, + "step": 12038 + }, + { + "epoch": 0.2408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011946360270182291, + "learning_rate": 0.0001, + "loss": 4.2419, + "loss/crossentropy": 1.9311429262161255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19755128771066666, + "step": 12040 + }, + { + "epoch": 0.24084, + "grad_norm": 2.0625, + "grad_norm_var": 0.012086741129557292, + "learning_rate": 0.0001, + "loss": 4.1857, + "loss/crossentropy": 2.0740894079208374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651141554117203, + "step": 12042 + }, + { + "epoch": 0.24088, + "grad_norm": 1.96875, + "grad_norm_var": 0.012876129150390625, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 2.215203881263733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21735452860593796, + "step": 12044 + }, + { + "epoch": 0.24092, + "grad_norm": 2.015625, + "grad_norm_var": 0.02371190388997396, + "learning_rate": 0.0001, + "loss": 4.3755, + "loss/crossentropy": 1.9994693994522095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001128643751144, + "step": 12046 + }, + { + "epoch": 0.24096, + "grad_norm": 2.15625, + "grad_norm_var": 0.025187174479166668, + "learning_rate": 0.0001, + "loss": 4.1145, + "loss/crossentropy": 2.195701003074646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777013897895813, + "step": 12048 + }, + { + "epoch": 0.241, + "grad_norm": 2.109375, + "grad_norm_var": 0.02474950154622396, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.1975715160369873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987678855657578, + "step": 12050 + }, + { + "epoch": 0.24104, + "grad_norm": 1.953125, + "grad_norm_var": 0.02188695271809896, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 2.225023865699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24454358220100403, + "step": 12052 + }, + { + "epoch": 0.24108, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022200520833333334, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.892149806022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110423520207405, + "step": 12054 + }, + { + "epoch": 0.24112, + "grad_norm": 1.8984375, + "grad_norm_var": 0.023579915364583332, + "learning_rate": 0.0001, + "loss": 4.1604, + "loss/crossentropy": 1.9996158480644226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821269765496254, + "step": 12056 + }, + { + "epoch": 0.24116, + "grad_norm": 2.109375, + "grad_norm_var": 0.02569580078125, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.0965115427970886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140883505344391, + "step": 12058 + }, + { + "epoch": 0.2412, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0256500244140625, + "learning_rate": 0.0001, + "loss": 3.9494, + "loss/crossentropy": 1.9169449210166931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034926936030388, + "step": 12060 + }, + { + "epoch": 0.24124, + "grad_norm": 2.0625, + "grad_norm_var": 0.015135701497395833, + "learning_rate": 0.0001, + "loss": 4.5, + "loss/crossentropy": 2.3266680240631104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314038455486298, + "step": 12062 + }, + { + "epoch": 0.24128, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013509114583333334, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.0337759256362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842785388231277, + "step": 12064 + }, + { + "epoch": 0.24132, + "grad_norm": 1.828125, + "grad_norm_var": 0.013618977864583333, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 1.9502894878387451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812899112701416, + "step": 12066 + }, + { + "epoch": 0.24136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013822174072265625, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.13326895236969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21872679144144058, + "step": 12068 + }, + { + "epoch": 0.2414, + "grad_norm": 2.125, + "grad_norm_var": 0.013304646809895833, + "learning_rate": 0.0001, + "loss": 4.3594, + "loss/crossentropy": 2.0836809873580933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212929405272007, + "step": 12070 + }, + { + "epoch": 0.24144, + "grad_norm": 2.03125, + "grad_norm_var": 0.012516276041666666, + "learning_rate": 0.0001, + "loss": 4.0467, + "loss/crossentropy": 2.207913398742676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640508085489273, + "step": 12072 + }, + { + "epoch": 0.24148, + "grad_norm": 2.03125, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 2.0770451426506042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633003860712051, + "step": 12074 + }, + { + "epoch": 0.24152, + "grad_norm": 2.0625, + "grad_norm_var": 0.010786946614583333, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 2.0042858719825745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523608148097992, + "step": 12076 + }, + { + "epoch": 0.24156, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010141754150390625, + "learning_rate": 0.0001, + "loss": 4.2755, + "loss/crossentropy": 1.8673237562179565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19267796725034714, + "step": 12078 + }, + { + "epoch": 0.2416, + "grad_norm": 2.046875, + "grad_norm_var": 0.013793690999348959, + "learning_rate": 0.0001, + "loss": 3.9766, + "loss/crossentropy": 2.0686238408088684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344990700483322, + "step": 12080 + }, + { + "epoch": 0.24164, + "grad_norm": 2.03125, + "grad_norm_var": 0.009504954020182291, + "learning_rate": 0.0001, + "loss": 4.0366, + "loss/crossentropy": 2.0069685578346252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971682757139206, + "step": 12082 + }, + { + "epoch": 0.24168, + "grad_norm": 2.0625, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 2.2052754163742065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646511763334274, + "step": 12084 + }, + { + "epoch": 0.24172, + "grad_norm": 2.078125, + "grad_norm_var": 0.008983357747395834, + "learning_rate": 0.0001, + "loss": 4.1362, + "loss/crossentropy": 2.0343621373176575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19575881958007812, + "step": 12086 + }, + { + "epoch": 0.24176, + "grad_norm": 1.890625, + "grad_norm_var": 0.009894816080729167, + "learning_rate": 0.0001, + "loss": 3.9695, + "loss/crossentropy": 2.2500641345977783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23151995986700058, + "step": 12088 + }, + { + "epoch": 0.2418, + "grad_norm": 2.421875, + "grad_norm_var": 0.0375640869140625, + "learning_rate": 0.0001, + "loss": 4.4361, + "loss/crossentropy": 2.2596821784973145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22305716574192047, + "step": 12090 + }, + { + "epoch": 0.24184, + "grad_norm": 1.9375, + "grad_norm_var": 0.03715184529622396, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 2.0940088033676147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23409207165241241, + "step": 12092 + }, + { + "epoch": 0.24188, + "grad_norm": 2.046875, + "grad_norm_var": 0.03586018880208333, + "learning_rate": 0.0001, + "loss": 3.9715, + "loss/crossentropy": 1.6980834603309631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339510053396225, + "step": 12094 + }, + { + "epoch": 0.24192, + "grad_norm": 2.09375, + "grad_norm_var": 0.03047459920247396, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.1921679973602295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085387468338013, + "step": 12096 + }, + { + "epoch": 0.24196, + "grad_norm": 2.34375, + "grad_norm_var": 0.03551839192708333, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 2.18610817193985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332247570157051, + "step": 12098 + }, + { + "epoch": 0.242, + "grad_norm": 2.203125, + "grad_norm_var": 0.038852691650390625, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 2.08358097076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21169763058423996, + "step": 12100 + }, + { + "epoch": 0.24204, + "grad_norm": 2.140625, + "grad_norm_var": 0.038913726806640625, + "learning_rate": 0.0001, + "loss": 4.2339, + "loss/crossentropy": 2.1001542806625366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132650762796402, + "step": 12102 + }, + { + "epoch": 0.24208, + "grad_norm": 2.109375, + "grad_norm_var": 0.033614095052083334, + "learning_rate": 0.0001, + "loss": 4.2051, + "loss/crossentropy": 1.9031851887702942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042345404624939, + "step": 12104 + }, + { + "epoch": 0.24212, + "grad_norm": 1.96875, + "grad_norm_var": 0.0127838134765625, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.670085072517395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878880336880684, + "step": 12106 + }, + { + "epoch": 0.24216, + "grad_norm": 2.09375, + "grad_norm_var": 0.012914784749348958, + "learning_rate": 0.0001, + "loss": 4.3395, + "loss/crossentropy": 1.9557109475135803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211412936449051, + "step": 12108 + }, + { + "epoch": 0.2422, + "grad_norm": 1.96875, + "grad_norm_var": 0.013637034098307292, + "learning_rate": 0.0001, + "loss": 4.0107, + "loss/crossentropy": 2.085852086544037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336784213781357, + "step": 12110 + }, + { + "epoch": 0.24224, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015721638997395832, + "learning_rate": 0.0001, + "loss": 4.0321, + "loss/crossentropy": 2.1064602732658386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096766158938408, + "step": 12112 + }, + { + "epoch": 0.24228, + "grad_norm": 2.109375, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 0.0001, + "loss": 4.2998, + "loss/crossentropy": 2.381577968597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23788201808929443, + "step": 12114 + }, + { + "epoch": 0.24232, + "grad_norm": 1.96875, + "grad_norm_var": 0.006758626302083333, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 1.7200234532356262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18620850890874863, + "step": 12116 + }, + { + "epoch": 0.24236, + "grad_norm": 2.1875, + "grad_norm_var": 0.007697550455729166, + "learning_rate": 0.0001, + "loss": 4.2672, + "loss/crossentropy": 1.9106165170669556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456392109394073, + "step": 12118 + }, + { + "epoch": 0.2424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007106272379557291, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 1.9852410554885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1927378550171852, + "step": 12120 + }, + { + "epoch": 0.24244, + "grad_norm": 1.96875, + "grad_norm_var": 0.007991282145182292, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 1.4948370456695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1698828637599945, + "step": 12122 + }, + { + "epoch": 0.24248, + "grad_norm": 1.9375, + "grad_norm_var": 0.0078857421875, + "learning_rate": 0.0001, + "loss": 4.2874, + "loss/crossentropy": 1.956885814666748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19851476699113846, + "step": 12124 + }, + { + "epoch": 0.24252, + "grad_norm": 2.171875, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 3.8709, + "loss/crossentropy": 1.9052257537841797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18438522517681122, + "step": 12126 + }, + { + "epoch": 0.24256, + "grad_norm": 2.1875, + "grad_norm_var": 0.008430735270182291, + "learning_rate": 0.0001, + "loss": 4.4347, + "loss/crossentropy": 2.353670358657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22648276388645172, + "step": 12128 + }, + { + "epoch": 0.2426, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01263427734375, + "learning_rate": 0.0001, + "loss": 3.9896, + "loss/crossentropy": 1.714030683040619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19275055080652237, + "step": 12130 + }, + { + "epoch": 0.24264, + "grad_norm": 2.046875, + "grad_norm_var": 0.012898763020833334, + "learning_rate": 0.0001, + "loss": 4.2981, + "loss/crossentropy": 2.0048200488090515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478381216526031, + "step": 12132 + }, + { + "epoch": 0.24268, + "grad_norm": 2.125, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 4.0497, + "loss/crossentropy": 1.7151115536689758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19872941821813583, + "step": 12134 + }, + { + "epoch": 0.24272, + "grad_norm": 2.109375, + "grad_norm_var": 0.011818186442057291, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.1646838188171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22345983982086182, + "step": 12136 + }, + { + "epoch": 0.24276, + "grad_norm": 2.046875, + "grad_norm_var": 0.010807037353515625, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 2.053311765193939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143322378396988, + "step": 12138 + }, + { + "epoch": 0.2428, + "grad_norm": 2.015625, + "grad_norm_var": 0.009993235270182291, + "learning_rate": 0.0001, + "loss": 3.8805, + "loss/crossentropy": 2.0471617579460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450014621019363, + "step": 12140 + }, + { + "epoch": 0.24284, + "grad_norm": 2.1875, + "grad_norm_var": 0.008737945556640625, + "learning_rate": 0.0001, + "loss": 4.2966, + "loss/crossentropy": 2.1727033853530884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22905820608139038, + "step": 12142 + }, + { + "epoch": 0.24288, + "grad_norm": 2.09375, + "grad_norm_var": 0.008245595296223958, + "learning_rate": 0.0001, + "loss": 4.3861, + "loss/crossentropy": 2.197450280189514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059273719787598, + "step": 12144 + }, + { + "epoch": 0.24292, + "grad_norm": 2.171875, + "grad_norm_var": 0.003413899739583333, + "learning_rate": 0.0001, + "loss": 4.4841, + "loss/crossentropy": 1.9954137206077576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22441548854112625, + "step": 12146 + }, + { + "epoch": 0.24296, + "grad_norm": 2.0, + "grad_norm_var": 0.005110677083333333, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 2.3393132090568542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949577867984772, + "step": 12148 + }, + { + "epoch": 0.243, + "grad_norm": 1.953125, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.11602646112442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19181709736585617, + "step": 12150 + }, + { + "epoch": 0.24304, + "grad_norm": 2.078125, + "grad_norm_var": 0.005597941080729167, + "learning_rate": 0.0001, + "loss": 4.1381, + "loss/crossentropy": 2.130657136440277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20904043316841125, + "step": 12152 + }, + { + "epoch": 0.24308, + "grad_norm": 2.078125, + "grad_norm_var": 0.0053293863932291664, + "learning_rate": 0.0001, + "loss": 4.2983, + "loss/crossentropy": 2.3923556804656982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23029828071594238, + "step": 12154 + }, + { + "epoch": 0.24312, + "grad_norm": 1.875, + "grad_norm_var": 0.007209269205729166, + "learning_rate": 0.0001, + "loss": 3.9697, + "loss/crossentropy": 2.024892747402191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879258424043655, + "step": 12156 + }, + { + "epoch": 0.24316, + "grad_norm": 1.921875, + "grad_norm_var": 0.007826487223307291, + "learning_rate": 0.0001, + "loss": 3.8761, + "loss/crossentropy": 1.7833393812179565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18542324006557465, + "step": 12158 + }, + { + "epoch": 0.2432, + "grad_norm": 1.890625, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 4.1896, + "loss/crossentropy": 2.253599762916565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148476019501686, + "step": 12160 + }, + { + "epoch": 0.24324, + "grad_norm": 2.046875, + "grad_norm_var": 0.005177561442057292, + "learning_rate": 0.0001, + "loss": 4.0253, + "loss/crossentropy": 1.7218471765518188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923334002494812, + "step": 12162 + }, + { + "epoch": 0.24328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007575480143229166, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.210463523864746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203536182641983, + "step": 12164 + }, + { + "epoch": 0.24332, + "grad_norm": 2.0625, + "grad_norm_var": 0.0076487223307291664, + "learning_rate": 0.0001, + "loss": 4.4164, + "loss/crossentropy": 2.452837347984314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22888235747814178, + "step": 12166 + }, + { + "epoch": 0.24336, + "grad_norm": 1.984375, + "grad_norm_var": 0.007657877604166667, + "learning_rate": 0.0001, + "loss": 4.2157, + "loss/crossentropy": 2.3511279821395874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431751787662506, + "step": 12168 + }, + { + "epoch": 0.2434, + "grad_norm": 2.03125, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 4.0911, + "loss/crossentropy": 2.0734334588050842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137738972902298, + "step": 12170 + }, + { + "epoch": 0.24344, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065582275390625, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 1.9492397904396057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22894760966300964, + "step": 12172 + }, + { + "epoch": 0.24348, + "grad_norm": 2.046875, + "grad_norm_var": 0.005936431884765625, + "learning_rate": 0.0001, + "loss": 4.2316, + "loss/crossentropy": 2.03458708524704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20981666445732117, + "step": 12174 + }, + { + "epoch": 0.24352, + "grad_norm": 1.96875, + "grad_norm_var": 0.007059478759765625, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 2.1464394330978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741496980190277, + "step": 12176 + }, + { + "epoch": 0.24356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0064999898274739586, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 1.8449034094810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950078010559082, + "step": 12178 + }, + { + "epoch": 0.2436, + "grad_norm": 2.15625, + "grad_norm_var": 0.0054433186848958336, + "learning_rate": 0.0001, + "loss": 4.0129, + "loss/crossentropy": 1.8646993041038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117064744234085, + "step": 12180 + }, + { + "epoch": 0.24364, + "grad_norm": 1.859375, + "grad_norm_var": 0.008003743489583333, + "learning_rate": 0.0001, + "loss": 3.9347, + "loss/crossentropy": 1.9643146991729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946149319410324, + "step": 12182 + }, + { + "epoch": 0.24368, + "grad_norm": 2.109375, + "grad_norm_var": 0.008141835530598959, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 1.867002248764038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19111115485429764, + "step": 12184 + }, + { + "epoch": 0.24372, + "grad_norm": 2.0625, + "grad_norm_var": 0.008129628499348958, + "learning_rate": 0.0001, + "loss": 4.0443, + "loss/crossentropy": 1.9431232810020447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19612180441617966, + "step": 12186 + }, + { + "epoch": 0.24376, + "grad_norm": 2.25, + "grad_norm_var": 0.0105224609375, + "learning_rate": 0.0001, + "loss": 4.2931, + "loss/crossentropy": 2.197216033935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22340577840805054, + "step": 12188 + }, + { + "epoch": 0.2438, + "grad_norm": 2.109375, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 0.0001, + "loss": 4.2499, + "loss/crossentropy": 1.916576623916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536810904741287, + "step": 12190 + }, + { + "epoch": 0.24384, + "grad_norm": 2.25, + "grad_norm_var": 0.010497029622395833, + "learning_rate": 0.0001, + "loss": 4.3838, + "loss/crossentropy": 2.0369369983673096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26386965811252594, + "step": 12192 + }, + { + "epoch": 0.24388, + "grad_norm": 2.09375, + "grad_norm_var": 0.01639404296875, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.086443066596985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21351207792758942, + "step": 12194 + }, + { + "epoch": 0.24392, + "grad_norm": 2.015625, + "grad_norm_var": 0.01619873046875, + "learning_rate": 0.0001, + "loss": 4.3649, + "loss/crossentropy": 2.0969839096069336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21489758789539337, + "step": 12196 + }, + { + "epoch": 0.24396, + "grad_norm": 1.859375, + "grad_norm_var": 0.016463216145833334, + "learning_rate": 0.0001, + "loss": 4.2481, + "loss/crossentropy": 2.086832642555237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651830732822418, + "step": 12198 + }, + { + "epoch": 0.244, + "grad_norm": 2.015625, + "grad_norm_var": 0.017146809895833334, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 1.9104391932487488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963956654071808, + "step": 12200 + }, + { + "epoch": 0.24404, + "grad_norm": 2.03125, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 3.8558, + "loss/crossentropy": 1.9419523477554321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19279541075229645, + "step": 12202 + }, + { + "epoch": 0.24408, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01697998046875, + "learning_rate": 0.0001, + "loss": 4.3239, + "loss/crossentropy": 2.2867754697799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23261378705501556, + "step": 12204 + }, + { + "epoch": 0.24412, + "grad_norm": 1.90625, + "grad_norm_var": 0.017634073893229168, + "learning_rate": 0.0001, + "loss": 4.0786, + "loss/crossentropy": 1.941315233707428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20962880551815033, + "step": 12206 + }, + { + "epoch": 0.24416, + "grad_norm": 2.140625, + "grad_norm_var": 0.01541748046875, + "learning_rate": 0.0001, + "loss": 4.2044, + "loss/crossentropy": 2.0638798475265503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21499283611774445, + "step": 12208 + }, + { + "epoch": 0.2442, + "grad_norm": 2.140625, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 4.1889, + "loss/crossentropy": 2.3167499899864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805942803621292, + "step": 12210 + }, + { + "epoch": 0.24424, + "grad_norm": 2.0, + "grad_norm_var": 0.00848388671875, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.1785646080970764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21259736269712448, + "step": 12212 + }, + { + "epoch": 0.24428, + "grad_norm": 2.03125, + "grad_norm_var": 0.006615193684895834, + "learning_rate": 0.0001, + "loss": 4.2054, + "loss/crossentropy": 1.983469545841217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091710776090622, + "step": 12214 + }, + { + "epoch": 0.24432, + "grad_norm": 2.078125, + "grad_norm_var": 0.006937408447265625, + "learning_rate": 0.0001, + "loss": 4.2283, + "loss/crossentropy": 1.8501896858215332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20537292212247849, + "step": 12216 + }, + { + "epoch": 0.24436, + "grad_norm": 2.1875, + "grad_norm_var": 0.0072934468587239586, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 1.7293490767478943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916719302535057, + "step": 12218 + }, + { + "epoch": 0.2444, + "grad_norm": 2.03125, + "grad_norm_var": 0.0069488525390625, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.0931429862976074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22081860899925232, + "step": 12220 + }, + { + "epoch": 0.24444, + "grad_norm": 2.03125, + "grad_norm_var": 0.007657623291015625, + "learning_rate": 0.0001, + "loss": 4.0619, + "loss/crossentropy": 2.157875657081604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259525626897812, + "step": 12222 + }, + { + "epoch": 0.24448, + "grad_norm": 2.0625, + "grad_norm_var": 0.007248687744140625, + "learning_rate": 0.0001, + "loss": 4.2712, + "loss/crossentropy": 2.2028552889823914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431233748793602, + "step": 12224 + }, + { + "epoch": 0.24452, + "grad_norm": 2.046875, + "grad_norm_var": 0.006473541259765625, + "learning_rate": 0.0001, + "loss": 4.1184, + "loss/crossentropy": 2.0906929969787598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051575481891632, + "step": 12226 + }, + { + "epoch": 0.24456, + "grad_norm": 2.078125, + "grad_norm_var": 0.006091054280598958, + "learning_rate": 0.0001, + "loss": 4.5601, + "loss/crossentropy": 2.5020272731781006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2485819309949875, + "step": 12228 + }, + { + "epoch": 0.2446, + "grad_norm": 8.0625, + "grad_norm_var": 2.244887034098307, + "learning_rate": 0.0001, + "loss": 4.1465, + "loss/crossentropy": 1.502736508846283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839340552687645, + "step": 12230 + }, + { + "epoch": 0.24464, + "grad_norm": 2.484375, + "grad_norm_var": 2.2357358296712238, + "learning_rate": 0.0001, + "loss": 4.3882, + "loss/crossentropy": 2.1888676285743713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226266011595726, + "step": 12232 + }, + { + "epoch": 0.24468, + "grad_norm": 2.125, + "grad_norm_var": 2.243033599853516, + "learning_rate": 0.0001, + "loss": 4.3887, + "loss/crossentropy": 2.377061367034912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23195213079452515, + "step": 12234 + }, + { + "epoch": 0.24472, + "grad_norm": 2.03125, + "grad_norm_var": 2.257559967041016, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 2.0567076206207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20983586460351944, + "step": 12236 + }, + { + "epoch": 0.24476, + "grad_norm": 2.015625, + "grad_norm_var": 2.246906534830729, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.007763922214508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21983467787504196, + "step": 12238 + }, + { + "epoch": 0.2448, + "grad_norm": 2.078125, + "grad_norm_var": 2.247749837239583, + "learning_rate": 0.0001, + "loss": 4.1411, + "loss/crossentropy": 2.3309481143951416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23099908232688904, + "step": 12240 + }, + { + "epoch": 0.24484, + "grad_norm": 2.5, + "grad_norm_var": 2.23668212890625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.8231184482574463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22009263187646866, + "step": 12242 + }, + { + "epoch": 0.24488, + "grad_norm": 1.9609375, + "grad_norm_var": 2.2464637756347656, + "learning_rate": 0.0001, + "loss": 4.1131, + "loss/crossentropy": 2.140045642852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22202204167842865, + "step": 12244 + }, + { + "epoch": 0.24492, + "grad_norm": 1.984375, + "grad_norm_var": 0.02835057576497396, + "learning_rate": 0.0001, + "loss": 4.2209, + "loss/crossentropy": 1.8495931029319763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19803623855113983, + "step": 12246 + }, + { + "epoch": 0.24496, + "grad_norm": 2.171875, + "grad_norm_var": 0.0185943603515625, + "learning_rate": 0.0001, + "loss": 4.1624, + "loss/crossentropy": 2.2141982913017273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23418182879686356, + "step": 12248 + }, + { + "epoch": 0.245, + "grad_norm": 2.03125, + "grad_norm_var": 0.01844482421875, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.14465594291687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22683896869421005, + "step": 12250 + }, + { + "epoch": 0.24504, + "grad_norm": 2.0625, + "grad_norm_var": 0.01610107421875, + "learning_rate": 0.0001, + "loss": 4.1913, + "loss/crossentropy": 2.0804443359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265845239162445, + "step": 12252 + }, + { + "epoch": 0.24508, + "grad_norm": 2.046875, + "grad_norm_var": 0.0160308837890625, + "learning_rate": 0.0001, + "loss": 4.485, + "loss/crossentropy": 2.2451776266098022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22881492972373962, + "step": 12254 + }, + { + "epoch": 0.24512, + "grad_norm": 1.953125, + "grad_norm_var": 0.016947428385416668, + "learning_rate": 0.0001, + "loss": 4.0869, + "loss/crossentropy": 1.6861794590950012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18126793205738068, + "step": 12256 + }, + { + "epoch": 0.24516, + "grad_norm": 1.96875, + "grad_norm_var": 0.0059773763020833336, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 1.87660551071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20990663766860962, + "step": 12258 + }, + { + "epoch": 0.2452, + "grad_norm": 2.15625, + "grad_norm_var": 0.006799062093098958, + "learning_rate": 0.0001, + "loss": 4.2019, + "loss/crossentropy": 2.103124976158142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19615671038627625, + "step": 12260 + }, + { + "epoch": 0.24524, + "grad_norm": 2.0, + "grad_norm_var": 0.0069048563639322914, + "learning_rate": 0.0001, + "loss": 4.2509, + "loss/crossentropy": 2.139270842075348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21861301362514496, + "step": 12262 + }, + { + "epoch": 0.24528, + "grad_norm": 2.140625, + "grad_norm_var": 0.00791015625, + "learning_rate": 0.0001, + "loss": 4.4081, + "loss/crossentropy": 2.108114778995514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20996354520320892, + "step": 12264 + }, + { + "epoch": 0.24532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008847808837890625, + "learning_rate": 0.0001, + "loss": 3.9495, + "loss/crossentropy": 2.1205111145973206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21352755278348923, + "step": 12266 + }, + { + "epoch": 0.24536, + "grad_norm": 2.109375, + "grad_norm_var": 0.009248860677083333, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 2.20136821269989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2277800738811493, + "step": 12268 + }, + { + "epoch": 0.2454, + "grad_norm": 2.046875, + "grad_norm_var": 0.011336008707682291, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 2.1149147748947144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062554806470871, + "step": 12270 + }, + { + "epoch": 0.24544, + "grad_norm": 2.4375, + "grad_norm_var": 0.01932347615559896, + "learning_rate": 0.0001, + "loss": 4.5226, + "loss/crossentropy": 2.1119648218154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2922344133257866, + "step": 12272 + }, + { + "epoch": 0.24548, + "grad_norm": 2.0625, + "grad_norm_var": 0.017964680989583332, + "learning_rate": 0.0001, + "loss": 4.1766, + "loss/crossentropy": 2.2631434202194214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044210433959961, + "step": 12274 + }, + { + "epoch": 0.24552, + "grad_norm": 2.125, + "grad_norm_var": 0.06928609212239584, + "learning_rate": 0.0001, + "loss": 4.0956, + "loss/crossentropy": 2.264181971549988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23541904985904694, + "step": 12276 + }, + { + "epoch": 0.24556, + "grad_norm": 2.0625, + "grad_norm_var": 0.06836649576822916, + "learning_rate": 0.0001, + "loss": 4.2083, + "loss/crossentropy": 2.256329298019409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21784210950136185, + "step": 12278 + }, + { + "epoch": 0.2456, + "grad_norm": 2.21875, + "grad_norm_var": 0.06968765258789063, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.0962833166122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211236834526062, + "step": 12280 + }, + { + "epoch": 0.24564, + "grad_norm": 2.140625, + "grad_norm_var": 0.0683990478515625, + "learning_rate": 0.0001, + "loss": 4.4767, + "loss/crossentropy": 2.3430999517440796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25170228630304337, + "step": 12282 + }, + { + "epoch": 0.24568, + "grad_norm": 2.125, + "grad_norm_var": 0.06921361287434896, + "learning_rate": 0.0001, + "loss": 4.1492, + "loss/crossentropy": 1.8256065845489502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20632392168045044, + "step": 12284 + }, + { + "epoch": 0.24572, + "grad_norm": 1.96875, + "grad_norm_var": 0.06643778483072917, + "learning_rate": 0.0001, + "loss": 4.294, + "loss/crossentropy": 2.481971561908722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121504619717598, + "step": 12286 + }, + { + "epoch": 0.24576, + "grad_norm": 2.125, + "grad_norm_var": 0.060469563802083334, + "learning_rate": 0.0001, + "loss": 4.2136, + "loss/crossentropy": 2.2815581560134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22004567831754684, + "step": 12288 + }, + { + "epoch": 0.2458, + "grad_norm": 1.9921875, + "grad_norm_var": 0.060323079427083336, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 1.7125394940376282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19618325680494308, + "step": 12290 + }, + { + "epoch": 0.24584, + "grad_norm": 1.859375, + "grad_norm_var": 0.008347320556640624, + "learning_rate": 0.0001, + "loss": 3.9676, + "loss/crossentropy": 2.0333253145217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251524448394775, + "step": 12292 + }, + { + "epoch": 0.24588, + "grad_norm": 1.984375, + "grad_norm_var": 0.008939361572265625, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 2.359953284263611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23697812855243683, + "step": 12294 + }, + { + "epoch": 0.24592, + "grad_norm": 2.0625, + "grad_norm_var": 0.0065348307291666664, + "learning_rate": 0.0001, + "loss": 3.9914, + "loss/crossentropy": 1.8465049266815186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18830078095197678, + "step": 12296 + }, + { + "epoch": 0.24596, + "grad_norm": 2.109375, + "grad_norm_var": 0.00662841796875, + "learning_rate": 0.0001, + "loss": 4.1669, + "loss/crossentropy": 2.407967984676361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20054074376821518, + "step": 12298 + }, + { + "epoch": 0.246, + "grad_norm": 2.015625, + "grad_norm_var": 0.007005818684895833, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.306265115737915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2286146879196167, + "step": 12300 + }, + { + "epoch": 0.24604, + "grad_norm": 1.90625, + "grad_norm_var": 0.0076416015625, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 1.9776363968849182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015514224767685, + "step": 12302 + }, + { + "epoch": 0.24608, + "grad_norm": 1.859375, + "grad_norm_var": 0.010041300455729167, + "learning_rate": 0.0001, + "loss": 3.7023, + "loss/crossentropy": 1.6589386463165283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16657201945781708, + "step": 12304 + }, + { + "epoch": 0.24612, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 2.263510227203369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21220041066408157, + "step": 12306 + }, + { + "epoch": 0.24616, + "grad_norm": 2.015625, + "grad_norm_var": 0.008583323160807291, + "learning_rate": 0.0001, + "loss": 4.1735, + "loss/crossentropy": 2.0802704095840454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20156628638505936, + "step": 12308 + }, + { + "epoch": 0.2462, + "grad_norm": 2.0, + "grad_norm_var": 0.015193430582682292, + "learning_rate": 0.0001, + "loss": 4.2597, + "loss/crossentropy": 2.0921266674995422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22044362872838974, + "step": 12310 + }, + { + "epoch": 0.24624, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0164703369140625, + "learning_rate": 0.0001, + "loss": 3.8587, + "loss/crossentropy": 1.7366862297058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722203940153122, + "step": 12312 + }, + { + "epoch": 0.24628, + "grad_norm": 2.046875, + "grad_norm_var": 0.016377766927083332, + "learning_rate": 0.0001, + "loss": 4.1408, + "loss/crossentropy": 2.0741729140281677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942591667175293, + "step": 12314 + }, + { + "epoch": 0.24632, + "grad_norm": 2.0625, + "grad_norm_var": 0.0145751953125, + "learning_rate": 0.0001, + "loss": 4.1343, + "loss/crossentropy": 2.0611414909362793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22244945168495178, + "step": 12316 + }, + { + "epoch": 0.24636, + "grad_norm": 2.015625, + "grad_norm_var": 0.014196523030598958, + "learning_rate": 0.0001, + "loss": 3.8586, + "loss/crossentropy": 1.9337337017059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20655318349599838, + "step": 12318 + }, + { + "epoch": 0.2464, + "grad_norm": 2.09375, + "grad_norm_var": 0.012141672770182292, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 1.8766502737998962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851816102862358, + "step": 12320 + }, + { + "epoch": 0.24644, + "grad_norm": 2.0625, + "grad_norm_var": 0.012552897135416666, + "learning_rate": 0.0001, + "loss": 4.2407, + "loss/crossentropy": 2.121203899383545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177201583981514, + "step": 12322 + }, + { + "epoch": 0.24648, + "grad_norm": 2.03125, + "grad_norm_var": 0.0136138916015625, + "learning_rate": 0.0001, + "loss": 4.1441, + "loss/crossentropy": 1.8102782368659973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19694476574659348, + "step": 12324 + }, + { + "epoch": 0.24652, + "grad_norm": 1.984375, + "grad_norm_var": 0.0062164306640625, + "learning_rate": 0.0001, + "loss": 4.2526, + "loss/crossentropy": 2.323423147201538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20612536370754242, + "step": 12326 + }, + { + "epoch": 0.24656, + "grad_norm": 2.15625, + "grad_norm_var": 0.007814280192057292, + "learning_rate": 0.0001, + "loss": 4.4862, + "loss/crossentropy": 2.2835768461227417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697383165359497, + "step": 12328 + }, + { + "epoch": 0.2466, + "grad_norm": 2.09375, + "grad_norm_var": 0.006192779541015625, + "learning_rate": 0.0001, + "loss": 4.2868, + "loss/crossentropy": 2.197639048099518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22446971386671066, + "step": 12330 + }, + { + "epoch": 0.24664, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006761678059895833, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 2.1731618642807007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21908622235059738, + "step": 12332 + }, + { + "epoch": 0.24668, + "grad_norm": 1.875, + "grad_norm_var": 0.009291330973307291, + "learning_rate": 0.0001, + "loss": 3.9108, + "loss/crossentropy": 1.9236284494400024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810280755162239, + "step": 12334 + }, + { + "epoch": 0.24672, + "grad_norm": 2.0, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.3975, + "loss/crossentropy": 2.2902809381484985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23192713409662247, + "step": 12336 + }, + { + "epoch": 0.24676, + "grad_norm": 2.015625, + "grad_norm_var": 0.008367665608723958, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 1.9808775186538696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19344759732484818, + "step": 12338 + }, + { + "epoch": 0.2468, + "grad_norm": 1.921875, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 2.148995041847229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19994106143712997, + "step": 12340 + }, + { + "epoch": 0.24684, + "grad_norm": 2.109375, + "grad_norm_var": 0.008528391520182291, + "learning_rate": 0.0001, + "loss": 4.3927, + "loss/crossentropy": 2.374568462371826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24350540339946747, + "step": 12342 + }, + { + "epoch": 0.24688, + "grad_norm": 2.1875, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 1.9702014923095703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038111537694931, + "step": 12344 + }, + { + "epoch": 0.24692, + "grad_norm": 1.921875, + "grad_norm_var": 0.007008616129557292, + "learning_rate": 0.0001, + "loss": 4.0126, + "loss/crossentropy": 1.8438855409622192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822836771607399, + "step": 12346 + }, + { + "epoch": 0.24696, + "grad_norm": 2.265625, + "grad_norm_var": 0.010754140218098958, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 2.005755662918091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21399007737636566, + "step": 12348 + }, + { + "epoch": 0.247, + "grad_norm": 2.015625, + "grad_norm_var": 0.008819325764973959, + "learning_rate": 0.0001, + "loss": 4.0988, + "loss/crossentropy": 1.8640305399894714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19055305421352386, + "step": 12350 + }, + { + "epoch": 0.24704, + "grad_norm": 2.140625, + "grad_norm_var": 0.010453033447265624, + "learning_rate": 0.0001, + "loss": 4.1981, + "loss/crossentropy": 2.1293725967407227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117801234126091, + "step": 12352 + }, + { + "epoch": 0.24708, + "grad_norm": 2.484375, + "grad_norm_var": 0.02337621053059896, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 1.7655459642410278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2085946872830391, + "step": 12354 + }, + { + "epoch": 0.24712, + "grad_norm": 2.046875, + "grad_norm_var": 0.021897125244140624, + "learning_rate": 0.0001, + "loss": 4.3175, + "loss/crossentropy": 2.184986114501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22597889602184296, + "step": 12356 + }, + { + "epoch": 0.24716, + "grad_norm": 2.015625, + "grad_norm_var": 0.021897125244140624, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.368021607398987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21957845985889435, + "step": 12358 + }, + { + "epoch": 0.2472, + "grad_norm": 2.046875, + "grad_norm_var": 0.020645904541015624, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 2.297884225845337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155354768037796, + "step": 12360 + }, + { + "epoch": 0.24724, + "grad_norm": 2.328125, + "grad_norm_var": 0.023273722330729166, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 2.121878147125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22192668169736862, + "step": 12362 + }, + { + "epoch": 0.24728, + "grad_norm": 2.125, + "grad_norm_var": 0.020442454020182292, + "learning_rate": 0.0001, + "loss": 4.3233, + "loss/crossentropy": 2.130228877067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20712029188871384, + "step": 12364 + }, + { + "epoch": 0.24732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02367121378580729, + "learning_rate": 0.0001, + "loss": 3.8542, + "loss/crossentropy": 1.6520383954048157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16511157900094986, + "step": 12366 + }, + { + "epoch": 0.24736, + "grad_norm": 2.1875, + "grad_norm_var": 0.02220637003580729, + "learning_rate": 0.0001, + "loss": 4.1894, + "loss/crossentropy": 1.9993118047714233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112603336572647, + "step": 12368 + }, + { + "epoch": 0.2474, + "grad_norm": 2.046875, + "grad_norm_var": 0.011494700113932292, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.9502257108688354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944574937224388, + "step": 12370 + }, + { + "epoch": 0.24744, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.1219520568847656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22872696816921234, + "step": 12372 + }, + { + "epoch": 0.24748, + "grad_norm": 1.984375, + "grad_norm_var": 0.0502593994140625, + "learning_rate": 0.0001, + "loss": 4.172, + "loss/crossentropy": 2.0668599605560303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21508124470710754, + "step": 12374 + }, + { + "epoch": 0.24752, + "grad_norm": 1.984375, + "grad_norm_var": 0.05098241170247396, + "learning_rate": 0.0001, + "loss": 4.1901, + "loss/crossentropy": 2.242877721786499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308686003088951, + "step": 12376 + }, + { + "epoch": 0.24756, + "grad_norm": 2.015625, + "grad_norm_var": 0.047304026285807294, + "learning_rate": 0.0001, + "loss": 3.8779, + "loss/crossentropy": 1.8258161544799805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852153167128563, + "step": 12378 + }, + { + "epoch": 0.2476, + "grad_norm": 2.046875, + "grad_norm_var": 0.04918390909830729, + "learning_rate": 0.0001, + "loss": 4.1241, + "loss/crossentropy": 2.0654167532920837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20526036620140076, + "step": 12380 + }, + { + "epoch": 0.24764, + "grad_norm": 2.046875, + "grad_norm_var": 0.04582087198893229, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.125216484069824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216343455016613, + "step": 12382 + }, + { + "epoch": 0.24768, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0473052978515625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.285245180130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22941745072603226, + "step": 12384 + }, + { + "epoch": 0.24772, + "grad_norm": 2.046875, + "grad_norm_var": 0.04720637003580729, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.0148350596427917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1874203458428383, + "step": 12386 + }, + { + "epoch": 0.24776, + "grad_norm": 2.03125, + "grad_norm_var": 0.04812825520833333, + "learning_rate": 0.0001, + "loss": 4.3127, + "loss/crossentropy": 2.2349741458892822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23187313228845596, + "step": 12388 + }, + { + "epoch": 0.2478, + "grad_norm": 2.078125, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 2.195721983909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119324654340744, + "step": 12390 + }, + { + "epoch": 0.24784, + "grad_norm": 2.265625, + "grad_norm_var": 0.012676747639973958, + "learning_rate": 0.0001, + "loss": 4.5732, + "loss/crossentropy": 2.1261669397354126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21143332868814468, + "step": 12392 + }, + { + "epoch": 0.24788, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012798817952473958, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.1640073657035828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192412167787552, + "step": 12394 + }, + { + "epoch": 0.24792, + "grad_norm": 1.953125, + "grad_norm_var": 0.011230214436848959, + "learning_rate": 0.0001, + "loss": 4.2435, + "loss/crossentropy": 1.9555792808532715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110436111688614, + "step": 12396 + }, + { + "epoch": 0.24796, + "grad_norm": 2.015625, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.948053002357483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957780838012695, + "step": 12398 + }, + { + "epoch": 0.248, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.355304718017578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20301833003759384, + "step": 12400 + }, + { + "epoch": 0.24804, + "grad_norm": 2.0, + "grad_norm_var": 0.010944620768229166, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 2.172769784927368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157917320728302, + "step": 12402 + }, + { + "epoch": 0.24808, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012109120686848959, + "learning_rate": 0.0001, + "loss": 4.2817, + "loss/crossentropy": 2.2186524868011475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.221016563475132, + "step": 12404 + }, + { + "epoch": 0.24812, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012113189697265625, + "learning_rate": 0.0001, + "loss": 3.9624, + "loss/crossentropy": 2.011409044265747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19150879979133606, + "step": 12406 + }, + { + "epoch": 0.24816, + "grad_norm": 2.125, + "grad_norm_var": 0.010984039306640625, + "learning_rate": 0.0001, + "loss": 4.5039, + "loss/crossentropy": 1.9245591163635254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2296570986509323, + "step": 12408 + }, + { + "epoch": 0.2482, + "grad_norm": 2.328125, + "grad_norm_var": 0.016434733072916666, + "learning_rate": 0.0001, + "loss": 4.5765, + "loss/crossentropy": 1.973130702972412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20532477647066116, + "step": 12410 + }, + { + "epoch": 0.24824, + "grad_norm": 2.03125, + "grad_norm_var": 0.01617609659830729, + "learning_rate": 0.0001, + "loss": 3.914, + "loss/crossentropy": 1.655932605266571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1609746441245079, + "step": 12412 + }, + { + "epoch": 0.24828, + "grad_norm": 2.40625, + "grad_norm_var": 0.02411677042643229, + "learning_rate": 0.0001, + "loss": 4.2172, + "loss/crossentropy": 2.104023277759552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19898276031017303, + "step": 12414 + }, + { + "epoch": 0.24832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.02210261027018229, + "learning_rate": 0.0001, + "loss": 4.0597, + "loss/crossentropy": 1.9362882375717163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20107803493738174, + "step": 12416 + }, + { + "epoch": 0.24836, + "grad_norm": 2.046875, + "grad_norm_var": 0.021996815999348957, + "learning_rate": 0.0001, + "loss": 4.4181, + "loss/crossentropy": 2.1508368253707886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056911736726761, + "step": 12418 + }, + { + "epoch": 0.2484, + "grad_norm": 1.921875, + "grad_norm_var": 0.02194391886393229, + "learning_rate": 0.0001, + "loss": 3.802, + "loss/crossentropy": 1.7712991833686829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908227875828743, + "step": 12420 + }, + { + "epoch": 0.24844, + "grad_norm": 2.140625, + "grad_norm_var": 0.02060114542643229, + "learning_rate": 0.0001, + "loss": 4.3557, + "loss/crossentropy": 2.2705806493759155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23912303894758224, + "step": 12422 + }, + { + "epoch": 0.24848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019791666666666666, + "learning_rate": 0.0001, + "loss": 4.2696, + "loss/crossentropy": 2.4222676753997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21739919483661652, + "step": 12424 + }, + { + "epoch": 0.24852, + "grad_norm": 2.03125, + "grad_norm_var": 0.0145172119140625, + "learning_rate": 0.0001, + "loss": 4.1935, + "loss/crossentropy": 1.945872962474823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402413070201874, + "step": 12426 + }, + { + "epoch": 0.24856, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 1.7793864011764526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19452669471502304, + "step": 12428 + }, + { + "epoch": 0.2486, + "grad_norm": 2.0, + "grad_norm_var": 0.015941365559895834, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 2.2827813625335693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21572840213775635, + "step": 12430 + }, + { + "epoch": 0.24864, + "grad_norm": 2.03125, + "grad_norm_var": 0.016544342041015625, + "learning_rate": 0.0001, + "loss": 3.749, + "loss/crossentropy": 1.9543398022651672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20825360715389252, + "step": 12432 + }, + { + "epoch": 0.24868, + "grad_norm": 2.046875, + "grad_norm_var": 0.017276763916015625, + "learning_rate": 0.0001, + "loss": 4.3727, + "loss/crossentropy": 2.223303198814392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21834088116884232, + "step": 12434 + }, + { + "epoch": 0.24872, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017071278889973958, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 1.6987267136573792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834847405552864, + "step": 12436 + }, + { + "epoch": 0.24876, + "grad_norm": 2.125, + "grad_norm_var": 0.014522043863932292, + "learning_rate": 0.0001, + "loss": 4.4309, + "loss/crossentropy": 2.3947317600250244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22872642427682877, + "step": 12438 + }, + { + "epoch": 0.2488, + "grad_norm": 2.015625, + "grad_norm_var": 0.014435831705729167, + "learning_rate": 0.0001, + "loss": 4.4858, + "loss/crossentropy": 2.5481021404266357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23773372173309326, + "step": 12440 + }, + { + "epoch": 0.24884, + "grad_norm": 2.015625, + "grad_norm_var": 0.0145904541015625, + "learning_rate": 0.0001, + "loss": 4.1621, + "loss/crossentropy": 2.2349241971969604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21584660559892654, + "step": 12442 + }, + { + "epoch": 0.24888, + "grad_norm": 2.140625, + "grad_norm_var": 0.012988026936848958, + "learning_rate": 0.0001, + "loss": 4.2827, + "loss/crossentropy": 1.9308242201805115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19295598566532135, + "step": 12444 + }, + { + "epoch": 0.24892, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008536783854166667, + "learning_rate": 0.0001, + "loss": 4.0778, + "loss/crossentropy": 2.3508042097091675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22008787840604782, + "step": 12446 + }, + { + "epoch": 0.24896, + "grad_norm": 2.109375, + "grad_norm_var": 0.012593332926432292, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.0870128870010376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749760419130325, + "step": 12448 + }, + { + "epoch": 0.249, + "grad_norm": 2.109375, + "grad_norm_var": 0.012971750895182292, + "learning_rate": 0.0001, + "loss": 4.2758, + "loss/crossentropy": 2.087821125984192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110248565673828, + "step": 12450 + }, + { + "epoch": 0.24904, + "grad_norm": 2.078125, + "grad_norm_var": 0.011494954427083334, + "learning_rate": 0.0001, + "loss": 4.3075, + "loss/crossentropy": 2.131770372390747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21426154673099518, + "step": 12452 + }, + { + "epoch": 0.24908, + "grad_norm": 2.046875, + "grad_norm_var": 0.011092122395833333, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.175555467605591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20418058335781097, + "step": 12454 + }, + { + "epoch": 0.24912, + "grad_norm": 2.1875, + "grad_norm_var": 0.012198893229166667, + "learning_rate": 0.0001, + "loss": 4.2788, + "loss/crossentropy": 2.205165147781372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23265192657709122, + "step": 12456 + }, + { + "epoch": 0.24916, + "grad_norm": 2.046875, + "grad_norm_var": 0.014534250895182291, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 1.6410180926322937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18058068305253983, + "step": 12458 + }, + { + "epoch": 0.2492, + "grad_norm": 1.9375, + "grad_norm_var": 0.014345041910807292, + "learning_rate": 0.0001, + "loss": 3.962, + "loss/crossentropy": 1.8249012231826782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19164791703224182, + "step": 12460 + }, + { + "epoch": 0.24924, + "grad_norm": 2.234375, + "grad_norm_var": 0.0142242431640625, + "learning_rate": 0.0001, + "loss": 4.4945, + "loss/crossentropy": 2.0906582474708557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23415963351726532, + "step": 12462 + }, + { + "epoch": 0.24928, + "grad_norm": 2.078125, + "grad_norm_var": 0.009285227457682291, + "learning_rate": 0.0001, + "loss": 4.4702, + "loss/crossentropy": 2.314555048942566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24343416839838028, + "step": 12464 + }, + { + "epoch": 0.24932, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009105428059895834, + "learning_rate": 0.0001, + "loss": 4.3059, + "loss/crossentropy": 2.1258983612060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851430088281631, + "step": 12466 + }, + { + "epoch": 0.24936, + "grad_norm": 2.03125, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 1.977954626083374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20545368641614914, + "step": 12468 + }, + { + "epoch": 0.2494, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011258951822916667, + "learning_rate": 0.0001, + "loss": 4.2224, + "loss/crossentropy": 2.0212838649749756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21720624715089798, + "step": 12470 + }, + { + "epoch": 0.24944, + "grad_norm": 2.0625, + "grad_norm_var": 0.010205078125, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 1.964124321937561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991052150726318, + "step": 12472 + }, + { + "epoch": 0.24948, + "grad_norm": 2.078125, + "grad_norm_var": 0.008567047119140626, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 1.9038777947425842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013714790344238, + "step": 12474 + }, + { + "epoch": 0.24952, + "grad_norm": 2.171875, + "grad_norm_var": 0.008449045817057292, + "learning_rate": 0.0001, + "loss": 4.4785, + "loss/crossentropy": 2.1772372722625732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21470360457897186, + "step": 12476 + }, + { + "epoch": 0.24956, + "grad_norm": 2.109375, + "grad_norm_var": 0.006605784098307292, + "learning_rate": 0.0001, + "loss": 4.3636, + "loss/crossentropy": 2.153541684150696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23221681267023087, + "step": 12478 + }, + { + "epoch": 0.2496, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006668853759765625, + "learning_rate": 0.0001, + "loss": 4.4578, + "loss/crossentropy": 2.421363592147827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20812640339136124, + "step": 12480 + }, + { + "epoch": 0.24964, + "grad_norm": 2.046875, + "grad_norm_var": 0.008188629150390625, + "learning_rate": 0.0001, + "loss": 4.0628, + "loss/crossentropy": 1.8497061133384705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766893446445465, + "step": 12482 + }, + { + "epoch": 0.24968, + "grad_norm": 2.0625, + "grad_norm_var": 0.008894602457682291, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 1.9876007437705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932743340730667, + "step": 12484 + }, + { + "epoch": 0.24972, + "grad_norm": 2.0625, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 2.4254921674728394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23315788805484772, + "step": 12486 + }, + { + "epoch": 0.24976, + "grad_norm": 1.96875, + "grad_norm_var": 0.0079010009765625, + "learning_rate": 0.0001, + "loss": 4.1012, + "loss/crossentropy": 1.983458697795868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190450206398964, + "step": 12488 + }, + { + "epoch": 0.2498, + "grad_norm": 2.046875, + "grad_norm_var": 0.007111612955729167, + "learning_rate": 0.0001, + "loss": 4.0284, + "loss/crossentropy": 1.8051987886428833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19166412204504013, + "step": 12490 + }, + { + "epoch": 0.24984, + "grad_norm": 2.140625, + "grad_norm_var": 0.0052398681640625, + "learning_rate": 0.0001, + "loss": 4.2385, + "loss/crossentropy": 2.1552056670188904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053346112370491, + "step": 12492 + }, + { + "epoch": 0.24988, + "grad_norm": 2.0625, + "grad_norm_var": 0.004992421468098958, + "learning_rate": 0.0001, + "loss": 4.0157, + "loss/crossentropy": 1.9644648432731628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17631876468658447, + "step": 12494 + }, + { + "epoch": 0.24992, + "grad_norm": 1.90625, + "grad_norm_var": 0.006089019775390625, + "learning_rate": 0.0001, + "loss": 4.1626, + "loss/crossentropy": 2.027154862880707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21671167761087418, + "step": 12496 + }, + { + "epoch": 0.24996, + "grad_norm": 2.125, + "grad_norm_var": 0.00628662109375, + "learning_rate": 0.0001, + "loss": 4.2485, + "loss/crossentropy": 2.2398791313171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22507991641759872, + "step": 12498 + }, + { + "epoch": 0.25, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006232706705729166, + "learning_rate": 0.0001, + "loss": 4.13, + "loss/crossentropy": 2.2342909574508667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192724049091339, + "step": 12500 + }, + { + "epoch": 0.25004, + "grad_norm": 2.0, + "grad_norm_var": 0.0054443359375, + "learning_rate": 0.0001, + "loss": 4.2779, + "loss/crossentropy": 1.8621744513511658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882321536540985, + "step": 12502 + }, + { + "epoch": 0.25008, + "grad_norm": 2.109375, + "grad_norm_var": 0.004988606770833333, + "learning_rate": 0.0001, + "loss": 4.4516, + "loss/crossentropy": 2.2012354135513306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128412127494812, + "step": 12504 + }, + { + "epoch": 0.25012, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005873362223307292, + "learning_rate": 0.0001, + "loss": 4.0122, + "loss/crossentropy": 1.7629758715629578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928938776254654, + "step": 12506 + }, + { + "epoch": 0.25016, + "grad_norm": 2.0625, + "grad_norm_var": 0.005730946858723958, + "learning_rate": 0.0001, + "loss": 4.1563, + "loss/crossentropy": 2.076514720916748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20530561357736588, + "step": 12508 + }, + { + "epoch": 0.2502, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006207021077473959, + "learning_rate": 0.0001, + "loss": 4.2005, + "loss/crossentropy": 2.004107654094696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21081873774528503, + "step": 12510 + }, + { + "epoch": 0.25024, + "grad_norm": 2.234375, + "grad_norm_var": 0.008013661702473958, + "learning_rate": 0.0001, + "loss": 4.3323, + "loss/crossentropy": 2.232061505317688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21327269077301025, + "step": 12512 + }, + { + "epoch": 0.25028, + "grad_norm": 2.0625, + "grad_norm_var": 0.0068662007649739586, + "learning_rate": 0.0001, + "loss": 4.1507, + "loss/crossentropy": 2.1506210565567017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20797627419233322, + "step": 12514 + }, + { + "epoch": 0.25032, + "grad_norm": 2.03125, + "grad_norm_var": 0.008207194010416667, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.084704279899597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25306878983974457, + "step": 12516 + }, + { + "epoch": 0.25036, + "grad_norm": 2.0625, + "grad_norm_var": 0.008137003580729166, + "learning_rate": 0.0001, + "loss": 4.3286, + "loss/crossentropy": 2.000536620616913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20802763104438782, + "step": 12518 + }, + { + "epoch": 0.2504, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 4.1324, + "loss/crossentropy": 2.2391778230667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171899899840355, + "step": 12520 + }, + { + "epoch": 0.25044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011375935872395833, + "learning_rate": 0.0001, + "loss": 4.458, + "loss/crossentropy": 2.1465210914611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026364952325821, + "step": 12522 + }, + { + "epoch": 0.25048, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 1.8129625916481018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17917531728744507, + "step": 12524 + }, + { + "epoch": 0.25052, + "grad_norm": 2.0, + "grad_norm_var": 0.010029856363932292, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 1.98322331905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20269985496997833, + "step": 12526 + }, + { + "epoch": 0.25056, + "grad_norm": 2.1875, + "grad_norm_var": 0.008955637613932291, + "learning_rate": 0.0001, + "loss": 4.373, + "loss/crossentropy": 2.196588397026062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23827942460775375, + "step": 12528 + }, + { + "epoch": 0.2506, + "grad_norm": 2.09375, + "grad_norm_var": 0.009209950764973959, + "learning_rate": 0.0001, + "loss": 4.3811, + "loss/crossentropy": 2.183007001876831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071165144443512, + "step": 12530 + }, + { + "epoch": 0.25064, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0106353759765625, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.2728021144866943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21004344522953033, + "step": 12532 + }, + { + "epoch": 0.25068, + "grad_norm": 2.125, + "grad_norm_var": 0.0111083984375, + "learning_rate": 0.0001, + "loss": 4.5478, + "loss/crossentropy": 2.280518889427185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22367073595523834, + "step": 12534 + }, + { + "epoch": 0.25072, + "grad_norm": 2.25, + "grad_norm_var": 0.015038045247395833, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 2.088103711605072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038320079445839, + "step": 12536 + }, + { + "epoch": 0.25076, + "grad_norm": 2.140625, + "grad_norm_var": 0.012837727864583334, + "learning_rate": 0.0001, + "loss": 4.1798, + "loss/crossentropy": 1.646530568599701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18982253968715668, + "step": 12538 + }, + { + "epoch": 0.2508, + "grad_norm": 2.09375, + "grad_norm_var": 0.013622792561848958, + "learning_rate": 0.0001, + "loss": 4.2246, + "loss/crossentropy": 2.1743921041488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22835461795330048, + "step": 12540 + }, + { + "epoch": 0.25084, + "grad_norm": 1.9375, + "grad_norm_var": 0.0151123046875, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 2.2216947078704834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978579461574554, + "step": 12542 + }, + { + "epoch": 0.25088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014890289306640625, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 2.141101062297821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21347828209400177, + "step": 12544 + }, + { + "epoch": 0.25092, + "grad_norm": 2.078125, + "grad_norm_var": 0.014861806233723959, + "learning_rate": 0.0001, + "loss": 4.1534, + "loss/crossentropy": 2.1961969137191772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192479968070984, + "step": 12546 + }, + { + "epoch": 0.25096, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014274088541666667, + "learning_rate": 0.0001, + "loss": 3.9219, + "loss/crossentropy": 1.9632240533828735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19182069599628448, + "step": 12548 + }, + { + "epoch": 0.251, + "grad_norm": 2.359375, + "grad_norm_var": 0.02173639933268229, + "learning_rate": 0.0001, + "loss": 4.0594, + "loss/crossentropy": 1.7821694612503052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24946660548448563, + "step": 12550 + }, + { + "epoch": 0.25104, + "grad_norm": 2.078125, + "grad_norm_var": 0.016290028889973957, + "learning_rate": 0.0001, + "loss": 4.2423, + "loss/crossentropy": 2.261335611343384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23269569873809814, + "step": 12552 + }, + { + "epoch": 0.25108, + "grad_norm": 1.921875, + "grad_norm_var": 0.01762669881184896, + "learning_rate": 0.0001, + "loss": 3.9858, + "loss/crossentropy": 2.2365309596061707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2119913324713707, + "step": 12554 + }, + { + "epoch": 0.25112, + "grad_norm": 2.078125, + "grad_norm_var": 0.0176025390625, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 2.0685681104660034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985978305339813, + "step": 12556 + }, + { + "epoch": 0.25116, + "grad_norm": 2.015625, + "grad_norm_var": 0.0144683837890625, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.0773105025291443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2006489858031273, + "step": 12558 + }, + { + "epoch": 0.2512, + "grad_norm": 2.09375, + "grad_norm_var": 0.01561279296875, + "learning_rate": 0.0001, + "loss": 4.1948, + "loss/crossentropy": 2.1145309805870056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20449287444353104, + "step": 12560 + }, + { + "epoch": 0.25124, + "grad_norm": 1.828125, + "grad_norm_var": 0.017097981770833333, + "learning_rate": 0.0001, + "loss": 4.0543, + "loss/crossentropy": 2.109993577003479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20761344581842422, + "step": 12562 + }, + { + "epoch": 0.25128, + "grad_norm": 1.9375, + "grad_norm_var": 0.016755167643229166, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 1.9653338193893433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069985270500183, + "step": 12564 + }, + { + "epoch": 0.25132, + "grad_norm": 2.015625, + "grad_norm_var": 0.007045237223307291, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 1.7084832191467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18922459334135056, + "step": 12566 + }, + { + "epoch": 0.25136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006494140625, + "learning_rate": 0.0001, + "loss": 3.9288, + "loss/crossentropy": 1.8007041215896606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122679501771927, + "step": 12568 + }, + { + "epoch": 0.2514, + "grad_norm": 2.125, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 4.3199, + "loss/crossentropy": 2.041845440864563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219957411289215, + "step": 12570 + }, + { + "epoch": 0.25144, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008577219645182292, + "learning_rate": 0.0001, + "loss": 4.3703, + "loss/crossentropy": 2.2668861150741577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117021307349205, + "step": 12572 + }, + { + "epoch": 0.25148, + "grad_norm": 1.890625, + "grad_norm_var": 0.010235341389973958, + "learning_rate": 0.0001, + "loss": 3.8462, + "loss/crossentropy": 1.8246251940727234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822521835565567, + "step": 12574 + }, + { + "epoch": 0.25152, + "grad_norm": 2.046875, + "grad_norm_var": 0.010553995768229166, + "learning_rate": 0.0001, + "loss": 3.8137, + "loss/crossentropy": 2.0739742517471313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114427089691162, + "step": 12576 + }, + { + "epoch": 0.25156, + "grad_norm": 2.03125, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 3.9261, + "loss/crossentropy": 1.6614344120025635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18680214881896973, + "step": 12578 + }, + { + "epoch": 0.2516, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01279296875, + "learning_rate": 0.0001, + "loss": 4.4042, + "loss/crossentropy": 2.5629080533981323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26097629219293594, + "step": 12580 + }, + { + "epoch": 0.25164, + "grad_norm": 2.015625, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 4.1583, + "loss/crossentropy": 2.058123230934143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2273598164319992, + "step": 12582 + }, + { + "epoch": 0.25168, + "grad_norm": 2.15625, + "grad_norm_var": 0.014314524332682292, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 1.9959335327148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991373971104622, + "step": 12584 + }, + { + "epoch": 0.25172, + "grad_norm": 2.140625, + "grad_norm_var": 0.018293253580729165, + "learning_rate": 0.0001, + "loss": 3.9794, + "loss/crossentropy": 1.8165839314460754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888013407588005, + "step": 12586 + }, + { + "epoch": 0.25176, + "grad_norm": 2.03125, + "grad_norm_var": 0.015221913655598959, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 1.9918802976608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641646206378937, + "step": 12588 + }, + { + "epoch": 0.2518, + "grad_norm": 2.078125, + "grad_norm_var": 0.013378651936848958, + "learning_rate": 0.0001, + "loss": 4.3443, + "loss/crossentropy": 2.0256036520004272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628985226154327, + "step": 12590 + }, + { + "epoch": 0.25184, + "grad_norm": 2.09375, + "grad_norm_var": 0.01236572265625, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 1.948347806930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19971590489149094, + "step": 12592 + }, + { + "epoch": 0.25188, + "grad_norm": 2.046875, + "grad_norm_var": 0.012544759114583333, + "learning_rate": 0.0001, + "loss": 3.8528, + "loss/crossentropy": 2.2025747299194336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2437238171696663, + "step": 12594 + }, + { + "epoch": 0.25192, + "grad_norm": 2.078125, + "grad_norm_var": 0.010872141520182291, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 2.3209575414657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366873100399971, + "step": 12596 + }, + { + "epoch": 0.25196, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 3.8028, + "loss/crossentropy": 1.7978705763816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19252559542655945, + "step": 12598 + }, + { + "epoch": 0.252, + "grad_norm": 2.015625, + "grad_norm_var": 0.010359446207682291, + "learning_rate": 0.0001, + "loss": 3.9269, + "loss/crossentropy": 1.964760184288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943562552332878, + "step": 12600 + }, + { + "epoch": 0.25204, + "grad_norm": 2.0, + "grad_norm_var": 0.005399322509765625, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 2.1420929431915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206790953874588, + "step": 12602 + }, + { + "epoch": 0.25208, + "grad_norm": 2.234375, + "grad_norm_var": 0.008084869384765625, + "learning_rate": 0.0001, + "loss": 4.5232, + "loss/crossentropy": 2.2422659397125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22513067722320557, + "step": 12604 + }, + { + "epoch": 0.25212, + "grad_norm": 2.75, + "grad_norm_var": 0.04026667277018229, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.587006688117981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833646520972252, + "step": 12606 + }, + { + "epoch": 0.25216, + "grad_norm": 2.09375, + "grad_norm_var": 0.03873291015625, + "learning_rate": 0.0001, + "loss": 4.4461, + "loss/crossentropy": 2.426867365837097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435208261013031, + "step": 12608 + }, + { + "epoch": 0.2522, + "grad_norm": 2.0, + "grad_norm_var": 0.040185546875, + "learning_rate": 0.0001, + "loss": 4.0039, + "loss/crossentropy": 1.9733251333236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21772438287734985, + "step": 12610 + }, + { + "epoch": 0.25224, + "grad_norm": 2.84375, + "grad_norm_var": 0.0764312744140625, + "learning_rate": 0.0001, + "loss": 4.2712, + "loss/crossentropy": 1.4835429191589355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17013566195964813, + "step": 12612 + }, + { + "epoch": 0.25228, + "grad_norm": 2.0, + "grad_norm_var": 0.07500178019205729, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 2.165693759918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315700560808182, + "step": 12614 + }, + { + "epoch": 0.25232, + "grad_norm": 2.09375, + "grad_norm_var": 0.07396647135416666, + "learning_rate": 0.0001, + "loss": 3.9856, + "loss/crossentropy": 2.206323266029358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19913922995328903, + "step": 12616 + }, + { + "epoch": 0.25236, + "grad_norm": 2.015625, + "grad_norm_var": 0.07515360514322916, + "learning_rate": 0.0001, + "loss": 4.0788, + "loss/crossentropy": 2.2520995140075684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22997721284627914, + "step": 12618 + }, + { + "epoch": 0.2524, + "grad_norm": 2.140625, + "grad_norm_var": 0.07444559733072917, + "learning_rate": 0.0001, + "loss": 4.365, + "loss/crossentropy": 1.9322227239608765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600315928459167, + "step": 12620 + }, + { + "epoch": 0.25244, + "grad_norm": 1.96875, + "grad_norm_var": 0.047459920247395836, + "learning_rate": 0.0001, + "loss": 4.0578, + "loss/crossentropy": 2.140220284461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21072514355182648, + "step": 12622 + }, + { + "epoch": 0.25248, + "grad_norm": 1.96875, + "grad_norm_var": 0.04942118326822917, + "learning_rate": 0.0001, + "loss": 4.2316, + "loss/crossentropy": 2.061431884765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259845808148384, + "step": 12624 + }, + { + "epoch": 0.25252, + "grad_norm": 2.125, + "grad_norm_var": 0.047055816650390624, + "learning_rate": 0.0001, + "loss": 4.14, + "loss/crossentropy": 2.323551654815674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23168490827083588, + "step": 12626 + }, + { + "epoch": 0.25256, + "grad_norm": 1.890625, + "grad_norm_var": 0.008335113525390625, + "learning_rate": 0.0001, + "loss": 4.0036, + "loss/crossentropy": 1.93999582529068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21139459311962128, + "step": 12628 + }, + { + "epoch": 0.2526, + "grad_norm": 2.140625, + "grad_norm_var": 0.011173248291015625, + "learning_rate": 0.0001, + "loss": 4.207, + "loss/crossentropy": 1.9334313869476318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21537292003631592, + "step": 12630 + }, + { + "epoch": 0.25264, + "grad_norm": 2.0, + "grad_norm_var": 0.010477447509765625, + "learning_rate": 0.0001, + "loss": 4.1674, + "loss/crossentropy": 2.145465135574341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080931067466736, + "step": 12632 + }, + { + "epoch": 0.25268, + "grad_norm": 1.9375, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 4.0058, + "loss/crossentropy": 1.9708059430122375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007170245051384, + "step": 12634 + }, + { + "epoch": 0.25272, + "grad_norm": 2.015625, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 1.9522746801376343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030274242162704, + "step": 12636 + }, + { + "epoch": 0.25276, + "grad_norm": 2.0625, + "grad_norm_var": 0.012035878499348958, + "learning_rate": 0.0001, + "loss": 4.0416, + "loss/crossentropy": 1.856387436389923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19685833156108856, + "step": 12638 + }, + { + "epoch": 0.2528, + "grad_norm": 2.296875, + "grad_norm_var": 0.013787587483723959, + "learning_rate": 0.0001, + "loss": 4.475, + "loss/crossentropy": 1.9449518322944641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622673630714417, + "step": 12640 + }, + { + "epoch": 0.25284, + "grad_norm": 2.25, + "grad_norm_var": 0.017533365885416666, + "learning_rate": 0.0001, + "loss": 4.4683, + "loss/crossentropy": 2.4559473991394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476459503173828, + "step": 12642 + }, + { + "epoch": 0.25288, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016778310139973957, + "learning_rate": 0.0001, + "loss": 4.3164, + "loss/crossentropy": 2.238897919654846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22538839280605316, + "step": 12644 + }, + { + "epoch": 0.25292, + "grad_norm": 2.078125, + "grad_norm_var": 0.013944498697916667, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 1.7506417036056519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603645265102386, + "step": 12646 + }, + { + "epoch": 0.25296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014359283447265624, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.162258505821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168682724237442, + "step": 12648 + }, + { + "epoch": 0.253, + "grad_norm": 2.171875, + "grad_norm_var": 0.015154774983723958, + "learning_rate": 0.0001, + "loss": 4.0441, + "loss/crossentropy": 1.9122841954231262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20636393874883652, + "step": 12650 + }, + { + "epoch": 0.25304, + "grad_norm": 2.125, + "grad_norm_var": 0.015187327067057292, + "learning_rate": 0.0001, + "loss": 4.3566, + "loss/crossentropy": 2.1501541137695312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22346170246601105, + "step": 12652 + }, + { + "epoch": 0.25308, + "grad_norm": 2.015625, + "grad_norm_var": 0.01456298828125, + "learning_rate": 0.0001, + "loss": 4.0914, + "loss/crossentropy": 1.797228217124939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17510685324668884, + "step": 12654 + }, + { + "epoch": 0.25312, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010422515869140624, + "learning_rate": 0.0001, + "loss": 3.9232, + "loss/crossentropy": 1.7334046363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19618064910173416, + "step": 12656 + }, + { + "epoch": 0.25316, + "grad_norm": 2.109375, + "grad_norm_var": 0.0066912333170572914, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.99091237783432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21426931023597717, + "step": 12658 + }, + { + "epoch": 0.2532, + "grad_norm": 2.09375, + "grad_norm_var": 0.0069244384765625, + "learning_rate": 0.0001, + "loss": 4.5036, + "loss/crossentropy": 2.1908310651779175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20360572636127472, + "step": 12660 + }, + { + "epoch": 0.25324, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00787353515625, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 1.7925593852996826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18203188478946686, + "step": 12662 + }, + { + "epoch": 0.25328, + "grad_norm": 2.078125, + "grad_norm_var": 0.009006500244140625, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.9464862942695618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896686628460884, + "step": 12664 + }, + { + "epoch": 0.25332, + "grad_norm": 2.046875, + "grad_norm_var": 0.006379954020182292, + "learning_rate": 0.0001, + "loss": 3.9922, + "loss/crossentropy": 2.0207581520080566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436276495456696, + "step": 12666 + }, + { + "epoch": 0.25336, + "grad_norm": 2.09375, + "grad_norm_var": 0.005322011311848959, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 1.9915068745613098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21265029907226562, + "step": 12668 + }, + { + "epoch": 0.2534, + "grad_norm": 2.015625, + "grad_norm_var": 0.005204010009765625, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 1.787190020084381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20195094496011734, + "step": 12670 + }, + { + "epoch": 0.25344, + "grad_norm": 2.0625, + "grad_norm_var": 0.005036417643229167, + "learning_rate": 0.0001, + "loss": 4.3554, + "loss/crossentropy": 2.2403881549835205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313152700662613, + "step": 12672 + }, + { + "epoch": 0.25348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00540771484375, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 1.6122660636901855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19166506081819534, + "step": 12674 + }, + { + "epoch": 0.25352, + "grad_norm": 2.171875, + "grad_norm_var": 0.005402628580729167, + "learning_rate": 0.0001, + "loss": 4.1966, + "loss/crossentropy": 2.0758888125419617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19904160499572754, + "step": 12676 + }, + { + "epoch": 0.25356, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0064656575520833336, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 1.8765565156936646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17384758591651917, + "step": 12678 + }, + { + "epoch": 0.2536, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0054705301920572914, + "learning_rate": 0.0001, + "loss": 4.0977, + "loss/crossentropy": 2.1322853565216064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987880766391754, + "step": 12680 + }, + { + "epoch": 0.25364, + "grad_norm": 2.0, + "grad_norm_var": 0.006196848551432292, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.0303893089294434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912615954875946, + "step": 12682 + }, + { + "epoch": 0.25368, + "grad_norm": 2.109375, + "grad_norm_var": 0.0065826416015625, + "learning_rate": 0.0001, + "loss": 4.0022, + "loss/crossentropy": 2.2314319610595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22904963046312332, + "step": 12684 + }, + { + "epoch": 0.25372, + "grad_norm": 2.140625, + "grad_norm_var": 0.007438151041666666, + "learning_rate": 0.0001, + "loss": 4.1854, + "loss/crossentropy": 1.9751350283622742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20683745294809341, + "step": 12686 + }, + { + "epoch": 0.25376, + "grad_norm": 2.09375, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 4.112, + "loss/crossentropy": 1.9198334217071533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134660065174103, + "step": 12688 + }, + { + "epoch": 0.2538, + "grad_norm": 1.890625, + "grad_norm_var": 0.009214019775390625, + "learning_rate": 0.0001, + "loss": 3.8783, + "loss/crossentropy": 1.945135474205017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20948659628629684, + "step": 12690 + }, + { + "epoch": 0.25384, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0094879150390625, + "learning_rate": 0.0001, + "loss": 4.1018, + "loss/crossentropy": 2.146402955055237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21819791197776794, + "step": 12692 + }, + { + "epoch": 0.25388, + "grad_norm": 2.09375, + "grad_norm_var": 0.007806142171223958, + "learning_rate": 0.0001, + "loss": 4.3696, + "loss/crossentropy": 2.309106230735779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160014659166336, + "step": 12694 + }, + { + "epoch": 0.25392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 4.0637, + "loss/crossentropy": 2.1726107597351074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791994035243988, + "step": 12696 + }, + { + "epoch": 0.25396, + "grad_norm": 2.09375, + "grad_norm_var": 0.007370758056640625, + "learning_rate": 0.0001, + "loss": 4.252, + "loss/crossentropy": 2.0905996561050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083379328250885, + "step": 12698 + }, + { + "epoch": 0.254, + "grad_norm": 2.078125, + "grad_norm_var": 0.008185831705729167, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 1.9190585017204285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856430396437645, + "step": 12700 + }, + { + "epoch": 0.25404, + "grad_norm": 2.125, + "grad_norm_var": 0.007877604166666666, + "learning_rate": 0.0001, + "loss": 4.3906, + "loss/crossentropy": 2.2493419647216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197859406471252, + "step": 12702 + }, + { + "epoch": 0.25408, + "grad_norm": 2.171875, + "grad_norm_var": 0.007731119791666667, + "learning_rate": 0.0001, + "loss": 4.1935, + "loss/crossentropy": 2.0205613374710083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155362293124199, + "step": 12704 + }, + { + "epoch": 0.25412, + "grad_norm": 2.078125, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 1.8853323459625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20137012749910355, + "step": 12706 + }, + { + "epoch": 0.25416, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0061431884765625, + "learning_rate": 0.0001, + "loss": 4.098, + "loss/crossentropy": 2.109869122505188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036309316754341, + "step": 12708 + }, + { + "epoch": 0.2542, + "grad_norm": 2.078125, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 2.0598954558372498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22793744504451752, + "step": 12710 + }, + { + "epoch": 0.25424, + "grad_norm": 2.171875, + "grad_norm_var": 0.0055735270182291664, + "learning_rate": 0.0001, + "loss": 4.2984, + "loss/crossentropy": 2.169256567955017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20931441336870193, + "step": 12712 + }, + { + "epoch": 0.25428, + "grad_norm": 2.09375, + "grad_norm_var": 0.005197906494140625, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 1.8178632855415344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20344894379377365, + "step": 12714 + }, + { + "epoch": 0.25432, + "grad_norm": 2.046875, + "grad_norm_var": 0.004327138264973958, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.6468743085861206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16709627211093903, + "step": 12716 + }, + { + "epoch": 0.25436, + "grad_norm": 2.171875, + "grad_norm_var": 0.004748280843098958, + "learning_rate": 0.0001, + "loss": 4.1702, + "loss/crossentropy": 1.8353837728500366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20837673544883728, + "step": 12718 + }, + { + "epoch": 0.2544, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005492146809895833, + "learning_rate": 0.0001, + "loss": 3.8056, + "loss/crossentropy": 1.7881666421890259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18411727994680405, + "step": 12720 + }, + { + "epoch": 0.25444, + "grad_norm": 2.109375, + "grad_norm_var": 0.0065958658854166664, + "learning_rate": 0.0001, + "loss": 4.5369, + "loss/crossentropy": 2.19934618473053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22607161849737167, + "step": 12722 + }, + { + "epoch": 0.25448, + "grad_norm": 2.328125, + "grad_norm_var": 0.008821360270182292, + "learning_rate": 0.0001, + "loss": 4.5237, + "loss/crossentropy": 1.9789779782295227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156950756907463, + "step": 12724 + }, + { + "epoch": 0.25452, + "grad_norm": 1.921875, + "grad_norm_var": 0.010778554280598958, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 2.0605525970458984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622699707746506, + "step": 12726 + }, + { + "epoch": 0.25456, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011336263020833333, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.1131407022476196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21070127934217453, + "step": 12728 + }, + { + "epoch": 0.2546, + "grad_norm": 2.09375, + "grad_norm_var": 0.011336263020833333, + "learning_rate": 0.0001, + "loss": 4.3789, + "loss/crossentropy": 1.9708096981048584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22219926118850708, + "step": 12730 + }, + { + "epoch": 0.25464, + "grad_norm": 2.171875, + "grad_norm_var": 0.011918131510416667, + "learning_rate": 0.0001, + "loss": 4.2985, + "loss/crossentropy": 2.220608353614807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2271113097667694, + "step": 12732 + }, + { + "epoch": 0.25468, + "grad_norm": 2.046875, + "grad_norm_var": 0.01146240234375, + "learning_rate": 0.0001, + "loss": 4.2097, + "loss/crossentropy": 2.535509705543518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24400310218334198, + "step": 12734 + }, + { + "epoch": 0.25472, + "grad_norm": 2.390625, + "grad_norm_var": 0.015952301025390626, + "learning_rate": 0.0001, + "loss": 4.6837, + "loss/crossentropy": 2.3619974851608276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268039956688881, + "step": 12736 + }, + { + "epoch": 0.25476, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0168212890625, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.0095953941345215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850248634815216, + "step": 12738 + }, + { + "epoch": 0.2548, + "grad_norm": 2.03125, + "grad_norm_var": 0.012276204427083333, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 1.934277892112732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116999626159668, + "step": 12740 + }, + { + "epoch": 0.25484, + "grad_norm": 2.03125, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 4.336, + "loss/crossentropy": 2.237201452255249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22638197988271713, + "step": 12742 + }, + { + "epoch": 0.25488, + "grad_norm": 2.0, + "grad_norm_var": 0.011464182535807292, + "learning_rate": 0.0001, + "loss": 4.0354, + "loss/crossentropy": 2.302059292793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22285569459199905, + "step": 12744 + }, + { + "epoch": 0.25492, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 3.7097, + "loss/crossentropy": 1.7690886855125427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19209770113229752, + "step": 12746 + }, + { + "epoch": 0.25496, + "grad_norm": 2.171875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 4.2324, + "loss/crossentropy": 2.4372475147247314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192872166633606, + "step": 12748 + }, + { + "epoch": 0.255, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016405232747395835, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.12698757648468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21698793768882751, + "step": 12750 + }, + { + "epoch": 0.25504, + "grad_norm": 2.078125, + "grad_norm_var": 0.007710520426432292, + "learning_rate": 0.0001, + "loss": 3.9991, + "loss/crossentropy": 1.989953875541687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969183310866356, + "step": 12752 + }, + { + "epoch": 0.25508, + "grad_norm": 2.671875, + "grad_norm_var": 0.034501139322916666, + "learning_rate": 0.0001, + "loss": 4.5505, + "loss/crossentropy": 2.2911970615386963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21863195300102234, + "step": 12754 + }, + { + "epoch": 0.25512, + "grad_norm": 2.09375, + "grad_norm_var": 0.03460286458333333, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 1.8447460532188416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20167672634124756, + "step": 12756 + }, + { + "epoch": 0.25516, + "grad_norm": 1.96875, + "grad_norm_var": 0.035471343994140626, + "learning_rate": 0.0001, + "loss": 3.8488, + "loss/crossentropy": 2.043856382369995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075670212507248, + "step": 12758 + }, + { + "epoch": 0.2552, + "grad_norm": 2.15625, + "grad_norm_var": 0.037536366780598955, + "learning_rate": 0.0001, + "loss": 4.0893, + "loss/crossentropy": 1.735123872756958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19347237050533295, + "step": 12760 + }, + { + "epoch": 0.25524, + "grad_norm": 2.015625, + "grad_norm_var": 0.034407552083333334, + "learning_rate": 0.0001, + "loss": 4.0586, + "loss/crossentropy": 2.1058340072631836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21859320253133774, + "step": 12762 + }, + { + "epoch": 0.25528, + "grad_norm": 2.171875, + "grad_norm_var": 0.03444722493489583, + "learning_rate": 0.0001, + "loss": 4.2518, + "loss/crossentropy": 2.0583395957946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224592886865139, + "step": 12764 + }, + { + "epoch": 0.25532, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032206217447916664, + "learning_rate": 0.0001, + "loss": 4.3562, + "loss/crossentropy": 2.2550116777420044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23644013702869415, + "step": 12766 + }, + { + "epoch": 0.25536, + "grad_norm": 2.015625, + "grad_norm_var": 0.032293446858723956, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 2.463193655014038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229352444410324, + "step": 12768 + }, + { + "epoch": 0.2554, + "grad_norm": 2.0, + "grad_norm_var": 0.006400299072265625, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.010310709476471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725534856319427, + "step": 12770 + }, + { + "epoch": 0.25544, + "grad_norm": 1.921875, + "grad_norm_var": 0.006219228108723958, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.2002042531967163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21686428785324097, + "step": 12772 + }, + { + "epoch": 0.25548, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009690093994140624, + "learning_rate": 0.0001, + "loss": 4.2974, + "loss/crossentropy": 2.072916865348816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210928201675415, + "step": 12774 + }, + { + "epoch": 0.25552, + "grad_norm": 2.015625, + "grad_norm_var": 0.010149892171223958, + "learning_rate": 0.0001, + "loss": 4.3168, + "loss/crossentropy": 2.068341016769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20776809751987457, + "step": 12776 + }, + { + "epoch": 0.25556, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010949452718098959, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.6145030856132507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056947946548462, + "step": 12778 + }, + { + "epoch": 0.2556, + "grad_norm": 2.03125, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 2.1529496908187866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21802741289138794, + "step": 12780 + }, + { + "epoch": 0.25564, + "grad_norm": 2.03125, + "grad_norm_var": 0.011226399739583334, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.255640387535095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21794818341732025, + "step": 12782 + }, + { + "epoch": 0.25568, + "grad_norm": 2.03125, + "grad_norm_var": 0.011546834309895834, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 1.9934669137001038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21103205531835556, + "step": 12784 + }, + { + "epoch": 0.25572, + "grad_norm": 2.125, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.2924, + "loss/crossentropy": 1.9404807090759277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22771108150482178, + "step": 12786 + }, + { + "epoch": 0.25576, + "grad_norm": 2.078125, + "grad_norm_var": 0.012084706624348959, + "learning_rate": 0.0001, + "loss": 4.0444, + "loss/crossentropy": 1.7803818583488464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245180487632751, + "step": 12788 + }, + { + "epoch": 0.2558, + "grad_norm": 2.078125, + "grad_norm_var": 0.009423828125, + "learning_rate": 0.0001, + "loss": 4.2033, + "loss/crossentropy": 2.108216881752014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239757925271988, + "step": 12790 + }, + { + "epoch": 0.25584, + "grad_norm": 2.0625, + "grad_norm_var": 0.0059478759765625, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.056196451187134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573420077562332, + "step": 12792 + }, + { + "epoch": 0.25588, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007275390625, + "learning_rate": 0.0001, + "loss": 3.9749, + "loss/crossentropy": 2.2215099334716797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107079833745956, + "step": 12794 + }, + { + "epoch": 0.25592, + "grad_norm": 1.796875, + "grad_norm_var": 0.009281158447265625, + "learning_rate": 0.0001, + "loss": 3.9527, + "loss/crossentropy": 1.947695553302765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738437235355377, + "step": 12796 + }, + { + "epoch": 0.25596, + "grad_norm": 2.0625, + "grad_norm_var": 0.010204823811848958, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 2.3351560831069946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21714143455028534, + "step": 12798 + }, + { + "epoch": 0.256, + "grad_norm": 2.0625, + "grad_norm_var": 0.009798177083333333, + "learning_rate": 0.0001, + "loss": 4.3055, + "loss/crossentropy": 2.399898648262024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25758983194828033, + "step": 12800 + }, + { + "epoch": 0.25604, + "grad_norm": 2.671875, + "grad_norm_var": 0.03814697265625, + "learning_rate": 0.0001, + "loss": 4.5632, + "loss/crossentropy": 2.3537880182266235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2976065129041672, + "step": 12802 + }, + { + "epoch": 0.25608, + "grad_norm": 1.921875, + "grad_norm_var": 0.03857421875, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.7765586972236633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823679357767105, + "step": 12804 + }, + { + "epoch": 0.25612, + "grad_norm": 2.1875, + "grad_norm_var": 0.039793904622395834, + "learning_rate": 0.0001, + "loss": 4.4023, + "loss/crossentropy": 2.119332432746887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23052836954593658, + "step": 12806 + }, + { + "epoch": 0.25616, + "grad_norm": 2.109375, + "grad_norm_var": 0.0414947509765625, + "learning_rate": 0.0001, + "loss": 4.0251, + "loss/crossentropy": 1.8325288891792297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19811426103115082, + "step": 12808 + }, + { + "epoch": 0.2562, + "grad_norm": 1.953125, + "grad_norm_var": 0.039896392822265626, + "learning_rate": 0.0001, + "loss": 4.1089, + "loss/crossentropy": 2.079294800758362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122989803552628, + "step": 12810 + }, + { + "epoch": 0.25624, + "grad_norm": 2.078125, + "grad_norm_var": 0.03581110636393229, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 2.029142141342163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008090615272522, + "step": 12812 + }, + { + "epoch": 0.25628, + "grad_norm": 1.9453125, + "grad_norm_var": 0.03578058878580729, + "learning_rate": 0.0001, + "loss": 4.1743, + "loss/crossentropy": 1.8697097301483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19589021801948547, + "step": 12814 + }, + { + "epoch": 0.25632, + "grad_norm": 1.8203125, + "grad_norm_var": 0.038331858317057294, + "learning_rate": 0.0001, + "loss": 3.8837, + "loss/crossentropy": 1.8488793969154358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999422132968903, + "step": 12816 + }, + { + "epoch": 0.25636, + "grad_norm": 2.109375, + "grad_norm_var": 0.011946360270182291, + "learning_rate": 0.0001, + "loss": 4.236, + "loss/crossentropy": 2.352132201194763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23569129407405853, + "step": 12818 + }, + { + "epoch": 0.2564, + "grad_norm": 2.015625, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 4.2247, + "loss/crossentropy": 2.213895559310913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19782276451587677, + "step": 12820 + }, + { + "epoch": 0.25644, + "grad_norm": 1.96875, + "grad_norm_var": 0.008876291910807292, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.328965425491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138378769159317, + "step": 12822 + }, + { + "epoch": 0.25648, + "grad_norm": 1.9375, + "grad_norm_var": 0.007407379150390625, + "learning_rate": 0.0001, + "loss": 3.9178, + "loss/crossentropy": 2.148880124092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20276722311973572, + "step": 12824 + }, + { + "epoch": 0.25652, + "grad_norm": 2.109375, + "grad_norm_var": 0.007124582926432292, + "learning_rate": 0.0001, + "loss": 4.2742, + "loss/crossentropy": 2.1533660888671875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211932390928268, + "step": 12826 + }, + { + "epoch": 0.25656, + "grad_norm": 2.75, + "grad_norm_var": 0.04098078409830729, + "learning_rate": 0.0001, + "loss": 4.0928, + "loss/crossentropy": 1.7518101930618286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17689460515975952, + "step": 12828 + }, + { + "epoch": 0.2566, + "grad_norm": 2.109375, + "grad_norm_var": 0.0395660400390625, + "learning_rate": 0.0001, + "loss": 3.9745, + "loss/crossentropy": 2.346145749092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22157911956310272, + "step": 12830 + }, + { + "epoch": 0.25664, + "grad_norm": 2.09375, + "grad_norm_var": 0.0358062744140625, + "learning_rate": 0.0001, + "loss": 4.1689, + "loss/crossentropy": 1.8355774283409119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20288754999637604, + "step": 12832 + }, + { + "epoch": 0.25668, + "grad_norm": 1.984375, + "grad_norm_var": 0.03906962076822917, + "learning_rate": 0.0001, + "loss": 3.9947, + "loss/crossentropy": 1.949233889579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19860410690307617, + "step": 12834 + }, + { + "epoch": 0.25672, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04248046875, + "learning_rate": 0.0001, + "loss": 4.0903, + "loss/crossentropy": 2.0141645669937134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19531898200511932, + "step": 12836 + }, + { + "epoch": 0.25676, + "grad_norm": 2.015625, + "grad_norm_var": 0.04277725219726562, + "learning_rate": 0.0001, + "loss": 3.918, + "loss/crossentropy": 1.6573863625526428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.163545623421669, + "step": 12838 + }, + { + "epoch": 0.2568, + "grad_norm": 2.15625, + "grad_norm_var": 0.04273656209309896, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.2174651622772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083333671092987, + "step": 12840 + }, + { + "epoch": 0.25684, + "grad_norm": 1.953125, + "grad_norm_var": 0.045967356363932295, + "learning_rate": 0.0001, + "loss": 3.7593, + "loss/crossentropy": 1.8067168593406677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963481903076172, + "step": 12842 + }, + { + "epoch": 0.25688, + "grad_norm": 2.078125, + "grad_norm_var": 0.010536448160807291, + "learning_rate": 0.0001, + "loss": 3.936, + "loss/crossentropy": 1.928274691104889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21783485263586044, + "step": 12844 + }, + { + "epoch": 0.25692, + "grad_norm": 1.90625, + "grad_norm_var": 0.013498687744140625, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 2.183669090270996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040836587548256, + "step": 12846 + }, + { + "epoch": 0.25696, + "grad_norm": 2.015625, + "grad_norm_var": 0.012894439697265624, + "learning_rate": 0.0001, + "loss": 4.0624, + "loss/crossentropy": 1.7395422458648682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434035569429398, + "step": 12848 + }, + { + "epoch": 0.257, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012772623697916667, + "learning_rate": 0.0001, + "loss": 3.9914, + "loss/crossentropy": 1.9124428629875183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19825156033039093, + "step": 12850 + }, + { + "epoch": 0.25704, + "grad_norm": 2.09375, + "grad_norm_var": 0.011606597900390625, + "learning_rate": 0.0001, + "loss": 3.9325, + "loss/crossentropy": 1.8064388036727905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374487549066544, + "step": 12852 + }, + { + "epoch": 0.25708, + "grad_norm": 2.0, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 3.8722, + "loss/crossentropy": 2.0943931341171265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997963845729828, + "step": 12854 + }, + { + "epoch": 0.25712, + "grad_norm": 1.96875, + "grad_norm_var": 0.00948486328125, + "learning_rate": 0.0001, + "loss": 3.9974, + "loss/crossentropy": 1.9252876043319702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20608004182577133, + "step": 12856 + }, + { + "epoch": 0.25716, + "grad_norm": 2.015625, + "grad_norm_var": 0.007575480143229166, + "learning_rate": 0.0001, + "loss": 3.9536, + "loss/crossentropy": 1.813286304473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107577547430992, + "step": 12858 + }, + { + "epoch": 0.2572, + "grad_norm": 1.984375, + "grad_norm_var": 0.007405598958333333, + "learning_rate": 0.0001, + "loss": 4.1571, + "loss/crossentropy": 1.985919713973999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18724270164966583, + "step": 12860 + }, + { + "epoch": 0.25724, + "grad_norm": 1.921875, + "grad_norm_var": 0.004654693603515625, + "learning_rate": 0.0001, + "loss": 3.9377, + "loss/crossentropy": 1.912338137626648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19602636992931366, + "step": 12862 + }, + { + "epoch": 0.25728, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004743448893229167, + "learning_rate": 0.0001, + "loss": 4.0823, + "loss/crossentropy": 1.962310791015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975855752825737, + "step": 12864 + }, + { + "epoch": 0.25732, + "grad_norm": 2.140625, + "grad_norm_var": 0.0050046284993489586, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 2.063133656978607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012972503900528, + "step": 12866 + }, + { + "epoch": 0.25736, + "grad_norm": 2.15625, + "grad_norm_var": 0.005751291910807292, + "learning_rate": 0.0001, + "loss": 4.1443, + "loss/crossentropy": 2.1755728721618652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21089357137680054, + "step": 12868 + }, + { + "epoch": 0.2574, + "grad_norm": 2.09375, + "grad_norm_var": 0.006148274739583333, + "learning_rate": 0.0001, + "loss": 4.3854, + "loss/crossentropy": 2.21665620803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22525488585233688, + "step": 12870 + }, + { + "epoch": 0.25744, + "grad_norm": 2.140625, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 4.2465, + "loss/crossentropy": 2.2826790809631348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22989312559366226, + "step": 12872 + }, + { + "epoch": 0.25748, + "grad_norm": 1.875, + "grad_norm_var": 0.008432769775390625, + "learning_rate": 0.0001, + "loss": 4.0753, + "loss/crossentropy": 2.3187366724014282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22048770636320114, + "step": 12874 + }, + { + "epoch": 0.25752, + "grad_norm": 2.09375, + "grad_norm_var": 0.0093017578125, + "learning_rate": 0.0001, + "loss": 4.2469, + "loss/crossentropy": 2.234878957271576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875259697437286, + "step": 12876 + }, + { + "epoch": 0.25756, + "grad_norm": 2.046875, + "grad_norm_var": 0.007897694905598959, + "learning_rate": 0.0001, + "loss": 4.0981, + "loss/crossentropy": 1.8919751644134521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682695508003235, + "step": 12878 + }, + { + "epoch": 0.2576, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 3.9564, + "loss/crossentropy": 2.135833740234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100691795349121, + "step": 12880 + }, + { + "epoch": 0.25764, + "grad_norm": 1.96875, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.043276846408844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327290892601013, + "step": 12882 + }, + { + "epoch": 0.25768, + "grad_norm": 1.921875, + "grad_norm_var": 0.0077512105305989586, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 1.996088445186615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20499806851148605, + "step": 12884 + }, + { + "epoch": 0.25772, + "grad_norm": 1.984375, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 3.9821, + "loss/crossentropy": 1.7864345908164978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19354674220085144, + "step": 12886 + }, + { + "epoch": 0.25776, + "grad_norm": 2.03125, + "grad_norm_var": 0.0051310221354166664, + "learning_rate": 0.0001, + "loss": 3.961, + "loss/crossentropy": 1.7890866994857788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806325614452362, + "step": 12888 + }, + { + "epoch": 0.2578, + "grad_norm": 2.09375, + "grad_norm_var": 0.004938761393229167, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.9287649989128113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555142521858215, + "step": 12890 + }, + { + "epoch": 0.25784, + "grad_norm": 2.140625, + "grad_norm_var": 0.005081939697265625, + "learning_rate": 0.0001, + "loss": 3.9151, + "loss/crossentropy": 1.930963397026062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19455663859844208, + "step": 12892 + }, + { + "epoch": 0.25788, + "grad_norm": 2.109375, + "grad_norm_var": 0.007551829020182292, + "learning_rate": 0.0001, + "loss": 4.3216, + "loss/crossentropy": 2.1105018854141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22564146667718887, + "step": 12894 + }, + { + "epoch": 0.25792, + "grad_norm": 2.03125, + "grad_norm_var": 0.008365885416666666, + "learning_rate": 0.0001, + "loss": 4.1924, + "loss/crossentropy": 2.0535677671432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2083517387509346, + "step": 12896 + }, + { + "epoch": 0.25796, + "grad_norm": 2.015625, + "grad_norm_var": 0.007966105143229167, + "learning_rate": 0.0001, + "loss": 4.2951, + "loss/crossentropy": 2.062114655971527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20135939121246338, + "step": 12898 + }, + { + "epoch": 0.258, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012482706705729167, + "learning_rate": 0.0001, + "loss": 3.8187, + "loss/crossentropy": 1.968269407749176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238329470157623, + "step": 12900 + }, + { + "epoch": 0.25804, + "grad_norm": 2.03125, + "grad_norm_var": 0.0598541259765625, + "learning_rate": 0.0001, + "loss": 4.1899, + "loss/crossentropy": 2.0783804655075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23015306890010834, + "step": 12902 + }, + { + "epoch": 0.25808, + "grad_norm": 2.140625, + "grad_norm_var": 0.05815836588541667, + "learning_rate": 0.0001, + "loss": 4.1245, + "loss/crossentropy": 1.8616145253181458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19802623987197876, + "step": 12904 + }, + { + "epoch": 0.25812, + "grad_norm": 2.046875, + "grad_norm_var": 0.05834147135416667, + "learning_rate": 0.0001, + "loss": 4.2293, + "loss/crossentropy": 2.0005027651786804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21621856093406677, + "step": 12906 + }, + { + "epoch": 0.25816, + "grad_norm": 2.1875, + "grad_norm_var": 0.06181208292643229, + "learning_rate": 0.0001, + "loss": 4.0563, + "loss/crossentropy": 1.796087920665741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18594232201576233, + "step": 12908 + }, + { + "epoch": 0.2582, + "grad_norm": 2.1875, + "grad_norm_var": 0.06258316040039062, + "learning_rate": 0.0001, + "loss": 4.3161, + "loss/crossentropy": 2.003196358680725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19296320527791977, + "step": 12910 + }, + { + "epoch": 0.25824, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06313654581705729, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 1.8252119421958923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879737451672554, + "step": 12912 + }, + { + "epoch": 0.25828, + "grad_norm": 2.046875, + "grad_norm_var": 0.0629900614420573, + "learning_rate": 0.0001, + "loss": 4.2053, + "loss/crossentropy": 1.9801989793777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20867998152971268, + "step": 12914 + }, + { + "epoch": 0.25832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05583902994791667, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 2.04214608669281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21134068816900253, + "step": 12916 + }, + { + "epoch": 0.25836, + "grad_norm": 2.171875, + "grad_norm_var": 0.009159342447916666, + "learning_rate": 0.0001, + "loss": 4.2202, + "loss/crossentropy": 2.0669824481010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436694473028183, + "step": 12918 + }, + { + "epoch": 0.2584, + "grad_norm": 2.09375, + "grad_norm_var": 0.008771769205729167, + "learning_rate": 0.0001, + "loss": 4.3616, + "loss/crossentropy": 1.951616883277893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19572293758392334, + "step": 12920 + }, + { + "epoch": 0.25844, + "grad_norm": 1.953125, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 3.8476, + "loss/crossentropy": 1.7768954634666443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930573582649231, + "step": 12922 + }, + { + "epoch": 0.25848, + "grad_norm": 2.40625, + "grad_norm_var": 0.020458984375, + "learning_rate": 0.0001, + "loss": 3.8775, + "loss/crossentropy": 1.8527125716209412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682826310396194, + "step": 12924 + }, + { + "epoch": 0.25852, + "grad_norm": 2.0, + "grad_norm_var": 0.018990071614583333, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 2.3325828313827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21535523980855942, + "step": 12926 + }, + { + "epoch": 0.25856, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02063166300455729, + "learning_rate": 0.0001, + "loss": 3.7701, + "loss/crossentropy": 1.759222686290741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17470692098140717, + "step": 12928 + }, + { + "epoch": 0.2586, + "grad_norm": 1.921875, + "grad_norm_var": 0.021247355143229167, + "learning_rate": 0.0001, + "loss": 3.8065, + "loss/crossentropy": 1.944493055343628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259656757116318, + "step": 12930 + }, + { + "epoch": 0.25864, + "grad_norm": 1.984375, + "grad_norm_var": 0.05238825480143229, + "learning_rate": 0.0001, + "loss": 4.133, + "loss/crossentropy": 2.086555302143097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180738002061844, + "step": 12932 + }, + { + "epoch": 0.25868, + "grad_norm": 1.9453125, + "grad_norm_var": 0.052779134114583334, + "learning_rate": 0.0001, + "loss": 4.1414, + "loss/crossentropy": 1.9723011255264282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22814874351024628, + "step": 12934 + }, + { + "epoch": 0.25872, + "grad_norm": 2.125, + "grad_norm_var": 0.05356038411458333, + "learning_rate": 0.0001, + "loss": 4.418, + "loss/crossentropy": 1.5646896958351135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18182098120450974, + "step": 12936 + }, + { + "epoch": 0.25876, + "grad_norm": 1.8828125, + "grad_norm_var": 0.054870351155598955, + "learning_rate": 0.0001, + "loss": 3.8356, + "loss/crossentropy": 1.8396940231323242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19390598684549332, + "step": 12938 + }, + { + "epoch": 0.2588, + "grad_norm": 2.046875, + "grad_norm_var": 0.041112263997395836, + "learning_rate": 0.0001, + "loss": 4.5902, + "loss/crossentropy": 2.086738705635071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209588885307312, + "step": 12940 + }, + { + "epoch": 0.25884, + "grad_norm": 2.078125, + "grad_norm_var": 0.04136530558268229, + "learning_rate": 0.0001, + "loss": 4.2611, + "loss/crossentropy": 2.0965282917022705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21021046489477158, + "step": 12942 + }, + { + "epoch": 0.25888, + "grad_norm": 1.8125, + "grad_norm_var": 0.04311421712239583, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.2878576517105103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2203715667128563, + "step": 12944 + }, + { + "epoch": 0.25892, + "grad_norm": 1.96875, + "grad_norm_var": 0.045446523030598956, + "learning_rate": 0.0001, + "loss": 4.419, + "loss/crossentropy": 2.1753373742103577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21585986018180847, + "step": 12946 + }, + { + "epoch": 0.25896, + "grad_norm": 2.0625, + "grad_norm_var": 0.036232248942057295, + "learning_rate": 0.0001, + "loss": 4.3735, + "loss/crossentropy": 1.963489055633545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20374132692813873, + "step": 12948 + }, + { + "epoch": 0.259, + "grad_norm": 2.265625, + "grad_norm_var": 0.037751261393229166, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.3734259605407715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2375379502773285, + "step": 12950 + }, + { + "epoch": 0.25904, + "grad_norm": 1.9921875, + "grad_norm_var": 0.040710194905598955, + "learning_rate": 0.0001, + "loss": 4.184, + "loss/crossentropy": 2.266390085220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22274205088615417, + "step": 12952 + }, + { + "epoch": 0.25908, + "grad_norm": 2.046875, + "grad_norm_var": 0.03729654947916667, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 1.9015297293663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20370282977819443, + "step": 12954 + }, + { + "epoch": 0.25912, + "grad_norm": 2.03125, + "grad_norm_var": 0.03886617024739583, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 2.1192378997802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26288172602653503, + "step": 12956 + }, + { + "epoch": 0.25916, + "grad_norm": 2.046875, + "grad_norm_var": 0.03792088826497396, + "learning_rate": 0.0001, + "loss": 4.0982, + "loss/crossentropy": 1.713826835155487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557386100292206, + "step": 12958 + }, + { + "epoch": 0.2592, + "grad_norm": 1.9921875, + "grad_norm_var": 0.029545084635416666, + "learning_rate": 0.0001, + "loss": 4.2002, + "loss/crossentropy": 2.0365039706230164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086978912353516, + "step": 12960 + }, + { + "epoch": 0.25924, + "grad_norm": 2.078125, + "grad_norm_var": 0.027852376302083332, + "learning_rate": 0.0001, + "loss": 4.2422, + "loss/crossentropy": 1.6235400438308716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18112218379974365, + "step": 12962 + }, + { + "epoch": 0.25928, + "grad_norm": 2.03125, + "grad_norm_var": 0.0116455078125, + "learning_rate": 0.0001, + "loss": 4.2996, + "loss/crossentropy": 2.042620003223419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21463964879512787, + "step": 12964 + }, + { + "epoch": 0.25932, + "grad_norm": 2.046875, + "grad_norm_var": 0.009611002604166667, + "learning_rate": 0.0001, + "loss": 4.0222, + "loss/crossentropy": 1.6238983273506165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2171783745288849, + "step": 12966 + }, + { + "epoch": 0.25936, + "grad_norm": 2.4375, + "grad_norm_var": 0.014070383707682292, + "learning_rate": 0.0001, + "loss": 4.0098, + "loss/crossentropy": 2.3150339126586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22038520872592926, + "step": 12968 + }, + { + "epoch": 0.2594, + "grad_norm": 2.09375, + "grad_norm_var": 0.013301595052083334, + "learning_rate": 0.0001, + "loss": 4.2202, + "loss/crossentropy": 2.3907227516174316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2269512265920639, + "step": 12970 + }, + { + "epoch": 0.25944, + "grad_norm": 1.921875, + "grad_norm_var": 0.014378865559895834, + "learning_rate": 0.0001, + "loss": 4.1032, + "loss/crossentropy": 2.1069165468215942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21007738262414932, + "step": 12972 + }, + { + "epoch": 0.25948, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017942047119140624, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.277982234954834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21795185655355453, + "step": 12974 + }, + { + "epoch": 0.25952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017775217692057293, + "learning_rate": 0.0001, + "loss": 4.0159, + "loss/crossentropy": 1.7227254509925842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19050486385822296, + "step": 12976 + }, + { + "epoch": 0.25956, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0184722900390625, + "learning_rate": 0.0001, + "loss": 4.0281, + "loss/crossentropy": 2.0112340450286865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21032612770795822, + "step": 12978 + }, + { + "epoch": 0.2596, + "grad_norm": 2.140625, + "grad_norm_var": 0.019530232747395834, + "learning_rate": 0.0001, + "loss": 4.3041, + "loss/crossentropy": 2.4241796731948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22474834322929382, + "step": 12980 + }, + { + "epoch": 0.25964, + "grad_norm": 2.078125, + "grad_norm_var": 0.019710286458333334, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 1.893187701702118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330534130334854, + "step": 12982 + }, + { + "epoch": 0.25968, + "grad_norm": 1.9375, + "grad_norm_var": 0.008467356363932291, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 2.045100212097168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20636773109436035, + "step": 12984 + }, + { + "epoch": 0.25972, + "grad_norm": 1.8125, + "grad_norm_var": 0.010035959879557292, + "learning_rate": 0.0001, + "loss": 3.864, + "loss/crossentropy": 1.819455087184906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1775035709142685, + "step": 12986 + }, + { + "epoch": 0.25976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00985107421875, + "learning_rate": 0.0001, + "loss": 4.0553, + "loss/crossentropy": 2.1358631253242493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2027900367975235, + "step": 12988 + }, + { + "epoch": 0.2598, + "grad_norm": 2.1875, + "grad_norm_var": 0.010902659098307291, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 1.7847145199775696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203257888555527, + "step": 12990 + }, + { + "epoch": 0.25984, + "grad_norm": 2.109375, + "grad_norm_var": 0.011979166666666667, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.2848687171936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337154299020767, + "step": 12992 + }, + { + "epoch": 0.25988, + "grad_norm": 2.046875, + "grad_norm_var": 0.012889607747395834, + "learning_rate": 0.0001, + "loss": 3.975, + "loss/crossentropy": 1.9629716277122498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20660604536533356, + "step": 12994 + }, + { + "epoch": 0.25992, + "grad_norm": 2.0, + "grad_norm_var": 0.010461171468098959, + "learning_rate": 0.0001, + "loss": 4.1521, + "loss/crossentropy": 2.0321004390716553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21755626797676086, + "step": 12996 + }, + { + "epoch": 0.25996, + "grad_norm": 1.890625, + "grad_norm_var": 0.0105712890625, + "learning_rate": 0.0001, + "loss": 4.1208, + "loss/crossentropy": 1.950038492679596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987697035074234, + "step": 12998 + }, + { + "epoch": 0.26, + "grad_norm": 2.078125, + "grad_norm_var": 0.010544586181640624, + "learning_rate": 0.0001, + "loss": 4.3383, + "loss/crossentropy": 2.2752585411071777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959648489952087, + "step": 13000 + }, + { + "epoch": 0.26004, + "grad_norm": 2.125, + "grad_norm_var": 0.008138020833333334, + "learning_rate": 0.0001, + "loss": 4.3132, + "loss/crossentropy": 2.162104368209839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21731911599636078, + "step": 13002 + }, + { + "epoch": 0.26008, + "grad_norm": 1.890625, + "grad_norm_var": 0.009102121988932291, + "learning_rate": 0.0001, + "loss": 4.0138, + "loss/crossentropy": 2.3739081025123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22912397980690002, + "step": 13004 + }, + { + "epoch": 0.26012, + "grad_norm": 3.0625, + "grad_norm_var": 0.07608413696289062, + "learning_rate": 0.0001, + "loss": 4.3052, + "loss/crossentropy": 2.161367416381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22413842380046844, + "step": 13006 + }, + { + "epoch": 0.26016, + "grad_norm": 2.0625, + "grad_norm_var": 0.07587661743164062, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 1.9589285850524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598373353481293, + "step": 13008 + }, + { + "epoch": 0.2602, + "grad_norm": 2.09375, + "grad_norm_var": 0.0731842041015625, + "learning_rate": 0.0001, + "loss": 4.3268, + "loss/crossentropy": 2.3472602367401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626091539859772, + "step": 13010 + }, + { + "epoch": 0.26024, + "grad_norm": 1.984375, + "grad_norm_var": 0.07296727498372396, + "learning_rate": 0.0001, + "loss": 3.8455, + "loss/crossentropy": 1.9968576431274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777586847543716, + "step": 13012 + }, + { + "epoch": 0.26028, + "grad_norm": 2.140625, + "grad_norm_var": 0.0729400634765625, + "learning_rate": 0.0001, + "loss": 3.8219, + "loss/crossentropy": 1.7597955465316772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565816968679428, + "step": 13014 + }, + { + "epoch": 0.26032, + "grad_norm": 2.09375, + "grad_norm_var": 0.0735992431640625, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 1.8115187287330627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675451517105103, + "step": 13016 + }, + { + "epoch": 0.26036, + "grad_norm": 1.9453125, + "grad_norm_var": 0.07507909138997396, + "learning_rate": 0.0001, + "loss": 4.2018, + "loss/crossentropy": 1.9728147983551025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999082386493683, + "step": 13018 + }, + { + "epoch": 0.2604, + "grad_norm": 1.8828125, + "grad_norm_var": 0.07516988118489583, + "learning_rate": 0.0001, + "loss": 4.1434, + "loss/crossentropy": 2.155986785888672, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20160435140132904, + "step": 13020 + }, + { + "epoch": 0.26044, + "grad_norm": 2.015625, + "grad_norm_var": 0.0081298828125, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 2.119171440601349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231176346540451, + "step": 13022 + }, + { + "epoch": 0.26048, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010884348551432292, + "learning_rate": 0.0001, + "loss": 3.8185, + "loss/crossentropy": 2.0395994186401367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21075501292943954, + "step": 13024 + }, + { + "epoch": 0.26052, + "grad_norm": 2.28125, + "grad_norm_var": 0.014808909098307291, + "learning_rate": 0.0001, + "loss": 4.4988, + "loss/crossentropy": 2.42897891998291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24882060289382935, + "step": 13026 + }, + { + "epoch": 0.26056, + "grad_norm": 2.25, + "grad_norm_var": 0.018202463785807293, + "learning_rate": 0.0001, + "loss": 4.4995, + "loss/crossentropy": 2.367077350616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21996428817510605, + "step": 13028 + }, + { + "epoch": 0.2606, + "grad_norm": 1.875, + "grad_norm_var": 0.017207590738932292, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.071397542953491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20548687875270844, + "step": 13030 + }, + { + "epoch": 0.26064, + "grad_norm": 2.078125, + "grad_norm_var": 0.016943359375, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 1.764617681503296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19541263580322266, + "step": 13032 + }, + { + "epoch": 0.26068, + "grad_norm": 2.015625, + "grad_norm_var": 0.016916656494140626, + "learning_rate": 0.0001, + "loss": 4.2657, + "loss/crossentropy": 2.2966678142547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187207117676735, + "step": 13034 + }, + { + "epoch": 0.26072, + "grad_norm": 2.09375, + "grad_norm_var": 0.0159820556640625, + "learning_rate": 0.0001, + "loss": 4.2959, + "loss/crossentropy": 1.923275649547577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20180264115333557, + "step": 13036 + }, + { + "epoch": 0.26076, + "grad_norm": 2.171875, + "grad_norm_var": 0.017207845052083334, + "learning_rate": 0.0001, + "loss": 4.3138, + "loss/crossentropy": 2.246683716773987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133737802505493, + "step": 13038 + }, + { + "epoch": 0.2608, + "grad_norm": 2.046875, + "grad_norm_var": 0.011755116780598958, + "learning_rate": 0.0001, + "loss": 4.3685, + "loss/crossentropy": 2.271313190460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23316439241170883, + "step": 13040 + }, + { + "epoch": 0.26084, + "grad_norm": 2.0625, + "grad_norm_var": 0.008652496337890624, + "learning_rate": 0.0001, + "loss": 4.4451, + "loss/crossentropy": 2.2649654150009155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211005300283432, + "step": 13042 + }, + { + "epoch": 0.26088, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008006795247395834, + "learning_rate": 0.0001, + "loss": 3.9505, + "loss/crossentropy": 1.7872197031974792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17795034497976303, + "step": 13044 + }, + { + "epoch": 0.26092, + "grad_norm": 2.046875, + "grad_norm_var": 0.007875315348307292, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.793078064918518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18739672750234604, + "step": 13046 + }, + { + "epoch": 0.26096, + "grad_norm": 2.140625, + "grad_norm_var": 0.0111572265625, + "learning_rate": 0.0001, + "loss": 4.4178, + "loss/crossentropy": 2.0676932334899902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156887650489807, + "step": 13048 + }, + { + "epoch": 0.261, + "grad_norm": 2.046875, + "grad_norm_var": 0.013484700520833334, + "learning_rate": 0.0001, + "loss": 4.2851, + "loss/crossentropy": 2.2497498989105225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21906063705682755, + "step": 13050 + }, + { + "epoch": 0.26104, + "grad_norm": 2.015625, + "grad_norm_var": 0.014994303385416666, + "learning_rate": 0.0001, + "loss": 4.0203, + "loss/crossentropy": 1.901672899723053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070387825369835, + "step": 13052 + }, + { + "epoch": 0.26108, + "grad_norm": 2.109375, + "grad_norm_var": 0.013887532552083333, + "learning_rate": 0.0001, + "loss": 4.3772, + "loss/crossentropy": 2.2334693670272827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21539244055747986, + "step": 13054 + }, + { + "epoch": 0.26112, + "grad_norm": 2.0625, + "grad_norm_var": 0.014288075764973958, + "learning_rate": 0.0001, + "loss": 4.2916, + "loss/crossentropy": 2.0297399759292603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18854249268770218, + "step": 13056 + }, + { + "epoch": 0.26116, + "grad_norm": 2.09375, + "grad_norm_var": 0.014452107747395833, + "learning_rate": 0.0001, + "loss": 4.1115, + "loss/crossentropy": 2.0964863896369934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21821416169404984, + "step": 13058 + }, + { + "epoch": 0.2612, + "grad_norm": 2.09375, + "grad_norm_var": 0.013702138264973959, + "learning_rate": 0.0001, + "loss": 4.4918, + "loss/crossentropy": 2.1526511907577515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22606930136680603, + "step": 13060 + }, + { + "epoch": 0.26124, + "grad_norm": 2.109375, + "grad_norm_var": 0.0101470947265625, + "learning_rate": 0.0001, + "loss": 4.2079, + "loss/crossentropy": 2.2275509238243103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959830820560455, + "step": 13062 + }, + { + "epoch": 0.26128, + "grad_norm": 2.046875, + "grad_norm_var": 0.0086181640625, + "learning_rate": 0.0001, + "loss": 4.4246, + "loss/crossentropy": 2.542263627052307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22889885306358337, + "step": 13064 + }, + { + "epoch": 0.26132, + "grad_norm": 1.875, + "grad_norm_var": 0.008585611979166666, + "learning_rate": 0.0001, + "loss": 4.0292, + "loss/crossentropy": 1.7915868163108826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20242593437433243, + "step": 13066 + }, + { + "epoch": 0.26136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006453450520833333, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 2.183286130428314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227259561419487, + "step": 13068 + }, + { + "epoch": 0.2614, + "grad_norm": 2.046875, + "grad_norm_var": 0.0063720703125, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.2707515954971313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2300432324409485, + "step": 13070 + }, + { + "epoch": 0.26144, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0066487630208333336, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 1.822485864162445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19380664825439453, + "step": 13072 + }, + { + "epoch": 0.26148, + "grad_norm": 2.203125, + "grad_norm_var": 0.008349355061848958, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 2.089439034461975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531338036060333, + "step": 13074 + }, + { + "epoch": 0.26152, + "grad_norm": 2.21875, + "grad_norm_var": 0.008318837483723958, + "learning_rate": 0.0001, + "loss": 4.3872, + "loss/crossentropy": 2.0921221375465393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22256288677453995, + "step": 13076 + }, + { + "epoch": 0.26156, + "grad_norm": 2.1875, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 4.4644, + "loss/crossentropy": 1.8978914022445679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2677152305841446, + "step": 13078 + }, + { + "epoch": 0.2616, + "grad_norm": 2.296875, + "grad_norm_var": 0.011643218994140624, + "learning_rate": 0.0001, + "loss": 4.405, + "loss/crossentropy": 2.2540348768234253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24195441603660583, + "step": 13080 + }, + { + "epoch": 0.26164, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01092529296875, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.9546288847923279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19464369863271713, + "step": 13082 + }, + { + "epoch": 0.26168, + "grad_norm": 2.0625, + "grad_norm_var": 0.012457021077473958, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 2.1527568101882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22238580137491226, + "step": 13084 + }, + { + "epoch": 0.26172, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.3957, + "loss/crossentropy": 2.111591637134552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21360620856285095, + "step": 13086 + }, + { + "epoch": 0.26176, + "grad_norm": 1.953125, + "grad_norm_var": 0.01339111328125, + "learning_rate": 0.0001, + "loss": 3.99, + "loss/crossentropy": 2.006071925163269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19746285676956177, + "step": 13088 + }, + { + "epoch": 0.2618, + "grad_norm": 2.203125, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.2073, + "loss/crossentropy": 2.276778221130371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21222709119319916, + "step": 13090 + }, + { + "epoch": 0.26184, + "grad_norm": 2.015625, + "grad_norm_var": 0.013346354166666666, + "learning_rate": 0.0001, + "loss": 4.0679, + "loss/crossentropy": 2.102404534816742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049730271100998, + "step": 13092 + }, + { + "epoch": 0.26188, + "grad_norm": 2.015625, + "grad_norm_var": 0.013719685872395833, + "learning_rate": 0.0001, + "loss": 4.439, + "loss/crossentropy": 2.204727053642273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21260947734117508, + "step": 13094 + }, + { + "epoch": 0.26192, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010184478759765626, + "learning_rate": 0.0001, + "loss": 4.2801, + "loss/crossentropy": 2.3762032985687256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22091203182935715, + "step": 13096 + }, + { + "epoch": 0.26196, + "grad_norm": 1.90625, + "grad_norm_var": 0.010286458333333333, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.0710391998291016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20782632380723953, + "step": 13098 + }, + { + "epoch": 0.262, + "grad_norm": 2.09375, + "grad_norm_var": 0.009319814046223958, + "learning_rate": 0.0001, + "loss": 4.3257, + "loss/crossentropy": 2.193024158477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21109049022197723, + "step": 13100 + }, + { + "epoch": 0.26204, + "grad_norm": 2.109375, + "grad_norm_var": 0.009308878580729167, + "learning_rate": 0.0001, + "loss": 4.1863, + "loss/crossentropy": 1.9668392539024353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19249404221773148, + "step": 13102 + }, + { + "epoch": 0.26208, + "grad_norm": 2.015625, + "grad_norm_var": 0.00858154296875, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.2675795555114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26278799772262573, + "step": 13104 + }, + { + "epoch": 0.26212, + "grad_norm": 2.109375, + "grad_norm_var": 0.007857004801432291, + "learning_rate": 0.0001, + "loss": 3.9962, + "loss/crossentropy": 2.0119330883026123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20821017771959305, + "step": 13106 + }, + { + "epoch": 0.26216, + "grad_norm": 1.984375, + "grad_norm_var": 0.006237538655598959, + "learning_rate": 0.0001, + "loss": 4.329, + "loss/crossentropy": 2.1259734630584717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454438358545303, + "step": 13108 + }, + { + "epoch": 0.2622, + "grad_norm": 2.03125, + "grad_norm_var": 0.0043413798014322914, + "learning_rate": 0.0001, + "loss": 4.2558, + "loss/crossentropy": 1.8091979622840881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19353152066469193, + "step": 13110 + }, + { + "epoch": 0.26224, + "grad_norm": 2.0625, + "grad_norm_var": 0.0038655598958333335, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.9463382363319397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932665467262268, + "step": 13112 + }, + { + "epoch": 0.26228, + "grad_norm": 2.125, + "grad_norm_var": 0.004366048177083333, + "learning_rate": 0.0001, + "loss": 4.3975, + "loss/crossentropy": 2.475601077079773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23451963067054749, + "step": 13114 + }, + { + "epoch": 0.26232, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004412587483723958, + "learning_rate": 0.0001, + "loss": 4.4397, + "loss/crossentropy": 2.0833849906921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22380392253398895, + "step": 13116 + }, + { + "epoch": 0.26236, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005301920572916666, + "learning_rate": 0.0001, + "loss": 4.0408, + "loss/crossentropy": 1.9876453876495361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20823375135660172, + "step": 13118 + }, + { + "epoch": 0.2624, + "grad_norm": 2.15625, + "grad_norm_var": 0.006750233968098958, + "learning_rate": 0.0001, + "loss": 4.5321, + "loss/crossentropy": 2.184763789176941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24357233941555023, + "step": 13120 + }, + { + "epoch": 0.26244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061187744140625, + "learning_rate": 0.0001, + "loss": 4.3552, + "loss/crossentropy": 2.217886805534363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21944888681173325, + "step": 13122 + }, + { + "epoch": 0.26248, + "grad_norm": 2.0625, + "grad_norm_var": 0.0052886962890625, + "learning_rate": 0.0001, + "loss": 4.2443, + "loss/crossentropy": 2.3299126625061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158309668302536, + "step": 13124 + }, + { + "epoch": 0.26252, + "grad_norm": 2.09375, + "grad_norm_var": 0.0058349609375, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.0423209071159363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835570991039276, + "step": 13126 + }, + { + "epoch": 0.26256, + "grad_norm": 1.796875, + "grad_norm_var": 0.012325032552083334, + "learning_rate": 0.0001, + "loss": 4.31, + "loss/crossentropy": 2.227811813354492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043633684515953, + "step": 13128 + }, + { + "epoch": 0.2626, + "grad_norm": 2.078125, + "grad_norm_var": 0.011153157552083333, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 2.105396568775177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2212589681148529, + "step": 13130 + }, + { + "epoch": 0.26264, + "grad_norm": 2.234375, + "grad_norm_var": 0.012737782796223958, + "learning_rate": 0.0001, + "loss": 4.2244, + "loss/crossentropy": 2.1711814999580383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21528291702270508, + "step": 13132 + }, + { + "epoch": 0.26268, + "grad_norm": 1.9375, + "grad_norm_var": 0.011937459309895834, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 2.0086284279823303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974012851715088, + "step": 13134 + }, + { + "epoch": 0.26272, + "grad_norm": 2.0625, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 3.895, + "loss/crossentropy": 1.7025322914123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878160759806633, + "step": 13136 + }, + { + "epoch": 0.26276, + "grad_norm": 2.21875, + "grad_norm_var": 0.015950520833333332, + "learning_rate": 0.0001, + "loss": 4.0134, + "loss/crossentropy": 1.821410596370697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197972871363163, + "step": 13138 + }, + { + "epoch": 0.2628, + "grad_norm": 1.8671875, + "grad_norm_var": 0.018993886311848958, + "learning_rate": 0.0001, + "loss": 4.0875, + "loss/crossentropy": 2.0485053658485413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401400327682495, + "step": 13140 + }, + { + "epoch": 0.26284, + "grad_norm": 2.03125, + "grad_norm_var": 0.018808746337890626, + "learning_rate": 0.0001, + "loss": 4.4344, + "loss/crossentropy": 2.350088357925415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638252303004265, + "step": 13142 + }, + { + "epoch": 0.26288, + "grad_norm": 2.078125, + "grad_norm_var": 0.012308502197265625, + "learning_rate": 0.0001, + "loss": 4.5756, + "loss/crossentropy": 2.3423168659210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22735413163900375, + "step": 13144 + }, + { + "epoch": 0.26292, + "grad_norm": 2.0, + "grad_norm_var": 0.012033843994140625, + "learning_rate": 0.0001, + "loss": 4.3848, + "loss/crossentropy": 2.134036064147949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20407382398843765, + "step": 13146 + }, + { + "epoch": 0.26296, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.105, + "loss/crossentropy": 2.0744638442993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107524871826172, + "step": 13148 + }, + { + "epoch": 0.263, + "grad_norm": 1.9375, + "grad_norm_var": 0.012604777018229167, + "learning_rate": 0.0001, + "loss": 4.0507, + "loss/crossentropy": 2.1522003412246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046833261847496, + "step": 13150 + }, + { + "epoch": 0.26304, + "grad_norm": 2.140625, + "grad_norm_var": 0.013224283854166666, + "learning_rate": 0.0001, + "loss": 4.2754, + "loss/crossentropy": 2.328645348548889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24340055882930756, + "step": 13152 + }, + { + "epoch": 0.26308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009161122639973958, + "learning_rate": 0.0001, + "loss": 4.0502, + "loss/crossentropy": 2.0878008008003235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110893502831459, + "step": 13154 + }, + { + "epoch": 0.26312, + "grad_norm": 1.984375, + "grad_norm_var": 0.0074155171712239586, + "learning_rate": 0.0001, + "loss": 4.3065, + "loss/crossentropy": 1.9381126761436462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18893951922655106, + "step": 13156 + }, + { + "epoch": 0.26316, + "grad_norm": 2.21875, + "grad_norm_var": 0.009364573160807292, + "learning_rate": 0.0001, + "loss": 4.0254, + "loss/crossentropy": 2.153562545776367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973574310541153, + "step": 13158 + }, + { + "epoch": 0.2632, + "grad_norm": 2.203125, + "grad_norm_var": 0.011370595296223958, + "learning_rate": 0.0001, + "loss": 3.998, + "loss/crossentropy": 2.0583202242851257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19830025732517242, + "step": 13160 + }, + { + "epoch": 0.26324, + "grad_norm": 2.171875, + "grad_norm_var": 0.012117258707682292, + "learning_rate": 0.0001, + "loss": 4.4843, + "loss/crossentropy": 2.1951998472213745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916336566209793, + "step": 13162 + }, + { + "epoch": 0.26328, + "grad_norm": 2.078125, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.4944, + "loss/crossentropy": 2.1612678170204163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21040990203619003, + "step": 13164 + }, + { + "epoch": 0.26332, + "grad_norm": 2.078125, + "grad_norm_var": 0.009415690104166667, + "learning_rate": 0.0001, + "loss": 4.2012, + "loss/crossentropy": 1.9372112154960632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24868668615818024, + "step": 13166 + }, + { + "epoch": 0.26336, + "grad_norm": 2.015625, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.3605, + "loss/crossentropy": 2.0220513939857483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029200196266174, + "step": 13168 + }, + { + "epoch": 0.2634, + "grad_norm": 1.921875, + "grad_norm_var": 0.010355631510416666, + "learning_rate": 0.0001, + "loss": 3.7251, + "loss/crossentropy": 1.6749334335327148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18502884358167648, + "step": 13170 + }, + { + "epoch": 0.26344, + "grad_norm": 1.9375, + "grad_norm_var": 0.010096995035807292, + "learning_rate": 0.0001, + "loss": 4.1394, + "loss/crossentropy": 2.1463050842285156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21098337322473526, + "step": 13172 + }, + { + "epoch": 0.26348, + "grad_norm": 2.125, + "grad_norm_var": 0.009089914957682292, + "learning_rate": 0.0001, + "loss": 4.3261, + "loss/crossentropy": 2.20473051071167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23006527870893478, + "step": 13174 + }, + { + "epoch": 0.26352, + "grad_norm": 2.3125, + "grad_norm_var": 0.012068684895833333, + "learning_rate": 0.0001, + "loss": 4.3289, + "loss/crossentropy": 2.097456157207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077179104089737, + "step": 13176 + }, + { + "epoch": 0.26356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0124176025390625, + "learning_rate": 0.0001, + "loss": 4.402, + "loss/crossentropy": 2.136400580406189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152952179312706, + "step": 13178 + }, + { + "epoch": 0.2636, + "grad_norm": 1.921875, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.23, + "loss/crossentropy": 2.0791231393814087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19688992202281952, + "step": 13180 + }, + { + "epoch": 0.26364, + "grad_norm": 2.03125, + "grad_norm_var": 0.0126617431640625, + "learning_rate": 0.0001, + "loss": 4.1233, + "loss/crossentropy": 2.135149598121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163429111242294, + "step": 13182 + }, + { + "epoch": 0.26368, + "grad_norm": 2.171875, + "grad_norm_var": 0.013936360677083334, + "learning_rate": 0.0001, + "loss": 4.4215, + "loss/crossentropy": 2.3207671642303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24321655184030533, + "step": 13184 + }, + { + "epoch": 0.26372, + "grad_norm": 2.03125, + "grad_norm_var": 0.013069407145182291, + "learning_rate": 0.0001, + "loss": 4.0995, + "loss/crossentropy": 2.0136974453926086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19274070858955383, + "step": 13186 + }, + { + "epoch": 0.26376, + "grad_norm": 2.203125, + "grad_norm_var": 0.013610585530598959, + "learning_rate": 0.0001, + "loss": 4.5094, + "loss/crossentropy": 2.3542726039886475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24547156691551208, + "step": 13188 + }, + { + "epoch": 0.2638, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016007486979166666, + "learning_rate": 0.0001, + "loss": 4.068, + "loss/crossentropy": 1.905708134174347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18261709809303284, + "step": 13190 + }, + { + "epoch": 0.26384, + "grad_norm": 1.921875, + "grad_norm_var": 0.012147776285807292, + "learning_rate": 0.0001, + "loss": 4.1426, + "loss/crossentropy": 2.137321710586548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052912563085556, + "step": 13192 + }, + { + "epoch": 0.26388, + "grad_norm": 2.0625, + "grad_norm_var": 0.010827382405598959, + "learning_rate": 0.0001, + "loss": 4.2252, + "loss/crossentropy": 2.083053410053253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038491740822792, + "step": 13194 + }, + { + "epoch": 0.26392, + "grad_norm": 2.0, + "grad_norm_var": 0.011346181233723959, + "learning_rate": 0.0001, + "loss": 3.8349, + "loss/crossentropy": 1.8006438612937927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910744607448578, + "step": 13196 + }, + { + "epoch": 0.26396, + "grad_norm": 2.03125, + "grad_norm_var": 0.012902577718098959, + "learning_rate": 0.0001, + "loss": 4.4104, + "loss/crossentropy": 2.323628544807434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2280128449201584, + "step": 13198 + }, + { + "epoch": 0.264, + "grad_norm": 1.921875, + "grad_norm_var": 0.010137685139973958, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.9763594269752502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776062667369843, + "step": 13200 + }, + { + "epoch": 0.26404, + "grad_norm": 2.109375, + "grad_norm_var": 0.010601552327473958, + "learning_rate": 0.0001, + "loss": 4.0175, + "loss/crossentropy": 1.8776894807815552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19732315093278885, + "step": 13202 + }, + { + "epoch": 0.26408, + "grad_norm": 1.96875, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.0713, + "loss/crossentropy": 1.826314091682434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19153253734111786, + "step": 13204 + }, + { + "epoch": 0.26412, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 2.435014486312866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22149913012981415, + "step": 13206 + }, + { + "epoch": 0.26416, + "grad_norm": 2.4375, + "grad_norm_var": 0.01866633097330729, + "learning_rate": 0.0001, + "loss": 4.0829, + "loss/crossentropy": 2.055279493331909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21625792980194092, + "step": 13208 + }, + { + "epoch": 0.2642, + "grad_norm": 1.984375, + "grad_norm_var": 0.018641916910807292, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 2.4019049406051636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22626115381717682, + "step": 13210 + }, + { + "epoch": 0.26424, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017508697509765626, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 2.336918354034424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22773776203393936, + "step": 13212 + }, + { + "epoch": 0.26428, + "grad_norm": 2.03125, + "grad_norm_var": 0.01608454386393229, + "learning_rate": 0.0001, + "loss": 4.3929, + "loss/crossentropy": 2.0398528575897217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20827088505029678, + "step": 13214 + }, + { + "epoch": 0.26432, + "grad_norm": 2.03125, + "grad_norm_var": 0.015419260660807291, + "learning_rate": 0.0001, + "loss": 4.026, + "loss/crossentropy": 2.166857123374939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136707603931427, + "step": 13216 + }, + { + "epoch": 0.26436, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015095011393229166, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 1.8454258441925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20678912848234177, + "step": 13218 + }, + { + "epoch": 0.2644, + "grad_norm": 2.140625, + "grad_norm_var": 0.014642079671223959, + "learning_rate": 0.0001, + "loss": 4.2966, + "loss/crossentropy": 2.249597668647766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22785719484090805, + "step": 13220 + }, + { + "epoch": 0.26444, + "grad_norm": 1.90625, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.7729946374893188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014644593000412, + "step": 13222 + }, + { + "epoch": 0.26448, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004976145426432292, + "learning_rate": 0.0001, + "loss": 3.9144, + "loss/crossentropy": 1.7968116998672485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889382004737854, + "step": 13224 + }, + { + "epoch": 0.26452, + "grad_norm": 2.015625, + "grad_norm_var": 0.006510162353515625, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.0765512585639954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20551903545856476, + "step": 13226 + }, + { + "epoch": 0.26456, + "grad_norm": 2.203125, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.2007554173469543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250915989279747, + "step": 13228 + }, + { + "epoch": 0.2646, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008184560139973958, + "learning_rate": 0.0001, + "loss": 4.2014, + "loss/crossentropy": 2.319425046443939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22913866490125656, + "step": 13230 + }, + { + "epoch": 0.26464, + "grad_norm": 1.96875, + "grad_norm_var": 0.008365631103515625, + "learning_rate": 0.0001, + "loss": 4.1395, + "loss/crossentropy": 1.9845139980316162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769847929477692, + "step": 13232 + }, + { + "epoch": 0.26468, + "grad_norm": 1.96875, + "grad_norm_var": 0.008674112955729167, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.94157075881958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2314443141222, + "step": 13234 + }, + { + "epoch": 0.26472, + "grad_norm": 2.0, + "grad_norm_var": 0.007470703125, + "learning_rate": 0.0001, + "loss": 4.3428, + "loss/crossentropy": 2.0699294209480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22432812303304672, + "step": 13236 + }, + { + "epoch": 0.26476, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006841786702473958, + "learning_rate": 0.0001, + "loss": 3.983, + "loss/crossentropy": 1.7656533122062683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20837824791669846, + "step": 13238 + }, + { + "epoch": 0.2648, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 1.9473227858543396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20151428878307343, + "step": 13240 + }, + { + "epoch": 0.26484, + "grad_norm": 1.921875, + "grad_norm_var": 0.005387369791666667, + "learning_rate": 0.0001, + "loss": 4.284, + "loss/crossentropy": 2.060440719127655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041771411895752, + "step": 13242 + }, + { + "epoch": 0.26488, + "grad_norm": 1.8984375, + "grad_norm_var": 0.003922526041666667, + "learning_rate": 0.0001, + "loss": 3.7503, + "loss/crossentropy": 2.0440456867218018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041776031255722, + "step": 13244 + }, + { + "epoch": 0.26492, + "grad_norm": 2.109375, + "grad_norm_var": 0.004622395833333333, + "learning_rate": 0.0001, + "loss": 4.1921, + "loss/crossentropy": 1.9192551374435425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18886462599039078, + "step": 13246 + }, + { + "epoch": 0.26496, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0072100321451822914, + "learning_rate": 0.0001, + "loss": 4.1673, + "loss/crossentropy": 1.7112281918525696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18532422930002213, + "step": 13248 + }, + { + "epoch": 0.265, + "grad_norm": 2.0, + "grad_norm_var": 0.007637532552083334, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 1.9225355386734009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19994951784610748, + "step": 13250 + }, + { + "epoch": 0.26504, + "grad_norm": 2.0, + "grad_norm_var": 0.007950846354166667, + "learning_rate": 0.0001, + "loss": 3.6825, + "loss/crossentropy": 2.0243517756462097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785288721323013, + "step": 13252 + }, + { + "epoch": 0.26508, + "grad_norm": 1.984375, + "grad_norm_var": 0.007413482666015625, + "learning_rate": 0.0001, + "loss": 4.2929, + "loss/crossentropy": 2.255920171737671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21220286190509796, + "step": 13254 + }, + { + "epoch": 0.26512, + "grad_norm": 2.0625, + "grad_norm_var": 0.008475748697916667, + "learning_rate": 0.0001, + "loss": 4.4145, + "loss/crossentropy": 2.3024203777313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22255335003137589, + "step": 13256 + }, + { + "epoch": 0.26516, + "grad_norm": 2.03125, + "grad_norm_var": 0.00733642578125, + "learning_rate": 0.0001, + "loss": 4.1386, + "loss/crossentropy": 2.0467708110809326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193494111299515, + "step": 13258 + }, + { + "epoch": 0.2652, + "grad_norm": 2.03125, + "grad_norm_var": 0.006064605712890625, + "learning_rate": 0.0001, + "loss": 4.2188, + "loss/crossentropy": 2.1543976068496704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2257647141814232, + "step": 13260 + }, + { + "epoch": 0.26524, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006373850504557291, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.077743351459503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964918076992035, + "step": 13262 + }, + { + "epoch": 0.26528, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0044921875, + "learning_rate": 0.0001, + "loss": 3.963, + "loss/crossentropy": 1.6356236338615417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847509667277336, + "step": 13264 + }, + { + "epoch": 0.26532, + "grad_norm": 1.953125, + "grad_norm_var": 0.006925201416015625, + "learning_rate": 0.0001, + "loss": 4.3559, + "loss/crossentropy": 2.0876659750938416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080402821302414, + "step": 13266 + }, + { + "epoch": 0.26536, + "grad_norm": 2.109375, + "grad_norm_var": 0.005747222900390625, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 1.7057855129241943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18772340565919876, + "step": 13268 + }, + { + "epoch": 0.2654, + "grad_norm": 1.890625, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 4.0032, + "loss/crossentropy": 2.0550594329833984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18500665575265884, + "step": 13270 + }, + { + "epoch": 0.26544, + "grad_norm": 2.0, + "grad_norm_var": 0.011131795247395833, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.027509331703186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21377252787351608, + "step": 13272 + }, + { + "epoch": 0.26548, + "grad_norm": 1.921875, + "grad_norm_var": 0.012393951416015625, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.0517578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172185242176056, + "step": 13274 + }, + { + "epoch": 0.26552, + "grad_norm": 2.09375, + "grad_norm_var": 0.01395263671875, + "learning_rate": 0.0001, + "loss": 4.1628, + "loss/crossentropy": 2.0550750494003296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22374523431062698, + "step": 13276 + }, + { + "epoch": 0.26556, + "grad_norm": 2.09375, + "grad_norm_var": 0.013399251302083333, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 1.8055049777030945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17029780894517899, + "step": 13278 + }, + { + "epoch": 0.2656, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013114166259765626, + "learning_rate": 0.0001, + "loss": 4.3999, + "loss/crossentropy": 1.9650321006774902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327500253915787, + "step": 13280 + }, + { + "epoch": 0.26564, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010209147135416667, + "learning_rate": 0.0001, + "loss": 4.1395, + "loss/crossentropy": 2.164494276046753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21579623967409134, + "step": 13282 + }, + { + "epoch": 0.26568, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009810129801432291, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 2.0488376021385193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20213299989700317, + "step": 13284 + }, + { + "epoch": 0.26572, + "grad_norm": 2.0, + "grad_norm_var": 0.010308583577473959, + "learning_rate": 0.0001, + "loss": 3.7442, + "loss/crossentropy": 1.948447048664093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19875720143318176, + "step": 13286 + }, + { + "epoch": 0.26576, + "grad_norm": 1.875, + "grad_norm_var": 0.008487701416015625, + "learning_rate": 0.0001, + "loss": 4.2312, + "loss/crossentropy": 2.115865170955658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675103574991226, + "step": 13288 + }, + { + "epoch": 0.2658, + "grad_norm": 2.140625, + "grad_norm_var": 0.0105377197265625, + "learning_rate": 0.0001, + "loss": 4.2289, + "loss/crossentropy": 1.9720736145973206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975899696350098, + "step": 13290 + }, + { + "epoch": 0.26584, + "grad_norm": 2.015625, + "grad_norm_var": 0.009154256184895833, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.9105132222175598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20582708716392517, + "step": 13292 + }, + { + "epoch": 0.26588, + "grad_norm": 2.40625, + "grad_norm_var": 0.018369293212890624, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.089789867401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21264012902975082, + "step": 13294 + }, + { + "epoch": 0.26592, + "grad_norm": 2.03125, + "grad_norm_var": 0.018358357747395835, + "learning_rate": 0.0001, + "loss": 4.41, + "loss/crossentropy": 2.271156430244446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146492660045624, + "step": 13296 + }, + { + "epoch": 0.26596, + "grad_norm": 1.90625, + "grad_norm_var": 0.0196929931640625, + "learning_rate": 0.0001, + "loss": 3.8554, + "loss/crossentropy": 1.930766224861145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959189623594284, + "step": 13298 + }, + { + "epoch": 0.266, + "grad_norm": 2.25, + "grad_norm_var": 0.023374176025390624, + "learning_rate": 0.0001, + "loss": 4.3551, + "loss/crossentropy": 1.9945995807647705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959854125976562, + "step": 13300 + }, + { + "epoch": 0.26604, + "grad_norm": 2.125, + "grad_norm_var": 0.020637003580729167, + "learning_rate": 0.0001, + "loss": 4.5445, + "loss/crossentropy": 2.2128443717956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266601234674454, + "step": 13302 + }, + { + "epoch": 0.26608, + "grad_norm": 2.03125, + "grad_norm_var": 0.018049112955729165, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.076206088066101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037106677889824, + "step": 13304 + }, + { + "epoch": 0.26612, + "grad_norm": 2.171875, + "grad_norm_var": 0.018724568684895835, + "learning_rate": 0.0001, + "loss": 4.0512, + "loss/crossentropy": 2.077200174331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.228537917137146, + "step": 13306 + }, + { + "epoch": 0.26616, + "grad_norm": 2.0, + "grad_norm_var": 0.01874567667643229, + "learning_rate": 0.0001, + "loss": 3.9641, + "loss/crossentropy": 2.0368794202804565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150697112083435, + "step": 13308 + }, + { + "epoch": 0.2662, + "grad_norm": 1.9375, + "grad_norm_var": 0.016556803385416666, + "learning_rate": 0.0001, + "loss": 3.8516, + "loss/crossentropy": 1.9108307361602783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903306469321251, + "step": 13310 + }, + { + "epoch": 0.26624, + "grad_norm": 2.046875, + "grad_norm_var": 0.016218058268229165, + "learning_rate": 0.0001, + "loss": 3.9928, + "loss/crossentropy": 1.8118465542793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557358533143997, + "step": 13312 + }, + { + "epoch": 0.26628, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01736424763997396, + "learning_rate": 0.0001, + "loss": 3.897, + "loss/crossentropy": 1.7392275929450989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19214990735054016, + "step": 13314 + }, + { + "epoch": 0.26632, + "grad_norm": 3.359375, + "grad_norm_var": 0.12773412068684895, + "learning_rate": 0.0001, + "loss": 3.9728, + "loss/crossentropy": 2.1361005306243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22042304277420044, + "step": 13316 + }, + { + "epoch": 0.26636, + "grad_norm": 1.9375, + "grad_norm_var": 0.12786026000976564, + "learning_rate": 0.0001, + "loss": 4.0055, + "loss/crossentropy": 2.0087279677391052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278790593147278, + "step": 13318 + }, + { + "epoch": 0.2664, + "grad_norm": 2.09375, + "grad_norm_var": 0.12776692708333334, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.307933807373047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24929606914520264, + "step": 13320 + }, + { + "epoch": 0.26644, + "grad_norm": 2.078125, + "grad_norm_var": 0.12690022786458333, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.5539438724517822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219071164727211, + "step": 13322 + }, + { + "epoch": 0.26648, + "grad_norm": 1.953125, + "grad_norm_var": 0.12910334269205728, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.9069242477416992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19276316463947296, + "step": 13324 + }, + { + "epoch": 0.26652, + "grad_norm": 2.140625, + "grad_norm_var": 0.12194010416666666, + "learning_rate": 0.0001, + "loss": 4.535, + "loss/crossentropy": 2.0715484619140625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20079267024993896, + "step": 13326 + }, + { + "epoch": 0.26656, + "grad_norm": 2.15625, + "grad_norm_var": 0.12164713541666666, + "learning_rate": 0.0001, + "loss": 4.46, + "loss/crossentropy": 2.1134172677993774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20564979314804077, + "step": 13328 + }, + { + "epoch": 0.2666, + "grad_norm": 2.046875, + "grad_norm_var": 0.11581929524739583, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.103445053100586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19361437857151031, + "step": 13330 + }, + { + "epoch": 0.26664, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014499664306640625, + "learning_rate": 0.0001, + "loss": 4.0169, + "loss/crossentropy": 2.1882529258728027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21402588486671448, + "step": 13332 + }, + { + "epoch": 0.26668, + "grad_norm": 2.046875, + "grad_norm_var": 0.015498606363932292, + "learning_rate": 0.0001, + "loss": 4.3247, + "loss/crossentropy": 2.2531981468200684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019234597682953, + "step": 13334 + }, + { + "epoch": 0.26672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021954091389973958, + "learning_rate": 0.0001, + "loss": 3.8596, + "loss/crossentropy": 1.8077877759933472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19121356308460236, + "step": 13336 + }, + { + "epoch": 0.26676, + "grad_norm": 2.046875, + "grad_norm_var": 0.02600072224934896, + "learning_rate": 0.0001, + "loss": 4.3195, + "loss/crossentropy": 1.7387139797210693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20326050370931625, + "step": 13338 + }, + { + "epoch": 0.2668, + "grad_norm": 1.984375, + "grad_norm_var": 0.022712198893229167, + "learning_rate": 0.0001, + "loss": 4.065, + "loss/crossentropy": 2.2061930894851685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271326184272766, + "step": 13340 + }, + { + "epoch": 0.26684, + "grad_norm": 2.109375, + "grad_norm_var": 0.02191162109375, + "learning_rate": 0.0001, + "loss": 4.4018, + "loss/crossentropy": 2.096329092979431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201632410287857, + "step": 13342 + }, + { + "epoch": 0.26688, + "grad_norm": 1.9375, + "grad_norm_var": 0.0232574462890625, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.2015284299850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22004209458827972, + "step": 13344 + }, + { + "epoch": 0.26692, + "grad_norm": 2.125, + "grad_norm_var": 0.023957316080729166, + "learning_rate": 0.0001, + "loss": 4.5227, + "loss/crossentropy": 2.256517231464386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762817353010178, + "step": 13346 + }, + { + "epoch": 0.26696, + "grad_norm": 1.828125, + "grad_norm_var": 0.022874959309895835, + "learning_rate": 0.0001, + "loss": 3.8578, + "loss/crossentropy": 1.8551252484321594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18950944393873215, + "step": 13348 + }, + { + "epoch": 0.267, + "grad_norm": 2.0, + "grad_norm_var": 0.020967610677083335, + "learning_rate": 0.0001, + "loss": 4.3051, + "loss/crossentropy": 1.8834841847419739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17950639128684998, + "step": 13350 + }, + { + "epoch": 0.26704, + "grad_norm": 2.109375, + "grad_norm_var": 0.012872060139973959, + "learning_rate": 0.0001, + "loss": 4.0234, + "loss/crossentropy": 2.0059815645217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279331535100937, + "step": 13352 + }, + { + "epoch": 0.26708, + "grad_norm": 2.015625, + "grad_norm_var": 0.00594482421875, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 2.2327537536621094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22790935635566711, + "step": 13354 + }, + { + "epoch": 0.26712, + "grad_norm": 2.109375, + "grad_norm_var": 0.0066650390625, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 1.7623894214630127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17889687418937683, + "step": 13356 + }, + { + "epoch": 0.26716, + "grad_norm": 1.984375, + "grad_norm_var": 0.006205240885416667, + "learning_rate": 0.0001, + "loss": 4.1595, + "loss/crossentropy": 1.9240076541900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18552126735448837, + "step": 13358 + }, + { + "epoch": 0.2672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007387034098307292, + "learning_rate": 0.0001, + "loss": 3.9162, + "loss/crossentropy": 2.301589012145996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251981914043427, + "step": 13360 + }, + { + "epoch": 0.26724, + "grad_norm": 2.078125, + "grad_norm_var": 0.007671864827473959, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 2.144057512283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22049692273139954, + "step": 13362 + }, + { + "epoch": 0.26728, + "grad_norm": 2.03125, + "grad_norm_var": 0.006209309895833333, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 1.9329981207847595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015545517206192, + "step": 13364 + }, + { + "epoch": 0.26732, + "grad_norm": 2.046875, + "grad_norm_var": 0.0062896728515625, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 2.057462990283966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19826926290988922, + "step": 13366 + }, + { + "epoch": 0.26736, + "grad_norm": 2.03125, + "grad_norm_var": 0.006670888264973958, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 2.097291588783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18807460367679596, + "step": 13368 + }, + { + "epoch": 0.2674, + "grad_norm": 2.015625, + "grad_norm_var": 0.007281239827473958, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.154082179069519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22133710980415344, + "step": 13370 + }, + { + "epoch": 0.26744, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017561848958333334, + "learning_rate": 0.0001, + "loss": 4.1412, + "loss/crossentropy": 2.208779454231262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139016017317772, + "step": 13372 + }, + { + "epoch": 0.26748, + "grad_norm": 2.0, + "grad_norm_var": 0.018355305989583334, + "learning_rate": 0.0001, + "loss": 3.8929, + "loss/crossentropy": 2.251901865005493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22264431416988373, + "step": 13374 + }, + { + "epoch": 0.26752, + "grad_norm": 2.15625, + "grad_norm_var": 0.016857655843098958, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.4881285429000854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.250653475522995, + "step": 13376 + }, + { + "epoch": 0.26756, + "grad_norm": 2.140625, + "grad_norm_var": 0.017427571614583335, + "learning_rate": 0.0001, + "loss": 4.2806, + "loss/crossentropy": 2.104843556880951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19004888087511063, + "step": 13378 + }, + { + "epoch": 0.2676, + "grad_norm": 2.359375, + "grad_norm_var": 0.022899373372395834, + "learning_rate": 0.0001, + "loss": 4.0708, + "loss/crossentropy": 1.913047194480896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20572267472743988, + "step": 13380 + }, + { + "epoch": 0.26764, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02463353474934896, + "learning_rate": 0.0001, + "loss": 4.3632, + "loss/crossentropy": 2.1043838262557983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21029697358608246, + "step": 13382 + }, + { + "epoch": 0.26768, + "grad_norm": 2.1875, + "grad_norm_var": 0.02617162068684896, + "learning_rate": 0.0001, + "loss": 4.2429, + "loss/crossentropy": 2.055271625518799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113131284713745, + "step": 13384 + }, + { + "epoch": 0.26772, + "grad_norm": 2.71875, + "grad_norm_var": 0.0488433837890625, + "learning_rate": 0.0001, + "loss": 4.4956, + "loss/crossentropy": 1.8407886624336243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2729990780353546, + "step": 13386 + }, + { + "epoch": 0.26776, + "grad_norm": 2.0625, + "grad_norm_var": 0.04177017211914062, + "learning_rate": 0.0001, + "loss": 4.0097, + "loss/crossentropy": 2.0368717908859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21415862441062927, + "step": 13388 + }, + { + "epoch": 0.2678, + "grad_norm": 1.953125, + "grad_norm_var": 0.040897369384765625, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 1.8235292434692383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062123641371727, + "step": 13390 + }, + { + "epoch": 0.26784, + "grad_norm": 1.9375, + "grad_norm_var": 0.04222183227539063, + "learning_rate": 0.0001, + "loss": 3.8392, + "loss/crossentropy": 1.733048439025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651907473802567, + "step": 13392 + }, + { + "epoch": 0.26788, + "grad_norm": 1.9765625, + "grad_norm_var": 0.043454742431640624, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 2.1339367628097534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21730080246925354, + "step": 13394 + }, + { + "epoch": 0.26792, + "grad_norm": 1.890625, + "grad_norm_var": 0.0452789306640625, + "learning_rate": 0.0001, + "loss": 3.8421, + "loss/crossentropy": 1.885707974433899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20833835750818253, + "step": 13396 + }, + { + "epoch": 0.26796, + "grad_norm": 1.890625, + "grad_norm_var": 0.04429423014322917, + "learning_rate": 0.0001, + "loss": 3.9714, + "loss/crossentropy": 1.9896257519721985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938292756676674, + "step": 13398 + }, + { + "epoch": 0.268, + "grad_norm": 2.25, + "grad_norm_var": 0.04457575480143229, + "learning_rate": 0.0001, + "loss": 4.2962, + "loss/crossentropy": 2.0359573364257812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21572312712669373, + "step": 13400 + }, + { + "epoch": 0.26804, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014095052083333334, + "learning_rate": 0.0001, + "loss": 4.3483, + "loss/crossentropy": 2.241006851196289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2361859679222107, + "step": 13402 + }, + { + "epoch": 0.26808, + "grad_norm": 1.96875, + "grad_norm_var": 0.013852691650390625, + "learning_rate": 0.0001, + "loss": 4.0518, + "loss/crossentropy": 2.0988917350769043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220966637134552, + "step": 13404 + }, + { + "epoch": 0.26812, + "grad_norm": 2.078125, + "grad_norm_var": 0.014802805582682292, + "learning_rate": 0.0001, + "loss": 4.3664, + "loss/crossentropy": 2.0803651213645935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22251859307289124, + "step": 13406 + }, + { + "epoch": 0.26816, + "grad_norm": 2.046875, + "grad_norm_var": 0.015827433268229166, + "learning_rate": 0.0001, + "loss": 4.0473, + "loss/crossentropy": 1.9151242971420288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792036712169647, + "step": 13408 + }, + { + "epoch": 0.2682, + "grad_norm": 2.96875, + "grad_norm_var": 0.07619196573893229, + "learning_rate": 0.0001, + "loss": 4.0321, + "loss/crossentropy": 1.908549964427948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568355709314346, + "step": 13410 + }, + { + "epoch": 0.26824, + "grad_norm": 2.109375, + "grad_norm_var": 0.06887919108072917, + "learning_rate": 0.0001, + "loss": 4.3844, + "loss/crossentropy": 2.3226535320281982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2332221046090126, + "step": 13412 + }, + { + "epoch": 0.26828, + "grad_norm": 1.9765625, + "grad_norm_var": 0.06696548461914062, + "learning_rate": 0.0001, + "loss": 4.24, + "loss/crossentropy": 1.9767839312553406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038547545671463, + "step": 13414 + }, + { + "epoch": 0.26832, + "grad_norm": 1.859375, + "grad_norm_var": 0.06800028483072916, + "learning_rate": 0.0001, + "loss": 3.9592, + "loss/crossentropy": 2.2013003826141357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076154574751854, + "step": 13416 + }, + { + "epoch": 0.26836, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06597671508789063, + "learning_rate": 0.0001, + "loss": 4.1128, + "loss/crossentropy": 2.0941338539123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538556575775146, + "step": 13418 + }, + { + "epoch": 0.2684, + "grad_norm": 2.125, + "grad_norm_var": 0.06494954427083334, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.0206560492515564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20901528000831604, + "step": 13420 + }, + { + "epoch": 0.26844, + "grad_norm": 2.171875, + "grad_norm_var": 0.06690648396809896, + "learning_rate": 0.0001, + "loss": 4.2606, + "loss/crossentropy": 2.1317169070243835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21660470962524414, + "step": 13422 + }, + { + "epoch": 0.26848, + "grad_norm": 1.8203125, + "grad_norm_var": 0.07109273274739583, + "learning_rate": 0.0001, + "loss": 3.6026, + "loss/crossentropy": 1.631429135799408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1729787141084671, + "step": 13424 + }, + { + "epoch": 0.26852, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012491607666015625, + "learning_rate": 0.0001, + "loss": 3.923, + "loss/crossentropy": 1.8428707122802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18231894075870514, + "step": 13426 + }, + { + "epoch": 0.26856, + "grad_norm": 1.984375, + "grad_norm_var": 0.011104075113932292, + "learning_rate": 0.0001, + "loss": 4.2351, + "loss/crossentropy": 1.860277235507965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18769516795873642, + "step": 13428 + }, + { + "epoch": 0.2686, + "grad_norm": 1.890625, + "grad_norm_var": 0.012542470296223959, + "learning_rate": 0.0001, + "loss": 4.0199, + "loss/crossentropy": 2.02046799659729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2160683423280716, + "step": 13430 + }, + { + "epoch": 0.26864, + "grad_norm": 2.359375, + "grad_norm_var": 0.021491495768229167, + "learning_rate": 0.0001, + "loss": 4.5105, + "loss/crossentropy": 2.094564437866211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155898064374924, + "step": 13432 + }, + { + "epoch": 0.26868, + "grad_norm": 2.109375, + "grad_norm_var": 0.02215550740559896, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 2.013366401195526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126912623643875, + "step": 13434 + }, + { + "epoch": 0.26872, + "grad_norm": 2.015625, + "grad_norm_var": 0.02212702433268229, + "learning_rate": 0.0001, + "loss": 4.5826, + "loss/crossentropy": 2.360998272895813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355523630976677, + "step": 13436 + }, + { + "epoch": 0.26876, + "grad_norm": 1.7890625, + "grad_norm_var": 0.02289606730143229, + "learning_rate": 0.0001, + "loss": 3.7129, + "loss/crossentropy": 1.754651427268982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666986376047134, + "step": 13438 + }, + { + "epoch": 0.2688, + "grad_norm": 2.03125, + "grad_norm_var": 0.01812922159830729, + "learning_rate": 0.0001, + "loss": 4.1024, + "loss/crossentropy": 1.9176424741744995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19241741299629211, + "step": 13440 + }, + { + "epoch": 0.26884, + "grad_norm": 2.078125, + "grad_norm_var": 0.0154449462890625, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 2.324304223060608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22302107512950897, + "step": 13442 + }, + { + "epoch": 0.26888, + "grad_norm": 2.5, + "grad_norm_var": 0.028758748372395834, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.8527624011039734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644367158412933, + "step": 13444 + }, + { + "epoch": 0.26892, + "grad_norm": 2.1875, + "grad_norm_var": 0.027570597330729165, + "learning_rate": 0.0001, + "loss": 4.4437, + "loss/crossentropy": 2.0999260544776917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21172572672367096, + "step": 13446 + }, + { + "epoch": 0.26896, + "grad_norm": 1.90625, + "grad_norm_var": 0.026627349853515624, + "learning_rate": 0.0001, + "loss": 3.8125, + "loss/crossentropy": 1.8577081561088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17302225530147552, + "step": 13448 + }, + { + "epoch": 0.269, + "grad_norm": 1.921875, + "grad_norm_var": 0.026956939697265626, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 2.11561119556427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20249811559915543, + "step": 13450 + }, + { + "epoch": 0.26904, + "grad_norm": 1.921875, + "grad_norm_var": 0.026656087239583334, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.3036913871765137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101947337388992, + "step": 13452 + }, + { + "epoch": 0.26908, + "grad_norm": 2.078125, + "grad_norm_var": 0.02341283162434896, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.0498175621032715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19935546070337296, + "step": 13454 + }, + { + "epoch": 0.26912, + "grad_norm": 2.09375, + "grad_norm_var": 0.08842137654622396, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 2.27053964138031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31231266260147095, + "step": 13456 + }, + { + "epoch": 0.26916, + "grad_norm": 1.921875, + "grad_norm_var": 0.0908953348795573, + "learning_rate": 0.0001, + "loss": 4.0035, + "loss/crossentropy": 2.191486120223999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058379054069519, + "step": 13458 + }, + { + "epoch": 0.2692, + "grad_norm": 2.015625, + "grad_norm_var": 0.07780939737955729, + "learning_rate": 0.0001, + "loss": 4.1193, + "loss/crossentropy": 2.008154332637787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862725377082825, + "step": 13460 + }, + { + "epoch": 0.26924, + "grad_norm": 2.109375, + "grad_norm_var": 0.07779947916666667, + "learning_rate": 0.0001, + "loss": 4.3191, + "loss/crossentropy": 2.3374814987182617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24534277617931366, + "step": 13462 + }, + { + "epoch": 0.26928, + "grad_norm": 2.0625, + "grad_norm_var": 0.07355855305989584, + "learning_rate": 0.0001, + "loss": 4.0558, + "loss/crossentropy": 2.2654502391815186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21406329423189163, + "step": 13464 + }, + { + "epoch": 0.26932, + "grad_norm": 2.21875, + "grad_norm_var": 0.07412821451822917, + "learning_rate": 0.0001, + "loss": 4.2078, + "loss/crossentropy": 2.1021856665611267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19653601199388504, + "step": 13466 + }, + { + "epoch": 0.26936, + "grad_norm": 2.140625, + "grad_norm_var": 0.0727068583170573, + "learning_rate": 0.0001, + "loss": 4.4466, + "loss/crossentropy": 1.9482329487800598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20822357386350632, + "step": 13468 + }, + { + "epoch": 0.2694, + "grad_norm": 1.8125, + "grad_norm_var": 0.07814915974934895, + "learning_rate": 0.0001, + "loss": 4.1036, + "loss/crossentropy": 2.122725486755371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132444903254509, + "step": 13470 + }, + { + "epoch": 0.26944, + "grad_norm": 2.0, + "grad_norm_var": 0.015044911702473959, + "learning_rate": 0.0001, + "loss": 4.0337, + "loss/crossentropy": 1.760430932044983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835266426205635, + "step": 13472 + }, + { + "epoch": 0.26948, + "grad_norm": 2.046875, + "grad_norm_var": 0.014847564697265624, + "learning_rate": 0.0001, + "loss": 4.0338, + "loss/crossentropy": 1.610491931438446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19731693714857101, + "step": 13474 + }, + { + "epoch": 0.26952, + "grad_norm": 2.03125, + "grad_norm_var": 0.015313466389973959, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 2.1966941356658936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22074176371097565, + "step": 13476 + }, + { + "epoch": 0.26956, + "grad_norm": 1.8515625, + "grad_norm_var": 0.029904937744140624, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 2.045714259147644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193753719329834, + "step": 13478 + }, + { + "epoch": 0.2696, + "grad_norm": 2.0, + "grad_norm_var": 0.029886881510416668, + "learning_rate": 0.0001, + "loss": 4.0127, + "loss/crossentropy": 1.860603928565979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900448128581047, + "step": 13480 + }, + { + "epoch": 0.26964, + "grad_norm": 2.125, + "grad_norm_var": 0.028449503580729167, + "learning_rate": 0.0001, + "loss": 4.0643, + "loss/crossentropy": 2.171591639518738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21908972412347794, + "step": 13482 + }, + { + "epoch": 0.26968, + "grad_norm": 2.296875, + "grad_norm_var": 0.030265299479166667, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.271330237388611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21823390573263168, + "step": 13484 + }, + { + "epoch": 0.26972, + "grad_norm": 1.953125, + "grad_norm_var": 0.026875813802083332, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.155557870864868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20512574911117554, + "step": 13486 + }, + { + "epoch": 0.26976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.027164459228515625, + "learning_rate": 0.0001, + "loss": 4.2561, + "loss/crossentropy": 2.3827039003372192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22485698014497757, + "step": 13488 + }, + { + "epoch": 0.2698, + "grad_norm": 1.9375, + "grad_norm_var": 0.028636678059895834, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.225351929664612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22235903143882751, + "step": 13490 + }, + { + "epoch": 0.26984, + "grad_norm": 2.015625, + "grad_norm_var": 0.027717081705729167, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 2.1125447750091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206717401742935, + "step": 13492 + }, + { + "epoch": 0.26988, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013639068603515625, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 2.30017626285553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22321031987667084, + "step": 13494 + }, + { + "epoch": 0.26992, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0145904541015625, + "learning_rate": 0.0001, + "loss": 4.3979, + "loss/crossentropy": 2.416603446006775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2413351908326149, + "step": 13496 + }, + { + "epoch": 0.26996, + "grad_norm": 2.046875, + "grad_norm_var": 0.013185373942057292, + "learning_rate": 0.0001, + "loss": 4.0436, + "loss/crossentropy": 2.06734299659729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23090296238660812, + "step": 13498 + }, + { + "epoch": 0.27, + "grad_norm": 1.96875, + "grad_norm_var": 0.0050961812337239586, + "learning_rate": 0.0001, + "loss": 4.0, + "loss/crossentropy": 1.8088473677635193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988995596766472, + "step": 13500 + }, + { + "epoch": 0.27004, + "grad_norm": 2.015625, + "grad_norm_var": 0.005092112223307291, + "learning_rate": 0.0001, + "loss": 4.0205, + "loss/crossentropy": 1.7075524926185608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18463176488876343, + "step": 13502 + }, + { + "epoch": 0.27008, + "grad_norm": 2.21875, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.2291, + "loss/crossentropy": 2.1184898018836975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20941460877656937, + "step": 13504 + }, + { + "epoch": 0.27012, + "grad_norm": 1.890625, + "grad_norm_var": 0.007736968994140625, + "learning_rate": 0.0001, + "loss": 4.0139, + "loss/crossentropy": 1.8351567387580872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913155794143677, + "step": 13506 + }, + { + "epoch": 0.27016, + "grad_norm": 1.8515625, + "grad_norm_var": 0.00914306640625, + "learning_rate": 0.0001, + "loss": 3.935, + "loss/crossentropy": 2.090391516685486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21020452678203583, + "step": 13508 + }, + { + "epoch": 0.2702, + "grad_norm": 2.046875, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 4.1805, + "loss/crossentropy": 2.0680218935012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20998934656381607, + "step": 13510 + }, + { + "epoch": 0.27024, + "grad_norm": 1.90625, + "grad_norm_var": 0.007236480712890625, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 2.258071780204773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634688764810562, + "step": 13512 + }, + { + "epoch": 0.27028, + "grad_norm": 2.15625, + "grad_norm_var": 0.03350397745768229, + "learning_rate": 0.0001, + "loss": 4.775, + "loss/crossentropy": 1.931319773197174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.29265259206295013, + "step": 13514 + }, + { + "epoch": 0.27032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03398844401041667, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 2.45454478263855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22465243190526962, + "step": 13516 + }, + { + "epoch": 0.27036, + "grad_norm": 2.046875, + "grad_norm_var": 0.034063466389973956, + "learning_rate": 0.0001, + "loss": 4.2279, + "loss/crossentropy": 2.005887746810913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818827509880066, + "step": 13518 + }, + { + "epoch": 0.2704, + "grad_norm": 2.03125, + "grad_norm_var": 0.031107330322265626, + "learning_rate": 0.0001, + "loss": 4.318, + "loss/crossentropy": 2.2329577207565308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21015001833438873, + "step": 13520 + }, + { + "epoch": 0.27044, + "grad_norm": 1.8359375, + "grad_norm_var": 0.032373046875, + "learning_rate": 0.0001, + "loss": 4.1026, + "loss/crossentropy": 2.1716688871383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20930524915456772, + "step": 13522 + }, + { + "epoch": 0.27048, + "grad_norm": 1.9609375, + "grad_norm_var": 0.030651601155598958, + "learning_rate": 0.0001, + "loss": 3.957, + "loss/crossentropy": 1.659675419330597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963459849357605, + "step": 13524 + }, + { + "epoch": 0.27052, + "grad_norm": 1.9140625, + "grad_norm_var": 0.035359700520833336, + "learning_rate": 0.0001, + "loss": 4.2214, + "loss/crossentropy": 1.9285706877708435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1708521470427513, + "step": 13526 + }, + { + "epoch": 0.27056, + "grad_norm": 2.046875, + "grad_norm_var": 0.03286031087239583, + "learning_rate": 0.0001, + "loss": 4.1771, + "loss/crossentropy": 2.0428953170776367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20893662422895432, + "step": 13528 + }, + { + "epoch": 0.2706, + "grad_norm": 2.078125, + "grad_norm_var": 0.011655426025390625, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.979922592639923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19091727584600449, + "step": 13530 + }, + { + "epoch": 0.27064, + "grad_norm": 1.984375, + "grad_norm_var": 0.015922037760416667, + "learning_rate": 0.0001, + "loss": 4.3176, + "loss/crossentropy": 2.1879321336746216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24813223630189896, + "step": 13532 + }, + { + "epoch": 0.27068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01631647745768229, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.0450875759124756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21059003472328186, + "step": 13534 + }, + { + "epoch": 0.27072, + "grad_norm": 1.984375, + "grad_norm_var": 0.01673151652018229, + "learning_rate": 0.0001, + "loss": 4.3549, + "loss/crossentropy": 2.050394892692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235206723213196, + "step": 13536 + }, + { + "epoch": 0.27076, + "grad_norm": 2.0, + "grad_norm_var": 0.014793904622395833, + "learning_rate": 0.0001, + "loss": 4.0128, + "loss/crossentropy": 2.0687233805656433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22125782817602158, + "step": 13538 + }, + { + "epoch": 0.2708, + "grad_norm": 2.046875, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 1.8587198853492737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19718395173549652, + "step": 13540 + }, + { + "epoch": 0.27084, + "grad_norm": 2.125, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 4.2992, + "loss/crossentropy": 2.2416387796401978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23797060549259186, + "step": 13542 + }, + { + "epoch": 0.27088, + "grad_norm": 4.8125, + "grad_norm_var": 0.495751953125, + "learning_rate": 0.0001, + "loss": 4.1875, + "loss/crossentropy": 2.251328468322754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23915010690689087, + "step": 13544 + }, + { + "epoch": 0.27092, + "grad_norm": 2.125, + "grad_norm_var": 0.4896074930826823, + "learning_rate": 0.0001, + "loss": 4.0601, + "loss/crossentropy": 1.990403652191162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710015833377838, + "step": 13546 + }, + { + "epoch": 0.27096, + "grad_norm": 2.09375, + "grad_norm_var": 0.4871070861816406, + "learning_rate": 0.0001, + "loss": 4.38, + "loss/crossentropy": 2.176861047744751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153458073735237, + "step": 13548 + }, + { + "epoch": 0.271, + "grad_norm": 1.953125, + "grad_norm_var": 0.4867286682128906, + "learning_rate": 0.0001, + "loss": 4.0311, + "loss/crossentropy": 1.581967830657959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787950098514557, + "step": 13550 + }, + { + "epoch": 0.27104, + "grad_norm": 1.90625, + "grad_norm_var": 0.4920183817545573, + "learning_rate": 0.0001, + "loss": 3.9803, + "loss/crossentropy": 1.827072560787201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18847094476222992, + "step": 13552 + }, + { + "epoch": 0.27108, + "grad_norm": 2.03125, + "grad_norm_var": 0.49279683430989585, + "learning_rate": 0.0001, + "loss": 3.9381, + "loss/crossentropy": 1.5748514533042908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18339257687330246, + "step": 13554 + }, + { + "epoch": 0.27112, + "grad_norm": 2.125, + "grad_norm_var": 0.4897989908854167, + "learning_rate": 0.0001, + "loss": 4.2545, + "loss/crossentropy": 2.3187586069107056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535529136657715, + "step": 13556 + }, + { + "epoch": 0.27116, + "grad_norm": 2.15625, + "grad_norm_var": 0.4885660807291667, + "learning_rate": 0.0001, + "loss": 4.4848, + "loss/crossentropy": 2.3675668239593506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23080945760011673, + "step": 13558 + }, + { + "epoch": 0.2712, + "grad_norm": 2.0625, + "grad_norm_var": 0.0066912333170572914, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.3976542949676514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418106645345688, + "step": 13560 + }, + { + "epoch": 0.27124, + "grad_norm": 1.984375, + "grad_norm_var": 0.006221262613932291, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 2.1341328024864197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21845046430826187, + "step": 13562 + }, + { + "epoch": 0.27128, + "grad_norm": 2.046875, + "grad_norm_var": 0.006154123942057292, + "learning_rate": 0.0001, + "loss": 4.3067, + "loss/crossentropy": 2.0519689321517944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21991144120693207, + "step": 13564 + }, + { + "epoch": 0.27132, + "grad_norm": 2.0, + "grad_norm_var": 0.005812327067057292, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 1.9690070748329163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20945511013269424, + "step": 13566 + }, + { + "epoch": 0.27136, + "grad_norm": 2.09375, + "grad_norm_var": 0.004609934488932292, + "learning_rate": 0.0001, + "loss": 4.0929, + "loss/crossentropy": 1.9553492069244385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299523577094078, + "step": 13568 + }, + { + "epoch": 0.2714, + "grad_norm": 2.25, + "grad_norm_var": 0.0055501302083333336, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 1.8214278817176819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801338642835617, + "step": 13570 + }, + { + "epoch": 0.27144, + "grad_norm": 1.7578125, + "grad_norm_var": 0.011822255452473958, + "learning_rate": 0.0001, + "loss": 3.8658, + "loss/crossentropy": 1.7834638953208923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18421506136655807, + "step": 13572 + }, + { + "epoch": 0.27148, + "grad_norm": 1.984375, + "grad_norm_var": 0.013097890218098958, + "learning_rate": 0.0001, + "loss": 3.8404, + "loss/crossentropy": 1.7703429460525513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263264745473862, + "step": 13574 + }, + { + "epoch": 0.27152, + "grad_norm": 2.21875, + "grad_norm_var": 0.0155181884765625, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 2.228934168815613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21034922450780869, + "step": 13576 + }, + { + "epoch": 0.27156, + "grad_norm": 2.046875, + "grad_norm_var": 0.015379842122395833, + "learning_rate": 0.0001, + "loss": 4.4665, + "loss/crossentropy": 2.160655975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300932556390762, + "step": 13578 + }, + { + "epoch": 0.2716, + "grad_norm": 2.078125, + "grad_norm_var": 0.0150634765625, + "learning_rate": 0.0001, + "loss": 4.2042, + "loss/crossentropy": 2.1582624912261963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21843481063842773, + "step": 13580 + }, + { + "epoch": 0.27164, + "grad_norm": 2.03125, + "grad_norm_var": 0.0150146484375, + "learning_rate": 0.0001, + "loss": 4.2513, + "loss/crossentropy": 2.0660162568092346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985679790377617, + "step": 13582 + }, + { + "epoch": 0.27168, + "grad_norm": 2.0625, + "grad_norm_var": 0.015433502197265626, + "learning_rate": 0.0001, + "loss": 3.873, + "loss/crossentropy": 1.839695692062378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772506907582283, + "step": 13584 + }, + { + "epoch": 0.27172, + "grad_norm": 2.015625, + "grad_norm_var": 0.012345123291015624, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 2.0837132930755615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19461007416248322, + "step": 13586 + }, + { + "epoch": 0.27176, + "grad_norm": 2.09375, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 4.1986, + "loss/crossentropy": 1.910473346710205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20272061973810196, + "step": 13588 + }, + { + "epoch": 0.2718, + "grad_norm": 2.921875, + "grad_norm_var": 0.054402669270833336, + "learning_rate": 0.0001, + "loss": 4.2259, + "loss/crossentropy": 2.2427467107772827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21372541785240173, + "step": 13590 + }, + { + "epoch": 0.27184, + "grad_norm": 2.0625, + "grad_norm_var": 0.055272420247395836, + "learning_rate": 0.0001, + "loss": 4.1123, + "loss/crossentropy": 2.165425181388855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2552504763007164, + "step": 13592 + }, + { + "epoch": 0.27188, + "grad_norm": 1.96875, + "grad_norm_var": 0.05620930989583333, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 1.9684009552001953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894393533468246, + "step": 13594 + }, + { + "epoch": 0.27192, + "grad_norm": 2.046875, + "grad_norm_var": 0.055863444010416666, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.0572606325149536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20954644680023193, + "step": 13596 + }, + { + "epoch": 0.27196, + "grad_norm": 1.78125, + "grad_norm_var": 0.0611724853515625, + "learning_rate": 0.0001, + "loss": 3.5511, + "loss/crossentropy": 1.8215171694755554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905388981103897, + "step": 13598 + }, + { + "epoch": 0.272, + "grad_norm": 2.28125, + "grad_norm_var": 0.06299209594726562, + "learning_rate": 0.0001, + "loss": 4.2407, + "loss/crossentropy": 2.114215850830078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21045731008052826, + "step": 13600 + }, + { + "epoch": 0.27204, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06243057250976562, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 2.301407814025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148626372218132, + "step": 13602 + }, + { + "epoch": 0.27208, + "grad_norm": 1.9453125, + "grad_norm_var": 0.06350809733072917, + "learning_rate": 0.0001, + "loss": 4.4367, + "loss/crossentropy": 2.0508424639701843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21292225271463394, + "step": 13604 + }, + { + "epoch": 0.27212, + "grad_norm": 2.1875, + "grad_norm_var": 0.014826456705729166, + "learning_rate": 0.0001, + "loss": 4.0024, + "loss/crossentropy": 2.095071792602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258864879608154, + "step": 13606 + }, + { + "epoch": 0.27216, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013802083333333333, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.3613405227661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20384693890810013, + "step": 13608 + }, + { + "epoch": 0.2722, + "grad_norm": 2.0625, + "grad_norm_var": 0.0142578125, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 2.1618664264678955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21691302955150604, + "step": 13610 + }, + { + "epoch": 0.27224, + "grad_norm": 2.03125, + "grad_norm_var": 0.015620676676432292, + "learning_rate": 0.0001, + "loss": 3.8335, + "loss/crossentropy": 1.6352717280387878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18613358587026596, + "step": 13612 + }, + { + "epoch": 0.27228, + "grad_norm": 2.15625, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 4.2405, + "loss/crossentropy": 2.304627537727356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21715758740901947, + "step": 13614 + }, + { + "epoch": 0.27232, + "grad_norm": 2.078125, + "grad_norm_var": 0.009357706705729166, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 2.232232451438904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997462421655655, + "step": 13616 + }, + { + "epoch": 0.27236, + "grad_norm": 2.109375, + "grad_norm_var": 0.011503092447916667, + "learning_rate": 0.0001, + "loss": 4.1273, + "loss/crossentropy": 2.0350120663642883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753738284111023, + "step": 13618 + }, + { + "epoch": 0.2724, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 1.892416536808014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920388638973236, + "step": 13620 + }, + { + "epoch": 0.27244, + "grad_norm": 2.015625, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.3283, + "loss/crossentropy": 2.1694198846817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217724971473217, + "step": 13622 + }, + { + "epoch": 0.27248, + "grad_norm": 1.96875, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.1418001651763916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016705498099327, + "step": 13624 + }, + { + "epoch": 0.27252, + "grad_norm": 2.171875, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.3992, + "loss/crossentropy": 2.32223117351532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616849303245544, + "step": 13626 + }, + { + "epoch": 0.27256, + "grad_norm": 2.03125, + "grad_norm_var": 0.013044230143229167, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 1.9991823434829712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077011615037918, + "step": 13628 + }, + { + "epoch": 0.2726, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012434641520182291, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 2.1381043195724487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748750865459442, + "step": 13630 + }, + { + "epoch": 0.27264, + "grad_norm": 2.015625, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.2215, + "loss/crossentropy": 2.018588602542877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897169411182404, + "step": 13632 + }, + { + "epoch": 0.27268, + "grad_norm": 2.15625, + "grad_norm_var": 0.01114501953125, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.1859233379364014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21695572137832642, + "step": 13634 + }, + { + "epoch": 0.27272, + "grad_norm": 2.015625, + "grad_norm_var": 0.0060198465983072914, + "learning_rate": 0.0001, + "loss": 4.2106, + "loss/crossentropy": 2.17002010345459, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21605493128299713, + "step": 13636 + }, + { + "epoch": 0.27276, + "grad_norm": 2.046875, + "grad_norm_var": 0.006058502197265625, + "learning_rate": 0.0001, + "loss": 4.261, + "loss/crossentropy": 2.186649441719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172158733010292, + "step": 13638 + }, + { + "epoch": 0.2728, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 3.8394, + "loss/crossentropy": 1.7324808835983276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17334696650505066, + "step": 13640 + }, + { + "epoch": 0.27284, + "grad_norm": 2.03125, + "grad_norm_var": 0.0049550374348958336, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 1.7768054008483887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1750154346227646, + "step": 13642 + }, + { + "epoch": 0.27288, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0047515869140625, + "learning_rate": 0.0001, + "loss": 4.2292, + "loss/crossentropy": 1.9629738330841064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19489944726228714, + "step": 13644 + }, + { + "epoch": 0.27292, + "grad_norm": 2.109375, + "grad_norm_var": 0.006882476806640625, + "learning_rate": 0.0001, + "loss": 4.4096, + "loss/crossentropy": 2.144750416278839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19367601722478867, + "step": 13646 + }, + { + "epoch": 0.27296, + "grad_norm": 2.03125, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 1.9281827211380005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894402474164963, + "step": 13648 + }, + { + "epoch": 0.273, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007132720947265625, + "learning_rate": 0.0001, + "loss": 4.1998, + "loss/crossentropy": 1.8092535138130188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18536384403705597, + "step": 13650 + }, + { + "epoch": 0.27304, + "grad_norm": 2.046875, + "grad_norm_var": 0.007726796468098958, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 2.5707184076309204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23067744076251984, + "step": 13652 + }, + { + "epoch": 0.27308, + "grad_norm": 2.03125, + "grad_norm_var": 0.008434804280598958, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 2.108555316925049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052021473646164, + "step": 13654 + }, + { + "epoch": 0.27312, + "grad_norm": 1.984375, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 4.3412, + "loss/crossentropy": 1.836738109588623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734501630067825, + "step": 13656 + }, + { + "epoch": 0.27316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007990519205729166, + "learning_rate": 0.0001, + "loss": 4.0783, + "loss/crossentropy": 2.223781406879425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618071407079697, + "step": 13658 + }, + { + "epoch": 0.2732, + "grad_norm": 2.0, + "grad_norm_var": 0.007207997639973958, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.028991222381592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579702407121658, + "step": 13660 + }, + { + "epoch": 0.27324, + "grad_norm": 2.015625, + "grad_norm_var": 0.006915028889973958, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.1003119349479675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21900994330644608, + "step": 13662 + }, + { + "epoch": 0.27328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007289377848307291, + "learning_rate": 0.0001, + "loss": 4.1695, + "loss/crossentropy": 2.1813069581985474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117675319314003, + "step": 13664 + }, + { + "epoch": 0.27332, + "grad_norm": 2.140625, + "grad_norm_var": 0.0105621337890625, + "learning_rate": 0.0001, + "loss": 3.8666, + "loss/crossentropy": 2.2203832864761353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20088640600442886, + "step": 13666 + }, + { + "epoch": 0.27336, + "grad_norm": 1.90625, + "grad_norm_var": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 1.8810867071151733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646882802248, + "step": 13668 + }, + { + "epoch": 0.2734, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010060373942057292, + "learning_rate": 0.0001, + "loss": 4.0746, + "loss/crossentropy": 2.2147200107574463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038613259792328, + "step": 13670 + }, + { + "epoch": 0.27344, + "grad_norm": 2.078125, + "grad_norm_var": 0.008408355712890624, + "learning_rate": 0.0001, + "loss": 4.2151, + "loss/crossentropy": 2.3654199838638306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22163932025432587, + "step": 13672 + }, + { + "epoch": 0.27348, + "grad_norm": 1.9375, + "grad_norm_var": 0.0086669921875, + "learning_rate": 0.0001, + "loss": 4.1584, + "loss/crossentropy": 2.422420859336853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23172692954540253, + "step": 13674 + }, + { + "epoch": 0.27352, + "grad_norm": 2.015625, + "grad_norm_var": 0.008634440104166667, + "learning_rate": 0.0001, + "loss": 4.391, + "loss/crossentropy": 2.0776702165603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030162662267685, + "step": 13676 + }, + { + "epoch": 0.27356, + "grad_norm": 2.125, + "grad_norm_var": 0.009611002604166667, + "learning_rate": 0.0001, + "loss": 3.9108, + "loss/crossentropy": 1.7956212162971497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932467818260193, + "step": 13678 + }, + { + "epoch": 0.2736, + "grad_norm": 2.046875, + "grad_norm_var": 0.010573069254557291, + "learning_rate": 0.0001, + "loss": 4.1768, + "loss/crossentropy": 2.176727533340454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20450271666049957, + "step": 13680 + }, + { + "epoch": 0.27364, + "grad_norm": 1.90625, + "grad_norm_var": 0.0062255859375, + "learning_rate": 0.0001, + "loss": 4.061, + "loss/crossentropy": 2.0534290075302124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21150045841932297, + "step": 13682 + }, + { + "epoch": 0.27368, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061920166015625, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 1.726151466369629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256682693958282, + "step": 13684 + }, + { + "epoch": 0.27372, + "grad_norm": 2.09375, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 2.2024354934692383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222209945321083, + "step": 13686 + }, + { + "epoch": 0.27376, + "grad_norm": 2.171875, + "grad_norm_var": 0.007347615559895834, + "learning_rate": 0.0001, + "loss": 4.3252, + "loss/crossentropy": 2.095518469810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2092512547969818, + "step": 13688 + }, + { + "epoch": 0.2738, + "grad_norm": 2.078125, + "grad_norm_var": 0.0074460347493489586, + "learning_rate": 0.0001, + "loss": 4.0389, + "loss/crossentropy": 1.9978103637695312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20576970279216766, + "step": 13690 + }, + { + "epoch": 0.27384, + "grad_norm": 1.875, + "grad_norm_var": 0.008599599202473959, + "learning_rate": 0.0001, + "loss": 4.1023, + "loss/crossentropy": 2.233831286430359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20514021068811417, + "step": 13692 + }, + { + "epoch": 0.27388, + "grad_norm": 1.921875, + "grad_norm_var": 0.010251617431640625, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 1.6729156970977783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18054980039596558, + "step": 13694 + }, + { + "epoch": 0.27392, + "grad_norm": 2.140625, + "grad_norm_var": 0.010910797119140624, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.0113388895988464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21645111590623856, + "step": 13696 + }, + { + "epoch": 0.27396, + "grad_norm": 2.140625, + "grad_norm_var": 0.010469563802083333, + "learning_rate": 0.0001, + "loss": 4.3487, + "loss/crossentropy": 1.9311461448669434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23587358742952347, + "step": 13698 + }, + { + "epoch": 0.274, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01889012654622396, + "learning_rate": 0.0001, + "loss": 4.1099, + "loss/crossentropy": 2.1205984354019165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20603877305984497, + "step": 13700 + }, + { + "epoch": 0.27404, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01889012654622396, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.1700201630592346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21855031698942184, + "step": 13702 + }, + { + "epoch": 0.27408, + "grad_norm": 1.8828125, + "grad_norm_var": 0.022484334309895833, + "learning_rate": 0.0001, + "loss": 3.7717, + "loss/crossentropy": 1.5643411874771118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.161862351000309, + "step": 13704 + }, + { + "epoch": 0.27412, + "grad_norm": 1.8671875, + "grad_norm_var": 0.024881998697916668, + "learning_rate": 0.0001, + "loss": 3.7292, + "loss/crossentropy": 2.032666802406311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749235153198242, + "step": 13706 + }, + { + "epoch": 0.27416, + "grad_norm": 1.890625, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.0899, + "loss/crossentropy": 1.9313859939575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19065556675195694, + "step": 13708 + }, + { + "epoch": 0.2742, + "grad_norm": 2.0, + "grad_norm_var": 0.02225519816080729, + "learning_rate": 0.0001, + "loss": 4.3295, + "loss/crossentropy": 2.1200226545333862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967960000038147, + "step": 13710 + }, + { + "epoch": 0.27424, + "grad_norm": 1.953125, + "grad_norm_var": 0.020765940348307293, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 2.144526958465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019693672657013, + "step": 13712 + }, + { + "epoch": 0.27428, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01809666951497396, + "learning_rate": 0.0001, + "loss": 4.2039, + "loss/crossentropy": 1.9802407622337341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20123689621686935, + "step": 13714 + }, + { + "epoch": 0.27432, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0062334696451822914, + "learning_rate": 0.0001, + "loss": 3.9415, + "loss/crossentropy": 2.096527099609375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282836258411407, + "step": 13716 + }, + { + "epoch": 0.27436, + "grad_norm": 2.046875, + "grad_norm_var": 0.005558013916015625, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 2.1037757992744446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039530873298645, + "step": 13718 + }, + { + "epoch": 0.2744, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004752349853515625, + "learning_rate": 0.0001, + "loss": 3.8416, + "loss/crossentropy": 1.5415751934051514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18478797376155853, + "step": 13720 + }, + { + "epoch": 0.27444, + "grad_norm": 2.015625, + "grad_norm_var": 0.001859283447265625, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.0357913970947266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20176240801811218, + "step": 13722 + }, + { + "epoch": 0.27448, + "grad_norm": 1.8125, + "grad_norm_var": 0.0034543355305989582, + "learning_rate": 0.0001, + "loss": 3.8472, + "loss/crossentropy": 1.9013367891311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012690126895905, + "step": 13724 + }, + { + "epoch": 0.27452, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00343017578125, + "learning_rate": 0.0001, + "loss": 3.8072, + "loss/crossentropy": 1.5798682570457458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18169572949409485, + "step": 13726 + }, + { + "epoch": 0.27456, + "grad_norm": 1.984375, + "grad_norm_var": 0.0034624735514322915, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.196288228034973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22311393916606903, + "step": 13728 + }, + { + "epoch": 0.2746, + "grad_norm": 1.96875, + "grad_norm_var": 0.003928375244140625, + "learning_rate": 0.0001, + "loss": 4.0172, + "loss/crossentropy": 1.714774489402771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17624646425247192, + "step": 13730 + }, + { + "epoch": 0.27464, + "grad_norm": 2.15625, + "grad_norm_var": 0.0053484598795572914, + "learning_rate": 0.0001, + "loss": 4.3264, + "loss/crossentropy": 1.9853646159172058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18827050924301147, + "step": 13732 + }, + { + "epoch": 0.27468, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 1.7830212116241455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801598072052002, + "step": 13734 + }, + { + "epoch": 0.27472, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008046213785807292, + "learning_rate": 0.0001, + "loss": 3.7923, + "loss/crossentropy": 1.7400763034820557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930072009563446, + "step": 13736 + }, + { + "epoch": 0.27476, + "grad_norm": 2.046875, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.0992, + "loss/crossentropy": 2.2292455434799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573973655700684, + "step": 13738 + }, + { + "epoch": 0.2748, + "grad_norm": 2.09375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.4305, + "loss/crossentropy": 2.1399097442626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265445813536644, + "step": 13740 + }, + { + "epoch": 0.27484, + "grad_norm": 1.953125, + "grad_norm_var": 0.009366861979166667, + "learning_rate": 0.0001, + "loss": 4.0596, + "loss/crossentropy": 2.049125075340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20623435080051422, + "step": 13742 + }, + { + "epoch": 0.27488, + "grad_norm": 2.078125, + "grad_norm_var": 0.010689036051432291, + "learning_rate": 0.0001, + "loss": 4.2678, + "loss/crossentropy": 2.388680577278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650053143501282, + "step": 13744 + }, + { + "epoch": 0.27492, + "grad_norm": 2.390625, + "grad_norm_var": 0.019577789306640624, + "learning_rate": 0.0001, + "loss": 4.5091, + "loss/crossentropy": 2.05421245098114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19932237267494202, + "step": 13746 + }, + { + "epoch": 0.27496, + "grad_norm": 2.0, + "grad_norm_var": 0.01969172159830729, + "learning_rate": 0.0001, + "loss": 3.8274, + "loss/crossentropy": 1.892092227935791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907898187637329, + "step": 13748 + }, + { + "epoch": 0.275, + "grad_norm": 2.109375, + "grad_norm_var": 0.018507639567057293, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.160037875175476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24229361861944199, + "step": 13750 + }, + { + "epoch": 0.27504, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017451985677083334, + "learning_rate": 0.0001, + "loss": 4.1278, + "loss/crossentropy": 2.5185710191726685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2394627332687378, + "step": 13752 + }, + { + "epoch": 0.27508, + "grad_norm": 1.953125, + "grad_norm_var": 0.01754735310872396, + "learning_rate": 0.0001, + "loss": 4.1551, + "loss/crossentropy": 2.0613635778427124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19758722186088562, + "step": 13754 + }, + { + "epoch": 0.27512, + "grad_norm": 2.046875, + "grad_norm_var": 0.019496409098307292, + "learning_rate": 0.0001, + "loss": 4.1905, + "loss/crossentropy": 1.9780999422073364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21773972362279892, + "step": 13756 + }, + { + "epoch": 0.27516, + "grad_norm": 2.1875, + "grad_norm_var": 0.01987482706705729, + "learning_rate": 0.0001, + "loss": 4.3297, + "loss/crossentropy": 2.1116772890090942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20809265226125717, + "step": 13758 + }, + { + "epoch": 0.2752, + "grad_norm": 2.109375, + "grad_norm_var": 0.021201324462890626, + "learning_rate": 0.0001, + "loss": 4.2079, + "loss/crossentropy": 2.1072784662246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21189533174037933, + "step": 13760 + }, + { + "epoch": 0.27524, + "grad_norm": 1.953125, + "grad_norm_var": 0.014902496337890625, + "learning_rate": 0.0001, + "loss": 3.9102, + "loss/crossentropy": 2.261967897415161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058071494102478, + "step": 13762 + }, + { + "epoch": 0.27528, + "grad_norm": 2.28125, + "grad_norm_var": 0.01571044921875, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.240622043609619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2305934578180313, + "step": 13764 + }, + { + "epoch": 0.27532, + "grad_norm": 2.078125, + "grad_norm_var": 0.013874308268229166, + "learning_rate": 0.0001, + "loss": 3.8268, + "loss/crossentropy": 1.6203233003616333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17164954543113708, + "step": 13766 + }, + { + "epoch": 0.27536, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013521067301432292, + "learning_rate": 0.0001, + "loss": 3.928, + "loss/crossentropy": 2.171602725982666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20083081722259521, + "step": 13768 + }, + { + "epoch": 0.2754, + "grad_norm": 2.046875, + "grad_norm_var": 0.012645467122395834, + "learning_rate": 0.0001, + "loss": 4.2738, + "loss/crossentropy": 2.2198195457458496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21113458275794983, + "step": 13770 + }, + { + "epoch": 0.27544, + "grad_norm": 1.9375, + "grad_norm_var": 0.012296549479166667, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.1037912368774414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217171102762222, + "step": 13772 + }, + { + "epoch": 0.27548, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012015533447265626, + "learning_rate": 0.0001, + "loss": 3.9824, + "loss/crossentropy": 2.252090096473694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20365406572818756, + "step": 13774 + }, + { + "epoch": 0.27552, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011701456705729167, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 1.8044906258583069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854797750711441, + "step": 13776 + }, + { + "epoch": 0.27556, + "grad_norm": 2.09375, + "grad_norm_var": 0.012284342447916667, + "learning_rate": 0.0001, + "loss": 4.5229, + "loss/crossentropy": 2.2397952675819397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214948832988739, + "step": 13778 + }, + { + "epoch": 0.2756, + "grad_norm": 2.34375, + "grad_norm_var": 0.0164215087890625, + "learning_rate": 0.0001, + "loss": 4.3569, + "loss/crossentropy": 1.776510238647461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995201379060745, + "step": 13780 + }, + { + "epoch": 0.27564, + "grad_norm": 4.0625, + "grad_norm_var": 0.26768290201822914, + "learning_rate": 0.0001, + "loss": 4.3061, + "loss/crossentropy": 2.470343589782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194378450512886, + "step": 13782 + }, + { + "epoch": 0.27568, + "grad_norm": 1.9921875, + "grad_norm_var": 0.26253433227539064, + "learning_rate": 0.0001, + "loss": 4.0708, + "loss/crossentropy": 2.1340490579605103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280296355485916, + "step": 13784 + }, + { + "epoch": 0.27572, + "grad_norm": 2.03125, + "grad_norm_var": 0.26368179321289065, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 1.6801128387451172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16750500351190567, + "step": 13786 + }, + { + "epoch": 0.27576, + "grad_norm": 1.9609375, + "grad_norm_var": 0.26387939453125, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 1.9632562398910522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19115211814641953, + "step": 13788 + }, + { + "epoch": 0.2758, + "grad_norm": 2.125, + "grad_norm_var": 0.25916519165039065, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 2.0817149877548218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21430402994155884, + "step": 13790 + }, + { + "epoch": 0.27584, + "grad_norm": 1.9296875, + "grad_norm_var": 0.253088124593099, + "learning_rate": 0.0001, + "loss": 4.3244, + "loss/crossentropy": 2.2736687660217285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22024155408143997, + "step": 13792 + }, + { + "epoch": 0.27588, + "grad_norm": 1.9921875, + "grad_norm_var": 0.25625712076822915, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 2.2632944583892822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210355743765831, + "step": 13794 + }, + { + "epoch": 0.27592, + "grad_norm": 1.953125, + "grad_norm_var": 0.26166966756184895, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.5457879304885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22530706226825714, + "step": 13796 + }, + { + "epoch": 0.27596, + "grad_norm": 1.859375, + "grad_norm_var": 0.008410390218098958, + "learning_rate": 0.0001, + "loss": 3.8969, + "loss/crossentropy": 2.0878910422325134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19901857525110245, + "step": 13798 + }, + { + "epoch": 0.276, + "grad_norm": 2.015625, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.3213, + "loss/crossentropy": 1.803489863872528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19378191232681274, + "step": 13800 + }, + { + "epoch": 0.27604, + "grad_norm": 2.0, + "grad_norm_var": 0.009596506754557291, + "learning_rate": 0.0001, + "loss": 4.1916, + "loss/crossentropy": 2.156951904296875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091423124074936, + "step": 13802 + }, + { + "epoch": 0.27608, + "grad_norm": 1.890625, + "grad_norm_var": 0.010493977864583334, + "learning_rate": 0.0001, + "loss": 3.9346, + "loss/crossentropy": 1.7714723944664001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20779500901699066, + "step": 13804 + }, + { + "epoch": 0.27612, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009590403238932291, + "learning_rate": 0.0001, + "loss": 4.0499, + "loss/crossentropy": 1.8808923363685608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19429123401641846, + "step": 13806 + }, + { + "epoch": 0.27616, + "grad_norm": 1.9375, + "grad_norm_var": 0.005516560872395834, + "learning_rate": 0.0001, + "loss": 3.706, + "loss/crossentropy": 1.6658846735954285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17177510261535645, + "step": 13808 + }, + { + "epoch": 0.2762, + "grad_norm": 1.8359375, + "grad_norm_var": 0.018697102864583332, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.052769422531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20889723300933838, + "step": 13810 + }, + { + "epoch": 0.27624, + "grad_norm": 2.015625, + "grad_norm_var": 0.18883031209309895, + "learning_rate": 0.0001, + "loss": 4.0225, + "loss/crossentropy": 1.5800148844718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24654322117567062, + "step": 13812 + }, + { + "epoch": 0.27628, + "grad_norm": 2.140625, + "grad_norm_var": 0.5619504292805989, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 1.836085557937622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19563084840774536, + "step": 13814 + }, + { + "epoch": 0.27632, + "grad_norm": 2.046875, + "grad_norm_var": 0.5601722717285156, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.627321183681488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17698387801647186, + "step": 13816 + }, + { + "epoch": 0.27636, + "grad_norm": 2.046875, + "grad_norm_var": 0.55164794921875, + "learning_rate": 0.0001, + "loss": 4.2038, + "loss/crossentropy": 2.0546024441719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832595437765121, + "step": 13818 + }, + { + "epoch": 0.2764, + "grad_norm": 1.96875, + "grad_norm_var": 0.5499501546223958, + "learning_rate": 0.0001, + "loss": 4.064, + "loss/crossentropy": 2.112219452857971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605965912342072, + "step": 13820 + }, + { + "epoch": 0.27644, + "grad_norm": 2.078125, + "grad_norm_var": 0.5405006408691406, + "learning_rate": 0.0001, + "loss": 4.3825, + "loss/crossentropy": 1.9582284688949585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015855312347412, + "step": 13822 + }, + { + "epoch": 0.27648, + "grad_norm": 1.953125, + "grad_norm_var": 0.5307573954264323, + "learning_rate": 0.0001, + "loss": 4.2699, + "loss/crossentropy": 2.54653799533844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546704649925232, + "step": 13824 + }, + { + "epoch": 0.27652, + "grad_norm": 1.953125, + "grad_norm_var": 0.5336090087890625, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 1.630593478679657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16478916257619858, + "step": 13826 + }, + { + "epoch": 0.27656, + "grad_norm": 2.046875, + "grad_norm_var": 0.40966796875, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 1.7995671033859253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19886638224124908, + "step": 13828 + }, + { + "epoch": 0.2766, + "grad_norm": 2.21875, + "grad_norm_var": 0.0177886962890625, + "learning_rate": 0.0001, + "loss": 4.4547, + "loss/crossentropy": 2.5423338413238525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.28057335317134857, + "step": 13830 + }, + { + "epoch": 0.27664, + "grad_norm": 2.078125, + "grad_norm_var": 0.018631744384765624, + "learning_rate": 0.0001, + "loss": 4.0213, + "loss/crossentropy": 1.8524783849716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20141787081956863, + "step": 13832 + }, + { + "epoch": 0.27668, + "grad_norm": 1.9453125, + "grad_norm_var": 0.019440714518229166, + "learning_rate": 0.0001, + "loss": 4.2155, + "loss/crossentropy": 2.242451548576355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003998070955276, + "step": 13834 + }, + { + "epoch": 0.27672, + "grad_norm": 2.046875, + "grad_norm_var": 0.0192047119140625, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 2.2434345483779907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21679577976465225, + "step": 13836 + }, + { + "epoch": 0.27676, + "grad_norm": 2.0, + "grad_norm_var": 0.019551595052083332, + "learning_rate": 0.0001, + "loss": 4.014, + "loss/crossentropy": 1.6854392290115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326400756835938, + "step": 13838 + }, + { + "epoch": 0.2768, + "grad_norm": 2.171875, + "grad_norm_var": 0.026668294270833334, + "learning_rate": 0.0001, + "loss": 4.2837, + "loss/crossentropy": 2.0702155232429504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043909877538681, + "step": 13840 + }, + { + "epoch": 0.27684, + "grad_norm": 2.015625, + "grad_norm_var": 0.024738566080729166, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 1.964758813381195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19540227204561234, + "step": 13842 + }, + { + "epoch": 0.27688, + "grad_norm": 2.046875, + "grad_norm_var": 0.0151611328125, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.0383411645889282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456278860569, + "step": 13844 + }, + { + "epoch": 0.27692, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015461222330729166, + "learning_rate": 0.0001, + "loss": 4.1407, + "loss/crossentropy": 2.020030975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20521440356969833, + "step": 13846 + }, + { + "epoch": 0.27696, + "grad_norm": 1.890625, + "grad_norm_var": 0.016110992431640624, + "learning_rate": 0.0001, + "loss": 4.1142, + "loss/crossentropy": 1.7514970302581787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19116375595331192, + "step": 13848 + }, + { + "epoch": 0.277, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016437784830729166, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 1.8194025754928589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18198709934949875, + "step": 13850 + }, + { + "epoch": 0.27704, + "grad_norm": 1.9375, + "grad_norm_var": 0.017223866780598958, + "learning_rate": 0.0001, + "loss": 3.7911, + "loss/crossentropy": 1.878225862979889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18159592151641846, + "step": 13852 + }, + { + "epoch": 0.27708, + "grad_norm": 2.0, + "grad_norm_var": 0.0178619384765625, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 1.8619664311408997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18098927289247513, + "step": 13854 + }, + { + "epoch": 0.27712, + "grad_norm": 2.046875, + "grad_norm_var": 0.003474934895833333, + "learning_rate": 0.0001, + "loss": 4.2336, + "loss/crossentropy": 2.0475016832351685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999271631240845, + "step": 13856 + }, + { + "epoch": 0.27716, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004443105061848958, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.2279679775238037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21767108887434006, + "step": 13858 + }, + { + "epoch": 0.2772, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0039670308430989586, + "learning_rate": 0.0001, + "loss": 3.8087, + "loss/crossentropy": 1.801176130771637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18873406946659088, + "step": 13860 + }, + { + "epoch": 0.27724, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005303700764973958, + "learning_rate": 0.0001, + "loss": 3.7958, + "loss/crossentropy": 1.948195457458496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19519966840744019, + "step": 13862 + }, + { + "epoch": 0.27728, + "grad_norm": 1.90625, + "grad_norm_var": 0.0050432840983072914, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.1044042110443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20454693585634232, + "step": 13864 + }, + { + "epoch": 0.27732, + "grad_norm": 2.015625, + "grad_norm_var": 0.0051910400390625, + "learning_rate": 0.0001, + "loss": 4.1338, + "loss/crossentropy": 2.383382737636566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179596871137619, + "step": 13866 + }, + { + "epoch": 0.27736, + "grad_norm": 1.9375, + "grad_norm_var": 0.004621378580729167, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 1.7798078656196594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17871373891830444, + "step": 13868 + }, + { + "epoch": 0.2774, + "grad_norm": 1.984375, + "grad_norm_var": 0.004756418863932291, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.1839531660079956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030800253152847, + "step": 13870 + }, + { + "epoch": 0.27744, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005028279622395834, + "learning_rate": 0.0001, + "loss": 4.1637, + "loss/crossentropy": 1.990351676940918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951240971684456, + "step": 13872 + }, + { + "epoch": 0.27748, + "grad_norm": 2.140625, + "grad_norm_var": 0.006573232014973959, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 2.191170334815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.230004720389843, + "step": 13874 + }, + { + "epoch": 0.27752, + "grad_norm": 1.953125, + "grad_norm_var": 0.015726470947265626, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 1.8893607258796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138589546084404, + "step": 13876 + }, + { + "epoch": 0.27756, + "grad_norm": 1.953125, + "grad_norm_var": 0.012483723958333333, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 2.059523820877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147833973169327, + "step": 13878 + }, + { + "epoch": 0.2776, + "grad_norm": 2.125, + "grad_norm_var": 0.013206990559895833, + "learning_rate": 0.0001, + "loss": 4.4959, + "loss/crossentropy": 2.0867209434509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967817425727844, + "step": 13880 + }, + { + "epoch": 0.27764, + "grad_norm": 2.15625, + "grad_norm_var": 0.028393300374348958, + "learning_rate": 0.0001, + "loss": 4.5704, + "loss/crossentropy": 2.3017314672470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23674342036247253, + "step": 13882 + }, + { + "epoch": 0.27768, + "grad_norm": 2.046875, + "grad_norm_var": 0.0261138916015625, + "learning_rate": 0.0001, + "loss": 4.3316, + "loss/crossentropy": 2.241922974586487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21731412410736084, + "step": 13884 + }, + { + "epoch": 0.27772, + "grad_norm": 2.03125, + "grad_norm_var": 0.025724283854166665, + "learning_rate": 0.0001, + "loss": 4.3317, + "loss/crossentropy": 2.3800796270370483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21916473656892776, + "step": 13886 + }, + { + "epoch": 0.27776, + "grad_norm": 2.09375, + "grad_norm_var": 0.02467625935872396, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 1.966018259525299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21862812340259552, + "step": 13888 + }, + { + "epoch": 0.2778, + "grad_norm": 1.828125, + "grad_norm_var": 0.030104319254557293, + "learning_rate": 0.0001, + "loss": 4.1955, + "loss/crossentropy": 1.920684039592743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026856392621994, + "step": 13890 + }, + { + "epoch": 0.27784, + "grad_norm": 2.0625, + "grad_norm_var": 0.02601318359375, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 2.1939095854759216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20429420471191406, + "step": 13892 + }, + { + "epoch": 0.27788, + "grad_norm": 1.9765625, + "grad_norm_var": 0.026292928059895835, + "learning_rate": 0.0001, + "loss": 4.2335, + "loss/crossentropy": 2.1600992679595947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20445890724658966, + "step": 13894 + }, + { + "epoch": 0.27792, + "grad_norm": 1.9453125, + "grad_norm_var": 0.026805623372395834, + "learning_rate": 0.0001, + "loss": 3.9973, + "loss/crossentropy": 1.9898765683174133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056615233421326, + "step": 13896 + }, + { + "epoch": 0.27796, + "grad_norm": 1.859375, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.0497539043426514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878463476896286, + "step": 13898 + }, + { + "epoch": 0.278, + "grad_norm": 1.9375, + "grad_norm_var": 0.007373046875, + "learning_rate": 0.0001, + "loss": 4.2299, + "loss/crossentropy": 1.894934356212616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183290496468544, + "step": 13900 + }, + { + "epoch": 0.27804, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010094960530598959, + "learning_rate": 0.0001, + "loss": 3.858, + "loss/crossentropy": 1.808231770992279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19447006285190582, + "step": 13902 + }, + { + "epoch": 0.27808, + "grad_norm": 2.03125, + "grad_norm_var": 0.009688059488932291, + "learning_rate": 0.0001, + "loss": 4.0553, + "loss/crossentropy": 1.9384279251098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20293861627578735, + "step": 13904 + }, + { + "epoch": 0.27812, + "grad_norm": 2.28125, + "grad_norm_var": 0.012835439046223958, + "learning_rate": 0.0001, + "loss": 4.3092, + "loss/crossentropy": 2.4291017055511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24332769960165024, + "step": 13906 + }, + { + "epoch": 0.27816, + "grad_norm": 2.03125, + "grad_norm_var": 0.012669881184895834, + "learning_rate": 0.0001, + "loss": 4.331, + "loss/crossentropy": 2.3018234968185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21657106280326843, + "step": 13908 + }, + { + "epoch": 0.2782, + "grad_norm": 2.390625, + "grad_norm_var": 0.0204742431640625, + "learning_rate": 0.0001, + "loss": 4.1756, + "loss/crossentropy": 2.211503267288208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193635880947113, + "step": 13910 + }, + { + "epoch": 0.27824, + "grad_norm": 2.171875, + "grad_norm_var": 0.02191162109375, + "learning_rate": 0.0001, + "loss": 4.372, + "loss/crossentropy": 2.17062246799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21989908069372177, + "step": 13912 + }, + { + "epoch": 0.27828, + "grad_norm": 2.015625, + "grad_norm_var": 0.018839518229166668, + "learning_rate": 0.0001, + "loss": 4.0895, + "loss/crossentropy": 2.1446024775505066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036902904510498, + "step": 13914 + }, + { + "epoch": 0.27832, + "grad_norm": 2.015625, + "grad_norm_var": 0.0177642822265625, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 1.8176022171974182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19358078390359879, + "step": 13916 + }, + { + "epoch": 0.27836, + "grad_norm": 2.09375, + "grad_norm_var": 0.01946996053059896, + "learning_rate": 0.0001, + "loss": 4.2376, + "loss/crossentropy": 2.341936469078064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822833061218262, + "step": 13918 + }, + { + "epoch": 0.2784, + "grad_norm": 2.046875, + "grad_norm_var": 0.01697565714518229, + "learning_rate": 0.0001, + "loss": 4.0943, + "loss/crossentropy": 1.9315840601921082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17218464612960815, + "step": 13920 + }, + { + "epoch": 0.27844, + "grad_norm": 1.890625, + "grad_norm_var": 0.01615168253580729, + "learning_rate": 0.0001, + "loss": 4.1507, + "loss/crossentropy": 1.9442673921585083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19567463546991348, + "step": 13922 + }, + { + "epoch": 0.27848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016673787434895834, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.3023592233657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21824805438518524, + "step": 13924 + }, + { + "epoch": 0.27852, + "grad_norm": 1.8359375, + "grad_norm_var": 0.01170654296875, + "learning_rate": 0.0001, + "loss": 3.7557, + "loss/crossentropy": 2.116863250732422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087356001138687, + "step": 13926 + }, + { + "epoch": 0.27856, + "grad_norm": 1.953125, + "grad_norm_var": 0.010936482747395834, + "learning_rate": 0.0001, + "loss": 4.3746, + "loss/crossentropy": 2.0485963821411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18910974264144897, + "step": 13928 + }, + { + "epoch": 0.2786, + "grad_norm": 2.0, + "grad_norm_var": 0.024857584635416666, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 2.031753659248352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23851437866687775, + "step": 13930 + }, + { + "epoch": 0.27864, + "grad_norm": 1.953125, + "grad_norm_var": 0.025585683186848958, + "learning_rate": 0.0001, + "loss": 4.0688, + "loss/crossentropy": 1.8019705414772034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18659329414367676, + "step": 13932 + }, + { + "epoch": 0.27868, + "grad_norm": 2.109375, + "grad_norm_var": 0.025243123372395832, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 1.9191861152648926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393808394670486, + "step": 13934 + }, + { + "epoch": 0.27872, + "grad_norm": 1.984375, + "grad_norm_var": 0.025763956705729167, + "learning_rate": 0.0001, + "loss": 4.1765, + "loss/crossentropy": 2.242986798286438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213445246219635, + "step": 13936 + }, + { + "epoch": 0.27876, + "grad_norm": 2.09375, + "grad_norm_var": 0.0300689697265625, + "learning_rate": 0.0001, + "loss": 4.1767, + "loss/crossentropy": 2.1652570962905884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070690169930458, + "step": 13938 + }, + { + "epoch": 0.2788, + "grad_norm": 1.9140625, + "grad_norm_var": 0.031583404541015624, + "learning_rate": 0.0001, + "loss": 3.9643, + "loss/crossentropy": 2.0970187187194824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20118873566389084, + "step": 13940 + }, + { + "epoch": 0.27884, + "grad_norm": 2.078125, + "grad_norm_var": 0.02823053995768229, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 1.9957542419433594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2377041131258011, + "step": 13942 + }, + { + "epoch": 0.27888, + "grad_norm": 2.0, + "grad_norm_var": 0.02522761027018229, + "learning_rate": 0.0001, + "loss": 4.0718, + "loss/crossentropy": 1.8680259585380554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919926255941391, + "step": 13944 + }, + { + "epoch": 0.27892, + "grad_norm": 2.0625, + "grad_norm_var": 0.015516916910807291, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.369240164756775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24805612862110138, + "step": 13946 + }, + { + "epoch": 0.27896, + "grad_norm": 2.0625, + "grad_norm_var": 0.014530436197916666, + "learning_rate": 0.0001, + "loss": 4.3082, + "loss/crossentropy": 2.0923795104026794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20823675394058228, + "step": 13948 + }, + { + "epoch": 0.279, + "grad_norm": 2.1875, + "grad_norm_var": 0.013197580973307291, + "learning_rate": 0.0001, + "loss": 4.3049, + "loss/crossentropy": 2.1777398586273193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21319355070590973, + "step": 13950 + }, + { + "epoch": 0.27904, + "grad_norm": 1.890625, + "grad_norm_var": 0.014869944254557291, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.2155595421791077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029823139309883, + "step": 13952 + }, + { + "epoch": 0.27908, + "grad_norm": 1.96875, + "grad_norm_var": 0.011435699462890626, + "learning_rate": 0.0001, + "loss": 3.7195, + "loss/crossentropy": 1.5916873812675476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18207873404026031, + "step": 13954 + }, + { + "epoch": 0.27912, + "grad_norm": 2.09375, + "grad_norm_var": 0.009791819254557292, + "learning_rate": 0.0001, + "loss": 4.3336, + "loss/crossentropy": 2.165639281272888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22917281091213226, + "step": 13956 + }, + { + "epoch": 0.27916, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 3.9586, + "loss/crossentropy": 2.0245776772499084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19862860441207886, + "step": 13958 + }, + { + "epoch": 0.2792, + "grad_norm": 2.0625, + "grad_norm_var": 0.013646443684895834, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 2.0179941654205322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749768614768982, + "step": 13960 + }, + { + "epoch": 0.27924, + "grad_norm": 2.140625, + "grad_norm_var": 0.011966705322265625, + "learning_rate": 0.0001, + "loss": 4.3978, + "loss/crossentropy": 1.998136818408966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992153376340866, + "step": 13962 + }, + { + "epoch": 0.27928, + "grad_norm": 1.890625, + "grad_norm_var": 0.015346018473307292, + "learning_rate": 0.0001, + "loss": 4.1801, + "loss/crossentropy": 2.041996657848358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22224828600883484, + "step": 13964 + }, + { + "epoch": 0.27932, + "grad_norm": 2.140625, + "grad_norm_var": 0.019406890869140624, + "learning_rate": 0.0001, + "loss": 4.4133, + "loss/crossentropy": 2.0259117484092712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001984491944313, + "step": 13966 + }, + { + "epoch": 0.27936, + "grad_norm": 2.015625, + "grad_norm_var": 0.018436686197916666, + "learning_rate": 0.0001, + "loss": 4.0439, + "loss/crossentropy": 1.8219285607337952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19637393951416016, + "step": 13968 + }, + { + "epoch": 0.2794, + "grad_norm": 2.046875, + "grad_norm_var": 0.017073567708333334, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.1451289653778076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21846695989370346, + "step": 13970 + }, + { + "epoch": 0.27944, + "grad_norm": 2.125, + "grad_norm_var": 0.016877237955729166, + "learning_rate": 0.0001, + "loss": 4.349, + "loss/crossentropy": 2.0527132749557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147705778479576, + "step": 13972 + }, + { + "epoch": 0.27948, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015143839518229167, + "learning_rate": 0.0001, + "loss": 4.0188, + "loss/crossentropy": 1.9238123893737793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21083690226078033, + "step": 13974 + }, + { + "epoch": 0.27952, + "grad_norm": 1.953125, + "grad_norm_var": 0.015057118733723958, + "learning_rate": 0.0001, + "loss": 4.0072, + "loss/crossentropy": 1.894473671913147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077680051326752, + "step": 13976 + }, + { + "epoch": 0.27956, + "grad_norm": 2.03125, + "grad_norm_var": 0.013871256510416667, + "learning_rate": 0.0001, + "loss": 4.2344, + "loss/crossentropy": 2.139513611793518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138260081410408, + "step": 13978 + }, + { + "epoch": 0.2796, + "grad_norm": 2.171875, + "grad_norm_var": 0.013468424479166666, + "learning_rate": 0.0001, + "loss": 4.4519, + "loss/crossentropy": 2.4868407249450684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23635651916265488, + "step": 13980 + }, + { + "epoch": 0.27964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 3.7428, + "loss/crossentropy": 1.426945686340332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16342981159687042, + "step": 13982 + }, + { + "epoch": 0.27968, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007885487874348958, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 2.0577683448791504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20394782721996307, + "step": 13984 + }, + { + "epoch": 0.27972, + "grad_norm": 2.0625, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 1.884658396244049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18946699798107147, + "step": 13986 + }, + { + "epoch": 0.27976, + "grad_norm": 1.984375, + "grad_norm_var": 0.0070383707682291664, + "learning_rate": 0.0001, + "loss": 3.9423, + "loss/crossentropy": 1.7344964742660522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923774629831314, + "step": 13988 + }, + { + "epoch": 0.2798, + "grad_norm": 1.84375, + "grad_norm_var": 0.00888671875, + "learning_rate": 0.0001, + "loss": 3.9541, + "loss/crossentropy": 1.9660141468048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560642004013062, + "step": 13990 + }, + { + "epoch": 0.27984, + "grad_norm": 2.046875, + "grad_norm_var": 0.008998362223307292, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 1.7832527160644531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19922740012407303, + "step": 13992 + }, + { + "epoch": 0.27988, + "grad_norm": 2.078125, + "grad_norm_var": 0.009987131754557291, + "learning_rate": 0.0001, + "loss": 4.3919, + "loss/crossentropy": 1.9763087630271912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111397087574005, + "step": 13994 + }, + { + "epoch": 0.27992, + "grad_norm": 2.0625, + "grad_norm_var": 0.005252838134765625, + "learning_rate": 0.0001, + "loss": 4.1331, + "loss/crossentropy": 2.262092709541321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21212221682071686, + "step": 13996 + }, + { + "epoch": 0.27996, + "grad_norm": 2.015625, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 2.237086296081543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20895393192768097, + "step": 13998 + }, + { + "epoch": 0.28, + "grad_norm": 1.890625, + "grad_norm_var": 0.0061337788899739586, + "learning_rate": 0.0001, + "loss": 4.063, + "loss/crossentropy": 1.9579973816871643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18024132400751114, + "step": 14000 + }, + { + "epoch": 0.28004, + "grad_norm": 2.015625, + "grad_norm_var": 0.0055844624837239586, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.2264128923416138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181634083390236, + "step": 14002 + }, + { + "epoch": 0.28008, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0058013916015625, + "learning_rate": 0.0001, + "loss": 4.0081, + "loss/crossentropy": 1.836763322353363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983085498213768, + "step": 14004 + }, + { + "epoch": 0.28012, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005448404947916667, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.0163257718086243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214015431702137, + "step": 14006 + }, + { + "epoch": 0.28016, + "grad_norm": 2.140625, + "grad_norm_var": 0.0070383707682291664, + "learning_rate": 0.0001, + "loss": 4.139, + "loss/crossentropy": 2.31567645072937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832776606082916, + "step": 14008 + }, + { + "epoch": 0.2802, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006685129801432292, + "learning_rate": 0.0001, + "loss": 4.0753, + "loss/crossentropy": 2.3515210151672363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21946918964385986, + "step": 14010 + }, + { + "epoch": 0.28024, + "grad_norm": 2.0, + "grad_norm_var": 0.006845855712890625, + "learning_rate": 0.0001, + "loss": 3.9761, + "loss/crossentropy": 2.0987448692321777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180740088224411, + "step": 14012 + }, + { + "epoch": 0.28028, + "grad_norm": 2.015625, + "grad_norm_var": 0.00662841796875, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.2203436493873596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2157250940799713, + "step": 14014 + }, + { + "epoch": 0.28032, + "grad_norm": 1.953125, + "grad_norm_var": 0.006241861979166667, + "learning_rate": 0.0001, + "loss": 4.3152, + "loss/crossentropy": 2.083233594894409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187238186597824, + "step": 14016 + }, + { + "epoch": 0.28036, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061075846354166664, + "learning_rate": 0.0001, + "loss": 4.1311, + "loss/crossentropy": 2.1261327266693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063112109899521, + "step": 14018 + }, + { + "epoch": 0.2804, + "grad_norm": 2.109375, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 4.0901, + "loss/crossentropy": 2.012390434741974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20225800573825836, + "step": 14020 + }, + { + "epoch": 0.28044, + "grad_norm": 2.046875, + "grad_norm_var": 0.0047271728515625, + "learning_rate": 0.0001, + "loss": 3.9985, + "loss/crossentropy": 1.8605966567993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003621682524681, + "step": 14022 + }, + { + "epoch": 0.28048, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0028157552083333333, + "learning_rate": 0.0001, + "loss": 3.9774, + "loss/crossentropy": 1.9208670258522034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20056191086769104, + "step": 14024 + }, + { + "epoch": 0.28052, + "grad_norm": 2.046875, + "grad_norm_var": 0.003951009114583333, + "learning_rate": 0.0001, + "loss": 3.7462, + "loss/crossentropy": 1.924518644809723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410791993141174, + "step": 14026 + }, + { + "epoch": 0.28056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0035113016764322918, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 2.103038251399994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054024636745453, + "step": 14028 + }, + { + "epoch": 0.2806, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004303995768229167, + "learning_rate": 0.0001, + "loss": 4.0526, + "loss/crossentropy": 1.8974853157997131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18592579662799835, + "step": 14030 + }, + { + "epoch": 0.28064, + "grad_norm": 2.03125, + "grad_norm_var": 0.004447428385416666, + "learning_rate": 0.0001, + "loss": 4.1735, + "loss/crossentropy": 2.0207908749580383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.216909758746624, + "step": 14032 + }, + { + "epoch": 0.28068, + "grad_norm": 2.171875, + "grad_norm_var": 0.006119537353515625, + "learning_rate": 0.0001, + "loss": 4.5759, + "loss/crossentropy": 2.0010873079299927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815415680408478, + "step": 14034 + }, + { + "epoch": 0.28072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006961822509765625, + "learning_rate": 0.0001, + "loss": 3.9162, + "loss/crossentropy": 2.0840484499931335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481210738420486, + "step": 14036 + }, + { + "epoch": 0.28076, + "grad_norm": 2.03125, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 0.0001, + "loss": 4.1298, + "loss/crossentropy": 2.0026179552078247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904347836971283, + "step": 14038 + }, + { + "epoch": 0.2808, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 0.0001, + "loss": 4.0618, + "loss/crossentropy": 2.1660486459732056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21236423403024673, + "step": 14040 + }, + { + "epoch": 0.28084, + "grad_norm": 2.15625, + "grad_norm_var": 0.007669830322265625, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 2.312765598297119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535308122634888, + "step": 14042 + }, + { + "epoch": 0.28088, + "grad_norm": 1.984375, + "grad_norm_var": 0.0077789306640625, + "learning_rate": 0.0001, + "loss": 4.1959, + "loss/crossentropy": 1.8449203372001648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986582651734352, + "step": 14044 + }, + { + "epoch": 0.28092, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008894602457682291, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.0159433484077454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724585115909576, + "step": 14046 + }, + { + "epoch": 0.28096, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0086090087890625, + "learning_rate": 0.0001, + "loss": 4.0691, + "loss/crossentropy": 2.138873815536499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438196301460266, + "step": 14048 + }, + { + "epoch": 0.281, + "grad_norm": 2.25, + "grad_norm_var": 0.011372629801432292, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.6896708607673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19260118901729584, + "step": 14050 + }, + { + "epoch": 0.28104, + "grad_norm": 1.828125, + "grad_norm_var": 0.014656321207682291, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 1.8495243191719055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18072611093521118, + "step": 14052 + }, + { + "epoch": 0.28108, + "grad_norm": 1.9765625, + "grad_norm_var": 0.017354329427083332, + "learning_rate": 0.0001, + "loss": 4.3581, + "loss/crossentropy": 2.193492293357849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247578352689743, + "step": 14054 + }, + { + "epoch": 0.28112, + "grad_norm": 1.96875, + "grad_norm_var": 0.01761652628580729, + "learning_rate": 0.0001, + "loss": 4.1252, + "loss/crossentropy": 2.154744803905487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20149191468954086, + "step": 14056 + }, + { + "epoch": 0.28116, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.0832, + "loss/crossentropy": 2.137023687362671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20409516990184784, + "step": 14058 + }, + { + "epoch": 0.2812, + "grad_norm": 2.96875, + "grad_norm_var": 0.0767242431640625, + "learning_rate": 0.0001, + "loss": 4.6582, + "loss/crossentropy": 2.4530670642852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24031317979097366, + "step": 14060 + }, + { + "epoch": 0.28124, + "grad_norm": 2.265625, + "grad_norm_var": 0.07713394165039063, + "learning_rate": 0.0001, + "loss": 4.2973, + "loss/crossentropy": 2.3285664319992065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223993182182312, + "step": 14062 + }, + { + "epoch": 0.28128, + "grad_norm": 1.90625, + "grad_norm_var": 0.0784088134765625, + "learning_rate": 0.0001, + "loss": 4.0899, + "loss/crossentropy": 2.126828193664551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21428050100803375, + "step": 14064 + }, + { + "epoch": 0.28132, + "grad_norm": 1.796875, + "grad_norm_var": 0.08012288411458333, + "learning_rate": 0.0001, + "loss": 3.8083, + "loss/crossentropy": 2.0194268226623535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20124298334121704, + "step": 14066 + }, + { + "epoch": 0.28136, + "grad_norm": 2.125, + "grad_norm_var": 0.07445882161458334, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 1.796358585357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902359575033188, + "step": 14068 + }, + { + "epoch": 0.2814, + "grad_norm": 2.15625, + "grad_norm_var": 0.07318700154622396, + "learning_rate": 0.0001, + "loss": 4.2, + "loss/crossentropy": 1.8975006341934204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528520107269287, + "step": 14070 + }, + { + "epoch": 0.28144, + "grad_norm": 1.9296875, + "grad_norm_var": 0.07691141764322916, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.628948986530304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823427528142929, + "step": 14072 + }, + { + "epoch": 0.28148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.07718480428059896, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.9894697070121765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945328414440155, + "step": 14074 + }, + { + "epoch": 0.28152, + "grad_norm": 2.125, + "grad_norm_var": 0.022489166259765624, + "learning_rate": 0.0001, + "loss": 4.0927, + "loss/crossentropy": 1.9978017210960388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21055076271295547, + "step": 14076 + }, + { + "epoch": 0.28156, + "grad_norm": 1.921875, + "grad_norm_var": 0.01697565714518229, + "learning_rate": 0.0001, + "loss": 3.9224, + "loss/crossentropy": 2.087389588356018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19179877638816833, + "step": 14078 + }, + { + "epoch": 0.2816, + "grad_norm": 1.96875, + "grad_norm_var": 0.018308258056640624, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 2.3281190395355225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292996421456337, + "step": 14080 + }, + { + "epoch": 0.28164, + "grad_norm": 1.953125, + "grad_norm_var": 0.015215810139973958, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 1.8850311040878296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652675837278366, + "step": 14082 + }, + { + "epoch": 0.28168, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015604654947916666, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 1.757595181465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17358998954296112, + "step": 14084 + }, + { + "epoch": 0.28172, + "grad_norm": 2.015625, + "grad_norm_var": 0.01387939453125, + "learning_rate": 0.0001, + "loss": 4.1367, + "loss/crossentropy": 2.173685908317566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22043757140636444, + "step": 14086 + }, + { + "epoch": 0.28176, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010758463541666667, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 2.0290868282318115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19695669412612915, + "step": 14088 + }, + { + "epoch": 0.2818, + "grad_norm": 2.125, + "grad_norm_var": 0.010609690348307292, + "learning_rate": 0.0001, + "loss": 4.3497, + "loss/crossentropy": 2.320943236351013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747629344463348, + "step": 14090 + }, + { + "epoch": 0.28184, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.0311750173568726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410807639360428, + "step": 14092 + }, + { + "epoch": 0.28188, + "grad_norm": 2.03125, + "grad_norm_var": 0.011207834879557291, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.164630174636841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962611824274063, + "step": 14094 + }, + { + "epoch": 0.28192, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0093658447265625, + "learning_rate": 0.0001, + "loss": 3.9681, + "loss/crossentropy": 2.170566439628601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24082274734973907, + "step": 14096 + }, + { + "epoch": 0.28196, + "grad_norm": 2.0, + "grad_norm_var": 0.009285227457682291, + "learning_rate": 0.0001, + "loss": 4.209, + "loss/crossentropy": 2.3137707710266113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22475039958953857, + "step": 14098 + }, + { + "epoch": 0.282, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 3.7432, + "loss/crossentropy": 2.0448675751686096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814980030059814, + "step": 14100 + }, + { + "epoch": 0.28204, + "grad_norm": 1.96875, + "grad_norm_var": 0.011423492431640625, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 2.095334231853485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743670523166656, + "step": 14102 + }, + { + "epoch": 0.28208, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008737945556640625, + "learning_rate": 0.0001, + "loss": 4.0362, + "loss/crossentropy": 2.280429720878601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21895557641983032, + "step": 14104 + }, + { + "epoch": 0.28212, + "grad_norm": 2.09375, + "grad_norm_var": 0.008292388916015626, + "learning_rate": 0.0001, + "loss": 4.1166, + "loss/crossentropy": 1.8548901677131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19965855032205582, + "step": 14106 + }, + { + "epoch": 0.28216, + "grad_norm": 2.234375, + "grad_norm_var": 0.01163330078125, + "learning_rate": 0.0001, + "loss": 4.2589, + "loss/crossentropy": 2.3574371337890625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23160286247730255, + "step": 14108 + }, + { + "epoch": 0.2822, + "grad_norm": 2.078125, + "grad_norm_var": 0.012800089518229167, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 2.112724542617798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19597014784812927, + "step": 14110 + }, + { + "epoch": 0.28224, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013068644205729167, + "learning_rate": 0.0001, + "loss": 3.8792, + "loss/crossentropy": 1.9877798557281494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21120092272758484, + "step": 14112 + }, + { + "epoch": 0.28228, + "grad_norm": 2.015625, + "grad_norm_var": 0.013199869791666667, + "learning_rate": 0.0001, + "loss": 4.1761, + "loss/crossentropy": 2.253028154373169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046835869550705, + "step": 14114 + }, + { + "epoch": 0.28232, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008561197916666667, + "learning_rate": 0.0001, + "loss": 4.1711, + "loss/crossentropy": 2.322582960128784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178008407354355, + "step": 14116 + }, + { + "epoch": 0.28236, + "grad_norm": 2.03125, + "grad_norm_var": 0.008204905192057292, + "learning_rate": 0.0001, + "loss": 4.1584, + "loss/crossentropy": 2.470620036125183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23360955715179443, + "step": 14118 + }, + { + "epoch": 0.2824, + "grad_norm": 1.90625, + "grad_norm_var": 0.0086334228515625, + "learning_rate": 0.0001, + "loss": 4.0206, + "loss/crossentropy": 1.9218884110450745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978505551815033, + "step": 14120 + }, + { + "epoch": 0.28244, + "grad_norm": 2.125, + "grad_norm_var": 0.0092437744140625, + "learning_rate": 0.0001, + "loss": 4.2045, + "loss/crossentropy": 1.9114753007888794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19295217841863632, + "step": 14122 + }, + { + "epoch": 0.28248, + "grad_norm": 2.015625, + "grad_norm_var": 0.498583984375, + "learning_rate": 0.0001, + "loss": 4.2259, + "loss/crossentropy": 2.1196082830429077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068253606557846, + "step": 14124 + }, + { + "epoch": 0.28252, + "grad_norm": 2.046875, + "grad_norm_var": 0.492419179280599, + "learning_rate": 0.0001, + "loss": 4.2616, + "loss/crossentropy": 1.9869969487190247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950489580631256, + "step": 14126 + }, + { + "epoch": 0.28256, + "grad_norm": 1.9609375, + "grad_norm_var": 0.488177235921224, + "learning_rate": 0.0001, + "loss": 4.1635, + "loss/crossentropy": 2.09599232673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22184181958436966, + "step": 14128 + }, + { + "epoch": 0.2826, + "grad_norm": 2.046875, + "grad_norm_var": 0.489172108968099, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.1891895532608032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229236587882042, + "step": 14130 + }, + { + "epoch": 0.28264, + "grad_norm": 2.171875, + "grad_norm_var": 0.48787816365559894, + "learning_rate": 0.0001, + "loss": 4.12, + "loss/crossentropy": 2.5070972442626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21862925589084625, + "step": 14132 + }, + { + "epoch": 0.28268, + "grad_norm": 2.03125, + "grad_norm_var": 0.48812662760416664, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 2.100346863269806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19969436526298523, + "step": 14134 + }, + { + "epoch": 0.28272, + "grad_norm": 2.1875, + "grad_norm_var": 0.48713150024414065, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.0019801259040833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21319210529327393, + "step": 14136 + }, + { + "epoch": 0.28276, + "grad_norm": 2.296875, + "grad_norm_var": 0.4855323791503906, + "learning_rate": 0.0001, + "loss": 4.7069, + "loss/crossentropy": 2.2734099626541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24008513987064362, + "step": 14138 + }, + { + "epoch": 0.2828, + "grad_norm": 2.484375, + "grad_norm_var": 0.02581965128580729, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 2.2589277029037476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224387675523758, + "step": 14140 + }, + { + "epoch": 0.28284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02667236328125, + "learning_rate": 0.0001, + "loss": 4.2749, + "loss/crossentropy": 1.713346004486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19133105129003525, + "step": 14142 + }, + { + "epoch": 0.28288, + "grad_norm": 2.0, + "grad_norm_var": 0.02616144816080729, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 2.212652564048767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21326624602079391, + "step": 14144 + }, + { + "epoch": 0.28292, + "grad_norm": 2.046875, + "grad_norm_var": 0.02902399698893229, + "learning_rate": 0.0001, + "loss": 3.901, + "loss/crossentropy": 1.8900847434997559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882227212190628, + "step": 14146 + }, + { + "epoch": 0.28296, + "grad_norm": 2.25, + "grad_norm_var": 0.028913370768229165, + "learning_rate": 0.0001, + "loss": 4.3775, + "loss/crossentropy": 2.171455979347229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20516864210367203, + "step": 14148 + }, + { + "epoch": 0.283, + "grad_norm": 2.046875, + "grad_norm_var": 0.027608235677083332, + "learning_rate": 0.0001, + "loss": 4.3466, + "loss/crossentropy": 2.0112149715423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20043446868658066, + "step": 14150 + }, + { + "epoch": 0.28304, + "grad_norm": 1.96875, + "grad_norm_var": 0.026596832275390624, + "learning_rate": 0.0001, + "loss": 4.2879, + "loss/crossentropy": 2.0091559886932373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21023409068584442, + "step": 14152 + }, + { + "epoch": 0.28308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023045857747395832, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 2.122502863407135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023375853896141, + "step": 14154 + }, + { + "epoch": 0.28312, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011462148030598958, + "learning_rate": 0.0001, + "loss": 4.3187, + "loss/crossentropy": 2.2842462062835693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24532422423362732, + "step": 14156 + }, + { + "epoch": 0.28316, + "grad_norm": 2.078125, + "grad_norm_var": 0.010990397135416666, + "learning_rate": 0.0001, + "loss": 4.5248, + "loss/crossentropy": 2.247039318084717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24580712616443634, + "step": 14158 + }, + { + "epoch": 0.2832, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011432902018229166, + "learning_rate": 0.0001, + "loss": 3.9486, + "loss/crossentropy": 2.182216167449951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21095345169305801, + "step": 14160 + }, + { + "epoch": 0.28324, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008648427327473958, + "learning_rate": 0.0001, + "loss": 4.4582, + "loss/crossentropy": 2.219786763191223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747471392154694, + "step": 14162 + }, + { + "epoch": 0.28328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005830637613932292, + "learning_rate": 0.0001, + "loss": 3.9483, + "loss/crossentropy": 1.8589028716087341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19629907608032227, + "step": 14164 + }, + { + "epoch": 0.28332, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0138092041015625, + "learning_rate": 0.0001, + "loss": 4.0217, + "loss/crossentropy": 1.9820821285247803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20433218777179718, + "step": 14166 + }, + { + "epoch": 0.28336, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014216868082682292, + "learning_rate": 0.0001, + "loss": 4.0081, + "loss/crossentropy": 1.8423896431922913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18210452795028687, + "step": 14168 + }, + { + "epoch": 0.2834, + "grad_norm": 2.171875, + "grad_norm_var": 0.0158203125, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 2.0365665555000305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21229179203510284, + "step": 14170 + }, + { + "epoch": 0.28344, + "grad_norm": 1.7578125, + "grad_norm_var": 0.019212849934895835, + "learning_rate": 0.0001, + "loss": 3.7894, + "loss/crossentropy": 1.6819360256195068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171043276786804, + "step": 14172 + }, + { + "epoch": 0.28348, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019846343994140626, + "learning_rate": 0.0001, + "loss": 4.1247, + "loss/crossentropy": 1.9932513236999512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374855190515518, + "step": 14174 + }, + { + "epoch": 0.28352, + "grad_norm": 1.96875, + "grad_norm_var": 0.01975072224934896, + "learning_rate": 0.0001, + "loss": 3.8874, + "loss/crossentropy": 1.7137236595153809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17538649588823318, + "step": 14176 + }, + { + "epoch": 0.28356, + "grad_norm": 1.953125, + "grad_norm_var": 0.01977717081705729, + "learning_rate": 0.0001, + "loss": 3.8762, + "loss/crossentropy": 1.7120450735092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030355662107468, + "step": 14178 + }, + { + "epoch": 0.2836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019962565104166666, + "learning_rate": 0.0001, + "loss": 4.4603, + "loss/crossentropy": 2.346954107284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24070723354816437, + "step": 14180 + }, + { + "epoch": 0.28364, + "grad_norm": 1.953125, + "grad_norm_var": 0.013755035400390626, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.241390824317932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247970700263977, + "step": 14182 + }, + { + "epoch": 0.28368, + "grad_norm": 1.890625, + "grad_norm_var": 0.02080078125, + "learning_rate": 0.0001, + "loss": 4.206, + "loss/crossentropy": 2.234646439552307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21947349607944489, + "step": 14184 + }, + { + "epoch": 0.28372, + "grad_norm": 1.9375, + "grad_norm_var": 0.019010416666666665, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 1.9331231117248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428168773651123, + "step": 14186 + }, + { + "epoch": 0.28376, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015315500895182292, + "learning_rate": 0.0001, + "loss": 4.0184, + "loss/crossentropy": 1.8166351318359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21285146474838257, + "step": 14188 + }, + { + "epoch": 0.2838, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015077463785807292, + "learning_rate": 0.0001, + "loss": 4.313, + "loss/crossentropy": 2.181701421737671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20924244076013565, + "step": 14190 + }, + { + "epoch": 0.28384, + "grad_norm": 2.078125, + "grad_norm_var": 0.015558878580729166, + "learning_rate": 0.0001, + "loss": 4.0866, + "loss/crossentropy": 2.1605955958366394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24021095037460327, + "step": 14192 + }, + { + "epoch": 0.28388, + "grad_norm": 2.1875, + "grad_norm_var": 0.015455881754557291, + "learning_rate": 0.0001, + "loss": 4.1617, + "loss/crossentropy": 2.3546024560928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24228987097740173, + "step": 14194 + }, + { + "epoch": 0.28392, + "grad_norm": 2.015625, + "grad_norm_var": 0.015314737955729166, + "learning_rate": 0.0001, + "loss": 4.0791, + "loss/crossentropy": 2.2335511445999146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21917300671339035, + "step": 14196 + }, + { + "epoch": 0.28396, + "grad_norm": 2.0, + "grad_norm_var": 0.012760416666666666, + "learning_rate": 0.0001, + "loss": 3.9915, + "loss/crossentropy": 1.8429189324378967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18830660730600357, + "step": 14198 + }, + { + "epoch": 0.284, + "grad_norm": 2.015625, + "grad_norm_var": 0.0058746337890625, + "learning_rate": 0.0001, + "loss": 4.2671, + "loss/crossentropy": 2.4578241109848022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23527230322360992, + "step": 14200 + }, + { + "epoch": 0.28404, + "grad_norm": 2.03125, + "grad_norm_var": 0.00592041015625, + "learning_rate": 0.0001, + "loss": 4.3027, + "loss/crossentropy": 2.0944234132766724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211083546280861, + "step": 14202 + }, + { + "epoch": 0.28408, + "grad_norm": 1.90625, + "grad_norm_var": 0.0066070556640625, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 1.8876588344573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908133327960968, + "step": 14204 + }, + { + "epoch": 0.28412, + "grad_norm": 2.0625, + "grad_norm_var": 0.006190745035807291, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 2.0882590413093567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21159439533948898, + "step": 14206 + }, + { + "epoch": 0.28416, + "grad_norm": 2.015625, + "grad_norm_var": 0.005296834309895833, + "learning_rate": 0.0001, + "loss": 4.3005, + "loss/crossentropy": 2.219560742378235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2282530516386032, + "step": 14208 + }, + { + "epoch": 0.2842, + "grad_norm": 2.015625, + "grad_norm_var": 0.004759724934895833, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 1.8383984565734863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1868659406900406, + "step": 14210 + }, + { + "epoch": 0.28424, + "grad_norm": 2.125, + "grad_norm_var": 0.006892649332682291, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 1.7509828209877014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862550526857376, + "step": 14212 + }, + { + "epoch": 0.28428, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007513173421223958, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 2.1544927954673767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19378253817558289, + "step": 14214 + }, + { + "epoch": 0.28432, + "grad_norm": 2.03125, + "grad_norm_var": 0.007533518473307291, + "learning_rate": 0.0001, + "loss": 4.1261, + "loss/crossentropy": 2.498751997947693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22881630063056946, + "step": 14216 + }, + { + "epoch": 0.28436, + "grad_norm": 2.09375, + "grad_norm_var": 0.006780751546223958, + "learning_rate": 0.0001, + "loss": 4.3414, + "loss/crossentropy": 2.2638392448425293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21407127380371094, + "step": 14218 + }, + { + "epoch": 0.2844, + "grad_norm": 2.015625, + "grad_norm_var": 0.006380208333333333, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 2.097207546234131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21202068030834198, + "step": 14220 + }, + { + "epoch": 0.28444, + "grad_norm": 2.0625, + "grad_norm_var": 0.007661946614583333, + "learning_rate": 0.0001, + "loss": 4.211, + "loss/crossentropy": 2.0577695965766907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21462847292423248, + "step": 14222 + }, + { + "epoch": 0.28448, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008373769124348958, + "learning_rate": 0.0001, + "loss": 4.2646, + "loss/crossentropy": 2.3061472177505493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23122046887874603, + "step": 14224 + }, + { + "epoch": 0.28452, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0068511962890625, + "learning_rate": 0.0001, + "loss": 3.947, + "loss/crossentropy": 1.850829005241394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19229399412870407, + "step": 14226 + }, + { + "epoch": 0.28456, + "grad_norm": 1.9375, + "grad_norm_var": 0.0051348368326822914, + "learning_rate": 0.0001, + "loss": 4.0891, + "loss/crossentropy": 1.9520751237869263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19159140437841415, + "step": 14228 + }, + { + "epoch": 0.2846, + "grad_norm": 1.875, + "grad_norm_var": 0.005145009358723958, + "learning_rate": 0.0001, + "loss": 4.2161, + "loss/crossentropy": 2.507380247116089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365024477243423, + "step": 14230 + }, + { + "epoch": 0.28464, + "grad_norm": 2.046875, + "grad_norm_var": 0.006261952718098958, + "learning_rate": 0.0001, + "loss": 4.0787, + "loss/crossentropy": 1.9148982763290405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785508513450623, + "step": 14232 + }, + { + "epoch": 0.28468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007769521077473958, + "learning_rate": 0.0001, + "loss": 4.1882, + "loss/crossentropy": 2.004162549972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192252054810524, + "step": 14234 + }, + { + "epoch": 0.28472, + "grad_norm": 1.9375, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 4.1318, + "loss/crossentropy": 2.1455901861190796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151784479618073, + "step": 14236 + }, + { + "epoch": 0.28476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005580393473307291, + "learning_rate": 0.0001, + "loss": 3.9548, + "loss/crossentropy": 2.2556833028793335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20640526711940765, + "step": 14238 + }, + { + "epoch": 0.2848, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013698069254557292, + "learning_rate": 0.0001, + "loss": 4.3805, + "loss/crossentropy": 2.1121991872787476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21154547482728958, + "step": 14240 + }, + { + "epoch": 0.28484, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01586888631184896, + "learning_rate": 0.0001, + "loss": 3.7502, + "loss/crossentropy": 1.9072884321212769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19220874458551407, + "step": 14242 + }, + { + "epoch": 0.28488, + "grad_norm": 2.015625, + "grad_norm_var": 0.0154052734375, + "learning_rate": 0.0001, + "loss": 4.1925, + "loss/crossentropy": 1.907566249370575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355048030614853, + "step": 14244 + }, + { + "epoch": 0.28492, + "grad_norm": 1.859375, + "grad_norm_var": 0.0149169921875, + "learning_rate": 0.0001, + "loss": 3.8873, + "loss/crossentropy": 2.1140421628952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535241812467575, + "step": 14246 + }, + { + "epoch": 0.28496, + "grad_norm": 2.125, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.1704, + "loss/crossentropy": 2.192967176437378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353975892066956, + "step": 14248 + }, + { + "epoch": 0.285, + "grad_norm": 1.90625, + "grad_norm_var": 0.015213775634765624, + "learning_rate": 0.0001, + "loss": 3.9064, + "loss/crossentropy": 1.6658200025558472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17274750024080276, + "step": 14250 + }, + { + "epoch": 0.28504, + "grad_norm": 2.171875, + "grad_norm_var": 0.017411041259765624, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.014496326446533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203952856361866, + "step": 14252 + }, + { + "epoch": 0.28508, + "grad_norm": 2.15625, + "grad_norm_var": 0.0249908447265625, + "learning_rate": 0.0001, + "loss": 4.4607, + "loss/crossentropy": 2.1534151434898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23526588827371597, + "step": 14254 + }, + { + "epoch": 0.28512, + "grad_norm": 2.0625, + "grad_norm_var": 0.01701838175455729, + "learning_rate": 0.0001, + "loss": 4.1263, + "loss/crossentropy": 2.08686763048172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20443058758974075, + "step": 14256 + }, + { + "epoch": 0.28516, + "grad_norm": 2.0625, + "grad_norm_var": 0.0135406494140625, + "learning_rate": 0.0001, + "loss": 4.314, + "loss/crossentropy": 1.9428821802139282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20955658704042435, + "step": 14258 + }, + { + "epoch": 0.2852, + "grad_norm": 1.921875, + "grad_norm_var": 0.015710194905598957, + "learning_rate": 0.0001, + "loss": 3.9157, + "loss/crossentropy": 1.8959371447563171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648592710494995, + "step": 14260 + }, + { + "epoch": 0.28524, + "grad_norm": 1.8203125, + "grad_norm_var": 0.017081705729166667, + "learning_rate": 0.0001, + "loss": 3.8246, + "loss/crossentropy": 1.7842467427253723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17766441404819489, + "step": 14262 + }, + { + "epoch": 0.28528, + "grad_norm": 1.84375, + "grad_norm_var": 0.018805948893229167, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 1.989040195941925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20939016342163086, + "step": 14264 + }, + { + "epoch": 0.28532, + "grad_norm": 1.953125, + "grad_norm_var": 0.018656158447265626, + "learning_rate": 0.0001, + "loss": 3.9216, + "loss/crossentropy": 2.0441301465034485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20979472994804382, + "step": 14266 + }, + { + "epoch": 0.28536, + "grad_norm": 1.875, + "grad_norm_var": 0.017970530192057292, + "learning_rate": 0.0001, + "loss": 4.2181, + "loss/crossentropy": 1.9348166584968567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930733323097229, + "step": 14268 + }, + { + "epoch": 0.2854, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009650675455729167, + "learning_rate": 0.0001, + "loss": 4.2039, + "loss/crossentropy": 2.056081175804138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20383870601654053, + "step": 14270 + }, + { + "epoch": 0.28544, + "grad_norm": 2.125, + "grad_norm_var": 0.011271158854166666, + "learning_rate": 0.0001, + "loss": 4.4046, + "loss/crossentropy": 2.2752946615219116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2458929866552353, + "step": 14272 + }, + { + "epoch": 0.28548, + "grad_norm": 2.0625, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.1693, + "loss/crossentropy": 2.093926787376404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211938977241516, + "step": 14274 + }, + { + "epoch": 0.28552, + "grad_norm": 2.0, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.448614239692688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23177827894687653, + "step": 14276 + }, + { + "epoch": 0.28556, + "grad_norm": 1.765625, + "grad_norm_var": 0.012401326497395834, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 2.0047106742858887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162417769432068, + "step": 14278 + }, + { + "epoch": 0.2856, + "grad_norm": 2.328125, + "grad_norm_var": 0.017986806233723958, + "learning_rate": 0.0001, + "loss": 4.1669, + "loss/crossentropy": 1.982455313205719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22751964628696442, + "step": 14280 + }, + { + "epoch": 0.28564, + "grad_norm": 2.0625, + "grad_norm_var": 0.017411295572916666, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 1.872545838356018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046122580766678, + "step": 14282 + }, + { + "epoch": 0.28568, + "grad_norm": 2.109375, + "grad_norm_var": 0.016388956705729166, + "learning_rate": 0.0001, + "loss": 4.1266, + "loss/crossentropy": 1.8999648690223694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155570462346077, + "step": 14284 + }, + { + "epoch": 0.28572, + "grad_norm": 2.0, + "grad_norm_var": 0.014129384358723959, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 1.8433392643928528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18906012177467346, + "step": 14286 + }, + { + "epoch": 0.28576, + "grad_norm": 2.15625, + "grad_norm_var": 0.014697011311848958, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.817060947418213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473239570856094, + "step": 14288 + }, + { + "epoch": 0.2858, + "grad_norm": 2.265625, + "grad_norm_var": 0.01796239217122396, + "learning_rate": 0.0001, + "loss": 4.1856, + "loss/crossentropy": 2.0344348549842834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991667568683624, + "step": 14290 + }, + { + "epoch": 0.28584, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.2334, + "loss/crossentropy": 2.3413926362991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2365436628460884, + "step": 14292 + }, + { + "epoch": 0.28588, + "grad_norm": 2.078125, + "grad_norm_var": 0.015464019775390626, + "learning_rate": 0.0001, + "loss": 3.9437, + "loss/crossentropy": 2.0959852933883667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19703902304172516, + "step": 14294 + }, + { + "epoch": 0.28592, + "grad_norm": 7.625, + "grad_norm_var": 1.9304239908854166, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 2.220101237297058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21996742486953735, + "step": 14296 + }, + { + "epoch": 0.28596, + "grad_norm": 2.109375, + "grad_norm_var": 1.915691884358724, + "learning_rate": 0.0001, + "loss": 4.1806, + "loss/crossentropy": 2.073485493659973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020936980843544, + "step": 14298 + }, + { + "epoch": 0.286, + "grad_norm": 1.96875, + "grad_norm_var": 1.9216265360514322, + "learning_rate": 0.0001, + "loss": 3.9176, + "loss/crossentropy": 2.228920817375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22304438799619675, + "step": 14300 + }, + { + "epoch": 0.28604, + "grad_norm": 1.890625, + "grad_norm_var": 1.933642323811849, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 1.724998950958252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1592913344502449, + "step": 14302 + }, + { + "epoch": 0.28608, + "grad_norm": 2.0, + "grad_norm_var": 1.9375221252441406, + "learning_rate": 0.0001, + "loss": 4.1738, + "loss/crossentropy": 1.8970724940299988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18843455612659454, + "step": 14304 + }, + { + "epoch": 0.28612, + "grad_norm": 2.09375, + "grad_norm_var": 1.9384295145670574, + "learning_rate": 0.0001, + "loss": 4.0808, + "loss/crossentropy": 1.9957273602485657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18272704631090164, + "step": 14306 + }, + { + "epoch": 0.28616, + "grad_norm": 2.015625, + "grad_norm_var": 1.9298177083333334, + "learning_rate": 0.0001, + "loss": 4.3861, + "loss/crossentropy": 2.1424412727355957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21485400944948196, + "step": 14308 + }, + { + "epoch": 0.2862, + "grad_norm": 1.9765625, + "grad_norm_var": 1.9411211649576823, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 1.7729334235191345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20436155050992966, + "step": 14310 + }, + { + "epoch": 0.28624, + "grad_norm": 1.96875, + "grad_norm_var": 0.013952382405598958, + "learning_rate": 0.0001, + "loss": 4.2495, + "loss/crossentropy": 2.5863327980041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24598564952611923, + "step": 14312 + }, + { + "epoch": 0.28628, + "grad_norm": 1.9375, + "grad_norm_var": 0.01014404296875, + "learning_rate": 0.0001, + "loss": 3.962, + "loss/crossentropy": 1.9089832305908203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905580461025238, + "step": 14314 + }, + { + "epoch": 0.28632, + "grad_norm": 2.09375, + "grad_norm_var": 0.010228474934895834, + "learning_rate": 0.0001, + "loss": 4.2167, + "loss/crossentropy": 2.0153337121009827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21556401252746582, + "step": 14316 + }, + { + "epoch": 0.28636, + "grad_norm": 2.015625, + "grad_norm_var": 0.008776601155598958, + "learning_rate": 0.0001, + "loss": 4.4037, + "loss/crossentropy": 1.9692147970199585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17990782111883163, + "step": 14318 + }, + { + "epoch": 0.2864, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 2.0404393076896667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108697146177292, + "step": 14320 + }, + { + "epoch": 0.28644, + "grad_norm": 2.0, + "grad_norm_var": 0.009488932291666667, + "learning_rate": 0.0001, + "loss": 4.2809, + "loss/crossentropy": 2.133711099624634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942032128572464, + "step": 14322 + }, + { + "epoch": 0.28648, + "grad_norm": 2.0, + "grad_norm_var": 0.004878489176432291, + "learning_rate": 0.0001, + "loss": 4.1605, + "loss/crossentropy": 1.9240726232528687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18174728006124496, + "step": 14324 + }, + { + "epoch": 0.28652, + "grad_norm": 2.0625, + "grad_norm_var": 0.00850830078125, + "learning_rate": 0.0001, + "loss": 4.2671, + "loss/crossentropy": 2.0952632427215576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20621410757303238, + "step": 14326 + }, + { + "epoch": 0.28656, + "grad_norm": 2.046875, + "grad_norm_var": 0.008326975504557292, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.2604600191116333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2310045212507248, + "step": 14328 + }, + { + "epoch": 0.2866, + "grad_norm": 2.09375, + "grad_norm_var": 0.00777587890625, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.0915167331695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23344147205352783, + "step": 14330 + }, + { + "epoch": 0.28664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00804443359375, + "learning_rate": 0.0001, + "loss": 3.9575, + "loss/crossentropy": 2.2928651571273804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23840509355068207, + "step": 14332 + }, + { + "epoch": 0.28668, + "grad_norm": 2.0, + "grad_norm_var": 0.008685048421223958, + "learning_rate": 0.0001, + "loss": 4.4469, + "loss/crossentropy": 2.261072874069214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22056513279676437, + "step": 14334 + }, + { + "epoch": 0.28672, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.7591394186019897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900082305073738, + "step": 14336 + }, + { + "epoch": 0.28676, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008426666259765625, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 2.129893183708191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578705728054047, + "step": 14338 + }, + { + "epoch": 0.2868, + "grad_norm": 2.078125, + "grad_norm_var": 0.008451334635416667, + "learning_rate": 0.0001, + "loss": 4.4884, + "loss/crossentropy": 2.0376795530319214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088106870651245, + "step": 14340 + }, + { + "epoch": 0.28684, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005018870035807292, + "learning_rate": 0.0001, + "loss": 4.282, + "loss/crossentropy": 2.1484848260879517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113661915063858, + "step": 14342 + }, + { + "epoch": 0.28688, + "grad_norm": 1.984375, + "grad_norm_var": 0.00496826171875, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 2.204781651496887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908530056476593, + "step": 14344 + }, + { + "epoch": 0.28692, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 1.8096813559532166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18912722170352936, + "step": 14346 + }, + { + "epoch": 0.28696, + "grad_norm": 1.921875, + "grad_norm_var": 0.005580393473307291, + "learning_rate": 0.0001, + "loss": 4.0057, + "loss/crossentropy": 1.8014967441558838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18832575529813766, + "step": 14348 + }, + { + "epoch": 0.287, + "grad_norm": 2.203125, + "grad_norm_var": 0.00789794921875, + "learning_rate": 0.0001, + "loss": 4.3211, + "loss/crossentropy": 2.1443156003952026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.232828289270401, + "step": 14350 + }, + { + "epoch": 0.28704, + "grad_norm": 2.125, + "grad_norm_var": 0.008259073893229166, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 2.1773927211761475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19964880496263504, + "step": 14352 + }, + { + "epoch": 0.28708, + "grad_norm": 2.0, + "grad_norm_var": 0.007795206705729167, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.157313585281372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149301916360855, + "step": 14354 + }, + { + "epoch": 0.28712, + "grad_norm": 2.046875, + "grad_norm_var": 0.007184855143229167, + "learning_rate": 0.0001, + "loss": 4.2938, + "loss/crossentropy": 2.3431146144866943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297225296497345, + "step": 14356 + }, + { + "epoch": 0.28716, + "grad_norm": 2.03125, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 4.2026, + "loss/crossentropy": 1.894954264163971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19812744110822678, + "step": 14358 + }, + { + "epoch": 0.2872, + "grad_norm": 1.9375, + "grad_norm_var": 0.008784993489583334, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.1701435446739197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183644235134125, + "step": 14360 + }, + { + "epoch": 0.28724, + "grad_norm": 2.015625, + "grad_norm_var": 0.007975006103515625, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 2.202796459197998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21229087561368942, + "step": 14362 + }, + { + "epoch": 0.28728, + "grad_norm": 2.203125, + "grad_norm_var": 0.009789784749348959, + "learning_rate": 0.0001, + "loss": 4.3692, + "loss/crossentropy": 2.225682258605957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22512775659561157, + "step": 14364 + }, + { + "epoch": 0.28732, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007088216145833334, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 1.9562655687332153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19557978957891464, + "step": 14366 + }, + { + "epoch": 0.28736, + "grad_norm": 2.0, + "grad_norm_var": 0.0062945048014322914, + "learning_rate": 0.0001, + "loss": 4.2922, + "loss/crossentropy": 1.794309377670288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851721778512001, + "step": 14368 + }, + { + "epoch": 0.2874, + "grad_norm": 2.0, + "grad_norm_var": 0.006660970052083334, + "learning_rate": 0.0001, + "loss": 4.1198, + "loss/crossentropy": 2.276759445667267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071758210659027, + "step": 14370 + }, + { + "epoch": 0.28744, + "grad_norm": 1.90625, + "grad_norm_var": 0.006696573893229167, + "learning_rate": 0.0001, + "loss": 4.1162, + "loss/crossentropy": 2.332028031349182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22525647282600403, + "step": 14372 + }, + { + "epoch": 0.28748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005997721354166667, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 1.975858986377716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405582010746002, + "step": 14374 + }, + { + "epoch": 0.28752, + "grad_norm": 1.984375, + "grad_norm_var": 0.006453196207682292, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.002572774887085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19543466717004776, + "step": 14376 + }, + { + "epoch": 0.28756, + "grad_norm": 1.96875, + "grad_norm_var": 0.006703440348307292, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 2.400219678878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23009777069091797, + "step": 14378 + }, + { + "epoch": 0.2876, + "grad_norm": 2.0, + "grad_norm_var": 0.003352864583333333, + "learning_rate": 0.0001, + "loss": 4.0071, + "loss/crossentropy": 1.6437376737594604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17785440385341644, + "step": 14380 + }, + { + "epoch": 0.28764, + "grad_norm": 2.15625, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 1.8833640813827515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22255511581897736, + "step": 14382 + }, + { + "epoch": 0.28768, + "grad_norm": 2.0, + "grad_norm_var": 0.007100423177083333, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 1.9113904237747192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050003483891487, + "step": 14384 + }, + { + "epoch": 0.28772, + "grad_norm": 1.90625, + "grad_norm_var": 0.007765452067057292, + "learning_rate": 0.0001, + "loss": 4.1114, + "loss/crossentropy": 2.236249327659607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21349424868822098, + "step": 14386 + }, + { + "epoch": 0.28776, + "grad_norm": 2.140625, + "grad_norm_var": 0.008090972900390625, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 1.8709489703178406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19309207051992416, + "step": 14388 + }, + { + "epoch": 0.2878, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007933553059895833, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 1.926324725151062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902244210243225, + "step": 14390 + }, + { + "epoch": 0.28784, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00810546875, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.898009479045868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698234856128693, + "step": 14392 + }, + { + "epoch": 0.28788, + "grad_norm": 2.1875, + "grad_norm_var": 0.0122711181640625, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 1.9728660583496094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033083364367485, + "step": 14394 + }, + { + "epoch": 0.28792, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01282958984375, + "learning_rate": 0.0001, + "loss": 3.7965, + "loss/crossentropy": 1.957255482673645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20039385557174683, + "step": 14396 + }, + { + "epoch": 0.28796, + "grad_norm": 2.0625, + "grad_norm_var": 0.009897613525390625, + "learning_rate": 0.0001, + "loss": 3.8525, + "loss/crossentropy": 1.8001562356948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059212401509285, + "step": 14398 + }, + { + "epoch": 0.288, + "grad_norm": 2.125, + "grad_norm_var": 0.011177571614583333, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 1.9166946411132812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19456689059734344, + "step": 14400 + }, + { + "epoch": 0.28804, + "grad_norm": 2.0625, + "grad_norm_var": 0.01221923828125, + "learning_rate": 0.0001, + "loss": 4.3302, + "loss/crossentropy": 2.033670485019684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24448300898075104, + "step": 14402 + }, + { + "epoch": 0.28808, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 3.8679, + "loss/crossentropy": 2.2668023705482483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20903322845697403, + "step": 14404 + }, + { + "epoch": 0.28812, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012532297770182292, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 2.1076024770736694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20488093793392181, + "step": 14406 + }, + { + "epoch": 0.28816, + "grad_norm": 2.078125, + "grad_norm_var": 0.012117258707682292, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 1.996046781539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725766569375992, + "step": 14408 + }, + { + "epoch": 0.2882, + "grad_norm": 2.046875, + "grad_norm_var": 0.008365885416666666, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 2.2031015157699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19491394609212875, + "step": 14410 + }, + { + "epoch": 0.28824, + "grad_norm": 2.109375, + "grad_norm_var": 0.008714803059895833, + "learning_rate": 0.0001, + "loss": 4.3962, + "loss/crossentropy": 2.0339081287384033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21979261189699173, + "step": 14412 + }, + { + "epoch": 0.28828, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011641438802083333, + "learning_rate": 0.0001, + "loss": 4.0042, + "loss/crossentropy": 1.9283286929130554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19970305263996124, + "step": 14414 + }, + { + "epoch": 0.28832, + "grad_norm": 2.046875, + "grad_norm_var": 0.011156972249348958, + "learning_rate": 0.0001, + "loss": 4.3879, + "loss/crossentropy": 2.1092429161071777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114599123597145, + "step": 14416 + }, + { + "epoch": 0.28836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01004638671875, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 1.8102002143859863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18407940864562988, + "step": 14418 + }, + { + "epoch": 0.2884, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008906809488932292, + "learning_rate": 0.0001, + "loss": 4.0943, + "loss/crossentropy": 2.3192719221115112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22208110988140106, + "step": 14420 + }, + { + "epoch": 0.28844, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0082183837890625, + "learning_rate": 0.0001, + "loss": 4.2268, + "loss/crossentropy": 2.2302430868148804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21011962741613388, + "step": 14422 + }, + { + "epoch": 0.28848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008542633056640625, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.3015987873077393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012081891298294, + "step": 14424 + }, + { + "epoch": 0.28852, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0077056884765625, + "learning_rate": 0.0001, + "loss": 4.3094, + "loss/crossentropy": 2.060473084449768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20174618810415268, + "step": 14426 + }, + { + "epoch": 0.28856, + "grad_norm": 2.3125, + "grad_norm_var": 0.014082845052083333, + "learning_rate": 0.0001, + "loss": 3.9594, + "loss/crossentropy": 1.8146299719810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18100766092538834, + "step": 14428 + }, + { + "epoch": 0.2886, + "grad_norm": 2.0, + "grad_norm_var": 0.011277008056640624, + "learning_rate": 0.0001, + "loss": 3.8444, + "loss/crossentropy": 2.1504935026168823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20493116229772568, + "step": 14430 + }, + { + "epoch": 0.28864, + "grad_norm": 2.0, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.4947997331619263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2339334562420845, + "step": 14432 + }, + { + "epoch": 0.28868, + "grad_norm": 2.3125, + "grad_norm_var": 0.01612116495768229, + "learning_rate": 0.0001, + "loss": 4.3011, + "loss/crossentropy": 2.421883702278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23815617710351944, + "step": 14434 + }, + { + "epoch": 0.28872, + "grad_norm": 2.0, + "grad_norm_var": 0.016600545247395834, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.093776822090149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20425530523061752, + "step": 14436 + }, + { + "epoch": 0.28876, + "grad_norm": 2.03125, + "grad_norm_var": 0.01817601521809896, + "learning_rate": 0.0001, + "loss": 3.9126, + "loss/crossentropy": 2.02763295173645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972591131925583, + "step": 14438 + }, + { + "epoch": 0.2888, + "grad_norm": 2.015625, + "grad_norm_var": 0.0176422119140625, + "learning_rate": 0.0001, + "loss": 4.1348, + "loss/crossentropy": 1.9769265055656433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186862014234066, + "step": 14440 + }, + { + "epoch": 0.28884, + "grad_norm": 1.9375, + "grad_norm_var": 0.019024403889973958, + "learning_rate": 0.0001, + "loss": 4.4997, + "loss/crossentropy": 2.34747314453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067507356405258, + "step": 14442 + }, + { + "epoch": 0.28888, + "grad_norm": 1.921875, + "grad_norm_var": 0.0128570556640625, + "learning_rate": 0.0001, + "loss": 4.1497, + "loss/crossentropy": 2.1036205887794495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231181040406227, + "step": 14444 + }, + { + "epoch": 0.28892, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01638768513997396, + "learning_rate": 0.0001, + "loss": 4.251, + "loss/crossentropy": 2.2350860834121704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23475627601146698, + "step": 14446 + }, + { + "epoch": 0.28896, + "grad_norm": 2.09375, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.2095, + "loss/crossentropy": 2.2336645126342773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20643477141857147, + "step": 14448 + }, + { + "epoch": 0.289, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012923177083333333, + "learning_rate": 0.0001, + "loss": 4.0793, + "loss/crossentropy": 2.345365524291992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20650553703308105, + "step": 14450 + }, + { + "epoch": 0.28904, + "grad_norm": 2.125, + "grad_norm_var": 0.0120025634765625, + "learning_rate": 0.0001, + "loss": 4.131, + "loss/crossentropy": 2.307543635368347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21354079991579056, + "step": 14452 + }, + { + "epoch": 0.28908, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012001291910807291, + "learning_rate": 0.0001, + "loss": 4.1006, + "loss/crossentropy": 1.8094390034675598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850133240222931, + "step": 14454 + }, + { + "epoch": 0.28912, + "grad_norm": 2.046875, + "grad_norm_var": 0.011424763997395834, + "learning_rate": 0.0001, + "loss": 4.1326, + "loss/crossentropy": 1.859872817993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226328507065773, + "step": 14456 + }, + { + "epoch": 0.28916, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009340159098307292, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 1.9507490396499634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18718113750219345, + "step": 14458 + }, + { + "epoch": 0.2892, + "grad_norm": 1.859375, + "grad_norm_var": 0.01126708984375, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 1.6642532348632812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1674812287092209, + "step": 14460 + }, + { + "epoch": 0.28924, + "grad_norm": 1.859375, + "grad_norm_var": 0.007039133707682292, + "learning_rate": 0.0001, + "loss": 3.9523, + "loss/crossentropy": 1.9071928262710571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18480891734361649, + "step": 14462 + }, + { + "epoch": 0.28928, + "grad_norm": 1.96875, + "grad_norm_var": 0.005718739827473959, + "learning_rate": 0.0001, + "loss": 4.2876, + "loss/crossentropy": 2.0758568048477173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002219259738922, + "step": 14464 + }, + { + "epoch": 0.28932, + "grad_norm": 2.078125, + "grad_norm_var": 0.006769816080729167, + "learning_rate": 0.0001, + "loss": 3.9438, + "loss/crossentropy": 1.9777529835700989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19538866728544235, + "step": 14466 + }, + { + "epoch": 0.28936, + "grad_norm": 2.0625, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 2.008077323436737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20578377693891525, + "step": 14468 + }, + { + "epoch": 0.2894, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006673177083333333, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 2.011266529560089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20298370718955994, + "step": 14470 + }, + { + "epoch": 0.28944, + "grad_norm": 1.9375, + "grad_norm_var": 0.0057769775390625, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 2.231672167778015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21847998350858688, + "step": 14472 + }, + { + "epoch": 0.28948, + "grad_norm": 1.984375, + "grad_norm_var": 0.007279205322265625, + "learning_rate": 0.0001, + "loss": 4.382, + "loss/crossentropy": 2.2268325090408325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168640047311783, + "step": 14474 + }, + { + "epoch": 0.28952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.2705942392349243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110700011253357, + "step": 14476 + }, + { + "epoch": 0.28956, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012566884358723959, + "learning_rate": 0.0001, + "loss": 3.8754, + "loss/crossentropy": 1.893052339553833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18865809589624405, + "step": 14478 + }, + { + "epoch": 0.2896, + "grad_norm": 1.84375, + "grad_norm_var": 0.013732655843098959, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.020704984664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18822921812534332, + "step": 14480 + }, + { + "epoch": 0.28964, + "grad_norm": 2.03125, + "grad_norm_var": 0.014632161458333333, + "learning_rate": 0.0001, + "loss": 4.1651, + "loss/crossentropy": 1.762313961982727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204545259475708, + "step": 14482 + }, + { + "epoch": 0.28968, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.0261, + "loss/crossentropy": 1.8793032765388489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17913836985826492, + "step": 14484 + }, + { + "epoch": 0.28972, + "grad_norm": 1.890625, + "grad_norm_var": 0.013372548421223958, + "learning_rate": 0.0001, + "loss": 4.2731, + "loss/crossentropy": 2.428762197494507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23766332119703293, + "step": 14486 + }, + { + "epoch": 0.28976, + "grad_norm": 2.046875, + "grad_norm_var": 0.014969635009765624, + "learning_rate": 0.0001, + "loss": 4.1134, + "loss/crossentropy": 2.2160138487815857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619082987308502, + "step": 14488 + }, + { + "epoch": 0.2898, + "grad_norm": 2.046875, + "grad_norm_var": 0.0150787353515625, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 1.8169404864311218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981067657470703, + "step": 14490 + }, + { + "epoch": 0.28984, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0131591796875, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.147166609764099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993183195590973, + "step": 14492 + }, + { + "epoch": 0.28988, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011362457275390625, + "learning_rate": 0.0001, + "loss": 3.8507, + "loss/crossentropy": 1.7740440964698792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18381935358047485, + "step": 14494 + }, + { + "epoch": 0.28992, + "grad_norm": 1.921875, + "grad_norm_var": 0.010428619384765626, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.206219792366028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21052061766386032, + "step": 14496 + }, + { + "epoch": 0.28996, + "grad_norm": 2.015625, + "grad_norm_var": 0.008600870768229166, + "learning_rate": 0.0001, + "loss": 4.4421, + "loss/crossentropy": 2.4030661582946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22094716131687164, + "step": 14498 + }, + { + "epoch": 0.29, + "grad_norm": 2.25, + "grad_norm_var": 0.016633097330729166, + "learning_rate": 0.0001, + "loss": 4.4593, + "loss/crossentropy": 2.142518997192383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2111787647008896, + "step": 14500 + }, + { + "epoch": 0.29004, + "grad_norm": 1.953125, + "grad_norm_var": 0.015633138020833333, + "learning_rate": 0.0001, + "loss": 4.3679, + "loss/crossentropy": 1.9228865504264832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389715045690536, + "step": 14502 + }, + { + "epoch": 0.29008, + "grad_norm": 2.09375, + "grad_norm_var": 0.014141591389973958, + "learning_rate": 0.0001, + "loss": 4.011, + "loss/crossentropy": 1.789183259010315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18835698068141937, + "step": 14504 + }, + { + "epoch": 0.29012, + "grad_norm": 1.9375, + "grad_norm_var": 0.013798014322916666, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.3876765966415405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23156627267599106, + "step": 14506 + }, + { + "epoch": 0.29016, + "grad_norm": 2.078125, + "grad_norm_var": 0.010151926676432292, + "learning_rate": 0.0001, + "loss": 4.2024, + "loss/crossentropy": 2.055350124835968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995788812637329, + "step": 14508 + }, + { + "epoch": 0.2902, + "grad_norm": 1.96875, + "grad_norm_var": 0.010007476806640625, + "learning_rate": 0.0001, + "loss": 4.343, + "loss/crossentropy": 2.2045267820358276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2197813093662262, + "step": 14510 + }, + { + "epoch": 0.29024, + "grad_norm": 1.765625, + "grad_norm_var": 0.014503733317057291, + "learning_rate": 0.0001, + "loss": 3.7136, + "loss/crossentropy": 1.9596665501594543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17329587787389755, + "step": 14512 + }, + { + "epoch": 0.29028, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017465972900390626, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 1.8273005485534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17768846452236176, + "step": 14514 + }, + { + "epoch": 0.29032, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011739095052083334, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.0428807735443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104019746184349, + "step": 14516 + }, + { + "epoch": 0.29036, + "grad_norm": 2.21875, + "grad_norm_var": 0.014216868082682292, + "learning_rate": 0.0001, + "loss": 4.3591, + "loss/crossentropy": 2.1242733001708984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153768166899681, + "step": 14518 + }, + { + "epoch": 0.2904, + "grad_norm": 1.875, + "grad_norm_var": 0.013944244384765625, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 1.9223615527153015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18382133543491364, + "step": 14520 + }, + { + "epoch": 0.29044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013993326822916667, + "learning_rate": 0.0001, + "loss": 3.9527, + "loss/crossentropy": 2.12492972612381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818739593029022, + "step": 14522 + }, + { + "epoch": 0.29048, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013665517171223959, + "learning_rate": 0.0001, + "loss": 4.1539, + "loss/crossentropy": 2.0251020789146423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19633153080940247, + "step": 14524 + }, + { + "epoch": 0.29052, + "grad_norm": 2.1875, + "grad_norm_var": 0.014949544270833334, + "learning_rate": 0.0001, + "loss": 4.192, + "loss/crossentropy": 2.177084445953369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22618898749351501, + "step": 14526 + }, + { + "epoch": 0.29056, + "grad_norm": 2.0625, + "grad_norm_var": 0.013997141520182292, + "learning_rate": 0.0001, + "loss": 3.9215, + "loss/crossentropy": 1.718012809753418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632279336452484, + "step": 14528 + }, + { + "epoch": 0.2906, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017850494384765624, + "learning_rate": 0.0001, + "loss": 4.2869, + "loss/crossentropy": 1.7026863098144531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.211147278547287, + "step": 14530 + }, + { + "epoch": 0.29064, + "grad_norm": 1.8671875, + "grad_norm_var": 0.017682902018229165, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 1.6652680039405823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16967280209064484, + "step": 14532 + }, + { + "epoch": 0.29068, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013996378580729166, + "learning_rate": 0.0001, + "loss": 4.0815, + "loss/crossentropy": 2.191527247428894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20215752720832825, + "step": 14534 + }, + { + "epoch": 0.29072, + "grad_norm": 2.125, + "grad_norm_var": 0.015927886962890624, + "learning_rate": 0.0001, + "loss": 3.8817, + "loss/crossentropy": 1.8385645747184753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19906776398420334, + "step": 14536 + }, + { + "epoch": 0.29076, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01665013631184896, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 2.0456249117851257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057228460907936, + "step": 14538 + }, + { + "epoch": 0.2908, + "grad_norm": 1.84375, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 4.0636, + "loss/crossentropy": 2.0413625836372375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20096366852521896, + "step": 14540 + }, + { + "epoch": 0.29084, + "grad_norm": 1.9609375, + "grad_norm_var": 0.016755167643229166, + "learning_rate": 0.0001, + "loss": 3.7819, + "loss/crossentropy": 1.723297357559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180135078728199, + "step": 14542 + }, + { + "epoch": 0.29088, + "grad_norm": 2.078125, + "grad_norm_var": 0.01578343709309896, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 2.3143200874328613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2289530113339424, + "step": 14544 + }, + { + "epoch": 0.29092, + "grad_norm": 2.046875, + "grad_norm_var": 0.010017903645833333, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.1612138748168945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21224550902843475, + "step": 14546 + }, + { + "epoch": 0.29096, + "grad_norm": 1.921875, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1101, + "loss/crossentropy": 2.0591673851013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913161754608154, + "step": 14548 + }, + { + "epoch": 0.291, + "grad_norm": 1.875, + "grad_norm_var": 0.009505208333333333, + "learning_rate": 0.0001, + "loss": 4.132, + "loss/crossentropy": 2.0901013016700745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19315654039382935, + "step": 14550 + }, + { + "epoch": 0.29104, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006833648681640625, + "learning_rate": 0.0001, + "loss": 3.9351, + "loss/crossentropy": 2.0716471672058105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951451674103737, + "step": 14552 + }, + { + "epoch": 0.29108, + "grad_norm": 1.984375, + "grad_norm_var": 0.008186848958333333, + "learning_rate": 0.0001, + "loss": 4.3656, + "loss/crossentropy": 2.084408760070801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21354945749044418, + "step": 14554 + }, + { + "epoch": 0.29112, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006486002604166667, + "learning_rate": 0.0001, + "loss": 4.0342, + "loss/crossentropy": 2.154771566390991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20277246832847595, + "step": 14556 + }, + { + "epoch": 0.29116, + "grad_norm": 2.09375, + "grad_norm_var": 0.0069353739420572914, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 2.24834144115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23204077035188675, + "step": 14558 + }, + { + "epoch": 0.2912, + "grad_norm": 1.859375, + "grad_norm_var": 0.006951649983723958, + "learning_rate": 0.0001, + "loss": 3.8054, + "loss/crossentropy": 1.8239850401878357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18208947032690048, + "step": 14560 + }, + { + "epoch": 0.29124, + "grad_norm": 1.90625, + "grad_norm_var": 0.006701405843098958, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 1.969277262687683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093023955821991, + "step": 14562 + }, + { + "epoch": 0.29128, + "grad_norm": 2.046875, + "grad_norm_var": 0.007670084635416667, + "learning_rate": 0.0001, + "loss": 3.7176, + "loss/crossentropy": 1.561119556427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1653747707605362, + "step": 14564 + }, + { + "epoch": 0.29132, + "grad_norm": 2.0625, + "grad_norm_var": 0.008154042561848958, + "learning_rate": 0.0001, + "loss": 4.3307, + "loss/crossentropy": 2.2341216802597046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23967482149600983, + "step": 14566 + }, + { + "epoch": 0.29136, + "grad_norm": 1.921875, + "grad_norm_var": 0.008227284749348958, + "learning_rate": 0.0001, + "loss": 3.9899, + "loss/crossentropy": 1.999854326248169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19598717242479324, + "step": 14568 + }, + { + "epoch": 0.2914, + "grad_norm": 1.953125, + "grad_norm_var": 0.006898752848307292, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 1.8079020380973816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919228658080101, + "step": 14570 + }, + { + "epoch": 0.29144, + "grad_norm": 2.109375, + "grad_norm_var": 0.009970855712890626, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 2.044901430606842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21737617254257202, + "step": 14572 + }, + { + "epoch": 0.29148, + "grad_norm": 2.03125, + "grad_norm_var": 0.009806060791015625, + "learning_rate": 0.0001, + "loss": 4.0498, + "loss/crossentropy": 1.972197949886322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828764885663986, + "step": 14574 + }, + { + "epoch": 0.29152, + "grad_norm": 2.015625, + "grad_norm_var": 0.010131581624348959, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 1.9129782915115356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17108886688947678, + "step": 14576 + }, + { + "epoch": 0.29156, + "grad_norm": 2.109375, + "grad_norm_var": 0.01068115234375, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.0545085072517395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631857216358185, + "step": 14578 + }, + { + "epoch": 0.2916, + "grad_norm": 2.015625, + "grad_norm_var": 0.012654368082682292, + "learning_rate": 0.0001, + "loss": 4.355, + "loss/crossentropy": 2.0274637937545776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099764049053192, + "step": 14580 + }, + { + "epoch": 0.29164, + "grad_norm": 1.875, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 3.9014, + "loss/crossentropy": 1.8154139518737793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045238196849823, + "step": 14582 + }, + { + "epoch": 0.29168, + "grad_norm": 2.0625, + "grad_norm_var": 0.014937082926432291, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.061666965484619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21272550523281097, + "step": 14584 + }, + { + "epoch": 0.29172, + "grad_norm": 2.03125, + "grad_norm_var": 0.01580785115559896, + "learning_rate": 0.0001, + "loss": 4.6019, + "loss/crossentropy": 2.3831146955490112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22327116876840591, + "step": 14586 + }, + { + "epoch": 0.29176, + "grad_norm": 2.046875, + "grad_norm_var": 0.013090006510416667, + "learning_rate": 0.0001, + "loss": 4.2142, + "loss/crossentropy": 1.8024229407310486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871618777513504, + "step": 14588 + }, + { + "epoch": 0.2918, + "grad_norm": 2.0625, + "grad_norm_var": 0.014967600504557291, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.289118528366089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22116435319185257, + "step": 14590 + }, + { + "epoch": 0.29184, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013277180989583333, + "learning_rate": 0.0001, + "loss": 4.1358, + "loss/crossentropy": 2.11151522397995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029711306095123, + "step": 14592 + }, + { + "epoch": 0.29188, + "grad_norm": 2.125, + "grad_norm_var": 0.013288370768229167, + "learning_rate": 0.0001, + "loss": 4.3103, + "loss/crossentropy": 1.7856897711753845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815958321094513, + "step": 14594 + }, + { + "epoch": 0.29192, + "grad_norm": 2.078125, + "grad_norm_var": 0.0101959228515625, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 1.9745690822601318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074003368616104, + "step": 14596 + }, + { + "epoch": 0.29196, + "grad_norm": 1.953125, + "grad_norm_var": 0.0091461181640625, + "learning_rate": 0.0001, + "loss": 4.3034, + "loss/crossentropy": 1.9199401140213013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19998926669359207, + "step": 14598 + }, + { + "epoch": 0.292, + "grad_norm": 1.953125, + "grad_norm_var": 0.007743072509765625, + "learning_rate": 0.0001, + "loss": 4.0842, + "loss/crossentropy": 1.9938844442367554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20138052105903625, + "step": 14600 + }, + { + "epoch": 0.29204, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0059397379557291664, + "learning_rate": 0.0001, + "loss": 4.0025, + "loss/crossentropy": 1.6290993094444275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716775625944138, + "step": 14602 + }, + { + "epoch": 0.29208, + "grad_norm": 1.875, + "grad_norm_var": 0.006115468343098959, + "learning_rate": 0.0001, + "loss": 3.9953, + "loss/crossentropy": 2.116807520389557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113799750804901, + "step": 14604 + }, + { + "epoch": 0.29212, + "grad_norm": 2.140625, + "grad_norm_var": 0.006392415364583333, + "learning_rate": 0.0001, + "loss": 4.0527, + "loss/crossentropy": 1.6732030510902405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776474684476852, + "step": 14606 + }, + { + "epoch": 0.29216, + "grad_norm": 2.03125, + "grad_norm_var": 0.006762440999348958, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 1.9403682351112366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18560070544481277, + "step": 14608 + }, + { + "epoch": 0.2922, + "grad_norm": 2.3125, + "grad_norm_var": 0.011641438802083333, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 1.9337440729141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172246277332306, + "step": 14610 + }, + { + "epoch": 0.29224, + "grad_norm": 2.0625, + "grad_norm_var": 0.011836751302083334, + "learning_rate": 0.0001, + "loss": 4.1839, + "loss/crossentropy": 2.1603177785873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076597660779953, + "step": 14612 + }, + { + "epoch": 0.29228, + "grad_norm": 1.953125, + "grad_norm_var": 0.0118804931640625, + "learning_rate": 0.0001, + "loss": 4.1103, + "loss/crossentropy": 1.979960322380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007075995206833, + "step": 14614 + }, + { + "epoch": 0.29232, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0134429931640625, + "learning_rate": 0.0001, + "loss": 3.9901, + "loss/crossentropy": 2.1262378096580505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033269852399826, + "step": 14616 + }, + { + "epoch": 0.29236, + "grad_norm": 1.953125, + "grad_norm_var": 0.013327789306640626, + "learning_rate": 0.0001, + "loss": 3.9001, + "loss/crossentropy": 1.836608648300171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17032578587532043, + "step": 14618 + }, + { + "epoch": 0.2924, + "grad_norm": 1.984375, + "grad_norm_var": 0.012473297119140626, + "learning_rate": 0.0001, + "loss": 4.0165, + "loss/crossentropy": 1.784467339515686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187791109085083, + "step": 14620 + }, + { + "epoch": 0.29244, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010196940104166666, + "learning_rate": 0.0001, + "loss": 4.2963, + "loss/crossentropy": 2.5023289918899536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2469349130988121, + "step": 14622 + }, + { + "epoch": 0.29248, + "grad_norm": 2.1875, + "grad_norm_var": 0.0142974853515625, + "learning_rate": 0.0001, + "loss": 4.3748, + "loss/crossentropy": 2.244086742401123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20690800249576569, + "step": 14624 + }, + { + "epoch": 0.29252, + "grad_norm": 2.109375, + "grad_norm_var": 0.008829752604166666, + "learning_rate": 0.0001, + "loss": 4.2594, + "loss/crossentropy": 2.167649209499359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23074676096439362, + "step": 14626 + }, + { + "epoch": 0.29256, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0092437744140625, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 1.934467613697052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041843980550766, + "step": 14628 + }, + { + "epoch": 0.2926, + "grad_norm": 2.03125, + "grad_norm_var": 0.0152740478515625, + "learning_rate": 0.0001, + "loss": 4.3277, + "loss/crossentropy": 2.457708954811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24325969815254211, + "step": 14630 + }, + { + "epoch": 0.29264, + "grad_norm": 2.015625, + "grad_norm_var": 0.014446767171223958, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 2.165565609931946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20386752486228943, + "step": 14632 + }, + { + "epoch": 0.29268, + "grad_norm": 2.0625, + "grad_norm_var": 0.013677724202473958, + "learning_rate": 0.0001, + "loss": 4.0767, + "loss/crossentropy": 1.9475921988487244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2017974555492401, + "step": 14634 + }, + { + "epoch": 0.29272, + "grad_norm": 1.8203125, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 1.95538991689682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920305162668228, + "step": 14636 + }, + { + "epoch": 0.29276, + "grad_norm": 2.234375, + "grad_norm_var": 0.01803766886393229, + "learning_rate": 0.0001, + "loss": 4.4073, + "loss/crossentropy": 2.1813108921051025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130545824766159, + "step": 14638 + }, + { + "epoch": 0.2928, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01672337849934896, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.198926568031311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21660766005516052, + "step": 14640 + }, + { + "epoch": 0.29284, + "grad_norm": 2.015625, + "grad_norm_var": 0.01692682902018229, + "learning_rate": 0.0001, + "loss": 4.2872, + "loss/crossentropy": 2.244659662246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22478342056274414, + "step": 14642 + }, + { + "epoch": 0.29288, + "grad_norm": 1.984375, + "grad_norm_var": 0.01608861287434896, + "learning_rate": 0.0001, + "loss": 4.234, + "loss/crossentropy": 2.1159621477127075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22023546695709229, + "step": 14644 + }, + { + "epoch": 0.29292, + "grad_norm": 1.859375, + "grad_norm_var": 0.011067708333333334, + "learning_rate": 0.0001, + "loss": 3.9166, + "loss/crossentropy": 2.2336114645004272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20683540403842926, + "step": 14646 + }, + { + "epoch": 0.29296, + "grad_norm": 2.078125, + "grad_norm_var": 0.011128743489583334, + "learning_rate": 0.0001, + "loss": 3.8647, + "loss/crossentropy": 1.9015604257583618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2791958376765251, + "step": 14648 + }, + { + "epoch": 0.293, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011180623372395834, + "learning_rate": 0.0001, + "loss": 3.7325, + "loss/crossentropy": 1.6337950229644775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17279411852359772, + "step": 14650 + }, + { + "epoch": 0.29304, + "grad_norm": 2.0625, + "grad_norm_var": 0.009618123372395834, + "learning_rate": 0.0001, + "loss": 4.111, + "loss/crossentropy": 1.920238435268402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18912465870380402, + "step": 14652 + }, + { + "epoch": 0.29308, + "grad_norm": 2.046875, + "grad_norm_var": 0.006075032552083333, + "learning_rate": 0.0001, + "loss": 4.4346, + "loss/crossentropy": 2.5529074668884277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23994534462690353, + "step": 14654 + }, + { + "epoch": 0.29312, + "grad_norm": 1.953125, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 4.3154, + "loss/crossentropy": 2.1320537328720093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21480896323919296, + "step": 14656 + }, + { + "epoch": 0.29316, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013205718994140626, + "learning_rate": 0.0001, + "loss": 3.867, + "loss/crossentropy": 1.8921156525611877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1749916449189186, + "step": 14658 + }, + { + "epoch": 0.2932, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015433502197265626, + "learning_rate": 0.0001, + "loss": 3.7405, + "loss/crossentropy": 2.2475300431251526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20675917714834213, + "step": 14660 + }, + { + "epoch": 0.29324, + "grad_norm": 2.046875, + "grad_norm_var": 0.015384928385416666, + "learning_rate": 0.0001, + "loss": 4.2091, + "loss/crossentropy": 2.291227698326111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2244400456547737, + "step": 14662 + }, + { + "epoch": 0.29328, + "grad_norm": 2.09375, + "grad_norm_var": 0.017976888020833335, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 1.8830837607383728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137318760156631, + "step": 14664 + }, + { + "epoch": 0.29332, + "grad_norm": 2.078125, + "grad_norm_var": 0.019742838541666665, + "learning_rate": 0.0001, + "loss": 4.3424, + "loss/crossentropy": 2.366846203804016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22360631078481674, + "step": 14666 + }, + { + "epoch": 0.29336, + "grad_norm": 1.875, + "grad_norm_var": 0.021720123291015626, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 2.165378987789154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088456228375435, + "step": 14668 + }, + { + "epoch": 0.2934, + "grad_norm": 1.9609375, + "grad_norm_var": 0.021809895833333332, + "learning_rate": 0.0001, + "loss": 4.178, + "loss/crossentropy": 1.782981276512146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654509842395782, + "step": 14670 + }, + { + "epoch": 0.29344, + "grad_norm": 2.28125, + "grad_norm_var": 0.021581013997395832, + "learning_rate": 0.0001, + "loss": 4.0796, + "loss/crossentropy": 1.9891030192375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18961766362190247, + "step": 14672 + }, + { + "epoch": 0.29348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015738932291666667, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 1.9350959062576294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124071717262268, + "step": 14674 + }, + { + "epoch": 0.29352, + "grad_norm": 2.21875, + "grad_norm_var": 0.01131591796875, + "learning_rate": 0.0001, + "loss": 4.2651, + "loss/crossentropy": 2.1763634085655212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027570754289627, + "step": 14676 + }, + { + "epoch": 0.29356, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012499745686848958, + "learning_rate": 0.0001, + "loss": 4.1502, + "loss/crossentropy": 1.741381287574768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17308862507343292, + "step": 14678 + }, + { + "epoch": 0.2936, + "grad_norm": 2.03125, + "grad_norm_var": 0.011633046468098958, + "learning_rate": 0.0001, + "loss": 4.2012, + "loss/crossentropy": 2.352774977684021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20929308235645294, + "step": 14680 + }, + { + "epoch": 0.29364, + "grad_norm": 2.0625, + "grad_norm_var": 0.012679036458333333, + "learning_rate": 0.0001, + "loss": 3.9799, + "loss/crossentropy": 1.6342085003852844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17740759253501892, + "step": 14682 + }, + { + "epoch": 0.29368, + "grad_norm": 2.109375, + "grad_norm_var": 0.0110260009765625, + "learning_rate": 0.0001, + "loss": 4.1968, + "loss/crossentropy": 2.315647602081299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202264443039894, + "step": 14684 + }, + { + "epoch": 0.29372, + "grad_norm": 2.09375, + "grad_norm_var": 0.011860911051432292, + "learning_rate": 0.0001, + "loss": 4.0276, + "loss/crossentropy": 2.1667529344558716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20557944476604462, + "step": 14686 + }, + { + "epoch": 0.29376, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012970987955729167, + "learning_rate": 0.0001, + "loss": 3.7212, + "loss/crossentropy": 1.9426026940345764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17865119129419327, + "step": 14688 + }, + { + "epoch": 0.2938, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013720703125, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 2.012951970100403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918969601392746, + "step": 14690 + }, + { + "epoch": 0.29384, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0097808837890625, + "learning_rate": 0.0001, + "loss": 4.2131, + "loss/crossentropy": 2.2038668394088745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520675718784332, + "step": 14692 + }, + { + "epoch": 0.29388, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009659576416015624, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.284528613090515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084675058722496, + "step": 14694 + }, + { + "epoch": 0.29392, + "grad_norm": 2.09375, + "grad_norm_var": 0.009991200764973958, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 2.3619518280029297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23107902705669403, + "step": 14696 + }, + { + "epoch": 0.29396, + "grad_norm": 2.0625, + "grad_norm_var": 0.009870402018229167, + "learning_rate": 0.0001, + "loss": 4.1488, + "loss/crossentropy": 2.279863119125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235084533691406, + "step": 14698 + }, + { + "epoch": 0.294, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009639231363932292, + "learning_rate": 0.0001, + "loss": 4.2216, + "loss/crossentropy": 2.128455936908722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21381118893623352, + "step": 14700 + }, + { + "epoch": 0.29404, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009014638264973958, + "learning_rate": 0.0001, + "loss": 4.0147, + "loss/crossentropy": 2.265346884727478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086036428809166, + "step": 14702 + }, + { + "epoch": 0.29408, + "grad_norm": 2.0, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.2778, + "loss/crossentropy": 2.4323991537094116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21795648336410522, + "step": 14704 + }, + { + "epoch": 0.29412, + "grad_norm": 2.203125, + "grad_norm_var": 0.30881729125976565, + "learning_rate": 0.0001, + "loss": 4.3864, + "loss/crossentropy": 2.2948319911956787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23302766680717468, + "step": 14706 + }, + { + "epoch": 0.29416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.3063140869140625, + "learning_rate": 0.0001, + "loss": 4.121, + "loss/crossentropy": 2.1273980140686035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2240317091345787, + "step": 14708 + }, + { + "epoch": 0.2942, + "grad_norm": 1.828125, + "grad_norm_var": 0.304357655843099, + "learning_rate": 0.0001, + "loss": 3.7468, + "loss/crossentropy": 2.0593321323394775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727306067943573, + "step": 14710 + }, + { + "epoch": 0.29424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.3060373942057292, + "learning_rate": 0.0001, + "loss": 3.9979, + "loss/crossentropy": 2.339016914367676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2313404530286789, + "step": 14712 + }, + { + "epoch": 0.29428, + "grad_norm": 1.9921875, + "grad_norm_var": 0.30822652180989585, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 2.1587395668029785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20085373520851135, + "step": 14714 + }, + { + "epoch": 0.29432, + "grad_norm": 2.078125, + "grad_norm_var": 0.3103912353515625, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.8317759037017822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20150133967399597, + "step": 14716 + }, + { + "epoch": 0.29436, + "grad_norm": 1.7890625, + "grad_norm_var": 0.31324437459309895, + "learning_rate": 0.0001, + "loss": 3.9291, + "loss/crossentropy": 1.849327266216278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17959115654230118, + "step": 14718 + }, + { + "epoch": 0.2944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.32099507649739584, + "learning_rate": 0.0001, + "loss": 3.7933, + "loss/crossentropy": 1.652997612953186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17243028432130814, + "step": 14720 + }, + { + "epoch": 0.29444, + "grad_norm": 2.03125, + "grad_norm_var": 0.012995402018229166, + "learning_rate": 0.0001, + "loss": 4.2919, + "loss/crossentropy": 1.968604028224945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20564967393875122, + "step": 14722 + }, + { + "epoch": 0.29448, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007466634114583333, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.349083185195923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24004054814577103, + "step": 14724 + }, + { + "epoch": 0.29452, + "grad_norm": 1.96875, + "grad_norm_var": 0.006200917561848958, + "learning_rate": 0.0001, + "loss": 4.2078, + "loss/crossentropy": 2.1629676818847656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260306179523468, + "step": 14726 + }, + { + "epoch": 0.29456, + "grad_norm": 2.265625, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 4.5995, + "loss/crossentropy": 2.221126079559326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21054977923631668, + "step": 14728 + }, + { + "epoch": 0.2946, + "grad_norm": 2.234375, + "grad_norm_var": 0.3638987223307292, + "learning_rate": 0.0001, + "loss": 4.2588, + "loss/crossentropy": 2.0333986282348633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27580036222934723, + "step": 14730 + }, + { + "epoch": 0.29464, + "grad_norm": 2.09375, + "grad_norm_var": 0.36292699178059895, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 2.0025470852851868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20166245102882385, + "step": 14732 + }, + { + "epoch": 0.29468, + "grad_norm": 2.046875, + "grad_norm_var": 0.35410868326822914, + "learning_rate": 0.0001, + "loss": 4.1044, + "loss/crossentropy": 2.2882933616638184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972086951136589, + "step": 14734 + }, + { + "epoch": 0.29472, + "grad_norm": 2.125, + "grad_norm_var": 0.3395342508951823, + "learning_rate": 0.0001, + "loss": 4.1616, + "loss/crossentropy": 2.2404085397720337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22015579044818878, + "step": 14736 + }, + { + "epoch": 0.29476, + "grad_norm": 2.234375, + "grad_norm_var": 0.33505223592122396, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 1.9309356808662415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082076519727707, + "step": 14738 + }, + { + "epoch": 0.2948, + "grad_norm": 2.046875, + "grad_norm_var": 0.33227437337239585, + "learning_rate": 0.0001, + "loss": 4.1652, + "loss/crossentropy": 2.2118934988975525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175088196992874, + "step": 14740 + }, + { + "epoch": 0.29484, + "grad_norm": 2.03125, + "grad_norm_var": 0.36171875, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.19997900724411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22470968216657639, + "step": 14742 + }, + { + "epoch": 0.29488, + "grad_norm": 2.09375, + "grad_norm_var": 0.365966796875, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 1.8140272498130798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20032335817813873, + "step": 14744 + }, + { + "epoch": 0.29492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.07648086547851562, + "learning_rate": 0.0001, + "loss": 3.957, + "loss/crossentropy": 2.0233620405197144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20464400947093964, + "step": 14746 + }, + { + "epoch": 0.29496, + "grad_norm": 2.390625, + "grad_norm_var": 0.07643407185872396, + "learning_rate": 0.0001, + "loss": 4.3946, + "loss/crossentropy": 2.3900705575942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23763202875852585, + "step": 14748 + }, + { + "epoch": 0.295, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07768325805664063, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 1.9895538687705994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736829191446304, + "step": 14750 + }, + { + "epoch": 0.29504, + "grad_norm": 2.03125, + "grad_norm_var": 0.0857421875, + "learning_rate": 0.0001, + "loss": 4.1454, + "loss/crossentropy": 1.7315555810928345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839754432439804, + "step": 14752 + }, + { + "epoch": 0.29508, + "grad_norm": 2.0625, + "grad_norm_var": 0.08105367024739583, + "learning_rate": 0.0001, + "loss": 4.5482, + "loss/crossentropy": 2.44241464138031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505839467048645, + "step": 14754 + }, + { + "epoch": 0.29512, + "grad_norm": 2.1875, + "grad_norm_var": 0.0849029541015625, + "learning_rate": 0.0001, + "loss": 3.782, + "loss/crossentropy": 1.7948896884918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987960934638977, + "step": 14756 + }, + { + "epoch": 0.29516, + "grad_norm": 2.125, + "grad_norm_var": 0.020334625244140626, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 1.710760235786438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18624412268400192, + "step": 14758 + }, + { + "epoch": 0.2952, + "grad_norm": 2.390625, + "grad_norm_var": 0.027205149332682293, + "learning_rate": 0.0001, + "loss": 4.4651, + "loss/crossentropy": 2.0127750635147095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20178033411502838, + "step": 14760 + }, + { + "epoch": 0.29524, + "grad_norm": 2.0, + "grad_norm_var": 0.027042388916015625, + "learning_rate": 0.0001, + "loss": 4.1152, + "loss/crossentropy": 1.9764790534973145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19000716507434845, + "step": 14762 + }, + { + "epoch": 0.29528, + "grad_norm": 2.140625, + "grad_norm_var": 0.020930735270182292, + "learning_rate": 0.0001, + "loss": 3.9634, + "loss/crossentropy": 2.0221771597862244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980113685131073, + "step": 14764 + }, + { + "epoch": 0.29532, + "grad_norm": 2.09375, + "grad_norm_var": 0.019972483317057293, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.278168559074402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231141597032547, + "step": 14766 + }, + { + "epoch": 0.29536, + "grad_norm": 1.953125, + "grad_norm_var": 0.0168853759765625, + "learning_rate": 0.0001, + "loss": 4.1308, + "loss/crossentropy": 2.1837246417999268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026391476392746, + "step": 14768 + }, + { + "epoch": 0.2954, + "grad_norm": 1.890625, + "grad_norm_var": 0.019795735677083332, + "learning_rate": 0.0001, + "loss": 4.0694, + "loss/crossentropy": 2.1170668601989746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871805638074875, + "step": 14770 + }, + { + "epoch": 0.29544, + "grad_norm": 2.125, + "grad_norm_var": 0.01784032185872396, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 1.9905366897583008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951729580760002, + "step": 14772 + }, + { + "epoch": 0.29548, + "grad_norm": 2.03125, + "grad_norm_var": 0.017146809895833334, + "learning_rate": 0.0001, + "loss": 3.9811, + "loss/crossentropy": 1.781678318977356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19464702904224396, + "step": 14774 + }, + { + "epoch": 0.29552, + "grad_norm": 1.875, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.2671462297439575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22269655019044876, + "step": 14776 + }, + { + "epoch": 0.29556, + "grad_norm": 2.046875, + "grad_norm_var": 0.012511952718098959, + "learning_rate": 0.0001, + "loss": 4.2128, + "loss/crossentropy": 2.0294137001037598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939906477928162, + "step": 14778 + }, + { + "epoch": 0.2956, + "grad_norm": 1.9375, + "grad_norm_var": 0.011264801025390625, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 1.9174134731292725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19592813402414322, + "step": 14780 + }, + { + "epoch": 0.29564, + "grad_norm": 2.0, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 3.9877, + "loss/crossentropy": 2.0033875703811646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121451273560524, + "step": 14782 + }, + { + "epoch": 0.29568, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09626439412434896, + "learning_rate": 0.0001, + "loss": 4.0263, + "loss/crossentropy": 2.209542691707611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18373841792345047, + "step": 14784 + }, + { + "epoch": 0.29572, + "grad_norm": 2.015625, + "grad_norm_var": 0.09244155883789062, + "learning_rate": 0.0001, + "loss": 4.149, + "loss/crossentropy": 2.0651984214782715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486845821142197, + "step": 14786 + }, + { + "epoch": 0.29576, + "grad_norm": 2.03125, + "grad_norm_var": 0.09123433430989583, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 2.311215043067932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553718090057373, + "step": 14788 + }, + { + "epoch": 0.2958, + "grad_norm": 2.234375, + "grad_norm_var": 0.09251708984375, + "learning_rate": 0.0001, + "loss": 4.3291, + "loss/crossentropy": 2.0282764434814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748327672481537, + "step": 14790 + }, + { + "epoch": 0.29584, + "grad_norm": 1.9765625, + "grad_norm_var": 0.09575093587239583, + "learning_rate": 0.0001, + "loss": 3.764, + "loss/crossentropy": 2.04198157787323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19687633216381073, + "step": 14792 + }, + { + "epoch": 0.29588, + "grad_norm": 2.125, + "grad_norm_var": 0.09601236979166666, + "learning_rate": 0.0001, + "loss": 4.389, + "loss/crossentropy": 2.049328565597534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265275985002518, + "step": 14794 + }, + { + "epoch": 0.29592, + "grad_norm": 1.984375, + "grad_norm_var": 0.0992876688639323, + "learning_rate": 0.0001, + "loss": 3.9118, + "loss/crossentropy": 1.905173659324646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193840891122818, + "step": 14796 + }, + { + "epoch": 0.29596, + "grad_norm": 2.078125, + "grad_norm_var": 0.09698486328125, + "learning_rate": 0.0001, + "loss": 3.941, + "loss/crossentropy": 1.8715303540229797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632445484399796, + "step": 14798 + }, + { + "epoch": 0.296, + "grad_norm": 1.953125, + "grad_norm_var": 0.013765207926432292, + "learning_rate": 0.0001, + "loss": 4.039, + "loss/crossentropy": 2.0428889989852905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19538581371307373, + "step": 14800 + }, + { + "epoch": 0.29604, + "grad_norm": 2.203125, + "grad_norm_var": 0.015553538004557292, + "learning_rate": 0.0001, + "loss": 4.0834, + "loss/crossentropy": 2.2959046363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372349202632904, + "step": 14802 + }, + { + "epoch": 0.29608, + "grad_norm": 1.96875, + "grad_norm_var": 0.016190592447916666, + "learning_rate": 0.0001, + "loss": 4.0399, + "loss/crossentropy": 2.083451807498932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20915375649929047, + "step": 14804 + }, + { + "epoch": 0.29612, + "grad_norm": 2.03125, + "grad_norm_var": 0.0132232666015625, + "learning_rate": 0.0001, + "loss": 4.293, + "loss/crossentropy": 2.31546950340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648062229156494, + "step": 14806 + }, + { + "epoch": 0.29616, + "grad_norm": 1.96875, + "grad_norm_var": 0.013019816080729166, + "learning_rate": 0.0001, + "loss": 3.7273, + "loss/crossentropy": 1.9833638072013855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19132772833108902, + "step": 14808 + }, + { + "epoch": 0.2962, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013655344645182291, + "learning_rate": 0.0001, + "loss": 3.8247, + "loss/crossentropy": 1.784917414188385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19032159447669983, + "step": 14810 + }, + { + "epoch": 0.29624, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011263020833333333, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 1.843966782093048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22127597779035568, + "step": 14812 + }, + { + "epoch": 0.29628, + "grad_norm": 2.921875, + "grad_norm_var": 0.06729100545247396, + "learning_rate": 0.0001, + "loss": 4.0535, + "loss/crossentropy": 1.8107360005378723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18275444209575653, + "step": 14814 + }, + { + "epoch": 0.29632, + "grad_norm": 2.078125, + "grad_norm_var": 0.06665827433268229, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.147618055343628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23800157755613327, + "step": 14816 + }, + { + "epoch": 0.29636, + "grad_norm": 2.0625, + "grad_norm_var": 0.06475804646809896, + "learning_rate": 0.0001, + "loss": 4.267, + "loss/crossentropy": 2.4430278539657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2401380091905594, + "step": 14818 + }, + { + "epoch": 0.2964, + "grad_norm": 2.078125, + "grad_norm_var": 0.06377665201822917, + "learning_rate": 0.0001, + "loss": 4.3204, + "loss/crossentropy": 2.181501626968384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22491848468780518, + "step": 14820 + }, + { + "epoch": 0.29644, + "grad_norm": 2.046875, + "grad_norm_var": 0.06398111979166667, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 2.101313889026642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598663926124573, + "step": 14822 + }, + { + "epoch": 0.29648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.059020741780598955, + "learning_rate": 0.0001, + "loss": 3.9629, + "loss/crossentropy": 1.6834549307823181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430063664913177, + "step": 14824 + }, + { + "epoch": 0.29652, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0566162109375, + "learning_rate": 0.0001, + "loss": 4.0087, + "loss/crossentropy": 1.8400229215621948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18764454871416092, + "step": 14826 + }, + { + "epoch": 0.29656, + "grad_norm": 2.015625, + "grad_norm_var": 0.05609944661458333, + "learning_rate": 0.0001, + "loss": 4.1427, + "loss/crossentropy": 1.9417667388916016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21304062008857727, + "step": 14828 + }, + { + "epoch": 0.2966, + "grad_norm": 2.046875, + "grad_norm_var": 0.003639475504557292, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.227192521095276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23011230677366257, + "step": 14830 + }, + { + "epoch": 0.29664, + "grad_norm": 2.015625, + "grad_norm_var": 0.0029436747233072915, + "learning_rate": 0.0001, + "loss": 4.3496, + "loss/crossentropy": 2.2136365175247192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21694285422563553, + "step": 14832 + }, + { + "epoch": 0.29668, + "grad_norm": 1.890625, + "grad_norm_var": 0.004137929280598958, + "learning_rate": 0.0001, + "loss": 3.9464, + "loss/crossentropy": 1.9179469347000122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996636986732483, + "step": 14834 + }, + { + "epoch": 0.29672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0043718973795572914, + "learning_rate": 0.0001, + "loss": 3.9001, + "loss/crossentropy": 1.9639176726341248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17738713324069977, + "step": 14836 + }, + { + "epoch": 0.29676, + "grad_norm": 2.0625, + "grad_norm_var": 0.004564412434895833, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 1.7748695611953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18663641810417175, + "step": 14838 + }, + { + "epoch": 0.2968, + "grad_norm": 1.8046875, + "grad_norm_var": 0.007054646809895833, + "learning_rate": 0.0001, + "loss": 3.9879, + "loss/crossentropy": 2.094521999359131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18323734402656555, + "step": 14840 + }, + { + "epoch": 0.29684, + "grad_norm": 2.3125, + "grad_norm_var": 0.01407470703125, + "learning_rate": 0.0001, + "loss": 4.5969, + "loss/crossentropy": 2.488257050514221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23173442482948303, + "step": 14842 + }, + { + "epoch": 0.29688, + "grad_norm": 2.046875, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.2317086458206177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046324908733368, + "step": 14844 + }, + { + "epoch": 0.29692, + "grad_norm": 2.09375, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 3.9917, + "loss/crossentropy": 2.275767207145691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739831775426865, + "step": 14846 + }, + { + "epoch": 0.29696, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014501698811848958, + "learning_rate": 0.0001, + "loss": 3.9728, + "loss/crossentropy": 2.0288257002830505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18860980868339539, + "step": 14848 + }, + { + "epoch": 0.297, + "grad_norm": 2.140625, + "grad_norm_var": 0.015449778238932291, + "learning_rate": 0.0001, + "loss": 4.3509, + "loss/crossentropy": 2.0609896183013916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585624873638153, + "step": 14850 + }, + { + "epoch": 0.29704, + "grad_norm": 1.96875, + "grad_norm_var": 0.018214670817057292, + "learning_rate": 0.0001, + "loss": 3.5485, + "loss/crossentropy": 1.9250158667564392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843906193971634, + "step": 14852 + }, + { + "epoch": 0.29708, + "grad_norm": 1.96875, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 3.9761, + "loss/crossentropy": 2.118358612060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2035902589559555, + "step": 14854 + }, + { + "epoch": 0.29712, + "grad_norm": 1.90625, + "grad_norm_var": 0.015610504150390624, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 1.8381852507591248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859317123889923, + "step": 14856 + }, + { + "epoch": 0.29716, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009513346354166667, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.078063726425171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21035100519657135, + "step": 14858 + }, + { + "epoch": 0.2972, + "grad_norm": 2.078125, + "grad_norm_var": 0.010701497395833334, + "learning_rate": 0.0001, + "loss": 4.0549, + "loss/crossentropy": 2.1546566486358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19998939335346222, + "step": 14860 + }, + { + "epoch": 0.29724, + "grad_norm": 2.046875, + "grad_norm_var": 0.010164133707682292, + "learning_rate": 0.0001, + "loss": 4.2264, + "loss/crossentropy": 2.019156754016876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957200989127159, + "step": 14862 + }, + { + "epoch": 0.29728, + "grad_norm": 2.140625, + "grad_norm_var": 0.013138834635416667, + "learning_rate": 0.0001, + "loss": 4.0808, + "loss/crossentropy": 1.8542814254760742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19331540167331696, + "step": 14864 + }, + { + "epoch": 0.29732, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010895792643229167, + "learning_rate": 0.0001, + "loss": 4.2488, + "loss/crossentropy": 2.241651773452759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22117780148983002, + "step": 14866 + }, + { + "epoch": 0.29736, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 1.8856277465820312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19262754172086716, + "step": 14868 + }, + { + "epoch": 0.2974, + "grad_norm": 2.203125, + "grad_norm_var": 0.013388824462890626, + "learning_rate": 0.0001, + "loss": 4.3315, + "loss/crossentropy": 2.0159581899642944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21892660111188889, + "step": 14870 + }, + { + "epoch": 0.29744, + "grad_norm": 1.828125, + "grad_norm_var": 0.014562733968098958, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 1.8794240355491638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18827088177204132, + "step": 14872 + }, + { + "epoch": 0.29748, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013726552327473959, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.3741602897644043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292410209774971, + "step": 14874 + }, + { + "epoch": 0.29752, + "grad_norm": 1.828125, + "grad_norm_var": 0.014025624593098958, + "learning_rate": 0.0001, + "loss": 3.8625, + "loss/crossentropy": 1.9994327425956726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266867220401764, + "step": 14876 + }, + { + "epoch": 0.29756, + "grad_norm": 2.125, + "grad_norm_var": 0.014143880208333333, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 2.0914021730422974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071390450000763, + "step": 14878 + }, + { + "epoch": 0.2976, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012798817952473958, + "learning_rate": 0.0001, + "loss": 4.0463, + "loss/crossentropy": 1.8055935502052307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19375848025083542, + "step": 14880 + }, + { + "epoch": 0.29764, + "grad_norm": 2.03125, + "grad_norm_var": 0.012896474202473958, + "learning_rate": 0.0001, + "loss": 4.3662, + "loss/crossentropy": 2.614544630050659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22156962752342224, + "step": 14882 + }, + { + "epoch": 0.29768, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013205718994140626, + "learning_rate": 0.0001, + "loss": 3.6502, + "loss/crossentropy": 1.7306513786315918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17159338295459747, + "step": 14884 + }, + { + "epoch": 0.29772, + "grad_norm": 2.046875, + "grad_norm_var": 0.010060373942057292, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.060949981212616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20615212619304657, + "step": 14886 + }, + { + "epoch": 0.29776, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008756510416666667, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 1.8338764309883118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274019241333008, + "step": 14888 + }, + { + "epoch": 0.2978, + "grad_norm": 1.78125, + "grad_norm_var": 0.013683827718098958, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.2182517051696777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22446341067552567, + "step": 14890 + }, + { + "epoch": 0.29784, + "grad_norm": 2.09375, + "grad_norm_var": 0.012951405843098958, + "learning_rate": 0.0001, + "loss": 4.3924, + "loss/crossentropy": 2.222801446914673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22065874934196472, + "step": 14892 + }, + { + "epoch": 0.29788, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012808990478515626, + "learning_rate": 0.0001, + "loss": 4.1315, + "loss/crossentropy": 2.2181414365768433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21953265368938446, + "step": 14894 + }, + { + "epoch": 0.29792, + "grad_norm": 1.984375, + "grad_norm_var": 0.013166300455729167, + "learning_rate": 0.0001, + "loss": 3.654, + "loss/crossentropy": 1.9304096102714539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20096193999052048, + "step": 14896 + }, + { + "epoch": 0.29796, + "grad_norm": 2.0, + "grad_norm_var": 0.012043253580729166, + "learning_rate": 0.0001, + "loss": 4.2074, + "loss/crossentropy": 2.2288341522216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24108773469924927, + "step": 14898 + }, + { + "epoch": 0.298, + "grad_norm": 2.15625, + "grad_norm_var": 0.012550608317057291, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.1711790561676025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2395533323287964, + "step": 14900 + }, + { + "epoch": 0.29804, + "grad_norm": 2.03125, + "grad_norm_var": 0.019482167561848958, + "learning_rate": 0.0001, + "loss": 4.2686, + "loss/crossentropy": 2.0533857345581055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801679581403732, + "step": 14902 + }, + { + "epoch": 0.29808, + "grad_norm": 2.0625, + "grad_norm_var": 0.01883519490559896, + "learning_rate": 0.0001, + "loss": 3.8678, + "loss/crossentropy": 1.634634256362915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17861492931842804, + "step": 14904 + }, + { + "epoch": 0.29812, + "grad_norm": 2.15625, + "grad_norm_var": 0.0149322509765625, + "learning_rate": 0.0001, + "loss": 4.2152, + "loss/crossentropy": 2.3151358366012573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22694342583417892, + "step": 14906 + }, + { + "epoch": 0.29816, + "grad_norm": 2.03125, + "grad_norm_var": 0.01585871378580729, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 1.9165552258491516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18231045454740524, + "step": 14908 + }, + { + "epoch": 0.2982, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01480712890625, + "learning_rate": 0.0001, + "loss": 3.9693, + "loss/crossentropy": 2.0094637274742126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19547566026449203, + "step": 14910 + }, + { + "epoch": 0.29824, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014753214518229167, + "learning_rate": 0.0001, + "loss": 3.9091, + "loss/crossentropy": 1.8764418959617615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18009892851114273, + "step": 14912 + }, + { + "epoch": 0.29828, + "grad_norm": 2.140625, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 4.429, + "loss/crossentropy": 2.1255921721458435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359216630458832, + "step": 14914 + }, + { + "epoch": 0.29832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014891560872395833, + "learning_rate": 0.0001, + "loss": 3.7889, + "loss/crossentropy": 2.1316330432891846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970125362277031, + "step": 14916 + }, + { + "epoch": 0.29836, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0089996337890625, + "learning_rate": 0.0001, + "loss": 4.0305, + "loss/crossentropy": 1.7825700640678406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902567446231842, + "step": 14918 + }, + { + "epoch": 0.2984, + "grad_norm": 2.0, + "grad_norm_var": 0.008565012613932292, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 1.982073962688446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20216460525989532, + "step": 14920 + }, + { + "epoch": 0.29844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 2.3210418224334717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2224876508116722, + "step": 14922 + }, + { + "epoch": 0.29848, + "grad_norm": 2.03125, + "grad_norm_var": 0.005535634358723959, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.7990338206291199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17401983588933945, + "step": 14924 + }, + { + "epoch": 0.29852, + "grad_norm": 2.1875, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.4979, + "loss/crossentropy": 1.8772737979888916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376933157444, + "step": 14926 + }, + { + "epoch": 0.29856, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009716542561848958, + "learning_rate": 0.0001, + "loss": 4.0544, + "loss/crossentropy": 1.6914128065109253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17860179394483566, + "step": 14928 + }, + { + "epoch": 0.2986, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 3.9737, + "loss/crossentropy": 1.9545430541038513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015315517783165, + "step": 14930 + }, + { + "epoch": 0.29864, + "grad_norm": 1.921875, + "grad_norm_var": 0.009175618489583334, + "learning_rate": 0.0001, + "loss": 3.9181, + "loss/crossentropy": 2.0585074424743652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029297649860382, + "step": 14932 + }, + { + "epoch": 0.29868, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009679921468098958, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 1.9196518063545227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970217153429985, + "step": 14934 + }, + { + "epoch": 0.29872, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 4.1366, + "loss/crossentropy": 2.1077409982681274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22366851568222046, + "step": 14936 + }, + { + "epoch": 0.29876, + "grad_norm": 3.921875, + "grad_norm_var": 0.24059829711914063, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.7660444974899292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23386957496404648, + "step": 14938 + }, + { + "epoch": 0.2988, + "grad_norm": 2.21875, + "grad_norm_var": 0.23871841430664062, + "learning_rate": 0.0001, + "loss": 4.2936, + "loss/crossentropy": 1.8436493873596191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19589588046073914, + "step": 14940 + }, + { + "epoch": 0.29884, + "grad_norm": 2.40625, + "grad_norm_var": 0.24443333943684895, + "learning_rate": 0.0001, + "loss": 4.3465, + "loss/crossentropy": 2.2123888731002808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26607823371887207, + "step": 14942 + }, + { + "epoch": 0.29888, + "grad_norm": 2.5625, + "grad_norm_var": 0.2546119689941406, + "learning_rate": 0.0001, + "loss": 4.3234, + "loss/crossentropy": 2.70485258102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2529008686542511, + "step": 14944 + }, + { + "epoch": 0.29892, + "grad_norm": 2.109375, + "grad_norm_var": 0.25155843098958336, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 1.932865023612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22141847014427185, + "step": 14946 + }, + { + "epoch": 0.29896, + "grad_norm": 1.9765625, + "grad_norm_var": 0.25643310546875, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.213751196861267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21382954716682434, + "step": 14948 + }, + { + "epoch": 0.299, + "grad_norm": 1.9609375, + "grad_norm_var": 0.251073964436849, + "learning_rate": 0.0001, + "loss": 3.9673, + "loss/crossentropy": 1.679059088230133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16484958678483963, + "step": 14950 + }, + { + "epoch": 0.29904, + "grad_norm": 1.9453125, + "grad_norm_var": 0.24972508748372396, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 1.748594582080841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20150119066238403, + "step": 14952 + }, + { + "epoch": 0.29908, + "grad_norm": 2.5625, + "grad_norm_var": 0.05215657552083333, + "learning_rate": 0.0001, + "loss": 4.8053, + "loss/crossentropy": 2.4588215351104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2513396218419075, + "step": 14954 + }, + { + "epoch": 0.29912, + "grad_norm": 1.9453125, + "grad_norm_var": 0.05468114217122396, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.1104516983032227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20738063752651215, + "step": 14956 + }, + { + "epoch": 0.29916, + "grad_norm": 2.109375, + "grad_norm_var": 0.04905776977539063, + "learning_rate": 0.0001, + "loss": 4.2351, + "loss/crossentropy": 1.7171857953071594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27423302084207535, + "step": 14958 + }, + { + "epoch": 0.2992, + "grad_norm": 1.890625, + "grad_norm_var": 0.03264058430989583, + "learning_rate": 0.0001, + "loss": 3.7507, + "loss/crossentropy": 1.5607159733772278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1616373285651207, + "step": 14960 + }, + { + "epoch": 0.29924, + "grad_norm": 1.9765625, + "grad_norm_var": 0.033394114176432295, + "learning_rate": 0.0001, + "loss": 4.0128, + "loss/crossentropy": 1.9257365465164185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18031761050224304, + "step": 14962 + }, + { + "epoch": 0.29928, + "grad_norm": 2.0, + "grad_norm_var": 0.031062825520833334, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.876187801361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19531002640724182, + "step": 14964 + }, + { + "epoch": 0.29932, + "grad_norm": 2.140625, + "grad_norm_var": 0.0317047119140625, + "learning_rate": 0.0001, + "loss": 4.2028, + "loss/crossentropy": 1.9325042963027954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19496598094701767, + "step": 14966 + }, + { + "epoch": 0.29936, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03322652180989583, + "learning_rate": 0.0001, + "loss": 4.1293, + "loss/crossentropy": 2.148952007293701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18833627551794052, + "step": 14968 + }, + { + "epoch": 0.2994, + "grad_norm": 2.015625, + "grad_norm_var": 0.008099110921223958, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.3639365434646606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851496398448944, + "step": 14970 + }, + { + "epoch": 0.29944, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008371734619140625, + "learning_rate": 0.0001, + "loss": 3.8026, + "loss/crossentropy": 1.941165030002594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548548012971878, + "step": 14972 + }, + { + "epoch": 0.29948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0082916259765625, + "learning_rate": 0.0001, + "loss": 4.1203, + "loss/crossentropy": 2.030466139316559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922580599784851, + "step": 14974 + }, + { + "epoch": 0.29952, + "grad_norm": 2.0625, + "grad_norm_var": 0.008487955729166666, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.2279865741729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20831073075532913, + "step": 14976 + }, + { + "epoch": 0.29956, + "grad_norm": 1.875, + "grad_norm_var": 0.007047271728515625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.0749244689941406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18032152205705643, + "step": 14978 + }, + { + "epoch": 0.2996, + "grad_norm": 2.0625, + "grad_norm_var": 0.04270731608072917, + "learning_rate": 0.0001, + "loss": 4.199, + "loss/crossentropy": 2.3190104961395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24735668301582336, + "step": 14980 + }, + { + "epoch": 0.29964, + "grad_norm": 1.8203125, + "grad_norm_var": 0.044406890869140625, + "learning_rate": 0.0001, + "loss": 3.9549, + "loss/crossentropy": 1.970013439655304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19323202222585678, + "step": 14982 + }, + { + "epoch": 0.29968, + "grad_norm": 1.9921875, + "grad_norm_var": 0.045169830322265625, + "learning_rate": 0.0001, + "loss": 4.4762, + "loss/crossentropy": 2.052343726158142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909954607486725, + "step": 14984 + }, + { + "epoch": 0.29972, + "grad_norm": 2.046875, + "grad_norm_var": 0.043268839518229164, + "learning_rate": 0.0001, + "loss": 4.1776, + "loss/crossentropy": 2.2586673498153687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22640614211559296, + "step": 14986 + }, + { + "epoch": 0.29976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.042557779947916666, + "learning_rate": 0.0001, + "loss": 3.8465, + "loss/crossentropy": 1.8843002319335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117384403944016, + "step": 14988 + }, + { + "epoch": 0.2998, + "grad_norm": 1.9453125, + "grad_norm_var": 0.042740885416666666, + "learning_rate": 0.0001, + "loss": 3.7343, + "loss/crossentropy": 1.542019009590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18295453488826752, + "step": 14990 + }, + { + "epoch": 0.29984, + "grad_norm": 1.9609375, + "grad_norm_var": 0.043369293212890625, + "learning_rate": 0.0001, + "loss": 4.1335, + "loss/crossentropy": 2.160573959350586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22916415333747864, + "step": 14992 + }, + { + "epoch": 0.29988, + "grad_norm": 2.609375, + "grad_norm_var": 0.062168121337890625, + "learning_rate": 0.0001, + "loss": 4.1898, + "loss/crossentropy": 2.1296870708465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20498204976320267, + "step": 14994 + }, + { + "epoch": 0.29992, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0336669921875, + "learning_rate": 0.0001, + "loss": 4.0554, + "loss/crossentropy": 2.0356279015541077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798957884311676, + "step": 14996 + }, + { + "epoch": 0.29996, + "grad_norm": 2.015625, + "grad_norm_var": 0.035796864827473955, + "learning_rate": 0.0001, + "loss": 4.6195, + "loss/crossentropy": 2.466023027896881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22107623517513275, + "step": 14998 + }, + { + "epoch": 0.3, + "grad_norm": 2.046875, + "grad_norm_var": 0.03408203125, + "learning_rate": 0.0001, + "loss": 4.1455, + "loss/crossentropy": 1.8457531332969666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21306351572275162, + "step": 15000 + }, + { + "epoch": 0.30004, + "grad_norm": 2.078125, + "grad_norm_var": 0.034211222330729166, + "learning_rate": 0.0001, + "loss": 4.5128, + "loss/crossentropy": 2.202435612678528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542339771986008, + "step": 15002 + }, + { + "epoch": 0.30008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.032613118489583336, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 2.142255425453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21016352623701096, + "step": 15004 + }, + { + "epoch": 0.30012, + "grad_norm": 2.03125, + "grad_norm_var": 0.030631256103515626, + "learning_rate": 0.0001, + "loss": 4.1443, + "loss/crossentropy": 1.949910044670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256845861673355, + "step": 15006 + }, + { + "epoch": 0.30016, + "grad_norm": 2.078125, + "grad_norm_var": 0.029750315348307292, + "learning_rate": 0.0001, + "loss": 4.34, + "loss/crossentropy": 2.257867217063904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22471557557582855, + "step": 15008 + }, + { + "epoch": 0.3002, + "grad_norm": 2.140625, + "grad_norm_var": 0.011358388264973958, + "learning_rate": 0.0001, + "loss": 4.3457, + "loss/crossentropy": 2.1541898250579834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23054643720388412, + "step": 15010 + }, + { + "epoch": 0.30024, + "grad_norm": 2.296875, + "grad_norm_var": 0.01165771484375, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 2.0029123425483704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21815954893827438, + "step": 15012 + }, + { + "epoch": 0.30028, + "grad_norm": 2.125, + "grad_norm_var": 0.0076416015625, + "learning_rate": 0.0001, + "loss": 4.0093, + "loss/crossentropy": 2.0617172718048096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382908195257187, + "step": 15014 + }, + { + "epoch": 0.30032, + "grad_norm": 1.953125, + "grad_norm_var": 0.00921630859375, + "learning_rate": 0.0001, + "loss": 4.3123, + "loss/crossentropy": 2.1493303775787354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762489527463913, + "step": 15016 + }, + { + "epoch": 0.30036, + "grad_norm": 2.0625, + "grad_norm_var": 0.0093505859375, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 2.081854462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22617685049772263, + "step": 15018 + }, + { + "epoch": 0.3004, + "grad_norm": 1.953125, + "grad_norm_var": 0.009842681884765624, + "learning_rate": 0.0001, + "loss": 4.0352, + "loss/crossentropy": 1.8565402626991272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223749846220016, + "step": 15020 + }, + { + "epoch": 0.30044, + "grad_norm": 1.96875, + "grad_norm_var": 0.010815175374348958, + "learning_rate": 0.0001, + "loss": 4.1779, + "loss/crossentropy": 2.129835605621338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21263620257377625, + "step": 15022 + }, + { + "epoch": 0.30048, + "grad_norm": 2.15625, + "grad_norm_var": 0.011189524332682292, + "learning_rate": 0.0001, + "loss": 4.5131, + "loss/crossentropy": 2.113425612449646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22930295765399933, + "step": 15024 + }, + { + "epoch": 0.30052, + "grad_norm": 2.0, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.1557, + "loss/crossentropy": 2.254178762435913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644078195095062, + "step": 15026 + }, + { + "epoch": 0.30056, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011966959635416666, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 2.233784854412079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22132302820682526, + "step": 15028 + }, + { + "epoch": 0.3006, + "grad_norm": 1.953125, + "grad_norm_var": 0.0118804931640625, + "learning_rate": 0.0001, + "loss": 3.8657, + "loss/crossentropy": 2.2234549522399902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144894078373909, + "step": 15030 + }, + { + "epoch": 0.30064, + "grad_norm": 2.046875, + "grad_norm_var": 0.0108306884765625, + "learning_rate": 0.0001, + "loss": 4.1315, + "loss/crossentropy": 2.0401915907859802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20804419368505478, + "step": 15032 + }, + { + "epoch": 0.30068, + "grad_norm": 1.953125, + "grad_norm_var": 0.010978190104166667, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.058899462223053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222386747598648, + "step": 15034 + }, + { + "epoch": 0.30072, + "grad_norm": 2.421875, + "grad_norm_var": 0.020694986979166666, + "learning_rate": 0.0001, + "loss": 4.5452, + "loss/crossentropy": 2.396425485610962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24734684079885483, + "step": 15036 + }, + { + "epoch": 0.30076, + "grad_norm": 2.03125, + "grad_norm_var": 0.02011693318684896, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 1.9825797080993652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20504453778266907, + "step": 15038 + }, + { + "epoch": 0.3008, + "grad_norm": 2.015625, + "grad_norm_var": 0.02068049112955729, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.9550088047981262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051691859960556, + "step": 15040 + }, + { + "epoch": 0.30084, + "grad_norm": 2.015625, + "grad_norm_var": 0.02102839152018229, + "learning_rate": 0.0001, + "loss": 4.1926, + "loss/crossentropy": 2.166532874107361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486898720264435, + "step": 15042 + }, + { + "epoch": 0.30088, + "grad_norm": 1.921875, + "grad_norm_var": 0.0158355712890625, + "learning_rate": 0.0001, + "loss": 3.9905, + "loss/crossentropy": 1.8752986192703247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20286446809768677, + "step": 15044 + }, + { + "epoch": 0.30092, + "grad_norm": 2.125, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.3217, + "loss/crossentropy": 1.9599428176879883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061413824558258, + "step": 15046 + }, + { + "epoch": 0.30096, + "grad_norm": 2.015625, + "grad_norm_var": 0.015428670247395833, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.5601563453674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123739868402481, + "step": 15048 + }, + { + "epoch": 0.301, + "grad_norm": 2.0, + "grad_norm_var": 0.015062459309895833, + "learning_rate": 0.0001, + "loss": 4.2784, + "loss/crossentropy": 2.075629949569702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791151374578476, + "step": 15050 + }, + { + "epoch": 0.30104, + "grad_norm": 2.171875, + "grad_norm_var": 0.008532460530598958, + "learning_rate": 0.0001, + "loss": 4.1578, + "loss/crossentropy": 1.8346505165100098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16620153188705444, + "step": 15052 + }, + { + "epoch": 0.30108, + "grad_norm": 2.109375, + "grad_norm_var": 0.009382120768229167, + "learning_rate": 0.0001, + "loss": 4.003, + "loss/crossentropy": 1.9715532660484314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20792889595031738, + "step": 15054 + }, + { + "epoch": 0.30112, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.135376811027527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218796044588089, + "step": 15056 + }, + { + "epoch": 0.30116, + "grad_norm": 2.171875, + "grad_norm_var": 0.00941162109375, + "learning_rate": 0.0001, + "loss": 4.4339, + "loss/crossentropy": 2.095982074737549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120692878961563, + "step": 15058 + }, + { + "epoch": 0.3012, + "grad_norm": 2.203125, + "grad_norm_var": 0.00911865234375, + "learning_rate": 0.0001, + "loss": 4.3395, + "loss/crossentropy": 2.2836159467697144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23748520761728287, + "step": 15060 + }, + { + "epoch": 0.30124, + "grad_norm": 2.03125, + "grad_norm_var": 0.010163370768229167, + "learning_rate": 0.0001, + "loss": 3.8877, + "loss/crossentropy": 2.073283314704895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927848130464554, + "step": 15062 + }, + { + "epoch": 0.30128, + "grad_norm": 2.015625, + "grad_norm_var": 0.010196940104166666, + "learning_rate": 0.0001, + "loss": 4.1908, + "loss/crossentropy": 2.2401102781295776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22298122942447662, + "step": 15064 + }, + { + "epoch": 0.30132, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012650299072265624, + "learning_rate": 0.0001, + "loss": 4.1878, + "loss/crossentropy": 2.063979387283325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714537262916565, + "step": 15066 + }, + { + "epoch": 0.30136, + "grad_norm": 2.0, + "grad_norm_var": 0.012894694010416667, + "learning_rate": 0.0001, + "loss": 3.6407, + "loss/crossentropy": 2.0346400141716003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405266851186752, + "step": 15068 + }, + { + "epoch": 0.3014, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013437652587890625, + "learning_rate": 0.0001, + "loss": 3.8641, + "loss/crossentropy": 2.1211341619491577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19659722596406937, + "step": 15070 + }, + { + "epoch": 0.30144, + "grad_norm": 3.484375, + "grad_norm_var": 0.1490966796875, + "learning_rate": 0.0001, + "loss": 3.9967, + "loss/crossentropy": 1.937787652015686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18829380720853806, + "step": 15072 + }, + { + "epoch": 0.30148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.1507219950358073, + "learning_rate": 0.0001, + "loss": 3.922, + "loss/crossentropy": 1.9047453999519348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18311911821365356, + "step": 15074 + }, + { + "epoch": 0.30152, + "grad_norm": 2.171875, + "grad_norm_var": 0.15110651652018228, + "learning_rate": 0.0001, + "loss": 4.3423, + "loss/crossentropy": 2.1409407258033752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21341674029827118, + "step": 15076 + }, + { + "epoch": 0.30156, + "grad_norm": 2.265625, + "grad_norm_var": 0.15178197224934895, + "learning_rate": 0.0001, + "loss": 4.2815, + "loss/crossentropy": 2.188783288002014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2406207174062729, + "step": 15078 + }, + { + "epoch": 0.3016, + "grad_norm": 2.125, + "grad_norm_var": 0.15162938435872395, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 2.088913679122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055455967783928, + "step": 15080 + }, + { + "epoch": 0.30164, + "grad_norm": 1.9921875, + "grad_norm_var": 0.15350316365559896, + "learning_rate": 0.0001, + "loss": 3.9039, + "loss/crossentropy": 1.9477434754371643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234874844551086, + "step": 15082 + }, + { + "epoch": 0.30168, + "grad_norm": 1.9375, + "grad_norm_var": 0.1490069071451823, + "learning_rate": 0.0001, + "loss": 4.2653, + "loss/crossentropy": 2.2611928582191467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250279188156128, + "step": 15084 + }, + { + "epoch": 0.30172, + "grad_norm": 2.109375, + "grad_norm_var": 0.14735514322916668, + "learning_rate": 0.0001, + "loss": 4.043, + "loss/crossentropy": 2.340996742248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201436087489128, + "step": 15086 + }, + { + "epoch": 0.30176, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01718724568684896, + "learning_rate": 0.0001, + "loss": 3.9232, + "loss/crossentropy": 2.045587956905365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049652263522148, + "step": 15088 + }, + { + "epoch": 0.3018, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015265909830729167, + "learning_rate": 0.0001, + "loss": 3.9266, + "loss/crossentropy": 1.6079826951026917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1584620103240013, + "step": 15090 + }, + { + "epoch": 0.30184, + "grad_norm": 2.046875, + "grad_norm_var": 0.013494618733723958, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 2.122701048851013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095094472169876, + "step": 15092 + }, + { + "epoch": 0.30188, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008243560791015625, + "learning_rate": 0.0001, + "loss": 3.8602, + "loss/crossentropy": 1.8888922929763794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282665848731995, + "step": 15094 + }, + { + "epoch": 0.30192, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063873291015625, + "learning_rate": 0.0001, + "loss": 4.1279, + "loss/crossentropy": 2.121204972267151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894393339753151, + "step": 15096 + }, + { + "epoch": 0.30196, + "grad_norm": 1.890625, + "grad_norm_var": 0.005991363525390625, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 1.836807131767273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851387470960617, + "step": 15098 + }, + { + "epoch": 0.302, + "grad_norm": 1.96875, + "grad_norm_var": 0.005246734619140625, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.170082688331604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964065939188004, + "step": 15100 + }, + { + "epoch": 0.30204, + "grad_norm": 2.015625, + "grad_norm_var": 0.0037595113118489582, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 2.128955125808716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20737067610025406, + "step": 15102 + }, + { + "epoch": 0.30208, + "grad_norm": 1.890625, + "grad_norm_var": 0.0062744140625, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 1.970711886882782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19101983308792114, + "step": 15104 + }, + { + "epoch": 0.30212, + "grad_norm": 2.40625, + "grad_norm_var": 0.018040974934895832, + "learning_rate": 0.0001, + "loss": 4.5089, + "loss/crossentropy": 2.105665922164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23741377890110016, + "step": 15106 + }, + { + "epoch": 0.30216, + "grad_norm": 2.109375, + "grad_norm_var": 0.01962865193684896, + "learning_rate": 0.0001, + "loss": 4.3766, + "loss/crossentropy": 2.274402379989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20633316040039062, + "step": 15108 + }, + { + "epoch": 0.3022, + "grad_norm": 2.046875, + "grad_norm_var": 0.017996978759765626, + "learning_rate": 0.0001, + "loss": 4.0393, + "loss/crossentropy": 2.134031891822815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932414382696152, + "step": 15110 + }, + { + "epoch": 0.30224, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01930720011393229, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 2.015448212623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18043135851621628, + "step": 15112 + }, + { + "epoch": 0.30228, + "grad_norm": 2.109375, + "grad_norm_var": 0.018232981363932293, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.3585838079452515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323882058262825, + "step": 15114 + }, + { + "epoch": 0.30232, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018607584635416667, + "learning_rate": 0.0001, + "loss": 3.8586, + "loss/crossentropy": 1.790964961051941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18150723725557327, + "step": 15116 + }, + { + "epoch": 0.30236, + "grad_norm": 2.109375, + "grad_norm_var": 0.018802642822265625, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 2.0206486582756042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102368101477623, + "step": 15118 + }, + { + "epoch": 0.3024, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.1773, + "loss/crossentropy": 2.21474289894104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742661088705063, + "step": 15120 + }, + { + "epoch": 0.30244, + "grad_norm": 2.1875, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 3.9536, + "loss/crossentropy": 2.0452207922935486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045353680849075, + "step": 15122 + }, + { + "epoch": 0.30248, + "grad_norm": 1.828125, + "grad_norm_var": 0.0105865478515625, + "learning_rate": 0.0001, + "loss": 3.7914, + "loss/crossentropy": 2.019612729549408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032184973359108, + "step": 15124 + }, + { + "epoch": 0.30252, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010221099853515625, + "learning_rate": 0.0001, + "loss": 4.0303, + "loss/crossentropy": 2.1821396350860596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22832991182804108, + "step": 15126 + }, + { + "epoch": 0.30256, + "grad_norm": 1.875, + "grad_norm_var": 0.010088857014973958, + "learning_rate": 0.0001, + "loss": 3.7838, + "loss/crossentropy": 2.0395348072052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18255837261676788, + "step": 15128 + }, + { + "epoch": 0.3026, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009081013997395833, + "learning_rate": 0.0001, + "loss": 4.177, + "loss/crossentropy": 2.1724050045013428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19672775268554688, + "step": 15130 + }, + { + "epoch": 0.30264, + "grad_norm": 1.96875, + "grad_norm_var": 0.008470662434895833, + "learning_rate": 0.0001, + "loss": 4.2438, + "loss/crossentropy": 2.107967436313629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19564508646726608, + "step": 15132 + }, + { + "epoch": 0.30268, + "grad_norm": 1.953125, + "grad_norm_var": 0.007355753580729167, + "learning_rate": 0.0001, + "loss": 3.8674, + "loss/crossentropy": 1.7179370522499084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17135357856750488, + "step": 15134 + }, + { + "epoch": 0.30272, + "grad_norm": 2.078125, + "grad_norm_var": 0.008119455973307292, + "learning_rate": 0.0001, + "loss": 4.104, + "loss/crossentropy": 1.9004579186439514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902431219816208, + "step": 15136 + }, + { + "epoch": 0.30276, + "grad_norm": 2.0625, + "grad_norm_var": 0.0052487691243489586, + "learning_rate": 0.0001, + "loss": 4.013, + "loss/crossentropy": 1.9554911255836487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19999338686466217, + "step": 15138 + }, + { + "epoch": 0.3028, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004793294270833333, + "learning_rate": 0.0001, + "loss": 4.1051, + "loss/crossentropy": 1.8687474131584167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19250088185071945, + "step": 15140 + }, + { + "epoch": 0.30284, + "grad_norm": 2.0625, + "grad_norm_var": 0.004929351806640625, + "learning_rate": 0.0001, + "loss": 4.1217, + "loss/crossentropy": 2.1360538005828857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22316065430641174, + "step": 15142 + }, + { + "epoch": 0.30288, + "grad_norm": 2.125, + "grad_norm_var": 0.006776682535807292, + "learning_rate": 0.0001, + "loss": 3.7681, + "loss/crossentropy": 1.6100040078163147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16926150023937225, + "step": 15144 + }, + { + "epoch": 0.30292, + "grad_norm": 2.09375, + "grad_norm_var": 0.0070302327473958336, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 1.983083188533783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19075944274663925, + "step": 15146 + }, + { + "epoch": 0.30296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009110514322916667, + "learning_rate": 0.0001, + "loss": 3.8722, + "loss/crossentropy": 1.9027757048606873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069152221083641, + "step": 15148 + }, + { + "epoch": 0.303, + "grad_norm": 1.953125, + "grad_norm_var": 0.009144846598307292, + "learning_rate": 0.0001, + "loss": 3.996, + "loss/crossentropy": 1.9141475558280945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19106873124837875, + "step": 15150 + }, + { + "epoch": 0.30304, + "grad_norm": 2.109375, + "grad_norm_var": 0.009821573893229166, + "learning_rate": 0.0001, + "loss": 4.0872, + "loss/crossentropy": 1.924220085144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19870451837778091, + "step": 15152 + }, + { + "epoch": 0.30308, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009169260660807291, + "learning_rate": 0.0001, + "loss": 4.1611, + "loss/crossentropy": 2.18874990940094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040283977985382, + "step": 15154 + }, + { + "epoch": 0.30312, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0096588134765625, + "learning_rate": 0.0001, + "loss": 3.6905, + "loss/crossentropy": 2.0296109914779663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20384181290864944, + "step": 15156 + }, + { + "epoch": 0.30316, + "grad_norm": 2.03125, + "grad_norm_var": 0.0091949462890625, + "learning_rate": 0.0001, + "loss": 4.2475, + "loss/crossentropy": 2.1908326148986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775151252746582, + "step": 15158 + }, + { + "epoch": 0.3032, + "grad_norm": 2.0625, + "grad_norm_var": 0.007523345947265625, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.1348751187324524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342730522155762, + "step": 15160 + }, + { + "epoch": 0.30324, + "grad_norm": 2.09375, + "grad_norm_var": 0.0075103759765625, + "learning_rate": 0.0001, + "loss": 3.9642, + "loss/crossentropy": 2.03944593667984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934272050857544, + "step": 15162 + }, + { + "epoch": 0.30328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005475870768229167, + "learning_rate": 0.0001, + "loss": 3.9692, + "loss/crossentropy": 1.8574647307395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847086101770401, + "step": 15164 + }, + { + "epoch": 0.30332, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005671183268229167, + "learning_rate": 0.0001, + "loss": 4.3025, + "loss/crossentropy": 2.219611406326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982950195670128, + "step": 15166 + }, + { + "epoch": 0.30336, + "grad_norm": 2.078125, + "grad_norm_var": 0.005159250895182292, + "learning_rate": 0.0001, + "loss": 4.1903, + "loss/crossentropy": 2.024726688861847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.217759907245636, + "step": 15168 + }, + { + "epoch": 0.3034, + "grad_norm": 2.046875, + "grad_norm_var": 0.005353800455729167, + "learning_rate": 0.0001, + "loss": 4.2273, + "loss/crossentropy": 2.196849226951599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21290963143110275, + "step": 15170 + }, + { + "epoch": 0.30344, + "grad_norm": 1.7421875, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 4.0096, + "loss/crossentropy": 1.9023171067237854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19199151545763016, + "step": 15172 + }, + { + "epoch": 0.30348, + "grad_norm": 2.03125, + "grad_norm_var": 0.008644358317057291, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.1286264657974243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20934996753931046, + "step": 15174 + }, + { + "epoch": 0.30352, + "grad_norm": 2.046875, + "grad_norm_var": 0.008007558186848958, + "learning_rate": 0.0001, + "loss": 4.2621, + "loss/crossentropy": 2.2262455224990845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091647908091545, + "step": 15176 + }, + { + "epoch": 0.30356, + "grad_norm": 2.140625, + "grad_norm_var": 0.04183349609375, + "learning_rate": 0.0001, + "loss": 4.2439, + "loss/crossentropy": 2.4126710891723633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22972645610570908, + "step": 15178 + }, + { + "epoch": 0.3036, + "grad_norm": 2.078125, + "grad_norm_var": 0.042569732666015624, + "learning_rate": 0.0001, + "loss": 4.4268, + "loss/crossentropy": 2.4708153009414673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23798953741788864, + "step": 15180 + }, + { + "epoch": 0.30364, + "grad_norm": 2.109375, + "grad_norm_var": 0.04053929646809896, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 1.9924429655075073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19149084389209747, + "step": 15182 + }, + { + "epoch": 0.30368, + "grad_norm": 1.953125, + "grad_norm_var": 0.04182103474934896, + "learning_rate": 0.0001, + "loss": 4.1335, + "loss/crossentropy": 1.8018346428871155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883997619152069, + "step": 15184 + }, + { + "epoch": 0.30372, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04639867146809896, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 2.001536011695862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008758783340454, + "step": 15186 + }, + { + "epoch": 0.30376, + "grad_norm": 2.1875, + "grad_norm_var": 0.06099828084309896, + "learning_rate": 0.0001, + "loss": 4.2308, + "loss/crossentropy": 1.9239726066589355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068573236465454, + "step": 15188 + }, + { + "epoch": 0.3038, + "grad_norm": 2.296875, + "grad_norm_var": 0.059845987955729166, + "learning_rate": 0.0001, + "loss": 4.303, + "loss/crossentropy": 2.252953827381134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22697407007217407, + "step": 15190 + }, + { + "epoch": 0.30384, + "grad_norm": 1.921875, + "grad_norm_var": 0.0631011962890625, + "learning_rate": 0.0001, + "loss": 4.174, + "loss/crossentropy": 2.135131001472473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088206186890602, + "step": 15192 + }, + { + "epoch": 0.30388, + "grad_norm": 2.09375, + "grad_norm_var": 0.0391754150390625, + "learning_rate": 0.0001, + "loss": 4.22, + "loss/crossentropy": 2.394876003265381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22017298638820648, + "step": 15194 + }, + { + "epoch": 0.30392, + "grad_norm": 2.078125, + "grad_norm_var": 0.03862711588541667, + "learning_rate": 0.0001, + "loss": 4.3951, + "loss/crossentropy": 2.3517301082611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21459681540727615, + "step": 15196 + }, + { + "epoch": 0.30396, + "grad_norm": 2.015625, + "grad_norm_var": 0.038914998372395836, + "learning_rate": 0.0001, + "loss": 4.3068, + "loss/crossentropy": 1.956869900226593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19348695874214172, + "step": 15198 + }, + { + "epoch": 0.304, + "grad_norm": 1.9921875, + "grad_norm_var": 0.045660146077473956, + "learning_rate": 0.0001, + "loss": 3.5481, + "loss/crossentropy": 1.8349076509475708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203926622867584, + "step": 15200 + }, + { + "epoch": 0.30404, + "grad_norm": 2.140625, + "grad_norm_var": 0.041265614827473956, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 2.253283977508545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24825416505336761, + "step": 15202 + }, + { + "epoch": 0.30408, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0156646728515625, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 1.7219101190567017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302495777606964, + "step": 15204 + }, + { + "epoch": 0.30412, + "grad_norm": 2.046875, + "grad_norm_var": 0.010309855143229166, + "learning_rate": 0.0001, + "loss": 4.4398, + "loss/crossentropy": 2.2931004762649536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21909544616937637, + "step": 15206 + }, + { + "epoch": 0.30416, + "grad_norm": 2.125, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 1.9331820011138916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565977901220322, + "step": 15208 + }, + { + "epoch": 0.3042, + "grad_norm": 2.015625, + "grad_norm_var": 0.008568318684895833, + "learning_rate": 0.0001, + "loss": 4.0936, + "loss/crossentropy": 1.9863171577453613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089460909366608, + "step": 15210 + }, + { + "epoch": 0.30424, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010945383707682292, + "learning_rate": 0.0001, + "loss": 4.3762, + "loss/crossentropy": 2.326872229576111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695248246192932, + "step": 15212 + }, + { + "epoch": 0.30428, + "grad_norm": 2.171875, + "grad_norm_var": 0.012679036458333333, + "learning_rate": 0.0001, + "loss": 4.1684, + "loss/crossentropy": 2.0216678380966187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19882218539714813, + "step": 15214 + }, + { + "epoch": 0.30432, + "grad_norm": 2.171875, + "grad_norm_var": 0.0074503580729166664, + "learning_rate": 0.0001, + "loss": 4.3682, + "loss/crossentropy": 2.170402765274048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22156177461147308, + "step": 15216 + }, + { + "epoch": 0.30436, + "grad_norm": 2.03125, + "grad_norm_var": 0.007608795166015625, + "learning_rate": 0.0001, + "loss": 4.1655, + "loss/crossentropy": 1.9652912616729736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658399909734726, + "step": 15218 + }, + { + "epoch": 0.3044, + "grad_norm": 1.90625, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 2.4518260955810547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20796608924865723, + "step": 15220 + }, + { + "epoch": 0.30444, + "grad_norm": 1.984375, + "grad_norm_var": 0.009952545166015625, + "learning_rate": 0.0001, + "loss": 4.0179, + "loss/crossentropy": 1.9519163370132446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19269376248121262, + "step": 15222 + }, + { + "epoch": 0.30448, + "grad_norm": 1.9375, + "grad_norm_var": 0.010758209228515624, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.1493905782699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713661074638367, + "step": 15224 + }, + { + "epoch": 0.30452, + "grad_norm": 2.5, + "grad_norm_var": 0.026878865559895833, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 2.2856662273406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23240232467651367, + "step": 15226 + }, + { + "epoch": 0.30456, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0249176025390625, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.3631181716918945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22731658816337585, + "step": 15228 + }, + { + "epoch": 0.3046, + "grad_norm": 2.015625, + "grad_norm_var": 0.027469635009765625, + "learning_rate": 0.0001, + "loss": 4.0952, + "loss/crossentropy": 2.1270273327827454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736727863550186, + "step": 15230 + }, + { + "epoch": 0.30464, + "grad_norm": 2.09375, + "grad_norm_var": 0.029320271809895833, + "learning_rate": 0.0001, + "loss": 4.4801, + "loss/crossentropy": 2.1448079347610474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20350141823291779, + "step": 15232 + }, + { + "epoch": 0.30468, + "grad_norm": 2.71875, + "grad_norm_var": 0.059576161702473956, + "learning_rate": 0.0001, + "loss": 4.295, + "loss/crossentropy": 2.0582846999168396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19898280501365662, + "step": 15234 + }, + { + "epoch": 0.30472, + "grad_norm": 2.078125, + "grad_norm_var": 0.0597076416015625, + "learning_rate": 0.0001, + "loss": 4.0946, + "loss/crossentropy": 2.2282591462135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22299205511808395, + "step": 15236 + }, + { + "epoch": 0.30476, + "grad_norm": 4.34375, + "grad_norm_var": 0.3757484436035156, + "learning_rate": 0.0001, + "loss": 4.4031, + "loss/crossentropy": 1.902436077594757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19752992689609528, + "step": 15238 + }, + { + "epoch": 0.3048, + "grad_norm": 2.09375, + "grad_norm_var": 0.3626177469889323, + "learning_rate": 0.0001, + "loss": 4.5476, + "loss/crossentropy": 2.5741195678710938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25522060692310333, + "step": 15240 + }, + { + "epoch": 0.30484, + "grad_norm": 2.078125, + "grad_norm_var": 0.3576942443847656, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 2.261489987373352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2379501387476921, + "step": 15242 + }, + { + "epoch": 0.30488, + "grad_norm": 1.921875, + "grad_norm_var": 0.3581451416015625, + "learning_rate": 0.0001, + "loss": 4.1728, + "loss/crossentropy": 2.3075523376464844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830804646015167, + "step": 15244 + }, + { + "epoch": 0.30492, + "grad_norm": 1.9375, + "grad_norm_var": 0.36470133463541665, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.096464157104492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073582485318184, + "step": 15246 + }, + { + "epoch": 0.30496, + "grad_norm": 2.078125, + "grad_norm_var": 0.41076558430989585, + "learning_rate": 0.0001, + "loss": 4.0121, + "loss/crossentropy": 1.9624161124229431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19424450397491455, + "step": 15248 + }, + { + "epoch": 0.305, + "grad_norm": 2.0625, + "grad_norm_var": 0.38983154296875, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.0838447213172913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22090255469083786, + "step": 15250 + }, + { + "epoch": 0.30504, + "grad_norm": 2.171875, + "grad_norm_var": 0.379644521077474, + "learning_rate": 0.0001, + "loss": 4.4244, + "loss/crossentropy": 2.349258542060852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20451825857162476, + "step": 15252 + }, + { + "epoch": 0.30508, + "grad_norm": 2.09375, + "grad_norm_var": 0.06938451131184896, + "learning_rate": 0.0001, + "loss": 4.328, + "loss/crossentropy": 2.3996351957321167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24550051987171173, + "step": 15254 + }, + { + "epoch": 0.30512, + "grad_norm": 1.859375, + "grad_norm_var": 0.07430191040039062, + "learning_rate": 0.0001, + "loss": 4.0098, + "loss/crossentropy": 2.325650215148926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2189955934882164, + "step": 15256 + }, + { + "epoch": 0.30516, + "grad_norm": 2.015625, + "grad_norm_var": 0.07815348307291667, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 1.8568453788757324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18639766424894333, + "step": 15258 + }, + { + "epoch": 0.3052, + "grad_norm": 1.875, + "grad_norm_var": 0.08263346354166666, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.8268811106681824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18669788539409637, + "step": 15260 + }, + { + "epoch": 0.30524, + "grad_norm": 2.125, + "grad_norm_var": 0.08479715983072916, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 2.0555724501609802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101765051484108, + "step": 15262 + }, + { + "epoch": 0.30528, + "grad_norm": 1.9296875, + "grad_norm_var": 0.018070475260416666, + "learning_rate": 0.0001, + "loss": 4.1075, + "loss/crossentropy": 2.080612599849701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18955913186073303, + "step": 15264 + }, + { + "epoch": 0.30532, + "grad_norm": 1.890625, + "grad_norm_var": 0.0137939453125, + "learning_rate": 0.0001, + "loss": 4.2678, + "loss/crossentropy": 2.352793037891388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21857291460037231, + "step": 15266 + }, + { + "epoch": 0.30536, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010434722900390625, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.0258968472480774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18990560621023178, + "step": 15268 + }, + { + "epoch": 0.3054, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010871378580729167, + "learning_rate": 0.0001, + "loss": 4.3126, + "loss/crossentropy": 2.228433310985565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862557739019394, + "step": 15270 + }, + { + "epoch": 0.30544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012330881754557292, + "learning_rate": 0.0001, + "loss": 3.7731, + "loss/crossentropy": 1.9475300312042236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20644760876893997, + "step": 15272 + }, + { + "epoch": 0.30548, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012166341145833334, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.3502081632614136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22096982598304749, + "step": 15274 + }, + { + "epoch": 0.30552, + "grad_norm": 2.09375, + "grad_norm_var": 0.012064615885416666, + "learning_rate": 0.0001, + "loss": 4.294, + "loss/crossentropy": 2.2581464052200317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21158118546009064, + "step": 15276 + }, + { + "epoch": 0.30556, + "grad_norm": 2.171875, + "grad_norm_var": 0.014054361979166667, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 1.9583097696304321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19379162788391113, + "step": 15278 + }, + { + "epoch": 0.3056, + "grad_norm": 2.109375, + "grad_norm_var": 0.016527303059895835, + "learning_rate": 0.0001, + "loss": 4.4464, + "loss/crossentropy": 1.988980233669281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19495020806789398, + "step": 15280 + }, + { + "epoch": 0.30564, + "grad_norm": 1.984375, + "grad_norm_var": 0.017350260416666666, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 2.1816195249557495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196531891822815, + "step": 15282 + }, + { + "epoch": 0.30568, + "grad_norm": 1.9375, + "grad_norm_var": 0.015264638264973958, + "learning_rate": 0.0001, + "loss": 3.7994, + "loss/crossentropy": 2.180557370185852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106764316558838, + "step": 15284 + }, + { + "epoch": 0.30572, + "grad_norm": 2.046875, + "grad_norm_var": 0.011027018229166666, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 1.901601493358612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18202877044677734, + "step": 15286 + }, + { + "epoch": 0.30576, + "grad_norm": 2.0625, + "grad_norm_var": 0.019301096598307293, + "learning_rate": 0.0001, + "loss": 4.1989, + "loss/crossentropy": 2.259449601173401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23166191577911377, + "step": 15288 + }, + { + "epoch": 0.3058, + "grad_norm": 1.90625, + "grad_norm_var": 0.020514933268229167, + "learning_rate": 0.0001, + "loss": 3.9866, + "loss/crossentropy": 1.786646544933319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19156906753778458, + "step": 15290 + }, + { + "epoch": 0.30584, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02205174763997396, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.8522255420684814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828666776418686, + "step": 15292 + }, + { + "epoch": 0.30588, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02127685546875, + "learning_rate": 0.0001, + "loss": 4.2543, + "loss/crossentropy": 2.195667266845703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21043212711811066, + "step": 15294 + }, + { + "epoch": 0.30592, + "grad_norm": 2.0, + "grad_norm_var": 0.020918528238932293, + "learning_rate": 0.0001, + "loss": 3.8389, + "loss/crossentropy": 1.8748087882995605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358053594827652, + "step": 15296 + }, + { + "epoch": 0.30596, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017536417643229166, + "learning_rate": 0.0001, + "loss": 4.0789, + "loss/crossentropy": 2.2658244371414185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206317037343979, + "step": 15298 + }, + { + "epoch": 0.306, + "grad_norm": 2.140625, + "grad_norm_var": 0.018778483072916668, + "learning_rate": 0.0001, + "loss": 4.2882, + "loss/crossentropy": 1.7914378643035889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813927054405212, + "step": 15300 + }, + { + "epoch": 0.30604, + "grad_norm": 2.109375, + "grad_norm_var": 0.01895319620768229, + "learning_rate": 0.0001, + "loss": 4.1428, + "loss/crossentropy": 2.046573519706726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20681843161582947, + "step": 15302 + }, + { + "epoch": 0.30608, + "grad_norm": 2.03125, + "grad_norm_var": 0.005960845947265625, + "learning_rate": 0.0001, + "loss": 4.2538, + "loss/crossentropy": 2.0068886280059814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201521098613739, + "step": 15304 + }, + { + "epoch": 0.30612, + "grad_norm": 2.03125, + "grad_norm_var": 0.005509440104166667, + "learning_rate": 0.0001, + "loss": 4.0672, + "loss/crossentropy": 2.2020740509033203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413114875555038, + "step": 15306 + }, + { + "epoch": 0.30616, + "grad_norm": 2.09375, + "grad_norm_var": 0.0057769775390625, + "learning_rate": 0.0001, + "loss": 4.3654, + "loss/crossentropy": 2.3031119108200073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20878810435533524, + "step": 15308 + }, + { + "epoch": 0.3062, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006204986572265625, + "learning_rate": 0.0001, + "loss": 3.7786, + "loss/crossentropy": 1.6445855498313904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1618219092488289, + "step": 15310 + }, + { + "epoch": 0.30624, + "grad_norm": 2.046875, + "grad_norm_var": 0.004572550455729167, + "learning_rate": 0.0001, + "loss": 4.1378, + "loss/crossentropy": 1.995127022266388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21076080948114395, + "step": 15312 + }, + { + "epoch": 0.30628, + "grad_norm": 1.90625, + "grad_norm_var": 0.005411783854166667, + "learning_rate": 0.0001, + "loss": 3.6278, + "loss/crossentropy": 1.8340229392051697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418048858642578, + "step": 15314 + }, + { + "epoch": 0.30632, + "grad_norm": 2.0625, + "grad_norm_var": 0.007106272379557291, + "learning_rate": 0.0001, + "loss": 3.9496, + "loss/crossentropy": 1.713787317276001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17608212679624557, + "step": 15316 + }, + { + "epoch": 0.30636, + "grad_norm": 2.0625, + "grad_norm_var": 0.006967926025390625, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 1.7105762362480164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1760871261358261, + "step": 15318 + }, + { + "epoch": 0.3064, + "grad_norm": 1.9375, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.0326, + "loss/crossentropy": 2.337005376815796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2222321778535843, + "step": 15320 + }, + { + "epoch": 0.30644, + "grad_norm": 2.046875, + "grad_norm_var": 0.008868153889973958, + "learning_rate": 0.0001, + "loss": 4.3269, + "loss/crossentropy": 1.8426868915557861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880369558930397, + "step": 15322 + }, + { + "epoch": 0.30648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009162394205729167, + "learning_rate": 0.0001, + "loss": 3.7798, + "loss/crossentropy": 2.300028443336487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22529813647270203, + "step": 15324 + }, + { + "epoch": 0.30652, + "grad_norm": 1.953125, + "grad_norm_var": 0.009193674723307291, + "learning_rate": 0.0001, + "loss": 3.9838, + "loss/crossentropy": 2.3558120727539062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000514343380928, + "step": 15326 + }, + { + "epoch": 0.30656, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007879384358723958, + "learning_rate": 0.0001, + "loss": 3.6921, + "loss/crossentropy": 1.681145966053009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756032332777977, + "step": 15328 + }, + { + "epoch": 0.3066, + "grad_norm": 2.0, + "grad_norm_var": 0.010298665364583333, + "learning_rate": 0.0001, + "loss": 4.41, + "loss/crossentropy": 2.1566712260246277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067694574594498, + "step": 15330 + }, + { + "epoch": 0.30664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0075347900390625, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.125000774860382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20768606662750244, + "step": 15332 + }, + { + "epoch": 0.30668, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.168649673461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20393794029951096, + "step": 15334 + }, + { + "epoch": 0.30672, + "grad_norm": 2.09375, + "grad_norm_var": 0.008072662353515624, + "learning_rate": 0.0001, + "loss": 3.9154, + "loss/crossentropy": 1.657529592514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19351129233837128, + "step": 15336 + }, + { + "epoch": 0.30676, + "grad_norm": 2.0625, + "grad_norm_var": 0.006780751546223958, + "learning_rate": 0.0001, + "loss": 4.2166, + "loss/crossentropy": 2.115979850292206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20223377645015717, + "step": 15338 + }, + { + "epoch": 0.3068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005833943684895833, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 1.699661135673523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17160461097955704, + "step": 15340 + }, + { + "epoch": 0.30684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008180491129557292, + "learning_rate": 0.0001, + "loss": 3.8418, + "loss/crossentropy": 2.141602098941803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19209860265254974, + "step": 15342 + }, + { + "epoch": 0.30688, + "grad_norm": 2.0625, + "grad_norm_var": 0.007806142171223958, + "learning_rate": 0.0001, + "loss": 4.3619, + "loss/crossentropy": 2.1502809524536133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21510163694620132, + "step": 15344 + }, + { + "epoch": 0.30692, + "grad_norm": 1.984375, + "grad_norm_var": 0.006078847249348958, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.1354172825813293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19959942996501923, + "step": 15346 + }, + { + "epoch": 0.30696, + "grad_norm": 2.015625, + "grad_norm_var": 0.006095377604166666, + "learning_rate": 0.0001, + "loss": 4.1196, + "loss/crossentropy": 2.0634223222732544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073688805103302, + "step": 15348 + }, + { + "epoch": 0.307, + "grad_norm": 2.125, + "grad_norm_var": 0.008385976155598959, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 1.9851733446121216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060396894812584, + "step": 15350 + }, + { + "epoch": 0.30704, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007490793863932292, + "learning_rate": 0.0001, + "loss": 4.1196, + "loss/crossentropy": 2.3790550231933594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23205619305372238, + "step": 15352 + }, + { + "epoch": 0.30708, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008552042643229167, + "learning_rate": 0.0001, + "loss": 3.9085, + "loss/crossentropy": 1.943642258644104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18238085508346558, + "step": 15354 + }, + { + "epoch": 0.30712, + "grad_norm": 2.0, + "grad_norm_var": 0.008988189697265624, + "learning_rate": 0.0001, + "loss": 4.112, + "loss/crossentropy": 1.9680217504501343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18599005788564682, + "step": 15356 + }, + { + "epoch": 0.30716, + "grad_norm": 1.890625, + "grad_norm_var": 0.0077898661295572914, + "learning_rate": 0.0001, + "loss": 3.7402, + "loss/crossentropy": 1.9464871287345886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18814007937908173, + "step": 15358 + }, + { + "epoch": 0.3072, + "grad_norm": 2.265625, + "grad_norm_var": 0.013179524739583334, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.0038596987724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203800708055496, + "step": 15360 + }, + { + "epoch": 0.30724, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013071441650390625, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 1.93240225315094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20595912635326385, + "step": 15362 + }, + { + "epoch": 0.30728, + "grad_norm": 2.0, + "grad_norm_var": 0.012938435872395833, + "learning_rate": 0.0001, + "loss": 4.2042, + "loss/crossentropy": 2.067903220653534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503040939569473, + "step": 15364 + }, + { + "epoch": 0.30732, + "grad_norm": 2.140625, + "grad_norm_var": 0.012674713134765625, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 2.179181218147278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20675741881132126, + "step": 15366 + }, + { + "epoch": 0.30736, + "grad_norm": 2.078125, + "grad_norm_var": 0.02471923828125, + "learning_rate": 0.0001, + "loss": 4.4657, + "loss/crossentropy": 2.106004774570465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23683416098356247, + "step": 15368 + }, + { + "epoch": 0.3074, + "grad_norm": 2.0, + "grad_norm_var": 0.023266347249348958, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 2.2843246459960938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24139763414859772, + "step": 15370 + }, + { + "epoch": 0.30744, + "grad_norm": 2.0, + "grad_norm_var": 0.028484853108723958, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 2.2482924461364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2242894545197487, + "step": 15372 + }, + { + "epoch": 0.30748, + "grad_norm": 1.9765625, + "grad_norm_var": 0.025321451822916667, + "learning_rate": 0.0001, + "loss": 4.221, + "loss/crossentropy": 2.1573593616485596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176649421453476, + "step": 15374 + }, + { + "epoch": 0.30752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.022705078125, + "learning_rate": 0.0001, + "loss": 4.219, + "loss/crossentropy": 1.9243710041046143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19137389957904816, + "step": 15376 + }, + { + "epoch": 0.30756, + "grad_norm": 1.984375, + "grad_norm_var": 0.02211481730143229, + "learning_rate": 0.0001, + "loss": 4.0903, + "loss/crossentropy": 2.004511833190918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20657775551080704, + "step": 15378 + }, + { + "epoch": 0.3076, + "grad_norm": 1.859375, + "grad_norm_var": 0.022956339518229167, + "learning_rate": 0.0001, + "loss": 4.0174, + "loss/crossentropy": 1.8409560918807983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19139550626277924, + "step": 15380 + }, + { + "epoch": 0.30764, + "grad_norm": 2.09375, + "grad_norm_var": 0.019974772135416666, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 2.1442220211029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23278377205133438, + "step": 15382 + }, + { + "epoch": 0.30768, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013651275634765625, + "learning_rate": 0.0001, + "loss": 4.2915, + "loss/crossentropy": 2.086996078491211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20602282136678696, + "step": 15384 + }, + { + "epoch": 0.30772, + "grad_norm": 2.125, + "grad_norm_var": 0.014404042561848959, + "learning_rate": 0.0001, + "loss": 4.4884, + "loss/crossentropy": 2.479863405227661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23815791308879852, + "step": 15386 + }, + { + "epoch": 0.30776, + "grad_norm": 1.953125, + "grad_norm_var": 0.009034983317057292, + "learning_rate": 0.0001, + "loss": 4.0648, + "loss/crossentropy": 1.9218324422836304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22571419924497604, + "step": 15388 + }, + { + "epoch": 0.3078, + "grad_norm": 2.078125, + "grad_norm_var": 0.009004720052083333, + "learning_rate": 0.0001, + "loss": 4.0315, + "loss/crossentropy": 1.9488168954849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004484310746193, + "step": 15390 + }, + { + "epoch": 0.30784, + "grad_norm": 2.09375, + "grad_norm_var": 0.006585439046223958, + "learning_rate": 0.0001, + "loss": 4.3221, + "loss/crossentropy": 2.307602286338806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23881366103887558, + "step": 15392 + }, + { + "epoch": 0.30788, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007283528645833333, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.1380842328071594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019854038953781, + "step": 15394 + }, + { + "epoch": 0.30792, + "grad_norm": 2.125, + "grad_norm_var": 0.0072509765625, + "learning_rate": 0.0001, + "loss": 4.1245, + "loss/crossentropy": 2.0551159977912903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18669020384550095, + "step": 15396 + }, + { + "epoch": 0.30796, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009250640869140625, + "learning_rate": 0.0001, + "loss": 3.959, + "loss/crossentropy": 1.9213807582855225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843273714184761, + "step": 15398 + }, + { + "epoch": 0.308, + "grad_norm": 2.015625, + "grad_norm_var": 0.008628082275390626, + "learning_rate": 0.0001, + "loss": 3.9724, + "loss/crossentropy": 2.0028855204582214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215728610754013, + "step": 15400 + }, + { + "epoch": 0.30804, + "grad_norm": 2.0, + "grad_norm_var": 0.0064776102701822914, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 1.957477331161499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727651566267014, + "step": 15402 + }, + { + "epoch": 0.30808, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 2.2241835594177246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25234874337911606, + "step": 15404 + }, + { + "epoch": 0.30812, + "grad_norm": 2.046875, + "grad_norm_var": 0.006852213541666667, + "learning_rate": 0.0001, + "loss": 4.2677, + "loss/crossentropy": 2.108901560306549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20304366201162338, + "step": 15406 + }, + { + "epoch": 0.30816, + "grad_norm": 2.015625, + "grad_norm_var": 0.0075032552083333336, + "learning_rate": 0.0001, + "loss": 4.302, + "loss/crossentropy": 2.1005085110664368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21205321699380875, + "step": 15408 + }, + { + "epoch": 0.3082, + "grad_norm": 2.03125, + "grad_norm_var": 0.008139801025390626, + "learning_rate": 0.0001, + "loss": 4.2505, + "loss/crossentropy": 2.1565412282943726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22271832078695297, + "step": 15410 + }, + { + "epoch": 0.30824, + "grad_norm": 2.015625, + "grad_norm_var": 0.006241607666015625, + "learning_rate": 0.0001, + "loss": 4.1236, + "loss/crossentropy": 1.906798243522644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21911519020795822, + "step": 15412 + }, + { + "epoch": 0.30828, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006089019775390625, + "learning_rate": 0.0001, + "loss": 4.1679, + "loss/crossentropy": 2.0200547575950623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19892341643571854, + "step": 15414 + }, + { + "epoch": 0.30832, + "grad_norm": 2.03125, + "grad_norm_var": 0.006037394205729167, + "learning_rate": 0.0001, + "loss": 4.302, + "loss/crossentropy": 2.14735209941864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503761410713196, + "step": 15416 + }, + { + "epoch": 0.30836, + "grad_norm": 2.078125, + "grad_norm_var": 0.007405344645182292, + "learning_rate": 0.0001, + "loss": 4.2943, + "loss/crossentropy": 2.0479459166526794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268057823181152, + "step": 15418 + }, + { + "epoch": 0.3084, + "grad_norm": 2.015625, + "grad_norm_var": 0.006831614176432291, + "learning_rate": 0.0001, + "loss": 4.1969, + "loss/crossentropy": 1.4835808873176575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16378673166036606, + "step": 15420 + }, + { + "epoch": 0.30844, + "grad_norm": 1.984375, + "grad_norm_var": 0.007802073160807292, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 1.7942256331443787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18387632071971893, + "step": 15422 + }, + { + "epoch": 0.30848, + "grad_norm": 2.046875, + "grad_norm_var": 0.006534576416015625, + "learning_rate": 0.0001, + "loss": 4.1352, + "loss/crossentropy": 1.8660341501235962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20189601182937622, + "step": 15424 + }, + { + "epoch": 0.30852, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006257120768229167, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 1.8545112013816833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21422524750232697, + "step": 15426 + }, + { + "epoch": 0.30856, + "grad_norm": 1.890625, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 4.1411, + "loss/crossentropy": 2.204701781272888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21733464300632477, + "step": 15428 + }, + { + "epoch": 0.3086, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006087239583333333, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 1.9295769929885864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20931877195835114, + "step": 15430 + }, + { + "epoch": 0.30864, + "grad_norm": 2.140625, + "grad_norm_var": 0.008534495035807292, + "learning_rate": 0.0001, + "loss": 3.9165, + "loss/crossentropy": 2.0090243220329285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928232371807098, + "step": 15432 + }, + { + "epoch": 0.30868, + "grad_norm": 2.0625, + "grad_norm_var": 0.009326171875, + "learning_rate": 0.0001, + "loss": 4.0517, + "loss/crossentropy": 2.1884257793426514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20887935161590576, + "step": 15434 + }, + { + "epoch": 0.30872, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009553019205729167, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.023981511592865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893448904156685, + "step": 15436 + }, + { + "epoch": 0.30876, + "grad_norm": 1.890625, + "grad_norm_var": 0.00909423828125, + "learning_rate": 0.0001, + "loss": 4.0168, + "loss/crossentropy": 2.427197813987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649108827114105, + "step": 15438 + }, + { + "epoch": 0.3088, + "grad_norm": 2.3125, + "grad_norm_var": 0.015478515625, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.2578816413879395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21297947317361832, + "step": 15440 + }, + { + "epoch": 0.30884, + "grad_norm": 2.0625, + "grad_norm_var": 0.015248362223307292, + "learning_rate": 0.0001, + "loss": 4.2977, + "loss/crossentropy": 1.8909358382225037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924509033560753, + "step": 15442 + }, + { + "epoch": 0.30888, + "grad_norm": 2.046875, + "grad_norm_var": 0.016228993733723957, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 2.344139575958252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20414643734693527, + "step": 15444 + }, + { + "epoch": 0.30892, + "grad_norm": 1.96875, + "grad_norm_var": 0.01624755859375, + "learning_rate": 0.0001, + "loss": 4.2373, + "loss/crossentropy": 2.268660068511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22332928329706192, + "step": 15446 + }, + { + "epoch": 0.30896, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0148193359375, + "learning_rate": 0.0001, + "loss": 4.0944, + "loss/crossentropy": 2.1276373267173767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123691812157631, + "step": 15448 + }, + { + "epoch": 0.309, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012975819905598958, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.2471970319747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2228669673204422, + "step": 15450 + }, + { + "epoch": 0.30904, + "grad_norm": 2.0625, + "grad_norm_var": 0.0124908447265625, + "learning_rate": 0.0001, + "loss": 4.2153, + "loss/crossentropy": 2.122064530849457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22981297969818115, + "step": 15452 + }, + { + "epoch": 0.30908, + "grad_norm": 1.7734375, + "grad_norm_var": 0.014823150634765626, + "learning_rate": 0.0001, + "loss": 3.815, + "loss/crossentropy": 2.016224443912506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19105292856693268, + "step": 15454 + }, + { + "epoch": 0.30912, + "grad_norm": 1.8359375, + "grad_norm_var": 0.016193644205729166, + "learning_rate": 0.0001, + "loss": 4.1518, + "loss/crossentropy": 1.9021474719047546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23519159853458405, + "step": 15456 + }, + { + "epoch": 0.30916, + "grad_norm": 2.09375, + "grad_norm_var": 0.016617838541666666, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 2.067293703556061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20525048673152924, + "step": 15458 + }, + { + "epoch": 0.3092, + "grad_norm": 2.3125, + "grad_norm_var": 0.021930948893229166, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.0720648169517517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20188772678375244, + "step": 15460 + }, + { + "epoch": 0.30924, + "grad_norm": 2.015625, + "grad_norm_var": 0.030890909830729167, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.1377363204956055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20943745225667953, + "step": 15462 + }, + { + "epoch": 0.30928, + "grad_norm": 2.078125, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 4.0799, + "loss/crossentropy": 1.9101733565330505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18436391651630402, + "step": 15464 + }, + { + "epoch": 0.30932, + "grad_norm": 2.015625, + "grad_norm_var": 0.026775868733723958, + "learning_rate": 0.0001, + "loss": 4.0945, + "loss/crossentropy": 2.205660581588745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2431517392396927, + "step": 15466 + }, + { + "epoch": 0.30936, + "grad_norm": 2.078125, + "grad_norm_var": 0.02670262654622396, + "learning_rate": 0.0001, + "loss": 4.502, + "loss/crossentropy": 2.3487337827682495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22333069890737534, + "step": 15468 + }, + { + "epoch": 0.3094, + "grad_norm": 1.9609375, + "grad_norm_var": 0.021971638997395834, + "learning_rate": 0.0001, + "loss": 3.9717, + "loss/crossentropy": 1.953830897808075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369011163711548, + "step": 15470 + }, + { + "epoch": 0.30944, + "grad_norm": 2.0625, + "grad_norm_var": 0.0156402587890625, + "learning_rate": 0.0001, + "loss": 4.2282, + "loss/crossentropy": 2.1175807118415833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428738862276077, + "step": 15472 + }, + { + "epoch": 0.30948, + "grad_norm": 1.9375, + "grad_norm_var": 0.016585032145182293, + "learning_rate": 0.0001, + "loss": 4.0257, + "loss/crossentropy": 2.114426612854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113752037286758, + "step": 15474 + }, + { + "epoch": 0.30952, + "grad_norm": 2.03125, + "grad_norm_var": 0.01676203409830729, + "learning_rate": 0.0001, + "loss": 3.8239, + "loss/crossentropy": 2.121155083179474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23428727686405182, + "step": 15476 + }, + { + "epoch": 0.30956, + "grad_norm": 1.9375, + "grad_norm_var": 0.007061513264973959, + "learning_rate": 0.0001, + "loss": 3.9982, + "loss/crossentropy": 2.1271342635154724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993299275636673, + "step": 15478 + }, + { + "epoch": 0.3096, + "grad_norm": 1.921875, + "grad_norm_var": 0.007502237955729167, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 2.132619023323059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428313314914703, + "step": 15480 + }, + { + "epoch": 0.30964, + "grad_norm": 2.296875, + "grad_norm_var": 0.013720703125, + "learning_rate": 0.0001, + "loss": 4.2644, + "loss/crossentropy": 1.8355163931846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969365030527115, + "step": 15482 + }, + { + "epoch": 0.30968, + "grad_norm": 2.21875, + "grad_norm_var": 0.016695149739583335, + "learning_rate": 0.0001, + "loss": 4.0001, + "loss/crossentropy": 1.6765839457511902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19648748636245728, + "step": 15484 + }, + { + "epoch": 0.30972, + "grad_norm": 2.03125, + "grad_norm_var": 0.0168609619140625, + "learning_rate": 0.0001, + "loss": 4.0806, + "loss/crossentropy": 1.7628436088562012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850176900625229, + "step": 15486 + }, + { + "epoch": 0.30976, + "grad_norm": 2.125, + "grad_norm_var": 0.017014312744140624, + "learning_rate": 0.0001, + "loss": 4.5902, + "loss/crossentropy": 2.407894253730774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243446335196495, + "step": 15488 + }, + { + "epoch": 0.3098, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017409006754557293, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.0359743237495422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140478640794754, + "step": 15490 + }, + { + "epoch": 0.30984, + "grad_norm": 2.078125, + "grad_norm_var": 0.012111155192057292, + "learning_rate": 0.0001, + "loss": 4.086, + "loss/crossentropy": 2.1705892086029053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108132392168045, + "step": 15492 + }, + { + "epoch": 0.30988, + "grad_norm": 1.984375, + "grad_norm_var": 0.011126454671223958, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 1.836803376674652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19436977803707123, + "step": 15494 + }, + { + "epoch": 0.30992, + "grad_norm": 2.015625, + "grad_norm_var": 0.010432688395182292, + "learning_rate": 0.0001, + "loss": 4.1018, + "loss/crossentropy": 2.1445683240890503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21950577944517136, + "step": 15496 + }, + { + "epoch": 0.30996, + "grad_norm": 2.09375, + "grad_norm_var": 0.007299550374348958, + "learning_rate": 0.0001, + "loss": 4.1382, + "loss/crossentropy": 2.258531093597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212137870490551, + "step": 15498 + }, + { + "epoch": 0.31, + "grad_norm": 1.96875, + "grad_norm_var": 0.004052480061848958, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 1.9310460686683655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18316251039505005, + "step": 15500 + }, + { + "epoch": 0.31004, + "grad_norm": 1.9375, + "grad_norm_var": 0.004788970947265625, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.867807149887085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18478462100028992, + "step": 15502 + }, + { + "epoch": 0.31008, + "grad_norm": 2.21875, + "grad_norm_var": 0.008042144775390624, + "learning_rate": 0.0001, + "loss": 4.4691, + "loss/crossentropy": 2.0724986791610718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20986932516098022, + "step": 15504 + }, + { + "epoch": 0.31012, + "grad_norm": 2.203125, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.5188, + "loss/crossentropy": 2.1599953174591064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250317633152008, + "step": 15506 + }, + { + "epoch": 0.31016, + "grad_norm": 1.8125, + "grad_norm_var": 0.015608469645182291, + "learning_rate": 0.0001, + "loss": 3.7842, + "loss/crossentropy": 1.975223183631897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104134038090706, + "step": 15508 + }, + { + "epoch": 0.3102, + "grad_norm": 2.21875, + "grad_norm_var": 0.019539388020833333, + "learning_rate": 0.0001, + "loss": 4.0028, + "loss/crossentropy": 2.133773148059845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20153696089982986, + "step": 15510 + }, + { + "epoch": 0.31024, + "grad_norm": 1.984375, + "grad_norm_var": 0.02014948527018229, + "learning_rate": 0.0001, + "loss": 3.9948, + "loss/crossentropy": 1.994078278541565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20573429763317108, + "step": 15512 + }, + { + "epoch": 0.31028, + "grad_norm": 1.953125, + "grad_norm_var": 0.019280751546223957, + "learning_rate": 0.0001, + "loss": 3.8209, + "loss/crossentropy": 1.7885381579399109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889629364013672, + "step": 15514 + }, + { + "epoch": 0.31032, + "grad_norm": 2.0, + "grad_norm_var": 0.019191233317057292, + "learning_rate": 0.0001, + "loss": 4.2765, + "loss/crossentropy": 2.495113968849182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22110097110271454, + "step": 15516 + }, + { + "epoch": 0.31036, + "grad_norm": 1.953125, + "grad_norm_var": 0.019160715738932292, + "learning_rate": 0.0001, + "loss": 4.0465, + "loss/crossentropy": 2.0757648944854736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20948036015033722, + "step": 15518 + }, + { + "epoch": 0.3104, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015793609619140624, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.151356875896454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22598887234926224, + "step": 15520 + }, + { + "epoch": 0.31044, + "grad_norm": 2.0, + "grad_norm_var": 0.010933176676432291, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.1210484504699707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365220427513123, + "step": 15522 + }, + { + "epoch": 0.31048, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008414459228515626, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 2.3766754865646362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22161870449781418, + "step": 15524 + }, + { + "epoch": 0.31052, + "grad_norm": 1.9375, + "grad_norm_var": 0.0043853759765625, + "learning_rate": 0.0001, + "loss": 4.0968, + "loss/crossentropy": 2.33975088596344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598009765148163, + "step": 15526 + }, + { + "epoch": 0.31056, + "grad_norm": 2.1875, + "grad_norm_var": 0.006468709309895833, + "learning_rate": 0.0001, + "loss": 4.4643, + "loss/crossentropy": 2.3024542331695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20656803250312805, + "step": 15528 + }, + { + "epoch": 0.3106, + "grad_norm": 2.078125, + "grad_norm_var": 0.007666015625, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.009307861328125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19054396450519562, + "step": 15530 + }, + { + "epoch": 0.31064, + "grad_norm": 1.90625, + "grad_norm_var": 0.008902994791666667, + "learning_rate": 0.0001, + "loss": 3.9413, + "loss/crossentropy": 1.8119140267372131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19199557602405548, + "step": 15532 + }, + { + "epoch": 0.31068, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009614817301432292, + "learning_rate": 0.0001, + "loss": 3.8753, + "loss/crossentropy": 1.9586234092712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18981865793466568, + "step": 15534 + }, + { + "epoch": 0.31072, + "grad_norm": 1.96875, + "grad_norm_var": 0.010798899332682292, + "learning_rate": 0.0001, + "loss": 4.4319, + "loss/crossentropy": 2.1208658814430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20590776205062866, + "step": 15536 + }, + { + "epoch": 0.31076, + "grad_norm": 1.84375, + "grad_norm_var": 0.012336222330729167, + "learning_rate": 0.0001, + "loss": 3.7806, + "loss/crossentropy": 1.539306402206421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1615876704454422, + "step": 15538 + }, + { + "epoch": 0.3108, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013768513997395834, + "learning_rate": 0.0001, + "loss": 4.0686, + "loss/crossentropy": 2.1376689672470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21377786993980408, + "step": 15540 + }, + { + "epoch": 0.31084, + "grad_norm": 2.140625, + "grad_norm_var": 0.016249338785807293, + "learning_rate": 0.0001, + "loss": 4.368, + "loss/crossentropy": 1.954556941986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19270780682563782, + "step": 15542 + }, + { + "epoch": 0.31088, + "grad_norm": 2.015625, + "grad_norm_var": 0.013575998942057292, + "learning_rate": 0.0001, + "loss": 4.1313, + "loss/crossentropy": 2.0225048661231995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937834918498993, + "step": 15544 + }, + { + "epoch": 0.31092, + "grad_norm": 2.0625, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0919, + "loss/crossentropy": 2.149065852165222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21962979435920715, + "step": 15546 + }, + { + "epoch": 0.31096, + "grad_norm": 2.171875, + "grad_norm_var": 0.013073476155598958, + "learning_rate": 0.0001, + "loss": 4.0192, + "loss/crossentropy": 1.9546958804130554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20022059231996536, + "step": 15548 + }, + { + "epoch": 0.311, + "grad_norm": 1.90625, + "grad_norm_var": 0.013728841145833334, + "learning_rate": 0.0001, + "loss": 4.2815, + "loss/crossentropy": 2.3048110008239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232206016778946, + "step": 15550 + }, + { + "epoch": 0.31104, + "grad_norm": 1.953125, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 3.9263, + "loss/crossentropy": 2.0329134464263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20285522937774658, + "step": 15552 + }, + { + "epoch": 0.31108, + "grad_norm": 2.0, + "grad_norm_var": 0.013138580322265624, + "learning_rate": 0.0001, + "loss": 4.1654, + "loss/crossentropy": 2.0910086631774902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308854430913925, + "step": 15554 + }, + { + "epoch": 0.31112, + "grad_norm": 1.96875, + "grad_norm_var": 0.012361399332682292, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 2.3473485708236694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138630449771881, + "step": 15556 + }, + { + "epoch": 0.31116, + "grad_norm": 1.984375, + "grad_norm_var": 0.0098297119140625, + "learning_rate": 0.0001, + "loss": 4.3456, + "loss/crossentropy": 2.4176318645477295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23624707758426666, + "step": 15558 + }, + { + "epoch": 0.3112, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009943644205729166, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 2.190505266189575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21235676854848862, + "step": 15560 + }, + { + "epoch": 0.31124, + "grad_norm": 2.28125, + "grad_norm_var": 0.1935198465983073, + "learning_rate": 0.0001, + "loss": 4.3688, + "loss/crossentropy": 2.1774531602859497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2568993717432022, + "step": 15562 + }, + { + "epoch": 0.31128, + "grad_norm": 1.9609375, + "grad_norm_var": 0.1934832255045573, + "learning_rate": 0.0001, + "loss": 4.004, + "loss/crossentropy": 2.0320950150489807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21587081998586655, + "step": 15564 + }, + { + "epoch": 0.31132, + "grad_norm": 2.125, + "grad_norm_var": 0.19068781534830728, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.0477761030197144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21909283846616745, + "step": 15566 + }, + { + "epoch": 0.31136, + "grad_norm": 2.09375, + "grad_norm_var": 0.18454996744791666, + "learning_rate": 0.0001, + "loss": 4.2288, + "loss/crossentropy": 2.3907222747802734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21334309875965118, + "step": 15568 + }, + { + "epoch": 0.3114, + "grad_norm": 1.984375, + "grad_norm_var": 0.1816973368326823, + "learning_rate": 0.0001, + "loss": 4.28, + "loss/crossentropy": 2.2539732456207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21502045542001724, + "step": 15570 + }, + { + "epoch": 0.31144, + "grad_norm": 1.96875, + "grad_norm_var": 0.17766520182291667, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 2.036223292350769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959664672613144, + "step": 15572 + }, + { + "epoch": 0.31148, + "grad_norm": 1.984375, + "grad_norm_var": 0.17722880045572917, + "learning_rate": 0.0001, + "loss": 4.0387, + "loss/crossentropy": 1.7659193873405457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19535844773054123, + "step": 15574 + }, + { + "epoch": 0.31152, + "grad_norm": 2.015625, + "grad_norm_var": 0.17946751912434897, + "learning_rate": 0.0001, + "loss": 3.8892, + "loss/crossentropy": 2.0053369402885437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24516403675079346, + "step": 15576 + }, + { + "epoch": 0.31156, + "grad_norm": 2.171875, + "grad_norm_var": 0.0060384114583333336, + "learning_rate": 0.0001, + "loss": 3.9851, + "loss/crossentropy": 2.1804131269454956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2349890023469925, + "step": 15578 + }, + { + "epoch": 0.3116, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008552042643229167, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 2.314085602760315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21073052287101746, + "step": 15580 + }, + { + "epoch": 0.31164, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008973948160807292, + "learning_rate": 0.0001, + "loss": 3.7951, + "loss/crossentropy": 2.1868577003479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520060300827026, + "step": 15582 + }, + { + "epoch": 0.31168, + "grad_norm": 1.953125, + "grad_norm_var": 0.007957967122395833, + "learning_rate": 0.0001, + "loss": 4.1618, + "loss/crossentropy": 1.9785407185554504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18188606202602386, + "step": 15584 + }, + { + "epoch": 0.31172, + "grad_norm": 2.109375, + "grad_norm_var": 0.009598795572916667, + "learning_rate": 0.0001, + "loss": 3.9173, + "loss/crossentropy": 1.9320110082626343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022753208875656, + "step": 15586 + }, + { + "epoch": 0.31176, + "grad_norm": 2.28125, + "grad_norm_var": 0.015653483072916665, + "learning_rate": 0.0001, + "loss": 4.0995, + "loss/crossentropy": 1.7925443649291992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17203447967767715, + "step": 15588 + }, + { + "epoch": 0.3118, + "grad_norm": 1.9609375, + "grad_norm_var": 0.016556803385416666, + "learning_rate": 0.0001, + "loss": 3.8876, + "loss/crossentropy": 2.048543393611908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18696243315935135, + "step": 15590 + }, + { + "epoch": 0.31184, + "grad_norm": 1.921875, + "grad_norm_var": 0.01658935546875, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.376060724258423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20351988822221756, + "step": 15592 + }, + { + "epoch": 0.31188, + "grad_norm": 2.171875, + "grad_norm_var": 0.0164947509765625, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 1.5962707996368408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1538907140493393, + "step": 15594 + }, + { + "epoch": 0.31192, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016281890869140624, + "learning_rate": 0.0001, + "loss": 3.7529, + "loss/crossentropy": 1.7845428586006165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18107154965400696, + "step": 15596 + }, + { + "epoch": 0.31196, + "grad_norm": 2.109375, + "grad_norm_var": 0.01536865234375, + "learning_rate": 0.0001, + "loss": 4.0385, + "loss/crossentropy": 1.8557100296020508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010527327656746, + "step": 15598 + }, + { + "epoch": 0.312, + "grad_norm": 1.921875, + "grad_norm_var": 0.015372721354166667, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 1.8659257888793945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19198190420866013, + "step": 15600 + }, + { + "epoch": 0.31204, + "grad_norm": 2.046875, + "grad_norm_var": 0.014078521728515625, + "learning_rate": 0.0001, + "loss": 3.9842, + "loss/crossentropy": 2.0340747833251953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19963373243808746, + "step": 15602 + }, + { + "epoch": 0.31208, + "grad_norm": 2.09375, + "grad_norm_var": 0.00931396484375, + "learning_rate": 0.0001, + "loss": 4.1192, + "loss/crossentropy": 2.0676557421684265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130064338445663, + "step": 15604 + }, + { + "epoch": 0.31212, + "grad_norm": 1.84375, + "grad_norm_var": 0.008397420247395834, + "learning_rate": 0.0001, + "loss": 3.8069, + "loss/crossentropy": 2.038145124912262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034953236579895, + "step": 15606 + }, + { + "epoch": 0.31216, + "grad_norm": 2.65625, + "grad_norm_var": 0.03449605305989583, + "learning_rate": 0.0001, + "loss": 4.3615, + "loss/crossentropy": 2.3371706008911133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21727421879768372, + "step": 15608 + }, + { + "epoch": 0.3122, + "grad_norm": 2.03125, + "grad_norm_var": 0.03327611287434896, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.3314318656921387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064831778407097, + "step": 15610 + }, + { + "epoch": 0.31224, + "grad_norm": 1.9375, + "grad_norm_var": 0.03206558227539062, + "learning_rate": 0.0001, + "loss": 3.9634, + "loss/crossentropy": 1.8074566721916199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755329668521881, + "step": 15612 + }, + { + "epoch": 0.31228, + "grad_norm": 1.96875, + "grad_norm_var": 0.032138824462890625, + "learning_rate": 0.0001, + "loss": 4.1849, + "loss/crossentropy": 2.1219228506088257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23183195292949677, + "step": 15614 + }, + { + "epoch": 0.31232, + "grad_norm": 2.03125, + "grad_norm_var": 0.031434885660807294, + "learning_rate": 0.0001, + "loss": 4.1418, + "loss/crossentropy": 2.3573983907699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20713336020708084, + "step": 15616 + }, + { + "epoch": 0.31236, + "grad_norm": 1.859375, + "grad_norm_var": 0.033933258056640624, + "learning_rate": 0.0001, + "loss": 4.048, + "loss/crossentropy": 2.0748316049575806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384868443012238, + "step": 15618 + }, + { + "epoch": 0.3124, + "grad_norm": 2.03125, + "grad_norm_var": 0.033882395426432295, + "learning_rate": 0.0001, + "loss": 4.1834, + "loss/crossentropy": 1.7933568358421326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711802154779434, + "step": 15620 + }, + { + "epoch": 0.31244, + "grad_norm": 2.046875, + "grad_norm_var": 0.03115819295247396, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 1.8922778367996216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19847699999809265, + "step": 15622 + }, + { + "epoch": 0.31248, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0048004150390625, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 2.3432679176330566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804179042577744, + "step": 15624 + }, + { + "epoch": 0.31252, + "grad_norm": 2.203125, + "grad_norm_var": 0.007469685872395834, + "learning_rate": 0.0001, + "loss": 4.1215, + "loss/crossentropy": 2.340356707572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22454006224870682, + "step": 15626 + }, + { + "epoch": 0.31256, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010737864176432292, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 2.037777841091156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20374249666929245, + "step": 15628 + }, + { + "epoch": 0.3126, + "grad_norm": 2.078125, + "grad_norm_var": 0.010562896728515625, + "learning_rate": 0.0001, + "loss": 4.3193, + "loss/crossentropy": 2.2576274275779724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19910124689340591, + "step": 15630 + }, + { + "epoch": 0.31264, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011302693684895834, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 2.1354891061782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20646820217370987, + "step": 15632 + }, + { + "epoch": 0.31268, + "grad_norm": 2.046875, + "grad_norm_var": 0.011136627197265625, + "learning_rate": 0.0001, + "loss": 3.906, + "loss/crossentropy": 1.8888733386993408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20730097591876984, + "step": 15634 + }, + { + "epoch": 0.31272, + "grad_norm": 1.921875, + "grad_norm_var": 0.012474568684895833, + "learning_rate": 0.0001, + "loss": 3.9522, + "loss/crossentropy": 2.1037773489952087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20308485627174377, + "step": 15636 + }, + { + "epoch": 0.31276, + "grad_norm": 2.21875, + "grad_norm_var": 0.017085774739583334, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 1.996969223022461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080031782388687, + "step": 15638 + }, + { + "epoch": 0.3128, + "grad_norm": 2.671875, + "grad_norm_var": 0.0469146728515625, + "learning_rate": 0.0001, + "loss": 3.915, + "loss/crossentropy": 2.0528018474578857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248053327202797, + "step": 15640 + }, + { + "epoch": 0.31284, + "grad_norm": 1.9765625, + "grad_norm_var": 0.044864654541015625, + "learning_rate": 0.0001, + "loss": 4.1718, + "loss/crossentropy": 2.270516335964203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578035175800323, + "step": 15642 + }, + { + "epoch": 0.31288, + "grad_norm": 2.046875, + "grad_norm_var": 0.0400299072265625, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.146915912628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21282948553562164, + "step": 15644 + }, + { + "epoch": 0.31292, + "grad_norm": 2.125, + "grad_norm_var": 0.041562652587890624, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 2.058235287666321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20604285597801208, + "step": 15646 + }, + { + "epoch": 0.31296, + "grad_norm": 2.015625, + "grad_norm_var": 0.0414215087890625, + "learning_rate": 0.0001, + "loss": 4.4304, + "loss/crossentropy": 2.3217705488204956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24202678352594376, + "step": 15648 + }, + { + "epoch": 0.313, + "grad_norm": 2.0625, + "grad_norm_var": 0.041257476806640624, + "learning_rate": 0.0001, + "loss": 4.1372, + "loss/crossentropy": 2.108501672744751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358111709356308, + "step": 15650 + }, + { + "epoch": 0.31304, + "grad_norm": 1.859375, + "grad_norm_var": 0.041025543212890626, + "learning_rate": 0.0001, + "loss": 4.1224, + "loss/crossentropy": 2.0875802636146545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20186328142881393, + "step": 15652 + }, + { + "epoch": 0.31308, + "grad_norm": 2.0625, + "grad_norm_var": 0.036321767171223956, + "learning_rate": 0.0001, + "loss": 4.1994, + "loss/crossentropy": 2.0159415006637573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053827941417694, + "step": 15654 + }, + { + "epoch": 0.31312, + "grad_norm": 2.015625, + "grad_norm_var": 0.008402252197265625, + "learning_rate": 0.0001, + "loss": 3.9711, + "loss/crossentropy": 1.8664127588272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666206300258636, + "step": 15656 + }, + { + "epoch": 0.31316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008548736572265625, + "learning_rate": 0.0001, + "loss": 3.9399, + "loss/crossentropy": 1.9351357221603394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189142145216465, + "step": 15658 + }, + { + "epoch": 0.3132, + "grad_norm": 1.96875, + "grad_norm_var": 0.010872141520182291, + "learning_rate": 0.0001, + "loss": 4.3288, + "loss/crossentropy": 1.9937176704406738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18212904781103134, + "step": 15660 + }, + { + "epoch": 0.31324, + "grad_norm": 2.03125, + "grad_norm_var": 0.009801991780598958, + "learning_rate": 0.0001, + "loss": 4.1238, + "loss/crossentropy": 2.0626447796821594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908795297145844, + "step": 15662 + }, + { + "epoch": 0.31328, + "grad_norm": 1.9375, + "grad_norm_var": 0.008506011962890626, + "learning_rate": 0.0001, + "loss": 4.0436, + "loss/crossentropy": 1.8491488695144653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241358548402786, + "step": 15664 + }, + { + "epoch": 0.31332, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 2.2744827270507812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295297086238861, + "step": 15666 + }, + { + "epoch": 0.31336, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0078521728515625, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 2.30733585357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23149366676807404, + "step": 15668 + }, + { + "epoch": 0.3134, + "grad_norm": 2.140625, + "grad_norm_var": 0.00892333984375, + "learning_rate": 0.0001, + "loss": 4.1357, + "loss/crossentropy": 2.038204550743103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608831405639648, + "step": 15670 + }, + { + "epoch": 0.31344, + "grad_norm": 1.953125, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 4.1235, + "loss/crossentropy": 2.076035261154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21801196038722992, + "step": 15672 + }, + { + "epoch": 0.31348, + "grad_norm": 1.7734375, + "grad_norm_var": 0.011310831705729166, + "learning_rate": 0.0001, + "loss": 4.0923, + "loss/crossentropy": 1.9234278202056885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18380460888147354, + "step": 15674 + }, + { + "epoch": 0.31352, + "grad_norm": 2.125, + "grad_norm_var": 0.00931396484375, + "learning_rate": 0.0001, + "loss": 4.1002, + "loss/crossentropy": 1.9312421679496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18474044650793076, + "step": 15676 + }, + { + "epoch": 0.31356, + "grad_norm": 2.0625, + "grad_norm_var": 0.009716542561848958, + "learning_rate": 0.0001, + "loss": 4.079, + "loss/crossentropy": 2.014132261276245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045312151312828, + "step": 15678 + }, + { + "epoch": 0.3136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0099365234375, + "learning_rate": 0.0001, + "loss": 4.2325, + "loss/crossentropy": 1.925381362438202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18306680023670197, + "step": 15680 + }, + { + "epoch": 0.31364, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009346516927083333, + "learning_rate": 0.0001, + "loss": 4.0766, + "loss/crossentropy": 1.932526171207428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741897821426392, + "step": 15682 + }, + { + "epoch": 0.31368, + "grad_norm": 2.015625, + "grad_norm_var": 0.008963775634765626, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 2.1144750714302063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398797303438187, + "step": 15684 + }, + { + "epoch": 0.31372, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0076983133951822914, + "learning_rate": 0.0001, + "loss": 3.8411, + "loss/crossentropy": 1.8115296363830566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869400143623352, + "step": 15686 + }, + { + "epoch": 0.31376, + "grad_norm": 2.296875, + "grad_norm_var": 0.014686838785807291, + "learning_rate": 0.0001, + "loss": 4.0935, + "loss/crossentropy": 2.0550562739372253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093418911099434, + "step": 15688 + }, + { + "epoch": 0.3138, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013541412353515626, + "learning_rate": 0.0001, + "loss": 3.8549, + "loss/crossentropy": 2.144737482070923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20025025308132172, + "step": 15690 + }, + { + "epoch": 0.31384, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012629191080729166, + "learning_rate": 0.0001, + "loss": 3.9911, + "loss/crossentropy": 2.2495052814483643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21178698539733887, + "step": 15692 + }, + { + "epoch": 0.31388, + "grad_norm": 2.03125, + "grad_norm_var": 0.012740071614583333, + "learning_rate": 0.0001, + "loss": 4.2208, + "loss/crossentropy": 2.1858155727386475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20891498774290085, + "step": 15694 + }, + { + "epoch": 0.31392, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.1480127573013306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149479240179062, + "step": 15696 + }, + { + "epoch": 0.31396, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013329060872395833, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 2.1585733294487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20317788422107697, + "step": 15698 + }, + { + "epoch": 0.314, + "grad_norm": 1.9609375, + "grad_norm_var": 0.2103179931640625, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.1281010508537292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032032608985901, + "step": 15700 + }, + { + "epoch": 0.31404, + "grad_norm": 1.9375, + "grad_norm_var": 0.2096588134765625, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 2.160776972770691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20999448001384735, + "step": 15702 + }, + { + "epoch": 0.31408, + "grad_norm": 1.9140625, + "grad_norm_var": 0.20640869140625, + "learning_rate": 0.0001, + "loss": 3.961, + "loss/crossentropy": 2.1508986949920654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675271540880203, + "step": 15704 + }, + { + "epoch": 0.31412, + "grad_norm": 2.109375, + "grad_norm_var": 0.2011871337890625, + "learning_rate": 0.0001, + "loss": 4.1573, + "loss/crossentropy": 2.083084225654602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21110595017671585, + "step": 15706 + }, + { + "epoch": 0.31416, + "grad_norm": 1.8046875, + "grad_norm_var": 0.20275472005208334, + "learning_rate": 0.0001, + "loss": 3.9785, + "loss/crossentropy": 2.1268805265426636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728368312120438, + "step": 15708 + }, + { + "epoch": 0.3142, + "grad_norm": 2.078125, + "grad_norm_var": 0.20265706380208334, + "learning_rate": 0.0001, + "loss": 4.4291, + "loss/crossentropy": 2.156775116920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22175253927707672, + "step": 15710 + }, + { + "epoch": 0.31424, + "grad_norm": 1.984375, + "grad_norm_var": 0.20224583943684896, + "learning_rate": 0.0001, + "loss": 4.1537, + "loss/crossentropy": 2.2541415691375732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22773633152246475, + "step": 15712 + }, + { + "epoch": 0.31428, + "grad_norm": 2.0625, + "grad_norm_var": 0.20350723266601561, + "learning_rate": 0.0001, + "loss": 3.9166, + "loss/crossentropy": 1.6463716626167297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172992020845413, + "step": 15714 + }, + { + "epoch": 0.31432, + "grad_norm": 2.03125, + "grad_norm_var": 0.011669921875, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 1.9325580596923828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19147639721632004, + "step": 15716 + }, + { + "epoch": 0.31436, + "grad_norm": 2.109375, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 2.4016847610473633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058410421013832, + "step": 15718 + }, + { + "epoch": 0.3144, + "grad_norm": 2.125, + "grad_norm_var": 0.014338175455729166, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 2.3456791639328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22646376490592957, + "step": 15720 + }, + { + "epoch": 0.31444, + "grad_norm": 2.046875, + "grad_norm_var": 0.013785807291666667, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 1.6703909635543823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818019300699234, + "step": 15722 + }, + { + "epoch": 0.31448, + "grad_norm": 1.953125, + "grad_norm_var": 0.011065419514973958, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 2.0004162192344666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033773437142372, + "step": 15724 + }, + { + "epoch": 0.31452, + "grad_norm": 2.0, + "grad_norm_var": 0.009905751546223958, + "learning_rate": 0.0001, + "loss": 4.1799, + "loss/crossentropy": 2.018653154373169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20320426672697067, + "step": 15726 + }, + { + "epoch": 0.31456, + "grad_norm": 2.046875, + "grad_norm_var": 0.009592437744140625, + "learning_rate": 0.0001, + "loss": 4.0991, + "loss/crossentropy": 2.0881033539772034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126169577240944, + "step": 15728 + }, + { + "epoch": 0.3146, + "grad_norm": 2.046875, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.1103954911231995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163369059562683, + "step": 15730 + }, + { + "epoch": 0.31464, + "grad_norm": 2.015625, + "grad_norm_var": 0.007868448893229166, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 1.9757474660873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21398526430130005, + "step": 15732 + }, + { + "epoch": 0.31468, + "grad_norm": 1.921875, + "grad_norm_var": 0.006304677327473958, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 2.114749312400818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21620750427246094, + "step": 15734 + }, + { + "epoch": 0.31472, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007163238525390625, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 1.9022989869117737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895657405257225, + "step": 15736 + }, + { + "epoch": 0.31476, + "grad_norm": 2.296875, + "grad_norm_var": 0.013059234619140625, + "learning_rate": 0.0001, + "loss": 4.2254, + "loss/crossentropy": 2.0457635521888733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281428635120392, + "step": 15738 + }, + { + "epoch": 0.3148, + "grad_norm": 2.015625, + "grad_norm_var": 0.011954752604166667, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 1.9585834741592407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003040909767151, + "step": 15740 + }, + { + "epoch": 0.31484, + "grad_norm": 2.046875, + "grad_norm_var": 0.014745076497395834, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.06454074382782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22020584344863892, + "step": 15742 + }, + { + "epoch": 0.31488, + "grad_norm": 1.921875, + "grad_norm_var": 0.015665690104166668, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.2651820182800293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22998705506324768, + "step": 15744 + }, + { + "epoch": 0.31492, + "grad_norm": 1.9375, + "grad_norm_var": 0.0172607421875, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 2.269286036491394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012098342180252, + "step": 15746 + }, + { + "epoch": 0.31496, + "grad_norm": 1.953125, + "grad_norm_var": 0.017891438802083333, + "learning_rate": 0.0001, + "loss": 4.0604, + "loss/crossentropy": 1.6778026223182678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17330823838710785, + "step": 15748 + }, + { + "epoch": 0.315, + "grad_norm": 1.78125, + "grad_norm_var": 0.020775349934895833, + "learning_rate": 0.0001, + "loss": 3.9785, + "loss/crossentropy": 1.8929405808448792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18170948326587677, + "step": 15750 + }, + { + "epoch": 0.31504, + "grad_norm": 1.953125, + "grad_norm_var": 0.01980768839518229, + "learning_rate": 0.0001, + "loss": 4.0874, + "loss/crossentropy": 1.8814340233802795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097821682691574, + "step": 15752 + }, + { + "epoch": 0.31508, + "grad_norm": 2.09375, + "grad_norm_var": 0.014465077718098959, + "learning_rate": 0.0001, + "loss": 3.8622, + "loss/crossentropy": 1.7101264595985413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834770292043686, + "step": 15754 + }, + { + "epoch": 0.31512, + "grad_norm": 1.828125, + "grad_norm_var": 0.01607233683268229, + "learning_rate": 0.0001, + "loss": 3.8498, + "loss/crossentropy": 2.0287395119667053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603880494832993, + "step": 15756 + }, + { + "epoch": 0.31516, + "grad_norm": 1.875, + "grad_norm_var": 0.010467274983723959, + "learning_rate": 0.0001, + "loss": 3.7485, + "loss/crossentropy": 2.0119568705558777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212313212454319, + "step": 15758 + }, + { + "epoch": 0.3152, + "grad_norm": 2.015625, + "grad_norm_var": 0.011441802978515625, + "learning_rate": 0.0001, + "loss": 4.5141, + "loss/crossentropy": 1.8937278985977173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774902269244194, + "step": 15760 + }, + { + "epoch": 0.31524, + "grad_norm": 2.015625, + "grad_norm_var": 0.009104156494140625, + "learning_rate": 0.0001, + "loss": 4.3562, + "loss/crossentropy": 2.557780146598816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22111114859580994, + "step": 15762 + }, + { + "epoch": 0.31528, + "grad_norm": 2.1875, + "grad_norm_var": 0.011262003580729167, + "learning_rate": 0.0001, + "loss": 4.1646, + "loss/crossentropy": 2.1467400789260864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134222611784935, + "step": 15764 + }, + { + "epoch": 0.31532, + "grad_norm": 2.03125, + "grad_norm_var": 0.00814208984375, + "learning_rate": 0.0001, + "loss": 3.9631, + "loss/crossentropy": 1.9758468270301819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1853625252842903, + "step": 15766 + }, + { + "epoch": 0.31536, + "grad_norm": 2.171875, + "grad_norm_var": 0.009699503580729166, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 1.8620794415473938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18791437149047852, + "step": 15768 + }, + { + "epoch": 0.3154, + "grad_norm": 2.03125, + "grad_norm_var": 0.008893839518229167, + "learning_rate": 0.0001, + "loss": 4.2066, + "loss/crossentropy": 2.187020480632782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2102060467004776, + "step": 15770 + }, + { + "epoch": 0.31544, + "grad_norm": 2.03125, + "grad_norm_var": 0.006522369384765625, + "learning_rate": 0.0001, + "loss": 4.1828, + "loss/crossentropy": 2.0049954652786255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19090547412633896, + "step": 15772 + }, + { + "epoch": 0.31548, + "grad_norm": 2.03125, + "grad_norm_var": 0.010357411702473958, + "learning_rate": 0.0001, + "loss": 4.195, + "loss/crossentropy": 2.0030736327171326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134840160608292, + "step": 15774 + }, + { + "epoch": 0.31552, + "grad_norm": 2.0, + "grad_norm_var": 0.010687001546223958, + "learning_rate": 0.0001, + "loss": 4.3092, + "loss/crossentropy": 1.8941256999969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20826192945241928, + "step": 15776 + }, + { + "epoch": 0.31556, + "grad_norm": 1.8046875, + "grad_norm_var": 0.015596516927083333, + "learning_rate": 0.0001, + "loss": 4.0767, + "loss/crossentropy": 2.0645129680633545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19137012213468552, + "step": 15778 + }, + { + "epoch": 0.3156, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014240519205729166, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.198352813720703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21606199443340302, + "step": 15780 + }, + { + "epoch": 0.31564, + "grad_norm": 2.125, + "grad_norm_var": 0.014020792643229167, + "learning_rate": 0.0001, + "loss": 4.1544, + "loss/crossentropy": 1.9041627049446106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19438787549734116, + "step": 15782 + }, + { + "epoch": 0.31568, + "grad_norm": 2.078125, + "grad_norm_var": 0.027741495768229166, + "learning_rate": 0.0001, + "loss": 4.2615, + "loss/crossentropy": 2.110167443752289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026640698313713, + "step": 15784 + }, + { + "epoch": 0.31572, + "grad_norm": 2.171875, + "grad_norm_var": 0.02655029296875, + "learning_rate": 0.0001, + "loss": 4.1932, + "loss/crossentropy": 2.5422832369804382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274986326694489, + "step": 15786 + }, + { + "epoch": 0.31576, + "grad_norm": 1.921875, + "grad_norm_var": 0.02840576171875, + "learning_rate": 0.0001, + "loss": 4.2038, + "loss/crossentropy": 2.106445074081421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20275159180164337, + "step": 15788 + }, + { + "epoch": 0.3158, + "grad_norm": 2.375, + "grad_norm_var": 0.029857381184895834, + "learning_rate": 0.0001, + "loss": 4.2718, + "loss/crossentropy": 2.341952681541443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22970512509346008, + "step": 15790 + }, + { + "epoch": 0.31584, + "grad_norm": 1.96875, + "grad_norm_var": 0.030228678385416666, + "learning_rate": 0.0001, + "loss": 4.238, + "loss/crossentropy": 2.120057761669159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171308308839798, + "step": 15792 + }, + { + "epoch": 0.31588, + "grad_norm": 1.8671875, + "grad_norm_var": 0.027644856770833334, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 1.5601414442062378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278310775756836, + "step": 15794 + }, + { + "epoch": 0.31592, + "grad_norm": 1.984375, + "grad_norm_var": 0.027596028645833333, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 2.0634138584136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815958246588707, + "step": 15796 + }, + { + "epoch": 0.31596, + "grad_norm": 1.953125, + "grad_norm_var": 0.0312652587890625, + "learning_rate": 0.0001, + "loss": 3.8183, + "loss/crossentropy": 1.8437300324440002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18099966645240784, + "step": 15798 + }, + { + "epoch": 0.316, + "grad_norm": 1.984375, + "grad_norm_var": 0.015897369384765624, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 1.9323074221611023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884680539369583, + "step": 15800 + }, + { + "epoch": 0.31604, + "grad_norm": 2.265625, + "grad_norm_var": 0.018027496337890626, + "learning_rate": 0.0001, + "loss": 4.3145, + "loss/crossentropy": 2.158499598503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23394601047039032, + "step": 15802 + }, + { + "epoch": 0.31608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018062337239583334, + "learning_rate": 0.0001, + "loss": 4.0894, + "loss/crossentropy": 1.9635908007621765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20692527294158936, + "step": 15804 + }, + { + "epoch": 0.31612, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010503896077473958, + "learning_rate": 0.0001, + "loss": 4.1912, + "loss/crossentropy": 1.96099454164505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1971232146024704, + "step": 15806 + }, + { + "epoch": 0.31616, + "grad_norm": 2.09375, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 4.2205, + "loss/crossentropy": 1.9690999388694763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19965435564517975, + "step": 15808 + }, + { + "epoch": 0.3162, + "grad_norm": 2.015625, + "grad_norm_var": 0.011354319254557292, + "learning_rate": 0.0001, + "loss": 4.113, + "loss/crossentropy": 2.3924723863601685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192596197128296, + "step": 15810 + }, + { + "epoch": 0.31624, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013999176025390626, + "learning_rate": 0.0001, + "loss": 3.8794, + "loss/crossentropy": 2.137382209300995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20949169248342514, + "step": 15812 + }, + { + "epoch": 0.31628, + "grad_norm": 2.0625, + "grad_norm_var": 0.013103993733723958, + "learning_rate": 0.0001, + "loss": 4.0187, + "loss/crossentropy": 1.986245334148407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19678276032209396, + "step": 15814 + }, + { + "epoch": 0.31632, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013492584228515625, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 1.9210307598114014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18748773634433746, + "step": 15816 + }, + { + "epoch": 0.31636, + "grad_norm": 2.0625, + "grad_norm_var": 0.0087066650390625, + "learning_rate": 0.0001, + "loss": 4.1328, + "loss/crossentropy": 1.9330076575279236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187862008810043, + "step": 15818 + }, + { + "epoch": 0.3164, + "grad_norm": 2.015625, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 4.2475, + "loss/crossentropy": 2.275822639465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303369864821434, + "step": 15820 + }, + { + "epoch": 0.31644, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0082275390625, + "learning_rate": 0.0001, + "loss": 4.3641, + "loss/crossentropy": 1.9317327737808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19543734192848206, + "step": 15822 + }, + { + "epoch": 0.31648, + "grad_norm": 2.109375, + "grad_norm_var": 0.008715565999348958, + "learning_rate": 0.0001, + "loss": 4.3209, + "loss/crossentropy": 2.2146100997924805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22172697633504868, + "step": 15824 + }, + { + "epoch": 0.31652, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007173411051432292, + "learning_rate": 0.0001, + "loss": 4.0226, + "loss/crossentropy": 2.296873092651367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22240595519542694, + "step": 15826 + }, + { + "epoch": 0.31656, + "grad_norm": 1.953125, + "grad_norm_var": 0.005900065104166667, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.130843997001648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288827806711197, + "step": 15828 + }, + { + "epoch": 0.3166, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017856597900390625, + "learning_rate": 0.0001, + "loss": 4.1067, + "loss/crossentropy": 2.042823553085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972874104976654, + "step": 15830 + }, + { + "epoch": 0.31664, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01701838175455729, + "learning_rate": 0.0001, + "loss": 3.8878, + "loss/crossentropy": 1.8040228486061096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19943677634000778, + "step": 15832 + }, + { + "epoch": 0.31668, + "grad_norm": 2.1875, + "grad_norm_var": 0.018400065104166665, + "learning_rate": 0.0001, + "loss": 4.2647, + "loss/crossentropy": 2.085720181465149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2592329904437065, + "step": 15834 + }, + { + "epoch": 0.31672, + "grad_norm": 2.046875, + "grad_norm_var": 0.022334798177083334, + "learning_rate": 0.0001, + "loss": 3.7694, + "loss/crossentropy": 1.9271164536476135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766722872853279, + "step": 15836 + }, + { + "epoch": 0.31676, + "grad_norm": 2.09375, + "grad_norm_var": 0.022106679280598958, + "learning_rate": 0.0001, + "loss": 4.0742, + "loss/crossentropy": 2.188960611820221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19409415870904922, + "step": 15838 + }, + { + "epoch": 0.3168, + "grad_norm": 2.0, + "grad_norm_var": 0.020949045817057293, + "learning_rate": 0.0001, + "loss": 3.7821, + "loss/crossentropy": 1.5765752792358398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16517025232315063, + "step": 15840 + }, + { + "epoch": 0.31684, + "grad_norm": 2.203125, + "grad_norm_var": 0.020804595947265626, + "learning_rate": 0.0001, + "loss": 4.3356, + "loss/crossentropy": 1.9913761019706726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20838972181081772, + "step": 15842 + }, + { + "epoch": 0.31688, + "grad_norm": 1.984375, + "grad_norm_var": 0.034366607666015625, + "learning_rate": 0.0001, + "loss": 4.3525, + "loss/crossentropy": 1.8618363738059998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19304241985082626, + "step": 15844 + }, + { + "epoch": 0.31692, + "grad_norm": 1.9375, + "grad_norm_var": 0.026911417643229168, + "learning_rate": 0.0001, + "loss": 3.8087, + "loss/crossentropy": 1.9099775552749634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2251715511083603, + "step": 15846 + }, + { + "epoch": 0.31696, + "grad_norm": 1.953125, + "grad_norm_var": 0.026486968994140624, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.3534491062164307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920790195465088, + "step": 15848 + }, + { + "epoch": 0.317, + "grad_norm": 2.203125, + "grad_norm_var": 0.02804743448893229, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.1890534162521362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140251249074936, + "step": 15850 + }, + { + "epoch": 0.31704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0298980712890625, + "learning_rate": 0.0001, + "loss": 3.6303, + "loss/crossentropy": 1.880380094051361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18094944953918457, + "step": 15852 + }, + { + "epoch": 0.31708, + "grad_norm": 2.4375, + "grad_norm_var": 0.03880106608072917, + "learning_rate": 0.0001, + "loss": 4.3276, + "loss/crossentropy": 1.8723257184028625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101321741938591, + "step": 15854 + }, + { + "epoch": 0.31712, + "grad_norm": 1.953125, + "grad_norm_var": 0.039582316080729166, + "learning_rate": 0.0001, + "loss": 4.077, + "loss/crossentropy": 2.0442943572998047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947987675666809, + "step": 15856 + }, + { + "epoch": 0.31716, + "grad_norm": 2.0625, + "grad_norm_var": 0.03819071451822917, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.0415892601013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022911012172699, + "step": 15858 + }, + { + "epoch": 0.3172, + "grad_norm": 1.875, + "grad_norm_var": 0.02176513671875, + "learning_rate": 0.0001, + "loss": 3.8801, + "loss/crossentropy": 1.8290876150131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914537101984024, + "step": 15860 + }, + { + "epoch": 0.31724, + "grad_norm": 1.859375, + "grad_norm_var": 0.022904459635416666, + "learning_rate": 0.0001, + "loss": 3.8257, + "loss/crossentropy": 1.6169148087501526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117067337036133, + "step": 15862 + }, + { + "epoch": 0.31728, + "grad_norm": 2.0, + "grad_norm_var": 0.022648111979166666, + "learning_rate": 0.0001, + "loss": 4.2553, + "loss/crossentropy": 2.3701289892196655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2557762861251831, + "step": 15864 + }, + { + "epoch": 0.31732, + "grad_norm": 2.03125, + "grad_norm_var": 0.019657135009765625, + "learning_rate": 0.0001, + "loss": 4.2299, + "loss/crossentropy": 2.2519643306732178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21390919387340546, + "step": 15866 + }, + { + "epoch": 0.31736, + "grad_norm": 2.015625, + "grad_norm_var": 0.015409088134765625, + "learning_rate": 0.0001, + "loss": 4.266, + "loss/crossentropy": 2.3287068009376526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22211260348558426, + "step": 15868 + }, + { + "epoch": 0.3174, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004401652018229166, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 2.144728899002075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148151397705078, + "step": 15870 + }, + { + "epoch": 0.31744, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004847971598307291, + "learning_rate": 0.0001, + "loss": 3.9042, + "loss/crossentropy": 2.120876669883728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692734956741333, + "step": 15872 + }, + { + "epoch": 0.31748, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004703521728515625, + "learning_rate": 0.0001, + "loss": 3.9254, + "loss/crossentropy": 2.2827231884002686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714152604341507, + "step": 15874 + }, + { + "epoch": 0.31752, + "grad_norm": 1.84375, + "grad_norm_var": 0.004788970947265625, + "learning_rate": 0.0001, + "loss": 3.8742, + "loss/crossentropy": 1.8011438846588135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19294942915439606, + "step": 15876 + }, + { + "epoch": 0.31756, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0048258463541666664, + "learning_rate": 0.0001, + "loss": 3.9082, + "loss/crossentropy": 1.9359464049339294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418440759181976, + "step": 15878 + }, + { + "epoch": 0.3176, + "grad_norm": 2.03125, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.4035, + "loss/crossentropy": 2.220693826675415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2583991438150406, + "step": 15880 + }, + { + "epoch": 0.31764, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006254069010416667, + "learning_rate": 0.0001, + "loss": 3.9762, + "loss/crossentropy": 1.7859990000724792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756511777639389, + "step": 15882 + }, + { + "epoch": 0.31768, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00587158203125, + "learning_rate": 0.0001, + "loss": 3.7008, + "loss/crossentropy": 1.9461398720741272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18634959310293198, + "step": 15884 + }, + { + "epoch": 0.31772, + "grad_norm": 2.234375, + "grad_norm_var": 0.010833485921223959, + "learning_rate": 0.0001, + "loss": 4.3732, + "loss/crossentropy": 2.1066328287124634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553827613592148, + "step": 15886 + }, + { + "epoch": 0.31776, + "grad_norm": 2.03125, + "grad_norm_var": 0.011087799072265625, + "learning_rate": 0.0001, + "loss": 3.8751, + "loss/crossentropy": 1.7809287309646606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19362767785787582, + "step": 15888 + }, + { + "epoch": 0.3178, + "grad_norm": 1.921875, + "grad_norm_var": 0.01077880859375, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 1.9554992318153381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544971197843552, + "step": 15890 + }, + { + "epoch": 0.31784, + "grad_norm": 1.84375, + "grad_norm_var": 0.0327392578125, + "learning_rate": 0.0001, + "loss": 3.9083, + "loss/crossentropy": 1.5843109488487244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15569136291742325, + "step": 15892 + }, + { + "epoch": 0.31788, + "grad_norm": 2.109375, + "grad_norm_var": 0.0307525634765625, + "learning_rate": 0.0001, + "loss": 4.1213, + "loss/crossentropy": 2.2000880241394043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167157679796219, + "step": 15894 + }, + { + "epoch": 0.31792, + "grad_norm": 1.9765625, + "grad_norm_var": 0.031870269775390626, + "learning_rate": 0.0001, + "loss": 4.265, + "loss/crossentropy": 2.2597590684890747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171507239341736, + "step": 15896 + }, + { + "epoch": 0.31796, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03181330362955729, + "learning_rate": 0.0001, + "loss": 3.9516, + "loss/crossentropy": 2.0513627529144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21095634251832962, + "step": 15898 + }, + { + "epoch": 0.318, + "grad_norm": 1.9765625, + "grad_norm_var": 0.029271443684895832, + "learning_rate": 0.0001, + "loss": 4.0667, + "loss/crossentropy": 2.1822222471237183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117551624774933, + "step": 15900 + }, + { + "epoch": 0.31804, + "grad_norm": 2.125, + "grad_norm_var": 0.027784983317057293, + "learning_rate": 0.0001, + "loss": 4.3526, + "loss/crossentropy": 1.9500460624694824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769241452217102, + "step": 15902 + }, + { + "epoch": 0.31808, + "grad_norm": 2.0625, + "grad_norm_var": 0.02684326171875, + "learning_rate": 0.0001, + "loss": 4.2862, + "loss/crossentropy": 2.179791212081909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147693157196045, + "step": 15904 + }, + { + "epoch": 0.31812, + "grad_norm": 2.03125, + "grad_norm_var": 0.025217437744140626, + "learning_rate": 0.0001, + "loss": 4.0485, + "loss/crossentropy": 1.939364731311798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18659047782421112, + "step": 15906 + }, + { + "epoch": 0.31816, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0059478759765625, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 2.219409167766571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19967788457870483, + "step": 15908 + }, + { + "epoch": 0.3182, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007741038004557292, + "learning_rate": 0.0001, + "loss": 3.829, + "loss/crossentropy": 1.8363550901412964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18773943930864334, + "step": 15910 + }, + { + "epoch": 0.31824, + "grad_norm": 2.0625, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 1.779226541519165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676184445619583, + "step": 15912 + }, + { + "epoch": 0.31828, + "grad_norm": 1.953125, + "grad_norm_var": 0.006172688802083334, + "learning_rate": 0.0001, + "loss": 3.9532, + "loss/crossentropy": 2.1175760626792908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20498017221689224, + "step": 15914 + }, + { + "epoch": 0.31832, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005893707275390625, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.0964329838752747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898122876882553, + "step": 15916 + }, + { + "epoch": 0.31836, + "grad_norm": 1.90625, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 3.9093, + "loss/crossentropy": 1.9687228798866272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931748390197754, + "step": 15918 + }, + { + "epoch": 0.3184, + "grad_norm": 2.015625, + "grad_norm_var": 0.005197906494140625, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 1.8344767689704895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19926829636096954, + "step": 15920 + }, + { + "epoch": 0.31844, + "grad_norm": 1.984375, + "grad_norm_var": 0.004443359375, + "learning_rate": 0.0001, + "loss": 4.0426, + "loss/crossentropy": 2.154146194458008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21284651011228561, + "step": 15922 + }, + { + "epoch": 0.31848, + "grad_norm": 2.03125, + "grad_norm_var": 0.004223378499348959, + "learning_rate": 0.0001, + "loss": 4.2112, + "loss/crossentropy": 1.877986490726471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18935512751340866, + "step": 15924 + }, + { + "epoch": 0.31852, + "grad_norm": 2.171875, + "grad_norm_var": 0.009297434488932292, + "learning_rate": 0.0001, + "loss": 4.4296, + "loss/crossentropy": 2.361938714981079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23210977017879486, + "step": 15926 + }, + { + "epoch": 0.31856, + "grad_norm": 2.109375, + "grad_norm_var": 0.01043701171875, + "learning_rate": 0.0001, + "loss": 4.1209, + "loss/crossentropy": 2.3886083364486694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222661972045898, + "step": 15928 + }, + { + "epoch": 0.3186, + "grad_norm": 2.25, + "grad_norm_var": 0.014806874593098958, + "learning_rate": 0.0001, + "loss": 4.1164, + "loss/crossentropy": 2.073238492012024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130325853824615, + "step": 15930 + }, + { + "epoch": 0.31864, + "grad_norm": 2.125, + "grad_norm_var": 0.015569814046223958, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 1.9937713742256165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199419766664505, + "step": 15932 + }, + { + "epoch": 0.31868, + "grad_norm": 1.96875, + "grad_norm_var": 0.0135894775390625, + "learning_rate": 0.0001, + "loss": 4.0594, + "loss/crossentropy": 2.003056764602661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682510405778885, + "step": 15934 + }, + { + "epoch": 0.31872, + "grad_norm": 1.984375, + "grad_norm_var": 0.0137359619140625, + "learning_rate": 0.0001, + "loss": 3.9555, + "loss/crossentropy": 1.7363762259483337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17377281934022903, + "step": 15936 + }, + { + "epoch": 0.31876, + "grad_norm": 2.0, + "grad_norm_var": 0.014412180582682291, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 1.9791433811187744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24057063460350037, + "step": 15938 + }, + { + "epoch": 0.3188, + "grad_norm": 2.0625, + "grad_norm_var": 0.011378733317057292, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 1.9962583780288696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19619113206863403, + "step": 15940 + }, + { + "epoch": 0.31884, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01053466796875, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 1.8568945527076721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979837343096733, + "step": 15942 + }, + { + "epoch": 0.31888, + "grad_norm": 2.09375, + "grad_norm_var": 0.008991495768229166, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 2.031070291996002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951117366552353, + "step": 15944 + }, + { + "epoch": 0.31892, + "grad_norm": 1.9375, + "grad_norm_var": 0.007916005452473958, + "learning_rate": 0.0001, + "loss": 4.0139, + "loss/crossentropy": 2.2226544618606567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21066214889287949, + "step": 15946 + }, + { + "epoch": 0.31896, + "grad_norm": 1.890625, + "grad_norm_var": 0.007783762613932292, + "learning_rate": 0.0001, + "loss": 4.0275, + "loss/crossentropy": 2.090229034423828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225972518324852, + "step": 15948 + }, + { + "epoch": 0.319, + "grad_norm": 1.90625, + "grad_norm_var": 0.0080078125, + "learning_rate": 0.0001, + "loss": 3.9792, + "loss/crossentropy": 1.8562734723091125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17633049190044403, + "step": 15950 + }, + { + "epoch": 0.31904, + "grad_norm": 2.046875, + "grad_norm_var": 0.00992431640625, + "learning_rate": 0.0001, + "loss": 4.0227, + "loss/crossentropy": 1.6677230596542358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16521989554166794, + "step": 15952 + }, + { + "epoch": 0.31908, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008939361572265625, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.1918782591819763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21977706253528595, + "step": 15954 + }, + { + "epoch": 0.31912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008701324462890625, + "learning_rate": 0.0001, + "loss": 3.8932, + "loss/crossentropy": 1.8623137474060059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937999278306961, + "step": 15956 + }, + { + "epoch": 0.31916, + "grad_norm": 2.296875, + "grad_norm_var": 0.014839426676432291, + "learning_rate": 0.0001, + "loss": 4.2378, + "loss/crossentropy": 2.262889266014099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410346448421478, + "step": 15958 + }, + { + "epoch": 0.3192, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014654286702473958, + "learning_rate": 0.0001, + "loss": 4.0487, + "loss/crossentropy": 1.9620846509933472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978064849972725, + "step": 15960 + }, + { + "epoch": 0.31924, + "grad_norm": 1.9375, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 4.2746, + "loss/crossentropy": 2.453945517539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21578459441661835, + "step": 15962 + }, + { + "epoch": 0.31928, + "grad_norm": 1.984375, + "grad_norm_var": 0.013529459635416666, + "learning_rate": 0.0001, + "loss": 3.9692, + "loss/crossentropy": 1.9565781354904175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17789364606142044, + "step": 15964 + }, + { + "epoch": 0.31932, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012589518229166667, + "learning_rate": 0.0001, + "loss": 4.3504, + "loss/crossentropy": 2.1175976991653442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2239348292350769, + "step": 15966 + }, + { + "epoch": 0.31936, + "grad_norm": 1.984375, + "grad_norm_var": 0.010554758707682292, + "learning_rate": 0.0001, + "loss": 4.0815, + "loss/crossentropy": 1.8919953107833862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832338392734528, + "step": 15968 + }, + { + "epoch": 0.3194, + "grad_norm": 2.015625, + "grad_norm_var": 0.0126220703125, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 1.9757861495018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19513867795467377, + "step": 15970 + }, + { + "epoch": 0.31944, + "grad_norm": 1.90625, + "grad_norm_var": 0.012630208333333334, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 1.8716764450073242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18571900576353073, + "step": 15972 + }, + { + "epoch": 0.31948, + "grad_norm": 1.984375, + "grad_norm_var": 0.006575520833333333, + "learning_rate": 0.0001, + "loss": 4.2572, + "loss/crossentropy": 2.251496374607086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2199217826128006, + "step": 15974 + }, + { + "epoch": 0.31952, + "grad_norm": 2.140625, + "grad_norm_var": 0.008424631754557292, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 1.8639826774597168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18889226019382477, + "step": 15976 + }, + { + "epoch": 0.31956, + "grad_norm": 1.890625, + "grad_norm_var": 0.008499908447265624, + "learning_rate": 0.0001, + "loss": 3.8386, + "loss/crossentropy": 1.824375331401825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809096783399582, + "step": 15978 + }, + { + "epoch": 0.3196, + "grad_norm": 2.0625, + "grad_norm_var": 0.010361480712890624, + "learning_rate": 0.0001, + "loss": 4.4576, + "loss/crossentropy": 2.357889413833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762560307979584, + "step": 15980 + }, + { + "epoch": 0.31964, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011527252197265626, + "learning_rate": 0.0001, + "loss": 4.0706, + "loss/crossentropy": 2.0944892168045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18413915485143661, + "step": 15982 + }, + { + "epoch": 0.31968, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011736806233723958, + "learning_rate": 0.0001, + "loss": 3.9458, + "loss/crossentropy": 1.8600184321403503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18580162525177002, + "step": 15984 + }, + { + "epoch": 0.31972, + "grad_norm": 2.03125, + "grad_norm_var": 0.008697255452473959, + "learning_rate": 0.0001, + "loss": 4.1152, + "loss/crossentropy": 1.9213852882385254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599720627069473, + "step": 15986 + }, + { + "epoch": 0.31976, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0119537353515625, + "learning_rate": 0.0001, + "loss": 4.2187, + "loss/crossentropy": 2.0295584201812744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125483900308609, + "step": 15988 + }, + { + "epoch": 0.3198, + "grad_norm": 2.09375, + "grad_norm_var": 0.012198638916015626, + "learning_rate": 0.0001, + "loss": 4.2448, + "loss/crossentropy": 2.148836612701416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22027160972356796, + "step": 15990 + }, + { + "epoch": 0.31984, + "grad_norm": 2.03125, + "grad_norm_var": 0.010201009114583333, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 1.9927499294281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413245260715485, + "step": 15992 + }, + { + "epoch": 0.31988, + "grad_norm": 2.21875, + "grad_norm_var": 0.0113433837890625, + "learning_rate": 0.0001, + "loss": 4.2675, + "loss/crossentropy": 2.090071678161621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21670149266719818, + "step": 15994 + }, + { + "epoch": 0.31992, + "grad_norm": 1.953125, + "grad_norm_var": 0.010074869791666666, + "learning_rate": 0.0001, + "loss": 4.1333, + "loss/crossentropy": 1.8375160098075867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938413679599762, + "step": 15996 + }, + { + "epoch": 0.31996, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010498046875, + "learning_rate": 0.0001, + "loss": 3.8915, + "loss/crossentropy": 1.5196507573127747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16407200694084167, + "step": 15998 + }, + { + "epoch": 0.32, + "grad_norm": 1.859375, + "grad_norm_var": 0.011344146728515626, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 2.0738461017608643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21532364934682846, + "step": 16000 + }, + { + "epoch": 0.32004, + "grad_norm": 1.984375, + "grad_norm_var": 0.011948394775390624, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 2.3243744373321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23544494807720184, + "step": 16002 + }, + { + "epoch": 0.32008, + "grad_norm": 2.046875, + "grad_norm_var": 0.009862263997395834, + "learning_rate": 0.0001, + "loss": 3.9637, + "loss/crossentropy": 1.7145931124687195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816524863243103, + "step": 16004 + }, + { + "epoch": 0.32012, + "grad_norm": 2.0625, + "grad_norm_var": 0.010506184895833333, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 2.369017481803894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978877484798431, + "step": 16006 + }, + { + "epoch": 0.32016, + "grad_norm": 1.984375, + "grad_norm_var": 0.010276031494140626, + "learning_rate": 0.0001, + "loss": 4.2283, + "loss/crossentropy": 2.2246848344802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21983492374420166, + "step": 16008 + }, + { + "epoch": 0.3202, + "grad_norm": 2.671875, + "grad_norm_var": 0.03774185180664062, + "learning_rate": 0.0001, + "loss": 4.4077, + "loss/crossentropy": 2.0235647559165955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22780708968639374, + "step": 16010 + }, + { + "epoch": 0.32024, + "grad_norm": 2.015625, + "grad_norm_var": 0.03724543253580729, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 2.3185853958129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22308620065450668, + "step": 16012 + }, + { + "epoch": 0.32028, + "grad_norm": 2.109375, + "grad_norm_var": 0.0337890625, + "learning_rate": 0.0001, + "loss": 4.1314, + "loss/crossentropy": 1.8504603505134583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909484937787056, + "step": 16014 + }, + { + "epoch": 0.32032, + "grad_norm": 1.859375, + "grad_norm_var": 0.03502604166666667, + "learning_rate": 0.0001, + "loss": 3.8247, + "loss/crossentropy": 1.7095224857330322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18588167428970337, + "step": 16016 + }, + { + "epoch": 0.32036, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03749974568684896, + "learning_rate": 0.0001, + "loss": 4.1015, + "loss/crossentropy": 1.8398523330688477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18085161596536636, + "step": 16018 + }, + { + "epoch": 0.3204, + "grad_norm": 1.921875, + "grad_norm_var": 0.03569717407226562, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.297620415687561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20937249064445496, + "step": 16020 + }, + { + "epoch": 0.32044, + "grad_norm": 1.984375, + "grad_norm_var": 0.03618545532226562, + "learning_rate": 0.0001, + "loss": 4.3518, + "loss/crossentropy": 1.9124106764793396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072141021490097, + "step": 16022 + }, + { + "epoch": 0.32048, + "grad_norm": 1.921875, + "grad_norm_var": 0.03752415974934896, + "learning_rate": 0.0001, + "loss": 4.2695, + "loss/crossentropy": 2.119450092315674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20875975489616394, + "step": 16024 + }, + { + "epoch": 0.32052, + "grad_norm": 2.125, + "grad_norm_var": 0.008125559488932291, + "learning_rate": 0.0001, + "loss": 4.35, + "loss/crossentropy": 1.722611904144287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751066967844963, + "step": 16026 + }, + { + "epoch": 0.32056, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008760579427083333, + "learning_rate": 0.0001, + "loss": 4.0689, + "loss/crossentropy": 2.246767997741699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999046877026558, + "step": 16028 + }, + { + "epoch": 0.3206, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 3.9949, + "loss/crossentropy": 2.0449349880218506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969577819108963, + "step": 16030 + }, + { + "epoch": 0.32064, + "grad_norm": 2.140625, + "grad_norm_var": 0.010050201416015625, + "learning_rate": 0.0001, + "loss": 4.1257, + "loss/crossentropy": 2.125577926635742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22252507507801056, + "step": 16032 + }, + { + "epoch": 0.32068, + "grad_norm": 1.984375, + "grad_norm_var": 0.009079742431640624, + "learning_rate": 0.0001, + "loss": 3.639, + "loss/crossentropy": 1.8161216974258423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17801867425441742, + "step": 16034 + }, + { + "epoch": 0.32072, + "grad_norm": 2.0, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 3.9475, + "loss/crossentropy": 2.0116894841194153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890849500894547, + "step": 16036 + }, + { + "epoch": 0.32076, + "grad_norm": 1.875, + "grad_norm_var": 0.0100982666015625, + "learning_rate": 0.0001, + "loss": 3.8274, + "loss/crossentropy": 1.505588710308075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17151623219251633, + "step": 16038 + }, + { + "epoch": 0.3208, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008585357666015625, + "learning_rate": 0.0001, + "loss": 4.2162, + "loss/crossentropy": 2.1405014991760254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036641389131546, + "step": 16040 + }, + { + "epoch": 0.32084, + "grad_norm": 1.953125, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 2.1562809348106384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301954984664917, + "step": 16042 + }, + { + "epoch": 0.32088, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 3.9773, + "loss/crossentropy": 1.9317167401313782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19167031347751617, + "step": 16044 + }, + { + "epoch": 0.32092, + "grad_norm": 1.796875, + "grad_norm_var": 0.007330067952473958, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 1.6257455348968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1676744669675827, + "step": 16046 + }, + { + "epoch": 0.32096, + "grad_norm": 2.109375, + "grad_norm_var": 0.006640625, + "learning_rate": 0.0001, + "loss": 4.2454, + "loss/crossentropy": 2.0711347460746765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268730029463768, + "step": 16048 + }, + { + "epoch": 0.321, + "grad_norm": 2.03125, + "grad_norm_var": 0.0071489969889322914, + "learning_rate": 0.0001, + "loss": 4.3333, + "loss/crossentropy": 2.173452377319336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981828361749649, + "step": 16050 + }, + { + "epoch": 0.32104, + "grad_norm": 1.953125, + "grad_norm_var": 0.009593709309895834, + "learning_rate": 0.0001, + "loss": 4.4601, + "loss/crossentropy": 2.348356604576111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2538425847887993, + "step": 16052 + }, + { + "epoch": 0.32108, + "grad_norm": 1.984375, + "grad_norm_var": 0.0089263916015625, + "learning_rate": 0.0001, + "loss": 4.3408, + "loss/crossentropy": 2.5441235303878784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22638535499572754, + "step": 16054 + }, + { + "epoch": 0.32112, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0098876953125, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 2.129339337348938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20430771261453629, + "step": 16056 + }, + { + "epoch": 0.32116, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01142578125, + "learning_rate": 0.0001, + "loss": 3.9048, + "loss/crossentropy": 2.1599501371383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20718347281217575, + "step": 16058 + }, + { + "epoch": 0.3212, + "grad_norm": 1.984375, + "grad_norm_var": 0.010296376546223958, + "learning_rate": 0.0001, + "loss": 4.2746, + "loss/crossentropy": 2.2704890966415405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22025802731513977, + "step": 16060 + }, + { + "epoch": 0.32124, + "grad_norm": 1.796875, + "grad_norm_var": 0.011702473958333333, + "learning_rate": 0.0001, + "loss": 3.7282, + "loss/crossentropy": 1.7704021334648132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17811128497123718, + "step": 16062 + }, + { + "epoch": 0.32128, + "grad_norm": 2.1875, + "grad_norm_var": 0.015705362955729166, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 1.8030991554260254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20078134536743164, + "step": 16064 + }, + { + "epoch": 0.32132, + "grad_norm": 2.015625, + "grad_norm_var": 0.0161041259765625, + "learning_rate": 0.0001, + "loss": 3.9832, + "loss/crossentropy": 2.14484703540802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21000836789608002, + "step": 16066 + }, + { + "epoch": 0.32136, + "grad_norm": 2.203125, + "grad_norm_var": 0.016521962483723958, + "learning_rate": 0.0001, + "loss": 4.0812, + "loss/crossentropy": 2.1680833101272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22287416458129883, + "step": 16068 + }, + { + "epoch": 0.3214, + "grad_norm": 2.078125, + "grad_norm_var": 0.01708958943684896, + "learning_rate": 0.0001, + "loss": 4.1355, + "loss/crossentropy": 1.9971441626548767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20709602534770966, + "step": 16070 + }, + { + "epoch": 0.32144, + "grad_norm": 2.09375, + "grad_norm_var": 0.016462198893229165, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 2.180538833141327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097538262605667, + "step": 16072 + }, + { + "epoch": 0.32148, + "grad_norm": 2.046875, + "grad_norm_var": 0.013529459635416666, + "learning_rate": 0.0001, + "loss": 4.0725, + "loss/crossentropy": 2.074296534061432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20961299538612366, + "step": 16074 + }, + { + "epoch": 0.32152, + "grad_norm": 1.9375, + "grad_norm_var": 0.014998372395833333, + "learning_rate": 0.0001, + "loss": 4.3, + "loss/crossentropy": 2.0936360359191895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18253444135189056, + "step": 16076 + }, + { + "epoch": 0.32156, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0121337890625, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 2.1265164613723755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127028927206993, + "step": 16078 + }, + { + "epoch": 0.3216, + "grad_norm": 2.0, + "grad_norm_var": 0.008410390218098958, + "learning_rate": 0.0001, + "loss": 4.1907, + "loss/crossentropy": 2.0804547667503357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23104286193847656, + "step": 16080 + }, + { + "epoch": 0.32164, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007759602864583334, + "learning_rate": 0.0001, + "loss": 3.9653, + "loss/crossentropy": 1.9458459615707397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18746595829725266, + "step": 16082 + }, + { + "epoch": 0.32168, + "grad_norm": 1.96875, + "grad_norm_var": 0.0056793212890625, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 2.186868667602539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20350559055805206, + "step": 16084 + }, + { + "epoch": 0.32172, + "grad_norm": 1.953125, + "grad_norm_var": 0.007340494791666667, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 1.5468108654022217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710693180561066, + "step": 16086 + }, + { + "epoch": 0.32176, + "grad_norm": 2.0, + "grad_norm_var": 0.006180572509765625, + "learning_rate": 0.0001, + "loss": 4.2635, + "loss/crossentropy": 2.222296118736267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22016968578100204, + "step": 16088 + }, + { + "epoch": 0.3218, + "grad_norm": 1.984375, + "grad_norm_var": 0.005619049072265625, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 1.7950996160507202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892552226781845, + "step": 16090 + }, + { + "epoch": 0.32184, + "grad_norm": 2.15625, + "grad_norm_var": 0.007120768229166667, + "learning_rate": 0.0001, + "loss": 4.2899, + "loss/crossentropy": 2.293117642402649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2351122871041298, + "step": 16092 + }, + { + "epoch": 0.32188, + "grad_norm": 1.890625, + "grad_norm_var": 0.006208292643229167, + "learning_rate": 0.0001, + "loss": 3.8235, + "loss/crossentropy": 1.6602438688278198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18052196502685547, + "step": 16094 + }, + { + "epoch": 0.32192, + "grad_norm": 2.140625, + "grad_norm_var": 0.008501942952473958, + "learning_rate": 0.0001, + "loss": 4.114, + "loss/crossentropy": 1.9335945844650269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585428178310394, + "step": 16096 + }, + { + "epoch": 0.32196, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011378733317057292, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.0107831358909607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19297368824481964, + "step": 16098 + }, + { + "epoch": 0.322, + "grad_norm": 2.0, + "grad_norm_var": 0.011801910400390626, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 1.9804013967514038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17699319869279861, + "step": 16100 + }, + { + "epoch": 0.32204, + "grad_norm": 1.9375, + "grad_norm_var": 0.01048583984375, + "learning_rate": 0.0001, + "loss": 4.1069, + "loss/crossentropy": 1.9009913206100464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931428462266922, + "step": 16102 + }, + { + "epoch": 0.32208, + "grad_norm": 2.140625, + "grad_norm_var": 0.011358388264973958, + "learning_rate": 0.0001, + "loss": 4.5134, + "loss/crossentropy": 1.9558063745498657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139815017580986, + "step": 16104 + }, + { + "epoch": 0.32212, + "grad_norm": 2.1875, + "grad_norm_var": 0.013323720296223958, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.0626373887062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022782266139984, + "step": 16106 + }, + { + "epoch": 0.32216, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013814036051432292, + "learning_rate": 0.0001, + "loss": 3.9408, + "loss/crossentropy": 2.166835308074951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22070696204900742, + "step": 16108 + }, + { + "epoch": 0.3222, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.1382, + "loss/crossentropy": 2.2970025539398193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23291311413049698, + "step": 16110 + }, + { + "epoch": 0.32224, + "grad_norm": 1.921875, + "grad_norm_var": 0.012174224853515625, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 1.9400970935821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199058435857296, + "step": 16112 + }, + { + "epoch": 0.32228, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009110260009765624, + "learning_rate": 0.0001, + "loss": 3.9659, + "loss/crossentropy": 1.6467864513397217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19500216841697693, + "step": 16114 + }, + { + "epoch": 0.32232, + "grad_norm": 2.03125, + "grad_norm_var": 0.010546875, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.477790355682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22540342807769775, + "step": 16116 + }, + { + "epoch": 0.32236, + "grad_norm": 1.875, + "grad_norm_var": 0.011066691080729166, + "learning_rate": 0.0001, + "loss": 4.0661, + "loss/crossentropy": 1.946933627128601, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17866672575473785, + "step": 16118 + }, + { + "epoch": 0.3224, + "grad_norm": 1.859375, + "grad_norm_var": 0.01153564453125, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 2.4484145641326904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21314629167318344, + "step": 16120 + }, + { + "epoch": 0.32244, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009155019124348959, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 1.9663755893707275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814299046993256, + "step": 16122 + }, + { + "epoch": 0.32248, + "grad_norm": 2.125, + "grad_norm_var": 0.5887278238932292, + "learning_rate": 0.0001, + "loss": 3.7417, + "loss/crossentropy": 1.9786240458488464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666218757629395, + "step": 16124 + }, + { + "epoch": 0.32252, + "grad_norm": 1.9453125, + "grad_norm_var": 0.5901079813639323, + "learning_rate": 0.0001, + "loss": 3.9504, + "loss/crossentropy": 1.90617835521698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803947538137436, + "step": 16126 + }, + { + "epoch": 0.32256, + "grad_norm": 2.03125, + "grad_norm_var": 0.5877764383951823, + "learning_rate": 0.0001, + "loss": 4.3267, + "loss/crossentropy": 2.1199004650115967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21409574151039124, + "step": 16128 + }, + { + "epoch": 0.3226, + "grad_norm": 1.8984375, + "grad_norm_var": 0.5875445048014323, + "learning_rate": 0.0001, + "loss": 4.184, + "loss/crossentropy": 2.081290364265442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20103489607572556, + "step": 16130 + }, + { + "epoch": 0.32264, + "grad_norm": 1.9375, + "grad_norm_var": 0.5902565002441407, + "learning_rate": 0.0001, + "loss": 4.216, + "loss/crossentropy": 1.9839922785758972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679790526628494, + "step": 16132 + }, + { + "epoch": 0.32268, + "grad_norm": 2.078125, + "grad_norm_var": 0.5830800374348958, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.2765486240386963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012597531080246, + "step": 16134 + }, + { + "epoch": 0.32272, + "grad_norm": 2.125, + "grad_norm_var": 0.57388916015625, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 1.8353520035743713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19367212802171707, + "step": 16136 + }, + { + "epoch": 0.32276, + "grad_norm": 1.8984375, + "grad_norm_var": 0.5781572977701823, + "learning_rate": 0.0001, + "loss": 3.8104, + "loss/crossentropy": 1.8009640574455261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18030469119548798, + "step": 16138 + }, + { + "epoch": 0.3228, + "grad_norm": 2.140625, + "grad_norm_var": 0.020975494384765626, + "learning_rate": 0.0001, + "loss": 4.1914, + "loss/crossentropy": 1.865959644317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20606407523155212, + "step": 16140 + }, + { + "epoch": 0.32284, + "grad_norm": 2.109375, + "grad_norm_var": 0.021445465087890626, + "learning_rate": 0.0001, + "loss": 4.0253, + "loss/crossentropy": 2.238165020942688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22822897136211395, + "step": 16142 + }, + { + "epoch": 0.32288, + "grad_norm": 1.90625, + "grad_norm_var": 0.02393366495768229, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 2.175424814224243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2260037586092949, + "step": 16144 + }, + { + "epoch": 0.32292, + "grad_norm": 1.9453125, + "grad_norm_var": 0.025050608317057292, + "learning_rate": 0.0001, + "loss": 4.1493, + "loss/crossentropy": 2.2071722745895386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23026321083307266, + "step": 16146 + }, + { + "epoch": 0.32296, + "grad_norm": 2.109375, + "grad_norm_var": 0.024468739827473957, + "learning_rate": 0.0001, + "loss": 4.2384, + "loss/crossentropy": 2.211812973022461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882735401391983, + "step": 16148 + }, + { + "epoch": 0.323, + "grad_norm": 2.046875, + "grad_norm_var": 0.02444636027018229, + "learning_rate": 0.0001, + "loss": 4.0356, + "loss/crossentropy": 1.6503818035125732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1640765517950058, + "step": 16150 + }, + { + "epoch": 0.32304, + "grad_norm": 2.015625, + "grad_norm_var": 0.02520726521809896, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.9449399709701538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1876515969634056, + "step": 16152 + }, + { + "epoch": 0.32308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.022581990559895834, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 2.001387894153595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21191075444221497, + "step": 16154 + }, + { + "epoch": 0.32312, + "grad_norm": 2.234375, + "grad_norm_var": 0.0143218994140625, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 2.1146541833877563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2692951038479805, + "step": 16156 + }, + { + "epoch": 0.32316, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015602366129557291, + "learning_rate": 0.0001, + "loss": 4.1534, + "loss/crossentropy": 2.0765844583511353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518104195594788, + "step": 16158 + }, + { + "epoch": 0.3232, + "grad_norm": 2.0, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.5394046306610107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22651594877243042, + "step": 16160 + }, + { + "epoch": 0.32324, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011702473958333333, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.0140721797943115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18245946615934372, + "step": 16162 + }, + { + "epoch": 0.32328, + "grad_norm": 1.9375, + "grad_norm_var": 0.012303670247395834, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 2.472638249397278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23127944022417068, + "step": 16164 + }, + { + "epoch": 0.32332, + "grad_norm": 2.03125, + "grad_norm_var": 0.014924112955729167, + "learning_rate": 0.0001, + "loss": 4.1974, + "loss/crossentropy": 2.1551883220672607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093031033873558, + "step": 16166 + }, + { + "epoch": 0.32336, + "grad_norm": 2.25, + "grad_norm_var": 0.01641845703125, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 2.042950928211212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049141526222229, + "step": 16168 + }, + { + "epoch": 0.3234, + "grad_norm": 2.03125, + "grad_norm_var": 0.014849599202473958, + "learning_rate": 0.0001, + "loss": 4.4795, + "loss/crossentropy": 2.0262559056282043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18649922311306, + "step": 16170 + }, + { + "epoch": 0.32344, + "grad_norm": 1.984375, + "grad_norm_var": 0.011675771077473958, + "learning_rate": 0.0001, + "loss": 4.0477, + "loss/crossentropy": 2.3014276027679443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21770215779542923, + "step": 16172 + }, + { + "epoch": 0.32348, + "grad_norm": 2.078125, + "grad_norm_var": 0.009349568684895834, + "learning_rate": 0.0001, + "loss": 4.2802, + "loss/crossentropy": 2.213370680809021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18508044630289078, + "step": 16174 + }, + { + "epoch": 0.32352, + "grad_norm": 2.03125, + "grad_norm_var": 0.009075673421223958, + "learning_rate": 0.0001, + "loss": 4.2529, + "loss/crossentropy": 1.973215639591217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937488242983818, + "step": 16176 + }, + { + "epoch": 0.32356, + "grad_norm": 2.09375, + "grad_norm_var": 0.018553670247395834, + "learning_rate": 0.0001, + "loss": 4.3534, + "loss/crossentropy": 2.1585731506347656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029733881354332, + "step": 16178 + }, + { + "epoch": 0.3236, + "grad_norm": 2.359375, + "grad_norm_var": 0.021647135416666668, + "learning_rate": 0.0001, + "loss": 4.3324, + "loss/crossentropy": 2.3878796100616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985762238502502, + "step": 16180 + }, + { + "epoch": 0.32364, + "grad_norm": 1.84375, + "grad_norm_var": 0.027860514322916665, + "learning_rate": 0.0001, + "loss": 3.8043, + "loss/crossentropy": 2.07839834690094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994347795844078, + "step": 16182 + }, + { + "epoch": 0.32368, + "grad_norm": 1.953125, + "grad_norm_var": 0.02607421875, + "learning_rate": 0.0001, + "loss": 4.3147, + "loss/crossentropy": 2.1800806522369385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21444538235664368, + "step": 16184 + }, + { + "epoch": 0.32372, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029352823893229168, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 2.0900917053222656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476030558347702, + "step": 16186 + }, + { + "epoch": 0.32376, + "grad_norm": 2.0625, + "grad_norm_var": 0.02899169921875, + "learning_rate": 0.0001, + "loss": 4.209, + "loss/crossentropy": 2.1523303985595703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20984850078821182, + "step": 16188 + }, + { + "epoch": 0.3238, + "grad_norm": 1.765625, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 3.799, + "loss/crossentropy": 1.8808646202087402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18587414175271988, + "step": 16190 + }, + { + "epoch": 0.32384, + "grad_norm": 1.953125, + "grad_norm_var": 0.03593648274739583, + "learning_rate": 0.0001, + "loss": 4.1002, + "loss/crossentropy": 1.981968104839325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20611849427223206, + "step": 16192 + }, + { + "epoch": 0.32388, + "grad_norm": 2.03125, + "grad_norm_var": 0.020951334635416666, + "learning_rate": 0.0001, + "loss": 4.2811, + "loss/crossentropy": 2.1454352140426636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21696189790964127, + "step": 16194 + }, + { + "epoch": 0.32392, + "grad_norm": 2.078125, + "grad_norm_var": 0.011905924479166666, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 2.026822328567505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112257331609726, + "step": 16196 + }, + { + "epoch": 0.32396, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009706370035807292, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.332329034805298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20129209011793137, + "step": 16198 + }, + { + "epoch": 0.324, + "grad_norm": 2.046875, + "grad_norm_var": 0.007260894775390625, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.271657705307007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2460309863090515, + "step": 16200 + }, + { + "epoch": 0.32404, + "grad_norm": 2.0, + "grad_norm_var": 0.0064084370930989586, + "learning_rate": 0.0001, + "loss": 4.0835, + "loss/crossentropy": 2.0890920162200928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132192626595497, + "step": 16202 + }, + { + "epoch": 0.32408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007226308186848958, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 1.8368538618087769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891050636768341, + "step": 16204 + }, + { + "epoch": 0.32412, + "grad_norm": 2.0625, + "grad_norm_var": 0.004375966389973959, + "learning_rate": 0.0001, + "loss": 4.2859, + "loss/crossentropy": 2.3123772144317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22660605609416962, + "step": 16206 + }, + { + "epoch": 0.32416, + "grad_norm": 2.125, + "grad_norm_var": 0.005411529541015625, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.21794331073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21767260879278183, + "step": 16208 + }, + { + "epoch": 0.3242, + "grad_norm": 2.0625, + "grad_norm_var": 0.010591379801432292, + "learning_rate": 0.0001, + "loss": 4.5407, + "loss/crossentropy": 2.169134736061096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172672376036644, + "step": 16210 + }, + { + "epoch": 0.32424, + "grad_norm": 1.921875, + "grad_norm_var": 0.011331939697265625, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.2830699682235718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22920189797878265, + "step": 16212 + }, + { + "epoch": 0.32428, + "grad_norm": 1.90625, + "grad_norm_var": 0.012770334879557291, + "learning_rate": 0.0001, + "loss": 3.9278, + "loss/crossentropy": 2.024571657180786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21096136420965195, + "step": 16214 + }, + { + "epoch": 0.32432, + "grad_norm": 1.9375, + "grad_norm_var": 0.013057200113932292, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.060720980167389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21043655276298523, + "step": 16216 + }, + { + "epoch": 0.32436, + "grad_norm": 2.078125, + "grad_norm_var": 0.014134724934895834, + "learning_rate": 0.0001, + "loss": 4.074, + "loss/crossentropy": 2.0332056283950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506454467773438, + "step": 16218 + }, + { + "epoch": 0.3244, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014021555582682291, + "learning_rate": 0.0001, + "loss": 3.8173, + "loss/crossentropy": 1.811396062374115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19359800219535828, + "step": 16220 + }, + { + "epoch": 0.32444, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013117472330729166, + "learning_rate": 0.0001, + "loss": 4.1846, + "loss/crossentropy": 2.074379801750183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21763048321008682, + "step": 16222 + }, + { + "epoch": 0.32448, + "grad_norm": 2.203125, + "grad_norm_var": 0.014682769775390625, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 2.1836538314819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814575254917145, + "step": 16224 + }, + { + "epoch": 0.32452, + "grad_norm": 1.953125, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.2866, + "loss/crossentropy": 2.365849494934082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24219170212745667, + "step": 16226 + }, + { + "epoch": 0.32456, + "grad_norm": 2.0625, + "grad_norm_var": 0.010343424479166667, + "learning_rate": 0.0001, + "loss": 4.1976, + "loss/crossentropy": 1.8738153576850891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152718901634216, + "step": 16228 + }, + { + "epoch": 0.3246, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010209147135416667, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.2568124532699585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21668314933776855, + "step": 16230 + }, + { + "epoch": 0.32464, + "grad_norm": 2.109375, + "grad_norm_var": 0.010900624593098958, + "learning_rate": 0.0001, + "loss": 4.2889, + "loss/crossentropy": 2.4037723541259766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225075364112854, + "step": 16232 + }, + { + "epoch": 0.32468, + "grad_norm": 1.765625, + "grad_norm_var": 0.01375732421875, + "learning_rate": 0.0001, + "loss": 3.7288, + "loss/crossentropy": 2.2305864095687866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381006807088852, + "step": 16234 + }, + { + "epoch": 0.32472, + "grad_norm": 2.03125, + "grad_norm_var": 0.013862864176432291, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.1115931272506714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20033665746450424, + "step": 16236 + }, + { + "epoch": 0.32476, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014054361979166667, + "learning_rate": 0.0001, + "loss": 4.1602, + "loss/crossentropy": 2.0450429916381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2080780565738678, + "step": 16238 + }, + { + "epoch": 0.3248, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011888631184895833, + "learning_rate": 0.0001, + "loss": 4.056, + "loss/crossentropy": 2.003769636154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560404658317566, + "step": 16240 + }, + { + "epoch": 0.32484, + "grad_norm": 2.125, + "grad_norm_var": 0.010383860270182291, + "learning_rate": 0.0001, + "loss": 4.3504, + "loss/crossentropy": 1.9452934265136719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063707485795021, + "step": 16242 + }, + { + "epoch": 0.32488, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012497711181640624, + "learning_rate": 0.0001, + "loss": 3.9172, + "loss/crossentropy": 2.1412742137908936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20292682200670242, + "step": 16244 + }, + { + "epoch": 0.32492, + "grad_norm": 1.96875, + "grad_norm_var": 0.011140950520833333, + "learning_rate": 0.0001, + "loss": 4.3025, + "loss/crossentropy": 2.2285404205322266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22571790218353271, + "step": 16246 + }, + { + "epoch": 0.32496, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010713704427083333, + "learning_rate": 0.0001, + "loss": 3.6582, + "loss/crossentropy": 1.82416570186615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1771545708179474, + "step": 16248 + }, + { + "epoch": 0.325, + "grad_norm": 2.109375, + "grad_norm_var": 0.010422515869140624, + "learning_rate": 0.0001, + "loss": 4.2533, + "loss/crossentropy": 2.1432559490203857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20037438720464706, + "step": 16250 + }, + { + "epoch": 0.32504, + "grad_norm": 2.234375, + "grad_norm_var": 0.015337880452473958, + "learning_rate": 0.0001, + "loss": 4.162, + "loss/crossentropy": 2.1962517499923706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22743911296129227, + "step": 16252 + }, + { + "epoch": 0.32508, + "grad_norm": 2.125, + "grad_norm_var": 0.01636530558268229, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 1.932455599308014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043299823999405, + "step": 16254 + }, + { + "epoch": 0.32512, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015824127197265624, + "learning_rate": 0.0001, + "loss": 3.9024, + "loss/crossentropy": 1.9702956676483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528628885746002, + "step": 16256 + }, + { + "epoch": 0.32516, + "grad_norm": 2.296875, + "grad_norm_var": 0.020475260416666665, + "learning_rate": 0.0001, + "loss": 4.0628, + "loss/crossentropy": 2.0042436718940735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861597567796707, + "step": 16258 + }, + { + "epoch": 0.3252, + "grad_norm": 2.28125, + "grad_norm_var": 0.019573720296223958, + "learning_rate": 0.0001, + "loss": 4.1876, + "loss/crossentropy": 2.3544296622276306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22385136783123016, + "step": 16260 + }, + { + "epoch": 0.32524, + "grad_norm": 1.984375, + "grad_norm_var": 0.01942723592122396, + "learning_rate": 0.0001, + "loss": 4.1435, + "loss/crossentropy": 2.1482596397399902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21662700176239014, + "step": 16262 + }, + { + "epoch": 0.32528, + "grad_norm": 1.96875, + "grad_norm_var": 0.01395263671875, + "learning_rate": 0.0001, + "loss": 4.3441, + "loss/crossentropy": 2.243022322654724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22729884833097458, + "step": 16264 + }, + { + "epoch": 0.32532, + "grad_norm": 1.875, + "grad_norm_var": 0.018656158447265626, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.2499197721481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20558901876211166, + "step": 16266 + }, + { + "epoch": 0.32536, + "grad_norm": 1.9140625, + "grad_norm_var": 0.022907511393229166, + "learning_rate": 0.0001, + "loss": 4.2313, + "loss/crossentropy": 1.6232356429100037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18864237517118454, + "step": 16268 + }, + { + "epoch": 0.3254, + "grad_norm": 1.96875, + "grad_norm_var": 0.02402521769205729, + "learning_rate": 0.0001, + "loss": 4.0435, + "loss/crossentropy": 1.559517502784729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704516038298607, + "step": 16270 + }, + { + "epoch": 0.32544, + "grad_norm": 1.890625, + "grad_norm_var": 0.02484308878580729, + "learning_rate": 0.0001, + "loss": 3.9018, + "loss/crossentropy": 1.7346046566963196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17380736768245697, + "step": 16272 + }, + { + "epoch": 0.32548, + "grad_norm": 2.21875, + "grad_norm_var": 0.02237523396809896, + "learning_rate": 0.0001, + "loss": 4.2894, + "loss/crossentropy": 2.1451315879821777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21778792887926102, + "step": 16274 + }, + { + "epoch": 0.32552, + "grad_norm": 2.25, + "grad_norm_var": 0.022684733072916668, + "learning_rate": 0.0001, + "loss": 3.8737, + "loss/crossentropy": 1.8013367056846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631768971681595, + "step": 16276 + }, + { + "epoch": 0.32556, + "grad_norm": 1.9609375, + "grad_norm_var": 0.025655110677083332, + "learning_rate": 0.0001, + "loss": 3.8573, + "loss/crossentropy": 1.8858280181884766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940329447388649, + "step": 16278 + }, + { + "epoch": 0.3256, + "grad_norm": 1.9375, + "grad_norm_var": 0.0259429931640625, + "learning_rate": 0.0001, + "loss": 4.1268, + "loss/crossentropy": 1.8475046157836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18596403300762177, + "step": 16280 + }, + { + "epoch": 0.32564, + "grad_norm": 2.0, + "grad_norm_var": 0.023636881510416666, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 2.2585566639900208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22430463135242462, + "step": 16282 + }, + { + "epoch": 0.32568, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015900675455729166, + "learning_rate": 0.0001, + "loss": 3.7224, + "loss/crossentropy": 1.7233783602714539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17318468540906906, + "step": 16284 + }, + { + "epoch": 0.32572, + "grad_norm": 1.9921875, + "grad_norm_var": 0.022172037760416666, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.174088716506958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21329404413700104, + "step": 16286 + }, + { + "epoch": 0.32576, + "grad_norm": 2.015625, + "grad_norm_var": 0.022345987955729167, + "learning_rate": 0.0001, + "loss": 3.8932, + "loss/crossentropy": 1.8016705513000488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20395102351903915, + "step": 16288 + }, + { + "epoch": 0.3258, + "grad_norm": 2.03125, + "grad_norm_var": 0.020182037353515626, + "learning_rate": 0.0001, + "loss": 3.8634, + "loss/crossentropy": 2.166901111602783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21237251162528992, + "step": 16290 + }, + { + "epoch": 0.32584, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014766184488932292, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.024773359298706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973796784877777, + "step": 16292 + }, + { + "epoch": 0.32588, + "grad_norm": 2.203125, + "grad_norm_var": 0.015466054280598959, + "learning_rate": 0.0001, + "loss": 4.173, + "loss/crossentropy": 2.457033157348633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21896487474441528, + "step": 16294 + }, + { + "epoch": 0.32592, + "grad_norm": 2.03125, + "grad_norm_var": 0.014180501302083334, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.4872595071792603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2348102256655693, + "step": 16296 + }, + { + "epoch": 0.32596, + "grad_norm": 2.03125, + "grad_norm_var": 0.014249420166015625, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.110987067222595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21079359203577042, + "step": 16298 + }, + { + "epoch": 0.326, + "grad_norm": 1.9375, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 4.2334, + "loss/crossentropy": 2.2166510820388794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19656993448734283, + "step": 16300 + }, + { + "epoch": 0.32604, + "grad_norm": 2.046875, + "grad_norm_var": 0.008125813802083333, + "learning_rate": 0.0001, + "loss": 4.0212, + "loss/crossentropy": 2.014153838157654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956234946846962, + "step": 16302 + }, + { + "epoch": 0.32608, + "grad_norm": 2.671875, + "grad_norm_var": 0.03588231404622396, + "learning_rate": 0.0001, + "loss": 4.3045, + "loss/crossentropy": 2.298485517501831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710477769374847, + "step": 16304 + }, + { + "epoch": 0.32612, + "grad_norm": 1.9921875, + "grad_norm_var": 0.033599599202473955, + "learning_rate": 0.0001, + "loss": 4.3134, + "loss/crossentropy": 2.233761191368103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21609390527009964, + "step": 16306 + }, + { + "epoch": 0.32616, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03675918579101563, + "learning_rate": 0.0001, + "loss": 3.9141, + "loss/crossentropy": 1.9988956451416016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924634352326393, + "step": 16308 + }, + { + "epoch": 0.3262, + "grad_norm": 1.96875, + "grad_norm_var": 0.035194651285807295, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 1.9185590147972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20295168459415436, + "step": 16310 + }, + { + "epoch": 0.32624, + "grad_norm": 2.0625, + "grad_norm_var": 0.03476155598958333, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.219870448112488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21425887942314148, + "step": 16312 + }, + { + "epoch": 0.32628, + "grad_norm": 2.21875, + "grad_norm_var": 0.03612442016601562, + "learning_rate": 0.0001, + "loss": 4.5119, + "loss/crossentropy": 2.219307541847229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22380203753709793, + "step": 16314 + }, + { + "epoch": 0.32632, + "grad_norm": 1.875, + "grad_norm_var": 0.03746515909830729, + "learning_rate": 0.0001, + "loss": 3.9652, + "loss/crossentropy": 1.7907224893569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18037232011556625, + "step": 16316 + }, + { + "epoch": 0.32636, + "grad_norm": 1.8203125, + "grad_norm_var": 0.03876927693684896, + "learning_rate": 0.0001, + "loss": 4.1337, + "loss/crossentropy": 1.9702708721160889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266448378562927, + "step": 16318 + }, + { + "epoch": 0.3264, + "grad_norm": 1.90625, + "grad_norm_var": 0.012481435139973959, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 2.1748844385147095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725325495004654, + "step": 16320 + }, + { + "epoch": 0.32644, + "grad_norm": 1.953125, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 4.2882, + "loss/crossentropy": 2.2458006143569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21462788432836533, + "step": 16322 + }, + { + "epoch": 0.32648, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009276326497395833, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.923665463924408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.223393976688385, + "step": 16324 + }, + { + "epoch": 0.32652, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010129547119140625, + "learning_rate": 0.0001, + "loss": 4.1643, + "loss/crossentropy": 2.36092209815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22316965460777283, + "step": 16326 + }, + { + "epoch": 0.32656, + "grad_norm": 2.046875, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.0778, + "loss/crossentropy": 1.7765440344810486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736016422510147, + "step": 16328 + }, + { + "epoch": 0.3266, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006638336181640625, + "learning_rate": 0.0001, + "loss": 4.4885, + "loss/crossentropy": 2.2588138580322266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21762139350175858, + "step": 16330 + }, + { + "epoch": 0.32664, + "grad_norm": 1.921875, + "grad_norm_var": 0.0064084370930989586, + "learning_rate": 0.0001, + "loss": 4.2476, + "loss/crossentropy": 2.177221417427063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19594799727201462, + "step": 16332 + }, + { + "epoch": 0.32668, + "grad_norm": 2.0625, + "grad_norm_var": 0.0051422119140625, + "learning_rate": 0.0001, + "loss": 3.941, + "loss/crossentropy": 1.8424673676490784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19562739878892899, + "step": 16334 + }, + { + "epoch": 0.32672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006473541259765625, + "learning_rate": 0.0001, + "loss": 3.861, + "loss/crossentropy": 1.8677524328231812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19708558171987534, + "step": 16336 + }, + { + "epoch": 0.32676, + "grad_norm": 1.859375, + "grad_norm_var": 0.007081858317057292, + "learning_rate": 0.0001, + "loss": 3.9222, + "loss/crossentropy": 1.9559081196784973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19585052132606506, + "step": 16338 + }, + { + "epoch": 0.3268, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006892649332682291, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 1.979565978050232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045053392648697, + "step": 16340 + }, + { + "epoch": 0.32684, + "grad_norm": 1.953125, + "grad_norm_var": 0.00714111328125, + "learning_rate": 0.0001, + "loss": 4.2517, + "loss/crossentropy": 2.296669840812683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207671657204628, + "step": 16342 + }, + { + "epoch": 0.32688, + "grad_norm": 2.0, + "grad_norm_var": 0.0065305074055989586, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 1.9962339401245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833384454250336, + "step": 16344 + }, + { + "epoch": 0.32692, + "grad_norm": 2.015625, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 4.1779, + "loss/crossentropy": 2.020018517971039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19721734523773193, + "step": 16346 + }, + { + "epoch": 0.32696, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006815592447916667, + "learning_rate": 0.0001, + "loss": 3.7614, + "loss/crossentropy": 2.112026810646057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460353046655655, + "step": 16348 + }, + { + "epoch": 0.327, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006540679931640625, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 2.2069387435913086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233232483267784, + "step": 16350 + }, + { + "epoch": 0.32704, + "grad_norm": 1.9375, + "grad_norm_var": 0.004792277018229167, + "learning_rate": 0.0001, + "loss": 3.8675, + "loss/crossentropy": 1.9365113377571106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625376164913177, + "step": 16352 + }, + { + "epoch": 0.32708, + "grad_norm": 2.125, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.2316179871559143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22712747752666473, + "step": 16354 + }, + { + "epoch": 0.32712, + "grad_norm": 2.0, + "grad_norm_var": 0.035672760009765624, + "learning_rate": 0.0001, + "loss": 3.9148, + "loss/crossentropy": 1.801742434501648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831187978386879, + "step": 16356 + }, + { + "epoch": 0.32716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03590087890625, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 2.157936453819275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520215272903442, + "step": 16358 + }, + { + "epoch": 0.3272, + "grad_norm": 2.03125, + "grad_norm_var": 0.035835520426432295, + "learning_rate": 0.0001, + "loss": 4.0743, + "loss/crossentropy": 1.6054654717445374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1769917830824852, + "step": 16360 + }, + { + "epoch": 0.32724, + "grad_norm": 2.15625, + "grad_norm_var": 0.039613596598307294, + "learning_rate": 0.0001, + "loss": 4.0751, + "loss/crossentropy": 2.0573782324790955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550390660762787, + "step": 16362 + }, + { + "epoch": 0.32728, + "grad_norm": 2.03125, + "grad_norm_var": 0.035131581624348956, + "learning_rate": 0.0001, + "loss": 4.281, + "loss/crossentropy": 2.0643117427825928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21060281991958618, + "step": 16364 + }, + { + "epoch": 0.32732, + "grad_norm": 1.8984375, + "grad_norm_var": 0.042525227864583334, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 2.1980225443840027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19246292114257812, + "step": 16366 + }, + { + "epoch": 0.32736, + "grad_norm": 2.078125, + "grad_norm_var": 0.042577107747395836, + "learning_rate": 0.0001, + "loss": 4.05, + "loss/crossentropy": 1.8917307257652283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17911095917224884, + "step": 16368 + }, + { + "epoch": 0.3274, + "grad_norm": 2.15625, + "grad_norm_var": 0.0429595947265625, + "learning_rate": 0.0001, + "loss": 4.1415, + "loss/crossentropy": 2.0520957708358765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19265015423297882, + "step": 16370 + }, + { + "epoch": 0.32744, + "grad_norm": 2.28125, + "grad_norm_var": 0.019510904947916668, + "learning_rate": 0.0001, + "loss": 4.3026, + "loss/crossentropy": 2.4500341415405273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21713324636220932, + "step": 16372 + }, + { + "epoch": 0.32748, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02032648722330729, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 1.8100037574768066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16967355459928513, + "step": 16374 + }, + { + "epoch": 0.32752, + "grad_norm": 1.9375, + "grad_norm_var": 0.021174875895182292, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.092597723007202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998964473605156, + "step": 16376 + }, + { + "epoch": 0.32756, + "grad_norm": 2.03125, + "grad_norm_var": 0.016076405843098957, + "learning_rate": 0.0001, + "loss": 4.3627, + "loss/crossentropy": 2.151292622089386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19120946526527405, + "step": 16378 + }, + { + "epoch": 0.3276, + "grad_norm": 1.796875, + "grad_norm_var": 0.020906575520833335, + "learning_rate": 0.0001, + "loss": 3.6949, + "loss/crossentropy": 1.7384315729141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18261852860450745, + "step": 16380 + }, + { + "epoch": 0.32764, + "grad_norm": 1.953125, + "grad_norm_var": 0.019147745768229165, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 1.662436068058014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16703163087368011, + "step": 16382 + }, + { + "epoch": 0.32768, + "grad_norm": 1.984375, + "grad_norm_var": 0.018871053059895834, + "learning_rate": 0.0001, + "loss": 4.3202, + "loss/crossentropy": 2.2057151794433594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20989537239074707, + "step": 16384 + }, + { + "epoch": 0.32772, + "grad_norm": 2.03125, + "grad_norm_var": 0.068359375, + "learning_rate": 0.0001, + "loss": 4.3033, + "loss/crossentropy": 2.1689382791519165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20804189145565033, + "step": 16386 + }, + { + "epoch": 0.32776, + "grad_norm": 2.171875, + "grad_norm_var": 0.06450169881184896, + "learning_rate": 0.0001, + "loss": 4.2241, + "loss/crossentropy": 2.3121412992477417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20552029460668564, + "step": 16388 + }, + { + "epoch": 0.3278, + "grad_norm": 2.046875, + "grad_norm_var": 0.06318333943684896, + "learning_rate": 0.0001, + "loss": 4.1396, + "loss/crossentropy": 2.0784353017807007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20427466928958893, + "step": 16390 + }, + { + "epoch": 0.32784, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06282145182291667, + "learning_rate": 0.0001, + "loss": 4.1132, + "loss/crossentropy": 2.147130608558655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181989550590515, + "step": 16392 + }, + { + "epoch": 0.32788, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06468505859375, + "learning_rate": 0.0001, + "loss": 3.812, + "loss/crossentropy": 1.7986091375350952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294771760702133, + "step": 16394 + }, + { + "epoch": 0.32792, + "grad_norm": 1.9140625, + "grad_norm_var": 0.059004720052083334, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 2.0563793778419495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19180169701576233, + "step": 16396 + }, + { + "epoch": 0.32796, + "grad_norm": 2.015625, + "grad_norm_var": 0.05506591796875, + "learning_rate": 0.0001, + "loss": 4.3111, + "loss/crossentropy": 2.462133765220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21563740074634552, + "step": 16398 + }, + { + "epoch": 0.328, + "grad_norm": 1.9375, + "grad_norm_var": 0.059427897135416664, + "learning_rate": 0.0001, + "loss": 3.9743, + "loss/crossentropy": 1.8900938630104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845797374844551, + "step": 16400 + }, + { + "epoch": 0.32804, + "grad_norm": 2.375, + "grad_norm_var": 0.0195068359375, + "learning_rate": 0.0001, + "loss": 4.1653, + "loss/crossentropy": 2.0627527832984924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20268447697162628, + "step": 16402 + }, + { + "epoch": 0.32808, + "grad_norm": 2.125, + "grad_norm_var": 0.030147043863932292, + "learning_rate": 0.0001, + "loss": 4.3042, + "loss/crossentropy": 2.058929443359375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19984640181064606, + "step": 16404 + }, + { + "epoch": 0.32812, + "grad_norm": 1.9296875, + "grad_norm_var": 0.030564117431640624, + "learning_rate": 0.0001, + "loss": 4.1162, + "loss/crossentropy": 1.816510558128357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754368245601654, + "step": 16406 + }, + { + "epoch": 0.32816, + "grad_norm": 2.015625, + "grad_norm_var": 0.0302886962890625, + "learning_rate": 0.0001, + "loss": 4.263, + "loss/crossentropy": 1.9079806208610535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18917685747146606, + "step": 16408 + }, + { + "epoch": 0.3282, + "grad_norm": 1.9765625, + "grad_norm_var": 0.028612263997395835, + "learning_rate": 0.0001, + "loss": 3.9264, + "loss/crossentropy": 2.0913302898406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088220864534378, + "step": 16410 + }, + { + "epoch": 0.32824, + "grad_norm": 2.09375, + "grad_norm_var": 0.028547159830729165, + "learning_rate": 0.0001, + "loss": 3.9029, + "loss/crossentropy": 2.1609301567077637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010297030210495, + "step": 16412 + }, + { + "epoch": 0.32828, + "grad_norm": 3.078125, + "grad_norm_var": 0.0982666015625, + "learning_rate": 0.0001, + "loss": 4.2563, + "loss/crossentropy": 1.8449677228927612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23620254546403885, + "step": 16414 + }, + { + "epoch": 0.32832, + "grad_norm": 2.078125, + "grad_norm_var": 0.09134699503580729, + "learning_rate": 0.0001, + "loss": 4.1053, + "loss/crossentropy": 2.059410274028778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259736478328705, + "step": 16416 + }, + { + "epoch": 0.32836, + "grad_norm": 1.8828125, + "grad_norm_var": 0.08761571248372396, + "learning_rate": 0.0001, + "loss": 3.8718, + "loss/crossentropy": 1.6095005869865417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17026344686746597, + "step": 16418 + }, + { + "epoch": 0.3284, + "grad_norm": 2.03125, + "grad_norm_var": 0.07932510375976562, + "learning_rate": 0.0001, + "loss": 4.2623, + "loss/crossentropy": 2.200170636177063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2259170413017273, + "step": 16420 + }, + { + "epoch": 0.32844, + "grad_norm": 1.921875, + "grad_norm_var": 0.08116226196289063, + "learning_rate": 0.0001, + "loss": 4.0724, + "loss/crossentropy": 1.970030963420868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19053932279348373, + "step": 16422 + }, + { + "epoch": 0.32848, + "grad_norm": 1.890625, + "grad_norm_var": 0.08263753255208334, + "learning_rate": 0.0001, + "loss": 4.106, + "loss/crossentropy": 2.2675901651382446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22277145832777023, + "step": 16424 + }, + { + "epoch": 0.32852, + "grad_norm": 1.78125, + "grad_norm_var": 0.08910725911458334, + "learning_rate": 0.0001, + "loss": 3.723, + "loss/crossentropy": 1.6446372866630554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15234287828207016, + "step": 16426 + }, + { + "epoch": 0.32856, + "grad_norm": 1.984375, + "grad_norm_var": 0.0875689188639323, + "learning_rate": 0.0001, + "loss": 4.3421, + "loss/crossentropy": 2.350424647331238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190476581454277, + "step": 16428 + }, + { + "epoch": 0.3286, + "grad_norm": 2.046875, + "grad_norm_var": 0.009279123942057292, + "learning_rate": 0.0001, + "loss": 3.9663, + "loss/crossentropy": 1.8809763193130493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19404269009828568, + "step": 16430 + }, + { + "epoch": 0.32864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008257802327473958, + "learning_rate": 0.0001, + "loss": 4.1731, + "loss/crossentropy": 2.1159849166870117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21232837438583374, + "step": 16432 + }, + { + "epoch": 0.32868, + "grad_norm": 2.421875, + "grad_norm_var": 0.02133763631184896, + "learning_rate": 0.0001, + "loss": 4.2875, + "loss/crossentropy": 2.3252066373825073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24867044389247894, + "step": 16434 + }, + { + "epoch": 0.32872, + "grad_norm": 1.9140625, + "grad_norm_var": 0.020186360677083334, + "learning_rate": 0.0001, + "loss": 4.2462, + "loss/crossentropy": 2.0642696619033813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409788608551025, + "step": 16436 + }, + { + "epoch": 0.32876, + "grad_norm": 2.203125, + "grad_norm_var": 0.022725423177083332, + "learning_rate": 0.0001, + "loss": 4.3832, + "loss/crossentropy": 2.359580874443054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.218842551112175, + "step": 16438 + }, + { + "epoch": 0.3288, + "grad_norm": 1.8984375, + "grad_norm_var": 0.02264378865559896, + "learning_rate": 0.0001, + "loss": 3.7996, + "loss/crossentropy": 2.0959436893463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993689462542534, + "step": 16440 + }, + { + "epoch": 0.32884, + "grad_norm": 1.96875, + "grad_norm_var": 0.017679595947265626, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.1507667303085327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20800111442804337, + "step": 16442 + }, + { + "epoch": 0.32888, + "grad_norm": 1.875, + "grad_norm_var": 0.020005035400390624, + "learning_rate": 0.0001, + "loss": 3.9428, + "loss/crossentropy": 1.8771533370018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20476838946342468, + "step": 16444 + }, + { + "epoch": 0.32892, + "grad_norm": 2.046875, + "grad_norm_var": 0.020005035400390624, + "learning_rate": 0.0001, + "loss": 4.2689, + "loss/crossentropy": 2.365285038948059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22512932121753693, + "step": 16446 + }, + { + "epoch": 0.32896, + "grad_norm": 2.0625, + "grad_norm_var": 0.019701131184895835, + "learning_rate": 0.0001, + "loss": 4.2928, + "loss/crossentropy": 2.2584418058395386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187593042850494, + "step": 16448 + }, + { + "epoch": 0.329, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010270182291666667, + "learning_rate": 0.0001, + "loss": 3.8186, + "loss/crossentropy": 1.9227403402328491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945207566022873, + "step": 16450 + }, + { + "epoch": 0.32904, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011897786458333334, + "learning_rate": 0.0001, + "loss": 3.8512, + "loss/crossentropy": 2.062865734100342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064652517437935, + "step": 16452 + }, + { + "epoch": 0.32908, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007505035400390625, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 1.98399817943573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22567245364189148, + "step": 16454 + }, + { + "epoch": 0.32912, + "grad_norm": 2.0, + "grad_norm_var": 0.007428995768229167, + "learning_rate": 0.0001, + "loss": 4.2888, + "loss/crossentropy": 2.0655510425567627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20936425030231476, + "step": 16456 + }, + { + "epoch": 0.32916, + "grad_norm": 2.140625, + "grad_norm_var": 0.01739501953125, + "learning_rate": 0.0001, + "loss": 4.1486, + "loss/crossentropy": 1.8360095024108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18788952380418777, + "step": 16458 + }, + { + "epoch": 0.3292, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01672948201497396, + "learning_rate": 0.0001, + "loss": 4.2265, + "loss/crossentropy": 2.1088568568229675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265556871891022, + "step": 16460 + }, + { + "epoch": 0.32924, + "grad_norm": 1.921875, + "grad_norm_var": 0.017601521809895833, + "learning_rate": 0.0001, + "loss": 4.1004, + "loss/crossentropy": 2.3746429681777954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21173583716154099, + "step": 16462 + }, + { + "epoch": 0.32928, + "grad_norm": 2.078125, + "grad_norm_var": 0.01946996053059896, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.0129969716072083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18141476809978485, + "step": 16464 + }, + { + "epoch": 0.32932, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019160970052083334, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.385707139968872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22456685453653336, + "step": 16466 + }, + { + "epoch": 0.32936, + "grad_norm": 2.09375, + "grad_norm_var": 0.015958658854166665, + "learning_rate": 0.0001, + "loss": 4.183, + "loss/crossentropy": 2.466804623603821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23166514188051224, + "step": 16468 + }, + { + "epoch": 0.3294, + "grad_norm": 1.8515625, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 1.9182489514350891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918247565627098, + "step": 16470 + }, + { + "epoch": 0.32944, + "grad_norm": 2.125, + "grad_norm_var": 0.021993001302083332, + "learning_rate": 0.0001, + "loss": 4.4364, + "loss/crossentropy": 2.27492892742157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2237030565738678, + "step": 16472 + }, + { + "epoch": 0.32948, + "grad_norm": 2.03125, + "grad_norm_var": 0.0144927978515625, + "learning_rate": 0.0001, + "loss": 4.0022, + "loss/crossentropy": 1.856327474117279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17718566209077835, + "step": 16474 + }, + { + "epoch": 0.32952, + "grad_norm": 1.859375, + "grad_norm_var": 0.0145263671875, + "learning_rate": 0.0001, + "loss": 3.9994, + "loss/crossentropy": 2.014274477958679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536774396896362, + "step": 16476 + }, + { + "epoch": 0.32956, + "grad_norm": 2.03125, + "grad_norm_var": 0.016383616129557292, + "learning_rate": 0.0001, + "loss": 4.4606, + "loss/crossentropy": 2.098844289779663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18847080320119858, + "step": 16478 + }, + { + "epoch": 0.3296, + "grad_norm": 1.890625, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.0402814149856567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929381936788559, + "step": 16480 + }, + { + "epoch": 0.32964, + "grad_norm": 2.0625, + "grad_norm_var": 0.013315582275390625, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.413077712059021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2426547110080719, + "step": 16482 + }, + { + "epoch": 0.32968, + "grad_norm": 2.015625, + "grad_norm_var": 0.013401031494140625, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.081916332244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190306931734085, + "step": 16484 + }, + { + "epoch": 0.32972, + "grad_norm": 1.9375, + "grad_norm_var": 0.010765584309895833, + "learning_rate": 0.0001, + "loss": 4.1093, + "loss/crossentropy": 2.1017117500305176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809769093990326, + "step": 16486 + }, + { + "epoch": 0.32976, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008139801025390626, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.289687156677246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21503399312496185, + "step": 16488 + }, + { + "epoch": 0.3298, + "grad_norm": 2.09375, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.1952, + "loss/crossentropy": 2.0613157749176025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20428171753883362, + "step": 16490 + }, + { + "epoch": 0.32984, + "grad_norm": 2.0, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 4.3191, + "loss/crossentropy": 2.1484400033950806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850211381912231, + "step": 16492 + }, + { + "epoch": 0.32988, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0072629292805989586, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.6853107213974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736336275935173, + "step": 16494 + }, + { + "epoch": 0.32992, + "grad_norm": 1.96875, + "grad_norm_var": 0.007389068603515625, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.323481321334839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22742202132940292, + "step": 16496 + }, + { + "epoch": 0.32996, + "grad_norm": 2.078125, + "grad_norm_var": 0.00738525390625, + "learning_rate": 0.0001, + "loss": 4.124, + "loss/crossentropy": 2.1196956038475037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792834669351578, + "step": 16498 + }, + { + "epoch": 0.33, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007417805989583333, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 1.947241187095642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067001670598984, + "step": 16500 + }, + { + "epoch": 0.33004, + "grad_norm": 2.046875, + "grad_norm_var": 0.00758056640625, + "learning_rate": 0.0001, + "loss": 4.0631, + "loss/crossentropy": 2.256502628326416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22300700098276138, + "step": 16502 + }, + { + "epoch": 0.33008, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007574208577473958, + "learning_rate": 0.0001, + "loss": 3.7874, + "loss/crossentropy": 1.7947281002998352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860191375017166, + "step": 16504 + }, + { + "epoch": 0.33012, + "grad_norm": 1.921875, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.0189, + "loss/crossentropy": 1.9331985712051392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20170047879219055, + "step": 16506 + }, + { + "epoch": 0.33016, + "grad_norm": 1.96875, + "grad_norm_var": 0.011030832926432291, + "learning_rate": 0.0001, + "loss": 4.2148, + "loss/crossentropy": 2.2274327278137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977931559085846, + "step": 16508 + }, + { + "epoch": 0.3302, + "grad_norm": 2.015625, + "grad_norm_var": 0.007884724934895834, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 1.8204082250595093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195096403360367, + "step": 16510 + }, + { + "epoch": 0.33024, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009633127848307292, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.308950901031494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067535668611526, + "step": 16512 + }, + { + "epoch": 0.33028, + "grad_norm": 2.3125, + "grad_norm_var": 0.014794921875, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.1682082414627075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021712213754654, + "step": 16514 + }, + { + "epoch": 0.33032, + "grad_norm": 1.90625, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.9669402837753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19773626327514648, + "step": 16516 + }, + { + "epoch": 0.33036, + "grad_norm": 2.046875, + "grad_norm_var": 0.016047159830729168, + "learning_rate": 0.0001, + "loss": 4.1723, + "loss/crossentropy": 2.0478034019470215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190094619989395, + "step": 16518 + }, + { + "epoch": 0.3304, + "grad_norm": 1.984375, + "grad_norm_var": 0.014127349853515625, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8262990713119507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19281092286109924, + "step": 16520 + }, + { + "epoch": 0.33044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0139801025390625, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 2.3943055868148804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2286393940448761, + "step": 16522 + }, + { + "epoch": 0.33048, + "grad_norm": 1.90625, + "grad_norm_var": 0.0124664306640625, + "learning_rate": 0.0001, + "loss": 4.1117, + "loss/crossentropy": 1.7113690972328186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282406359910965, + "step": 16524 + }, + { + "epoch": 0.33052, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012924957275390624, + "learning_rate": 0.0001, + "loss": 3.9182, + "loss/crossentropy": 1.9077526926994324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18814775347709656, + "step": 16526 + }, + { + "epoch": 0.33056, + "grad_norm": 2.0625, + "grad_norm_var": 0.0106597900390625, + "learning_rate": 0.0001, + "loss": 4.3404, + "loss/crossentropy": 1.9100408554077148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897171437740326, + "step": 16528 + }, + { + "epoch": 0.3306, + "grad_norm": 1.84375, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 4.1403, + "loss/crossentropy": 1.8145674467086792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18221604079008102, + "step": 16530 + }, + { + "epoch": 0.33064, + "grad_norm": 2.1875, + "grad_norm_var": 0.0085113525390625, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 2.017480492591858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173581898212433, + "step": 16532 + }, + { + "epoch": 0.33068, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008739980061848958, + "learning_rate": 0.0001, + "loss": 4.1918, + "loss/crossentropy": 1.8937708139419556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974407583475113, + "step": 16534 + }, + { + "epoch": 0.33072, + "grad_norm": 2.203125, + "grad_norm_var": 0.012664540608723959, + "learning_rate": 0.0001, + "loss": 4.1354, + "loss/crossentropy": 1.824431598186493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18114721775054932, + "step": 16536 + }, + { + "epoch": 0.33076, + "grad_norm": 2.03125, + "grad_norm_var": 0.018464152018229166, + "learning_rate": 0.0001, + "loss": 4.034, + "loss/crossentropy": 1.6330693364143372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17686167359352112, + "step": 16538 + }, + { + "epoch": 0.3308, + "grad_norm": 2.453125, + "grad_norm_var": 0.028595987955729166, + "learning_rate": 0.0001, + "loss": 4.2359, + "loss/crossentropy": 2.1948810815811157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21977168321609497, + "step": 16540 + }, + { + "epoch": 0.33084, + "grad_norm": 2.328125, + "grad_norm_var": 0.03220926920572917, + "learning_rate": 0.0001, + "loss": 4.1942, + "loss/crossentropy": 2.1222333908081055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481626898050308, + "step": 16542 + }, + { + "epoch": 0.33088, + "grad_norm": 2.140625, + "grad_norm_var": 0.03241780598958333, + "learning_rate": 0.0001, + "loss": 3.9908, + "loss/crossentropy": 1.9703376293182373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912524312734604, + "step": 16544 + }, + { + "epoch": 0.33092, + "grad_norm": 1.8515625, + "grad_norm_var": 0.03206965128580729, + "learning_rate": 0.0001, + "loss": 4.0774, + "loss/crossentropy": 2.2693361043930054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2137979120016098, + "step": 16546 + }, + { + "epoch": 0.33096, + "grad_norm": 3.703125, + "grad_norm_var": 0.19548746744791667, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.029613673686981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21167294681072235, + "step": 16548 + }, + { + "epoch": 0.331, + "grad_norm": 1.90625, + "grad_norm_var": 0.19552586873372396, + "learning_rate": 0.0001, + "loss": 3.7223, + "loss/crossentropy": 2.0102853775024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100009247660637, + "step": 16550 + }, + { + "epoch": 0.33104, + "grad_norm": 2.234375, + "grad_norm_var": 0.19806722005208333, + "learning_rate": 0.0001, + "loss": 4.6058, + "loss/crossentropy": 2.1570287942886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619540452957153, + "step": 16552 + }, + { + "epoch": 0.33108, + "grad_norm": 2.046875, + "grad_norm_var": 0.20053609212239584, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 1.9157934188842773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193254753947258, + "step": 16554 + }, + { + "epoch": 0.33112, + "grad_norm": 1.8515625, + "grad_norm_var": 0.2052642822265625, + "learning_rate": 0.0001, + "loss": 3.6593, + "loss/crossentropy": 1.7680367827415466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19283122569322586, + "step": 16556 + }, + { + "epoch": 0.33116, + "grad_norm": 1.8984375, + "grad_norm_var": 0.2058013916015625, + "learning_rate": 0.0001, + "loss": 3.8828, + "loss/crossentropy": 1.8805989623069763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20299813151359558, + "step": 16558 + }, + { + "epoch": 0.3312, + "grad_norm": 1.71875, + "grad_norm_var": 0.21938069661458334, + "learning_rate": 0.0001, + "loss": 3.7657, + "loss/crossentropy": 1.9145240187644958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402665108442307, + "step": 16560 + }, + { + "epoch": 0.33124, + "grad_norm": 1.9453125, + "grad_norm_var": 0.2167144775390625, + "learning_rate": 0.0001, + "loss": 4.1984, + "loss/crossentropy": 1.7958417534828186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17038261890411377, + "step": 16562 + }, + { + "epoch": 0.33128, + "grad_norm": 1.8984375, + "grad_norm_var": 0.038919830322265626, + "learning_rate": 0.0001, + "loss": 3.8916, + "loss/crossentropy": 2.0782148838043213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21738532930612564, + "step": 16564 + }, + { + "epoch": 0.33132, + "grad_norm": 2.046875, + "grad_norm_var": 0.03910903930664063, + "learning_rate": 0.0001, + "loss": 4.3098, + "loss/crossentropy": 1.9767170548439026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127690687775612, + "step": 16566 + }, + { + "epoch": 0.33136, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01185302734375, + "learning_rate": 0.0001, + "loss": 3.9133, + "loss/crossentropy": 2.086383044719696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19811449944972992, + "step": 16568 + }, + { + "epoch": 0.3314, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011991119384765625, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 2.0471088886260986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20480120182037354, + "step": 16570 + }, + { + "epoch": 0.33144, + "grad_norm": 2.109375, + "grad_norm_var": 0.012485504150390625, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.2637689113616943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21112071722745895, + "step": 16572 + }, + { + "epoch": 0.33148, + "grad_norm": 2.015625, + "grad_norm_var": 0.012018839518229166, + "learning_rate": 0.0001, + "loss": 4.023, + "loss/crossentropy": 1.8035425543785095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18357180804014206, + "step": 16574 + }, + { + "epoch": 0.33152, + "grad_norm": 1.953125, + "grad_norm_var": 0.007189687093098958, + "learning_rate": 0.0001, + "loss": 3.7453, + "loss/crossentropy": 1.647614598274231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17512448877096176, + "step": 16576 + }, + { + "epoch": 0.33156, + "grad_norm": 2.078125, + "grad_norm_var": 0.0063873291015625, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.0647078156471252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568534523248672, + "step": 16578 + }, + { + "epoch": 0.3316, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00615234375, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.7758954763412476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891157627105713, + "step": 16580 + }, + { + "epoch": 0.33164, + "grad_norm": 2.125, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 4.1022, + "loss/crossentropy": 2.0523850321769714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21198877692222595, + "step": 16582 + }, + { + "epoch": 0.33168, + "grad_norm": 2.078125, + "grad_norm_var": 0.006725819905598959, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 1.99459570646286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20555292069911957, + "step": 16584 + }, + { + "epoch": 0.33172, + "grad_norm": 2.046875, + "grad_norm_var": 0.007771809895833333, + "learning_rate": 0.0001, + "loss": 4.4979, + "loss/crossentropy": 2.353522777557373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23193839937448502, + "step": 16586 + }, + { + "epoch": 0.33176, + "grad_norm": 2.078125, + "grad_norm_var": 0.020970662434895832, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.0257768630981445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20071294903755188, + "step": 16588 + }, + { + "epoch": 0.3318, + "grad_norm": 2.046875, + "grad_norm_var": 0.020467122395833332, + "learning_rate": 0.0001, + "loss": 4.2135, + "loss/crossentropy": 2.2084985971450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20876871049404144, + "step": 16590 + }, + { + "epoch": 0.33184, + "grad_norm": 2.15625, + "grad_norm_var": 0.019791412353515624, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 1.9118791818618774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24547216296195984, + "step": 16592 + }, + { + "epoch": 0.33188, + "grad_norm": 1.90625, + "grad_norm_var": 0.022188313802083335, + "learning_rate": 0.0001, + "loss": 4.0069, + "loss/crossentropy": 2.184453248977661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988314613699913, + "step": 16594 + }, + { + "epoch": 0.33192, + "grad_norm": 1.71875, + "grad_norm_var": 0.02801488240559896, + "learning_rate": 0.0001, + "loss": 3.8711, + "loss/crossentropy": 2.2105953097343445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20877376198768616, + "step": 16596 + }, + { + "epoch": 0.33196, + "grad_norm": 2.078125, + "grad_norm_var": 0.028586578369140626, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.028349459171295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20705370604991913, + "step": 16598 + }, + { + "epoch": 0.332, + "grad_norm": 2.015625, + "grad_norm_var": 0.02982177734375, + "learning_rate": 0.0001, + "loss": 3.8968, + "loss/crossentropy": 2.0778703689575195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19761346280574799, + "step": 16600 + }, + { + "epoch": 0.33204, + "grad_norm": 1.8828125, + "grad_norm_var": 0.029670206705729167, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.046931743621826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325716376304626, + "step": 16602 + }, + { + "epoch": 0.33208, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011486562093098958, + "learning_rate": 0.0001, + "loss": 4.1525, + "loss/crossentropy": 1.7907955050468445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822371482849121, + "step": 16604 + }, + { + "epoch": 0.33212, + "grad_norm": 2.09375, + "grad_norm_var": 0.012108357747395833, + "learning_rate": 0.0001, + "loss": 4.1586, + "loss/crossentropy": 1.7620025277137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196610152721405, + "step": 16606 + }, + { + "epoch": 0.33216, + "grad_norm": 2.015625, + "grad_norm_var": 0.010282135009765625, + "learning_rate": 0.0001, + "loss": 3.9175, + "loss/crossentropy": 1.6695470213890076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16451346129179, + "step": 16608 + }, + { + "epoch": 0.3322, + "grad_norm": 2.09375, + "grad_norm_var": 0.012717437744140626, + "learning_rate": 0.0001, + "loss": 4.0304, + "loss/crossentropy": 2.2249897718429565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21898073703050613, + "step": 16610 + }, + { + "epoch": 0.33224, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009474436442057291, + "learning_rate": 0.0001, + "loss": 4.1589, + "loss/crossentropy": 2.0861976146698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993977203965187, + "step": 16612 + }, + { + "epoch": 0.33228, + "grad_norm": 1.859375, + "grad_norm_var": 0.009527333577473958, + "learning_rate": 0.0001, + "loss": 4.1516, + "loss/crossentropy": 2.195378541946411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103184312582016, + "step": 16614 + }, + { + "epoch": 0.33232, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009478505452473958, + "learning_rate": 0.0001, + "loss": 3.6947, + "loss/crossentropy": 1.9064915180206299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17310689389705658, + "step": 16616 + }, + { + "epoch": 0.33236, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009452311197916667, + "learning_rate": 0.0001, + "loss": 3.9698, + "loss/crossentropy": 1.7967488169670105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18099892884492874, + "step": 16618 + }, + { + "epoch": 0.3324, + "grad_norm": 2.140625, + "grad_norm_var": 0.012565104166666667, + "learning_rate": 0.0001, + "loss": 4.1319, + "loss/crossentropy": 2.0776742696762085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005431056022644, + "step": 16620 + }, + { + "epoch": 0.33244, + "grad_norm": 1.90625, + "grad_norm_var": 0.0109375, + "learning_rate": 0.0001, + "loss": 4.1881, + "loss/crossentropy": 2.0590518712997437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676207542419434, + "step": 16622 + }, + { + "epoch": 0.33248, + "grad_norm": 2.0, + "grad_norm_var": 0.011628977457682292, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 1.8861089944839478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860756129026413, + "step": 16624 + }, + { + "epoch": 0.33252, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009308878580729167, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 2.4207329750061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22214040160179138, + "step": 16626 + }, + { + "epoch": 0.33256, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009295399983723958, + "learning_rate": 0.0001, + "loss": 3.934, + "loss/crossentropy": 2.089089274406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21007104218006134, + "step": 16628 + }, + { + "epoch": 0.3326, + "grad_norm": 1.984375, + "grad_norm_var": 0.011380767822265625, + "learning_rate": 0.0001, + "loss": 4.3451, + "loss/crossentropy": 2.357658624649048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22044895589351654, + "step": 16630 + }, + { + "epoch": 0.33264, + "grad_norm": 1.875, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 3.9142, + "loss/crossentropy": 2.1036278009414673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003040835261345, + "step": 16632 + }, + { + "epoch": 0.33268, + "grad_norm": 1.859375, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 2.202688694000244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21549009531736374, + "step": 16634 + }, + { + "epoch": 0.33272, + "grad_norm": 2.078125, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 2.261624753475189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209767997264862, + "step": 16636 + }, + { + "epoch": 0.33276, + "grad_norm": 2.234375, + "grad_norm_var": 0.013570149739583334, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.7452040910720825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18558355420827866, + "step": 16638 + }, + { + "epoch": 0.3328, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012797037760416666, + "learning_rate": 0.0001, + "loss": 3.9364, + "loss/crossentropy": 1.9910151362419128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115819901227951, + "step": 16640 + }, + { + "epoch": 0.33284, + "grad_norm": 2.015625, + "grad_norm_var": 0.014583079020182292, + "learning_rate": 0.0001, + "loss": 3.9412, + "loss/crossentropy": 2.0230116844177246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001063972711563, + "step": 16642 + }, + { + "epoch": 0.33288, + "grad_norm": 1.8359375, + "grad_norm_var": 0.015818023681640626, + "learning_rate": 0.0001, + "loss": 4.0476, + "loss/crossentropy": 1.9575697183609009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243427157402039, + "step": 16644 + }, + { + "epoch": 0.33292, + "grad_norm": 1.96875, + "grad_norm_var": 0.012890370686848958, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 1.9650630354881287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203716441988945, + "step": 16646 + }, + { + "epoch": 0.33296, + "grad_norm": 1.859375, + "grad_norm_var": 0.0132476806640625, + "learning_rate": 0.0001, + "loss": 4.1345, + "loss/crossentropy": 1.497282326221466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15893913060426712, + "step": 16648 + }, + { + "epoch": 0.333, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014491526285807292, + "learning_rate": 0.0001, + "loss": 4.1273, + "loss/crossentropy": 1.8017843961715698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18981081247329712, + "step": 16650 + }, + { + "epoch": 0.33304, + "grad_norm": 2.125, + "grad_norm_var": 0.014296213785807291, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.089366614818573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076069712638855, + "step": 16652 + }, + { + "epoch": 0.33308, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008695475260416667, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 1.9445012211799622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241490423679352, + "step": 16654 + }, + { + "epoch": 0.33312, + "grad_norm": 1.90625, + "grad_norm_var": 0.008740234375, + "learning_rate": 0.0001, + "loss": 3.9922, + "loss/crossentropy": 1.844423532485962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18664997816085815, + "step": 16656 + }, + { + "epoch": 0.33316, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007085927327473958, + "learning_rate": 0.0001, + "loss": 4.2221, + "loss/crossentropy": 2.1095730662345886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21053151786327362, + "step": 16658 + }, + { + "epoch": 0.3332, + "grad_norm": 2.03125, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.255, + "loss/crossentropy": 2.139304041862488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19131766259670258, + "step": 16660 + }, + { + "epoch": 0.33324, + "grad_norm": 2.0, + "grad_norm_var": 0.007450103759765625, + "learning_rate": 0.0001, + "loss": 3.9358, + "loss/crossentropy": 1.8872849345207214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19200573861598969, + "step": 16662 + }, + { + "epoch": 0.33328, + "grad_norm": 1.984375, + "grad_norm_var": 0.006695302327473959, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.488931655883789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24411997199058533, + "step": 16664 + }, + { + "epoch": 0.33332, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 3.7294, + "loss/crossentropy": 1.8066997528076172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19391655176877975, + "step": 16666 + }, + { + "epoch": 0.33336, + "grad_norm": 2.0, + "grad_norm_var": 0.005370076497395833, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.085490345954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21987342834472656, + "step": 16668 + }, + { + "epoch": 0.3334, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006459299723307292, + "learning_rate": 0.0001, + "loss": 3.7791, + "loss/crossentropy": 2.052145302295685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21673783659934998, + "step": 16670 + }, + { + "epoch": 0.33344, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00640869140625, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 2.0924419164657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523578464984894, + "step": 16672 + }, + { + "epoch": 0.33348, + "grad_norm": 1.96875, + "grad_norm_var": 0.006525675455729167, + "learning_rate": 0.0001, + "loss": 4.125, + "loss/crossentropy": 2.0340868830680847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785074919462204, + "step": 16674 + }, + { + "epoch": 0.33352, + "grad_norm": 2.25, + "grad_norm_var": 0.010990397135416666, + "learning_rate": 0.0001, + "loss": 4.2404, + "loss/crossentropy": 2.4421908855438232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24263421446084976, + "step": 16676 + }, + { + "epoch": 0.33356, + "grad_norm": 1.953125, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 4.1879, + "loss/crossentropy": 1.9141955971717834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666712939739227, + "step": 16678 + }, + { + "epoch": 0.3336, + "grad_norm": 2.015625, + "grad_norm_var": 0.011336008707682291, + "learning_rate": 0.0001, + "loss": 4.0378, + "loss/crossentropy": 2.0019638538360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17722105979919434, + "step": 16680 + }, + { + "epoch": 0.33364, + "grad_norm": 1.984375, + "grad_norm_var": 0.009511057535807292, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.0835896134376526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.251879021525383, + "step": 16682 + }, + { + "epoch": 0.33368, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015192667643229166, + "learning_rate": 0.0001, + "loss": 4.2406, + "loss/crossentropy": 1.8940032720565796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19034714996814728, + "step": 16684 + }, + { + "epoch": 0.33372, + "grad_norm": 2.09375, + "grad_norm_var": 0.014817047119140624, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.863099992275238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19675587117671967, + "step": 16686 + }, + { + "epoch": 0.33376, + "grad_norm": 2.046875, + "grad_norm_var": 0.024836222330729168, + "learning_rate": 0.0001, + "loss": 4.2785, + "loss/crossentropy": 2.027459740638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19650273770093918, + "step": 16688 + }, + { + "epoch": 0.3338, + "grad_norm": 1.890625, + "grad_norm_var": 0.026008097330729167, + "learning_rate": 0.0001, + "loss": 4.2892, + "loss/crossentropy": 2.2324944734573364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23016268759965897, + "step": 16690 + }, + { + "epoch": 0.33384, + "grad_norm": 2.03125, + "grad_norm_var": 0.02474950154622396, + "learning_rate": 0.0001, + "loss": 4.155, + "loss/crossentropy": 2.213741898536682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21159538626670837, + "step": 16692 + }, + { + "epoch": 0.33388, + "grad_norm": 2.03125, + "grad_norm_var": 0.025340779622395834, + "learning_rate": 0.0001, + "loss": 4.4426, + "loss/crossentropy": 2.084823966026306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22060541808605194, + "step": 16694 + }, + { + "epoch": 0.33392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02417780558268229, + "learning_rate": 0.0001, + "loss": 4.1847, + "loss/crossentropy": 2.2417017221450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050381377339363, + "step": 16696 + }, + { + "epoch": 0.33396, + "grad_norm": 2.015625, + "grad_norm_var": 0.032134755452473955, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 2.3696242570877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22559890151023865, + "step": 16698 + }, + { + "epoch": 0.334, + "grad_norm": 1.8828125, + "grad_norm_var": 0.030295562744140626, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 2.0899609327316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470503717660904, + "step": 16700 + }, + { + "epoch": 0.33404, + "grad_norm": 1.90625, + "grad_norm_var": 0.029386138916015624, + "learning_rate": 0.0001, + "loss": 3.924, + "loss/crossentropy": 1.8663234114646912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815531924366951, + "step": 16702 + }, + { + "epoch": 0.33408, + "grad_norm": 3.0, + "grad_norm_var": 0.08069432576497396, + "learning_rate": 0.0001, + "loss": 4.1651, + "loss/crossentropy": 2.030432403087616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063748836517334, + "step": 16704 + }, + { + "epoch": 0.33412, + "grad_norm": 1.90625, + "grad_norm_var": 0.08033218383789062, + "learning_rate": 0.0001, + "loss": 4.152, + "loss/crossentropy": 2.12885981798172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19536730647087097, + "step": 16706 + }, + { + "epoch": 0.33416, + "grad_norm": 2.09375, + "grad_norm_var": 0.07674153645833333, + "learning_rate": 0.0001, + "loss": 3.909, + "loss/crossentropy": 2.0870607495307922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478558540344238, + "step": 16708 + }, + { + "epoch": 0.3342, + "grad_norm": 2.03125, + "grad_norm_var": 0.07646382649739583, + "learning_rate": 0.0001, + "loss": 4.2058, + "loss/crossentropy": 2.215463638305664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118721902370453, + "step": 16710 + }, + { + "epoch": 0.33424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.08145243326822917, + "learning_rate": 0.0001, + "loss": 3.8417, + "loss/crossentropy": 2.0292049646377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18440847843885422, + "step": 16712 + }, + { + "epoch": 0.33428, + "grad_norm": 1.828125, + "grad_norm_var": 0.08017756144205729, + "learning_rate": 0.0001, + "loss": 3.7722, + "loss/crossentropy": 1.9214341640472412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18510671705007553, + "step": 16714 + }, + { + "epoch": 0.33432, + "grad_norm": 2.046875, + "grad_norm_var": 0.07769266764322917, + "learning_rate": 0.0001, + "loss": 4.3905, + "loss/crossentropy": 2.1761534214019775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20527766644954681, + "step": 16716 + }, + { + "epoch": 0.33436, + "grad_norm": 1.9765625, + "grad_norm_var": 0.07639567057291667, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 2.020824670791626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20723462104797363, + "step": 16718 + }, + { + "epoch": 0.3344, + "grad_norm": 1.921875, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.243465781211853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866197139024734, + "step": 16720 + }, + { + "epoch": 0.33444, + "grad_norm": 1.859375, + "grad_norm_var": 0.013244374593098959, + "learning_rate": 0.0001, + "loss": 3.9963, + "loss/crossentropy": 1.9071390628814697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20368033647537231, + "step": 16722 + }, + { + "epoch": 0.33448, + "grad_norm": 1.7890625, + "grad_norm_var": 0.011966959635416666, + "learning_rate": 0.0001, + "loss": 3.8974, + "loss/crossentropy": 1.72525554895401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18200145661830902, + "step": 16724 + }, + { + "epoch": 0.33452, + "grad_norm": 2.109375, + "grad_norm_var": 0.0154052734375, + "learning_rate": 0.0001, + "loss": 4.101, + "loss/crossentropy": 2.174055576324463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181844174861908, + "step": 16726 + }, + { + "epoch": 0.33456, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014251454671223959, + "learning_rate": 0.0001, + "loss": 3.8004, + "loss/crossentropy": 1.6175724864006042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15338046848773956, + "step": 16728 + }, + { + "epoch": 0.3346, + "grad_norm": 1.953125, + "grad_norm_var": 0.012353515625, + "learning_rate": 0.0001, + "loss": 4.1101, + "loss/crossentropy": 2.0030194520950317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353461802005768, + "step": 16730 + }, + { + "epoch": 0.33464, + "grad_norm": 2.03125, + "grad_norm_var": 0.012092081705729167, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 2.272015690803528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22368235886096954, + "step": 16732 + }, + { + "epoch": 0.33468, + "grad_norm": 2.046875, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.3365, + "loss/crossentropy": 2.149975538253784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20634270459413528, + "step": 16734 + }, + { + "epoch": 0.33472, + "grad_norm": 1.921875, + "grad_norm_var": 0.009869130452473958, + "learning_rate": 0.0001, + "loss": 3.9201, + "loss/crossentropy": 2.095462441444397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20027077198028564, + "step": 16736 + }, + { + "epoch": 0.33476, + "grad_norm": 1.953125, + "grad_norm_var": 0.008819325764973959, + "learning_rate": 0.0001, + "loss": 4.0859, + "loss/crossentropy": 1.988598644733429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20503661036491394, + "step": 16738 + }, + { + "epoch": 0.3348, + "grad_norm": 2.03125, + "grad_norm_var": 0.0066569010416666664, + "learning_rate": 0.0001, + "loss": 4.2618, + "loss/crossentropy": 2.0615572333335876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20363956689834595, + "step": 16740 + }, + { + "epoch": 0.33484, + "grad_norm": 2.0625, + "grad_norm_var": 0.0027903238932291668, + "learning_rate": 0.0001, + "loss": 3.9476, + "loss/crossentropy": 1.8172455430030823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20091129839420319, + "step": 16742 + }, + { + "epoch": 0.33488, + "grad_norm": 1.984375, + "grad_norm_var": 0.0043853759765625, + "learning_rate": 0.0001, + "loss": 3.9541, + "loss/crossentropy": 2.212782144546509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19710610061883926, + "step": 16744 + }, + { + "epoch": 0.33492, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008674875895182291, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.374183773994446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23687051236629486, + "step": 16746 + }, + { + "epoch": 0.33496, + "grad_norm": 2.0625, + "grad_norm_var": 0.008902740478515626, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 1.9374622702598572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19821622967720032, + "step": 16748 + }, + { + "epoch": 0.335, + "grad_norm": 1.984375, + "grad_norm_var": 0.008740234375, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 1.6063715815544128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16296840459108353, + "step": 16750 + }, + { + "epoch": 0.33504, + "grad_norm": 1.859375, + "grad_norm_var": 0.0096832275390625, + "learning_rate": 0.0001, + "loss": 4.0315, + "loss/crossentropy": 2.1251469254493713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17997874319553375, + "step": 16752 + }, + { + "epoch": 0.33508, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 4.03, + "loss/crossentropy": 2.126043915748596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2283206582069397, + "step": 16754 + }, + { + "epoch": 0.33512, + "grad_norm": 2.046875, + "grad_norm_var": 0.009901682535807291, + "learning_rate": 0.0001, + "loss": 4.2969, + "loss/crossentropy": 2.3409340381622314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817704290151596, + "step": 16756 + }, + { + "epoch": 0.33516, + "grad_norm": 2.0, + "grad_norm_var": 0.01024169921875, + "learning_rate": 0.0001, + "loss": 4.0083, + "loss/crossentropy": 1.9816790223121643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743956625461578, + "step": 16758 + }, + { + "epoch": 0.3352, + "grad_norm": 2.21875, + "grad_norm_var": 0.012967681884765625, + "learning_rate": 0.0001, + "loss": 4.3056, + "loss/crossentropy": 2.246092438697815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22662658989429474, + "step": 16760 + }, + { + "epoch": 0.33524, + "grad_norm": 2.125, + "grad_norm_var": 0.010794830322265626, + "learning_rate": 0.0001, + "loss": 4.2842, + "loss/crossentropy": 2.1571802496910095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20506682991981506, + "step": 16762 + }, + { + "epoch": 0.33528, + "grad_norm": 2.015625, + "grad_norm_var": 0.010636138916015624, + "learning_rate": 0.0001, + "loss": 4.0372, + "loss/crossentropy": 1.966954231262207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18581678718328476, + "step": 16764 + }, + { + "epoch": 0.33532, + "grad_norm": 2.125, + "grad_norm_var": 0.012961578369140626, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.9360153079032898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846693962812424, + "step": 16766 + }, + { + "epoch": 0.33536, + "grad_norm": 2.0625, + "grad_norm_var": 0.012644195556640625, + "learning_rate": 0.0001, + "loss": 4.0953, + "loss/crossentropy": 2.3978073596954346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20406542718410492, + "step": 16768 + }, + { + "epoch": 0.3354, + "grad_norm": 2.109375, + "grad_norm_var": 0.013639068603515625, + "learning_rate": 0.0001, + "loss": 4.0933, + "loss/crossentropy": 2.239229917526245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21536701172590256, + "step": 16770 + }, + { + "epoch": 0.33544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013600413004557292, + "learning_rate": 0.0001, + "loss": 4.2153, + "loss/crossentropy": 2.1333428621292114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062046378850937, + "step": 16772 + }, + { + "epoch": 0.33548, + "grad_norm": 2.09375, + "grad_norm_var": 0.014362589518229166, + "learning_rate": 0.0001, + "loss": 4.0094, + "loss/crossentropy": 2.3503127098083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20422626286745071, + "step": 16774 + }, + { + "epoch": 0.33552, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009772745768229167, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 1.9181209802627563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16844520717859268, + "step": 16776 + }, + { + "epoch": 0.33556, + "grad_norm": 2.015625, + "grad_norm_var": 0.009269205729166667, + "learning_rate": 0.0001, + "loss": 4.1975, + "loss/crossentropy": 2.022172212600708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19443488866090775, + "step": 16778 + }, + { + "epoch": 0.3356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010951487223307292, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.0416210293769836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17731749266386032, + "step": 16780 + }, + { + "epoch": 0.33564, + "grad_norm": 2.0625, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 4.0757, + "loss/crossentropy": 2.1722596883773804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995421171188354, + "step": 16782 + }, + { + "epoch": 0.33568, + "grad_norm": 1.953125, + "grad_norm_var": 0.009146881103515626, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.4041545391082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22784388065338135, + "step": 16784 + }, + { + "epoch": 0.33572, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007757314046223958, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 2.189204216003418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20079431682825089, + "step": 16786 + }, + { + "epoch": 0.33576, + "grad_norm": 1.890625, + "grad_norm_var": 0.007130686442057292, + "learning_rate": 0.0001, + "loss": 3.9771, + "loss/crossentropy": 2.2308130860328674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22352158278226852, + "step": 16788 + }, + { + "epoch": 0.3358, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.0901, + "loss/crossentropy": 2.238118886947632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165399253368378, + "step": 16790 + }, + { + "epoch": 0.33584, + "grad_norm": 11.9375, + "grad_norm_var": 6.248060862223308, + "learning_rate": 0.0001, + "loss": 4.5633, + "loss/crossentropy": 2.452346444129944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2192159816622734, + "step": 16792 + }, + { + "epoch": 0.33588, + "grad_norm": 1.9765625, + "grad_norm_var": 6.234175364176433, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.115506410598755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19228096306324005, + "step": 16794 + }, + { + "epoch": 0.33592, + "grad_norm": 2.046875, + "grad_norm_var": 6.207706705729167, + "learning_rate": 0.0001, + "loss": 4.313, + "loss/crossentropy": 2.1052395701408386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24314512312412262, + "step": 16796 + }, + { + "epoch": 0.33596, + "grad_norm": 1.9765625, + "grad_norm_var": 6.191576131184896, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 1.7643597722053528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17998731136322021, + "step": 16798 + }, + { + "epoch": 0.336, + "grad_norm": 1.890625, + "grad_norm_var": 6.213846588134766, + "learning_rate": 0.0001, + "loss": 3.886, + "loss/crossentropy": 2.0926302671432495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199835367500782, + "step": 16800 + }, + { + "epoch": 0.33604, + "grad_norm": 2.109375, + "grad_norm_var": 6.19762954711914, + "learning_rate": 0.0001, + "loss": 4.0106, + "loss/crossentropy": 1.9939849972724915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19728610664606094, + "step": 16802 + }, + { + "epoch": 0.33608, + "grad_norm": 2.15625, + "grad_norm_var": 6.1746826171875, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.154136300086975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071739211678505, + "step": 16804 + }, + { + "epoch": 0.33612, + "grad_norm": 1.9296875, + "grad_norm_var": 6.173281860351563, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 1.6244451403617859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825866624712944, + "step": 16806 + }, + { + "epoch": 0.33616, + "grad_norm": 1.890625, + "grad_norm_var": 0.010945383707682292, + "learning_rate": 0.0001, + "loss": 3.6796, + "loss/crossentropy": 1.936405599117279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20031629502773285, + "step": 16808 + }, + { + "epoch": 0.3362, + "grad_norm": 2.046875, + "grad_norm_var": 0.011568196614583333, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.117881119251251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21941083669662476, + "step": 16810 + }, + { + "epoch": 0.33624, + "grad_norm": 1.921875, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 4.1104, + "loss/crossentropy": 2.0384327173233032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019578069448471, + "step": 16812 + }, + { + "epoch": 0.33628, + "grad_norm": 1.921875, + "grad_norm_var": 0.011893463134765626, + "learning_rate": 0.0001, + "loss": 4.1383, + "loss/crossentropy": 2.058988571166992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20959977805614471, + "step": 16814 + }, + { + "epoch": 0.33632, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010680898030598959, + "learning_rate": 0.0001, + "loss": 4.0734, + "loss/crossentropy": 2.117241382598877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039024829864502, + "step": 16816 + }, + { + "epoch": 0.33636, + "grad_norm": 2.0, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 1.8142318725585938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877349317073822, + "step": 16818 + }, + { + "epoch": 0.3364, + "grad_norm": 2.296875, + "grad_norm_var": 0.014440663655598958, + "learning_rate": 0.0001, + "loss": 4.2402, + "loss/crossentropy": 2.1766676902770996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21788865327835083, + "step": 16820 + }, + { + "epoch": 0.33644, + "grad_norm": 2.078125, + "grad_norm_var": 0.0163726806640625, + "learning_rate": 0.0001, + "loss": 4.2953, + "loss/crossentropy": 2.2596739530563354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23641519993543625, + "step": 16822 + }, + { + "epoch": 0.33648, + "grad_norm": 1.984375, + "grad_norm_var": 0.012572987874348959, + "learning_rate": 0.0001, + "loss": 4.1689, + "loss/crossentropy": 2.1782987117767334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562626421451569, + "step": 16824 + }, + { + "epoch": 0.33652, + "grad_norm": 2.28125, + "grad_norm_var": 0.01633275349934896, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 1.9579410552978516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20332375168800354, + "step": 16826 + }, + { + "epoch": 0.33656, + "grad_norm": 1.75, + "grad_norm_var": 0.020977528889973958, + "learning_rate": 0.0001, + "loss": 4.0193, + "loss/crossentropy": 1.917382538318634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188996322453022, + "step": 16828 + }, + { + "epoch": 0.3366, + "grad_norm": 2.125, + "grad_norm_var": 0.02045466105143229, + "learning_rate": 0.0001, + "loss": 3.9885, + "loss/crossentropy": 1.8045424222946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18587132543325424, + "step": 16830 + }, + { + "epoch": 0.33664, + "grad_norm": 2.046875, + "grad_norm_var": 0.07043355305989583, + "learning_rate": 0.0001, + "loss": 4.0171, + "loss/crossentropy": 2.113844871520996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20383527874946594, + "step": 16832 + }, + { + "epoch": 0.33668, + "grad_norm": 2.0625, + "grad_norm_var": 0.06795654296875, + "learning_rate": 0.0001, + "loss": 4.1255, + "loss/crossentropy": 1.9635592699050903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20447230339050293, + "step": 16834 + }, + { + "epoch": 0.33672, + "grad_norm": 2.046875, + "grad_norm_var": 0.06646219889322917, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 1.8918602466583252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21786046028137207, + "step": 16836 + }, + { + "epoch": 0.33676, + "grad_norm": 1.9453125, + "grad_norm_var": 0.06734390258789062, + "learning_rate": 0.0001, + "loss": 4.3372, + "loss/crossentropy": 2.347909450531006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23047995567321777, + "step": 16838 + }, + { + "epoch": 0.3368, + "grad_norm": 1.90625, + "grad_norm_var": 0.06854248046875, + "learning_rate": 0.0001, + "loss": 4.0242, + "loss/crossentropy": 1.834633469581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915532797574997, + "step": 16840 + }, + { + "epoch": 0.33684, + "grad_norm": 1.84375, + "grad_norm_var": 0.06948954264322917, + "learning_rate": 0.0001, + "loss": 3.6871, + "loss/crossentropy": 1.9172112345695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18901804834604263, + "step": 16842 + }, + { + "epoch": 0.33688, + "grad_norm": 1.953125, + "grad_norm_var": 0.06441141764322916, + "learning_rate": 0.0001, + "loss": 4.3911, + "loss/crossentropy": 2.37927508354187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2328021451830864, + "step": 16844 + }, + { + "epoch": 0.33692, + "grad_norm": 1.921875, + "grad_norm_var": 0.06558024088541667, + "learning_rate": 0.0001, + "loss": 3.8573, + "loss/crossentropy": 1.7887099385261536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989587783813477, + "step": 16846 + }, + { + "epoch": 0.33696, + "grad_norm": 1.96875, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.0826, + "loss/crossentropy": 2.008473217487335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20677915215492249, + "step": 16848 + }, + { + "epoch": 0.337, + "grad_norm": 1.90625, + "grad_norm_var": 0.005362955729166666, + "learning_rate": 0.0001, + "loss": 4.1751, + "loss/crossentropy": 2.101797103881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833290576934814, + "step": 16850 + }, + { + "epoch": 0.33704, + "grad_norm": 2.0, + "grad_norm_var": 0.005779774983723959, + "learning_rate": 0.0001, + "loss": 4.0087, + "loss/crossentropy": 1.9306662678718567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893247440457344, + "step": 16852 + }, + { + "epoch": 0.33708, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005293782552083333, + "learning_rate": 0.0001, + "loss": 3.9986, + "loss/crossentropy": 2.1301704049110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975892335176468, + "step": 16854 + }, + { + "epoch": 0.33712, + "grad_norm": 2.09375, + "grad_norm_var": 0.007173411051432292, + "learning_rate": 0.0001, + "loss": 3.9583, + "loss/crossentropy": 2.020972192287445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376973807811737, + "step": 16856 + }, + { + "epoch": 0.33716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006159464518229167, + "learning_rate": 0.0001, + "loss": 3.9632, + "loss/crossentropy": 1.8190429210662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1629476398229599, + "step": 16858 + }, + { + "epoch": 0.3372, + "grad_norm": 2.09375, + "grad_norm_var": 0.007183583577473959, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 2.360092878341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22874057292938232, + "step": 16860 + }, + { + "epoch": 0.33724, + "grad_norm": 2.171875, + "grad_norm_var": 0.008853912353515625, + "learning_rate": 0.0001, + "loss": 4.232, + "loss/crossentropy": 2.3477792739868164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215813398361206, + "step": 16862 + }, + { + "epoch": 0.33728, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.474452257156372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185184359550476, + "step": 16864 + }, + { + "epoch": 0.33732, + "grad_norm": 2.015625, + "grad_norm_var": 0.010396067301432292, + "learning_rate": 0.0001, + "loss": 3.7782, + "loss/crossentropy": 1.8387269973754883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18137617409229279, + "step": 16866 + }, + { + "epoch": 0.33736, + "grad_norm": 1.953125, + "grad_norm_var": 0.009924062093098958, + "learning_rate": 0.0001, + "loss": 3.9414, + "loss/crossentropy": 1.8537965416908264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19480426609516144, + "step": 16868 + }, + { + "epoch": 0.3374, + "grad_norm": 1.96875, + "grad_norm_var": 0.010057576497395833, + "learning_rate": 0.0001, + "loss": 3.9808, + "loss/crossentropy": 1.9274648427963257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545693695545197, + "step": 16870 + }, + { + "epoch": 0.33744, + "grad_norm": 2.140625, + "grad_norm_var": 0.010273996988932292, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 1.9180024862289429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180539146065712, + "step": 16872 + }, + { + "epoch": 0.33748, + "grad_norm": 2.078125, + "grad_norm_var": 0.011016591389973959, + "learning_rate": 0.0001, + "loss": 4.4008, + "loss/crossentropy": 2.185365915298462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225860357284546, + "step": 16874 + }, + { + "epoch": 0.33752, + "grad_norm": 1.859375, + "grad_norm_var": 0.011523183186848958, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 1.9749634861946106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19897626340389252, + "step": 16876 + }, + { + "epoch": 0.33756, + "grad_norm": 2.109375, + "grad_norm_var": 0.010284169514973959, + "learning_rate": 0.0001, + "loss": 4.237, + "loss/crossentropy": 2.1505807638168335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921844244003296, + "step": 16878 + }, + { + "epoch": 0.3376, + "grad_norm": 1.859375, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.7159577012062073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19649318605661392, + "step": 16880 + }, + { + "epoch": 0.33764, + "grad_norm": 1.84375, + "grad_norm_var": 0.010251617431640625, + "learning_rate": 0.0001, + "loss": 4.1615, + "loss/crossentropy": 2.187040388584137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251552015542984, + "step": 16882 + }, + { + "epoch": 0.33768, + "grad_norm": 2.234375, + "grad_norm_var": 0.014609527587890626, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 1.9247627258300781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17710037529468536, + "step": 16884 + }, + { + "epoch": 0.33772, + "grad_norm": 1.9375, + "grad_norm_var": 0.0141845703125, + "learning_rate": 0.0001, + "loss": 4.001, + "loss/crossentropy": 1.8499276041984558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18950388580560684, + "step": 16886 + }, + { + "epoch": 0.33776, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0138824462890625, + "learning_rate": 0.0001, + "loss": 4.0998, + "loss/crossentropy": 2.0127341747283936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18379998207092285, + "step": 16888 + }, + { + "epoch": 0.3378, + "grad_norm": 1.9375, + "grad_norm_var": 0.013602447509765626, + "learning_rate": 0.0001, + "loss": 3.9096, + "loss/crossentropy": 2.2561115026474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20785757154226303, + "step": 16890 + }, + { + "epoch": 0.33784, + "grad_norm": 1.9375, + "grad_norm_var": 0.0131103515625, + "learning_rate": 0.0001, + "loss": 3.9638, + "loss/crossentropy": 1.9391398429870605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481037557125092, + "step": 16892 + }, + { + "epoch": 0.33788, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014827219645182292, + "learning_rate": 0.0001, + "loss": 4.2368, + "loss/crossentropy": 2.3129481077194214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065483033657074, + "step": 16894 + }, + { + "epoch": 0.33792, + "grad_norm": 2.296875, + "grad_norm_var": 0.01984430948893229, + "learning_rate": 0.0001, + "loss": 4.0774, + "loss/crossentropy": 1.8482372760772705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19291523844003677, + "step": 16896 + }, + { + "epoch": 0.33796, + "grad_norm": 2.03125, + "grad_norm_var": 0.020031483968098958, + "learning_rate": 0.0001, + "loss": 4.3876, + "loss/crossentropy": 1.973912537097931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22137422114610672, + "step": 16898 + }, + { + "epoch": 0.338, + "grad_norm": 1.890625, + "grad_norm_var": 0.01654841105143229, + "learning_rate": 0.0001, + "loss": 3.9599, + "loss/crossentropy": 1.7987132668495178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18385548144578934, + "step": 16900 + }, + { + "epoch": 0.33804, + "grad_norm": 1.890625, + "grad_norm_var": 0.01727879842122396, + "learning_rate": 0.0001, + "loss": 4.3673, + "loss/crossentropy": 2.199060797691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016109600663185, + "step": 16902 + }, + { + "epoch": 0.33808, + "grad_norm": 3.046875, + "grad_norm_var": 0.0830718994140625, + "learning_rate": 0.0001, + "loss": 4.27, + "loss/crossentropy": 2.179081439971924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342043578624725, + "step": 16904 + }, + { + "epoch": 0.33812, + "grad_norm": 1.9375, + "grad_norm_var": 0.0810455322265625, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 1.953968107700348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462830364704132, + "step": 16906 + }, + { + "epoch": 0.33816, + "grad_norm": 2.0, + "grad_norm_var": 0.08123270670572917, + "learning_rate": 0.0001, + "loss": 4.3432, + "loss/crossentropy": 2.3202494382858276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21390480548143387, + "step": 16908 + }, + { + "epoch": 0.3382, + "grad_norm": 2.15625, + "grad_norm_var": 0.07872899373372395, + "learning_rate": 0.0001, + "loss": 4.3085, + "loss/crossentropy": 2.085767686367035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20946675539016724, + "step": 16910 + }, + { + "epoch": 0.33824, + "grad_norm": 2.0625, + "grad_norm_var": 0.07372945149739583, + "learning_rate": 0.0001, + "loss": 4.217, + "loss/crossentropy": 1.789705514907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2247294932603836, + "step": 16912 + }, + { + "epoch": 0.33828, + "grad_norm": 1.9140625, + "grad_norm_var": 0.07440770467122396, + "learning_rate": 0.0001, + "loss": 4.0409, + "loss/crossentropy": 1.9354140758514404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17958863079547882, + "step": 16914 + }, + { + "epoch": 0.33832, + "grad_norm": 2.015625, + "grad_norm_var": 0.07529296875, + "learning_rate": 0.0001, + "loss": 3.6949, + "loss/crossentropy": 1.878045916557312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812344640493393, + "step": 16916 + }, + { + "epoch": 0.33836, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0769488016764323, + "learning_rate": 0.0001, + "loss": 3.9392, + "loss/crossentropy": 1.7474132776260376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19444099068641663, + "step": 16918 + }, + { + "epoch": 0.3384, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007185618082682292, + "learning_rate": 0.0001, + "loss": 4.1524, + "loss/crossentropy": 2.0462751984596252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077970623970032, + "step": 16920 + }, + { + "epoch": 0.33844, + "grad_norm": 2.03125, + "grad_norm_var": 0.006912994384765625, + "learning_rate": 0.0001, + "loss": 4.205, + "loss/crossentropy": 2.1104917526245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21799270808696747, + "step": 16922 + }, + { + "epoch": 0.33848, + "grad_norm": 2.0625, + "grad_norm_var": 0.007083892822265625, + "learning_rate": 0.0001, + "loss": 4.1915, + "loss/crossentropy": 2.11525696516037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20585542917251587, + "step": 16924 + }, + { + "epoch": 0.33852, + "grad_norm": 1.96875, + "grad_norm_var": 0.005222320556640625, + "learning_rate": 0.0001, + "loss": 4.3629, + "loss/crossentropy": 2.3451485633850098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22672247886657715, + "step": 16926 + }, + { + "epoch": 0.33856, + "grad_norm": 2.03125, + "grad_norm_var": 0.004906209309895834, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 1.8872862458229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973365843296051, + "step": 16928 + }, + { + "epoch": 0.3386, + "grad_norm": 2.1875, + "grad_norm_var": 0.00777587890625, + "learning_rate": 0.0001, + "loss": 4.1082, + "loss/crossentropy": 1.9465213418006897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2297648787498474, + "step": 16930 + }, + { + "epoch": 0.33864, + "grad_norm": 1.96875, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.9397, + "loss/crossentropy": 1.7463516592979431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193784698843956, + "step": 16932 + }, + { + "epoch": 0.33868, + "grad_norm": 1.984375, + "grad_norm_var": 0.006738026936848958, + "learning_rate": 0.0001, + "loss": 4.1561, + "loss/crossentropy": 2.2214877605438232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064037024974823, + "step": 16934 + }, + { + "epoch": 0.33872, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007600657145182292, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 1.896401822566986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21860874444246292, + "step": 16936 + }, + { + "epoch": 0.33876, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007209269205729166, + "learning_rate": 0.0001, + "loss": 4.0701, + "loss/crossentropy": 2.508669376373291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23076221346855164, + "step": 16938 + }, + { + "epoch": 0.3388, + "grad_norm": 2.09375, + "grad_norm_var": 0.009266916910807292, + "learning_rate": 0.0001, + "loss": 3.8421, + "loss/crossentropy": 1.9412622451782227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833793818950653, + "step": 16940 + }, + { + "epoch": 0.33884, + "grad_norm": 2.484375, + "grad_norm_var": 0.025406646728515624, + "learning_rate": 0.0001, + "loss": 4.362, + "loss/crossentropy": 2.4757901430130005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23629899322986603, + "step": 16942 + }, + { + "epoch": 0.33888, + "grad_norm": 2.046875, + "grad_norm_var": 0.02580744425455729, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 1.788071870803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894787922501564, + "step": 16944 + }, + { + "epoch": 0.33892, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023436482747395834, + "learning_rate": 0.0001, + "loss": 4.0626, + "loss/crossentropy": 2.1902449131011963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21310994774103165, + "step": 16946 + }, + { + "epoch": 0.33896, + "grad_norm": 1.96875, + "grad_norm_var": 0.021394856770833335, + "learning_rate": 0.0001, + "loss": 3.9828, + "loss/crossentropy": 2.0444132685661316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755325198173523, + "step": 16948 + }, + { + "epoch": 0.339, + "grad_norm": 2.09375, + "grad_norm_var": 0.02200902303059896, + "learning_rate": 0.0001, + "loss": 4.1609, + "loss/crossentropy": 1.9242961406707764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18970132619142532, + "step": 16950 + }, + { + "epoch": 0.33904, + "grad_norm": 1.90625, + "grad_norm_var": 0.0216552734375, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 1.7046860456466675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19807563722133636, + "step": 16952 + }, + { + "epoch": 0.33908, + "grad_norm": 2.0625, + "grad_norm_var": 0.02184015909830729, + "learning_rate": 0.0001, + "loss": 3.8975, + "loss/crossentropy": 1.8960286974906921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17571083456277847, + "step": 16954 + }, + { + "epoch": 0.33912, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019481404622395834, + "learning_rate": 0.0001, + "loss": 3.922, + "loss/crossentropy": 1.9351304769515991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918834000825882, + "step": 16956 + }, + { + "epoch": 0.33916, + "grad_norm": 2.453125, + "grad_norm_var": 0.017728424072265624, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 1.974304735660553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828718692064285, + "step": 16958 + }, + { + "epoch": 0.3392, + "grad_norm": 1.921875, + "grad_norm_var": 0.017867024739583334, + "learning_rate": 0.0001, + "loss": 3.8506, + "loss/crossentropy": 2.3836190700531006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074722871184349, + "step": 16960 + }, + { + "epoch": 0.33924, + "grad_norm": 1.875, + "grad_norm_var": 0.018123372395833334, + "learning_rate": 0.0001, + "loss": 4.0827, + "loss/crossentropy": 2.1663198471069336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097027719020844, + "step": 16962 + }, + { + "epoch": 0.33928, + "grad_norm": 2.03125, + "grad_norm_var": 0.01810277303059896, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 2.275226354598999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22942040860652924, + "step": 16964 + }, + { + "epoch": 0.33932, + "grad_norm": 1.9375, + "grad_norm_var": 0.017796834309895832, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 1.7741501331329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822734773159027, + "step": 16966 + }, + { + "epoch": 0.33936, + "grad_norm": 1.984375, + "grad_norm_var": 0.017215983072916666, + "learning_rate": 0.0001, + "loss": 3.9016, + "loss/crossentropy": 1.791014850139618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008860930800438, + "step": 16968 + }, + { + "epoch": 0.3394, + "grad_norm": 2.046875, + "grad_norm_var": 0.016888173421223958, + "learning_rate": 0.0001, + "loss": 4.1941, + "loss/crossentropy": 1.9239555597305298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192842036485672, + "step": 16970 + }, + { + "epoch": 0.33944, + "grad_norm": 2.09375, + "grad_norm_var": 0.016788482666015625, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 2.1543636322021484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20888526737689972, + "step": 16972 + }, + { + "epoch": 0.33948, + "grad_norm": 1.84375, + "grad_norm_var": 0.005012003580729166, + "learning_rate": 0.0001, + "loss": 3.975, + "loss/crossentropy": 2.2322418093681335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19024743884801865, + "step": 16974 + }, + { + "epoch": 0.33952, + "grad_norm": 2.109375, + "grad_norm_var": 0.012422434488932292, + "learning_rate": 0.0001, + "loss": 4.3208, + "loss/crossentropy": 1.9934163093566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26193471997976303, + "step": 16976 + }, + { + "epoch": 0.33956, + "grad_norm": 2.03125, + "grad_norm_var": 0.013244374593098959, + "learning_rate": 0.0001, + "loss": 4.2753, + "loss/crossentropy": 2.340088725090027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110593616962433, + "step": 16978 + }, + { + "epoch": 0.3396, + "grad_norm": 1.890625, + "grad_norm_var": 0.03021214803059896, + "learning_rate": 0.0001, + "loss": 4.2047, + "loss/crossentropy": 2.057206869125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071269005537033, + "step": 16980 + }, + { + "epoch": 0.33964, + "grad_norm": 1.953125, + "grad_norm_var": 0.030210113525390624, + "learning_rate": 0.0001, + "loss": 4.0262, + "loss/crossentropy": 2.002032458782196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19903026521205902, + "step": 16982 + }, + { + "epoch": 0.33968, + "grad_norm": 2.0, + "grad_norm_var": 0.030804189046223958, + "learning_rate": 0.0001, + "loss": 4.246, + "loss/crossentropy": 2.01567679643631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198361575603485, + "step": 16984 + }, + { + "epoch": 0.33972, + "grad_norm": 1.8515625, + "grad_norm_var": 0.032814280192057295, + "learning_rate": 0.0001, + "loss": 4.0606, + "loss/crossentropy": 1.661778211593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16601823270320892, + "step": 16986 + }, + { + "epoch": 0.33976, + "grad_norm": 2.015625, + "grad_norm_var": 0.03467203776041667, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.284485101699829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108629018068314, + "step": 16988 + }, + { + "epoch": 0.3398, + "grad_norm": 2.03125, + "grad_norm_var": 0.03200861612955729, + "learning_rate": 0.0001, + "loss": 4.1066, + "loss/crossentropy": 2.1860578656196594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22195414453744888, + "step": 16990 + }, + { + "epoch": 0.33984, + "grad_norm": 1.984375, + "grad_norm_var": 0.02719904581705729, + "learning_rate": 0.0001, + "loss": 4.0212, + "loss/crossentropy": 2.032013416290283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19930274784564972, + "step": 16992 + }, + { + "epoch": 0.33988, + "grad_norm": 2.0625, + "grad_norm_var": 0.1898577372233073, + "learning_rate": 0.0001, + "loss": 4.2847, + "loss/crossentropy": 2.227185010910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19806954264640808, + "step": 16994 + }, + { + "epoch": 0.33992, + "grad_norm": 1.9609375, + "grad_norm_var": 0.17535807291666666, + "learning_rate": 0.0001, + "loss": 4.1526, + "loss/crossentropy": 2.222020983695984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2354518324136734, + "step": 16996 + }, + { + "epoch": 0.33996, + "grad_norm": 2.109375, + "grad_norm_var": 0.1755767822265625, + "learning_rate": 0.0001, + "loss": 4.1766, + "loss/crossentropy": 2.336432456970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555255770683289, + "step": 16998 + }, + { + "epoch": 0.34, + "grad_norm": 1.96875, + "grad_norm_var": 0.1748443603515625, + "learning_rate": 0.0001, + "loss": 4.3312, + "loss/crossentropy": 2.17374986410141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21119412034749985, + "step": 17000 + }, + { + "epoch": 0.34004, + "grad_norm": 2.203125, + "grad_norm_var": 0.17195536295572916, + "learning_rate": 0.0001, + "loss": 4.1445, + "loss/crossentropy": 2.135382056236267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263253927230835, + "step": 17002 + }, + { + "epoch": 0.34008, + "grad_norm": 2.09375, + "grad_norm_var": 0.20149917602539064, + "learning_rate": 0.0001, + "loss": 3.811, + "loss/crossentropy": 1.9646037220954895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890808522701263, + "step": 17004 + }, + { + "epoch": 0.34012, + "grad_norm": 1.921875, + "grad_norm_var": 0.20098368326822916, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 1.8009583353996277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882813572883606, + "step": 17006 + }, + { + "epoch": 0.34016, + "grad_norm": 3.109375, + "grad_norm_var": 0.24433492024739584, + "learning_rate": 0.0001, + "loss": 4.4921, + "loss/crossentropy": 2.0362982153892517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208044171333313, + "step": 17008 + }, + { + "epoch": 0.3402, + "grad_norm": 1.9765625, + "grad_norm_var": 0.11477432250976563, + "learning_rate": 0.0001, + "loss": 4.1781, + "loss/crossentropy": 2.2055057287216187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21141493320465088, + "step": 17010 + }, + { + "epoch": 0.34024, + "grad_norm": 2.0, + "grad_norm_var": 0.11963475545247396, + "learning_rate": 0.0001, + "loss": 3.772, + "loss/crossentropy": 1.386088252067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16025834530591965, + "step": 17012 + }, + { + "epoch": 0.34028, + "grad_norm": 2.1875, + "grad_norm_var": 0.11599299112955729, + "learning_rate": 0.0001, + "loss": 4.2389, + "loss/crossentropy": 2.131182312965393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460083335638046, + "step": 17014 + }, + { + "epoch": 0.34032, + "grad_norm": 2.015625, + "grad_norm_var": 0.11592992146809895, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.2682281732559204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22915159910917282, + "step": 17016 + }, + { + "epoch": 0.34036, + "grad_norm": 1.96875, + "grad_norm_var": 0.12011693318684896, + "learning_rate": 0.0001, + "loss": 4.0521, + "loss/crossentropy": 2.0855059027671814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398894160985947, + "step": 17018 + }, + { + "epoch": 0.3404, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0842437744140625, + "learning_rate": 0.0001, + "loss": 4.0421, + "loss/crossentropy": 1.895095944404602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172701731324196, + "step": 17020 + }, + { + "epoch": 0.34044, + "grad_norm": 2.0, + "grad_norm_var": 0.08455403645833333, + "learning_rate": 0.0001, + "loss": 4.0814, + "loss/crossentropy": 2.10223788022995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144509255886078, + "step": 17022 + }, + { + "epoch": 0.34048, + "grad_norm": 2.03125, + "grad_norm_var": 0.014241282145182292, + "learning_rate": 0.0001, + "loss": 4.1211, + "loss/crossentropy": 2.1131081581115723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21176592260599136, + "step": 17024 + }, + { + "epoch": 0.34052, + "grad_norm": 2.0, + "grad_norm_var": 0.009991200764973958, + "learning_rate": 0.0001, + "loss": 4.1581, + "loss/crossentropy": 2.1125452518463135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931193768978119, + "step": 17026 + }, + { + "epoch": 0.34056, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017775217692057293, + "learning_rate": 0.0001, + "loss": 4.2624, + "loss/crossentropy": 2.213385283946991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2576863020658493, + "step": 17028 + }, + { + "epoch": 0.3406, + "grad_norm": 2.171875, + "grad_norm_var": 0.017773183186848958, + "learning_rate": 0.0001, + "loss": 4.408, + "loss/crossentropy": 2.0793206095695496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20338336378335953, + "step": 17030 + }, + { + "epoch": 0.34064, + "grad_norm": 2.078125, + "grad_norm_var": 0.01807225545247396, + "learning_rate": 0.0001, + "loss": 4.3102, + "loss/crossentropy": 1.8870239853858948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18544750660657883, + "step": 17032 + }, + { + "epoch": 0.34068, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017254384358723958, + "learning_rate": 0.0001, + "loss": 4.1538, + "loss/crossentropy": 1.9893989562988281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19679997861385345, + "step": 17034 + }, + { + "epoch": 0.34072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01845270792643229, + "learning_rate": 0.0001, + "loss": 3.9447, + "loss/crossentropy": 2.0401668548583984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19242482632398605, + "step": 17036 + }, + { + "epoch": 0.34076, + "grad_norm": 2.109375, + "grad_norm_var": 0.015472157796223959, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 2.0786415934562683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21781006455421448, + "step": 17038 + }, + { + "epoch": 0.3408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.015533192952473959, + "learning_rate": 0.0001, + "loss": 3.9472, + "loss/crossentropy": 2.143572211265564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20608216524124146, + "step": 17040 + }, + { + "epoch": 0.34084, + "grad_norm": 2.0625, + "grad_norm_var": 0.0154693603515625, + "learning_rate": 0.0001, + "loss": 4.3305, + "loss/crossentropy": 2.0578572750091553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22006108611822128, + "step": 17042 + }, + { + "epoch": 0.34088, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0061920166015625, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.1016032099723816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059948667883873, + "step": 17044 + }, + { + "epoch": 0.34092, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00438232421875, + "learning_rate": 0.0001, + "loss": 3.9293, + "loss/crossentropy": 1.814449965953827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946963220834732, + "step": 17046 + }, + { + "epoch": 0.34096, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004233551025390625, + "learning_rate": 0.0001, + "loss": 4.0696, + "loss/crossentropy": 2.2421000599861145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2114911824464798, + "step": 17048 + }, + { + "epoch": 0.341, + "grad_norm": 2.125, + "grad_norm_var": 0.0070231119791666664, + "learning_rate": 0.0001, + "loss": 3.984, + "loss/crossentropy": 2.1200218200683594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212583489716053, + "step": 17050 + }, + { + "epoch": 0.34104, + "grad_norm": 1.921875, + "grad_norm_var": 0.007441965738932291, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.9486305713653564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631484359502792, + "step": 17052 + }, + { + "epoch": 0.34108, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007721964518229167, + "learning_rate": 0.0001, + "loss": 4.3888, + "loss/crossentropy": 2.4838292598724365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23271672427654266, + "step": 17054 + }, + { + "epoch": 0.34112, + "grad_norm": 1.96875, + "grad_norm_var": 0.008137766520182292, + "learning_rate": 0.0001, + "loss": 3.9099, + "loss/crossentropy": 1.8606489896774292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955883800983429, + "step": 17056 + }, + { + "epoch": 0.34116, + "grad_norm": 2.15625, + "grad_norm_var": 0.013199615478515624, + "learning_rate": 0.0001, + "loss": 4.2847, + "loss/crossentropy": 2.1702204942703247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21619878709316254, + "step": 17058 + }, + { + "epoch": 0.3412, + "grad_norm": 2.015625, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.1987, + "loss/crossentropy": 2.1011393070220947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120063453912735, + "step": 17060 + }, + { + "epoch": 0.34124, + "grad_norm": 1.953125, + "grad_norm_var": 0.013142903645833334, + "learning_rate": 0.0001, + "loss": 4.1704, + "loss/crossentropy": 2.267427682876587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20903929322957993, + "step": 17062 + }, + { + "epoch": 0.34128, + "grad_norm": 1.984375, + "grad_norm_var": 0.01620457967122396, + "learning_rate": 0.0001, + "loss": 4.3312, + "loss/crossentropy": 2.125304937362671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18540818989276886, + "step": 17064 + }, + { + "epoch": 0.34132, + "grad_norm": 1.90625, + "grad_norm_var": 0.013925933837890625, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.8741024136543274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19243815541267395, + "step": 17066 + }, + { + "epoch": 0.34136, + "grad_norm": 2.46875, + "grad_norm_var": 0.023451487223307293, + "learning_rate": 0.0001, + "loss": 4.0252, + "loss/crossentropy": 2.12781822681427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23974300920963287, + "step": 17068 + }, + { + "epoch": 0.3414, + "grad_norm": 2.15625, + "grad_norm_var": 0.02448298136393229, + "learning_rate": 0.0001, + "loss": 4.2728, + "loss/crossentropy": 2.0786141753196716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22311320155858994, + "step": 17070 + }, + { + "epoch": 0.34144, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02408421834309896, + "learning_rate": 0.0001, + "loss": 4.1686, + "loss/crossentropy": 2.1569892168045044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391946285963058, + "step": 17072 + }, + { + "epoch": 0.34148, + "grad_norm": 1.90625, + "grad_norm_var": 0.024607086181640626, + "learning_rate": 0.0001, + "loss": 3.8035, + "loss/crossentropy": 1.8860353231430054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937229335308075, + "step": 17074 + }, + { + "epoch": 0.34152, + "grad_norm": 2.0, + "grad_norm_var": 0.02448094685872396, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 2.25645911693573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088249772787094, + "step": 17076 + }, + { + "epoch": 0.34156, + "grad_norm": 2.328125, + "grad_norm_var": 0.02972997029622396, + "learning_rate": 0.0001, + "loss": 4.3486, + "loss/crossentropy": 2.379481792449951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2341819554567337, + "step": 17078 + }, + { + "epoch": 0.3416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0273345947265625, + "learning_rate": 0.0001, + "loss": 4.0841, + "loss/crossentropy": 2.1564733386039734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162168025970459, + "step": 17080 + }, + { + "epoch": 0.34164, + "grad_norm": 3.015625, + "grad_norm_var": 0.0860260009765625, + "learning_rate": 0.0001, + "loss": 4.2526, + "loss/crossentropy": 1.8906886577606201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18614360690116882, + "step": 17082 + }, + { + "epoch": 0.34168, + "grad_norm": 2.03125, + "grad_norm_var": 0.07732747395833334, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 2.093637704849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867172092199326, + "step": 17084 + }, + { + "epoch": 0.34172, + "grad_norm": 2.03125, + "grad_norm_var": 0.0760210673014323, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 2.2579731941223145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2153376340866089, + "step": 17086 + }, + { + "epoch": 0.34176, + "grad_norm": 2.15625, + "grad_norm_var": 0.07312393188476562, + "learning_rate": 0.0001, + "loss": 4.3275, + "loss/crossentropy": 2.250504732131958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2759099751710892, + "step": 17088 + }, + { + "epoch": 0.3418, + "grad_norm": 1.921875, + "grad_norm_var": 0.07190653483072916, + "learning_rate": 0.0001, + "loss": 3.8977, + "loss/crossentropy": 1.6062138676643372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15309542417526245, + "step": 17090 + }, + { + "epoch": 0.34184, + "grad_norm": 1.8828125, + "grad_norm_var": 0.07587865193684896, + "learning_rate": 0.0001, + "loss": 4.2199, + "loss/crossentropy": 2.227494239807129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20563329756259918, + "step": 17092 + }, + { + "epoch": 0.34188, + "grad_norm": 1.921875, + "grad_norm_var": 0.07314631144205729, + "learning_rate": 0.0001, + "loss": 4.1095, + "loss/crossentropy": 2.1047890186309814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19775691628456116, + "step": 17094 + }, + { + "epoch": 0.34192, + "grad_norm": 2.015625, + "grad_norm_var": 0.07301813761393229, + "learning_rate": 0.0001, + "loss": 4.0508, + "loss/crossentropy": 2.044450581073761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867002964019775, + "step": 17096 + }, + { + "epoch": 0.34196, + "grad_norm": 1.953125, + "grad_norm_var": 0.009511057535807292, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 1.9139947891235352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20143548399209976, + "step": 17098 + }, + { + "epoch": 0.342, + "grad_norm": 1.9375, + "grad_norm_var": 0.008162180582682291, + "learning_rate": 0.0001, + "loss": 3.7159, + "loss/crossentropy": 1.6752634048461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17691873013973236, + "step": 17100 + }, + { + "epoch": 0.34204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0156402587890625, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.277552843093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049017697572708, + "step": 17102 + }, + { + "epoch": 0.34208, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015608469645182291, + "learning_rate": 0.0001, + "loss": 4.0663, + "loss/crossentropy": 2.458711266517639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21726765483617783, + "step": 17104 + }, + { + "epoch": 0.34212, + "grad_norm": 2.171875, + "grad_norm_var": 0.01627197265625, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 1.9302194714546204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19446631520986557, + "step": 17106 + }, + { + "epoch": 0.34216, + "grad_norm": 1.890625, + "grad_norm_var": 0.01622314453125, + "learning_rate": 0.0001, + "loss": 3.8935, + "loss/crossentropy": 2.021213114261627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19394385814666748, + "step": 17108 + }, + { + "epoch": 0.3422, + "grad_norm": 2.0, + "grad_norm_var": 0.01689453125, + "learning_rate": 0.0001, + "loss": 4.4094, + "loss/crossentropy": 2.4856090545654297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156004160642624, + "step": 17110 + }, + { + "epoch": 0.34224, + "grad_norm": 2.09375, + "grad_norm_var": 0.016869862874348957, + "learning_rate": 0.0001, + "loss": 4.4135, + "loss/crossentropy": 2.410070061683655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203476190567017, + "step": 17112 + }, + { + "epoch": 0.34228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.019136555989583335, + "learning_rate": 0.0001, + "loss": 4.0484, + "loss/crossentropy": 2.026827871799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20053605735301971, + "step": 17114 + }, + { + "epoch": 0.34232, + "grad_norm": 2.09375, + "grad_norm_var": 0.016283162434895835, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 2.1833966970443726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2193789780139923, + "step": 17116 + }, + { + "epoch": 0.34236, + "grad_norm": 2.109375, + "grad_norm_var": 0.010798899332682292, + "learning_rate": 0.0001, + "loss": 4.0921, + "loss/crossentropy": 2.137472152709961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21218669414520264, + "step": 17118 + }, + { + "epoch": 0.3424, + "grad_norm": 1.96875, + "grad_norm_var": 0.013741048177083333, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.0607420206069946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138473093509674, + "step": 17120 + }, + { + "epoch": 0.34244, + "grad_norm": 1.984375, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 4.1787, + "loss/crossentropy": 1.9109066724777222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130306482315063, + "step": 17122 + }, + { + "epoch": 0.34248, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010944620768229166, + "learning_rate": 0.0001, + "loss": 3.7531, + "loss/crossentropy": 2.1049917936325073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20878130197525024, + "step": 17124 + }, + { + "epoch": 0.34252, + "grad_norm": 2.03125, + "grad_norm_var": 0.010008748372395833, + "learning_rate": 0.0001, + "loss": 4.0951, + "loss/crossentropy": 1.9087567925453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18959704041481018, + "step": 17126 + }, + { + "epoch": 0.34256, + "grad_norm": 2.125, + "grad_norm_var": 0.010033162434895833, + "learning_rate": 0.0001, + "loss": 4.3854, + "loss/crossentropy": 2.491095781326294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21739569306373596, + "step": 17128 + }, + { + "epoch": 0.3426, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0081207275390625, + "learning_rate": 0.0001, + "loss": 4.0227, + "loss/crossentropy": 2.045976400375366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18868304044008255, + "step": 17130 + }, + { + "epoch": 0.34264, + "grad_norm": 2.109375, + "grad_norm_var": 0.009318033854166666, + "learning_rate": 0.0001, + "loss": 4.1221, + "loss/crossentropy": 2.021378517150879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955355852842331, + "step": 17132 + }, + { + "epoch": 0.34268, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010247548421223959, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.052547872066498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18654850870370865, + "step": 17134 + }, + { + "epoch": 0.34272, + "grad_norm": 2.09375, + "grad_norm_var": 0.008886464436848958, + "learning_rate": 0.0001, + "loss": 4.2536, + "loss/crossentropy": 2.1709023118019104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21335267275571823, + "step": 17136 + }, + { + "epoch": 0.34276, + "grad_norm": 1.921875, + "grad_norm_var": 0.010477701822916666, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.310550093650818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18719495832920074, + "step": 17138 + }, + { + "epoch": 0.3428, + "grad_norm": 2.046875, + "grad_norm_var": 0.011262003580729167, + "learning_rate": 0.0001, + "loss": 4.0145, + "loss/crossentropy": 1.9357584714889526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914527416229248, + "step": 17140 + }, + { + "epoch": 0.34284, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012684885660807292, + "learning_rate": 0.0001, + "loss": 3.9673, + "loss/crossentropy": 1.7561541199684143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1711764633655548, + "step": 17142 + }, + { + "epoch": 0.34288, + "grad_norm": 2.515625, + "grad_norm_var": 0.7546994527180989, + "learning_rate": 0.0001, + "loss": 4.562, + "loss/crossentropy": 2.4081480503082275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2405518889427185, + "step": 17144 + }, + { + "epoch": 0.34292, + "grad_norm": 2.34375, + "grad_norm_var": 0.7456451416015625, + "learning_rate": 0.0001, + "loss": 4.4023, + "loss/crossentropy": 2.44531512260437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23487353324890137, + "step": 17146 + }, + { + "epoch": 0.34296, + "grad_norm": 1.9296875, + "grad_norm_var": 0.7445696512858073, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.9712305068969727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20045534521341324, + "step": 17148 + }, + { + "epoch": 0.343, + "grad_norm": 1.890625, + "grad_norm_var": 0.7468706766764323, + "learning_rate": 0.0001, + "loss": 3.8479, + "loss/crossentropy": 1.9718485474586487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18550659716129303, + "step": 17150 + }, + { + "epoch": 0.34304, + "grad_norm": 1.9453125, + "grad_norm_var": 0.7527414957682291, + "learning_rate": 0.0001, + "loss": 4.2708, + "loss/crossentropy": 2.1901930570602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082839384675026, + "step": 17152 + }, + { + "epoch": 0.34308, + "grad_norm": 1.953125, + "grad_norm_var": 0.7463905334472656, + "learning_rate": 0.0001, + "loss": 4.2104, + "loss/crossentropy": 2.098864793777466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19527551531791687, + "step": 17154 + }, + { + "epoch": 0.34312, + "grad_norm": 2.0, + "grad_norm_var": 0.7467437744140625, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 1.7448402643203735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862412989139557, + "step": 17156 + }, + { + "epoch": 0.34316, + "grad_norm": 2.109375, + "grad_norm_var": 0.7429603576660156, + "learning_rate": 0.0001, + "loss": 4.1016, + "loss/crossentropy": 1.9973859190940857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645331799983978, + "step": 17158 + }, + { + "epoch": 0.3432, + "grad_norm": 1.8515625, + "grad_norm_var": 0.016874186197916665, + "learning_rate": 0.0001, + "loss": 4.2022, + "loss/crossentropy": 2.2570079565048218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20290197432041168, + "step": 17160 + }, + { + "epoch": 0.34324, + "grad_norm": 2.15625, + "grad_norm_var": 0.008642323811848958, + "learning_rate": 0.0001, + "loss": 4.2304, + "loss/crossentropy": 1.7943695187568665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924503892660141, + "step": 17162 + }, + { + "epoch": 0.34328, + "grad_norm": 2.0, + "grad_norm_var": 0.008225250244140624, + "learning_rate": 0.0001, + "loss": 4.2142, + "loss/crossentropy": 2.1875799894332886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968172788619995, + "step": 17164 + }, + { + "epoch": 0.34332, + "grad_norm": 2.0625, + "grad_norm_var": 0.007477823893229167, + "learning_rate": 0.0001, + "loss": 4.1844, + "loss/crossentropy": 2.242374360561371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21981361508369446, + "step": 17166 + }, + { + "epoch": 0.34336, + "grad_norm": 2.046875, + "grad_norm_var": 0.006754302978515625, + "learning_rate": 0.0001, + "loss": 4.0406, + "loss/crossentropy": 1.8542492985725403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19412916153669357, + "step": 17168 + }, + { + "epoch": 0.3434, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0067860921223958336, + "learning_rate": 0.0001, + "loss": 3.8804, + "loss/crossentropy": 1.9377904534339905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20015903562307358, + "step": 17170 + }, + { + "epoch": 0.34344, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006319173177083333, + "learning_rate": 0.0001, + "loss": 4.1691, + "loss/crossentropy": 2.2250781655311584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943892240524292, + "step": 17172 + }, + { + "epoch": 0.34348, + "grad_norm": 1.984375, + "grad_norm_var": 0.0054094950358072914, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.0463303923606873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931215077638626, + "step": 17174 + }, + { + "epoch": 0.34352, + "grad_norm": 2.078125, + "grad_norm_var": 0.004984537760416667, + "learning_rate": 0.0001, + "loss": 3.826, + "loss/crossentropy": 2.0576277375221252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24007725715637207, + "step": 17176 + }, + { + "epoch": 0.34356, + "grad_norm": 1.859375, + "grad_norm_var": 0.003999582926432292, + "learning_rate": 0.0001, + "loss": 3.9865, + "loss/crossentropy": 1.8423307538032532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17171106487512589, + "step": 17178 + }, + { + "epoch": 0.3436, + "grad_norm": 1.984375, + "grad_norm_var": 0.0053375244140625, + "learning_rate": 0.0001, + "loss": 4.3089, + "loss/crossentropy": 2.05659818649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20058965682983398, + "step": 17180 + }, + { + "epoch": 0.34364, + "grad_norm": 2.046875, + "grad_norm_var": 0.0055620829264322914, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 1.926946997642517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1770440638065338, + "step": 17182 + }, + { + "epoch": 0.34368, + "grad_norm": 2.015625, + "grad_norm_var": 0.005098215738932292, + "learning_rate": 0.0001, + "loss": 3.9781, + "loss/crossentropy": 1.8181686401367188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19718249142169952, + "step": 17184 + }, + { + "epoch": 0.34372, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 2.021128237247467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968582272529602, + "step": 17186 + }, + { + "epoch": 0.34376, + "grad_norm": 2.109375, + "grad_norm_var": 0.008365631103515625, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 2.04776668548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22718969732522964, + "step": 17188 + }, + { + "epoch": 0.3438, + "grad_norm": 1.890625, + "grad_norm_var": 0.008957672119140624, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.2225213050842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369096100330353, + "step": 17190 + }, + { + "epoch": 0.34384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008089192708333333, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 1.8544179201126099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437808334827423, + "step": 17192 + }, + { + "epoch": 0.34388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006730143229166667, + "learning_rate": 0.0001, + "loss": 4.387, + "loss/crossentropy": 2.1270517110824585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997273936867714, + "step": 17194 + }, + { + "epoch": 0.34392, + "grad_norm": 1.984375, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 3.8062, + "loss/crossentropy": 1.8699182271957397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382557272911072, + "step": 17196 + }, + { + "epoch": 0.34396, + "grad_norm": 2.046875, + "grad_norm_var": 0.006941731770833333, + "learning_rate": 0.0001, + "loss": 3.9864, + "loss/crossentropy": 1.8734498023986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18053626269102097, + "step": 17198 + }, + { + "epoch": 0.344, + "grad_norm": 1.890625, + "grad_norm_var": 0.008097330729166666, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 1.9386133551597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851777583360672, + "step": 17200 + }, + { + "epoch": 0.34404, + "grad_norm": 2.015625, + "grad_norm_var": 0.0069163004557291664, + "learning_rate": 0.0001, + "loss": 3.9619, + "loss/crossentropy": 2.1074278354644775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22176695615053177, + "step": 17202 + }, + { + "epoch": 0.34408, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005895741780598958, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 1.826387107372284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19368141889572144, + "step": 17204 + }, + { + "epoch": 0.34412, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 4.1367, + "loss/crossentropy": 1.9109328389167786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023227959871292, + "step": 17206 + }, + { + "epoch": 0.34416, + "grad_norm": 2.09375, + "grad_norm_var": 0.006517537434895833, + "learning_rate": 0.0001, + "loss": 4.1495, + "loss/crossentropy": 1.8752732872962952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17867545038461685, + "step": 17208 + }, + { + "epoch": 0.3442, + "grad_norm": 1.828125, + "grad_norm_var": 0.00972900390625, + "learning_rate": 0.0001, + "loss": 3.9998, + "loss/crossentropy": 1.95436429977417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19947312772274017, + "step": 17210 + }, + { + "epoch": 0.34424, + "grad_norm": 1.984375, + "grad_norm_var": 0.00947265625, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 1.9237809777259827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055458500981331, + "step": 17212 + }, + { + "epoch": 0.34428, + "grad_norm": 1.9375, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.4497138261795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23067011684179306, + "step": 17214 + }, + { + "epoch": 0.34432, + "grad_norm": 1.859375, + "grad_norm_var": 0.007791900634765625, + "learning_rate": 0.0001, + "loss": 3.9144, + "loss/crossentropy": 2.33384370803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22732949256896973, + "step": 17216 + }, + { + "epoch": 0.34436, + "grad_norm": 2.0, + "grad_norm_var": 0.0075439453125, + "learning_rate": 0.0001, + "loss": 3.9414, + "loss/crossentropy": 2.022156059741974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063901051878929, + "step": 17218 + }, + { + "epoch": 0.3444, + "grad_norm": 1.90625, + "grad_norm_var": 0.007614898681640625, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 2.183963179588318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21098726987838745, + "step": 17220 + }, + { + "epoch": 0.34444, + "grad_norm": 6.875, + "grad_norm_var": 1.501227823893229, + "learning_rate": 0.0001, + "loss": 4.3139, + "loss/crossentropy": 2.256633758544922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21449647843837738, + "step": 17222 + }, + { + "epoch": 0.34448, + "grad_norm": 14.3125, + "grad_norm_var": 10.478612263997396, + "learning_rate": 0.0001, + "loss": 3.838, + "loss/crossentropy": 1.8303287625312805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18412896245718002, + "step": 17224 + }, + { + "epoch": 0.34452, + "grad_norm": 2.0625, + "grad_norm_var": 10.405972290039063, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.3248833417892456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23459070920944214, + "step": 17226 + }, + { + "epoch": 0.34456, + "grad_norm": 2.109375, + "grad_norm_var": 10.418485514322917, + "learning_rate": 0.0001, + "loss": 4.062, + "loss/crossentropy": 2.018009066581726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21266046166419983, + "step": 17228 + }, + { + "epoch": 0.3446, + "grad_norm": 1.984375, + "grad_norm_var": 10.434609985351562, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.337665557861328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225304514169693, + "step": 17230 + }, + { + "epoch": 0.34464, + "grad_norm": 1.8046875, + "grad_norm_var": 10.453043365478516, + "learning_rate": 0.0001, + "loss": 3.86, + "loss/crossentropy": 1.8581790924072266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18770471215248108, + "step": 17232 + }, + { + "epoch": 0.34468, + "grad_norm": 1.8515625, + "grad_norm_var": 10.462247721354167, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.9208187460899353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19106490910053253, + "step": 17234 + }, + { + "epoch": 0.34472, + "grad_norm": 1.8515625, + "grad_norm_var": 10.454630279541016, + "learning_rate": 0.0001, + "loss": 3.8284, + "loss/crossentropy": 1.8844050765037537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17407642304897308, + "step": 17236 + }, + { + "epoch": 0.34476, + "grad_norm": 1.9296875, + "grad_norm_var": 9.497085571289062, + "learning_rate": 0.0001, + "loss": 3.7423, + "loss/crossentropy": 2.0443355441093445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198170468211174, + "step": 17238 + }, + { + "epoch": 0.3448, + "grad_norm": 2.046875, + "grad_norm_var": 0.027342732747395834, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.8563520908355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958291083574295, + "step": 17240 + }, + { + "epoch": 0.34484, + "grad_norm": 2.015625, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 4.3028, + "loss/crossentropy": 2.1252214908599854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386425733566284, + "step": 17242 + }, + { + "epoch": 0.34488, + "grad_norm": 1.9375, + "grad_norm_var": 0.011848958333333333, + "learning_rate": 0.0001, + "loss": 3.926, + "loss/crossentropy": 2.1379681825637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19781935214996338, + "step": 17244 + }, + { + "epoch": 0.34492, + "grad_norm": 1.828125, + "grad_norm_var": 0.013529205322265625, + "learning_rate": 0.0001, + "loss": 3.7403, + "loss/crossentropy": 1.9774981141090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523120880126953, + "step": 17246 + }, + { + "epoch": 0.34496, + "grad_norm": 2.015625, + "grad_norm_var": 0.0120025634765625, + "learning_rate": 0.0001, + "loss": 4.2668, + "loss/crossentropy": 2.3234479427337646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21939057111740112, + "step": 17248 + }, + { + "epoch": 0.345, + "grad_norm": 2.109375, + "grad_norm_var": 0.011706288655598958, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.0521583557128906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204473286867142, + "step": 17250 + }, + { + "epoch": 0.34504, + "grad_norm": 1.90625, + "grad_norm_var": 0.03452860514322917, + "learning_rate": 0.0001, + "loss": 4.2184, + "loss/crossentropy": 2.486477255821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21673013269901276, + "step": 17252 + }, + { + "epoch": 0.34508, + "grad_norm": 1.8515625, + "grad_norm_var": 0.035456339518229164, + "learning_rate": 0.0001, + "loss": 4.0215, + "loss/crossentropy": 2.285652995109558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21322200447320938, + "step": 17254 + }, + { + "epoch": 0.34512, + "grad_norm": 2.046875, + "grad_norm_var": 0.03436686197916667, + "learning_rate": 0.0001, + "loss": 4.1855, + "loss/crossentropy": 2.0946747064590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19791530817747116, + "step": 17256 + }, + { + "epoch": 0.34516, + "grad_norm": 2.09375, + "grad_norm_var": 0.03250732421875, + "learning_rate": 0.0001, + "loss": 4.1129, + "loss/crossentropy": 1.7509565353393555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16754039376974106, + "step": 17258 + }, + { + "epoch": 0.3452, + "grad_norm": 2.03125, + "grad_norm_var": 0.0325347900390625, + "learning_rate": 0.0001, + "loss": 4.3411, + "loss/crossentropy": 2.3828574419021606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22555553913116455, + "step": 17260 + }, + { + "epoch": 0.34524, + "grad_norm": 2.140625, + "grad_norm_var": 0.029060872395833333, + "learning_rate": 0.0001, + "loss": 4.215, + "loss/crossentropy": 1.997941255569458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815095767378807, + "step": 17262 + }, + { + "epoch": 0.34528, + "grad_norm": 2.25, + "grad_norm_var": 0.0296630859375, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 2.3036177158355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2305641770362854, + "step": 17264 + }, + { + "epoch": 0.34532, + "grad_norm": 1.8984375, + "grad_norm_var": 0.03178075154622396, + "learning_rate": 0.0001, + "loss": 4.1668, + "loss/crossentropy": 2.202543616294861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19594155251979828, + "step": 17266 + }, + { + "epoch": 0.34536, + "grad_norm": 1.875, + "grad_norm_var": 0.011199696858723959, + "learning_rate": 0.0001, + "loss": 4.1792, + "loss/crossentropy": 2.1162038445472717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945410966873169, + "step": 17268 + }, + { + "epoch": 0.3454, + "grad_norm": 1.90625, + "grad_norm_var": 0.009883626302083334, + "learning_rate": 0.0001, + "loss": 4.1109, + "loss/crossentropy": 2.273344039916992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24327096343040466, + "step": 17270 + }, + { + "epoch": 0.34544, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011351521809895833, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.851362407207489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18494703620672226, + "step": 17272 + }, + { + "epoch": 0.34548, + "grad_norm": 2.046875, + "grad_norm_var": 0.01114501953125, + "learning_rate": 0.0001, + "loss": 4.0846, + "loss/crossentropy": 1.9861729145050049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20077189058065414, + "step": 17274 + }, + { + "epoch": 0.34552, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011716461181640625, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 2.0234099626541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20506415516138077, + "step": 17276 + }, + { + "epoch": 0.34556, + "grad_norm": 1.90625, + "grad_norm_var": 0.012326812744140625, + "learning_rate": 0.0001, + "loss": 3.9843, + "loss/crossentropy": 2.1666045784950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18108145147562027, + "step": 17278 + }, + { + "epoch": 0.3456, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0060536702473958336, + "learning_rate": 0.0001, + "loss": 4.0877, + "loss/crossentropy": 2.066905975341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112976461648941, + "step": 17280 + }, + { + "epoch": 0.34564, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0059722900390625, + "learning_rate": 0.0001, + "loss": 4.224, + "loss/crossentropy": 2.1789294481277466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20467212051153183, + "step": 17282 + }, + { + "epoch": 0.34568, + "grad_norm": 1.953125, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 1.8310245275497437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974945366382599, + "step": 17284 + }, + { + "epoch": 0.34572, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006563059488932292, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.033496856689453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989584118127823, + "step": 17286 + }, + { + "epoch": 0.34576, + "grad_norm": 2.0625, + "grad_norm_var": 0.0091705322265625, + "learning_rate": 0.0001, + "loss": 3.8748, + "loss/crossentropy": 1.8883211016654968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889789029955864, + "step": 17288 + }, + { + "epoch": 0.3458, + "grad_norm": 2.015625, + "grad_norm_var": 0.008495076497395834, + "learning_rate": 0.0001, + "loss": 4.1637, + "loss/crossentropy": 2.4520593881607056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183564007282257, + "step": 17290 + }, + { + "epoch": 0.34584, + "grad_norm": 2.046875, + "grad_norm_var": 0.008272298177083333, + "learning_rate": 0.0001, + "loss": 3.9375, + "loss/crossentropy": 2.030746340751648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382008165121078, + "step": 17292 + }, + { + "epoch": 0.34588, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007295735677083333, + "learning_rate": 0.0001, + "loss": 4.2182, + "loss/crossentropy": 1.9897491931915283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21130450069904327, + "step": 17294 + }, + { + "epoch": 0.34592, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 3.982, + "loss/crossentropy": 1.8000388145446777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359987765550613, + "step": 17296 + }, + { + "epoch": 0.34596, + "grad_norm": 1.984375, + "grad_norm_var": 0.0065610249837239586, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 2.0408164262771606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18042601644992828, + "step": 17298 + }, + { + "epoch": 0.346, + "grad_norm": 2.015625, + "grad_norm_var": 0.006742350260416667, + "learning_rate": 0.0001, + "loss": 4.0849, + "loss/crossentropy": 2.28415846824646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20799801498651505, + "step": 17300 + }, + { + "epoch": 0.34604, + "grad_norm": 2.25, + "grad_norm_var": 0.010925038655598959, + "learning_rate": 0.0001, + "loss": 4.3258, + "loss/crossentropy": 1.9420422315597534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019559562206268, + "step": 17302 + }, + { + "epoch": 0.34608, + "grad_norm": 2.234375, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 4.1892, + "loss/crossentropy": 2.100687026977539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129299700260162, + "step": 17304 + }, + { + "epoch": 0.34612, + "grad_norm": 1.921875, + "grad_norm_var": 0.010970052083333333, + "learning_rate": 0.0001, + "loss": 4.0209, + "loss/crossentropy": 1.8657938241958618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081414759159088, + "step": 17306 + }, + { + "epoch": 0.34616, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012853749593098958, + "learning_rate": 0.0001, + "loss": 4.025, + "loss/crossentropy": 2.0629169940948486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117018163204193, + "step": 17308 + }, + { + "epoch": 0.3462, + "grad_norm": 1.96875, + "grad_norm_var": 0.0126129150390625, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 2.2877084016799927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959577053785324, + "step": 17310 + }, + { + "epoch": 0.34624, + "grad_norm": 2.015625, + "grad_norm_var": 0.012280019124348958, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 2.2501026391983032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108026072382927, + "step": 17312 + }, + { + "epoch": 0.34628, + "grad_norm": 2.0, + "grad_norm_var": 0.012011464436848958, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 1.9485750794410706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126113697886467, + "step": 17314 + }, + { + "epoch": 0.34632, + "grad_norm": 1.984375, + "grad_norm_var": 0.011500803629557292, + "learning_rate": 0.0001, + "loss": 4.13, + "loss/crossentropy": 2.193112373352051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19680321961641312, + "step": 17316 + }, + { + "epoch": 0.34636, + "grad_norm": 1.796875, + "grad_norm_var": 0.010084788004557291, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.30054771900177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211720421910286, + "step": 17318 + }, + { + "epoch": 0.3464, + "grad_norm": 2.078125, + "grad_norm_var": 0.026869455973307293, + "learning_rate": 0.0001, + "loss": 4.2737, + "loss/crossentropy": 2.0091487169265747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19040243327617645, + "step": 17320 + }, + { + "epoch": 0.34644, + "grad_norm": 2.0625, + "grad_norm_var": 0.036628977457682295, + "learning_rate": 0.0001, + "loss": 4.2257, + "loss/crossentropy": 2.1841423511505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112671434879303, + "step": 17322 + }, + { + "epoch": 0.34648, + "grad_norm": 2.171875, + "grad_norm_var": 0.03555501302083333, + "learning_rate": 0.0001, + "loss": 3.8837, + "loss/crossentropy": 2.092822253704071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037336230278015, + "step": 17324 + }, + { + "epoch": 0.34652, + "grad_norm": 2.09375, + "grad_norm_var": 0.0335601806640625, + "learning_rate": 0.0001, + "loss": 4.0322, + "loss/crossentropy": 2.280580163002014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753052085638046, + "step": 17326 + }, + { + "epoch": 0.34656, + "grad_norm": 1.7890625, + "grad_norm_var": 0.03884048461914062, + "learning_rate": 0.0001, + "loss": 4.1206, + "loss/crossentropy": 1.60361909866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15820877254009247, + "step": 17328 + }, + { + "epoch": 0.3466, + "grad_norm": 1.84375, + "grad_norm_var": 0.04915949503580729, + "learning_rate": 0.0001, + "loss": 3.6214, + "loss/crossentropy": 1.608814001083374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16808264702558517, + "step": 17330 + }, + { + "epoch": 0.34664, + "grad_norm": 2.609375, + "grad_norm_var": 0.06932373046875, + "learning_rate": 0.0001, + "loss": 4.4983, + "loss/crossentropy": 2.3868669271469116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23717772960662842, + "step": 17332 + }, + { + "epoch": 0.34668, + "grad_norm": 1.8984375, + "grad_norm_var": 0.06720759073893229, + "learning_rate": 0.0001, + "loss": 4.0862, + "loss/crossentropy": 2.1908692717552185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21705647557973862, + "step": 17334 + }, + { + "epoch": 0.34672, + "grad_norm": 2.140625, + "grad_norm_var": 0.05272191365559896, + "learning_rate": 0.0001, + "loss": 4.2436, + "loss/crossentropy": 2.459980010986328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24167515337467194, + "step": 17336 + }, + { + "epoch": 0.34676, + "grad_norm": 1.984375, + "grad_norm_var": 0.04353612263997396, + "learning_rate": 0.0001, + "loss": 4.219, + "loss/crossentropy": 2.221674919128418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2097555324435234, + "step": 17338 + }, + { + "epoch": 0.3468, + "grad_norm": 1.78125, + "grad_norm_var": 0.04521458943684896, + "learning_rate": 0.0001, + "loss": 3.7565, + "loss/crossentropy": 2.2840858697891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2250964418053627, + "step": 17340 + }, + { + "epoch": 0.34684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04426854451497396, + "learning_rate": 0.0001, + "loss": 3.6803, + "loss/crossentropy": 2.0715824365615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149435505270958, + "step": 17342 + }, + { + "epoch": 0.34688, + "grad_norm": 2.03125, + "grad_norm_var": 0.0420806884765625, + "learning_rate": 0.0001, + "loss": 4.1017, + "loss/crossentropy": 2.2389899492263794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22377658635377884, + "step": 17344 + }, + { + "epoch": 0.34692, + "grad_norm": 2.046875, + "grad_norm_var": 0.035278065999348955, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 2.0308732986450195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074751853942871, + "step": 17346 + }, + { + "epoch": 0.34696, + "grad_norm": 1.890625, + "grad_norm_var": 0.010864003499348959, + "learning_rate": 0.0001, + "loss": 3.7901, + "loss/crossentropy": 1.8123770356178284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080865383148193, + "step": 17348 + }, + { + "epoch": 0.347, + "grad_norm": 1.734375, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 3.8772, + "loss/crossentropy": 1.884379267692566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799243837594986, + "step": 17350 + }, + { + "epoch": 0.34704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012442779541015626, + "learning_rate": 0.0001, + "loss": 4.4317, + "loss/crossentropy": 2.237663149833679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197343081235886, + "step": 17352 + }, + { + "epoch": 0.34708, + "grad_norm": 1.828125, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 3.7444, + "loss/crossentropy": 1.9563414454460144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18722663074731827, + "step": 17354 + }, + { + "epoch": 0.34712, + "grad_norm": 2.3125, + "grad_norm_var": 0.020817057291666666, + "learning_rate": 0.0001, + "loss": 4.2799, + "loss/crossentropy": 2.1188591718673706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183937788009644, + "step": 17356 + }, + { + "epoch": 0.34716, + "grad_norm": 2.046875, + "grad_norm_var": 0.0219146728515625, + "learning_rate": 0.0001, + "loss": 4.4551, + "loss/crossentropy": 2.276149272918701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2232571393251419, + "step": 17358 + }, + { + "epoch": 0.3472, + "grad_norm": 2.0, + "grad_norm_var": 0.02123998006184896, + "learning_rate": 0.0001, + "loss": 4.4329, + "loss/crossentropy": 2.360138177871704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22683367878198624, + "step": 17360 + }, + { + "epoch": 0.34724, + "grad_norm": 1.96875, + "grad_norm_var": 0.020576985677083333, + "learning_rate": 0.0001, + "loss": 4.2762, + "loss/crossentropy": 1.99272620677948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18479204177856445, + "step": 17362 + }, + { + "epoch": 0.34728, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019760894775390624, + "learning_rate": 0.0001, + "loss": 4.2303, + "loss/crossentropy": 2.053771197795868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591863989830017, + "step": 17364 + }, + { + "epoch": 0.34732, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016428375244140626, + "learning_rate": 0.0001, + "loss": 3.9703, + "loss/crossentropy": 2.1311055421829224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049081027507782, + "step": 17366 + }, + { + "epoch": 0.34736, + "grad_norm": 1.984375, + "grad_norm_var": 0.01246337890625, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 2.0712148547172546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19809392094612122, + "step": 17368 + }, + { + "epoch": 0.3474, + "grad_norm": 1.953125, + "grad_norm_var": 0.009468587239583333, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 2.144728183746338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21964795887470245, + "step": 17370 + }, + { + "epoch": 0.34744, + "grad_norm": 2.03125, + "grad_norm_var": 0.003856404622395833, + "learning_rate": 0.0001, + "loss": 3.9145, + "loss/crossentropy": 2.024275004863739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19414371997117996, + "step": 17372 + }, + { + "epoch": 0.34748, + "grad_norm": 1.84375, + "grad_norm_var": 0.00400390625, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 2.460733413696289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2185375839471817, + "step": 17374 + }, + { + "epoch": 0.34752, + "grad_norm": 2.28125, + "grad_norm_var": 0.010423787434895833, + "learning_rate": 0.0001, + "loss": 4.308, + "loss/crossentropy": 2.3502203226089478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140796035528183, + "step": 17376 + }, + { + "epoch": 0.34756, + "grad_norm": 1.78125, + "grad_norm_var": 0.012547810872395834, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.2896264791488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2149006575345993, + "step": 17378 + }, + { + "epoch": 0.3476, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013118489583333334, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 2.101209044456482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842310786247253, + "step": 17380 + }, + { + "epoch": 0.34764, + "grad_norm": 2.0, + "grad_norm_var": 0.0189361572265625, + "learning_rate": 0.0001, + "loss": 4.3287, + "loss/crossentropy": 1.8995551466941833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851196512579918, + "step": 17382 + }, + { + "epoch": 0.34768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01887995402018229, + "learning_rate": 0.0001, + "loss": 3.9219, + "loss/crossentropy": 1.850643277168274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16962846368551254, + "step": 17384 + }, + { + "epoch": 0.34772, + "grad_norm": 2.078125, + "grad_norm_var": 0.019528961181640624, + "learning_rate": 0.0001, + "loss": 4.2478, + "loss/crossentropy": 2.1131063103675842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19152897596359253, + "step": 17386 + }, + { + "epoch": 0.34776, + "grad_norm": 1.9921875, + "grad_norm_var": 0.020140584309895834, + "learning_rate": 0.0001, + "loss": 4.1932, + "loss/crossentropy": 2.172673463821411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20645174384117126, + "step": 17388 + }, + { + "epoch": 0.3478, + "grad_norm": 2.09375, + "grad_norm_var": 0.019147745768229165, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 2.431947708129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20513125509023666, + "step": 17390 + }, + { + "epoch": 0.34784, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014461008707682292, + "learning_rate": 0.0001, + "loss": 3.9314, + "loss/crossentropy": 2.062328279018402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929447129368782, + "step": 17392 + }, + { + "epoch": 0.34788, + "grad_norm": 2.015625, + "grad_norm_var": 0.010609690348307292, + "learning_rate": 0.0001, + "loss": 4.2277, + "loss/crossentropy": 2.189133882522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20945163816213608, + "step": 17394 + }, + { + "epoch": 0.34792, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 3.9426, + "loss/crossentropy": 2.070523977279663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401575416326523, + "step": 17396 + }, + { + "epoch": 0.34796, + "grad_norm": 1.9375, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.0668, + "loss/crossentropy": 2.3387876749038696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19960296899080276, + "step": 17398 + }, + { + "epoch": 0.348, + "grad_norm": 2.078125, + "grad_norm_var": 0.00955810546875, + "learning_rate": 0.0001, + "loss": 4.0225, + "loss/crossentropy": 1.9857355952262878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195467978715897, + "step": 17400 + }, + { + "epoch": 0.34804, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009041086832682291, + "learning_rate": 0.0001, + "loss": 4.0715, + "loss/crossentropy": 2.2523770332336426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22015457600355148, + "step": 17402 + }, + { + "epoch": 0.34808, + "grad_norm": 1.875, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 3.9181, + "loss/crossentropy": 1.7088012099266052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1644933521747589, + "step": 17404 + }, + { + "epoch": 0.34812, + "grad_norm": 1.890625, + "grad_norm_var": 0.006772613525390625, + "learning_rate": 0.0001, + "loss": 3.8759, + "loss/crossentropy": 2.022001802921295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19237526506185532, + "step": 17406 + }, + { + "epoch": 0.34816, + "grad_norm": 2.125, + "grad_norm_var": 0.015669504801432293, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.240332841873169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214636355638504, + "step": 17408 + }, + { + "epoch": 0.3482, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015099843343098959, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 2.1392452716827393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769221127033234, + "step": 17410 + }, + { + "epoch": 0.34824, + "grad_norm": 1.890625, + "grad_norm_var": 0.016078440348307292, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 1.9182460308074951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202327162027359, + "step": 17412 + }, + { + "epoch": 0.34828, + "grad_norm": 2.0625, + "grad_norm_var": 0.0153961181640625, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.036426305770874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997400790452957, + "step": 17414 + }, + { + "epoch": 0.34832, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014353179931640625, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 2.0429354906082153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20519915968179703, + "step": 17416 + }, + { + "epoch": 0.34836, + "grad_norm": 2.03125, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 4.2742, + "loss/crossentropy": 2.3943710327148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106815129518509, + "step": 17418 + }, + { + "epoch": 0.3484, + "grad_norm": 2.046875, + "grad_norm_var": 0.0130615234375, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.060115098953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20672088861465454, + "step": 17420 + }, + { + "epoch": 0.34844, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012471516927083334, + "learning_rate": 0.0001, + "loss": 4.2223, + "loss/crossentropy": 2.2820589542388916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23306988179683685, + "step": 17422 + }, + { + "epoch": 0.34848, + "grad_norm": 1.984375, + "grad_norm_var": 0.008788045247395833, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 2.13969624042511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909114927053452, + "step": 17424 + }, + { + "epoch": 0.34852, + "grad_norm": 1.953125, + "grad_norm_var": 0.011214192708333333, + "learning_rate": 0.0001, + "loss": 3.9572, + "loss/crossentropy": 2.0822505950927734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20429539680480957, + "step": 17426 + }, + { + "epoch": 0.34856, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009759267171223959, + "learning_rate": 0.0001, + "loss": 3.9435, + "loss/crossentropy": 2.512156844139099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776874363422394, + "step": 17428 + }, + { + "epoch": 0.3486, + "grad_norm": 2.359375, + "grad_norm_var": 0.018700917561848957, + "learning_rate": 0.0001, + "loss": 4.1145, + "loss/crossentropy": 2.0379759669303894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20926732569932938, + "step": 17430 + }, + { + "epoch": 0.34864, + "grad_norm": 2.046875, + "grad_norm_var": 0.017295074462890626, + "learning_rate": 0.0001, + "loss": 4.1998, + "loss/crossentropy": 2.443945050239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22597475349903107, + "step": 17432 + }, + { + "epoch": 0.34868, + "grad_norm": 1.8203125, + "grad_norm_var": 0.019576009114583334, + "learning_rate": 0.0001, + "loss": 3.8442, + "loss/crossentropy": 1.8295652866363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18034228682518005, + "step": 17434 + }, + { + "epoch": 0.34872, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020428212483723958, + "learning_rate": 0.0001, + "loss": 4.0614, + "loss/crossentropy": 1.9487475156784058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18921684473752975, + "step": 17436 + }, + { + "epoch": 0.34876, + "grad_norm": 2.078125, + "grad_norm_var": 0.018721262613932293, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.099911689758301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079886943101883, + "step": 17438 + }, + { + "epoch": 0.3488, + "grad_norm": 2.015625, + "grad_norm_var": 0.016739654541015624, + "learning_rate": 0.0001, + "loss": 4.1762, + "loss/crossentropy": 2.52177894115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22080882638692856, + "step": 17440 + }, + { + "epoch": 0.34884, + "grad_norm": 2.125, + "grad_norm_var": 0.014900461832682291, + "learning_rate": 0.0001, + "loss": 4.1076, + "loss/crossentropy": 2.222637891769409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22301562875509262, + "step": 17442 + }, + { + "epoch": 0.34888, + "grad_norm": 2.09375, + "grad_norm_var": 0.014725494384765624, + "learning_rate": 0.0001, + "loss": 4.3965, + "loss/crossentropy": 2.2042930126190186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051277905702591, + "step": 17444 + }, + { + "epoch": 0.34892, + "grad_norm": 2.015625, + "grad_norm_var": 0.0065093994140625, + "learning_rate": 0.0001, + "loss": 4.2452, + "loss/crossentropy": 2.0599029064178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20651433616876602, + "step": 17446 + }, + { + "epoch": 0.34896, + "grad_norm": 2.0, + "grad_norm_var": 0.007027180989583334, + "learning_rate": 0.0001, + "loss": 3.9174, + "loss/crossentropy": 1.9319151639938354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18495241552591324, + "step": 17448 + }, + { + "epoch": 0.349, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006197102864583333, + "learning_rate": 0.0001, + "loss": 4.0844, + "loss/crossentropy": 2.341706871986389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20252344757318497, + "step": 17450 + }, + { + "epoch": 0.34904, + "grad_norm": 1.953125, + "grad_norm_var": 0.004833984375, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 2.0161609053611755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176668643951416, + "step": 17452 + }, + { + "epoch": 0.34908, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0062334696451822914, + "learning_rate": 0.0001, + "loss": 3.6046, + "loss/crossentropy": 1.7961083054542542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16384318470954895, + "step": 17454 + }, + { + "epoch": 0.34912, + "grad_norm": 2.265625, + "grad_norm_var": 0.01791966756184896, + "learning_rate": 0.0001, + "loss": 4.6054, + "loss/crossentropy": 2.108873188495636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137713640928268, + "step": 17456 + }, + { + "epoch": 0.34916, + "grad_norm": 1.921875, + "grad_norm_var": 0.017830403645833333, + "learning_rate": 0.0001, + "loss": 3.9543, + "loss/crossentropy": 2.158664584159851, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957562536001205, + "step": 17458 + }, + { + "epoch": 0.3492, + "grad_norm": 1.84375, + "grad_norm_var": 0.018660227457682293, + "learning_rate": 0.0001, + "loss": 3.9454, + "loss/crossentropy": 1.7799381017684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193112812936306, + "step": 17460 + }, + { + "epoch": 0.34924, + "grad_norm": 2.0625, + "grad_norm_var": 0.019128163655598957, + "learning_rate": 0.0001, + "loss": 4.4072, + "loss/crossentropy": 2.372501015663147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2180323675274849, + "step": 17462 + }, + { + "epoch": 0.34928, + "grad_norm": 1.96875, + "grad_norm_var": 0.0185699462890625, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 1.9338968396186829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117196649312973, + "step": 17464 + }, + { + "epoch": 0.34932, + "grad_norm": 1.8125, + "grad_norm_var": 0.020774078369140626, + "learning_rate": 0.0001, + "loss": 3.6429, + "loss/crossentropy": 1.4602742195129395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1677519753575325, + "step": 17466 + }, + { + "epoch": 0.34936, + "grad_norm": 2.0, + "grad_norm_var": 0.02080078125, + "learning_rate": 0.0001, + "loss": 4.2498, + "loss/crossentropy": 2.1464006304740906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201838552951813, + "step": 17468 + }, + { + "epoch": 0.3494, + "grad_norm": 2.109375, + "grad_norm_var": 0.018529256184895832, + "learning_rate": 0.0001, + "loss": 4.1736, + "loss/crossentropy": 2.071315884590149, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19322700798511505, + "step": 17470 + }, + { + "epoch": 0.34944, + "grad_norm": 1.984375, + "grad_norm_var": 0.010309855143229166, + "learning_rate": 0.0001, + "loss": 4.2653, + "loss/crossentropy": 1.9166500568389893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23960981518030167, + "step": 17472 + }, + { + "epoch": 0.34948, + "grad_norm": 2.0625, + "grad_norm_var": 0.011457316080729167, + "learning_rate": 0.0001, + "loss": 3.9433, + "loss/crossentropy": 1.8666390180587769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875123456120491, + "step": 17474 + }, + { + "epoch": 0.34952, + "grad_norm": 2.015625, + "grad_norm_var": 0.009736887613932292, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 1.9197289943695068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21075090020895004, + "step": 17476 + }, + { + "epoch": 0.34956, + "grad_norm": 1.90625, + "grad_norm_var": 0.011018625895182292, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 1.904780387878418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20412997901439667, + "step": 17478 + }, + { + "epoch": 0.3496, + "grad_norm": 1.859375, + "grad_norm_var": 0.012325032552083334, + "learning_rate": 0.0001, + "loss": 3.735, + "loss/crossentropy": 1.6775096654891968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1669272631406784, + "step": 17480 + }, + { + "epoch": 0.34964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0089111328125, + "learning_rate": 0.0001, + "loss": 4.1039, + "loss/crossentropy": 2.050463318824768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382070004940033, + "step": 17482 + }, + { + "epoch": 0.34968, + "grad_norm": 2.015625, + "grad_norm_var": 0.009224192301432291, + "learning_rate": 0.0001, + "loss": 3.8739, + "loss/crossentropy": 1.8124673962593079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674846321344376, + "step": 17484 + }, + { + "epoch": 0.34972, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008969879150390625, + "learning_rate": 0.0001, + "loss": 3.6797, + "loss/crossentropy": 1.6487451791763306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17874807119369507, + "step": 17486 + }, + { + "epoch": 0.34976, + "grad_norm": 2.171875, + "grad_norm_var": 0.009430948893229167, + "learning_rate": 0.0001, + "loss": 4.4025, + "loss/crossentropy": 2.101687431335449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24856076389551163, + "step": 17488 + }, + { + "epoch": 0.3498, + "grad_norm": 2.015625, + "grad_norm_var": 0.009205881754557292, + "learning_rate": 0.0001, + "loss": 4.0236, + "loss/crossentropy": 1.9233632683753967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21046236902475357, + "step": 17490 + }, + { + "epoch": 0.34984, + "grad_norm": 2.09375, + "grad_norm_var": 0.010589345296223959, + "learning_rate": 0.0001, + "loss": 4.0376, + "loss/crossentropy": 2.2882679104804993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568424582481384, + "step": 17492 + }, + { + "epoch": 0.34988, + "grad_norm": 2.0625, + "grad_norm_var": 0.022342681884765625, + "learning_rate": 0.0001, + "loss": 4.4205, + "loss/crossentropy": 2.205874502658844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21892941743135452, + "step": 17494 + }, + { + "epoch": 0.34992, + "grad_norm": 2.234375, + "grad_norm_var": 0.0232177734375, + "learning_rate": 0.0001, + "loss": 4.3628, + "loss/crossentropy": 2.278464913368225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22409500926733017, + "step": 17496 + }, + { + "epoch": 0.34996, + "grad_norm": 1.859375, + "grad_norm_var": 0.026082356770833332, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 2.5003273487091064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22478827834129333, + "step": 17498 + }, + { + "epoch": 0.35, + "grad_norm": 2.0625, + "grad_norm_var": 0.04237848917643229, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.034249722957611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19176610559225082, + "step": 17500 + }, + { + "epoch": 0.35004, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0401031494140625, + "learning_rate": 0.0001, + "loss": 4.1586, + "loss/crossentropy": 2.271065592765808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523310244083405, + "step": 17502 + }, + { + "epoch": 0.35008, + "grad_norm": 1.984375, + "grad_norm_var": 0.03870824178059896, + "learning_rate": 0.0001, + "loss": 4.1198, + "loss/crossentropy": 2.182866334915161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21311558783054352, + "step": 17504 + }, + { + "epoch": 0.35012, + "grad_norm": 2.125, + "grad_norm_var": 0.03878173828125, + "learning_rate": 0.0001, + "loss": 4.1471, + "loss/crossentropy": 1.7134016752243042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16884544491767883, + "step": 17506 + }, + { + "epoch": 0.35016, + "grad_norm": 2.125, + "grad_norm_var": 0.03687718709309896, + "learning_rate": 0.0001, + "loss": 4.2464, + "loss/crossentropy": 2.198198080062866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916792169213295, + "step": 17508 + }, + { + "epoch": 0.3502, + "grad_norm": 2.0625, + "grad_norm_var": 0.0272125244140625, + "learning_rate": 0.0001, + "loss": 4.2651, + "loss/crossentropy": 1.9370547533035278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23538394272327423, + "step": 17510 + }, + { + "epoch": 0.35024, + "grad_norm": 2.078125, + "grad_norm_var": 0.02469456990559896, + "learning_rate": 0.0001, + "loss": 4.0664, + "loss/crossentropy": 2.1743874549865723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22514399886131287, + "step": 17512 + }, + { + "epoch": 0.35028, + "grad_norm": 2.03125, + "grad_norm_var": 0.02174657185872396, + "learning_rate": 0.0001, + "loss": 4.2767, + "loss/crossentropy": 1.8546866178512573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772822067141533, + "step": 17514 + }, + { + "epoch": 0.35032, + "grad_norm": 2.0, + "grad_norm_var": 0.004154205322265625, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 1.8439211249351501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603434205055237, + "step": 17516 + }, + { + "epoch": 0.35036, + "grad_norm": 1.8125, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 3.9506, + "loss/crossentropy": 2.2717851400375366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21598497033119202, + "step": 17518 + }, + { + "epoch": 0.3504, + "grad_norm": 1.921875, + "grad_norm_var": 0.006843058268229166, + "learning_rate": 0.0001, + "loss": 4.1807, + "loss/crossentropy": 2.2645692825317383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229163572192192, + "step": 17520 + }, + { + "epoch": 0.35044, + "grad_norm": 1.828125, + "grad_norm_var": 0.0071604410807291664, + "learning_rate": 0.0001, + "loss": 4.09, + "loss/crossentropy": 2.0507450103759766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19268949329853058, + "step": 17522 + }, + { + "epoch": 0.35048, + "grad_norm": 2.0, + "grad_norm_var": 0.005521647135416667, + "learning_rate": 0.0001, + "loss": 4.1468, + "loss/crossentropy": 2.3831188678741455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21931231766939163, + "step": 17524 + }, + { + "epoch": 0.35052, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006281534830729167, + "learning_rate": 0.0001, + "loss": 4.116, + "loss/crossentropy": 1.8574120998382568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17972451448440552, + "step": 17526 + }, + { + "epoch": 0.35056, + "grad_norm": 1.875, + "grad_norm_var": 0.006013997395833333, + "learning_rate": 0.0001, + "loss": 3.9063, + "loss/crossentropy": 2.141213893890381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798025816679, + "step": 17528 + }, + { + "epoch": 0.3506, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006520334879557292, + "learning_rate": 0.0001, + "loss": 3.9454, + "loss/crossentropy": 2.2224762439727783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031020149588585, + "step": 17530 + }, + { + "epoch": 0.35064, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006476847330729166, + "learning_rate": 0.0001, + "loss": 3.765, + "loss/crossentropy": 2.163232743740082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002054750919342, + "step": 17532 + }, + { + "epoch": 0.35068, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0055653889973958336, + "learning_rate": 0.0001, + "loss": 4.1397, + "loss/crossentropy": 2.1463611721992493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084905624389648, + "step": 17534 + }, + { + "epoch": 0.35072, + "grad_norm": 2.015625, + "grad_norm_var": 0.006566365559895833, + "learning_rate": 0.0001, + "loss": 3.9751, + "loss/crossentropy": 2.1863406896591187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150622010231018, + "step": 17536 + }, + { + "epoch": 0.35076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005744425455729166, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 2.2162610292434692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20196689665317535, + "step": 17538 + }, + { + "epoch": 0.3508, + "grad_norm": 10.6875, + "grad_norm_var": 4.776464589436849, + "learning_rate": 0.0001, + "loss": 4.1004, + "loss/crossentropy": 1.8908233642578125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238417506217957, + "step": 17540 + }, + { + "epoch": 0.35084, + "grad_norm": 2.125, + "grad_norm_var": 4.763142903645833, + "learning_rate": 0.0001, + "loss": 4.416, + "loss/crossentropy": 2.3578076362609863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21626722812652588, + "step": 17542 + }, + { + "epoch": 0.35088, + "grad_norm": 2.234375, + "grad_norm_var": 4.736722819010416, + "learning_rate": 0.0001, + "loss": 4.3243, + "loss/crossentropy": 2.4079915285110474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292388305068016, + "step": 17544 + }, + { + "epoch": 0.35092, + "grad_norm": 2.015625, + "grad_norm_var": 4.717746734619141, + "learning_rate": 0.0001, + "loss": 4.2647, + "loss/crossentropy": 2.1021666526794434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20725515484809875, + "step": 17546 + }, + { + "epoch": 0.35096, + "grad_norm": 1.96875, + "grad_norm_var": 4.707061513264974, + "learning_rate": 0.0001, + "loss": 4.0905, + "loss/crossentropy": 2.039289176464081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614883303642273, + "step": 17548 + }, + { + "epoch": 0.351, + "grad_norm": 2.203125, + "grad_norm_var": 4.68468017578125, + "learning_rate": 0.0001, + "loss": 4.269, + "loss/crossentropy": 1.9731826782226562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2164870798587799, + "step": 17550 + }, + { + "epoch": 0.35104, + "grad_norm": 1.875, + "grad_norm_var": 4.69991455078125, + "learning_rate": 0.0001, + "loss": 4.0785, + "loss/crossentropy": 2.0166266560554504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977868676185608, + "step": 17552 + }, + { + "epoch": 0.35108, + "grad_norm": 2.0, + "grad_norm_var": 4.692333730061849, + "learning_rate": 0.0001, + "loss": 4.2823, + "loss/crossentropy": 2.284690737724304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204381301999092, + "step": 17554 + }, + { + "epoch": 0.35112, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012977854410807291, + "learning_rate": 0.0001, + "loss": 4.2629, + "loss/crossentropy": 2.293992757797241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21718238294124603, + "step": 17556 + }, + { + "epoch": 0.35116, + "grad_norm": 2.140625, + "grad_norm_var": 0.013133748372395834, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 2.252563714981079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23132775723934174, + "step": 17558 + }, + { + "epoch": 0.3512, + "grad_norm": 1.921875, + "grad_norm_var": 0.0097564697265625, + "learning_rate": 0.0001, + "loss": 3.977, + "loss/crossentropy": 1.8454533219337463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084110647439957, + "step": 17560 + }, + { + "epoch": 0.35124, + "grad_norm": 2.0625, + "grad_norm_var": 0.009616851806640625, + "learning_rate": 0.0001, + "loss": 4.1526, + "loss/crossentropy": 2.3160746097564697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168464422225952, + "step": 17562 + }, + { + "epoch": 0.35128, + "grad_norm": 2.03125, + "grad_norm_var": 0.00953369140625, + "learning_rate": 0.0001, + "loss": 4.031, + "loss/crossentropy": 2.0470253229141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728407055139542, + "step": 17564 + }, + { + "epoch": 0.35132, + "grad_norm": 2.0, + "grad_norm_var": 0.0064389546712239586, + "learning_rate": 0.0001, + "loss": 4.404, + "loss/crossentropy": 2.098679304122925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133883535861969, + "step": 17566 + }, + { + "epoch": 0.35136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005615234375, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.975411057472229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19393378496170044, + "step": 17568 + }, + { + "epoch": 0.3514, + "grad_norm": 2.0625, + "grad_norm_var": 0.005606842041015625, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 2.594779372215271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499995082616806, + "step": 17570 + }, + { + "epoch": 0.35144, + "grad_norm": 1.875, + "grad_norm_var": 0.005915323893229167, + "learning_rate": 0.0001, + "loss": 4.1045, + "loss/crossentropy": 2.075734496116638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207140251994133, + "step": 17572 + }, + { + "epoch": 0.35148, + "grad_norm": 1.765625, + "grad_norm_var": 0.0062978108723958336, + "learning_rate": 0.0001, + "loss": 3.869, + "loss/crossentropy": 2.124872624874115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20058108121156693, + "step": 17574 + }, + { + "epoch": 0.35152, + "grad_norm": 2.0625, + "grad_norm_var": 0.012056477864583333, + "learning_rate": 0.0001, + "loss": 4.1552, + "loss/crossentropy": 2.0275574922561646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105395644903183, + "step": 17576 + }, + { + "epoch": 0.35156, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011913045247395834, + "learning_rate": 0.0001, + "loss": 4.248, + "loss/crossentropy": 2.0694713592529297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21342340111732483, + "step": 17578 + }, + { + "epoch": 0.3516, + "grad_norm": 2.0625, + "grad_norm_var": 0.012239583333333333, + "learning_rate": 0.0001, + "loss": 3.9729, + "loss/crossentropy": 1.8013625741004944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766640990972519, + "step": 17580 + }, + { + "epoch": 0.35164, + "grad_norm": 2.015625, + "grad_norm_var": 0.0123443603515625, + "learning_rate": 0.0001, + "loss": 4.0313, + "loss/crossentropy": 2.1990463733673096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608185440301895, + "step": 17582 + }, + { + "epoch": 0.35168, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013361612955729166, + "learning_rate": 0.0001, + "loss": 3.9041, + "loss/crossentropy": 2.0714540481567383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20064443349838257, + "step": 17584 + }, + { + "epoch": 0.35172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015242258707682291, + "learning_rate": 0.0001, + "loss": 3.7431, + "loss/crossentropy": 1.7437097430229187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17001167684793472, + "step": 17586 + }, + { + "epoch": 0.35176, + "grad_norm": 2.265625, + "grad_norm_var": 0.020531972249348957, + "learning_rate": 0.0001, + "loss": 4.1987, + "loss/crossentropy": 2.072261691093445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019883453845978, + "step": 17588 + }, + { + "epoch": 0.3518, + "grad_norm": 2.28125, + "grad_norm_var": 0.021922810872395834, + "learning_rate": 0.0001, + "loss": 4.0826, + "loss/crossentropy": 2.1804715394973755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21589763462543488, + "step": 17590 + }, + { + "epoch": 0.35184, + "grad_norm": 2.0, + "grad_norm_var": 0.01734619140625, + "learning_rate": 0.0001, + "loss": 4.0964, + "loss/crossentropy": 2.0044930577278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204917341470718, + "step": 17592 + }, + { + "epoch": 0.35188, + "grad_norm": 2.015625, + "grad_norm_var": 0.017362467447916665, + "learning_rate": 0.0001, + "loss": 4.1394, + "loss/crossentropy": 1.8961025476455688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1865018978714943, + "step": 17594 + }, + { + "epoch": 0.35192, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016971588134765625, + "learning_rate": 0.0001, + "loss": 3.8503, + "loss/crossentropy": 1.7005563378334045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18951866030693054, + "step": 17596 + }, + { + "epoch": 0.35196, + "grad_norm": 1.953125, + "grad_norm_var": 0.017002105712890625, + "learning_rate": 0.0001, + "loss": 4.0044, + "loss/crossentropy": 2.002982437610626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076285257935524, + "step": 17598 + }, + { + "epoch": 0.352, + "grad_norm": 1.90625, + "grad_norm_var": 0.016950480143229165, + "learning_rate": 0.0001, + "loss": 4.0595, + "loss/crossentropy": 2.0283620357513428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175684988498688, + "step": 17600 + }, + { + "epoch": 0.35204, + "grad_norm": 1.9375, + "grad_norm_var": 0.014289347330729167, + "learning_rate": 0.0001, + "loss": 3.9495, + "loss/crossentropy": 1.855184018611908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19368328154087067, + "step": 17602 + }, + { + "epoch": 0.35208, + "grad_norm": 1.96875, + "grad_norm_var": 0.009089152018229166, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 2.2288308143615723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121831700205803, + "step": 17604 + }, + { + "epoch": 0.35212, + "grad_norm": 1.953125, + "grad_norm_var": 0.002512359619140625, + "learning_rate": 0.0001, + "loss": 4.1398, + "loss/crossentropy": 1.9114368557929993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17937570065259933, + "step": 17606 + }, + { + "epoch": 0.35216, + "grad_norm": 1.921875, + "grad_norm_var": 0.0026751200358072916, + "learning_rate": 0.0001, + "loss": 3.9921, + "loss/crossentropy": 2.238133430480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23264692723751068, + "step": 17608 + }, + { + "epoch": 0.3522, + "grad_norm": 1.9375, + "grad_norm_var": 0.0024169921875, + "learning_rate": 0.0001, + "loss": 3.85, + "loss/crossentropy": 1.7219743728637695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17108283936977386, + "step": 17610 + }, + { + "epoch": 0.35224, + "grad_norm": 1.890625, + "grad_norm_var": 0.005037434895833333, + "learning_rate": 0.0001, + "loss": 3.8406, + "loss/crossentropy": 1.9613978862762451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042389214038849, + "step": 17612 + }, + { + "epoch": 0.35228, + "grad_norm": 1.859375, + "grad_norm_var": 0.005132802327473958, + "learning_rate": 0.0001, + "loss": 3.8567, + "loss/crossentropy": 1.946107029914856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19604258984327316, + "step": 17614 + }, + { + "epoch": 0.35232, + "grad_norm": 1.921875, + "grad_norm_var": 0.005077870686848959, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 2.255744218826294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2184707298874855, + "step": 17616 + }, + { + "epoch": 0.35236, + "grad_norm": 2.0, + "grad_norm_var": 0.0058062235514322914, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.339258551597595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21353042870759964, + "step": 17618 + }, + { + "epoch": 0.3524, + "grad_norm": 1.984375, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 3.9494, + "loss/crossentropy": 1.8912869691848755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134657084941864, + "step": 17620 + }, + { + "epoch": 0.35244, + "grad_norm": 2.03125, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.186914384365082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20765449851751328, + "step": 17622 + }, + { + "epoch": 0.35248, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005773671468098958, + "learning_rate": 0.0001, + "loss": 3.8567, + "loss/crossentropy": 1.6339558959007263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18255474418401718, + "step": 17624 + }, + { + "epoch": 0.35252, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005796051025390625, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 2.030660629272461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943136677145958, + "step": 17626 + }, + { + "epoch": 0.35256, + "grad_norm": 2.046875, + "grad_norm_var": 0.004173787434895834, + "learning_rate": 0.0001, + "loss": 3.9738, + "loss/crossentropy": 1.8040945529937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18390020728111267, + "step": 17628 + }, + { + "epoch": 0.3526, + "grad_norm": 2.15625, + "grad_norm_var": 0.0055539449055989586, + "learning_rate": 0.0001, + "loss": 4.4613, + "loss/crossentropy": 1.9993014335632324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20794235169887543, + "step": 17630 + }, + { + "epoch": 0.35264, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004881795247395833, + "learning_rate": 0.0001, + "loss": 3.8539, + "loss/crossentropy": 2.0751482248306274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20385687798261642, + "step": 17632 + }, + { + "epoch": 0.35268, + "grad_norm": 2.078125, + "grad_norm_var": 0.0052073160807291664, + "learning_rate": 0.0001, + "loss": 4.0059, + "loss/crossentropy": 1.8923512697219849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879557192325592, + "step": 17634 + }, + { + "epoch": 0.35272, + "grad_norm": 1.984375, + "grad_norm_var": 0.004935709635416666, + "learning_rate": 0.0001, + "loss": 4.2176, + "loss/crossentropy": 2.068901300430298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21545641124248505, + "step": 17636 + }, + { + "epoch": 0.35276, + "grad_norm": 2.046875, + "grad_norm_var": 0.005968983968098958, + "learning_rate": 0.0001, + "loss": 3.8627, + "loss/crossentropy": 1.8329379558563232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875568851828575, + "step": 17638 + }, + { + "epoch": 0.3528, + "grad_norm": 1.90625, + "grad_norm_var": 0.007811482747395833, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 2.2368232011795044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22392207384109497, + "step": 17640 + }, + { + "epoch": 0.35284, + "grad_norm": 2.3125, + "grad_norm_var": 0.014972941080729166, + "learning_rate": 0.0001, + "loss": 4.0397, + "loss/crossentropy": 2.2503843307495117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132144331932068, + "step": 17642 + }, + { + "epoch": 0.35288, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01582819620768229, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 2.1919764280319214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197752483189106, + "step": 17644 + }, + { + "epoch": 0.35292, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 3.9122, + "loss/crossentropy": 1.9867302775382996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115078568458557, + "step": 17646 + }, + { + "epoch": 0.35296, + "grad_norm": 1.90625, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 3.8369, + "loss/crossentropy": 1.9015426635742188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17188573628664017, + "step": 17648 + }, + { + "epoch": 0.353, + "grad_norm": 1.875, + "grad_norm_var": 0.019006093343098957, + "learning_rate": 0.0001, + "loss": 3.6023, + "loss/crossentropy": 1.8092533946037292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1764475554227829, + "step": 17650 + }, + { + "epoch": 0.35304, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01936620076497396, + "learning_rate": 0.0001, + "loss": 4.0542, + "loss/crossentropy": 2.0736488103866577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20046666264533997, + "step": 17652 + }, + { + "epoch": 0.35308, + "grad_norm": 2.078125, + "grad_norm_var": 0.019364166259765624, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 2.1787428855895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23303276300430298, + "step": 17654 + }, + { + "epoch": 0.35312, + "grad_norm": 2.0, + "grad_norm_var": 0.017020416259765626, + "learning_rate": 0.0001, + "loss": 4.4889, + "loss/crossentropy": 2.419381618499756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23756036162376404, + "step": 17656 + }, + { + "epoch": 0.35316, + "grad_norm": 2.09375, + "grad_norm_var": 0.012111155192057292, + "learning_rate": 0.0001, + "loss": 4.4202, + "loss/crossentropy": 2.4017220735549927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2373708263039589, + "step": 17658 + }, + { + "epoch": 0.3532, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011919911702473958, + "learning_rate": 0.0001, + "loss": 3.8805, + "loss/crossentropy": 1.7796767354011536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19661501049995422, + "step": 17660 + }, + { + "epoch": 0.35324, + "grad_norm": 2.015625, + "grad_norm_var": 0.013158162434895834, + "learning_rate": 0.0001, + "loss": 4.2215, + "loss/crossentropy": 2.1815608739852905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20645780861377716, + "step": 17662 + }, + { + "epoch": 0.35328, + "grad_norm": 2.03125, + "grad_norm_var": 0.010178375244140624, + "learning_rate": 0.0001, + "loss": 4.2148, + "loss/crossentropy": 2.056411921977997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21581003069877625, + "step": 17664 + }, + { + "epoch": 0.35332, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007824452718098958, + "learning_rate": 0.0001, + "loss": 3.8676, + "loss/crossentropy": 1.8901747465133667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19498124718666077, + "step": 17666 + }, + { + "epoch": 0.35336, + "grad_norm": 2.0, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 2.0241716504096985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19405542314052582, + "step": 17668 + }, + { + "epoch": 0.3534, + "grad_norm": 1.96875, + "grad_norm_var": 0.008241526285807292, + "learning_rate": 0.0001, + "loss": 4.1452, + "loss/crossentropy": 2.016151189804077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856973022222519, + "step": 17670 + }, + { + "epoch": 0.35344, + "grad_norm": 2.078125, + "grad_norm_var": 0.009096018473307292, + "learning_rate": 0.0001, + "loss": 4.3895, + "loss/crossentropy": 2.5066399574279785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23949292302131653, + "step": 17672 + }, + { + "epoch": 0.35348, + "grad_norm": 2.015625, + "grad_norm_var": 0.007236480712890625, + "learning_rate": 0.0001, + "loss": 4.1432, + "loss/crossentropy": 2.1384177207946777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078043431043625, + "step": 17674 + }, + { + "epoch": 0.35352, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0070709228515625, + "learning_rate": 0.0001, + "loss": 4.1732, + "loss/crossentropy": 2.1211976408958435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064737230539322, + "step": 17676 + }, + { + "epoch": 0.35356, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072418212890625, + "learning_rate": 0.0001, + "loss": 4.3103, + "loss/crossentropy": 2.018574059009552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20446718484163284, + "step": 17678 + }, + { + "epoch": 0.3536, + "grad_norm": 2.015625, + "grad_norm_var": 0.008186848958333333, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.10149747133255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974436193704605, + "step": 17680 + }, + { + "epoch": 0.35364, + "grad_norm": 2.0625, + "grad_norm_var": 0.009698232014973959, + "learning_rate": 0.0001, + "loss": 4.3133, + "loss/crossentropy": 2.288322687149048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22535154223442078, + "step": 17682 + }, + { + "epoch": 0.35368, + "grad_norm": 1.921875, + "grad_norm_var": 0.010400136311848959, + "learning_rate": 0.0001, + "loss": 4.3396, + "loss/crossentropy": 2.002479314804077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964795351028442, + "step": 17684 + }, + { + "epoch": 0.35372, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0099029541015625, + "learning_rate": 0.0001, + "loss": 4.0116, + "loss/crossentropy": 2.2548930644989014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19946102052927017, + "step": 17686 + }, + { + "epoch": 0.35376, + "grad_norm": 2.296875, + "grad_norm_var": 0.015006510416666667, + "learning_rate": 0.0001, + "loss": 4.1777, + "loss/crossentropy": 2.3636194467544556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21626830101013184, + "step": 17688 + }, + { + "epoch": 0.3538, + "grad_norm": 2.15625, + "grad_norm_var": 0.016471099853515626, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.2401771545410156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202116459608078, + "step": 17690 + }, + { + "epoch": 0.35384, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01602783203125, + "learning_rate": 0.0001, + "loss": 4.2248, + "loss/crossentropy": 2.192560911178589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20161370187997818, + "step": 17692 + }, + { + "epoch": 0.35388, + "grad_norm": 2.078125, + "grad_norm_var": 0.01495361328125, + "learning_rate": 0.0001, + "loss": 4.175, + "loss/crossentropy": 2.31100332736969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23697054386138916, + "step": 17694 + }, + { + "epoch": 0.35392, + "grad_norm": 2.171875, + "grad_norm_var": 0.014139811197916666, + "learning_rate": 0.0001, + "loss": 4.0894, + "loss/crossentropy": 2.018012821674347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954345926642418, + "step": 17696 + }, + { + "epoch": 0.35396, + "grad_norm": 2.203125, + "grad_norm_var": 0.0168853759765625, + "learning_rate": 0.0001, + "loss": 4.3128, + "loss/crossentropy": 2.0922133326530457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219238504767418, + "step": 17698 + }, + { + "epoch": 0.354, + "grad_norm": 2.03125, + "grad_norm_var": 0.01591796875, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 2.727385640144348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21042727679014206, + "step": 17700 + }, + { + "epoch": 0.35404, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014631144205729167, + "learning_rate": 0.0001, + "loss": 3.9535, + "loss/crossentropy": 1.8388070464134216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18624315410852432, + "step": 17702 + }, + { + "epoch": 0.35408, + "grad_norm": 2.03125, + "grad_norm_var": 0.008719889322916667, + "learning_rate": 0.0001, + "loss": 3.9327, + "loss/crossentropy": 1.879045307636261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827290803194046, + "step": 17704 + }, + { + "epoch": 0.35412, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 2.2604658603668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22230257838964462, + "step": 17706 + }, + { + "epoch": 0.35416, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01329345703125, + "learning_rate": 0.0001, + "loss": 3.7828, + "loss/crossentropy": 2.1200287342071533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20524511486291885, + "step": 17708 + }, + { + "epoch": 0.3542, + "grad_norm": 1.96875, + "grad_norm_var": 0.014135487874348958, + "learning_rate": 0.0001, + "loss": 3.9624, + "loss/crossentropy": 1.933307945728302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991742819547653, + "step": 17710 + }, + { + "epoch": 0.35424, + "grad_norm": 2.015625, + "grad_norm_var": 0.013231404622395833, + "learning_rate": 0.0001, + "loss": 4.0673, + "loss/crossentropy": 1.9442221522331238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861218512058258, + "step": 17712 + }, + { + "epoch": 0.35428, + "grad_norm": 1.90625, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 3.6592, + "loss/crossentropy": 1.7169482111930847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16892607510089874, + "step": 17714 + }, + { + "epoch": 0.35432, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006891886393229167, + "learning_rate": 0.0001, + "loss": 4.3472, + "loss/crossentropy": 2.105340003967285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19026386737823486, + "step": 17716 + }, + { + "epoch": 0.35436, + "grad_norm": 1.90625, + "grad_norm_var": 0.0069976806640625, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.0366504192352295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19709386676549911, + "step": 17718 + }, + { + "epoch": 0.3544, + "grad_norm": 2.078125, + "grad_norm_var": 0.007393391927083334, + "learning_rate": 0.0001, + "loss": 3.9829, + "loss/crossentropy": 1.9578897356987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19842654466629028, + "step": 17720 + }, + { + "epoch": 0.35444, + "grad_norm": 1.890625, + "grad_norm_var": 0.006888834635416666, + "learning_rate": 0.0001, + "loss": 3.8608, + "loss/crossentropy": 2.139336943626404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19530436396598816, + "step": 17722 + }, + { + "epoch": 0.35448, + "grad_norm": 2.140625, + "grad_norm_var": 0.009669748942057292, + "learning_rate": 0.0001, + "loss": 4.3803, + "loss/crossentropy": 2.4077011346817017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209417074918747, + "step": 17724 + }, + { + "epoch": 0.35452, + "grad_norm": 1.953125, + "grad_norm_var": 0.009373982747395834, + "learning_rate": 0.0001, + "loss": 4.2412, + "loss/crossentropy": 2.1217586994171143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052367851138115, + "step": 17726 + }, + { + "epoch": 0.35456, + "grad_norm": 1.90625, + "grad_norm_var": 0.009751129150390624, + "learning_rate": 0.0001, + "loss": 4.1872, + "loss/crossentropy": 2.078941583633423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033359408378601, + "step": 17728 + }, + { + "epoch": 0.3546, + "grad_norm": 2.09375, + "grad_norm_var": 0.0072265625, + "learning_rate": 0.0001, + "loss": 4.3514, + "loss/crossentropy": 2.1763141751289368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22765995562076569, + "step": 17730 + }, + { + "epoch": 0.35464, + "grad_norm": 2.09375, + "grad_norm_var": 0.00784912109375, + "learning_rate": 0.0001, + "loss": 4.1094, + "loss/crossentropy": 1.83991938829422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603162467479706, + "step": 17732 + }, + { + "epoch": 0.35468, + "grad_norm": 2.015625, + "grad_norm_var": 0.009132639567057291, + "learning_rate": 0.0001, + "loss": 3.93, + "loss/crossentropy": 2.021396040916443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19647164642810822, + "step": 17734 + }, + { + "epoch": 0.35472, + "grad_norm": 2.0, + "grad_norm_var": 0.009297434488932292, + "learning_rate": 0.0001, + "loss": 4.2137, + "loss/crossentropy": 2.2339202165603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20721730589866638, + "step": 17736 + }, + { + "epoch": 0.35476, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008017730712890626, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 2.2280211448669434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610353142023087, + "step": 17738 + }, + { + "epoch": 0.3548, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 3.8836, + "loss/crossentropy": 1.8112387657165527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18565281480550766, + "step": 17740 + }, + { + "epoch": 0.35484, + "grad_norm": 2.140625, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 4.5429, + "loss/crossentropy": 2.1125651597976685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1949760466814041, + "step": 17742 + }, + { + "epoch": 0.35488, + "grad_norm": 1.96875, + "grad_norm_var": 0.009163411458333333, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 2.1020091772079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126646414399147, + "step": 17744 + }, + { + "epoch": 0.35492, + "grad_norm": 2.046875, + "grad_norm_var": 0.010553995768229166, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.8875654339790344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913822665810585, + "step": 17746 + }, + { + "epoch": 0.35496, + "grad_norm": 2.140625, + "grad_norm_var": 0.011921946207682292, + "learning_rate": 0.0001, + "loss": 4.2408, + "loss/crossentropy": 2.1043163537979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18798817694187164, + "step": 17748 + }, + { + "epoch": 0.355, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012373606363932291, + "learning_rate": 0.0001, + "loss": 4.0744, + "loss/crossentropy": 2.0751022696495056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19110247492790222, + "step": 17750 + }, + { + "epoch": 0.35504, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011815388997395834, + "learning_rate": 0.0001, + "loss": 4.2077, + "loss/crossentropy": 2.0335286259651184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18653328716754913, + "step": 17752 + }, + { + "epoch": 0.35508, + "grad_norm": 2.140625, + "grad_norm_var": 0.012230428059895833, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.091266691684723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19435560703277588, + "step": 17754 + }, + { + "epoch": 0.35512, + "grad_norm": 2.265625, + "grad_norm_var": 0.015702056884765624, + "learning_rate": 0.0001, + "loss": 4.4468, + "loss/crossentropy": 2.2889565229415894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760451793670654, + "step": 17756 + }, + { + "epoch": 0.35516, + "grad_norm": 2.140625, + "grad_norm_var": 0.0162017822265625, + "learning_rate": 0.0001, + "loss": 4.1216, + "loss/crossentropy": 1.818089485168457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18759652972221375, + "step": 17758 + }, + { + "epoch": 0.3552, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015531158447265625, + "learning_rate": 0.0001, + "loss": 4.0322, + "loss/crossentropy": 1.8579466938972473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20318903028964996, + "step": 17760 + }, + { + "epoch": 0.35524, + "grad_norm": 2.140625, + "grad_norm_var": 0.0152008056640625, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 2.0463815927505493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20810134708881378, + "step": 17762 + }, + { + "epoch": 0.35528, + "grad_norm": 1.859375, + "grad_norm_var": 0.0146392822265625, + "learning_rate": 0.0001, + "loss": 3.9702, + "loss/crossentropy": 1.8263658285140991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1781037524342537, + "step": 17764 + }, + { + "epoch": 0.35532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0123199462890625, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 2.0662325620651245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161395400762558, + "step": 17766 + }, + { + "epoch": 0.35536, + "grad_norm": 1.921875, + "grad_norm_var": 0.013598378499348958, + "learning_rate": 0.0001, + "loss": 4.1028, + "loss/crossentropy": 1.9901453256607056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20523327589035034, + "step": 17768 + }, + { + "epoch": 0.3554, + "grad_norm": 2.03125, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 4.1567, + "loss/crossentropy": 1.9877265095710754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279201865196228, + "step": 17770 + }, + { + "epoch": 0.35544, + "grad_norm": 2.09375, + "grad_norm_var": 0.013695271809895833, + "learning_rate": 0.0001, + "loss": 4.5072, + "loss/crossentropy": 2.1376627683639526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20117300748825073, + "step": 17772 + }, + { + "epoch": 0.35548, + "grad_norm": 1.953125, + "grad_norm_var": 0.015327962239583333, + "learning_rate": 0.0001, + "loss": 3.8024, + "loss/crossentropy": 2.0174089074134827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042396143078804, + "step": 17774 + }, + { + "epoch": 0.35552, + "grad_norm": 1.90625, + "grad_norm_var": 0.019090779622395835, + "learning_rate": 0.0001, + "loss": 3.7872, + "loss/crossentropy": 2.1650888919830322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062518149614334, + "step": 17776 + }, + { + "epoch": 0.35556, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017836252848307293, + "learning_rate": 0.0001, + "loss": 4.275, + "loss/crossentropy": 2.078373670578003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205182746052742, + "step": 17778 + }, + { + "epoch": 0.3556, + "grad_norm": 1.9453125, + "grad_norm_var": 0.017693837483723957, + "learning_rate": 0.0001, + "loss": 3.7109, + "loss/crossentropy": 1.7496679425239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17602672427892685, + "step": 17780 + }, + { + "epoch": 0.35564, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01779963175455729, + "learning_rate": 0.0001, + "loss": 3.8767, + "loss/crossentropy": 2.3132810592651367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808438748121262, + "step": 17782 + }, + { + "epoch": 0.35568, + "grad_norm": 2.046875, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 3.8465, + "loss/crossentropy": 1.9900661706924438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20264087617397308, + "step": 17784 + }, + { + "epoch": 0.35572, + "grad_norm": 2.03125, + "grad_norm_var": 0.01665013631184896, + "learning_rate": 0.0001, + "loss": 4.2735, + "loss/crossentropy": 2.095883369445801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19925827533006668, + "step": 17786 + }, + { + "epoch": 0.35576, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008771769205729167, + "learning_rate": 0.0001, + "loss": 3.9558, + "loss/crossentropy": 1.9932443499565125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864292174577713, + "step": 17788 + }, + { + "epoch": 0.3558, + "grad_norm": 2.0625, + "grad_norm_var": 0.008455149332682292, + "learning_rate": 0.0001, + "loss": 4.0811, + "loss/crossentropy": 1.8468709588050842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18770359456539154, + "step": 17790 + }, + { + "epoch": 0.35584, + "grad_norm": 1.921875, + "grad_norm_var": 0.006502278645833333, + "learning_rate": 0.0001, + "loss": 4.1749, + "loss/crossentropy": 2.293928623199463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280499011278152, + "step": 17792 + }, + { + "epoch": 0.35588, + "grad_norm": 2.0, + "grad_norm_var": 0.00618896484375, + "learning_rate": 0.0001, + "loss": 4.0155, + "loss/crossentropy": 1.8333890438079834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070593804121017, + "step": 17794 + }, + { + "epoch": 0.35592, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0061767578125, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 1.7173805236816406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17676317691802979, + "step": 17796 + }, + { + "epoch": 0.35596, + "grad_norm": 1.953125, + "grad_norm_var": 0.0058977762858072914, + "learning_rate": 0.0001, + "loss": 3.8196, + "loss/crossentropy": 1.8198468685150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20926716923713684, + "step": 17798 + }, + { + "epoch": 0.356, + "grad_norm": 2.015625, + "grad_norm_var": 0.0054840087890625, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 2.354674279689789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23675478994846344, + "step": 17800 + }, + { + "epoch": 0.35604, + "grad_norm": 1.96875, + "grad_norm_var": 0.005610911051432291, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 2.055357277393341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840713798999786, + "step": 17802 + }, + { + "epoch": 0.35608, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007089996337890625, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.3461071252822876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22149299830198288, + "step": 17804 + }, + { + "epoch": 0.35612, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0083892822265625, + "learning_rate": 0.0001, + "loss": 4.165, + "loss/crossentropy": 2.2675124406814575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21928898990154266, + "step": 17806 + }, + { + "epoch": 0.35616, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007346343994140625, + "learning_rate": 0.0001, + "loss": 4.2397, + "loss/crossentropy": 2.239398717880249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473385602235794, + "step": 17808 + }, + { + "epoch": 0.3562, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009262847900390624, + "learning_rate": 0.0001, + "loss": 3.8152, + "loss/crossentropy": 2.022661864757538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910046860575676, + "step": 17810 + }, + { + "epoch": 0.35624, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009236653645833334, + "learning_rate": 0.0001, + "loss": 4.053, + "loss/crossentropy": 1.874055802822113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19950387626886368, + "step": 17812 + }, + { + "epoch": 0.35628, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 1.7601851224899292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18487969785928726, + "step": 17814 + }, + { + "epoch": 0.35632, + "grad_norm": 1.984375, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.0428, + "loss/crossentropy": 1.8841391801834106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906954884529114, + "step": 17816 + }, + { + "epoch": 0.35636, + "grad_norm": 1.7890625, + "grad_norm_var": 0.011092122395833333, + "learning_rate": 0.0001, + "loss": 3.6292, + "loss/crossentropy": 1.7313326597213745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16394374519586563, + "step": 17818 + }, + { + "epoch": 0.3564, + "grad_norm": 1.96875, + "grad_norm_var": 0.0076901753743489586, + "learning_rate": 0.0001, + "loss": 3.7718, + "loss/crossentropy": 2.094825506210327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20041480660438538, + "step": 17820 + }, + { + "epoch": 0.35644, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004854329427083333, + "learning_rate": 0.0001, + "loss": 4.075, + "loss/crossentropy": 2.222295045852661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19912637770175934, + "step": 17822 + }, + { + "epoch": 0.35648, + "grad_norm": 2.015625, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 3.971, + "loss/crossentropy": 2.1671608090400696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070582628250122, + "step": 17824 + }, + { + "epoch": 0.35652, + "grad_norm": 2.03125, + "grad_norm_var": 0.006884511311848958, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.164097785949707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535782679915428, + "step": 17826 + }, + { + "epoch": 0.35656, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008405558268229167, + "learning_rate": 0.0001, + "loss": 4.128, + "loss/crossentropy": 1.738038182258606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17469938844442368, + "step": 17828 + }, + { + "epoch": 0.3566, + "grad_norm": 1.96875, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.3157, + "loss/crossentropy": 2.3383474349975586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087705433368683, + "step": 17830 + }, + { + "epoch": 0.35664, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012870279947916667, + "learning_rate": 0.0001, + "loss": 3.854, + "loss/crossentropy": 2.0826202034950256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418098032474518, + "step": 17832 + }, + { + "epoch": 0.35668, + "grad_norm": 2.109375, + "grad_norm_var": 0.011287434895833334, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 1.943938970565796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20667240023612976, + "step": 17834 + }, + { + "epoch": 0.35672, + "grad_norm": 1.9375, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 1.9756113290786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973763257265091, + "step": 17836 + }, + { + "epoch": 0.35676, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012898763020833334, + "learning_rate": 0.0001, + "loss": 3.6895, + "loss/crossentropy": 1.8583598732948303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17464379966259003, + "step": 17838 + }, + { + "epoch": 0.3568, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011791737874348958, + "learning_rate": 0.0001, + "loss": 3.9421, + "loss/crossentropy": 1.9786911606788635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20051777362823486, + "step": 17840 + }, + { + "epoch": 0.35684, + "grad_norm": 1.984375, + "grad_norm_var": 0.010414377848307291, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 2.1271785497665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21754157543182373, + "step": 17842 + }, + { + "epoch": 0.35688, + "grad_norm": 1.828125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.264941096305847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562931895256042, + "step": 17844 + }, + { + "epoch": 0.35692, + "grad_norm": 2.078125, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.3865, + "loss/crossentropy": 2.141028881072998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258531093597412, + "step": 17846 + }, + { + "epoch": 0.35696, + "grad_norm": 2.203125, + "grad_norm_var": 0.009781901041666667, + "learning_rate": 0.0001, + "loss": 4.1484, + "loss/crossentropy": 1.9527946710586548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24948758631944656, + "step": 17848 + }, + { + "epoch": 0.357, + "grad_norm": 2.03125, + "grad_norm_var": 0.008854166666666666, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 1.9595746397972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19834068417549133, + "step": 17850 + }, + { + "epoch": 0.35704, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010396321614583334, + "learning_rate": 0.0001, + "loss": 4.1407, + "loss/crossentropy": 1.8414466977119446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19133952260017395, + "step": 17852 + }, + { + "epoch": 0.35708, + "grad_norm": 1.984375, + "grad_norm_var": 0.010326894124348958, + "learning_rate": 0.0001, + "loss": 3.6912, + "loss/crossentropy": 1.6589071154594421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17579880356788635, + "step": 17854 + }, + { + "epoch": 0.35712, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011213175455729167, + "learning_rate": 0.0001, + "loss": 3.8164, + "loss/crossentropy": 1.8462252020835876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19051598757505417, + "step": 17856 + }, + { + "epoch": 0.35716, + "grad_norm": 2.109375, + "grad_norm_var": 0.015083567301432291, + "learning_rate": 0.0001, + "loss": 3.9453, + "loss/crossentropy": 2.0473897457122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250004321336746, + "step": 17858 + }, + { + "epoch": 0.3572, + "grad_norm": 2.078125, + "grad_norm_var": 0.014664459228515624, + "learning_rate": 0.0001, + "loss": 4.1836, + "loss/crossentropy": 2.220176875591278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177650034427643, + "step": 17860 + }, + { + "epoch": 0.35724, + "grad_norm": 1.9140625, + "grad_norm_var": 0.13489176432291666, + "learning_rate": 0.0001, + "loss": 3.5816, + "loss/crossentropy": 1.6774207949638367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16900332272052765, + "step": 17862 + }, + { + "epoch": 0.35728, + "grad_norm": 2.0, + "grad_norm_var": 0.13516616821289062, + "learning_rate": 0.0001, + "loss": 3.8075, + "loss/crossentropy": 1.9064326286315918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925061270594597, + "step": 17864 + }, + { + "epoch": 0.35732, + "grad_norm": 2.15625, + "grad_norm_var": 0.1366607666015625, + "learning_rate": 0.0001, + "loss": 4.1698, + "loss/crossentropy": 2.188231110572815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198386162519455, + "step": 17866 + }, + { + "epoch": 0.35736, + "grad_norm": 2.140625, + "grad_norm_var": 0.13647359212239582, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 1.9380639791488647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545384496450424, + "step": 17868 + }, + { + "epoch": 0.3574, + "grad_norm": 2.015625, + "grad_norm_var": 0.13407389322916666, + "learning_rate": 0.0001, + "loss": 4.1965, + "loss/crossentropy": 1.995704710483551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666064530611038, + "step": 17870 + }, + { + "epoch": 0.35744, + "grad_norm": 1.96875, + "grad_norm_var": 0.13017349243164061, + "learning_rate": 0.0001, + "loss": 4.0191, + "loss/crossentropy": 2.2148354053497314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23265502601861954, + "step": 17872 + }, + { + "epoch": 0.35748, + "grad_norm": 2.03125, + "grad_norm_var": 0.1243072509765625, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 1.7091269493103027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21242853999137878, + "step": 17874 + }, + { + "epoch": 0.35752, + "grad_norm": 2.109375, + "grad_norm_var": 0.12219950358072916, + "learning_rate": 0.0001, + "loss": 4.3081, + "loss/crossentropy": 2.2395507097244263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21505478769540787, + "step": 17876 + }, + { + "epoch": 0.35756, + "grad_norm": 1.953125, + "grad_norm_var": 0.008512369791666667, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 2.014355480670929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20956294238567352, + "step": 17878 + }, + { + "epoch": 0.3576, + "grad_norm": 2.296875, + "grad_norm_var": 0.012383778889973959, + "learning_rate": 0.0001, + "loss": 4.3642, + "loss/crossentropy": 2.3606066703796387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22014011442661285, + "step": 17880 + }, + { + "epoch": 0.35764, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011368560791015624, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 1.6152977347373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1630660966038704, + "step": 17882 + }, + { + "epoch": 0.35768, + "grad_norm": 1.953125, + "grad_norm_var": 0.011138661702473959, + "learning_rate": 0.0001, + "loss": 4.156, + "loss/crossentropy": 2.1977567076683044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369882881641388, + "step": 17884 + }, + { + "epoch": 0.35772, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012359364827473959, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 1.9582098126411438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284517109394073, + "step": 17886 + }, + { + "epoch": 0.35776, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013818105061848959, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 2.1300426721572876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376185536384583, + "step": 17888 + }, + { + "epoch": 0.3578, + "grad_norm": 2.03125, + "grad_norm_var": 0.01883519490559896, + "learning_rate": 0.0001, + "loss": 4.3648, + "loss/crossentropy": 2.057590961456299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069013327360153, + "step": 17890 + }, + { + "epoch": 0.35784, + "grad_norm": 2.046875, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 4.0006, + "loss/crossentropy": 2.21635901927948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19591110199689865, + "step": 17892 + }, + { + "epoch": 0.35788, + "grad_norm": 2.109375, + "grad_norm_var": 0.019809722900390625, + "learning_rate": 0.0001, + "loss": 4.4868, + "loss/crossentropy": 2.6530216932296753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24127788096666336, + "step": 17894 + }, + { + "epoch": 0.35792, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013256581624348958, + "learning_rate": 0.0001, + "loss": 4.0727, + "loss/crossentropy": 1.9931264519691467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20592481642961502, + "step": 17896 + }, + { + "epoch": 0.35796, + "grad_norm": 2.0, + "grad_norm_var": 0.013305409749348959, + "learning_rate": 0.0001, + "loss": 4.1574, + "loss/crossentropy": 2.178193688392639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2204412743449211, + "step": 17898 + }, + { + "epoch": 0.358, + "grad_norm": 1.96875, + "grad_norm_var": 0.012878163655598959, + "learning_rate": 0.0001, + "loss": 4.1748, + "loss/crossentropy": 2.192083954811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22313552349805832, + "step": 17900 + }, + { + "epoch": 0.35804, + "grad_norm": 1.9375, + "grad_norm_var": 0.011281077067057292, + "learning_rate": 0.0001, + "loss": 4.1695, + "loss/crossentropy": 2.228816568851471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2120378389954567, + "step": 17902 + }, + { + "epoch": 0.35808, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010871378580729167, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.0921813249588013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19552303105592728, + "step": 17904 + }, + { + "epoch": 0.35812, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004687245686848958, + "learning_rate": 0.0001, + "loss": 4.2206, + "loss/crossentropy": 2.0568217635154724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668184220790863, + "step": 17906 + }, + { + "epoch": 0.35816, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0038937886555989584, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 1.8241485357284546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15895915031433105, + "step": 17908 + }, + { + "epoch": 0.3582, + "grad_norm": 1.859375, + "grad_norm_var": 0.003082021077473958, + "learning_rate": 0.0001, + "loss": 3.9904, + "loss/crossentropy": 2.2571341395378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065516859292984, + "step": 17910 + }, + { + "epoch": 0.35824, + "grad_norm": 1.890625, + "grad_norm_var": 0.0032297770182291665, + "learning_rate": 0.0001, + "loss": 3.9289, + "loss/crossentropy": 2.0472288727760315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21586833149194717, + "step": 17912 + }, + { + "epoch": 0.35828, + "grad_norm": 1.984375, + "grad_norm_var": 0.0035947163899739585, + "learning_rate": 0.0001, + "loss": 4.0652, + "loss/crossentropy": 2.0345569252967834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19726714491844177, + "step": 17914 + }, + { + "epoch": 0.35832, + "grad_norm": 1.984375, + "grad_norm_var": 0.0037127176920572916, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.15024471282959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401738584041595, + "step": 17916 + }, + { + "epoch": 0.35836, + "grad_norm": 2.046875, + "grad_norm_var": 0.004874420166015625, + "learning_rate": 0.0001, + "loss": 4.3122, + "loss/crossentropy": 2.110986351966858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2299470156431198, + "step": 17918 + }, + { + "epoch": 0.3584, + "grad_norm": 2.03125, + "grad_norm_var": 0.003897857666015625, + "learning_rate": 0.0001, + "loss": 4.2147, + "loss/crossentropy": 1.7297720909118652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18035240471363068, + "step": 17920 + }, + { + "epoch": 0.35844, + "grad_norm": 2.3125, + "grad_norm_var": 0.01236572265625, + "learning_rate": 0.0001, + "loss": 4.3997, + "loss/crossentropy": 1.9483368396759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060721442103386, + "step": 17922 + }, + { + "epoch": 0.35848, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0123443603515625, + "learning_rate": 0.0001, + "loss": 4.1527, + "loss/crossentropy": 1.9840999841690063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20700833201408386, + "step": 17924 + }, + { + "epoch": 0.35852, + "grad_norm": 2.0625, + "grad_norm_var": 0.011525217692057292, + "learning_rate": 0.0001, + "loss": 4.1754, + "loss/crossentropy": 2.4118131399154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20695140957832336, + "step": 17926 + }, + { + "epoch": 0.35856, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011195627848307292, + "learning_rate": 0.0001, + "loss": 3.9363, + "loss/crossentropy": 1.9709432721138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18202026933431625, + "step": 17928 + }, + { + "epoch": 0.3586, + "grad_norm": 2.078125, + "grad_norm_var": 0.010206858317057291, + "learning_rate": 0.0001, + "loss": 4.2972, + "loss/crossentropy": 1.9450251460075378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21454186737537384, + "step": 17930 + }, + { + "epoch": 0.35864, + "grad_norm": 2.03125, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.3317, + "loss/crossentropy": 2.0738271474838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21376954019069672, + "step": 17932 + }, + { + "epoch": 0.35868, + "grad_norm": 2.03125, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.1613, + "loss/crossentropy": 2.1857110261917114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238282486796379, + "step": 17934 + }, + { + "epoch": 0.35872, + "grad_norm": 1.96875, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 4.2208, + "loss/crossentropy": 2.2790093421936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987098902463913, + "step": 17936 + }, + { + "epoch": 0.35876, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0054280598958333336, + "learning_rate": 0.0001, + "loss": 3.9261, + "loss/crossentropy": 2.1101399064064026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034989446401596, + "step": 17938 + }, + { + "epoch": 0.3588, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005793253580729167, + "learning_rate": 0.0001, + "loss": 3.7024, + "loss/crossentropy": 1.6923771500587463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18680892139673233, + "step": 17940 + }, + { + "epoch": 0.35884, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005342356363932292, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 1.6772454977035522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17377988994121552, + "step": 17942 + }, + { + "epoch": 0.35888, + "grad_norm": 1.921875, + "grad_norm_var": 0.007722981770833333, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 2.2448233366012573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21804184466600418, + "step": 17944 + }, + { + "epoch": 0.35892, + "grad_norm": 1.953125, + "grad_norm_var": 0.006688435872395833, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.934161365032196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834414005279541, + "step": 17946 + }, + { + "epoch": 0.35896, + "grad_norm": 2.0, + "grad_norm_var": 0.007344563802083333, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 2.06991970539093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084094047546387, + "step": 17948 + }, + { + "epoch": 0.359, + "grad_norm": 1.984375, + "grad_norm_var": 0.007916005452473958, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 2.0942054986953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21652275323867798, + "step": 17950 + }, + { + "epoch": 0.35904, + "grad_norm": 2.078125, + "grad_norm_var": 0.009134928385416666, + "learning_rate": 0.0001, + "loss": 4.1125, + "loss/crossentropy": 2.0005252361297607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198953777551651, + "step": 17952 + }, + { + "epoch": 0.35908, + "grad_norm": 2.609375, + "grad_norm_var": 0.0327301025390625, + "learning_rate": 0.0001, + "loss": 4.1639, + "loss/crossentropy": 2.1563133597373962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072513848543167, + "step": 17954 + }, + { + "epoch": 0.35912, + "grad_norm": 2.109375, + "grad_norm_var": 0.0312896728515625, + "learning_rate": 0.0001, + "loss": 4.1789, + "loss/crossentropy": 2.3533977270126343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21483703702688217, + "step": 17956 + }, + { + "epoch": 0.35916, + "grad_norm": 1.9375, + "grad_norm_var": 0.029581705729166668, + "learning_rate": 0.0001, + "loss": 4.0921, + "loss/crossentropy": 2.1915602684020996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2241670861840248, + "step": 17958 + }, + { + "epoch": 0.3592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0314117431640625, + "learning_rate": 0.0001, + "loss": 3.9225, + "loss/crossentropy": 1.9529941082000732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920214593410492, + "step": 17960 + }, + { + "epoch": 0.35924, + "grad_norm": 2.015625, + "grad_norm_var": 0.030350748697916666, + "learning_rate": 0.0001, + "loss": 4.4467, + "loss/crossentropy": 2.308731436729431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2238362655043602, + "step": 17962 + }, + { + "epoch": 0.35928, + "grad_norm": 2.03125, + "grad_norm_var": 0.04334208170572917, + "learning_rate": 0.0001, + "loss": 4.1682, + "loss/crossentropy": 2.0917986631393433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061680108308792, + "step": 17964 + }, + { + "epoch": 0.35932, + "grad_norm": 2.265625, + "grad_norm_var": 10.265104166666667, + "learning_rate": 0.0001, + "loss": 4.9501, + "loss/crossentropy": 2.16925585269928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20239810645580292, + "step": 17966 + }, + { + "epoch": 0.35936, + "grad_norm": 2.0, + "grad_norm_var": 10.219252268473307, + "learning_rate": 0.0001, + "loss": 4.287, + "loss/crossentropy": 2.1585946083068848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21381714940071106, + "step": 17968 + }, + { + "epoch": 0.3594, + "grad_norm": 1.8359375, + "grad_norm_var": 10.281192016601562, + "learning_rate": 0.0001, + "loss": 4.0133, + "loss/crossentropy": 1.8752552270889282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19667387753725052, + "step": 17970 + }, + { + "epoch": 0.35944, + "grad_norm": 1.9140625, + "grad_norm_var": 10.291275024414062, + "learning_rate": 0.0001, + "loss": 3.9797, + "loss/crossentropy": 1.8533543944358826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1844579428434372, + "step": 17972 + }, + { + "epoch": 0.35948, + "grad_norm": 2.234375, + "grad_norm_var": 10.282754516601562, + "learning_rate": 0.0001, + "loss": 4.5316, + "loss/crossentropy": 2.5859906673431396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23142188042402267, + "step": 17974 + }, + { + "epoch": 0.35952, + "grad_norm": 2.15625, + "grad_norm_var": 10.236201985677083, + "learning_rate": 0.0001, + "loss": 4.2607, + "loss/crossentropy": 2.2086023092269897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21764708310365677, + "step": 17976 + }, + { + "epoch": 0.35956, + "grad_norm": 1.953125, + "grad_norm_var": 10.258858235677083, + "learning_rate": 0.0001, + "loss": 4.2578, + "loss/crossentropy": 2.3860682249069214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804389894008636, + "step": 17978 + }, + { + "epoch": 0.3596, + "grad_norm": 2.0, + "grad_norm_var": 10.314815266927083, + "learning_rate": 0.0001, + "loss": 4.2055, + "loss/crossentropy": 2.1412216424942017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200544573366642, + "step": 17980 + }, + { + "epoch": 0.35964, + "grad_norm": 1.9140625, + "grad_norm_var": 0.022395833333333334, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.198704957962036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043847218155861, + "step": 17982 + }, + { + "epoch": 0.35968, + "grad_norm": 1.90625, + "grad_norm_var": 0.010249837239583334, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 2.0053776502609253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072744369506836, + "step": 17984 + }, + { + "epoch": 0.35972, + "grad_norm": 2.0625, + "grad_norm_var": 0.04221979777018229, + "learning_rate": 0.0001, + "loss": 4.127, + "loss/crossentropy": 2.5015710592269897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23083895444869995, + "step": 17986 + }, + { + "epoch": 0.35976, + "grad_norm": 2.015625, + "grad_norm_var": 0.04145889282226563, + "learning_rate": 0.0001, + "loss": 4.0531, + "loss/crossentropy": 2.037220776081085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19487662613391876, + "step": 17988 + }, + { + "epoch": 0.3598, + "grad_norm": 1.921875, + "grad_norm_var": 0.03911107381184896, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 1.9316660165786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183449983596802, + "step": 17990 + }, + { + "epoch": 0.35984, + "grad_norm": 1.828125, + "grad_norm_var": 0.04090067545572917, + "learning_rate": 0.0001, + "loss": 3.804, + "loss/crossentropy": 1.892772138118744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17082024365663528, + "step": 17992 + }, + { + "epoch": 0.35988, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04088109334309896, + "learning_rate": 0.0001, + "loss": 4.0732, + "loss/crossentropy": 1.9325169324874878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900368332862854, + "step": 17994 + }, + { + "epoch": 0.35992, + "grad_norm": 2.421875, + "grad_norm_var": 0.050388336181640625, + "learning_rate": 0.0001, + "loss": 4.1392, + "loss/crossentropy": 1.9746126532554626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21207977831363678, + "step": 17996 + }, + { + "epoch": 0.35996, + "grad_norm": 2.03125, + "grad_norm_var": 0.048620351155598956, + "learning_rate": 0.0001, + "loss": 4.0308, + "loss/crossentropy": 2.1249493956565857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18674498051404953, + "step": 17998 + }, + { + "epoch": 0.36, + "grad_norm": 2.046875, + "grad_norm_var": 0.04625422159830729, + "learning_rate": 0.0001, + "loss": 4.0854, + "loss/crossentropy": 2.1279499530792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214716374874115, + "step": 18000 + }, + { + "epoch": 0.36004, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0177398681640625, + "learning_rate": 0.0001, + "loss": 3.9238, + "loss/crossentropy": 2.301763415336609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20897136628627777, + "step": 18002 + }, + { + "epoch": 0.36008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01788330078125, + "learning_rate": 0.0001, + "loss": 4.0944, + "loss/crossentropy": 1.8079062104225159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506293535232544, + "step": 18004 + }, + { + "epoch": 0.36012, + "grad_norm": 1.890625, + "grad_norm_var": 0.0205078125, + "learning_rate": 0.0001, + "loss": 4.0503, + "loss/crossentropy": 1.9429230093955994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19762147217988968, + "step": 18006 + }, + { + "epoch": 0.36016, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019608561197916666, + "learning_rate": 0.0001, + "loss": 3.9801, + "loss/crossentropy": 2.307368576526642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21261631697416306, + "step": 18008 + }, + { + "epoch": 0.3602, + "grad_norm": 1.8203125, + "grad_norm_var": 0.021952311197916668, + "learning_rate": 0.0001, + "loss": 3.8723, + "loss/crossentropy": 2.2834020853042603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20086131989955902, + "step": 18010 + }, + { + "epoch": 0.36024, + "grad_norm": 1.875, + "grad_norm_var": 0.0117340087890625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.1730109453201294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20470792055130005, + "step": 18012 + }, + { + "epoch": 0.36028, + "grad_norm": 2.109375, + "grad_norm_var": 0.012809244791666667, + "learning_rate": 0.0001, + "loss": 4.4047, + "loss/crossentropy": 2.491591691970825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2361597716808319, + "step": 18014 + }, + { + "epoch": 0.36032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013240305582682292, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.024519979953766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042919620871544, + "step": 18016 + }, + { + "epoch": 0.36036, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016123199462890626, + "learning_rate": 0.0001, + "loss": 3.9521, + "loss/crossentropy": 2.055600941181183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17253537476062775, + "step": 18018 + }, + { + "epoch": 0.3604, + "grad_norm": 2.046875, + "grad_norm_var": 0.0188629150390625, + "learning_rate": 0.0001, + "loss": 3.9665, + "loss/crossentropy": 1.8796368837356567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168731898069382, + "step": 18020 + }, + { + "epoch": 0.36044, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014825185139973959, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 1.9885223507881165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20676057040691376, + "step": 18022 + }, + { + "epoch": 0.36048, + "grad_norm": 1.90625, + "grad_norm_var": 0.010839589436848958, + "learning_rate": 0.0001, + "loss": 3.8401, + "loss/crossentropy": 2.0412577986717224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18211868405342102, + "step": 18024 + }, + { + "epoch": 0.36052, + "grad_norm": 1.828125, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 3.8696, + "loss/crossentropy": 2.1391053199768066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20121745765209198, + "step": 18026 + }, + { + "epoch": 0.36056, + "grad_norm": 2.046875, + "grad_norm_var": 0.010087076822916667, + "learning_rate": 0.0001, + "loss": 3.9683, + "loss/crossentropy": 1.8888981938362122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19372491538524628, + "step": 18028 + }, + { + "epoch": 0.3606, + "grad_norm": 2.015625, + "grad_norm_var": 0.007087198893229166, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 2.002028524875641, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18978165835142136, + "step": 18030 + }, + { + "epoch": 0.36064, + "grad_norm": 1.828125, + "grad_norm_var": 0.007828776041666667, + "learning_rate": 0.0001, + "loss": 4.0703, + "loss/crossentropy": 2.3448036909103394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21162079274654388, + "step": 18032 + }, + { + "epoch": 0.36068, + "grad_norm": 2.5625, + "grad_norm_var": 0.03154474894205729, + "learning_rate": 0.0001, + "loss": 4.283, + "loss/crossentropy": 2.086554765701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25456031411886215, + "step": 18034 + }, + { + "epoch": 0.36072, + "grad_norm": 1.8671875, + "grad_norm_var": 0.02948582967122396, + "learning_rate": 0.0001, + "loss": 3.8122, + "loss/crossentropy": 1.9166680574417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18382181227207184, + "step": 18036 + }, + { + "epoch": 0.36076, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029642740885416668, + "learning_rate": 0.0001, + "loss": 3.9937, + "loss/crossentropy": 2.050579786300659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126450806856155, + "step": 18038 + }, + { + "epoch": 0.3608, + "grad_norm": 1.9453125, + "grad_norm_var": 0.030049641927083332, + "learning_rate": 0.0001, + "loss": 3.9273, + "loss/crossentropy": 2.2339882850646973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20789660513401031, + "step": 18040 + }, + { + "epoch": 0.36084, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029243977864583333, + "learning_rate": 0.0001, + "loss": 4.0086, + "loss/crossentropy": 2.248544931411743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20925041288137436, + "step": 18042 + }, + { + "epoch": 0.36088, + "grad_norm": 2.046875, + "grad_norm_var": 0.028955078125, + "learning_rate": 0.0001, + "loss": 4.0448, + "loss/crossentropy": 1.8894963264465332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18232882767915726, + "step": 18044 + }, + { + "epoch": 0.36092, + "grad_norm": 2.046875, + "grad_norm_var": 0.0289703369140625, + "learning_rate": 0.0001, + "loss": 4.1838, + "loss/crossentropy": 2.0220844745635986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18812591582536697, + "step": 18046 + }, + { + "epoch": 0.36096, + "grad_norm": 1.9921875, + "grad_norm_var": 0.027913411458333332, + "learning_rate": 0.0001, + "loss": 4.2088, + "loss/crossentropy": 2.1896166801452637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205679252743721, + "step": 18048 + }, + { + "epoch": 0.361, + "grad_norm": 2.046875, + "grad_norm_var": 0.004686482747395833, + "learning_rate": 0.0001, + "loss": 4.0511, + "loss/crossentropy": 2.0670089721679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18881987780332565, + "step": 18050 + }, + { + "epoch": 0.36104, + "grad_norm": 2.046875, + "grad_norm_var": 0.005500284830729166, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.155470609664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147108018398285, + "step": 18052 + }, + { + "epoch": 0.36108, + "grad_norm": 2.15625, + "grad_norm_var": 0.0076812744140625, + "learning_rate": 0.0001, + "loss": 4.2007, + "loss/crossentropy": 1.8182223439216614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18691811710596085, + "step": 18054 + }, + { + "epoch": 0.36112, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 3.8601, + "loss/crossentropy": 1.7366089820861816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18361609429121017, + "step": 18056 + }, + { + "epoch": 0.36116, + "grad_norm": 1.96875, + "grad_norm_var": 0.005704498291015625, + "learning_rate": 0.0001, + "loss": 4.027, + "loss/crossentropy": 1.9429153203964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974082887172699, + "step": 18058 + }, + { + "epoch": 0.3612, + "grad_norm": 1.9453125, + "grad_norm_var": 0.00562744140625, + "learning_rate": 0.0001, + "loss": 4.2751, + "loss/crossentropy": 2.230627417564392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22216492146253586, + "step": 18060 + }, + { + "epoch": 0.36124, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006371053059895834, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.0506786704063416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20492341369390488, + "step": 18062 + }, + { + "epoch": 0.36128, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007116444905598958, + "learning_rate": 0.0001, + "loss": 3.778, + "loss/crossentropy": 2.025477647781372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20034795254468918, + "step": 18064 + }, + { + "epoch": 0.36132, + "grad_norm": 2.046875, + "grad_norm_var": 0.006617991129557291, + "learning_rate": 0.0001, + "loss": 4.2998, + "loss/crossentropy": 1.9828922748565674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19445167481899261, + "step": 18066 + }, + { + "epoch": 0.36136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006485748291015625, + "learning_rate": 0.0001, + "loss": 4.2566, + "loss/crossentropy": 2.2869513630867004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809721738100052, + "step": 18068 + }, + { + "epoch": 0.3614, + "grad_norm": 1.875, + "grad_norm_var": 0.004129791259765625, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.1584482192993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21123671531677246, + "step": 18070 + }, + { + "epoch": 0.36144, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0046078999837239586, + "learning_rate": 0.0001, + "loss": 3.9628, + "loss/crossentropy": 2.112669587135315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20795845240354538, + "step": 18072 + }, + { + "epoch": 0.36148, + "grad_norm": 2.09375, + "grad_norm_var": 0.0059234619140625, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.3283581733703613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22724319994449615, + "step": 18074 + }, + { + "epoch": 0.36152, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 3.8966, + "loss/crossentropy": 2.286331057548523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20937074720859528, + "step": 18076 + }, + { + "epoch": 0.36156, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009110260009765624, + "learning_rate": 0.0001, + "loss": 4.1124, + "loss/crossentropy": 2.1055954694747925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924988478422165, + "step": 18078 + }, + { + "epoch": 0.3616, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008186594645182291, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.180675983428955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21886169910430908, + "step": 18080 + }, + { + "epoch": 0.36164, + "grad_norm": 2.078125, + "grad_norm_var": 0.008621978759765624, + "learning_rate": 0.0001, + "loss": 3.9958, + "loss/crossentropy": 1.7595775127410889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17430071532726288, + "step": 18082 + }, + { + "epoch": 0.36168, + "grad_norm": 1.921875, + "grad_norm_var": 0.011472320556640625, + "learning_rate": 0.0001, + "loss": 4.0306, + "loss/crossentropy": 2.1191208958625793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19574576616287231, + "step": 18084 + }, + { + "epoch": 0.36172, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0110595703125, + "learning_rate": 0.0001, + "loss": 4.1563, + "loss/crossentropy": 2.0964609384536743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19742169976234436, + "step": 18086 + }, + { + "epoch": 0.36176, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0108154296875, + "learning_rate": 0.0001, + "loss": 4.3584, + "loss/crossentropy": 2.3081077337265015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2177809551358223, + "step": 18088 + }, + { + "epoch": 0.3618, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009928385416666666, + "learning_rate": 0.0001, + "loss": 4.0493, + "loss/crossentropy": 2.340154528617859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23811470717191696, + "step": 18090 + }, + { + "epoch": 0.36184, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009777577718098958, + "learning_rate": 0.0001, + "loss": 3.828, + "loss/crossentropy": 1.9872968196868896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19391798973083496, + "step": 18092 + }, + { + "epoch": 0.36188, + "grad_norm": 1.984375, + "grad_norm_var": 0.0067779541015625, + "learning_rate": 0.0001, + "loss": 3.9404, + "loss/crossentropy": 1.9162002205848694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203052818775177, + "step": 18094 + }, + { + "epoch": 0.36192, + "grad_norm": 1.7890625, + "grad_norm_var": 0.009718577067057291, + "learning_rate": 0.0001, + "loss": 3.7458, + "loss/crossentropy": 2.240887403488159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19722212105989456, + "step": 18096 + }, + { + "epoch": 0.36196, + "grad_norm": 2.078125, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1473, + "loss/crossentropy": 1.8601738214492798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938677802681923, + "step": 18098 + }, + { + "epoch": 0.362, + "grad_norm": 1.84375, + "grad_norm_var": 0.005729166666666666, + "learning_rate": 0.0001, + "loss": 4.0131, + "loss/crossentropy": 1.9588143229484558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559185206890106, + "step": 18100 + }, + { + "epoch": 0.36204, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0060618082682291664, + "learning_rate": 0.0001, + "loss": 3.9199, + "loss/crossentropy": 1.909518837928772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974424198269844, + "step": 18102 + }, + { + "epoch": 0.36208, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005641428629557291, + "learning_rate": 0.0001, + "loss": 4.15, + "loss/crossentropy": 2.0762908458709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981927454471588, + "step": 18104 + }, + { + "epoch": 0.36212, + "grad_norm": 2.015625, + "grad_norm_var": 0.006339263916015625, + "learning_rate": 0.0001, + "loss": 4.3226, + "loss/crossentropy": 2.4555106163024902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22001618146896362, + "step": 18106 + }, + { + "epoch": 0.36216, + "grad_norm": 1.7734375, + "grad_norm_var": 0.013533528645833333, + "learning_rate": 0.0001, + "loss": 4.0595, + "loss/crossentropy": 1.9983150959014893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945750385522842, + "step": 18108 + }, + { + "epoch": 0.3622, + "grad_norm": 1.90625, + "grad_norm_var": 0.013181304931640625, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 2.152313530445099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21388111263513565, + "step": 18110 + }, + { + "epoch": 0.36224, + "grad_norm": 2.125, + "grad_norm_var": 0.016886393229166668, + "learning_rate": 0.0001, + "loss": 4.1128, + "loss/crossentropy": 1.690669596195221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883523240685463, + "step": 18112 + }, + { + "epoch": 0.36228, + "grad_norm": 2.09375, + "grad_norm_var": 0.017179107666015624, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.9404500126838684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18945956230163574, + "step": 18114 + }, + { + "epoch": 0.36232, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016007486979166666, + "learning_rate": 0.0001, + "loss": 4.4396, + "loss/crossentropy": 2.213471293449402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22550443559885025, + "step": 18116 + }, + { + "epoch": 0.36236, + "grad_norm": 2.015625, + "grad_norm_var": 0.014233144124348958, + "learning_rate": 0.0001, + "loss": 4.3084, + "loss/crossentropy": 2.1008135080337524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002386674284935, + "step": 18118 + }, + { + "epoch": 0.3624, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013866933186848958, + "learning_rate": 0.0001, + "loss": 3.8664, + "loss/crossentropy": 1.9704426527023315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806606948375702, + "step": 18120 + }, + { + "epoch": 0.36244, + "grad_norm": 1.890625, + "grad_norm_var": 0.01473388671875, + "learning_rate": 0.0001, + "loss": 3.7456, + "loss/crossentropy": 1.8658949732780457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861871004104614, + "step": 18122 + }, + { + "epoch": 0.36248, + "grad_norm": 2.0625, + "grad_norm_var": 0.011034901936848958, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.9327979683876038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19395866990089417, + "step": 18124 + }, + { + "epoch": 0.36252, + "grad_norm": 1.984375, + "grad_norm_var": 0.009641265869140625, + "learning_rate": 0.0001, + "loss": 4.071, + "loss/crossentropy": 2.377102255821228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21928569674491882, + "step": 18126 + }, + { + "epoch": 0.36256, + "grad_norm": 2.0625, + "grad_norm_var": 0.006994374593098958, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 2.1650161743164062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2292800396680832, + "step": 18128 + }, + { + "epoch": 0.3626, + "grad_norm": 2.078125, + "grad_norm_var": 0.007458241780598959, + "learning_rate": 0.0001, + "loss": 4.3135, + "loss/crossentropy": 2.281827986240387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190621793270111, + "step": 18130 + }, + { + "epoch": 0.36264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00848388671875, + "learning_rate": 0.0001, + "loss": 3.8335, + "loss/crossentropy": 2.0073219537734985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18984179198741913, + "step": 18132 + }, + { + "epoch": 0.36268, + "grad_norm": 2.0625, + "grad_norm_var": 0.01141357421875, + "learning_rate": 0.0001, + "loss": 3.7551, + "loss/crossentropy": 1.7675580978393555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18771683424711227, + "step": 18134 + }, + { + "epoch": 0.36272, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011154937744140624, + "learning_rate": 0.0001, + "loss": 3.8816, + "loss/crossentropy": 2.064103126525879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835159718990326, + "step": 18136 + }, + { + "epoch": 0.36276, + "grad_norm": 2.140625, + "grad_norm_var": 0.0126861572265625, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 2.1821314096450806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2391536980867386, + "step": 18138 + }, + { + "epoch": 0.3628, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010117340087890624, + "learning_rate": 0.0001, + "loss": 4.2307, + "loss/crossentropy": 2.295500636100769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215906023979187, + "step": 18140 + }, + { + "epoch": 0.36284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010245768229166667, + "learning_rate": 0.0001, + "loss": 4.1144, + "loss/crossentropy": 2.0736570954322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18491360545158386, + "step": 18142 + }, + { + "epoch": 0.36288, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009496053059895834, + "learning_rate": 0.0001, + "loss": 4.1326, + "loss/crossentropy": 2.0513535737991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204297453165054, + "step": 18144 + }, + { + "epoch": 0.36292, + "grad_norm": 2.140625, + "grad_norm_var": 0.010601552327473958, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 2.1471269130706787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204594686627388, + "step": 18146 + }, + { + "epoch": 0.36296, + "grad_norm": 2.125, + "grad_norm_var": 0.011372884114583334, + "learning_rate": 0.0001, + "loss": 4.2728, + "loss/crossentropy": 2.093555986881256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20173487067222595, + "step": 18148 + }, + { + "epoch": 0.363, + "grad_norm": 1.6953125, + "grad_norm_var": 0.013627115885416667, + "learning_rate": 0.0001, + "loss": 3.7183, + "loss/crossentropy": 1.6938685178756714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703190803527832, + "step": 18150 + }, + { + "epoch": 0.36304, + "grad_norm": 2.046875, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 4.116, + "loss/crossentropy": 2.180016875267029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23546195775270462, + "step": 18152 + }, + { + "epoch": 0.36308, + "grad_norm": 2.015625, + "grad_norm_var": 0.011277008056640624, + "learning_rate": 0.0001, + "loss": 4.1826, + "loss/crossentropy": 2.145975947380066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20554804801940918, + "step": 18154 + }, + { + "epoch": 0.36312, + "grad_norm": 1.953125, + "grad_norm_var": 0.011131795247395833, + "learning_rate": 0.0001, + "loss": 4.0113, + "loss/crossentropy": 2.068985939025879, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20410801470279694, + "step": 18156 + }, + { + "epoch": 0.36316, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012572224934895833, + "learning_rate": 0.0001, + "loss": 3.8273, + "loss/crossentropy": 1.8428975343704224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19165785610675812, + "step": 18158 + }, + { + "epoch": 0.3632, + "grad_norm": 1.875, + "grad_norm_var": 0.0131744384765625, + "learning_rate": 0.0001, + "loss": 3.7808, + "loss/crossentropy": 1.859747588634491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18524104356765747, + "step": 18160 + }, + { + "epoch": 0.36324, + "grad_norm": 2.0, + "grad_norm_var": 0.011521148681640624, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.1383343935012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649780869483948, + "step": 18162 + }, + { + "epoch": 0.36328, + "grad_norm": 2.078125, + "grad_norm_var": 0.010758209228515624, + "learning_rate": 0.0001, + "loss": 3.9889, + "loss/crossentropy": 2.0153123140335083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131347879767418, + "step": 18164 + }, + { + "epoch": 0.36332, + "grad_norm": 1.875, + "grad_norm_var": 0.007004547119140625, + "learning_rate": 0.0001, + "loss": 4.0827, + "loss/crossentropy": 1.8382813930511475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18116765469312668, + "step": 18166 + }, + { + "epoch": 0.36336, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007287343343098958, + "learning_rate": 0.0001, + "loss": 3.8883, + "loss/crossentropy": 1.9639039039611816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014933079481125, + "step": 18168 + }, + { + "epoch": 0.3634, + "grad_norm": 2.21875, + "grad_norm_var": 0.012446848551432292, + "learning_rate": 0.0001, + "loss": 4.2886, + "loss/crossentropy": 2.057366132736206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134721800684929, + "step": 18170 + }, + { + "epoch": 0.36344, + "grad_norm": 1.953125, + "grad_norm_var": 0.012690989176432292, + "learning_rate": 0.0001, + "loss": 4.0801, + "loss/crossentropy": 2.244979500770569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219241164624691, + "step": 18172 + }, + { + "epoch": 0.36348, + "grad_norm": 1.921875, + "grad_norm_var": 0.010625966389973958, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 2.1159361600875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19503474980592728, + "step": 18174 + }, + { + "epoch": 0.36352, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0099761962890625, + "learning_rate": 0.0001, + "loss": 4.2031, + "loss/crossentropy": 2.240646004676819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22175125032663345, + "step": 18176 + }, + { + "epoch": 0.36356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011327107747395834, + "learning_rate": 0.0001, + "loss": 3.9508, + "loss/crossentropy": 1.7946885228157043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17505639791488647, + "step": 18178 + }, + { + "epoch": 0.3636, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010420735677083333, + "learning_rate": 0.0001, + "loss": 4.1138, + "loss/crossentropy": 2.2058286666870117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21548640727996826, + "step": 18180 + }, + { + "epoch": 0.36364, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0101226806640625, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 2.168944835662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22227579355239868, + "step": 18182 + }, + { + "epoch": 0.36368, + "grad_norm": 1.96875, + "grad_norm_var": 0.008392079671223959, + "learning_rate": 0.0001, + "loss": 4.1482, + "loss/crossentropy": 2.019882082939148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20161078870296478, + "step": 18184 + }, + { + "epoch": 0.36372, + "grad_norm": 1.9375, + "grad_norm_var": 0.0034993489583333335, + "learning_rate": 0.0001, + "loss": 4.1305, + "loss/crossentropy": 2.000952959060669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19938969612121582, + "step": 18186 + }, + { + "epoch": 0.36376, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0024920145670572916, + "learning_rate": 0.0001, + "loss": 3.9158, + "loss/crossentropy": 2.0612798929214478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022939696907997, + "step": 18188 + }, + { + "epoch": 0.3638, + "grad_norm": 1.78125, + "grad_norm_var": 0.0042307535807291664, + "learning_rate": 0.0001, + "loss": 3.8822, + "loss/crossentropy": 1.664880096912384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16950316727161407, + "step": 18190 + }, + { + "epoch": 0.36384, + "grad_norm": 1.875, + "grad_norm_var": 0.003360748291015625, + "learning_rate": 0.0001, + "loss": 3.9881, + "loss/crossentropy": 1.8047854900360107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823924407362938, + "step": 18192 + }, + { + "epoch": 0.36388, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0060808817545572914, + "learning_rate": 0.0001, + "loss": 3.7857, + "loss/crossentropy": 2.167177438735962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963154897093773, + "step": 18194 + }, + { + "epoch": 0.36392, + "grad_norm": 1.875, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 4.0372, + "loss/crossentropy": 2.396019458770752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359576493501663, + "step": 18196 + }, + { + "epoch": 0.36396, + "grad_norm": 1.9375, + "grad_norm_var": 0.005866495768229166, + "learning_rate": 0.0001, + "loss": 3.9349, + "loss/crossentropy": 2.0866541862487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082398235797882, + "step": 18198 + }, + { + "epoch": 0.364, + "grad_norm": 2.015625, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 4.1046, + "loss/crossentropy": 2.170537829399109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030719250440598, + "step": 18200 + }, + { + "epoch": 0.36404, + "grad_norm": 2.328125, + "grad_norm_var": 0.024234771728515625, + "learning_rate": 0.0001, + "loss": 4.3921, + "loss/crossentropy": 1.938852846622467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19586428999900818, + "step": 18202 + }, + { + "epoch": 0.36408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.023996734619140626, + "learning_rate": 0.0001, + "loss": 4.0629, + "loss/crossentropy": 1.8909979462623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17625273764133453, + "step": 18204 + }, + { + "epoch": 0.36412, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022078450520833334, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 1.9127016067504883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1888885200023651, + "step": 18206 + }, + { + "epoch": 0.36416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03103612263997396, + "learning_rate": 0.0001, + "loss": 4.1826, + "loss/crossentropy": 2.182482957839966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21610775589942932, + "step": 18208 + }, + { + "epoch": 0.3642, + "grad_norm": 1.8828125, + "grad_norm_var": 0.027497355143229166, + "learning_rate": 0.0001, + "loss": 4.1941, + "loss/crossentropy": 2.0900736451148987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20886409282684326, + "step": 18210 + }, + { + "epoch": 0.36424, + "grad_norm": 2.0, + "grad_norm_var": 0.0248199462890625, + "learning_rate": 0.0001, + "loss": 3.9184, + "loss/crossentropy": 2.2132604122161865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21213480830192566, + "step": 18212 + }, + { + "epoch": 0.36428, + "grad_norm": 2.171875, + "grad_norm_var": 0.024825032552083334, + "learning_rate": 0.0001, + "loss": 4.0002, + "loss/crossentropy": 1.7810762524604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188262477517128, + "step": 18214 + }, + { + "epoch": 0.36432, + "grad_norm": 1.984375, + "grad_norm_var": 0.02490208943684896, + "learning_rate": 0.0001, + "loss": 3.8339, + "loss/crossentropy": 2.072141647338867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20580865442752838, + "step": 18216 + }, + { + "epoch": 0.36436, + "grad_norm": 2.125, + "grad_norm_var": 0.016778310139973957, + "learning_rate": 0.0001, + "loss": 4.0777, + "loss/crossentropy": 1.9672082662582397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19521142542362213, + "step": 18218 + }, + { + "epoch": 0.3644, + "grad_norm": 1.890625, + "grad_norm_var": 0.017574055989583334, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 2.0122682452201843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21602813154459, + "step": 18220 + }, + { + "epoch": 0.36444, + "grad_norm": 2.078125, + "grad_norm_var": 0.0169342041015625, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 2.1709738969802856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2267712950706482, + "step": 18222 + }, + { + "epoch": 0.36448, + "grad_norm": 2.03125, + "grad_norm_var": 0.009091949462890625, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 2.246786117553711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128008008003235, + "step": 18224 + }, + { + "epoch": 0.36452, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0063250223795572914, + "learning_rate": 0.0001, + "loss": 4.1051, + "loss/crossentropy": 1.686498999595642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18233107775449753, + "step": 18226 + }, + { + "epoch": 0.36456, + "grad_norm": 2.078125, + "grad_norm_var": 0.006786855061848959, + "learning_rate": 0.0001, + "loss": 4.3823, + "loss/crossentropy": 2.5849136114120483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23062889277935028, + "step": 18228 + }, + { + "epoch": 0.3646, + "grad_norm": 1.921875, + "grad_norm_var": 0.005804189046223958, + "learning_rate": 0.0001, + "loss": 4.04, + "loss/crossentropy": 2.068212151527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198806375265121, + "step": 18230 + }, + { + "epoch": 0.36464, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00618896484375, + "learning_rate": 0.0001, + "loss": 3.8729, + "loss/crossentropy": 1.727245271205902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17305339127779007, + "step": 18232 + }, + { + "epoch": 0.36468, + "grad_norm": 2.078125, + "grad_norm_var": 0.005574289957682292, + "learning_rate": 0.0001, + "loss": 4.0496, + "loss/crossentropy": 2.0269583463668823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060471773147583, + "step": 18234 + }, + { + "epoch": 0.36472, + "grad_norm": 2.015625, + "grad_norm_var": 0.004801177978515625, + "learning_rate": 0.0001, + "loss": 4.1707, + "loss/crossentropy": 1.9194093346595764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19781427085399628, + "step": 18236 + }, + { + "epoch": 0.36476, + "grad_norm": 2.078125, + "grad_norm_var": 0.0055328369140625, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 1.7539438605308533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985094666481018, + "step": 18238 + }, + { + "epoch": 0.3648, + "grad_norm": 1.890625, + "grad_norm_var": 0.006068674723307291, + "learning_rate": 0.0001, + "loss": 4.1287, + "loss/crossentropy": 2.1092851161956787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23365512490272522, + "step": 18240 + }, + { + "epoch": 0.36484, + "grad_norm": 1.984375, + "grad_norm_var": 0.00771484375, + "learning_rate": 0.0001, + "loss": 3.6435, + "loss/crossentropy": 1.772037386894226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18215186148881912, + "step": 18242 + }, + { + "epoch": 0.36488, + "grad_norm": 2.109375, + "grad_norm_var": 0.007845052083333333, + "learning_rate": 0.0001, + "loss": 4.1017, + "loss/crossentropy": 2.0108843445777893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20684240013360977, + "step": 18244 + }, + { + "epoch": 0.36492, + "grad_norm": 2.109375, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 2.0076091289520264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842496171593666, + "step": 18246 + }, + { + "epoch": 0.36496, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008085123697916667, + "learning_rate": 0.0001, + "loss": 4.1357, + "loss/crossentropy": 2.1824593544006348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19478252530097961, + "step": 18248 + }, + { + "epoch": 0.365, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0068318684895833336, + "learning_rate": 0.0001, + "loss": 4.2231, + "loss/crossentropy": 2.1033846139907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21361663192510605, + "step": 18250 + }, + { + "epoch": 0.36504, + "grad_norm": 1.84375, + "grad_norm_var": 0.009299468994140626, + "learning_rate": 0.0001, + "loss": 3.7738, + "loss/crossentropy": 2.142418146133423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618261396884918, + "step": 18252 + }, + { + "epoch": 0.36508, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008131663004557291, + "learning_rate": 0.0001, + "loss": 4.1037, + "loss/crossentropy": 2.061814546585083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20519836992025375, + "step": 18254 + }, + { + "epoch": 0.36512, + "grad_norm": 2.015625, + "grad_norm_var": 0.008512115478515625, + "learning_rate": 0.0001, + "loss": 4.0512, + "loss/crossentropy": 2.0216987133026123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956964135169983, + "step": 18256 + }, + { + "epoch": 0.36516, + "grad_norm": 2.03125, + "grad_norm_var": 0.006681315104166667, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.2619231939315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21638523787260056, + "step": 18258 + }, + { + "epoch": 0.3652, + "grad_norm": 2.09375, + "grad_norm_var": 0.006281534830729167, + "learning_rate": 0.0001, + "loss": 4.165, + "loss/crossentropy": 1.8058243989944458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046328961849213, + "step": 18260 + }, + { + "epoch": 0.36524, + "grad_norm": 2.046875, + "grad_norm_var": 0.005639394124348958, + "learning_rate": 0.0001, + "loss": 4.1876, + "loss/crossentropy": 2.0969032049179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20291081070899963, + "step": 18262 + }, + { + "epoch": 0.36528, + "grad_norm": 1.8125, + "grad_norm_var": 0.006794993082682292, + "learning_rate": 0.0001, + "loss": 3.8754, + "loss/crossentropy": 1.9567083716392517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624605774879456, + "step": 18264 + }, + { + "epoch": 0.36532, + "grad_norm": 1.921875, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 1.9015939235687256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18771862238645554, + "step": 18266 + }, + { + "epoch": 0.36536, + "grad_norm": 1.9375, + "grad_norm_var": 0.017575836181640624, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 1.9432410597801208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785590723156929, + "step": 18268 + }, + { + "epoch": 0.3654, + "grad_norm": 2.0, + "grad_norm_var": 0.017438761393229165, + "learning_rate": 0.0001, + "loss": 4.3256, + "loss/crossentropy": 2.216074585914612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21463461220264435, + "step": 18270 + }, + { + "epoch": 0.36544, + "grad_norm": 1.8203125, + "grad_norm_var": 0.020401763916015624, + "learning_rate": 0.0001, + "loss": 3.7192, + "loss/crossentropy": 1.5440006256103516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1621650904417038, + "step": 18272 + }, + { + "epoch": 0.36548, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02197850545247396, + "learning_rate": 0.0001, + "loss": 3.9072, + "loss/crossentropy": 1.9478511214256287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20275350660085678, + "step": 18274 + }, + { + "epoch": 0.36552, + "grad_norm": 2.0, + "grad_norm_var": 0.02217381795247396, + "learning_rate": 0.0001, + "loss": 4.1986, + "loss/crossentropy": 2.16433984041214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180611103773117, + "step": 18276 + }, + { + "epoch": 0.36556, + "grad_norm": 2.015625, + "grad_norm_var": 0.02182184855143229, + "learning_rate": 0.0001, + "loss": 4.0345, + "loss/crossentropy": 1.9628196954727173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198894202709198, + "step": 18278 + }, + { + "epoch": 0.3656, + "grad_norm": 1.8984375, + "grad_norm_var": 0.020409901936848957, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.7818017601966858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19511079788208008, + "step": 18280 + }, + { + "epoch": 0.36564, + "grad_norm": 2.15625, + "grad_norm_var": 0.02315241495768229, + "learning_rate": 0.0001, + "loss": 4.1769, + "loss/crossentropy": 2.0149444341659546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128366306424141, + "step": 18282 + }, + { + "epoch": 0.36568, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010351308186848958, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.1580519676208496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166586130857468, + "step": 18284 + }, + { + "epoch": 0.36572, + "grad_norm": 1.859375, + "grad_norm_var": 0.03432591756184896, + "learning_rate": 0.0001, + "loss": 3.9196, + "loss/crossentropy": 2.308629631996155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20283856242895126, + "step": 18286 + }, + { + "epoch": 0.36576, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03199055989583333, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 2.1761614084243774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.212914377450943, + "step": 18288 + }, + { + "epoch": 0.3658, + "grad_norm": 1.9453125, + "grad_norm_var": 0.029842122395833334, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 1.9851300120353699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898990124464035, + "step": 18290 + }, + { + "epoch": 0.36584, + "grad_norm": 1.859375, + "grad_norm_var": 0.031172688802083334, + "learning_rate": 0.0001, + "loss": 4.028, + "loss/crossentropy": 2.1123871207237244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982655137777328, + "step": 18292 + }, + { + "epoch": 0.36588, + "grad_norm": 2.0625, + "grad_norm_var": 0.03831761678059896, + "learning_rate": 0.0001, + "loss": 4.3007, + "loss/crossentropy": 2.2666051387786865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23328793793916702, + "step": 18294 + }, + { + "epoch": 0.36592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03904393513997396, + "learning_rate": 0.0001, + "loss": 3.8741, + "loss/crossentropy": 1.9444871544837952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20335105061531067, + "step": 18296 + }, + { + "epoch": 0.36596, + "grad_norm": 2.046875, + "grad_norm_var": 0.036717732747395836, + "learning_rate": 0.0001, + "loss": 4.0799, + "loss/crossentropy": 2.136604368686676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20185644924640656, + "step": 18298 + }, + { + "epoch": 0.366, + "grad_norm": 2.59375, + "grad_norm_var": 0.059024810791015625, + "learning_rate": 0.0001, + "loss": 4.0605, + "loss/crossentropy": 1.9029142260551453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24649157375097275, + "step": 18300 + }, + { + "epoch": 0.36604, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0400054931640625, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.0363988876342773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920708030462265, + "step": 18302 + }, + { + "epoch": 0.36608, + "grad_norm": 1.8984375, + "grad_norm_var": 0.04248860677083333, + "learning_rate": 0.0001, + "loss": 3.5556, + "loss/crossentropy": 1.8357294797897339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18201576173305511, + "step": 18304 + }, + { + "epoch": 0.36612, + "grad_norm": 2.109375, + "grad_norm_var": 0.04253743489583333, + "learning_rate": 0.0001, + "loss": 4.3381, + "loss/crossentropy": 2.194098114967346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20245323330163956, + "step": 18306 + }, + { + "epoch": 0.36616, + "grad_norm": 2.03125, + "grad_norm_var": 0.0409332275390625, + "learning_rate": 0.0001, + "loss": 4.2501, + "loss/crossentropy": 2.3840869665145874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2243075668811798, + "step": 18308 + }, + { + "epoch": 0.3662, + "grad_norm": 1.9375, + "grad_norm_var": 0.034398396809895836, + "learning_rate": 0.0001, + "loss": 4.1895, + "loss/crossentropy": 2.3093236684799194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22531048208475113, + "step": 18310 + }, + { + "epoch": 0.36624, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03472493489583333, + "learning_rate": 0.0001, + "loss": 3.7629, + "loss/crossentropy": 2.155013680458069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19654065370559692, + "step": 18312 + }, + { + "epoch": 0.36628, + "grad_norm": 1.9453125, + "grad_norm_var": 0.03486226399739583, + "learning_rate": 0.0001, + "loss": 4.0241, + "loss/crossentropy": 1.8823603391647339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18333254009485245, + "step": 18314 + }, + { + "epoch": 0.36632, + "grad_norm": 2.046875, + "grad_norm_var": 0.0090972900390625, + "learning_rate": 0.0001, + "loss": 3.8092, + "loss/crossentropy": 1.9217600226402283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18095016479492188, + "step": 18316 + }, + { + "epoch": 0.36636, + "grad_norm": 1.84375, + "grad_norm_var": 0.009004720052083333, + "learning_rate": 0.0001, + "loss": 3.8178, + "loss/crossentropy": 1.7448294758796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18972519785165787, + "step": 18318 + }, + { + "epoch": 0.3664, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008442942301432292, + "learning_rate": 0.0001, + "loss": 3.9394, + "loss/crossentropy": 1.8730336427688599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18396812677383423, + "step": 18320 + }, + { + "epoch": 0.36644, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006955718994140625, + "learning_rate": 0.0001, + "loss": 3.8763, + "loss/crossentropy": 2.075575351715088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20251936465501785, + "step": 18322 + }, + { + "epoch": 0.36648, + "grad_norm": 1.96875, + "grad_norm_var": 0.004131825764973959, + "learning_rate": 0.0001, + "loss": 4.0955, + "loss/crossentropy": 2.083173990249634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003224566578865, + "step": 18324 + }, + { + "epoch": 0.36652, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00396728515625, + "learning_rate": 0.0001, + "loss": 4.0245, + "loss/crossentropy": 2.4401766061782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21872484683990479, + "step": 18326 + }, + { + "epoch": 0.36656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038533528645833333, + "learning_rate": 0.0001, + "loss": 4.0249, + "loss/crossentropy": 2.2402881383895874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977379471063614, + "step": 18328 + }, + { + "epoch": 0.3666, + "grad_norm": 1.921875, + "grad_norm_var": 0.004650624593098959, + "learning_rate": 0.0001, + "loss": 4.1047, + "loss/crossentropy": 2.2246369123458862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21132111549377441, + "step": 18330 + }, + { + "epoch": 0.36664, + "grad_norm": 2.046875, + "grad_norm_var": 0.0043365478515625, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 1.8048500418663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18298982083797455, + "step": 18332 + }, + { + "epoch": 0.36668, + "grad_norm": 2.15625, + "grad_norm_var": 0.006514231363932292, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 2.3368008136749268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21743982285261154, + "step": 18334 + }, + { + "epoch": 0.36672, + "grad_norm": 2.234375, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 4.3158, + "loss/crossentropy": 2.3520134687423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22603602707386017, + "step": 18336 + }, + { + "epoch": 0.36676, + "grad_norm": 2.140625, + "grad_norm_var": 0.011248524983723958, + "learning_rate": 0.0001, + "loss": 4.0037, + "loss/crossentropy": 2.0071199536323547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18584266304969788, + "step": 18338 + }, + { + "epoch": 0.3668, + "grad_norm": 2.234375, + "grad_norm_var": 0.019260406494140625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 1.9329636693000793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21350353956222534, + "step": 18340 + }, + { + "epoch": 0.36684, + "grad_norm": 2.0, + "grad_norm_var": 0.015990193684895834, + "learning_rate": 0.0001, + "loss": 4.0387, + "loss/crossentropy": 1.8231948018074036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064230963587761, + "step": 18342 + }, + { + "epoch": 0.36688, + "grad_norm": 1.828125, + "grad_norm_var": 0.01605809529622396, + "learning_rate": 0.0001, + "loss": 4.0888, + "loss/crossentropy": 1.9928399324417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996312066912651, + "step": 18344 + }, + { + "epoch": 0.36692, + "grad_norm": 1.96875, + "grad_norm_var": 0.015916951497395835, + "learning_rate": 0.0001, + "loss": 4.1736, + "loss/crossentropy": 2.139783501625061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20407474786043167, + "step": 18346 + }, + { + "epoch": 0.36696, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017166900634765624, + "learning_rate": 0.0001, + "loss": 4.0357, + "loss/crossentropy": 1.8519954681396484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18404167890548706, + "step": 18348 + }, + { + "epoch": 0.367, + "grad_norm": 2.015625, + "grad_norm_var": 0.016361236572265625, + "learning_rate": 0.0001, + "loss": 4.1325, + "loss/crossentropy": 2.037365674972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2384558469057083, + "step": 18350 + }, + { + "epoch": 0.36704, + "grad_norm": 1.875, + "grad_norm_var": 0.015600331624348958, + "learning_rate": 0.0001, + "loss": 3.887, + "loss/crossentropy": 2.0023937821388245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700156897306442, + "step": 18352 + }, + { + "epoch": 0.36708, + "grad_norm": 2.125, + "grad_norm_var": 0.015282185872395833, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 1.829107940196991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559869915246964, + "step": 18354 + }, + { + "epoch": 0.36712, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006012980143229167, + "learning_rate": 0.0001, + "loss": 4.0962, + "loss/crossentropy": 2.206624150276184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166636437177658, + "step": 18356 + }, + { + "epoch": 0.36716, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005907185872395833, + "learning_rate": 0.0001, + "loss": 4.1191, + "loss/crossentropy": 2.1971875429153442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20924220979213715, + "step": 18358 + }, + { + "epoch": 0.3672, + "grad_norm": 2.1875, + "grad_norm_var": 0.0069539388020833336, + "learning_rate": 0.0001, + "loss": 4.4876, + "loss/crossentropy": 2.554604172706604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2638286352157593, + "step": 18360 + }, + { + "epoch": 0.36724, + "grad_norm": 1.9375, + "grad_norm_var": 0.007981109619140624, + "learning_rate": 0.0001, + "loss": 3.9076, + "loss/crossentropy": 2.088346302509308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948780044913292, + "step": 18362 + }, + { + "epoch": 0.36728, + "grad_norm": 1.875, + "grad_norm_var": 0.0078033447265625, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.24162495136261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21616502106189728, + "step": 18364 + }, + { + "epoch": 0.36732, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008567047119140626, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 2.249878764152527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312733307480812, + "step": 18366 + }, + { + "epoch": 0.36736, + "grad_norm": 1.921875, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 4.0711, + "loss/crossentropy": 2.0440531969070435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18561603128910065, + "step": 18368 + }, + { + "epoch": 0.3674, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006696573893229167, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.2324228286743164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20741964876651764, + "step": 18370 + }, + { + "epoch": 0.36744, + "grad_norm": 2.125, + "grad_norm_var": 0.008634440104166667, + "learning_rate": 0.0001, + "loss": 4.2807, + "loss/crossentropy": 2.168972373008728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.215408056974411, + "step": 18372 + }, + { + "epoch": 0.36748, + "grad_norm": 1.890625, + "grad_norm_var": 0.008955637613932291, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 2.076392412185669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023996263742447, + "step": 18374 + }, + { + "epoch": 0.36752, + "grad_norm": 2.046875, + "grad_norm_var": 0.005968983968098958, + "learning_rate": 0.0001, + "loss": 4.2958, + "loss/crossentropy": 2.0497928857803345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21541842073202133, + "step": 18376 + }, + { + "epoch": 0.36756, + "grad_norm": 1.6875, + "grad_norm_var": 0.0102294921875, + "learning_rate": 0.0001, + "loss": 3.6889, + "loss/crossentropy": 1.8894451260566711, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294879794120789, + "step": 18378 + }, + { + "epoch": 0.3676, + "grad_norm": 1.984375, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 2.1654014587402344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18372679501771927, + "step": 18380 + }, + { + "epoch": 0.36764, + "grad_norm": 2.1875, + "grad_norm_var": 0.013516998291015625, + "learning_rate": 0.0001, + "loss": 4.0657, + "loss/crossentropy": 2.203396499156952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22593200951814651, + "step": 18382 + }, + { + "epoch": 0.36768, + "grad_norm": 2.03125, + "grad_norm_var": 0.014090728759765626, + "learning_rate": 0.0001, + "loss": 3.946, + "loss/crossentropy": 1.7496543526649475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825375333428383, + "step": 18384 + }, + { + "epoch": 0.36772, + "grad_norm": 1.875, + "grad_norm_var": 0.014249420166015625, + "learning_rate": 0.0001, + "loss": 3.7667, + "loss/crossentropy": 1.9488004446029663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18244057893753052, + "step": 18386 + }, + { + "epoch": 0.36776, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01231689453125, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.9792875051498413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204880088567734, + "step": 18388 + }, + { + "epoch": 0.3678, + "grad_norm": 1.890625, + "grad_norm_var": 0.012412261962890626, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.890614092350006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19333010911941528, + "step": 18390 + }, + { + "epoch": 0.36784, + "grad_norm": 1.96875, + "grad_norm_var": 0.011832427978515626, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.0901471972465515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20534122735261917, + "step": 18392 + }, + { + "epoch": 0.36788, + "grad_norm": 2.109375, + "grad_norm_var": 0.00716552734375, + "learning_rate": 0.0001, + "loss": 4.2502, + "loss/crossentropy": 2.2931089401245117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130991816520691, + "step": 18394 + }, + { + "epoch": 0.36792, + "grad_norm": 2.078125, + "grad_norm_var": 0.0077392578125, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.2144237756729126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22273198515176773, + "step": 18396 + }, + { + "epoch": 0.36796, + "grad_norm": 1.8046875, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 3.7365, + "loss/crossentropy": 1.9135422110557556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983061358332634, + "step": 18398 + }, + { + "epoch": 0.368, + "grad_norm": 2.0, + "grad_norm_var": 0.007456207275390625, + "learning_rate": 0.0001, + "loss": 3.7556, + "loss/crossentropy": 1.692852795124054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1670849658548832, + "step": 18400 + }, + { + "epoch": 0.36804, + "grad_norm": 2.09375, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 4.0043, + "loss/crossentropy": 1.908652126789093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21787738054990768, + "step": 18402 + }, + { + "epoch": 0.36808, + "grad_norm": 2.015625, + "grad_norm_var": 0.008432769775390625, + "learning_rate": 0.0001, + "loss": 4.2343, + "loss/crossentropy": 2.0613549947738647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495950430631638, + "step": 18404 + }, + { + "epoch": 0.36812, + "grad_norm": 1.890625, + "grad_norm_var": 0.026759592692057292, + "learning_rate": 0.0001, + "loss": 4.0549, + "loss/crossentropy": 1.9765326976776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18859146535396576, + "step": 18406 + }, + { + "epoch": 0.36816, + "grad_norm": 1.953125, + "grad_norm_var": 0.027147420247395835, + "learning_rate": 0.0001, + "loss": 4.0196, + "loss/crossentropy": 1.8403696417808533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18926746398210526, + "step": 18408 + }, + { + "epoch": 0.3682, + "grad_norm": 2.15625, + "grad_norm_var": 0.029080963134765624, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 2.1792644262313843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22263357043266296, + "step": 18410 + }, + { + "epoch": 0.36824, + "grad_norm": 2.0625, + "grad_norm_var": 0.02910334269205729, + "learning_rate": 0.0001, + "loss": 4.2387, + "loss/crossentropy": 2.079226016998291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20990607887506485, + "step": 18412 + }, + { + "epoch": 0.36828, + "grad_norm": 1.8671875, + "grad_norm_var": 0.027457427978515626, + "learning_rate": 0.0001, + "loss": 4.2093, + "loss/crossentropy": 2.1428889632225037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22140806913375854, + "step": 18414 + }, + { + "epoch": 0.36832, + "grad_norm": 1.984375, + "grad_norm_var": 0.024589029947916667, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 2.1461609601974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20594316720962524, + "step": 18416 + }, + { + "epoch": 0.36836, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02399470011393229, + "learning_rate": 0.0001, + "loss": 4.1597, + "loss/crossentropy": 1.980036735534668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201970636844635, + "step": 18418 + }, + { + "epoch": 0.3684, + "grad_norm": 1.9140625, + "grad_norm_var": 0.024773915608723957, + "learning_rate": 0.0001, + "loss": 3.9847, + "loss/crossentropy": 1.6273554563522339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938486322760582, + "step": 18420 + }, + { + "epoch": 0.36844, + "grad_norm": 2.03125, + "grad_norm_var": 0.006251780192057291, + "learning_rate": 0.0001, + "loss": 4.1784, + "loss/crossentropy": 1.9100900292396545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18063092976808548, + "step": 18422 + }, + { + "epoch": 0.36848, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063555399576822914, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 2.1882822513580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21261122822761536, + "step": 18424 + }, + { + "epoch": 0.36852, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0046384175618489586, + "learning_rate": 0.0001, + "loss": 3.9556, + "loss/crossentropy": 2.210257649421692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929008737206459, + "step": 18426 + }, + { + "epoch": 0.36856, + "grad_norm": 2.078125, + "grad_norm_var": 0.005041249593098958, + "learning_rate": 0.0001, + "loss": 4.1691, + "loss/crossentropy": 2.0247724056243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004971206188202, + "step": 18428 + }, + { + "epoch": 0.3686, + "grad_norm": 2.015625, + "grad_norm_var": 0.004133097330729167, + "learning_rate": 0.0001, + "loss": 4.2092, + "loss/crossentropy": 1.9844316244125366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210312619805336, + "step": 18430 + }, + { + "epoch": 0.36864, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005000559488932291, + "learning_rate": 0.0001, + "loss": 4.0368, + "loss/crossentropy": 2.2416625022888184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21065251529216766, + "step": 18432 + }, + { + "epoch": 0.36868, + "grad_norm": 2.453125, + "grad_norm_var": 0.020104726155598957, + "learning_rate": 0.0001, + "loss": 4.4109, + "loss/crossentropy": 2.0283551812171936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150302305817604, + "step": 18434 + }, + { + "epoch": 0.36872, + "grad_norm": 2.015625, + "grad_norm_var": 0.01968994140625, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.210233688354492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831902861595154, + "step": 18436 + }, + { + "epoch": 0.36876, + "grad_norm": 1.9375, + "grad_norm_var": 0.0205474853515625, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 2.0454147458076477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2432640790939331, + "step": 18438 + }, + { + "epoch": 0.3688, + "grad_norm": 2.078125, + "grad_norm_var": 0.020444488525390624, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.024592399597168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20496471971273422, + "step": 18440 + }, + { + "epoch": 0.36884, + "grad_norm": 2.125, + "grad_norm_var": 0.018822987874348957, + "learning_rate": 0.0001, + "loss": 4.3058, + "loss/crossentropy": 2.4205459356307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22281523793935776, + "step": 18442 + }, + { + "epoch": 0.36888, + "grad_norm": 1.96875, + "grad_norm_var": 0.0170806884765625, + "learning_rate": 0.0001, + "loss": 4.0341, + "loss/crossentropy": 1.9198943376541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916896402835846, + "step": 18444 + }, + { + "epoch": 0.36892, + "grad_norm": 2.0625, + "grad_norm_var": 0.0171051025390625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 1.928059160709381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899876967072487, + "step": 18446 + }, + { + "epoch": 0.36896, + "grad_norm": 1.921875, + "grad_norm_var": 0.020792388916015626, + "learning_rate": 0.0001, + "loss": 4.3327, + "loss/crossentropy": 2.346290349960327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22388508170843124, + "step": 18448 + }, + { + "epoch": 0.369, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009946441650390625, + "learning_rate": 0.0001, + "loss": 3.9064, + "loss/crossentropy": 2.0108938217163086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19462434202432632, + "step": 18450 + }, + { + "epoch": 0.36904, + "grad_norm": 2.0, + "grad_norm_var": 0.00986328125, + "learning_rate": 0.0001, + "loss": 4.2468, + "loss/crossentropy": 1.7108886241912842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18806710839271545, + "step": 18452 + }, + { + "epoch": 0.36908, + "grad_norm": 5.1875, + "grad_norm_var": 0.6345499674479167, + "learning_rate": 0.0001, + "loss": 4.6972, + "loss/crossentropy": 2.4162802696228027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3217930570244789, + "step": 18454 + }, + { + "epoch": 0.36912, + "grad_norm": 2.1875, + "grad_norm_var": 0.639013671875, + "learning_rate": 0.0001, + "loss": 3.4762, + "loss/crossentropy": 1.6459838151931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17543485760688782, + "step": 18456 + }, + { + "epoch": 0.36916, + "grad_norm": 2.140625, + "grad_norm_var": 0.640679677327474, + "learning_rate": 0.0001, + "loss": 4.1194, + "loss/crossentropy": 2.2111966013908386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22191359847784042, + "step": 18458 + }, + { + "epoch": 0.3692, + "grad_norm": 2.125, + "grad_norm_var": 0.6325887044270834, + "learning_rate": 0.0001, + "loss": 4.301, + "loss/crossentropy": 2.2514692544937134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21384654194116592, + "step": 18460 + }, + { + "epoch": 0.36924, + "grad_norm": 2.125, + "grad_norm_var": 0.63665771484375, + "learning_rate": 0.0001, + "loss": 4.307, + "loss/crossentropy": 2.097872793674469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21747223287820816, + "step": 18462 + }, + { + "epoch": 0.36928, + "grad_norm": 2.03125, + "grad_norm_var": 0.6418690999348958, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 2.1611807346343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042219042778015, + "step": 18464 + }, + { + "epoch": 0.36932, + "grad_norm": 2.15625, + "grad_norm_var": 0.62939453125, + "learning_rate": 0.0001, + "loss": 3.7871, + "loss/crossentropy": 1.8941562175750732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192266546189785, + "step": 18466 + }, + { + "epoch": 0.36936, + "grad_norm": 1.921875, + "grad_norm_var": 0.72021484375, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.8930317163467407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19436348974704742, + "step": 18468 + }, + { + "epoch": 0.3694, + "grad_norm": 1.9140625, + "grad_norm_var": 0.15979588826497396, + "learning_rate": 0.0001, + "loss": 4.1046, + "loss/crossentropy": 2.1041005849838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21314333379268646, + "step": 18470 + }, + { + "epoch": 0.36944, + "grad_norm": 1.9453125, + "grad_norm_var": 0.15750732421875, + "learning_rate": 0.0001, + "loss": 4.2096, + "loss/crossentropy": 1.8747637867927551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172174483537674, + "step": 18472 + }, + { + "epoch": 0.36948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.15751113891601562, + "learning_rate": 0.0001, + "loss": 3.9809, + "loss/crossentropy": 2.1883610486984253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927512109279633, + "step": 18474 + }, + { + "epoch": 0.36952, + "grad_norm": 1.9921875, + "grad_norm_var": 0.1606353759765625, + "learning_rate": 0.0001, + "loss": 3.9303, + "loss/crossentropy": 1.972772240638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17948149144649506, + "step": 18476 + }, + { + "epoch": 0.36956, + "grad_norm": 2.0625, + "grad_norm_var": 0.14570490519205728, + "learning_rate": 0.0001, + "loss": 4.1246, + "loss/crossentropy": 1.8912597298622131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18716244399547577, + "step": 18478 + }, + { + "epoch": 0.3696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.1458740234375, + "learning_rate": 0.0001, + "loss": 4.0816, + "loss/crossentropy": 2.044828712940216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076854333281517, + "step": 18480 + }, + { + "epoch": 0.36964, + "grad_norm": 2.03125, + "grad_norm_var": 0.14629618326822916, + "learning_rate": 0.0001, + "loss": 4.2007, + "loss/crossentropy": 2.0936968326568604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22010911256074905, + "step": 18482 + }, + { + "epoch": 0.36968, + "grad_norm": 1.96875, + "grad_norm_var": 0.0027414957682291665, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 2.2678394317626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073991820216179, + "step": 18484 + }, + { + "epoch": 0.36972, + "grad_norm": 2.078125, + "grad_norm_var": 0.0031064351399739585, + "learning_rate": 0.0001, + "loss": 3.9361, + "loss/crossentropy": 2.281362295150757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015322744846344, + "step": 18486 + }, + { + "epoch": 0.36976, + "grad_norm": 2.015625, + "grad_norm_var": 0.0034739176432291665, + "learning_rate": 0.0001, + "loss": 4.2318, + "loss/crossentropy": 2.0029674768447876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155751883983612, + "step": 18488 + }, + { + "epoch": 0.3698, + "grad_norm": 1.984375, + "grad_norm_var": 0.003525543212890625, + "learning_rate": 0.0001, + "loss": 4.3761, + "loss/crossentropy": 2.370519280433655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2308923304080963, + "step": 18490 + }, + { + "epoch": 0.36984, + "grad_norm": 2.09375, + "grad_norm_var": 0.004109446207682292, + "learning_rate": 0.0001, + "loss": 4.3137, + "loss/crossentropy": 1.9416582584381104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18809420615434647, + "step": 18492 + }, + { + "epoch": 0.36988, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003885650634765625, + "learning_rate": 0.0001, + "loss": 4.1381, + "loss/crossentropy": 2.065304160118103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19029072672128677, + "step": 18494 + }, + { + "epoch": 0.36992, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0036516825358072916, + "learning_rate": 0.0001, + "loss": 4.1366, + "loss/crossentropy": 1.9798340201377869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1716388538479805, + "step": 18496 + }, + { + "epoch": 0.36996, + "grad_norm": 1.984375, + "grad_norm_var": 0.0035540262858072915, + "learning_rate": 0.0001, + "loss": 4.2321, + "loss/crossentropy": 2.336732864379883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805839657783508, + "step": 18498 + }, + { + "epoch": 0.37, + "grad_norm": 1.984375, + "grad_norm_var": 0.0033444722493489584, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 1.9556902050971985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19576922804117203, + "step": 18500 + }, + { + "epoch": 0.37004, + "grad_norm": 1.796875, + "grad_norm_var": 0.006581370035807292, + "learning_rate": 0.0001, + "loss": 3.7281, + "loss/crossentropy": 2.0062427520751953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548972696065903, + "step": 18502 + }, + { + "epoch": 0.37008, + "grad_norm": 1.9375, + "grad_norm_var": 0.0061419169108072914, + "learning_rate": 0.0001, + "loss": 4.0954, + "loss/crossentropy": 2.1030293703079224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22147215902805328, + "step": 18504 + }, + { + "epoch": 0.37012, + "grad_norm": 2.046875, + "grad_norm_var": 0.005163319905598958, + "learning_rate": 0.0001, + "loss": 3.9545, + "loss/crossentropy": 1.7618860006332397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17934025079011917, + "step": 18506 + }, + { + "epoch": 0.37016, + "grad_norm": 2.109375, + "grad_norm_var": 0.005454254150390625, + "learning_rate": 0.0001, + "loss": 4.239, + "loss/crossentropy": 2.4424854516983032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23131447285413742, + "step": 18508 + }, + { + "epoch": 0.3702, + "grad_norm": 1.96875, + "grad_norm_var": 0.0076904296875, + "learning_rate": 0.0001, + "loss": 4.0365, + "loss/crossentropy": 2.125267446041107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19127625226974487, + "step": 18510 + }, + { + "epoch": 0.37024, + "grad_norm": 1.828125, + "grad_norm_var": 0.008506011962890626, + "learning_rate": 0.0001, + "loss": 3.9498, + "loss/crossentropy": 2.2222214937210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204838365316391, + "step": 18512 + }, + { + "epoch": 0.37028, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.1012, + "loss/crossentropy": 2.1930031776428223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21769095957279205, + "step": 18514 + }, + { + "epoch": 0.37032, + "grad_norm": 1.84375, + "grad_norm_var": 0.010814412434895834, + "learning_rate": 0.0001, + "loss": 4.0068, + "loss/crossentropy": 2.078152060508728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528701901435852, + "step": 18516 + }, + { + "epoch": 0.37036, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009016927083333333, + "learning_rate": 0.0001, + "loss": 4.0135, + "loss/crossentropy": 1.9786349534988403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193317741155624, + "step": 18518 + }, + { + "epoch": 0.3704, + "grad_norm": 2.015625, + "grad_norm_var": 0.0119293212890625, + "learning_rate": 0.0001, + "loss": 4.1458, + "loss/crossentropy": 2.279360294342041, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20865458250045776, + "step": 18520 + }, + { + "epoch": 0.37044, + "grad_norm": 2.078125, + "grad_norm_var": 0.012548828125, + "learning_rate": 0.0001, + "loss": 4.195, + "loss/crossentropy": 2.0668978691101074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060996919870377, + "step": 18522 + }, + { + "epoch": 0.37048, + "grad_norm": 1.90625, + "grad_norm_var": 0.011116536458333333, + "learning_rate": 0.0001, + "loss": 4.2661, + "loss/crossentropy": 2.4433913230895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263498529791832, + "step": 18524 + }, + { + "epoch": 0.37052, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009049224853515624, + "learning_rate": 0.0001, + "loss": 3.7953, + "loss/crossentropy": 1.7785582542419434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18417692929506302, + "step": 18526 + }, + { + "epoch": 0.37056, + "grad_norm": 2.015625, + "grad_norm_var": 0.008499908447265624, + "learning_rate": 0.0001, + "loss": 4.2958, + "loss/crossentropy": 1.9753797054290771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898263543844223, + "step": 18528 + }, + { + "epoch": 0.3706, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007696278889973958, + "learning_rate": 0.0001, + "loss": 4.1945, + "loss/crossentropy": 2.302052319049835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21289903670549393, + "step": 18530 + }, + { + "epoch": 0.37064, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007523600260416667, + "learning_rate": 0.0001, + "loss": 4.1132, + "loss/crossentropy": 2.3275071382522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2265043556690216, + "step": 18532 + }, + { + "epoch": 0.37068, + "grad_norm": 2.046875, + "grad_norm_var": 0.00826416015625, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 2.3155715465545654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101473480463028, + "step": 18534 + }, + { + "epoch": 0.37072, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0075681050618489586, + "learning_rate": 0.0001, + "loss": 3.8546, + "loss/crossentropy": 2.0893908739089966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266596645116806, + "step": 18536 + }, + { + "epoch": 0.37076, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006546783447265625, + "learning_rate": 0.0001, + "loss": 4.297, + "loss/crossentropy": 2.075433909893036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20270193368196487, + "step": 18538 + }, + { + "epoch": 0.3708, + "grad_norm": 1.90625, + "grad_norm_var": 0.006605784098307292, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.2795485258102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21547292172908783, + "step": 18540 + }, + { + "epoch": 0.37084, + "grad_norm": 1.859375, + "grad_norm_var": 0.006956990559895833, + "learning_rate": 0.0001, + "loss": 4.0351, + "loss/crossentropy": 2.0038467049598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19927001744508743, + "step": 18542 + }, + { + "epoch": 0.37088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0056111653645833336, + "learning_rate": 0.0001, + "loss": 3.979, + "loss/crossentropy": 1.8877951502799988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19283732771873474, + "step": 18544 + }, + { + "epoch": 0.37092, + "grad_norm": 1.90625, + "grad_norm_var": 0.0065305074055989586, + "learning_rate": 0.0001, + "loss": 4.0169, + "loss/crossentropy": 1.8111292719841003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20587582886219025, + "step": 18546 + }, + { + "epoch": 0.37096, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004548136393229167, + "learning_rate": 0.0001, + "loss": 4.0766, + "loss/crossentropy": 2.12824147939682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996314972639084, + "step": 18548 + }, + { + "epoch": 0.371, + "grad_norm": 1.953125, + "grad_norm_var": 0.0035845438639322915, + "learning_rate": 0.0001, + "loss": 3.9549, + "loss/crossentropy": 2.22495698928833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922152698040009, + "step": 18550 + }, + { + "epoch": 0.37104, + "grad_norm": 1.9609375, + "grad_norm_var": 0.003110504150390625, + "learning_rate": 0.0001, + "loss": 4.2245, + "loss/crossentropy": 2.1434414386749268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070355862379074, + "step": 18552 + }, + { + "epoch": 0.37108, + "grad_norm": 1.859375, + "grad_norm_var": 0.003979237874348959, + "learning_rate": 0.0001, + "loss": 4.0359, + "loss/crossentropy": 2.248233437538147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21400754898786545, + "step": 18554 + }, + { + "epoch": 0.37112, + "grad_norm": 2.0, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.2072, + "loss/crossentropy": 2.246641755104065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22847777605056763, + "step": 18556 + }, + { + "epoch": 0.37116, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004654693603515625, + "learning_rate": 0.0001, + "loss": 4.0805, + "loss/crossentropy": 2.057798206806183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23668289184570312, + "step": 18558 + }, + { + "epoch": 0.3712, + "grad_norm": 2.0, + "grad_norm_var": 0.004587554931640625, + "learning_rate": 0.0001, + "loss": 3.8454, + "loss/crossentropy": 1.7570822834968567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820850819349289, + "step": 18560 + }, + { + "epoch": 0.37124, + "grad_norm": 2.390625, + "grad_norm_var": 0.015372467041015626, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.7747303247451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1704205796122551, + "step": 18562 + }, + { + "epoch": 0.37128, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015632120768229167, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.3060861825942993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20002726465463638, + "step": 18564 + }, + { + "epoch": 0.37132, + "grad_norm": 1.921875, + "grad_norm_var": 0.015242258707682291, + "learning_rate": 0.0001, + "loss": 4.258, + "loss/crossentropy": 2.2742475271224976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22524481266736984, + "step": 18566 + }, + { + "epoch": 0.37136, + "grad_norm": 1.859375, + "grad_norm_var": 0.01647923787434896, + "learning_rate": 0.0001, + "loss": 3.9816, + "loss/crossentropy": 1.7929689288139343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1819063350558281, + "step": 18568 + }, + { + "epoch": 0.3714, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015718587239583335, + "learning_rate": 0.0001, + "loss": 4.2487, + "loss/crossentropy": 2.4505950212478638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22857815772294998, + "step": 18570 + }, + { + "epoch": 0.37144, + "grad_norm": 1.828125, + "grad_norm_var": 0.017658487955729166, + "learning_rate": 0.0001, + "loss": 3.5856, + "loss/crossentropy": 1.640372097492218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16958069056272507, + "step": 18572 + }, + { + "epoch": 0.37148, + "grad_norm": 1.984375, + "grad_norm_var": 0.01916681925455729, + "learning_rate": 0.0001, + "loss": 4.1601, + "loss/crossentropy": 2.1556472778320312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749886333942413, + "step": 18574 + }, + { + "epoch": 0.37152, + "grad_norm": 2.015625, + "grad_norm_var": 0.01846491495768229, + "learning_rate": 0.0001, + "loss": 4.1579, + "loss/crossentropy": 2.1131449937820435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20596522092819214, + "step": 18576 + }, + { + "epoch": 0.37156, + "grad_norm": 1.84375, + "grad_norm_var": 0.008998362223307292, + "learning_rate": 0.0001, + "loss": 3.8036, + "loss/crossentropy": 1.8966050148010254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289513677358627, + "step": 18578 + }, + { + "epoch": 0.3716, + "grad_norm": 2.109375, + "grad_norm_var": 0.010306803385416667, + "learning_rate": 0.0001, + "loss": 4.1883, + "loss/crossentropy": 2.1016936898231506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917622685432434, + "step": 18580 + }, + { + "epoch": 0.37164, + "grad_norm": 2.265625, + "grad_norm_var": 0.01512451171875, + "learning_rate": 0.0001, + "loss": 4.2596, + "loss/crossentropy": 2.289618492126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033960521221161, + "step": 18582 + }, + { + "epoch": 0.37168, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020287068684895833, + "learning_rate": 0.0001, + "loss": 4.0441, + "loss/crossentropy": 2.205715775489807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20015332102775574, + "step": 18584 + }, + { + "epoch": 0.37172, + "grad_norm": 2.171875, + "grad_norm_var": 0.02072118123372396, + "learning_rate": 0.0001, + "loss": 4.0834, + "loss/crossentropy": 2.3037471771240234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2424507886171341, + "step": 18586 + }, + { + "epoch": 0.37176, + "grad_norm": 1.890625, + "grad_norm_var": 0.016686757405598957, + "learning_rate": 0.0001, + "loss": 4.0718, + "loss/crossentropy": 2.1757054328918457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20777830481529236, + "step": 18588 + }, + { + "epoch": 0.3718, + "grad_norm": 2.03125, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.2905, + "loss/crossentropy": 2.3257133960723877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22848188877105713, + "step": 18590 + }, + { + "epoch": 0.37184, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017179107666015624, + "learning_rate": 0.0001, + "loss": 3.9132, + "loss/crossentropy": 2.3375465869903564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21214719116687775, + "step": 18592 + }, + { + "epoch": 0.37188, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0162750244140625, + "learning_rate": 0.0001, + "loss": 4.507, + "loss/crossentropy": 2.1896092891693115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303280457854271, + "step": 18594 + }, + { + "epoch": 0.37192, + "grad_norm": 2.0, + "grad_norm_var": 0.016287994384765626, + "learning_rate": 0.0001, + "loss": 3.9078, + "loss/crossentropy": 1.8913645148277283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17311514914035797, + "step": 18596 + }, + { + "epoch": 0.37196, + "grad_norm": 2.03125, + "grad_norm_var": 0.012442779541015626, + "learning_rate": 0.0001, + "loss": 4.2212, + "loss/crossentropy": 2.5278197526931763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23568174242973328, + "step": 18598 + }, + { + "epoch": 0.372, + "grad_norm": 1.9375, + "grad_norm_var": 0.0101959228515625, + "learning_rate": 0.0001, + "loss": 4.1568, + "loss/crossentropy": 2.073936700820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21885155141353607, + "step": 18600 + }, + { + "epoch": 0.37204, + "grad_norm": 2.265625, + "grad_norm_var": 0.012684885660807292, + "learning_rate": 0.0001, + "loss": 4.388, + "loss/crossentropy": 2.162986159324646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012084275484085, + "step": 18602 + }, + { + "epoch": 0.37208, + "grad_norm": 1.921875, + "grad_norm_var": 0.013108062744140624, + "learning_rate": 0.0001, + "loss": 3.9659, + "loss/crossentropy": 1.9332863092422485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18284077942371368, + "step": 18604 + }, + { + "epoch": 0.37212, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013809967041015624, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 2.083172380924225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186289481818676, + "step": 18606 + }, + { + "epoch": 0.37216, + "grad_norm": 1.96875, + "grad_norm_var": 0.013588205973307291, + "learning_rate": 0.0001, + "loss": 3.9932, + "loss/crossentropy": 2.100313901901245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977698802947998, + "step": 18608 + }, + { + "epoch": 0.3722, + "grad_norm": 1.890625, + "grad_norm_var": 0.016792805989583333, + "learning_rate": 0.0001, + "loss": 4.1086, + "loss/crossentropy": 2.2336236238479614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21973469108343124, + "step": 18610 + }, + { + "epoch": 0.37224, + "grad_norm": 1.984375, + "grad_norm_var": 0.017048136393229166, + "learning_rate": 0.0001, + "loss": 3.9422, + "loss/crossentropy": 2.0263352394104004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19232206791639328, + "step": 18612 + }, + { + "epoch": 0.37228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01727472941080729, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.0719032287597656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22896382957696915, + "step": 18614 + }, + { + "epoch": 0.37232, + "grad_norm": 1.84375, + "grad_norm_var": 0.015400950113932292, + "learning_rate": 0.0001, + "loss": 3.881, + "loss/crossentropy": 1.9514707326889038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899290755391121, + "step": 18616 + }, + { + "epoch": 0.37236, + "grad_norm": 1.921875, + "grad_norm_var": 0.010172526041666666, + "learning_rate": 0.0001, + "loss": 4.0163, + "loss/crossentropy": 2.0810786485671997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983262002468109, + "step": 18618 + }, + { + "epoch": 0.3724, + "grad_norm": 1.921875, + "grad_norm_var": 0.010302734375, + "learning_rate": 0.0001, + "loss": 4.1311, + "loss/crossentropy": 1.823366403579712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820683658123016, + "step": 18620 + }, + { + "epoch": 0.37244, + "grad_norm": 2.4375, + "grad_norm_var": 0.023933919270833333, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 2.2814120054244995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19500700384378433, + "step": 18622 + }, + { + "epoch": 0.37248, + "grad_norm": 1.9765625, + "grad_norm_var": 0.023339589436848957, + "learning_rate": 0.0001, + "loss": 3.9548, + "loss/crossentropy": 1.5761349201202393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17777466028928757, + "step": 18624 + }, + { + "epoch": 0.37252, + "grad_norm": 1.9375, + "grad_norm_var": 0.018853505452473957, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 1.8996286988258362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21907050907611847, + "step": 18626 + }, + { + "epoch": 0.37256, + "grad_norm": 1.9296875, + "grad_norm_var": 0.020869700113932292, + "learning_rate": 0.0001, + "loss": 4.2361, + "loss/crossentropy": 1.6650620698928833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18808475136756897, + "step": 18628 + }, + { + "epoch": 0.3726, + "grad_norm": 2.125, + "grad_norm_var": 0.02156550089518229, + "learning_rate": 0.0001, + "loss": 4.4004, + "loss/crossentropy": 2.2887942790985107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20416373759508133, + "step": 18630 + }, + { + "epoch": 0.37264, + "grad_norm": 2.03125, + "grad_norm_var": 0.018629709879557293, + "learning_rate": 0.0001, + "loss": 3.9606, + "loss/crossentropy": 1.9479430317878723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18489708751440048, + "step": 18632 + }, + { + "epoch": 0.37268, + "grad_norm": 2.0, + "grad_norm_var": 0.016676584879557293, + "learning_rate": 0.0001, + "loss": 4.3041, + "loss/crossentropy": 2.2156901359558105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20997263491153717, + "step": 18634 + }, + { + "epoch": 0.37272, + "grad_norm": 2.0, + "grad_norm_var": 0.015950520833333332, + "learning_rate": 0.0001, + "loss": 4.1619, + "loss/crossentropy": 2.50583279132843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23347270488739014, + "step": 18636 + }, + { + "epoch": 0.37276, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008650716145833333, + "learning_rate": 0.0001, + "loss": 3.8241, + "loss/crossentropy": 1.7688243985176086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18601343035697937, + "step": 18638 + }, + { + "epoch": 0.3728, + "grad_norm": 2.109375, + "grad_norm_var": 0.009439849853515625, + "learning_rate": 0.0001, + "loss": 4.2241, + "loss/crossentropy": 2.3786444664001465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2287142351269722, + "step": 18640 + }, + { + "epoch": 0.37284, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009479777018229166, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.2686339616775513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086719632148743, + "step": 18642 + }, + { + "epoch": 0.37288, + "grad_norm": 2.0, + "grad_norm_var": 0.007490793863932292, + "learning_rate": 0.0001, + "loss": 3.8341, + "loss/crossentropy": 1.9386130571365356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289222359657288, + "step": 18644 + }, + { + "epoch": 0.37292, + "grad_norm": 1.953125, + "grad_norm_var": 0.006819407145182292, + "learning_rate": 0.0001, + "loss": 4.3357, + "loss/crossentropy": 2.3655601739883423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207036018371582, + "step": 18646 + }, + { + "epoch": 0.37296, + "grad_norm": 1.9375, + "grad_norm_var": 0.007100168863932292, + "learning_rate": 0.0001, + "loss": 3.9039, + "loss/crossentropy": 1.9600831270217896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018172681331635, + "step": 18648 + }, + { + "epoch": 0.373, + "grad_norm": 2.515625, + "grad_norm_var": 0.024873860677083335, + "learning_rate": 0.0001, + "loss": 3.9492, + "loss/crossentropy": 1.86528480052948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19763591885566711, + "step": 18650 + }, + { + "epoch": 0.37304, + "grad_norm": 1.984375, + "grad_norm_var": 0.024925740559895833, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 2.0938327312469482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051714062690735, + "step": 18652 + }, + { + "epoch": 0.37308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.022454579671223957, + "learning_rate": 0.0001, + "loss": 3.878, + "loss/crossentropy": 1.964399516582489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198997855186462, + "step": 18654 + }, + { + "epoch": 0.37312, + "grad_norm": 2.09375, + "grad_norm_var": 0.0245849609375, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.0186336040496826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19310183823108673, + "step": 18656 + }, + { + "epoch": 0.37316, + "grad_norm": 1.921875, + "grad_norm_var": 0.025052642822265624, + "learning_rate": 0.0001, + "loss": 3.8569, + "loss/crossentropy": 1.8805240392684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20392487198114395, + "step": 18658 + }, + { + "epoch": 0.3732, + "grad_norm": 1.90625, + "grad_norm_var": 0.025480143229166665, + "learning_rate": 0.0001, + "loss": 4.0306, + "loss/crossentropy": 2.45102322101593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161283940076828, + "step": 18660 + }, + { + "epoch": 0.37324, + "grad_norm": 1.921875, + "grad_norm_var": 0.025679270426432293, + "learning_rate": 0.0001, + "loss": 4.1015, + "loss/crossentropy": 2.2971357107162476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23155176639556885, + "step": 18662 + }, + { + "epoch": 0.37328, + "grad_norm": 1.953125, + "grad_norm_var": 0.025614166259765626, + "learning_rate": 0.0001, + "loss": 4.285, + "loss/crossentropy": 2.3736867904663086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22897879034280777, + "step": 18664 + }, + { + "epoch": 0.37332, + "grad_norm": 2.0625, + "grad_norm_var": 0.005500284830729166, + "learning_rate": 0.0001, + "loss": 3.9884, + "loss/crossentropy": 1.5138108134269714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1526506468653679, + "step": 18666 + }, + { + "epoch": 0.37336, + "grad_norm": 1.859375, + "grad_norm_var": 0.0061948140462239586, + "learning_rate": 0.0001, + "loss": 3.9507, + "loss/crossentropy": 2.415435791015625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21689960360527039, + "step": 18668 + }, + { + "epoch": 0.3734, + "grad_norm": 1.90625, + "grad_norm_var": 0.006306966145833333, + "learning_rate": 0.0001, + "loss": 4.014, + "loss/crossentropy": 2.1235941648483276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921900868415833, + "step": 18670 + }, + { + "epoch": 0.37344, + "grad_norm": 2.015625, + "grad_norm_var": 0.004428863525390625, + "learning_rate": 0.0001, + "loss": 4.2549, + "loss/crossentropy": 2.3612579703330994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21255160123109818, + "step": 18672 + }, + { + "epoch": 0.37348, + "grad_norm": 2.09375, + "grad_norm_var": 0.054351552327473955, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.23550283908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890902400016785, + "step": 18674 + }, + { + "epoch": 0.37352, + "grad_norm": 1.9140625, + "grad_norm_var": 0.05468317667643229, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.7998243570327759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19070486724376678, + "step": 18676 + }, + { + "epoch": 0.37356, + "grad_norm": 1.984375, + "grad_norm_var": 0.055214182535807295, + "learning_rate": 0.0001, + "loss": 4.151, + "loss/crossentropy": 1.892760992050171, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19817107915878296, + "step": 18678 + }, + { + "epoch": 0.3736, + "grad_norm": 1.9140625, + "grad_norm_var": 0.055757395426432294, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 1.9619091153144836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21299303323030472, + "step": 18680 + }, + { + "epoch": 0.37364, + "grad_norm": 1.90625, + "grad_norm_var": 0.057889556884765624, + "learning_rate": 0.0001, + "loss": 3.8419, + "loss/crossentropy": 2.0857014656066895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20634424686431885, + "step": 18682 + }, + { + "epoch": 0.37368, + "grad_norm": 1.796875, + "grad_norm_var": 0.05814793904622396, + "learning_rate": 0.0001, + "loss": 3.9589, + "loss/crossentropy": 2.1093358397483826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933949589729309, + "step": 18684 + }, + { + "epoch": 0.37372, + "grad_norm": 1.9375, + "grad_norm_var": 0.05987548828125, + "learning_rate": 0.0001, + "loss": 3.9537, + "loss/crossentropy": 2.1040873527526855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061748430132866, + "step": 18686 + }, + { + "epoch": 0.37376, + "grad_norm": 1.9609375, + "grad_norm_var": 0.059081776936848955, + "learning_rate": 0.0001, + "loss": 4.1676, + "loss/crossentropy": 2.4795751571655273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075038179755211, + "step": 18688 + }, + { + "epoch": 0.3738, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006599934895833334, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 1.864591360092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17119919508695602, + "step": 18690 + }, + { + "epoch": 0.37384, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007063547770182292, + "learning_rate": 0.0001, + "loss": 3.7565, + "loss/crossentropy": 1.6705753207206726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19292957335710526, + "step": 18692 + }, + { + "epoch": 0.37388, + "grad_norm": 2.03125, + "grad_norm_var": 0.004209136962890625, + "learning_rate": 0.0001, + "loss": 4.0448, + "loss/crossentropy": 2.16109561920166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515639334917068, + "step": 18694 + }, + { + "epoch": 0.37392, + "grad_norm": 2.28125, + "grad_norm_var": 0.014212799072265626, + "learning_rate": 0.0001, + "loss": 4.1683, + "loss/crossentropy": 1.771731436252594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20304742455482483, + "step": 18696 + }, + { + "epoch": 0.37396, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014330037434895833, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 2.046182096004486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976921409368515, + "step": 18698 + }, + { + "epoch": 0.374, + "grad_norm": 1.78125, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 4.0257, + "loss/crossentropy": 2.0343292355537415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19037891179323196, + "step": 18700 + }, + { + "epoch": 0.37404, + "grad_norm": 1.859375, + "grad_norm_var": 0.014782460530598958, + "learning_rate": 0.0001, + "loss": 4.1727, + "loss/crossentropy": 2.034367859363556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182596892118454, + "step": 18702 + }, + { + "epoch": 0.37408, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015616861979166667, + "learning_rate": 0.0001, + "loss": 3.966, + "loss/crossentropy": 2.048890709877014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19922567903995514, + "step": 18704 + }, + { + "epoch": 0.37412, + "grad_norm": 2.03125, + "grad_norm_var": 0.0163970947265625, + "learning_rate": 0.0001, + "loss": 4.0864, + "loss/crossentropy": 1.8759222626686096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19452380388975143, + "step": 18706 + }, + { + "epoch": 0.37416, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015681966145833334, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 1.693844199180603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16944261640310287, + "step": 18708 + }, + { + "epoch": 0.3742, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01727879842122396, + "learning_rate": 0.0001, + "loss": 4.293, + "loss/crossentropy": 2.266264319419861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103852555155754, + "step": 18710 + }, + { + "epoch": 0.37424, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008250935872395834, + "learning_rate": 0.0001, + "loss": 3.8547, + "loss/crossentropy": 2.029367506504059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18859465420246124, + "step": 18712 + }, + { + "epoch": 0.37428, + "grad_norm": 2.640625, + "grad_norm_var": 0.039184315999348955, + "learning_rate": 0.0001, + "loss": 4.1141, + "loss/crossentropy": 1.9427701234817505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960265040397644, + "step": 18714 + }, + { + "epoch": 0.37432, + "grad_norm": 2.078125, + "grad_norm_var": 0.03737360636393229, + "learning_rate": 0.0001, + "loss": 4.3087, + "loss/crossentropy": 2.2656137943267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22287357598543167, + "step": 18716 + }, + { + "epoch": 0.37436, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0365386962890625, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.0447250604629517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505183935165405, + "step": 18718 + }, + { + "epoch": 0.3744, + "grad_norm": 2.03125, + "grad_norm_var": 0.04125137329101562, + "learning_rate": 0.0001, + "loss": 3.6507, + "loss/crossentropy": 1.8176262378692627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877614498138428, + "step": 18720 + }, + { + "epoch": 0.37444, + "grad_norm": 1.890625, + "grad_norm_var": 0.04017333984375, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 1.8362378478050232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18403278291225433, + "step": 18722 + }, + { + "epoch": 0.37448, + "grad_norm": 2.140625, + "grad_norm_var": 0.04104410807291667, + "learning_rate": 0.0001, + "loss": 4.4322, + "loss/crossentropy": 2.43450927734375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21623602509498596, + "step": 18724 + }, + { + "epoch": 0.37452, + "grad_norm": 2.09375, + "grad_norm_var": 0.04133275349934896, + "learning_rate": 0.0001, + "loss": 4.3905, + "loss/crossentropy": 2.227471947669983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000503957271576, + "step": 18726 + }, + { + "epoch": 0.37456, + "grad_norm": 1.8046875, + "grad_norm_var": 0.04317626953125, + "learning_rate": 0.0001, + "loss": 3.9138, + "loss/crossentropy": 1.9392182230949402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19422952830791473, + "step": 18728 + }, + { + "epoch": 0.3746, + "grad_norm": 1.96875, + "grad_norm_var": 0.014469401041666666, + "learning_rate": 0.0001, + "loss": 4.1284, + "loss/crossentropy": 2.0168241262435913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774912029504776, + "step": 18730 + }, + { + "epoch": 0.37464, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012859853108723958, + "learning_rate": 0.0001, + "loss": 3.8982, + "loss/crossentropy": 1.6861125230789185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16301175951957703, + "step": 18732 + }, + { + "epoch": 0.37468, + "grad_norm": 2.09375, + "grad_norm_var": 0.014111328125, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 2.2991716861724854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21094633638858795, + "step": 18734 + }, + { + "epoch": 0.37472, + "grad_norm": 1.96875, + "grad_norm_var": 0.008115386962890625, + "learning_rate": 0.0001, + "loss": 4.0184, + "loss/crossentropy": 1.8746486902236938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19248205423355103, + "step": 18736 + }, + { + "epoch": 0.37476, + "grad_norm": 2.15625, + "grad_norm_var": 0.009642537434895833, + "learning_rate": 0.0001, + "loss": 4.1886, + "loss/crossentropy": 2.137068212032318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996043175458908, + "step": 18738 + }, + { + "epoch": 0.3748, + "grad_norm": 1.9296875, + "grad_norm_var": 0.015636952718098958, + "learning_rate": 0.0001, + "loss": 4.1503, + "loss/crossentropy": 1.724601149559021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23018527776002884, + "step": 18740 + }, + { + "epoch": 0.37484, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015584309895833334, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 2.1407381296157837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21157334744930267, + "step": 18742 + }, + { + "epoch": 0.37488, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013090006510416667, + "learning_rate": 0.0001, + "loss": 4.0371, + "loss/crossentropy": 2.2977999448776245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19883693754673004, + "step": 18744 + }, + { + "epoch": 0.37492, + "grad_norm": 2.078125, + "grad_norm_var": 0.018155924479166665, + "learning_rate": 0.0001, + "loss": 3.8912, + "loss/crossentropy": 2.096716046333313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21084215492010117, + "step": 18746 + }, + { + "epoch": 0.37496, + "grad_norm": 1.9453125, + "grad_norm_var": 0.019254302978515624, + "learning_rate": 0.0001, + "loss": 3.8508, + "loss/crossentropy": 1.7292688488960266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18280881643295288, + "step": 18748 + }, + { + "epoch": 0.375, + "grad_norm": 1.984375, + "grad_norm_var": 0.0191314697265625, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 1.7562988996505737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889612227678299, + "step": 18750 + }, + { + "epoch": 0.37504, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02008641560872396, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 2.1730951070785522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20177915692329407, + "step": 18752 + }, + { + "epoch": 0.37508, + "grad_norm": 2.0625, + "grad_norm_var": 0.018161773681640625, + "learning_rate": 0.0001, + "loss": 4.0316, + "loss/crossentropy": 1.7671055793762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17018838971853256, + "step": 18754 + }, + { + "epoch": 0.37512, + "grad_norm": 2.015625, + "grad_norm_var": 0.010228474934895834, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 2.2650365829467773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22090423107147217, + "step": 18756 + }, + { + "epoch": 0.37516, + "grad_norm": 1.921875, + "grad_norm_var": 0.01002197265625, + "learning_rate": 0.0001, + "loss": 4.1985, + "loss/crossentropy": 2.252619981765747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20853855460882187, + "step": 18758 + }, + { + "epoch": 0.3752, + "grad_norm": 2.109375, + "grad_norm_var": 0.011766560872395833, + "learning_rate": 0.0001, + "loss": 3.8947, + "loss/crossentropy": 2.022092342376709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19293325394392014, + "step": 18760 + }, + { + "epoch": 0.37524, + "grad_norm": 2.078125, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 4.0289, + "loss/crossentropy": 1.7500890493392944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18832595646381378, + "step": 18762 + }, + { + "epoch": 0.37528, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009220123291015625, + "learning_rate": 0.0001, + "loss": 3.7769, + "loss/crossentropy": 1.8201736211776733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910964399576187, + "step": 18764 + }, + { + "epoch": 0.37532, + "grad_norm": 1.9375, + "grad_norm_var": 0.009989420572916666, + "learning_rate": 0.0001, + "loss": 4.0508, + "loss/crossentropy": 2.1644541025161743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20023848116397858, + "step": 18766 + }, + { + "epoch": 0.37536, + "grad_norm": 2.078125, + "grad_norm_var": 0.010436757405598959, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 2.275505781173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23095671832561493, + "step": 18768 + }, + { + "epoch": 0.3754, + "grad_norm": 2.078125, + "grad_norm_var": 0.010773722330729167, + "learning_rate": 0.0001, + "loss": 4.0557, + "loss/crossentropy": 1.976850986480713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125934213399887, + "step": 18770 + }, + { + "epoch": 0.37544, + "grad_norm": 2.140625, + "grad_norm_var": 0.012035115559895834, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.150836706161499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19192685186862946, + "step": 18772 + }, + { + "epoch": 0.37548, + "grad_norm": 2.046875, + "grad_norm_var": 0.012400054931640625, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 2.0790088176727295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147749587893486, + "step": 18774 + }, + { + "epoch": 0.37552, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010697428385416667, + "learning_rate": 0.0001, + "loss": 3.7918, + "loss/crossentropy": 1.8366054892539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19662949442863464, + "step": 18776 + }, + { + "epoch": 0.37556, + "grad_norm": 2.015625, + "grad_norm_var": 0.008278147379557291, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 1.6928801536560059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18292289972305298, + "step": 18778 + }, + { + "epoch": 0.3756, + "grad_norm": 1.96875, + "grad_norm_var": 0.007591756184895834, + "learning_rate": 0.0001, + "loss": 4.2872, + "loss/crossentropy": 2.015128195285797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20940368622541428, + "step": 18780 + }, + { + "epoch": 0.37564, + "grad_norm": 2.125, + "grad_norm_var": 0.0061757405598958336, + "learning_rate": 0.0001, + "loss": 4.3304, + "loss/crossentropy": 2.026209592819214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2227947860956192, + "step": 18782 + }, + { + "epoch": 0.37568, + "grad_norm": 2.0, + "grad_norm_var": 0.004349772135416667, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.29964280128479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23132510483264923, + "step": 18784 + }, + { + "epoch": 0.37572, + "grad_norm": 1.859375, + "grad_norm_var": 0.005366770426432291, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 1.881381332874298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17967566847801208, + "step": 18786 + }, + { + "epoch": 0.37576, + "grad_norm": 2.046875, + "grad_norm_var": 0.004198201497395833, + "learning_rate": 0.0001, + "loss": 4.1834, + "loss/crossentropy": 2.2231001257896423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21163878589868546, + "step": 18788 + }, + { + "epoch": 0.3758, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004526519775390625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.3531078100204468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21745989471673965, + "step": 18790 + }, + { + "epoch": 0.37584, + "grad_norm": 2.046875, + "grad_norm_var": 0.004801432291666667, + "learning_rate": 0.0001, + "loss": 4.2549, + "loss/crossentropy": 2.1178460121154785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23571017384529114, + "step": 18792 + }, + { + "epoch": 0.37588, + "grad_norm": 1.84375, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 3.8373, + "loss/crossentropy": 2.0659135580062866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20817245543003082, + "step": 18794 + }, + { + "epoch": 0.37592, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006318918863932292, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 1.579395353794098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16713083535432816, + "step": 18796 + }, + { + "epoch": 0.37596, + "grad_norm": 1.953125, + "grad_norm_var": 0.005940500895182292, + "learning_rate": 0.0001, + "loss": 4.1272, + "loss/crossentropy": 2.102541923522949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174636349081993, + "step": 18798 + }, + { + "epoch": 0.376, + "grad_norm": 2.09375, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 1.9342190027236938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19780533015727997, + "step": 18800 + }, + { + "epoch": 0.37604, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006121571858723958, + "learning_rate": 0.0001, + "loss": 4.0659, + "loss/crossentropy": 2.140671730041504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21276966482400894, + "step": 18802 + }, + { + "epoch": 0.37608, + "grad_norm": 2.21875, + "grad_norm_var": 0.009590657552083333, + "learning_rate": 0.0001, + "loss": 4.1929, + "loss/crossentropy": 1.9004405736923218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17683426290750504, + "step": 18804 + }, + { + "epoch": 0.37612, + "grad_norm": 1.859375, + "grad_norm_var": 0.010155232747395833, + "learning_rate": 0.0001, + "loss": 3.7163, + "loss/crossentropy": 1.9487649202346802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19463464617729187, + "step": 18806 + }, + { + "epoch": 0.37616, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011262766520182292, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 2.15252423286438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20531394332647324, + "step": 18808 + }, + { + "epoch": 0.3762, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011592356363932292, + "learning_rate": 0.0001, + "loss": 4.0494, + "loss/crossentropy": 2.114433467388153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183662325143814, + "step": 18810 + }, + { + "epoch": 0.37624, + "grad_norm": 2.0, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 3.9035, + "loss/crossentropy": 1.9229055047035217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18662934005260468, + "step": 18812 + }, + { + "epoch": 0.37628, + "grad_norm": 2.046875, + "grad_norm_var": 1.8609944661458333, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 1.8242689371109009, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858239263296127, + "step": 18814 + }, + { + "epoch": 0.37632, + "grad_norm": 1.9765625, + "grad_norm_var": 1.864818318684896, + "learning_rate": 0.0001, + "loss": 3.8917, + "loss/crossentropy": 2.3987890481948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2246004194021225, + "step": 18816 + }, + { + "epoch": 0.37636, + "grad_norm": 2.234375, + "grad_norm_var": 1.8565500895182292, + "learning_rate": 0.0001, + "loss": 4.3493, + "loss/crossentropy": 2.4376614093780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24541915208101273, + "step": 18818 + }, + { + "epoch": 0.3764, + "grad_norm": 1.984375, + "grad_norm_var": 1.865612538655599, + "learning_rate": 0.0001, + "loss": 4.1344, + "loss/crossentropy": 2.2477601766586304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178046852350235, + "step": 18820 + }, + { + "epoch": 0.37644, + "grad_norm": 1.96875, + "grad_norm_var": 1.8636464436848958, + "learning_rate": 0.0001, + "loss": 3.9685, + "loss/crossentropy": 1.9782747626304626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19459142535924911, + "step": 18822 + }, + { + "epoch": 0.37648, + "grad_norm": 2.0625, + "grad_norm_var": 1.8451738993326823, + "learning_rate": 0.0001, + "loss": 4.043, + "loss/crossentropy": 2.1214100122451782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082664743065834, + "step": 18824 + }, + { + "epoch": 0.37652, + "grad_norm": 1.90625, + "grad_norm_var": 1.8489461263020834, + "learning_rate": 0.0001, + "loss": 3.8256, + "loss/crossentropy": 1.68446546792984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.167055182158947, + "step": 18826 + }, + { + "epoch": 0.37656, + "grad_norm": 1.9921875, + "grad_norm_var": 1.8380022684733073, + "learning_rate": 0.0001, + "loss": 4.0588, + "loss/crossentropy": 1.7720499634742737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18632127344608307, + "step": 18828 + }, + { + "epoch": 0.3766, + "grad_norm": 2.140625, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.2288, + "loss/crossentropy": 2.283332347869873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23949767649173737, + "step": 18830 + }, + { + "epoch": 0.37664, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014644114176432292, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 1.9924857020378113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181651413440704, + "step": 18832 + }, + { + "epoch": 0.37668, + "grad_norm": 2.046875, + "grad_norm_var": 0.011091105143229167, + "learning_rate": 0.0001, + "loss": 3.8424, + "loss/crossentropy": 2.4015761613845825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279737964272499, + "step": 18834 + }, + { + "epoch": 0.37672, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014152018229166667, + "learning_rate": 0.0001, + "loss": 3.8636, + "loss/crossentropy": 2.0717111229896545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19089852273464203, + "step": 18836 + }, + { + "epoch": 0.37676, + "grad_norm": 2.09375, + "grad_norm_var": 0.0149566650390625, + "learning_rate": 0.0001, + "loss": 4.2694, + "loss/crossentropy": 1.9993728995323181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19489262998104095, + "step": 18838 + }, + { + "epoch": 0.3768, + "grad_norm": 1.8125, + "grad_norm_var": 0.015949503580729166, + "learning_rate": 0.0001, + "loss": 3.7009, + "loss/crossentropy": 1.915448248386383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923104077577591, + "step": 18840 + }, + { + "epoch": 0.37684, + "grad_norm": 1.890625, + "grad_norm_var": 0.016511027018229166, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 2.292284607887268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22159253805875778, + "step": 18842 + }, + { + "epoch": 0.37688, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012784830729166667, + "learning_rate": 0.0001, + "loss": 3.9904, + "loss/crossentropy": 1.9938938617706299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19937817752361298, + "step": 18844 + }, + { + "epoch": 0.37692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016454060872395832, + "learning_rate": 0.0001, + "loss": 4.1914, + "loss/crossentropy": 2.045258641242981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2652823179960251, + "step": 18846 + }, + { + "epoch": 0.37696, + "grad_norm": 1.9921875, + "grad_norm_var": 0.016707356770833334, + "learning_rate": 0.0001, + "loss": 4.0391, + "loss/crossentropy": 2.1330259442329407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19155749678611755, + "step": 18848 + }, + { + "epoch": 0.377, + "grad_norm": 1.921875, + "grad_norm_var": 0.017101796468098958, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 2.2135089635849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20062117278575897, + "step": 18850 + }, + { + "epoch": 0.37704, + "grad_norm": 2.046875, + "grad_norm_var": 0.013826243082682292, + "learning_rate": 0.0001, + "loss": 4.1188, + "loss/crossentropy": 2.073060691356659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19443543255329132, + "step": 18852 + }, + { + "epoch": 0.37708, + "grad_norm": 1.96875, + "grad_norm_var": 0.014170074462890625, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 1.930641233921051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19604724645614624, + "step": 18854 + }, + { + "epoch": 0.37712, + "grad_norm": 2.015625, + "grad_norm_var": 0.011252593994140626, + "learning_rate": 0.0001, + "loss": 4.0238, + "loss/crossentropy": 2.0813130140304565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21019181609153748, + "step": 18856 + }, + { + "epoch": 0.37716, + "grad_norm": 2.03125, + "grad_norm_var": 0.010080718994140625, + "learning_rate": 0.0001, + "loss": 3.9855, + "loss/crossentropy": 1.8528355956077576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19945576786994934, + "step": 18858 + }, + { + "epoch": 0.3772, + "grad_norm": 2.0, + "grad_norm_var": 0.011896769205729166, + "learning_rate": 0.0001, + "loss": 3.9655, + "loss/crossentropy": 2.32794725894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22688604146242142, + "step": 18860 + }, + { + "epoch": 0.37724, + "grad_norm": 2.046875, + "grad_norm_var": 0.007264963785807292, + "learning_rate": 0.0001, + "loss": 4.25, + "loss/crossentropy": 2.061101734638214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961553767323494, + "step": 18862 + }, + { + "epoch": 0.37728, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 3.7839, + "loss/crossentropy": 1.9127929210662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20082338899374008, + "step": 18864 + }, + { + "epoch": 0.37732, + "grad_norm": 2.046875, + "grad_norm_var": 0.14662272135416668, + "learning_rate": 0.0001, + "loss": 4.289, + "loss/crossentropy": 2.001932919025421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881283074617386, + "step": 18866 + }, + { + "epoch": 0.37736, + "grad_norm": 1.9609375, + "grad_norm_var": 0.14527180989583333, + "learning_rate": 0.0001, + "loss": 4.0851, + "loss/crossentropy": 1.693075716495514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698371946811676, + "step": 18868 + }, + { + "epoch": 0.3774, + "grad_norm": 1.984375, + "grad_norm_var": 0.1481402079264323, + "learning_rate": 0.0001, + "loss": 4.0484, + "loss/crossentropy": 1.879252314567566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18412816524505615, + "step": 18870 + }, + { + "epoch": 0.37744, + "grad_norm": 2.015625, + "grad_norm_var": 0.1550066630045573, + "learning_rate": 0.0001, + "loss": 4.3488, + "loss/crossentropy": 2.1819299459457397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243355631828308, + "step": 18872 + }, + { + "epoch": 0.37748, + "grad_norm": 2.21875, + "grad_norm_var": 0.1576568603515625, + "learning_rate": 0.0001, + "loss": 4.0889, + "loss/crossentropy": 2.2487794160842896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20594948530197144, + "step": 18874 + }, + { + "epoch": 0.37752, + "grad_norm": 1.8984375, + "grad_norm_var": 0.15466079711914063, + "learning_rate": 0.0001, + "loss": 4.303, + "loss/crossentropy": 1.990889549255371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230055809020996, + "step": 18876 + }, + { + "epoch": 0.37756, + "grad_norm": 1.9765625, + "grad_norm_var": 0.1582763671875, + "learning_rate": 0.0001, + "loss": 4.0435, + "loss/crossentropy": 2.091560959815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19504332542419434, + "step": 18878 + }, + { + "epoch": 0.3776, + "grad_norm": 2.109375, + "grad_norm_var": 0.1599273681640625, + "learning_rate": 0.0001, + "loss": 4.0776, + "loss/crossentropy": 1.9213955998420715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994621828198433, + "step": 18880 + }, + { + "epoch": 0.37764, + "grad_norm": 1.8671875, + "grad_norm_var": 0.02360814412434896, + "learning_rate": 0.0001, + "loss": 4.0103, + "loss/crossentropy": 2.1728278398513794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039214730262756, + "step": 18882 + }, + { + "epoch": 0.37768, + "grad_norm": 1.78125, + "grad_norm_var": 0.026341756184895832, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 1.889222264289856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17232363671064377, + "step": 18884 + }, + { + "epoch": 0.37772, + "grad_norm": 1.9921875, + "grad_norm_var": 0.026228841145833334, + "learning_rate": 0.0001, + "loss": 4.118, + "loss/crossentropy": 2.264981746673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213841512799263, + "step": 18886 + }, + { + "epoch": 0.37776, + "grad_norm": 1.78125, + "grad_norm_var": 0.013598378499348958, + "learning_rate": 0.0001, + "loss": 3.8843, + "loss/crossentropy": 1.8615361452102661, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1745932251214981, + "step": 18888 + }, + { + "epoch": 0.3778, + "grad_norm": 1.765625, + "grad_norm_var": 0.009266916910807292, + "learning_rate": 0.0001, + "loss": 3.5687, + "loss/crossentropy": 2.1459723711013794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21431762725114822, + "step": 18890 + }, + { + "epoch": 0.37784, + "grad_norm": 2.0625, + "grad_norm_var": 0.010479482014973958, + "learning_rate": 0.0001, + "loss": 4.2125, + "loss/crossentropy": 1.8859283328056335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20627497136592865, + "step": 18892 + }, + { + "epoch": 0.37788, + "grad_norm": 2.046875, + "grad_norm_var": 0.012211100260416666, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.824287474155426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17877592146396637, + "step": 18894 + }, + { + "epoch": 0.37792, + "grad_norm": 1.953125, + "grad_norm_var": 0.012414296468098959, + "learning_rate": 0.0001, + "loss": 3.9857, + "loss/crossentropy": 2.486180305480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033834606409073, + "step": 18896 + }, + { + "epoch": 0.37796, + "grad_norm": 2.046875, + "grad_norm_var": 0.012981923421223958, + "learning_rate": 0.0001, + "loss": 4.2339, + "loss/crossentropy": 2.0677965879440308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21111362427473068, + "step": 18898 + }, + { + "epoch": 0.378, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011107381184895833, + "learning_rate": 0.0001, + "loss": 4.2559, + "loss/crossentropy": 1.941792368888855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18762809038162231, + "step": 18900 + }, + { + "epoch": 0.37804, + "grad_norm": 2.078125, + "grad_norm_var": 0.012504069010416667, + "learning_rate": 0.0001, + "loss": 3.8028, + "loss/crossentropy": 1.907107174396515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050766795873642, + "step": 18902 + }, + { + "epoch": 0.37808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009242502848307292, + "learning_rate": 0.0001, + "loss": 3.9742, + "loss/crossentropy": 1.5920222997665405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17366845160722733, + "step": 18904 + }, + { + "epoch": 0.37812, + "grad_norm": 1.9375, + "grad_norm_var": 0.0066650390625, + "learning_rate": 0.0001, + "loss": 3.9581, + "loss/crossentropy": 2.054854154586792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20102889090776443, + "step": 18906 + }, + { + "epoch": 0.37816, + "grad_norm": 1.90625, + "grad_norm_var": 0.0067789713541666664, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 2.42835795879364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154095396399498, + "step": 18908 + }, + { + "epoch": 0.3782, + "grad_norm": 2.09375, + "grad_norm_var": 0.007169596354166667, + "learning_rate": 0.0001, + "loss": 4.1466, + "loss/crossentropy": 2.1060370206832886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20842333883047104, + "step": 18910 + }, + { + "epoch": 0.37824, + "grad_norm": 1.96875, + "grad_norm_var": 0.008194986979166667, + "learning_rate": 0.0001, + "loss": 4.1587, + "loss/crossentropy": 2.0559674501419067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2195875570178032, + "step": 18912 + }, + { + "epoch": 0.37828, + "grad_norm": 2.03125, + "grad_norm_var": 0.008888498942057291, + "learning_rate": 0.0001, + "loss": 4.2786, + "loss/crossentropy": 2.240954041481018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20908969640731812, + "step": 18914 + }, + { + "epoch": 0.37832, + "grad_norm": 1.96875, + "grad_norm_var": 0.008747355143229166, + "learning_rate": 0.0001, + "loss": 4.0729, + "loss/crossentropy": 2.0393139123916626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19933026283979416, + "step": 18916 + }, + { + "epoch": 0.37836, + "grad_norm": 2.109375, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.0969, + "loss/crossentropy": 2.0294516682624817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20086780190467834, + "step": 18918 + }, + { + "epoch": 0.3784, + "grad_norm": 1.96875, + "grad_norm_var": 0.007425944010416667, + "learning_rate": 0.0001, + "loss": 3.9138, + "loss/crossentropy": 1.8509765267372131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18834587186574936, + "step": 18920 + }, + { + "epoch": 0.37844, + "grad_norm": 1.875, + "grad_norm_var": 0.0068267822265625, + "learning_rate": 0.0001, + "loss": 3.8669, + "loss/crossentropy": 1.8817378878593445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17679119110107422, + "step": 18922 + }, + { + "epoch": 0.37848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005975087483723958, + "learning_rate": 0.0001, + "loss": 3.9276, + "loss/crossentropy": 1.766166627407074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16948368400335312, + "step": 18924 + }, + { + "epoch": 0.37852, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006786855061848959, + "learning_rate": 0.0001, + "loss": 3.8948, + "loss/crossentropy": 2.251446485519409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22272750735282898, + "step": 18926 + }, + { + "epoch": 0.37856, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0045125325520833336, + "learning_rate": 0.0001, + "loss": 4.1119, + "loss/crossentropy": 2.0006097555160522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003134548664093, + "step": 18928 + }, + { + "epoch": 0.3786, + "grad_norm": 2.03125, + "grad_norm_var": 0.0036936442057291666, + "learning_rate": 0.0001, + "loss": 3.7731, + "loss/crossentropy": 2.2906641960144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650038242340088, + "step": 18930 + }, + { + "epoch": 0.37864, + "grad_norm": 2.03125, + "grad_norm_var": 0.0039866129557291664, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.185898005962372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21387950330972672, + "step": 18932 + }, + { + "epoch": 0.37868, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0028928120930989585, + "learning_rate": 0.0001, + "loss": 4.1808, + "loss/crossentropy": 2.2457324266433716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21054718643426895, + "step": 18934 + }, + { + "epoch": 0.37872, + "grad_norm": 2.046875, + "grad_norm_var": 0.0032793680826822915, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.1835416555404663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23258651047945023, + "step": 18936 + }, + { + "epoch": 0.37876, + "grad_norm": 2.078125, + "grad_norm_var": 0.0029436747233072915, + "learning_rate": 0.0001, + "loss": 4.1555, + "loss/crossentropy": 2.1908507347106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20261384546756744, + "step": 18938 + }, + { + "epoch": 0.3788, + "grad_norm": 2.078125, + "grad_norm_var": 0.0035336812337239585, + "learning_rate": 0.0001, + "loss": 4.1359, + "loss/crossentropy": 2.276697278022766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20843130350112915, + "step": 18940 + }, + { + "epoch": 0.37884, + "grad_norm": 1.9375, + "grad_norm_var": 0.0023251851399739582, + "learning_rate": 0.0001, + "loss": 4.1309, + "loss/crossentropy": 2.16153222322464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19693513214588165, + "step": 18942 + }, + { + "epoch": 0.37888, + "grad_norm": 1.96875, + "grad_norm_var": 0.0023590087890625, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 2.293270707130432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21085364371538162, + "step": 18944 + }, + { + "epoch": 0.37892, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0025042215983072918, + "learning_rate": 0.0001, + "loss": 4.1291, + "loss/crossentropy": 2.0382518768310547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21567383408546448, + "step": 18946 + }, + { + "epoch": 0.37896, + "grad_norm": 1.8359375, + "grad_norm_var": 0.00894775390625, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 2.038296341896057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19963379949331284, + "step": 18948 + }, + { + "epoch": 0.379, + "grad_norm": 1.953125, + "grad_norm_var": 0.00994873046875, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.1765074729919434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19551265239715576, + "step": 18950 + }, + { + "epoch": 0.37904, + "grad_norm": 2.015625, + "grad_norm_var": 0.010591379801432292, + "learning_rate": 0.0001, + "loss": 3.8641, + "loss/crossentropy": 1.870033621788025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858022451400757, + "step": 18952 + }, + { + "epoch": 0.37908, + "grad_norm": 2.0625, + "grad_norm_var": 0.011083730061848958, + "learning_rate": 0.0001, + "loss": 4.1877, + "loss/crossentropy": 1.8716632723808289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20534101128578186, + "step": 18954 + }, + { + "epoch": 0.37912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0111236572265625, + "learning_rate": 0.0001, + "loss": 4.1662, + "loss/crossentropy": 1.925970435142517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19596435129642487, + "step": 18956 + }, + { + "epoch": 0.37916, + "grad_norm": 1.890625, + "grad_norm_var": 0.011800130208333334, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.2280589938163757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21037188172340393, + "step": 18958 + }, + { + "epoch": 0.3792, + "grad_norm": 2.0, + "grad_norm_var": 0.0150299072265625, + "learning_rate": 0.0001, + "loss": 3.7683, + "loss/crossentropy": 1.8350458145141602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821446716785431, + "step": 18960 + }, + { + "epoch": 0.37924, + "grad_norm": 1.953125, + "grad_norm_var": 0.015018463134765625, + "learning_rate": 0.0001, + "loss": 4.0376, + "loss/crossentropy": 2.0393940210342407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18378175050020218, + "step": 18962 + }, + { + "epoch": 0.37928, + "grad_norm": 1.9375, + "grad_norm_var": 0.00897216796875, + "learning_rate": 0.0001, + "loss": 4.0018, + "loss/crossentropy": 1.7470228672027588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222351908683777, + "step": 18964 + }, + { + "epoch": 0.37932, + "grad_norm": 2.203125, + "grad_norm_var": 0.011742146809895833, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.3943980932235718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23214909434318542, + "step": 18966 + }, + { + "epoch": 0.37936, + "grad_norm": 1.90625, + "grad_norm_var": 0.012239329020182292, + "learning_rate": 0.0001, + "loss": 3.7877, + "loss/crossentropy": 1.8153263330459595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19808300584554672, + "step": 18968 + }, + { + "epoch": 0.3794, + "grad_norm": 2.0625, + "grad_norm_var": 0.011171213785807292, + "learning_rate": 0.0001, + "loss": 4.0122, + "loss/crossentropy": 2.0037755370140076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20173701643943787, + "step": 18970 + }, + { + "epoch": 0.37944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012168121337890626, + "learning_rate": 0.0001, + "loss": 4.1069, + "loss/crossentropy": 2.144823908805847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21421342343091965, + "step": 18972 + }, + { + "epoch": 0.37948, + "grad_norm": 2.015625, + "grad_norm_var": 0.011565907796223959, + "learning_rate": 0.0001, + "loss": 4.2788, + "loss/crossentropy": 2.2151081562042236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20057211816310883, + "step": 18974 + }, + { + "epoch": 0.37952, + "grad_norm": 2.03125, + "grad_norm_var": 0.008976236979166666, + "learning_rate": 0.0001, + "loss": 4.5061, + "loss/crossentropy": 2.341770827770233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21535535156726837, + "step": 18976 + }, + { + "epoch": 0.37956, + "grad_norm": 2.109375, + "grad_norm_var": 0.009520467122395833, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 1.8952747583389282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17530933022499084, + "step": 18978 + }, + { + "epoch": 0.3796, + "grad_norm": 2.03125, + "grad_norm_var": 0.008885701497395834, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 2.2774226665496826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20682721585035324, + "step": 18980 + }, + { + "epoch": 0.37964, + "grad_norm": 2.078125, + "grad_norm_var": 0.008508046468098959, + "learning_rate": 0.0001, + "loss": 3.982, + "loss/crossentropy": 1.9596800208091736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18774595111608505, + "step": 18982 + }, + { + "epoch": 0.37968, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009309895833333333, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 2.1485220193862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19221040606498718, + "step": 18984 + }, + { + "epoch": 0.37972, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009039052327473958, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 2.2140207290649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012096345424652, + "step": 18986 + }, + { + "epoch": 0.37976, + "grad_norm": 1.890625, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.1088, + "loss/crossentropy": 1.9045360684394836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19290773570537567, + "step": 18988 + }, + { + "epoch": 0.3798, + "grad_norm": 2.03125, + "grad_norm_var": 0.008072662353515624, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 1.8351407051086426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18892163783311844, + "step": 18990 + }, + { + "epoch": 0.37984, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007865397135416667, + "learning_rate": 0.0001, + "loss": 3.9136, + "loss/crossentropy": 2.021119713783264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19214972108602524, + "step": 18992 + }, + { + "epoch": 0.37988, + "grad_norm": 2.0, + "grad_norm_var": 0.006394195556640625, + "learning_rate": 0.0001, + "loss": 4.2961, + "loss/crossentropy": 2.0591527223587036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20295391231775284, + "step": 18994 + }, + { + "epoch": 0.37992, + "grad_norm": 2.0625, + "grad_norm_var": 0.006566365559895833, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 1.9794987440109253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20964892208576202, + "step": 18996 + }, + { + "epoch": 0.37996, + "grad_norm": 2.078125, + "grad_norm_var": 0.0042111714680989586, + "learning_rate": 0.0001, + "loss": 4.0699, + "loss/crossentropy": 2.116178512573242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962205320596695, + "step": 18998 + }, + { + "epoch": 0.38, + "grad_norm": 1.8828125, + "grad_norm_var": 0.003714752197265625, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.2458595037460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981743946671486, + "step": 19000 + }, + { + "epoch": 0.38004, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038266499837239583, + "learning_rate": 0.0001, + "loss": 4.2085, + "loss/crossentropy": 2.0612659454345703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20526322722434998, + "step": 19002 + }, + { + "epoch": 0.38008, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00347900390625, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.0810243487358093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222077377140522, + "step": 19004 + }, + { + "epoch": 0.38012, + "grad_norm": 1.890625, + "grad_norm_var": 0.003940582275390625, + "learning_rate": 0.0001, + "loss": 4.0957, + "loss/crossentropy": 2.2651617527008057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106148600578308, + "step": 19006 + }, + { + "epoch": 0.38016, + "grad_norm": 1.8125, + "grad_norm_var": 0.0056793212890625, + "learning_rate": 0.0001, + "loss": 3.9671, + "loss/crossentropy": 1.9701108932495117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18778277933597565, + "step": 19008 + }, + { + "epoch": 0.3802, + "grad_norm": 1.859375, + "grad_norm_var": 0.007024892171223958, + "learning_rate": 0.0001, + "loss": 4.1265, + "loss/crossentropy": 1.9822896122932434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25280050933361053, + "step": 19010 + }, + { + "epoch": 0.38024, + "grad_norm": 2.0, + "grad_norm_var": 0.0063435872395833336, + "learning_rate": 0.0001, + "loss": 3.9577, + "loss/crossentropy": 2.020545542240143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19008295983076096, + "step": 19012 + }, + { + "epoch": 0.38028, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007336171468098959, + "learning_rate": 0.0001, + "loss": 4.2044, + "loss/crossentropy": 2.0922394394874573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972956582903862, + "step": 19014 + }, + { + "epoch": 0.38032, + "grad_norm": 1.859375, + "grad_norm_var": 0.007067616780598958, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 2.028052031993866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18549586832523346, + "step": 19016 + }, + { + "epoch": 0.38036, + "grad_norm": 1.6796875, + "grad_norm_var": 0.011395009358723958, + "learning_rate": 0.0001, + "loss": 4.0118, + "loss/crossentropy": 1.9518752098083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17747169733047485, + "step": 19018 + }, + { + "epoch": 0.3804, + "grad_norm": 2.078125, + "grad_norm_var": 0.013492838541666666, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 1.8896766901016235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20509368181228638, + "step": 19020 + }, + { + "epoch": 0.38044, + "grad_norm": 2.046875, + "grad_norm_var": 0.01407470703125, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 2.10469388961792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058955579996109, + "step": 19022 + }, + { + "epoch": 0.38048, + "grad_norm": 1.875, + "grad_norm_var": 0.012717437744140626, + "learning_rate": 0.0001, + "loss": 3.8601, + "loss/crossentropy": 1.7341394424438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19358114898204803, + "step": 19024 + }, + { + "epoch": 0.38052, + "grad_norm": 1.7734375, + "grad_norm_var": 0.013634999593098959, + "learning_rate": 0.0001, + "loss": 3.8528, + "loss/crossentropy": 2.186724543571472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981809437274933, + "step": 19026 + }, + { + "epoch": 0.38056, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014070638020833333, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.228869318962097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20712755620479584, + "step": 19028 + }, + { + "epoch": 0.3806, + "grad_norm": 2.125, + "grad_norm_var": 0.0140869140625, + "learning_rate": 0.0001, + "loss": 4.126, + "loss/crossentropy": 2.2783373594284058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916558057069778, + "step": 19030 + }, + { + "epoch": 0.38064, + "grad_norm": 1.96875, + "grad_norm_var": 0.013270823160807292, + "learning_rate": 0.0001, + "loss": 4.2744, + "loss/crossentropy": 2.2671823501586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389723241329193, + "step": 19032 + }, + { + "epoch": 0.38068, + "grad_norm": 1.859375, + "grad_norm_var": 0.008540852864583334, + "learning_rate": 0.0001, + "loss": 4.0267, + "loss/crossentropy": 1.948801040649414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766260892152786, + "step": 19034 + }, + { + "epoch": 0.38072, + "grad_norm": 1.90625, + "grad_norm_var": 0.007818349202473958, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 1.895260751247406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848379746079445, + "step": 19036 + }, + { + "epoch": 0.38076, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008504231770833334, + "learning_rate": 0.0001, + "loss": 3.8189, + "loss/crossentropy": 1.8295226097106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816689372062683, + "step": 19038 + }, + { + "epoch": 0.3808, + "grad_norm": 2.0, + "grad_norm_var": 0.007830556233723958, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 1.6487661004066467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.178475059568882, + "step": 19040 + }, + { + "epoch": 0.38084, + "grad_norm": 1.921875, + "grad_norm_var": 0.0058095296223958336, + "learning_rate": 0.0001, + "loss": 3.828, + "loss/crossentropy": 1.9264054894447327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009180784225464, + "step": 19042 + }, + { + "epoch": 0.38088, + "grad_norm": 1.84375, + "grad_norm_var": 0.007989247639973959, + "learning_rate": 0.0001, + "loss": 4.2636, + "loss/crossentropy": 2.3018531799316406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039026990532875, + "step": 19044 + }, + { + "epoch": 0.38092, + "grad_norm": 2.15625, + "grad_norm_var": 0.009093983968098959, + "learning_rate": 0.0001, + "loss": 4.0201, + "loss/crossentropy": 2.135006010532379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172908216714859, + "step": 19046 + }, + { + "epoch": 0.38096, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009476725260416667, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.1111900806427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19583145529031754, + "step": 19048 + }, + { + "epoch": 0.381, + "grad_norm": 2.015625, + "grad_norm_var": 0.0096099853515625, + "learning_rate": 0.0001, + "loss": 4.1165, + "loss/crossentropy": 2.208159327507019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21107421815395355, + "step": 19050 + }, + { + "epoch": 0.38104, + "grad_norm": 2.203125, + "grad_norm_var": 0.012894694010416667, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.144998788833618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108323574066162, + "step": 19052 + }, + { + "epoch": 0.38108, + "grad_norm": 2.125, + "grad_norm_var": 0.0116455078125, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 2.0046940445899963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20454590767621994, + "step": 19054 + }, + { + "epoch": 0.38112, + "grad_norm": 2.03125, + "grad_norm_var": 0.010992177327473958, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 2.042281448841095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21530431509017944, + "step": 19056 + }, + { + "epoch": 0.38116, + "grad_norm": 2.09375, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 4.2701, + "loss/crossentropy": 2.0579177141189575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769091695547104, + "step": 19058 + }, + { + "epoch": 0.3812, + "grad_norm": 2.3125, + "grad_norm_var": 0.012719472249348959, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 1.8340198993682861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1927536353468895, + "step": 19060 + }, + { + "epoch": 0.38124, + "grad_norm": 2.375, + "grad_norm_var": 0.016584269205729165, + "learning_rate": 0.0001, + "loss": 4.0412, + "loss/crossentropy": 2.146699070930481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840950727462769, + "step": 19062 + }, + { + "epoch": 0.38128, + "grad_norm": 1.90625, + "grad_norm_var": 0.017600250244140626, + "learning_rate": 0.0001, + "loss": 3.7448, + "loss/crossentropy": 2.0206886529922485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20354034006595612, + "step": 19064 + }, + { + "epoch": 0.38132, + "grad_norm": 1.78125, + "grad_norm_var": 0.023372141520182292, + "learning_rate": 0.0001, + "loss": 4.0047, + "loss/crossentropy": 1.9568690061569214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183781698346138, + "step": 19066 + }, + { + "epoch": 0.38136, + "grad_norm": 2.109375, + "grad_norm_var": 0.022855631510416665, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 2.092573404312134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20443417131900787, + "step": 19068 + }, + { + "epoch": 0.3814, + "grad_norm": 1.84375, + "grad_norm_var": 0.025770823160807293, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.17366099357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064819559454918, + "step": 19070 + }, + { + "epoch": 0.38144, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0265777587890625, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 1.7935467958450317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19256697595119476, + "step": 19072 + }, + { + "epoch": 0.38148, + "grad_norm": 1.9765625, + "grad_norm_var": 0.026554107666015625, + "learning_rate": 0.0001, + "loss": 3.9846, + "loss/crossentropy": 1.995141625404358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19474036246538162, + "step": 19074 + }, + { + "epoch": 0.38152, + "grad_norm": 1.8203125, + "grad_norm_var": 0.023221588134765624, + "learning_rate": 0.0001, + "loss": 3.9058, + "loss/crossentropy": 2.140002131462097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19610393047332764, + "step": 19076 + }, + { + "epoch": 0.38156, + "grad_norm": 2.0, + "grad_norm_var": 0.013288370768229167, + "learning_rate": 0.0001, + "loss": 4.1176, + "loss/crossentropy": 2.1740564107894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20218566060066223, + "step": 19078 + }, + { + "epoch": 0.3816, + "grad_norm": 2.1875, + "grad_norm_var": 0.015925089518229168, + "learning_rate": 0.0001, + "loss": 4.3043, + "loss/crossentropy": 2.065530776977539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21683599054813385, + "step": 19080 + }, + { + "epoch": 0.38164, + "grad_norm": 2.0625, + "grad_norm_var": 0.0127349853515625, + "learning_rate": 0.0001, + "loss": 4.2859, + "loss/crossentropy": 1.844248354434967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479060918092728, + "step": 19082 + }, + { + "epoch": 0.38168, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011136627197265625, + "learning_rate": 0.0001, + "loss": 3.886, + "loss/crossentropy": 1.8596556186676025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059667930006981, + "step": 19084 + }, + { + "epoch": 0.38172, + "grad_norm": 2.234375, + "grad_norm_var": 0.014731597900390626, + "learning_rate": 0.0001, + "loss": 4.0982, + "loss/crossentropy": 2.08061683177948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20623548328876495, + "step": 19086 + }, + { + "epoch": 0.38176, + "grad_norm": 2.03125, + "grad_norm_var": 0.014615631103515625, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 1.888766884803772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18000122904777527, + "step": 19088 + }, + { + "epoch": 0.3818, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017032877604166666, + "learning_rate": 0.0001, + "loss": 3.8784, + "loss/crossentropy": 2.021374225616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468255668878555, + "step": 19090 + }, + { + "epoch": 0.38184, + "grad_norm": 2.0625, + "grad_norm_var": 0.016805013020833332, + "learning_rate": 0.0001, + "loss": 3.8327, + "loss/crossentropy": 2.083697557449341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21188172698020935, + "step": 19092 + }, + { + "epoch": 0.38188, + "grad_norm": 2.171875, + "grad_norm_var": 0.02019017537434896, + "learning_rate": 0.0001, + "loss": 3.9617, + "loss/crossentropy": 2.1484656929969788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978207677602768, + "step": 19094 + }, + { + "epoch": 0.38192, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0218414306640625, + "learning_rate": 0.0001, + "loss": 4.1728, + "loss/crossentropy": 2.321221709251404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22688252478837967, + "step": 19096 + }, + { + "epoch": 0.38196, + "grad_norm": 2.703125, + "grad_norm_var": 0.0518218994140625, + "learning_rate": 0.0001, + "loss": 4.157, + "loss/crossentropy": 2.0623167753219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148035168647766, + "step": 19098 + }, + { + "epoch": 0.382, + "grad_norm": 2.234375, + "grad_norm_var": 0.05501708984375, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 1.8845015168190002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880025863647461, + "step": 19100 + }, + { + "epoch": 0.38204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.05045750935872396, + "learning_rate": 0.0001, + "loss": 3.8361, + "loss/crossentropy": 1.927463173866272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446774244308472, + "step": 19102 + }, + { + "epoch": 0.38208, + "grad_norm": 2.0, + "grad_norm_var": 0.0536529541015625, + "learning_rate": 0.0001, + "loss": 4.2909, + "loss/crossentropy": 2.6780699491500854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22951193153858185, + "step": 19104 + }, + { + "epoch": 0.38212, + "grad_norm": 1.875, + "grad_norm_var": 0.05358250935872396, + "learning_rate": 0.0001, + "loss": 3.9188, + "loss/crossentropy": 2.2530760765075684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2141779437661171, + "step": 19106 + }, + { + "epoch": 0.38216, + "grad_norm": 1.90625, + "grad_norm_var": 0.07016499837239583, + "learning_rate": 0.0001, + "loss": 4.1647, + "loss/crossentropy": 2.0292217135429382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20607471466064453, + "step": 19108 + }, + { + "epoch": 0.3822, + "grad_norm": 2.015625, + "grad_norm_var": 0.07394790649414062, + "learning_rate": 0.0001, + "loss": 3.8144, + "loss/crossentropy": 1.8341345191001892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18437029421329498, + "step": 19110 + }, + { + "epoch": 0.38224, + "grad_norm": 2.078125, + "grad_norm_var": 0.06982421875, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 1.9262341260910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894589066505432, + "step": 19112 + }, + { + "epoch": 0.38228, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04212824503580729, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 1.9369717240333557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17737293988466263, + "step": 19114 + }, + { + "epoch": 0.38232, + "grad_norm": 1.953125, + "grad_norm_var": 0.03845621744791667, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 1.7547513842582703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895698457956314, + "step": 19116 + }, + { + "epoch": 0.38236, + "grad_norm": 2.15625, + "grad_norm_var": 0.042789459228515625, + "learning_rate": 0.0001, + "loss": 4.2814, + "loss/crossentropy": 2.2367511987686157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22208665311336517, + "step": 19118 + }, + { + "epoch": 0.3824, + "grad_norm": 1.890625, + "grad_norm_var": 0.040169016520182295, + "learning_rate": 0.0001, + "loss": 4.1671, + "loss/crossentropy": 2.248104691505432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132478505373001, + "step": 19120 + }, + { + "epoch": 0.38244, + "grad_norm": 1.7890625, + "grad_norm_var": 0.040415191650390626, + "learning_rate": 0.0001, + "loss": 4.0371, + "loss/crossentropy": 2.237170696258545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21920116990804672, + "step": 19122 + }, + { + "epoch": 0.38248, + "grad_norm": 1.84375, + "grad_norm_var": 0.018027496337890626, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 1.82357919216156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1723759025335312, + "step": 19124 + }, + { + "epoch": 0.38252, + "grad_norm": 1.8046875, + "grad_norm_var": 0.018668619791666667, + "learning_rate": 0.0001, + "loss": 3.8541, + "loss/crossentropy": 1.9371765851974487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19782865047454834, + "step": 19126 + }, + { + "epoch": 0.38256, + "grad_norm": 2.1875, + "grad_norm_var": 0.0208404541015625, + "learning_rate": 0.0001, + "loss": 4.1305, + "loss/crossentropy": 1.8295999765396118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756351813673973, + "step": 19128 + }, + { + "epoch": 0.3826, + "grad_norm": 2.046875, + "grad_norm_var": 0.044077301025390626, + "learning_rate": 0.0001, + "loss": 4.0657, + "loss/crossentropy": 2.0991050601005554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17175115644931793, + "step": 19130 + }, + { + "epoch": 0.38264, + "grad_norm": 2.078125, + "grad_norm_var": 0.04390640258789062, + "learning_rate": 0.0001, + "loss": 4.1359, + "loss/crossentropy": 2.2707191705703735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22085107117891312, + "step": 19132 + }, + { + "epoch": 0.38268, + "grad_norm": 2.015625, + "grad_norm_var": 0.03630345662434896, + "learning_rate": 0.0001, + "loss": 4.352, + "loss/crossentropy": 2.237929582595825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2524951994419098, + "step": 19134 + }, + { + "epoch": 0.38272, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03629735310872396, + "learning_rate": 0.0001, + "loss": 4.0293, + "loss/crossentropy": 1.9790211915969849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156784772872925, + "step": 19136 + }, + { + "epoch": 0.38276, + "grad_norm": 1.890625, + "grad_norm_var": 0.034795888264973956, + "learning_rate": 0.0001, + "loss": 3.9467, + "loss/crossentropy": 1.9819305539131165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18941760063171387, + "step": 19138 + }, + { + "epoch": 0.3828, + "grad_norm": 2.046875, + "grad_norm_var": 0.032022857666015626, + "learning_rate": 0.0001, + "loss": 3.9699, + "loss/crossentropy": 1.9541404843330383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195252887904644, + "step": 19140 + }, + { + "epoch": 0.38284, + "grad_norm": 2.0, + "grad_norm_var": 0.028824869791666666, + "learning_rate": 0.0001, + "loss": 3.9563, + "loss/crossentropy": 2.1265366673469543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20113253593444824, + "step": 19142 + }, + { + "epoch": 0.38288, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0292724609375, + "learning_rate": 0.0001, + "loss": 4.0248, + "loss/crossentropy": 1.767483413219452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18520043045282364, + "step": 19144 + }, + { + "epoch": 0.38292, + "grad_norm": 2.0625, + "grad_norm_var": 0.005352528889973959, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 2.1124974489212036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909518748521805, + "step": 19146 + }, + { + "epoch": 0.38296, + "grad_norm": 2.046875, + "grad_norm_var": 0.005181630452473958, + "learning_rate": 0.0001, + "loss": 4.3104, + "loss/crossentropy": 2.2959529161453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183123677968979, + "step": 19148 + }, + { + "epoch": 0.383, + "grad_norm": 2.0625, + "grad_norm_var": 0.005078125, + "learning_rate": 0.0001, + "loss": 4.3217, + "loss/crossentropy": 2.3522391319274902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22062842547893524, + "step": 19150 + }, + { + "epoch": 0.38304, + "grad_norm": 1.953125, + "grad_norm_var": 0.0048095703125, + "learning_rate": 0.0001, + "loss": 3.7375, + "loss/crossentropy": 1.6673399806022644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16702746599912643, + "step": 19152 + }, + { + "epoch": 0.38308, + "grad_norm": 1.984375, + "grad_norm_var": 0.004073079427083333, + "learning_rate": 0.0001, + "loss": 4.2058, + "loss/crossentropy": 2.021436333656311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833282381296158, + "step": 19154 + }, + { + "epoch": 0.38312, + "grad_norm": 2.0, + "grad_norm_var": 0.007439931233723958, + "learning_rate": 0.0001, + "loss": 3.7624, + "loss/crossentropy": 1.4123128056526184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1485099121928215, + "step": 19156 + }, + { + "epoch": 0.38316, + "grad_norm": 1.8125, + "grad_norm_var": 0.009959920247395834, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 1.9472095966339111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16975026577711105, + "step": 19158 + }, + { + "epoch": 0.3832, + "grad_norm": 2.078125, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.2025, + "loss/crossentropy": 2.112083077430725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198200300335884, + "step": 19160 + }, + { + "epoch": 0.38324, + "grad_norm": 2.015625, + "grad_norm_var": 0.0121246337890625, + "learning_rate": 0.0001, + "loss": 3.9661, + "loss/crossentropy": 2.1301331520080566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039676010608673, + "step": 19162 + }, + { + "epoch": 0.38328, + "grad_norm": 2.109375, + "grad_norm_var": 0.019456990559895835, + "learning_rate": 0.0001, + "loss": 4.0655, + "loss/crossentropy": 1.8461318016052246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17811527848243713, + "step": 19164 + }, + { + "epoch": 0.38332, + "grad_norm": 1.984375, + "grad_norm_var": 0.018973795572916667, + "learning_rate": 0.0001, + "loss": 4.0408, + "loss/crossentropy": 2.102015793323517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21166063100099564, + "step": 19166 + }, + { + "epoch": 0.38336, + "grad_norm": 1.8359375, + "grad_norm_var": 0.020167795817057292, + "learning_rate": 0.0001, + "loss": 3.9572, + "loss/crossentropy": 1.9496545791625977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17148377001285553, + "step": 19168 + }, + { + "epoch": 0.3834, + "grad_norm": 1.890625, + "grad_norm_var": 0.020643870035807293, + "learning_rate": 0.0001, + "loss": 4.0675, + "loss/crossentropy": 2.059878885746002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984856054186821, + "step": 19170 + }, + { + "epoch": 0.38344, + "grad_norm": 1.8359375, + "grad_norm_var": 0.017923990885416668, + "learning_rate": 0.0001, + "loss": 3.6476, + "loss/crossentropy": 1.7966619729995728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17823103070259094, + "step": 19172 + }, + { + "epoch": 0.38348, + "grad_norm": 1.8359375, + "grad_norm_var": 0.01645075480143229, + "learning_rate": 0.0001, + "loss": 3.6671, + "loss/crossentropy": 2.0173062086105347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19681749492883682, + "step": 19174 + }, + { + "epoch": 0.38352, + "grad_norm": 1.7265625, + "grad_norm_var": 0.019108072916666666, + "learning_rate": 0.0001, + "loss": 3.6439, + "loss/crossentropy": 1.6330417394638062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17260564863681793, + "step": 19176 + }, + { + "epoch": 0.38356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01646728515625, + "learning_rate": 0.0001, + "loss": 3.8957, + "loss/crossentropy": 1.6575458645820618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17257437109947205, + "step": 19178 + }, + { + "epoch": 0.3836, + "grad_norm": 2.0625, + "grad_norm_var": 0.006266021728515625, + "learning_rate": 0.0001, + "loss": 3.8633, + "loss/crossentropy": 2.094203770160675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039681375026703, + "step": 19180 + }, + { + "epoch": 0.38364, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006306966145833333, + "learning_rate": 0.0001, + "loss": 3.9579, + "loss/crossentropy": 1.9561032056808472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19183862209320068, + "step": 19182 + }, + { + "epoch": 0.38368, + "grad_norm": 2.609375, + "grad_norm_var": 0.03765055338541667, + "learning_rate": 0.0001, + "loss": 4.4288, + "loss/crossentropy": 1.950503408908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198556549847126, + "step": 19184 + }, + { + "epoch": 0.38372, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0398193359375, + "learning_rate": 0.0001, + "loss": 4.2354, + "loss/crossentropy": 2.421600103378296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208856999874115, + "step": 19186 + }, + { + "epoch": 0.38376, + "grad_norm": 1.96875, + "grad_norm_var": 0.04036356608072917, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 1.7086477279663086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327964633703232, + "step": 19188 + }, + { + "epoch": 0.3838, + "grad_norm": 2.171875, + "grad_norm_var": 0.041071573893229164, + "learning_rate": 0.0001, + "loss": 4.1488, + "loss/crossentropy": 2.1670665740966797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19333725422620773, + "step": 19190 + }, + { + "epoch": 0.38384, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03430582682291667, + "learning_rate": 0.0001, + "loss": 4.3145, + "loss/crossentropy": 2.218670129776001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20835422724485397, + "step": 19192 + }, + { + "epoch": 0.38388, + "grad_norm": 1.828125, + "grad_norm_var": 0.037699127197265626, + "learning_rate": 0.0001, + "loss": 3.7753, + "loss/crossentropy": 1.909380555152893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018050491809845, + "step": 19194 + }, + { + "epoch": 0.38392, + "grad_norm": 1.953125, + "grad_norm_var": 0.038211822509765625, + "learning_rate": 0.0001, + "loss": 4.1049, + "loss/crossentropy": 1.9840248227119446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19120849668979645, + "step": 19196 + }, + { + "epoch": 0.38396, + "grad_norm": 2.265625, + "grad_norm_var": 0.042012532552083336, + "learning_rate": 0.0001, + "loss": 4.4124, + "loss/crossentropy": 2.3527809381484985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009320974349976, + "step": 19198 + }, + { + "epoch": 0.384, + "grad_norm": 1.96875, + "grad_norm_var": 0.0181793212890625, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 2.2111377716064453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603972136974335, + "step": 19200 + }, + { + "epoch": 0.38404, + "grad_norm": 1.9375, + "grad_norm_var": 0.01685358683268229, + "learning_rate": 0.0001, + "loss": 3.9995, + "loss/crossentropy": 1.6832327842712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18738068640232086, + "step": 19202 + }, + { + "epoch": 0.38408, + "grad_norm": 2.046875, + "grad_norm_var": 0.015805816650390624, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 1.9412715435028076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835481896996498, + "step": 19204 + }, + { + "epoch": 0.38412, + "grad_norm": 2.046875, + "grad_norm_var": 0.013451131184895833, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 2.0641706585884094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165929198265076, + "step": 19206 + }, + { + "epoch": 0.38416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011400349934895833, + "learning_rate": 0.0001, + "loss": 4.0054, + "loss/crossentropy": 2.2053582668304443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108134999871254, + "step": 19208 + }, + { + "epoch": 0.3842, + "grad_norm": 1.828125, + "grad_norm_var": 0.010015614827473958, + "learning_rate": 0.0001, + "loss": 3.8549, + "loss/crossentropy": 2.0534805059432983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19088375568389893, + "step": 19210 + }, + { + "epoch": 0.38424, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010969034830729167, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 1.8782867789268494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17865915596485138, + "step": 19212 + }, + { + "epoch": 0.38428, + "grad_norm": 2.078125, + "grad_norm_var": 0.0059201558430989586, + "learning_rate": 0.0001, + "loss": 4.0641, + "loss/crossentropy": 1.9773831963539124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20622816681861877, + "step": 19214 + }, + { + "epoch": 0.38432, + "grad_norm": 2.078125, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.1479, + "loss/crossentropy": 2.0890401005744934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173013687133789, + "step": 19216 + }, + { + "epoch": 0.38436, + "grad_norm": 1.90625, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 3.7221, + "loss/crossentropy": 1.858555793762207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15820113569498062, + "step": 19218 + }, + { + "epoch": 0.3844, + "grad_norm": 2.078125, + "grad_norm_var": 0.00911865234375, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.2149851322174072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2221473827958107, + "step": 19220 + }, + { + "epoch": 0.38444, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009490712483723959, + "learning_rate": 0.0001, + "loss": 3.7057, + "loss/crossentropy": 2.086311161518097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791597157716751, + "step": 19222 + }, + { + "epoch": 0.38448, + "grad_norm": 1.90625, + "grad_norm_var": 0.009112294514973958, + "learning_rate": 0.0001, + "loss": 3.9117, + "loss/crossentropy": 1.810127079486847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018662691116333, + "step": 19224 + }, + { + "epoch": 0.38452, + "grad_norm": 1.984375, + "grad_norm_var": 0.009064737955729167, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.132863163948059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22217299789190292, + "step": 19226 + }, + { + "epoch": 0.38456, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008299763997395833, + "learning_rate": 0.0001, + "loss": 3.8697, + "loss/crossentropy": 2.0617652535438538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968286782503128, + "step": 19228 + }, + { + "epoch": 0.3846, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007250722249348958, + "learning_rate": 0.0001, + "loss": 3.8566, + "loss/crossentropy": 1.9286837577819824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20616496354341507, + "step": 19230 + }, + { + "epoch": 0.38464, + "grad_norm": 2.125, + "grad_norm_var": 0.008150227864583333, + "learning_rate": 0.0001, + "loss": 4.2854, + "loss/crossentropy": 2.009112238883972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038759917020798, + "step": 19232 + }, + { + "epoch": 0.38468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006818644205729167, + "learning_rate": 0.0001, + "loss": 4.0879, + "loss/crossentropy": 2.1937737464904785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20164573192596436, + "step": 19234 + }, + { + "epoch": 0.38472, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00587158203125, + "learning_rate": 0.0001, + "loss": 4.2356, + "loss/crossentropy": 1.981977641582489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20445218682289124, + "step": 19236 + }, + { + "epoch": 0.38476, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004648590087890625, + "learning_rate": 0.0001, + "loss": 3.9515, + "loss/crossentropy": 1.6332372426986694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1629931926727295, + "step": 19238 + }, + { + "epoch": 0.3848, + "grad_norm": 1.9375, + "grad_norm_var": 0.003952789306640625, + "learning_rate": 0.0001, + "loss": 3.9061, + "loss/crossentropy": 1.8656854629516602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18546659499406815, + "step": 19240 + }, + { + "epoch": 0.38484, + "grad_norm": 2.09375, + "grad_norm_var": 0.005594635009765625, + "learning_rate": 0.0001, + "loss": 4.1251, + "loss/crossentropy": 2.219391703605652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21327267587184906, + "step": 19242 + }, + { + "epoch": 0.38488, + "grad_norm": 2.171875, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 0.0001, + "loss": 4.448, + "loss/crossentropy": 1.9518468976020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23419301211833954, + "step": 19244 + }, + { + "epoch": 0.38492, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007411448160807291, + "learning_rate": 0.0001, + "loss": 4.2947, + "loss/crossentropy": 2.190311014652252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218230664730072, + "step": 19246 + }, + { + "epoch": 0.38496, + "grad_norm": 1.765625, + "grad_norm_var": 0.009633127848307292, + "learning_rate": 0.0001, + "loss": 3.8475, + "loss/crossentropy": 1.8347881436347961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036222785711288, + "step": 19248 + }, + { + "epoch": 0.385, + "grad_norm": 2.0625, + "grad_norm_var": 0.012157185872395834, + "learning_rate": 0.0001, + "loss": 3.9732, + "loss/crossentropy": 1.949516236782074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20307819545269012, + "step": 19250 + }, + { + "epoch": 0.38504, + "grad_norm": 1.9375, + "grad_norm_var": 0.012892405192057291, + "learning_rate": 0.0001, + "loss": 4.007, + "loss/crossentropy": 1.883664846420288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1825740560889244, + "step": 19252 + }, + { + "epoch": 0.38508, + "grad_norm": 2.140625, + "grad_norm_var": 0.0146881103515625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.1245557069778442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20935190469026566, + "step": 19254 + }, + { + "epoch": 0.38512, + "grad_norm": 2.078125, + "grad_norm_var": 0.015166982014973959, + "learning_rate": 0.0001, + "loss": 3.9125, + "loss/crossentropy": 1.546354353427887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15509265661239624, + "step": 19256 + }, + { + "epoch": 0.38516, + "grad_norm": 1.96875, + "grad_norm_var": 0.014290110270182291, + "learning_rate": 0.0001, + "loss": 4.1744, + "loss/crossentropy": 1.979533076286316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994720607995987, + "step": 19258 + }, + { + "epoch": 0.3852, + "grad_norm": 1.984375, + "grad_norm_var": 0.014074452718098958, + "learning_rate": 0.0001, + "loss": 3.7774, + "loss/crossentropy": 2.125569224357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21175500005483627, + "step": 19260 + }, + { + "epoch": 0.38524, + "grad_norm": 1.859375, + "grad_norm_var": 0.0124267578125, + "learning_rate": 0.0001, + "loss": 3.911, + "loss/crossentropy": 2.027758777141571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20255477726459503, + "step": 19262 + }, + { + "epoch": 0.38528, + "grad_norm": 1.84375, + "grad_norm_var": 0.011058553059895834, + "learning_rate": 0.0001, + "loss": 3.4728, + "loss/crossentropy": 1.9937690496444702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836088001728058, + "step": 19264 + }, + { + "epoch": 0.38532, + "grad_norm": 1.921875, + "grad_norm_var": 0.00863037109375, + "learning_rate": 0.0001, + "loss": 4.1797, + "loss/crossentropy": 2.0846773386001587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18216369301080704, + "step": 19266 + }, + { + "epoch": 0.38536, + "grad_norm": 1.984375, + "grad_norm_var": 0.008429972330729167, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 1.445238471031189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17925872653722763, + "step": 19268 + }, + { + "epoch": 0.3854, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0061757405598958336, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.0442845821380615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024901956319809, + "step": 19270 + }, + { + "epoch": 0.38544, + "grad_norm": 1.7265625, + "grad_norm_var": 0.009144846598307292, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 1.9336092472076416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17869817465543747, + "step": 19272 + }, + { + "epoch": 0.38548, + "grad_norm": 2.125, + "grad_norm_var": 0.010884348551432292, + "learning_rate": 0.0001, + "loss": 4.1939, + "loss/crossentropy": 1.815299928188324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182998888194561, + "step": 19274 + }, + { + "epoch": 0.38552, + "grad_norm": 2.0, + "grad_norm_var": 0.008886464436848958, + "learning_rate": 0.0001, + "loss": 4.2121, + "loss/crossentropy": 2.091490864753723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843900948762894, + "step": 19276 + }, + { + "epoch": 0.38556, + "grad_norm": 2.03125, + "grad_norm_var": 0.009608713785807292, + "learning_rate": 0.0001, + "loss": 4.098, + "loss/crossentropy": 1.9923794865608215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186911903321743, + "step": 19278 + }, + { + "epoch": 0.3856, + "grad_norm": 1.828125, + "grad_norm_var": 0.009867350260416666, + "learning_rate": 0.0001, + "loss": 4.0658, + "loss/crossentropy": 1.9323501586914062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931384950876236, + "step": 19280 + }, + { + "epoch": 0.38564, + "grad_norm": 1.875, + "grad_norm_var": 0.010262044270833333, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 1.8469224572181702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835423856973648, + "step": 19282 + }, + { + "epoch": 0.38568, + "grad_norm": 1.859375, + "grad_norm_var": 0.010453287760416667, + "learning_rate": 0.0001, + "loss": 3.9027, + "loss/crossentropy": 1.9476045370101929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17014781385660172, + "step": 19284 + }, + { + "epoch": 0.38572, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010285441080729167, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 1.7913609743118286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18661632388830185, + "step": 19286 + }, + { + "epoch": 0.38576, + "grad_norm": 1.8046875, + "grad_norm_var": 0.00760498046875, + "learning_rate": 0.0001, + "loss": 4.0283, + "loss/crossentropy": 1.9682837128639221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799641102552414, + "step": 19288 + }, + { + "epoch": 0.3858, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006331125895182292, + "learning_rate": 0.0001, + "loss": 4.3075, + "loss/crossentropy": 2.117633819580078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208334282040596, + "step": 19290 + }, + { + "epoch": 0.38584, + "grad_norm": 1.890625, + "grad_norm_var": 0.005761464436848958, + "learning_rate": 0.0001, + "loss": 4.1215, + "loss/crossentropy": 2.1208746433258057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928698778152466, + "step": 19292 + }, + { + "epoch": 0.38588, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005549112955729167, + "learning_rate": 0.0001, + "loss": 4.0572, + "loss/crossentropy": 1.9424505829811096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18668201565742493, + "step": 19294 + }, + { + "epoch": 0.38592, + "grad_norm": 1.921875, + "grad_norm_var": 0.005277252197265625, + "learning_rate": 0.0001, + "loss": 4.1575, + "loss/crossentropy": 2.251755177974701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21153685450553894, + "step": 19296 + }, + { + "epoch": 0.38596, + "grad_norm": 2.078125, + "grad_norm_var": 0.007114410400390625, + "learning_rate": 0.0001, + "loss": 4.1885, + "loss/crossentropy": 2.1629436016082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206063412129879, + "step": 19298 + }, + { + "epoch": 0.386, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0069732666015625, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 2.0333832502365112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872929260134697, + "step": 19300 + }, + { + "epoch": 0.38604, + "grad_norm": 1.890625, + "grad_norm_var": 0.007641347249348959, + "learning_rate": 0.0001, + "loss": 3.8433, + "loss/crossentropy": 1.8364217281341553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17408161610364914, + "step": 19302 + }, + { + "epoch": 0.38608, + "grad_norm": 1.890625, + "grad_norm_var": 0.005973307291666666, + "learning_rate": 0.0001, + "loss": 3.9175, + "loss/crossentropy": 1.975037932395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18705998361110687, + "step": 19304 + }, + { + "epoch": 0.38612, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006477864583333334, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 1.6889175176620483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16312924772500992, + "step": 19306 + }, + { + "epoch": 0.38616, + "grad_norm": 1.9375, + "grad_norm_var": 0.006392161051432292, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.2644211053848267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18390889465808868, + "step": 19308 + }, + { + "epoch": 0.3862, + "grad_norm": 2.1875, + "grad_norm_var": 0.009723917643229166, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.180909037590027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20263531804084778, + "step": 19310 + }, + { + "epoch": 0.38624, + "grad_norm": 2.015625, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.2001, + "loss/crossentropy": 2.4007444381713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23251044005155563, + "step": 19312 + }, + { + "epoch": 0.38628, + "grad_norm": 2.0, + "grad_norm_var": 0.010534413655598958, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 1.8122236728668213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18613354116678238, + "step": 19314 + }, + { + "epoch": 0.38632, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010404459635416667, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.159746825695038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21711497008800507, + "step": 19316 + }, + { + "epoch": 0.38636, + "grad_norm": 1.953125, + "grad_norm_var": 0.009627024332682291, + "learning_rate": 0.0001, + "loss": 3.9929, + "loss/crossentropy": 2.129119336605072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19623322039842606, + "step": 19318 + }, + { + "epoch": 0.3864, + "grad_norm": 1.96875, + "grad_norm_var": 0.009934234619140624, + "learning_rate": 0.0001, + "loss": 4.1578, + "loss/crossentropy": 2.322758913040161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20978064835071564, + "step": 19320 + }, + { + "epoch": 0.38644, + "grad_norm": 2.015625, + "grad_norm_var": 0.010322825113932291, + "learning_rate": 0.0001, + "loss": 4.0266, + "loss/crossentropy": 2.2643767595291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21338346600532532, + "step": 19322 + }, + { + "epoch": 0.38648, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010731760660807292, + "learning_rate": 0.0001, + "loss": 4.0614, + "loss/crossentropy": 1.8038535118103027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042538851499557, + "step": 19324 + }, + { + "epoch": 0.38652, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0069620768229166664, + "learning_rate": 0.0001, + "loss": 3.9641, + "loss/crossentropy": 2.0847853422164917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965423971414566, + "step": 19326 + }, + { + "epoch": 0.38656, + "grad_norm": 2.015625, + "grad_norm_var": 0.007692209879557292, + "learning_rate": 0.0001, + "loss": 4.3347, + "loss/crossentropy": 2.2121591567993164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21294504404067993, + "step": 19328 + }, + { + "epoch": 0.3866, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005985260009765625, + "learning_rate": 0.0001, + "loss": 3.9869, + "loss/crossentropy": 2.100327968597412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20181559026241302, + "step": 19330 + }, + { + "epoch": 0.38664, + "grad_norm": 1.78125, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 3.642, + "loss/crossentropy": 1.7100898623466492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18045856803655624, + "step": 19332 + }, + { + "epoch": 0.38668, + "grad_norm": 1.953125, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 3.712, + "loss/crossentropy": 1.6435258388519287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821775734424591, + "step": 19334 + }, + { + "epoch": 0.38672, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008674875895182291, + "learning_rate": 0.0001, + "loss": 3.8334, + "loss/crossentropy": 2.338008165359497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2040715217590332, + "step": 19336 + }, + { + "epoch": 0.38676, + "grad_norm": 1.78125, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 1.8373408913612366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17516936361789703, + "step": 19338 + }, + { + "epoch": 0.3868, + "grad_norm": 1.921875, + "grad_norm_var": 0.008422597249348959, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 2.0346588492393494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20979416370391846, + "step": 19340 + }, + { + "epoch": 0.38684, + "grad_norm": 2.078125, + "grad_norm_var": 0.009399159749348959, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.083326816558838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20702984929084778, + "step": 19342 + }, + { + "epoch": 0.38688, + "grad_norm": 2.0, + "grad_norm_var": 0.012473297119140626, + "learning_rate": 0.0001, + "loss": 4.611, + "loss/crossentropy": 2.4100319147109985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2617071568965912, + "step": 19344 + }, + { + "epoch": 0.38692, + "grad_norm": 1.96875, + "grad_norm_var": 0.012550608317057291, + "learning_rate": 0.0001, + "loss": 3.8864, + "loss/crossentropy": 1.7147992849349976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.174342080950737, + "step": 19346 + }, + { + "epoch": 0.38696, + "grad_norm": 2.046875, + "grad_norm_var": 0.013602701822916667, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 2.041933536529541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024158239364624, + "step": 19348 + }, + { + "epoch": 0.387, + "grad_norm": 1.921875, + "grad_norm_var": 0.012434895833333333, + "learning_rate": 0.0001, + "loss": 4.0929, + "loss/crossentropy": 1.835128128528595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861354559659958, + "step": 19350 + }, + { + "epoch": 0.38704, + "grad_norm": 2.15625, + "grad_norm_var": 0.013370768229166666, + "learning_rate": 0.0001, + "loss": 4.164, + "loss/crossentropy": 1.9472790360450745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19485513865947723, + "step": 19352 + }, + { + "epoch": 0.38708, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012428538004557291, + "learning_rate": 0.0001, + "loss": 4.0217, + "loss/crossentropy": 2.0546197295188904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883516013622284, + "step": 19354 + }, + { + "epoch": 0.38712, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013169097900390624, + "learning_rate": 0.0001, + "loss": 3.6439, + "loss/crossentropy": 1.8285245299339294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730484738945961, + "step": 19356 + }, + { + "epoch": 0.38716, + "grad_norm": 1.984375, + "grad_norm_var": 0.013890584309895834, + "learning_rate": 0.0001, + "loss": 4.2169, + "loss/crossentropy": 2.320949673652649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21517502516508102, + "step": 19358 + }, + { + "epoch": 0.3872, + "grad_norm": 2.03125, + "grad_norm_var": 0.011289215087890625, + "learning_rate": 0.0001, + "loss": 4.0196, + "loss/crossentropy": 2.2944518327713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20343206077814102, + "step": 19360 + }, + { + "epoch": 0.38724, + "grad_norm": 1.96875, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 2.134889602661133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19354654103517532, + "step": 19362 + }, + { + "epoch": 0.38728, + "grad_norm": 1.984375, + "grad_norm_var": 0.009639231363932292, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 2.206334412097931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20670340210199356, + "step": 19364 + }, + { + "epoch": 0.38732, + "grad_norm": 2.0, + "grad_norm_var": 0.009769694010416666, + "learning_rate": 0.0001, + "loss": 3.8194, + "loss/crossentropy": 2.1286932229995728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814791530370712, + "step": 19366 + }, + { + "epoch": 0.38736, + "grad_norm": 1.984375, + "grad_norm_var": 0.0066314697265625, + "learning_rate": 0.0001, + "loss": 4.1533, + "loss/crossentropy": 2.119162678718567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18044983595609665, + "step": 19368 + }, + { + "epoch": 0.3874, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 2.4316102266311646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22509240359067917, + "step": 19370 + }, + { + "epoch": 0.38744, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0060117085774739586, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 2.540325403213501, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22844604402780533, + "step": 19372 + }, + { + "epoch": 0.38748, + "grad_norm": 2.078125, + "grad_norm_var": 0.005782063802083333, + "learning_rate": 0.0001, + "loss": 4.1795, + "loss/crossentropy": 2.3002058267593384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805168688297272, + "step": 19374 + }, + { + "epoch": 0.38752, + "grad_norm": 2.03125, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 4.274, + "loss/crossentropy": 2.2381847500801086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19565512239933014, + "step": 19376 + }, + { + "epoch": 0.38756, + "grad_norm": 2.046875, + "grad_norm_var": 0.0060302734375, + "learning_rate": 0.0001, + "loss": 4.2683, + "loss/crossentropy": 2.101936161518097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198813758790493, + "step": 19378 + }, + { + "epoch": 0.3876, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 3.8679, + "loss/crossentropy": 2.1088311672210693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21237140893936157, + "step": 19380 + }, + { + "epoch": 0.38764, + "grad_norm": 2.0, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 3.9588, + "loss/crossentropy": 2.0985374450683594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979168683290482, + "step": 19382 + }, + { + "epoch": 0.38768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005690256754557292, + "learning_rate": 0.0001, + "loss": 4.0798, + "loss/crossentropy": 2.3004449605941772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053247168660164, + "step": 19384 + }, + { + "epoch": 0.38772, + "grad_norm": 1.9375, + "grad_norm_var": 0.005277252197265625, + "learning_rate": 0.0001, + "loss": 4.0285, + "loss/crossentropy": 2.1257707476615906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222986072301865, + "step": 19386 + }, + { + "epoch": 0.38776, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003733062744140625, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 2.3034613132476807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174137905240059, + "step": 19388 + }, + { + "epoch": 0.3878, + "grad_norm": 1.7578125, + "grad_norm_var": 0.006245930989583333, + "learning_rate": 0.0001, + "loss": 3.9854, + "loss/crossentropy": 2.1303011178970337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22152414172887802, + "step": 19390 + }, + { + "epoch": 0.38784, + "grad_norm": 2.0, + "grad_norm_var": 0.010412343343098958, + "learning_rate": 0.0001, + "loss": 4.1868, + "loss/crossentropy": 1.8575092554092407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18628983944654465, + "step": 19392 + }, + { + "epoch": 0.38788, + "grad_norm": 2.109375, + "grad_norm_var": 0.011694081624348958, + "learning_rate": 0.0001, + "loss": 4.1219, + "loss/crossentropy": 2.0332913994789124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2499927133321762, + "step": 19394 + }, + { + "epoch": 0.38792, + "grad_norm": 2.125, + "grad_norm_var": 0.013004302978515625, + "learning_rate": 0.0001, + "loss": 4.2275, + "loss/crossentropy": 2.0926660895347595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19264871627092361, + "step": 19396 + }, + { + "epoch": 0.38796, + "grad_norm": 2.21875, + "grad_norm_var": 0.0156005859375, + "learning_rate": 0.0001, + "loss": 4.2337, + "loss/crossentropy": 2.08814400434494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137948334217072, + "step": 19398 + }, + { + "epoch": 0.388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.019162750244140624, + "learning_rate": 0.0001, + "loss": 4.0263, + "loss/crossentropy": 2.074121594429016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20524942874908447, + "step": 19400 + }, + { + "epoch": 0.38804, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020589192708333332, + "learning_rate": 0.0001, + "loss": 3.9437, + "loss/crossentropy": 1.8818755745887756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1794123500585556, + "step": 19402 + }, + { + "epoch": 0.38808, + "grad_norm": 1.890625, + "grad_norm_var": 0.020992024739583334, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 2.5050524473190308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22337081283330917, + "step": 19404 + }, + { + "epoch": 0.38812, + "grad_norm": 2.03125, + "grad_norm_var": 0.0180572509765625, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 2.2035861015319824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753300189971924, + "step": 19406 + }, + { + "epoch": 0.38816, + "grad_norm": 1.90625, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 3.8975, + "loss/crossentropy": 2.2270091772079468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20527157932519913, + "step": 19408 + }, + { + "epoch": 0.3882, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013504791259765624, + "learning_rate": 0.0001, + "loss": 3.951, + "loss/crossentropy": 1.6546313762664795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17243140190839767, + "step": 19410 + }, + { + "epoch": 0.38824, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012756093343098959, + "learning_rate": 0.0001, + "loss": 4.0653, + "loss/crossentropy": 1.8858160376548767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18621040880680084, + "step": 19412 + }, + { + "epoch": 0.38828, + "grad_norm": 2.109375, + "grad_norm_var": 0.011875152587890625, + "learning_rate": 0.0001, + "loss": 4.4744, + "loss/crossentropy": 1.9876565337181091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.30138373374938965, + "step": 19414 + }, + { + "epoch": 0.38832, + "grad_norm": 2.1875, + "grad_norm_var": 0.013053131103515626, + "learning_rate": 0.0001, + "loss": 3.7802, + "loss/crossentropy": 2.1887502670288086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910400092601776, + "step": 19416 + }, + { + "epoch": 0.38836, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012511952718098959, + "learning_rate": 0.0001, + "loss": 3.9398, + "loss/crossentropy": 2.246508002281189, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018478363752365, + "step": 19418 + }, + { + "epoch": 0.3884, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011510976155598958, + "learning_rate": 0.0001, + "loss": 4.1703, + "loss/crossentropy": 2.041202425956726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890542358160019, + "step": 19420 + }, + { + "epoch": 0.38844, + "grad_norm": 2.09375, + "grad_norm_var": 0.016904449462890624, + "learning_rate": 0.0001, + "loss": 4.4154, + "loss/crossentropy": 2.0470627546310425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19648576527833939, + "step": 19422 + }, + { + "epoch": 0.38848, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0155181884765625, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.172736167907715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087720036506653, + "step": 19424 + }, + { + "epoch": 0.38852, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015811920166015625, + "learning_rate": 0.0001, + "loss": 4.0106, + "loss/crossentropy": 2.0988662242889404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18553303182125092, + "step": 19426 + }, + { + "epoch": 0.38856, + "grad_norm": 1.8359375, + "grad_norm_var": 0.017134348551432293, + "learning_rate": 0.0001, + "loss": 4.1368, + "loss/crossentropy": 1.821892261505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19670143723487854, + "step": 19428 + }, + { + "epoch": 0.3886, + "grad_norm": 1.875, + "grad_norm_var": 0.016747792561848957, + "learning_rate": 0.0001, + "loss": 3.7878, + "loss/crossentropy": 1.8893300294876099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18850861489772797, + "step": 19430 + }, + { + "epoch": 0.38864, + "grad_norm": 2.984375, + "grad_norm_var": 0.07888997395833333, + "learning_rate": 0.0001, + "loss": 4.243, + "loss/crossentropy": 2.210429847240448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17908993363380432, + "step": 19432 + }, + { + "epoch": 0.38868, + "grad_norm": 2.015625, + "grad_norm_var": 0.07731526692708333, + "learning_rate": 0.0001, + "loss": 3.9939, + "loss/crossentropy": 2.164771556854248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919529214501381, + "step": 19434 + }, + { + "epoch": 0.38872, + "grad_norm": 1.9609375, + "grad_norm_var": 0.07656631469726563, + "learning_rate": 0.0001, + "loss": 4.1721, + "loss/crossentropy": 2.1335190534591675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263372465968132, + "step": 19436 + }, + { + "epoch": 0.38876, + "grad_norm": 1.9609375, + "grad_norm_var": 0.073779296875, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.199760317802429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560061931610107, + "step": 19438 + }, + { + "epoch": 0.3888, + "grad_norm": 1.8125, + "grad_norm_var": 0.07665608723958334, + "learning_rate": 0.0001, + "loss": 3.9367, + "loss/crossentropy": 2.2948896884918213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016046792268753, + "step": 19440 + }, + { + "epoch": 0.38884, + "grad_norm": 1.921875, + "grad_norm_var": 0.07588882446289062, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.120785415172577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18029560148715973, + "step": 19442 + }, + { + "epoch": 0.38888, + "grad_norm": 1.875, + "grad_norm_var": 0.07493260701497396, + "learning_rate": 0.0001, + "loss": 3.8872, + "loss/crossentropy": 2.049036145210266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056373089551926, + "step": 19444 + }, + { + "epoch": 0.38892, + "grad_norm": 2.078125, + "grad_norm_var": 0.07226130167643229, + "learning_rate": 0.0001, + "loss": 4.2652, + "loss/crossentropy": 2.0076504945755005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20951516926288605, + "step": 19446 + }, + { + "epoch": 0.38896, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006740061442057291, + "learning_rate": 0.0001, + "loss": 3.8178, + "loss/crossentropy": 1.9398083090782166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21217594295740128, + "step": 19448 + }, + { + "epoch": 0.389, + "grad_norm": 1.875, + "grad_norm_var": 0.00546875, + "learning_rate": 0.0001, + "loss": 4.0245, + "loss/crossentropy": 2.0976104736328125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19852425903081894, + "step": 19450 + }, + { + "epoch": 0.38904, + "grad_norm": 7.625, + "grad_norm_var": 2.023509724934896, + "learning_rate": 0.0001, + "loss": 3.9741, + "loss/crossentropy": 1.8561761379241943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17980807274580002, + "step": 19452 + }, + { + "epoch": 0.38908, + "grad_norm": 1.859375, + "grad_norm_var": 2.0183570861816404, + "learning_rate": 0.0001, + "loss": 3.913, + "loss/crossentropy": 2.1107255816459656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20205769687891006, + "step": 19454 + }, + { + "epoch": 0.38912, + "grad_norm": 1.8203125, + "grad_norm_var": 2.009368642171224, + "learning_rate": 0.0001, + "loss": 3.9216, + "loss/crossentropy": 2.018397331237793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721628934144974, + "step": 19456 + }, + { + "epoch": 0.38916, + "grad_norm": 1.84375, + "grad_norm_var": 2.010087076822917, + "learning_rate": 0.0001, + "loss": 3.8738, + "loss/crossentropy": 2.016912341117859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20780544728040695, + "step": 19458 + }, + { + "epoch": 0.3892, + "grad_norm": 1.8671875, + "grad_norm_var": 2.0083984375, + "learning_rate": 0.0001, + "loss": 3.9523, + "loss/crossentropy": 1.945086121559143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19544267654418945, + "step": 19460 + }, + { + "epoch": 0.38924, + "grad_norm": 1.9375, + "grad_norm_var": 2.01300048828125, + "learning_rate": 0.0001, + "loss": 4.1854, + "loss/crossentropy": 2.059622883796692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21501117944717407, + "step": 19462 + }, + { + "epoch": 0.38928, + "grad_norm": 1.96875, + "grad_norm_var": 2.024466705322266, + "learning_rate": 0.0001, + "loss": 3.8336, + "loss/crossentropy": 1.8915096521377563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1852220892906189, + "step": 19464 + }, + { + "epoch": 0.38932, + "grad_norm": 2.015625, + "grad_norm_var": 2.0162534077962238, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 1.5394552946090698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16873380541801453, + "step": 19466 + }, + { + "epoch": 0.38936, + "grad_norm": 1.921875, + "grad_norm_var": 0.03472468058268229, + "learning_rate": 0.0001, + "loss": 4.2426, + "loss/crossentropy": 2.0290380716323853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19913609325885773, + "step": 19468 + }, + { + "epoch": 0.3894, + "grad_norm": 2.03125, + "grad_norm_var": 0.033841705322265624, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 2.208653211593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190569207072258, + "step": 19470 + }, + { + "epoch": 0.38944, + "grad_norm": 1.9375, + "grad_norm_var": 0.0092041015625, + "learning_rate": 0.0001, + "loss": 4.2646, + "loss/crossentropy": 2.2873799800872803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20589639246463776, + "step": 19472 + }, + { + "epoch": 0.38948, + "grad_norm": 1.953125, + "grad_norm_var": 0.008014933268229166, + "learning_rate": 0.0001, + "loss": 4.1662, + "loss/crossentropy": 2.152024030685425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20272599160671234, + "step": 19474 + }, + { + "epoch": 0.38952, + "grad_norm": 1.96875, + "grad_norm_var": 0.009374745686848958, + "learning_rate": 0.0001, + "loss": 3.9932, + "loss/crossentropy": 2.0079659819602966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19110675901174545, + "step": 19476 + }, + { + "epoch": 0.38956, + "grad_norm": 2.078125, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 4.1061, + "loss/crossentropy": 2.3006476163864136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253967523574829, + "step": 19478 + }, + { + "epoch": 0.3896, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010389963785807291, + "learning_rate": 0.0001, + "loss": 4.0678, + "loss/crossentropy": 2.1429306864738464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20268720388412476, + "step": 19480 + }, + { + "epoch": 0.38964, + "grad_norm": 2.03125, + "grad_norm_var": 0.011417643229166666, + "learning_rate": 0.0001, + "loss": 4.0581, + "loss/crossentropy": 2.260764956474304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121199518442154, + "step": 19482 + }, + { + "epoch": 0.38968, + "grad_norm": 2.015625, + "grad_norm_var": 0.010636393229166667, + "learning_rate": 0.0001, + "loss": 4.1607, + "loss/crossentropy": 2.3874053955078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2229148969054222, + "step": 19484 + }, + { + "epoch": 0.38972, + "grad_norm": 2.140625, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.1456, + "loss/crossentropy": 2.2740933895111084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2509172707796097, + "step": 19486 + }, + { + "epoch": 0.38976, + "grad_norm": 1.921875, + "grad_norm_var": 0.009679921468098958, + "learning_rate": 0.0001, + "loss": 4.1831, + "loss/crossentropy": 2.1949650049209595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19400373846292496, + "step": 19488 + }, + { + "epoch": 0.3898, + "grad_norm": 1.96875, + "grad_norm_var": 0.009549713134765625, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.4600969552993774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15833207219839096, + "step": 19490 + }, + { + "epoch": 0.38984, + "grad_norm": 2.453125, + "grad_norm_var": 0.019913482666015624, + "learning_rate": 0.0001, + "loss": 4.1694, + "loss/crossentropy": 2.0005985498428345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20003372430801392, + "step": 19492 + }, + { + "epoch": 0.38988, + "grad_norm": 1.84375, + "grad_norm_var": 0.020304107666015626, + "learning_rate": 0.0001, + "loss": 3.8336, + "loss/crossentropy": 1.9609830379486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19764738529920578, + "step": 19494 + }, + { + "epoch": 0.38992, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01943359375, + "learning_rate": 0.0001, + "loss": 3.8708, + "loss/crossentropy": 1.7853738069534302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18979863077402115, + "step": 19496 + }, + { + "epoch": 0.38996, + "grad_norm": 1.953125, + "grad_norm_var": 0.018961588541666668, + "learning_rate": 0.0001, + "loss": 4.0335, + "loss/crossentropy": 2.004499912261963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20765355974435806, + "step": 19498 + }, + { + "epoch": 0.39, + "grad_norm": 2.015625, + "grad_norm_var": 0.019870758056640625, + "learning_rate": 0.0001, + "loss": 4.1277, + "loss/crossentropy": 2.0148558020591736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18088079243898392, + "step": 19500 + }, + { + "epoch": 0.39004, + "grad_norm": 2.140625, + "grad_norm_var": 0.022965240478515624, + "learning_rate": 0.0001, + "loss": 4.1213, + "loss/crossentropy": 2.2602895498275757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22686412185430527, + "step": 19502 + }, + { + "epoch": 0.39008, + "grad_norm": 1.9921875, + "grad_norm_var": 0.024326324462890625, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 2.1789051294326782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989632546901703, + "step": 19504 + }, + { + "epoch": 0.39012, + "grad_norm": 2.03125, + "grad_norm_var": 0.024568684895833335, + "learning_rate": 0.0001, + "loss": 3.9234, + "loss/crossentropy": 1.93650484085083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976509690284729, + "step": 19506 + }, + { + "epoch": 0.39016, + "grad_norm": 1.96875, + "grad_norm_var": 0.008351389567057292, + "learning_rate": 0.0001, + "loss": 3.9519, + "loss/crossentropy": 1.9904287457466125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17271699011325836, + "step": 19508 + }, + { + "epoch": 0.3902, + "grad_norm": 2.03125, + "grad_norm_var": 0.008194986979166667, + "learning_rate": 0.0001, + "loss": 3.7009, + "loss/crossentropy": 1.4440776705741882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16108233481645584, + "step": 19510 + }, + { + "epoch": 0.39024, + "grad_norm": 1.984375, + "grad_norm_var": 0.008111317952473959, + "learning_rate": 0.0001, + "loss": 4.0218, + "loss/crossentropy": 2.143462061882019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024468332529068, + "step": 19512 + }, + { + "epoch": 0.39028, + "grad_norm": 2.03125, + "grad_norm_var": 0.008436838785807291, + "learning_rate": 0.0001, + "loss": 4.1267, + "loss/crossentropy": 2.085771322250366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22135765105485916, + "step": 19514 + }, + { + "epoch": 0.39032, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008337148030598958, + "learning_rate": 0.0001, + "loss": 4.1423, + "loss/crossentropy": 2.251617908477783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026226669549942, + "step": 19516 + }, + { + "epoch": 0.39036, + "grad_norm": 2.03125, + "grad_norm_var": 0.003885650634765625, + "learning_rate": 0.0001, + "loss": 4.3156, + "loss/crossentropy": 2.3808701038360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2263621687889099, + "step": 19518 + }, + { + "epoch": 0.3904, + "grad_norm": 1.8203125, + "grad_norm_var": 0.004019927978515625, + "learning_rate": 0.0001, + "loss": 4.0323, + "loss/crossentropy": 2.000342011451721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20008830726146698, + "step": 19520 + }, + { + "epoch": 0.39044, + "grad_norm": 2.09375, + "grad_norm_var": 0.0044830322265625, + "learning_rate": 0.0001, + "loss": 4.3285, + "loss/crossentropy": 2.242555856704712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2443876415491104, + "step": 19522 + }, + { + "epoch": 0.39048, + "grad_norm": 2.0625, + "grad_norm_var": 0.005060831705729167, + "learning_rate": 0.0001, + "loss": 4.344, + "loss/crossentropy": 2.2867462635040283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522662043571472, + "step": 19524 + }, + { + "epoch": 0.39052, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0053955078125, + "learning_rate": 0.0001, + "loss": 4.0117, + "loss/crossentropy": 2.1892699003219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205935537815094, + "step": 19526 + }, + { + "epoch": 0.39056, + "grad_norm": 1.890625, + "grad_norm_var": 0.007972971598307291, + "learning_rate": 0.0001, + "loss": 3.8673, + "loss/crossentropy": 2.182734966278076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980958804488182, + "step": 19528 + }, + { + "epoch": 0.3906, + "grad_norm": 1.96875, + "grad_norm_var": 0.0075927734375, + "learning_rate": 0.0001, + "loss": 4.0193, + "loss/crossentropy": 2.3624355792999268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833774983882904, + "step": 19530 + }, + { + "epoch": 0.39064, + "grad_norm": 1.875, + "grad_norm_var": 0.007671864827473959, + "learning_rate": 0.0001, + "loss": 4.0497, + "loss/crossentropy": 2.037912607192993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032359093427658, + "step": 19532 + }, + { + "epoch": 0.39068, + "grad_norm": 2.015625, + "grad_norm_var": 0.007513173421223958, + "learning_rate": 0.0001, + "loss": 3.9812, + "loss/crossentropy": 2.076040804386139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21829531341791153, + "step": 19534 + }, + { + "epoch": 0.39072, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0058502197265625, + "learning_rate": 0.0001, + "loss": 4.0407, + "loss/crossentropy": 2.2695836424827576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125643789768219, + "step": 19536 + }, + { + "epoch": 0.39076, + "grad_norm": 2.078125, + "grad_norm_var": 0.0061279296875, + "learning_rate": 0.0001, + "loss": 4.2664, + "loss/crossentropy": 1.9343088269233704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045460194349289, + "step": 19538 + }, + { + "epoch": 0.3908, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005272420247395834, + "learning_rate": 0.0001, + "loss": 4.0757, + "loss/crossentropy": 1.9968918561935425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17788998782634735, + "step": 19540 + }, + { + "epoch": 0.39084, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005467732747395833, + "learning_rate": 0.0001, + "loss": 4.094, + "loss/crossentropy": 2.157910704612732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19516880810260773, + "step": 19542 + }, + { + "epoch": 0.39088, + "grad_norm": 2.0, + "grad_norm_var": 0.004042307535807292, + "learning_rate": 0.0001, + "loss": 4.1056, + "loss/crossentropy": 2.1784998178482056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20280858874320984, + "step": 19544 + }, + { + "epoch": 0.39092, + "grad_norm": 2.03125, + "grad_norm_var": 0.00438232421875, + "learning_rate": 0.0001, + "loss": 4.0796, + "loss/crossentropy": 2.3444844484329224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2418832629919052, + "step": 19546 + }, + { + "epoch": 0.39096, + "grad_norm": 1.96875, + "grad_norm_var": 0.0038083394368489585, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 1.90557062625885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010379433631897, + "step": 19548 + }, + { + "epoch": 0.391, + "grad_norm": 1.90625, + "grad_norm_var": 0.004107411702473958, + "learning_rate": 0.0001, + "loss": 3.7756, + "loss/crossentropy": 1.4939787983894348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1362278200685978, + "step": 19550 + }, + { + "epoch": 0.39104, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0063517252604166664, + "learning_rate": 0.0001, + "loss": 4.1738, + "loss/crossentropy": 2.037553310394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159510999917984, + "step": 19552 + }, + { + "epoch": 0.39108, + "grad_norm": 1.9375, + "grad_norm_var": 0.004816691080729167, + "learning_rate": 0.0001, + "loss": 3.887, + "loss/crossentropy": 1.9987242221832275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19919036328792572, + "step": 19554 + }, + { + "epoch": 0.39112, + "grad_norm": 1.90625, + "grad_norm_var": 0.0048906962076822914, + "learning_rate": 0.0001, + "loss": 4.0298, + "loss/crossentropy": 2.1593196392059326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21370293200016022, + "step": 19556 + }, + { + "epoch": 0.39116, + "grad_norm": 1.9375, + "grad_norm_var": 0.005631510416666667, + "learning_rate": 0.0001, + "loss": 3.8348, + "loss/crossentropy": 1.8323914408683777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20382094383239746, + "step": 19558 + }, + { + "epoch": 0.3912, + "grad_norm": 1.9375, + "grad_norm_var": 0.0053059895833333336, + "learning_rate": 0.0001, + "loss": 4.1138, + "loss/crossentropy": 2.0240999460220337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19374582171440125, + "step": 19560 + }, + { + "epoch": 0.39124, + "grad_norm": 2.0, + "grad_norm_var": 0.005304972330729167, + "learning_rate": 0.0001, + "loss": 4.4349, + "loss/crossentropy": 2.377955436706543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2213267982006073, + "step": 19562 + }, + { + "epoch": 0.39128, + "grad_norm": 2.015625, + "grad_norm_var": 0.006154123942057292, + "learning_rate": 0.0001, + "loss": 4.0878, + "loss/crossentropy": 1.724283754825592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20119256526231766, + "step": 19564 + }, + { + "epoch": 0.39132, + "grad_norm": 1.953125, + "grad_norm_var": 0.005655670166015625, + "learning_rate": 0.0001, + "loss": 3.9066, + "loss/crossentropy": 2.0352718234062195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19105417281389236, + "step": 19566 + }, + { + "epoch": 0.39136, + "grad_norm": 1.875, + "grad_norm_var": 0.0037913004557291667, + "learning_rate": 0.0001, + "loss": 3.7868, + "loss/crossentropy": 2.078265905380249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19992397725582123, + "step": 19568 + }, + { + "epoch": 0.3914, + "grad_norm": 1.859375, + "grad_norm_var": 0.004329172770182291, + "learning_rate": 0.0001, + "loss": 3.8276, + "loss/crossentropy": 1.9722577929496765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17697854340076447, + "step": 19570 + }, + { + "epoch": 0.39144, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004915364583333333, + "learning_rate": 0.0001, + "loss": 4.0587, + "loss/crossentropy": 2.2238826751708984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20041973888874054, + "step": 19572 + }, + { + "epoch": 0.39148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0037923177083333333, + "learning_rate": 0.0001, + "loss": 4.0226, + "loss/crossentropy": 1.8374757170677185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18086174875497818, + "step": 19574 + }, + { + "epoch": 0.39152, + "grad_norm": 2.140625, + "grad_norm_var": 0.00635986328125, + "learning_rate": 0.0001, + "loss": 4.2903, + "loss/crossentropy": 2.0642590522766113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19544780254364014, + "step": 19576 + }, + { + "epoch": 0.39156, + "grad_norm": 2.421875, + "grad_norm_var": 0.020493316650390624, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 2.090702533721924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19859656691551208, + "step": 19578 + }, + { + "epoch": 0.3916, + "grad_norm": 2.1875, + "grad_norm_var": 0.023538970947265626, + "learning_rate": 0.0001, + "loss": 3.9075, + "loss/crossentropy": 1.8087154030799866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918642893433571, + "step": 19580 + }, + { + "epoch": 0.39164, + "grad_norm": 1.9140625, + "grad_norm_var": 0.023527018229166665, + "learning_rate": 0.0001, + "loss": 3.8869, + "loss/crossentropy": 2.0180618166923523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19190336763858795, + "step": 19582 + }, + { + "epoch": 0.39168, + "grad_norm": 2.171875, + "grad_norm_var": 0.025886027018229167, + "learning_rate": 0.0001, + "loss": 4.2298, + "loss/crossentropy": 1.980249285697937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591172575950623, + "step": 19584 + }, + { + "epoch": 0.39172, + "grad_norm": 1.8359375, + "grad_norm_var": 0.026374308268229167, + "learning_rate": 0.0001, + "loss": 3.9883, + "loss/crossentropy": 2.1208410263061523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714347064495087, + "step": 19586 + }, + { + "epoch": 0.39176, + "grad_norm": 2.046875, + "grad_norm_var": 0.024179840087890626, + "learning_rate": 0.0001, + "loss": 4.3101, + "loss/crossentropy": 2.351140856742859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26494763791561127, + "step": 19588 + }, + { + "epoch": 0.3918, + "grad_norm": 2.125, + "grad_norm_var": 0.024448394775390625, + "learning_rate": 0.0001, + "loss": 4.1259, + "loss/crossentropy": 1.9821222424507141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062402218580246, + "step": 19590 + }, + { + "epoch": 0.39184, + "grad_norm": 2.015625, + "grad_norm_var": 0.02462158203125, + "learning_rate": 0.0001, + "loss": 4.1373, + "loss/crossentropy": 2.363589644432068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22435829043388367, + "step": 19592 + }, + { + "epoch": 0.39188, + "grad_norm": 1.984375, + "grad_norm_var": 0.013963826497395833, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 1.925455391407013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17338209599256516, + "step": 19594 + }, + { + "epoch": 0.39192, + "grad_norm": 2.109375, + "grad_norm_var": 0.012254842122395833, + "learning_rate": 0.0001, + "loss": 3.9777, + "loss/crossentropy": 2.176175117492676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21482165157794952, + "step": 19596 + }, + { + "epoch": 0.39196, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014330037434895833, + "learning_rate": 0.0001, + "loss": 3.9266, + "loss/crossentropy": 2.0013960003852844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20629072189331055, + "step": 19598 + }, + { + "epoch": 0.392, + "grad_norm": 1.859375, + "grad_norm_var": 0.011797841389973958, + "learning_rate": 0.0001, + "loss": 3.9402, + "loss/crossentropy": 2.066355049610138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155777364969254, + "step": 19600 + }, + { + "epoch": 0.39204, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010809071858723958, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 2.0513144731521606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442218542099, + "step": 19602 + }, + { + "epoch": 0.39208, + "grad_norm": 2.046875, + "grad_norm_var": 0.010896809895833333, + "learning_rate": 0.0001, + "loss": 4.2601, + "loss/crossentropy": 1.9635959267616272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168906450271606, + "step": 19604 + }, + { + "epoch": 0.39212, + "grad_norm": 1.890625, + "grad_norm_var": 0.009025065104166667, + "learning_rate": 0.0001, + "loss": 3.9406, + "loss/crossentropy": 1.9395795464515686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17990338802337646, + "step": 19606 + }, + { + "epoch": 0.39216, + "grad_norm": 2.125, + "grad_norm_var": 0.011004384358723958, + "learning_rate": 0.0001, + "loss": 4.1936, + "loss/crossentropy": 2.111960232257843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20662778615951538, + "step": 19608 + }, + { + "epoch": 0.3922, + "grad_norm": 1.8125, + "grad_norm_var": 0.011466471354166667, + "learning_rate": 0.0001, + "loss": 3.9238, + "loss/crossentropy": 1.855182707309723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19665290415287018, + "step": 19610 + }, + { + "epoch": 0.39224, + "grad_norm": 2.0, + "grad_norm_var": 0.0156494140625, + "learning_rate": 0.0001, + "loss": 4.262, + "loss/crossentropy": 2.1587076783180237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20080996304750443, + "step": 19612 + }, + { + "epoch": 0.39228, + "grad_norm": 2.015625, + "grad_norm_var": 0.01597874959309896, + "learning_rate": 0.0001, + "loss": 4.1227, + "loss/crossentropy": 2.130104422569275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19484283030033112, + "step": 19614 + }, + { + "epoch": 0.39232, + "grad_norm": 1.75, + "grad_norm_var": 0.017992146809895835, + "learning_rate": 0.0001, + "loss": 3.9046, + "loss/crossentropy": 1.6983461380004883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18040503561496735, + "step": 19616 + }, + { + "epoch": 0.39236, + "grad_norm": 1.953125, + "grad_norm_var": 0.017756144205729168, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 2.0672999024391174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20455461740493774, + "step": 19618 + }, + { + "epoch": 0.3924, + "grad_norm": 1.84375, + "grad_norm_var": 0.018202463785807293, + "learning_rate": 0.0001, + "loss": 3.83, + "loss/crossentropy": 2.0494508743286133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18307264149188995, + "step": 19620 + }, + { + "epoch": 0.39244, + "grad_norm": 1.875, + "grad_norm_var": 0.018277740478515624, + "learning_rate": 0.0001, + "loss": 3.9901, + "loss/crossentropy": 1.683717966079712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16951359808444977, + "step": 19622 + }, + { + "epoch": 0.39248, + "grad_norm": 1.859375, + "grad_norm_var": 0.015197499593098959, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 2.62760066986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22649705410003662, + "step": 19624 + }, + { + "epoch": 0.39252, + "grad_norm": 2.296875, + "grad_norm_var": 0.024072011311848957, + "learning_rate": 0.0001, + "loss": 4.19, + "loss/crossentropy": 1.857836663722992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827384978532791, + "step": 19626 + }, + { + "epoch": 0.39256, + "grad_norm": 2.0, + "grad_norm_var": 0.016108957926432292, + "learning_rate": 0.0001, + "loss": 3.7635, + "loss/crossentropy": 2.1203905940055847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20033711194992065, + "step": 19628 + }, + { + "epoch": 0.3926, + "grad_norm": 2.125, + "grad_norm_var": 0.01721165974934896, + "learning_rate": 0.0001, + "loss": 4.4474, + "loss/crossentropy": 2.378189444541931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892338454723358, + "step": 19630 + }, + { + "epoch": 0.39264, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.896558940410614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18873175233602524, + "step": 19632 + }, + { + "epoch": 0.39268, + "grad_norm": 2.109375, + "grad_norm_var": 0.020798492431640624, + "learning_rate": 0.0001, + "loss": 4.3611, + "loss/crossentropy": 2.3503568172454834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23128806054592133, + "step": 19634 + }, + { + "epoch": 0.39272, + "grad_norm": 1.7578125, + "grad_norm_var": 0.021345011393229165, + "learning_rate": 0.0001, + "loss": 4.0515, + "loss/crossentropy": 2.081954002380371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20059917122125626, + "step": 19636 + }, + { + "epoch": 0.39276, + "grad_norm": 2.015625, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 4.0986, + "loss/crossentropy": 1.9573910236358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903771460056305, + "step": 19638 + }, + { + "epoch": 0.3928, + "grad_norm": 1.859375, + "grad_norm_var": 0.022025299072265626, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 2.098384141921997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19905343651771545, + "step": 19640 + }, + { + "epoch": 0.39284, + "grad_norm": 2.0, + "grad_norm_var": 0.015892537434895833, + "learning_rate": 0.0001, + "loss": 4.0854, + "loss/crossentropy": 1.8510947227478027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18955568969249725, + "step": 19642 + }, + { + "epoch": 0.39288, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016440582275390626, + "learning_rate": 0.0001, + "loss": 4.2948, + "loss/crossentropy": 2.2839618921279907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23230211436748505, + "step": 19644 + }, + { + "epoch": 0.39292, + "grad_norm": 1.953125, + "grad_norm_var": 0.014778391520182291, + "learning_rate": 0.0001, + "loss": 4.0112, + "loss/crossentropy": 1.8055492639541626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515839755535126, + "step": 19646 + }, + { + "epoch": 0.39296, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013348134358723958, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 1.6756115555763245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17216521501541138, + "step": 19648 + }, + { + "epoch": 0.393, + "grad_norm": 1.984375, + "grad_norm_var": 0.007287343343098958, + "learning_rate": 0.0001, + "loss": 4.4001, + "loss/crossentropy": 2.3247755765914917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122244000434875, + "step": 19650 + }, + { + "epoch": 0.39304, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004044596354166667, + "learning_rate": 0.0001, + "loss": 3.895, + "loss/crossentropy": 1.9654970169067383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20018797367811203, + "step": 19652 + }, + { + "epoch": 0.39308, + "grad_norm": 1.96875, + "grad_norm_var": 0.0029042561848958335, + "learning_rate": 0.0001, + "loss": 3.9018, + "loss/crossentropy": 1.9368168115615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19478464126586914, + "step": 19654 + }, + { + "epoch": 0.39312, + "grad_norm": 2.03125, + "grad_norm_var": 0.0023671468098958332, + "learning_rate": 0.0001, + "loss": 3.7869, + "loss/crossentropy": 2.099206328392029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20060646533966064, + "step": 19656 + }, + { + "epoch": 0.39316, + "grad_norm": 1.890625, + "grad_norm_var": 0.002561187744140625, + "learning_rate": 0.0001, + "loss": 4.3438, + "loss/crossentropy": 2.1585338711738586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20852376520633698, + "step": 19658 + }, + { + "epoch": 0.3932, + "grad_norm": 1.796875, + "grad_norm_var": 0.0044830322265625, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 2.285220742225647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983182057738304, + "step": 19660 + }, + { + "epoch": 0.39324, + "grad_norm": 2.03125, + "grad_norm_var": 0.0047190348307291664, + "learning_rate": 0.0001, + "loss": 4.0046, + "loss/crossentropy": 1.955579936504364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165720373392105, + "step": 19662 + }, + { + "epoch": 0.39328, + "grad_norm": 1.8046875, + "grad_norm_var": 0.00645751953125, + "learning_rate": 0.0001, + "loss": 3.823, + "loss/crossentropy": 1.9925233721733093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18789846450090408, + "step": 19664 + }, + { + "epoch": 0.39332, + "grad_norm": 1.984375, + "grad_norm_var": 0.006490071614583333, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 1.9267281293869019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19545376300811768, + "step": 19666 + }, + { + "epoch": 0.39336, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 2.179477632045746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430068880319595, + "step": 19668 + }, + { + "epoch": 0.3934, + "grad_norm": 2.046875, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 2.1411179900169373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276124805212021, + "step": 19670 + }, + { + "epoch": 0.39344, + "grad_norm": 2.3125, + "grad_norm_var": 0.014438629150390625, + "learning_rate": 0.0001, + "loss": 4.2689, + "loss/crossentropy": 2.085427463054657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21856296062469482, + "step": 19672 + }, + { + "epoch": 0.39348, + "grad_norm": 2.09375, + "grad_norm_var": 0.014495595296223959, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 1.7399682402610779, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16918571293354034, + "step": 19674 + }, + { + "epoch": 0.39352, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011356608072916666, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.8636209964752197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654836922883987, + "step": 19676 + }, + { + "epoch": 0.39356, + "grad_norm": 1.7421875, + "grad_norm_var": 0.015290323893229167, + "learning_rate": 0.0001, + "loss": 4.0157, + "loss/crossentropy": 2.143825590610504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125091552734375, + "step": 19678 + }, + { + "epoch": 0.3936, + "grad_norm": 1.6953125, + "grad_norm_var": 0.01873753865559896, + "learning_rate": 0.0001, + "loss": 3.7692, + "loss/crossentropy": 1.8222747445106506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17713025212287903, + "step": 19680 + }, + { + "epoch": 0.39364, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01969172159830729, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.7044820189476013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17669418454170227, + "step": 19682 + }, + { + "epoch": 0.39368, + "grad_norm": 1.953125, + "grad_norm_var": 0.020411936442057292, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 2.144485831260681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20580272376537323, + "step": 19684 + }, + { + "epoch": 0.39372, + "grad_norm": 1.984375, + "grad_norm_var": 0.019774373372395834, + "learning_rate": 0.0001, + "loss": 4.0994, + "loss/crossentropy": 2.17536997795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21704821288585663, + "step": 19686 + }, + { + "epoch": 0.39376, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012963612874348959, + "learning_rate": 0.0001, + "loss": 4.2592, + "loss/crossentropy": 2.1875303983688354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996757537126541, + "step": 19688 + }, + { + "epoch": 0.3938, + "grad_norm": 2.125, + "grad_norm_var": 0.014121246337890626, + "learning_rate": 0.0001, + "loss": 4.1943, + "loss/crossentropy": 2.270558476448059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23406681418418884, + "step": 19690 + }, + { + "epoch": 0.39384, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015860748291015626, + "learning_rate": 0.0001, + "loss": 4.2097, + "loss/crossentropy": 2.0614060163497925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21256985515356064, + "step": 19692 + }, + { + "epoch": 0.39388, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015091705322265624, + "learning_rate": 0.0001, + "loss": 4.042, + "loss/crossentropy": 2.1036725640296936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555094093084335, + "step": 19694 + }, + { + "epoch": 0.39392, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010396321614583334, + "learning_rate": 0.0001, + "loss": 3.7702, + "loss/crossentropy": 1.8713775277137756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973343789577484, + "step": 19696 + }, + { + "epoch": 0.39396, + "grad_norm": 1.921875, + "grad_norm_var": 0.009411366780598958, + "learning_rate": 0.0001, + "loss": 3.9581, + "loss/crossentropy": 1.9986143708229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925104334950447, + "step": 19698 + }, + { + "epoch": 0.394, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 3.8134, + "loss/crossentropy": 1.8336694836616516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18453969806432724, + "step": 19700 + }, + { + "epoch": 0.39404, + "grad_norm": 1.9375, + "grad_norm_var": 0.01065673828125, + "learning_rate": 0.0001, + "loss": 3.9279, + "loss/crossentropy": 1.8948233723640442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17115242034196854, + "step": 19702 + }, + { + "epoch": 0.39408, + "grad_norm": 1.984375, + "grad_norm_var": 0.008958943684895833, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.5506935119628906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22056522965431213, + "step": 19704 + }, + { + "epoch": 0.39412, + "grad_norm": 1.984375, + "grad_norm_var": 0.02890625, + "learning_rate": 0.0001, + "loss": 4.1832, + "loss/crossentropy": 2.127421021461487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20175430178642273, + "step": 19706 + }, + { + "epoch": 0.39416, + "grad_norm": 1.9765625, + "grad_norm_var": 0.027197265625, + "learning_rate": 0.0001, + "loss": 4.0251, + "loss/crossentropy": 2.246693968772888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22178302705287933, + "step": 19708 + }, + { + "epoch": 0.3942, + "grad_norm": 1.6953125, + "grad_norm_var": 0.03328221638997396, + "learning_rate": 0.0001, + "loss": 3.9336, + "loss/crossentropy": 1.970005750656128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18967190384864807, + "step": 19710 + }, + { + "epoch": 0.39424, + "grad_norm": 1.921875, + "grad_norm_var": 0.032956695556640624, + "learning_rate": 0.0001, + "loss": 4.1035, + "loss/crossentropy": 1.944337785243988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952769234776497, + "step": 19712 + }, + { + "epoch": 0.39428, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0328277587890625, + "learning_rate": 0.0001, + "loss": 4.2289, + "loss/crossentropy": 2.291213870048523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077646553516388, + "step": 19714 + }, + { + "epoch": 0.39432, + "grad_norm": 2.015625, + "grad_norm_var": 0.03361790974934896, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 2.404030203819275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200116366147995, + "step": 19716 + }, + { + "epoch": 0.39436, + "grad_norm": 2.109375, + "grad_norm_var": 0.032364908854166666, + "learning_rate": 0.0001, + "loss": 3.8609, + "loss/crossentropy": 1.9242625832557678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19318503141403198, + "step": 19718 + }, + { + "epoch": 0.3944, + "grad_norm": 1.890625, + "grad_norm_var": 0.03240966796875, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 2.319575071334839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21668671071529388, + "step": 19720 + }, + { + "epoch": 0.39444, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012287394205729166, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 2.0032835006713867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19786667823791504, + "step": 19722 + }, + { + "epoch": 0.39448, + "grad_norm": 1.7734375, + "grad_norm_var": 0.0146636962890625, + "learning_rate": 0.0001, + "loss": 3.9509, + "loss/crossentropy": 1.9524562358856201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18965402245521545, + "step": 19724 + }, + { + "epoch": 0.39452, + "grad_norm": 1.890625, + "grad_norm_var": 0.007521311442057292, + "learning_rate": 0.0001, + "loss": 3.5677, + "loss/crossentropy": 1.5887231826782227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1512351706624031, + "step": 19726 + }, + { + "epoch": 0.39456, + "grad_norm": 2.046875, + "grad_norm_var": 0.008576456705729167, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 2.107685923576355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18645642697811127, + "step": 19728 + }, + { + "epoch": 0.3946, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009028879801432292, + "learning_rate": 0.0001, + "loss": 3.9594, + "loss/crossentropy": 2.3551766872406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207053080201149, + "step": 19730 + }, + { + "epoch": 0.39464, + "grad_norm": 2.0625, + "grad_norm_var": 0.010847727457682291, + "learning_rate": 0.0001, + "loss": 4.3542, + "loss/crossentropy": 2.1983554363250732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23001667857170105, + "step": 19732 + }, + { + "epoch": 0.39468, + "grad_norm": 2.03125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 4.3158, + "loss/crossentropy": 2.0236815214157104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230765849351883, + "step": 19734 + }, + { + "epoch": 0.39472, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010529581705729167, + "learning_rate": 0.0001, + "loss": 4.0511, + "loss/crossentropy": 2.2027645111083984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864710971713066, + "step": 19736 + }, + { + "epoch": 0.39476, + "grad_norm": 2.15625, + "grad_norm_var": 0.013152821858723959, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 1.921549379825592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21788031607866287, + "step": 19738 + }, + { + "epoch": 0.3948, + "grad_norm": 1.953125, + "grad_norm_var": 0.010416666666666666, + "learning_rate": 0.0001, + "loss": 3.8353, + "loss/crossentropy": 2.121790587902069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21380367130041122, + "step": 19740 + }, + { + "epoch": 0.39484, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008217112223307291, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 1.9595564007759094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17919503152370453, + "step": 19742 + }, + { + "epoch": 0.39488, + "grad_norm": 2.140625, + "grad_norm_var": 0.010993448893229167, + "learning_rate": 0.0001, + "loss": 3.9935, + "loss/crossentropy": 2.155984342098236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002984657883644, + "step": 19744 + }, + { + "epoch": 0.39492, + "grad_norm": 1.7890625, + "grad_norm_var": 0.012589518229166667, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.766005277633667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1717890352010727, + "step": 19746 + }, + { + "epoch": 0.39496, + "grad_norm": 2.015625, + "grad_norm_var": 0.010424550374348958, + "learning_rate": 0.0001, + "loss": 3.9178, + "loss/crossentropy": 1.8891428112983704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965286061167717, + "step": 19748 + }, + { + "epoch": 0.395, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010343170166015625, + "learning_rate": 0.0001, + "loss": 4.3124, + "loss/crossentropy": 1.9521282315254211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21058151125907898, + "step": 19750 + }, + { + "epoch": 0.39504, + "grad_norm": 1.8515625, + "grad_norm_var": 0.014711252848307292, + "learning_rate": 0.0001, + "loss": 4.2242, + "loss/crossentropy": 2.3142699003219604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24145027250051498, + "step": 19752 + }, + { + "epoch": 0.39508, + "grad_norm": 1.9375, + "grad_norm_var": 0.011777496337890625, + "learning_rate": 0.0001, + "loss": 4.1059, + "loss/crossentropy": 2.0375224351882935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19370558112859726, + "step": 19754 + }, + { + "epoch": 0.39512, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011962890625, + "learning_rate": 0.0001, + "loss": 4.2367, + "loss/crossentropy": 2.193490743637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012534961104393, + "step": 19756 + }, + { + "epoch": 0.39516, + "grad_norm": 2.015625, + "grad_norm_var": 0.011993153889973959, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.2397992610931396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738413393497467, + "step": 19758 + }, + { + "epoch": 0.3952, + "grad_norm": 1.90625, + "grad_norm_var": 0.010223134358723959, + "learning_rate": 0.0001, + "loss": 3.9153, + "loss/crossentropy": 1.7602161169052124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18599014729261398, + "step": 19760 + }, + { + "epoch": 0.39524, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008640289306640625, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 2.536779046058655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301289290189743, + "step": 19762 + }, + { + "epoch": 0.39528, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008965810139973959, + "learning_rate": 0.0001, + "loss": 3.9999, + "loss/crossentropy": 2.3237764835357666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476060152053833, + "step": 19764 + }, + { + "epoch": 0.39532, + "grad_norm": 2.0625, + "grad_norm_var": 0.011107381184895833, + "learning_rate": 0.0001, + "loss": 4.2102, + "loss/crossentropy": 2.370723605155945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24277979880571365, + "step": 19766 + }, + { + "epoch": 0.39536, + "grad_norm": 2.09375, + "grad_norm_var": 0.006917063395182292, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 1.6399320363998413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740519106388092, + "step": 19768 + }, + { + "epoch": 0.3954, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02394383748372396, + "learning_rate": 0.0001, + "loss": 3.6633, + "loss/crossentropy": 1.8633801341056824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810051053762436, + "step": 19770 + }, + { + "epoch": 0.39544, + "grad_norm": 1.953125, + "grad_norm_var": 0.023583984375, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 2.318315625190735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21214767545461655, + "step": 19772 + }, + { + "epoch": 0.39548, + "grad_norm": 1.9921875, + "grad_norm_var": 0.026387532552083332, + "learning_rate": 0.0001, + "loss": 4.1232, + "loss/crossentropy": 2.2803520560264587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21136894822120667, + "step": 19774 + }, + { + "epoch": 0.39552, + "grad_norm": 1.8359375, + "grad_norm_var": 0.026387532552083332, + "learning_rate": 0.0001, + "loss": 3.9034, + "loss/crossentropy": 2.1529780626296997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172463029623032, + "step": 19776 + }, + { + "epoch": 0.39556, + "grad_norm": 2.015625, + "grad_norm_var": 0.025585683186848958, + "learning_rate": 0.0001, + "loss": 4.2851, + "loss/crossentropy": 2.1246083974838257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181987464427948, + "step": 19778 + }, + { + "epoch": 0.3956, + "grad_norm": 1.90625, + "grad_norm_var": 0.029515584309895832, + "learning_rate": 0.0001, + "loss": 4.0097, + "loss/crossentropy": 2.027298629283905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877325028181076, + "step": 19780 + }, + { + "epoch": 0.39564, + "grad_norm": 2.0, + "grad_norm_var": 0.0288482666015625, + "learning_rate": 0.0001, + "loss": 4.2326, + "loss/crossentropy": 1.9338072538375854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1746418997645378, + "step": 19782 + }, + { + "epoch": 0.39568, + "grad_norm": 2.1875, + "grad_norm_var": 0.03104222615559896, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.050130307674408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011866271495819, + "step": 19784 + }, + { + "epoch": 0.39572, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014720662434895834, + "learning_rate": 0.0001, + "loss": 4.3227, + "loss/crossentropy": 2.48315691947937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22591856867074966, + "step": 19786 + }, + { + "epoch": 0.39576, + "grad_norm": 2.046875, + "grad_norm_var": 0.021019490559895833, + "learning_rate": 0.0001, + "loss": 4.3671, + "loss/crossentropy": 2.578279137611389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216324657201767, + "step": 19788 + }, + { + "epoch": 0.3958, + "grad_norm": 1.8828125, + "grad_norm_var": 0.018192291259765625, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 1.7862395644187927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17009516060352325, + "step": 19790 + }, + { + "epoch": 0.39584, + "grad_norm": 1.8828125, + "grad_norm_var": 0.017032877604166666, + "learning_rate": 0.0001, + "loss": 3.8506, + "loss/crossentropy": 2.042613208293915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1844482272863388, + "step": 19792 + }, + { + "epoch": 0.39588, + "grad_norm": 1.7890625, + "grad_norm_var": 0.020926666259765626, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.317251443862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135745733976364, + "step": 19794 + }, + { + "epoch": 0.39592, + "grad_norm": 2.109375, + "grad_norm_var": 0.01964111328125, + "learning_rate": 0.0001, + "loss": 4.0061, + "loss/crossentropy": 1.849199891090393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20549577474594116, + "step": 19796 + }, + { + "epoch": 0.39596, + "grad_norm": 2.015625, + "grad_norm_var": 0.0199859619140625, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 2.1876507997512817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20098386704921722, + "step": 19798 + }, + { + "epoch": 0.396, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01718317667643229, + "learning_rate": 0.0001, + "loss": 4.154, + "loss/crossentropy": 1.9740530848503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494586437940598, + "step": 19800 + }, + { + "epoch": 0.39604, + "grad_norm": 1.9375, + "grad_norm_var": 0.017577107747395834, + "learning_rate": 0.0001, + "loss": 4.1405, + "loss/crossentropy": 2.208943724632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22833283245563507, + "step": 19802 + }, + { + "epoch": 0.39608, + "grad_norm": 1.96875, + "grad_norm_var": 0.009488677978515625, + "learning_rate": 0.0001, + "loss": 3.9082, + "loss/crossentropy": 2.024399518966675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19786083698272705, + "step": 19804 + }, + { + "epoch": 0.39612, + "grad_norm": 2.03125, + "grad_norm_var": 0.009356435139973958, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 2.1283441185951233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21942409127950668, + "step": 19806 + }, + { + "epoch": 0.39616, + "grad_norm": 1.953125, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.3675626516342163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22466859221458435, + "step": 19808 + }, + { + "epoch": 0.3962, + "grad_norm": 2.078125, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 3.8682, + "loss/crossentropy": 1.886322796344757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904991939663887, + "step": 19810 + }, + { + "epoch": 0.39624, + "grad_norm": 2.03125, + "grad_norm_var": 0.005012003580729166, + "learning_rate": 0.0001, + "loss": 3.9071, + "loss/crossentropy": 2.098154127597809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065964937210083, + "step": 19812 + }, + { + "epoch": 0.39628, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006788889567057292, + "learning_rate": 0.0001, + "loss": 3.8447, + "loss/crossentropy": 2.0527199506759644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717100262641907, + "step": 19814 + }, + { + "epoch": 0.39632, + "grad_norm": 2.03125, + "grad_norm_var": 0.011244455973307291, + "learning_rate": 0.0001, + "loss": 4.2692, + "loss/crossentropy": 2.0650912523269653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021411582827568, + "step": 19816 + }, + { + "epoch": 0.39636, + "grad_norm": 2.34375, + "grad_norm_var": 0.018700154622395833, + "learning_rate": 0.0001, + "loss": 4.3611, + "loss/crossentropy": 2.2239702939987183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19744005054235458, + "step": 19818 + }, + { + "epoch": 0.3964, + "grad_norm": 1.9375, + "grad_norm_var": 0.020369211832682293, + "learning_rate": 0.0001, + "loss": 3.9029, + "loss/crossentropy": 2.123336434364319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18976306170225143, + "step": 19820 + }, + { + "epoch": 0.39644, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02154515584309896, + "learning_rate": 0.0001, + "loss": 3.9966, + "loss/crossentropy": 2.121657133102417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159118503332138, + "step": 19822 + }, + { + "epoch": 0.39648, + "grad_norm": 2.078125, + "grad_norm_var": 0.0185455322265625, + "learning_rate": 0.0001, + "loss": 3.7621, + "loss/crossentropy": 1.7323416471481323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18072299659252167, + "step": 19824 + }, + { + "epoch": 0.39652, + "grad_norm": 2.21875, + "grad_norm_var": 0.020243072509765626, + "learning_rate": 0.0001, + "loss": 4.0183, + "loss/crossentropy": 2.181841015815735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21452204138040543, + "step": 19826 + }, + { + "epoch": 0.39656, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02104670206705729, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 1.9022215008735657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18924792110919952, + "step": 19828 + }, + { + "epoch": 0.3966, + "grad_norm": 1.984375, + "grad_norm_var": 0.018317667643229167, + "learning_rate": 0.0001, + "loss": 4.2505, + "loss/crossentropy": 2.1240022778511047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18618234246969223, + "step": 19830 + }, + { + "epoch": 0.39664, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015710194905598957, + "learning_rate": 0.0001, + "loss": 3.8611, + "loss/crossentropy": 1.5959683060646057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17317308485507965, + "step": 19832 + }, + { + "epoch": 0.39668, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.1281, + "loss/crossentropy": 2.1812866926193237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192174032330513, + "step": 19834 + }, + { + "epoch": 0.39672, + "grad_norm": 1.953125, + "grad_norm_var": 0.010651652018229167, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 1.8964568972587585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20922592282295227, + "step": 19836 + }, + { + "epoch": 0.39676, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012013498942057292, + "learning_rate": 0.0001, + "loss": 4.1369, + "loss/crossentropy": 2.0822665691375732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18019618093967438, + "step": 19838 + }, + { + "epoch": 0.3968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010985310872395833, + "learning_rate": 0.0001, + "loss": 4.0271, + "loss/crossentropy": 2.116270899772644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21604043990373611, + "step": 19840 + }, + { + "epoch": 0.39684, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007759602864583334, + "learning_rate": 0.0001, + "loss": 3.9132, + "loss/crossentropy": 1.7761988639831543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20087965577840805, + "step": 19842 + }, + { + "epoch": 0.39688, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008649698893229167, + "learning_rate": 0.0001, + "loss": 4.0187, + "loss/crossentropy": 1.6091360449790955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18282026052474976, + "step": 19844 + }, + { + "epoch": 0.39692, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009547678629557292, + "learning_rate": 0.0001, + "loss": 4.4441, + "loss/crossentropy": 2.6394678354263306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20579150319099426, + "step": 19846 + }, + { + "epoch": 0.39696, + "grad_norm": 2.0, + "grad_norm_var": 0.010011545817057292, + "learning_rate": 0.0001, + "loss": 4.1778, + "loss/crossentropy": 2.0911704897880554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20075388252735138, + "step": 19848 + }, + { + "epoch": 0.397, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010139719645182291, + "learning_rate": 0.0001, + "loss": 4.3458, + "loss/crossentropy": 2.1299456357955933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067631408572197, + "step": 19850 + }, + { + "epoch": 0.39704, + "grad_norm": 1.875, + "grad_norm_var": 0.004937489827473958, + "learning_rate": 0.0001, + "loss": 3.9037, + "loss/crossentropy": 1.9630563855171204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187811940908432, + "step": 19852 + }, + { + "epoch": 0.39708, + "grad_norm": 1.90625, + "grad_norm_var": 0.004808553059895833, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.1914453506469727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064526304602623, + "step": 19854 + }, + { + "epoch": 0.39712, + "grad_norm": 2.046875, + "grad_norm_var": 0.012412261962890626, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 2.3003474473953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20841488242149353, + "step": 19856 + }, + { + "epoch": 0.39716, + "grad_norm": 1.84375, + "grad_norm_var": 0.01682306925455729, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 2.1625255346298218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437289774417877, + "step": 19858 + }, + { + "epoch": 0.3972, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015868123372395834, + "learning_rate": 0.0001, + "loss": 3.8594, + "loss/crossentropy": 1.6742416620254517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16787201166152954, + "step": 19860 + }, + { + "epoch": 0.39724, + "grad_norm": 1.921875, + "grad_norm_var": 0.01546630859375, + "learning_rate": 0.0001, + "loss": 3.8889, + "loss/crossentropy": 2.0682146549224854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1715894490480423, + "step": 19862 + }, + { + "epoch": 0.39728, + "grad_norm": 1.96875, + "grad_norm_var": 0.017071278889973958, + "learning_rate": 0.0001, + "loss": 4.0319, + "loss/crossentropy": 1.99592924118042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19570383429527283, + "step": 19864 + }, + { + "epoch": 0.39732, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01967137654622396, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 2.086844265460968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123936414718628, + "step": 19866 + }, + { + "epoch": 0.39736, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017256673177083334, + "learning_rate": 0.0001, + "loss": 4.2245, + "loss/crossentropy": 1.7985658645629883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18117651343345642, + "step": 19868 + }, + { + "epoch": 0.3974, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018949381510416665, + "learning_rate": 0.0001, + "loss": 3.8449, + "loss/crossentropy": 1.9804646372795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18682067096233368, + "step": 19870 + }, + { + "epoch": 0.39744, + "grad_norm": 1.796875, + "grad_norm_var": 0.01546630859375, + "learning_rate": 0.0001, + "loss": 4.0692, + "loss/crossentropy": 1.9027678966522217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195898830890656, + "step": 19872 + }, + { + "epoch": 0.39748, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01660944620768229, + "learning_rate": 0.0001, + "loss": 4.1191, + "loss/crossentropy": 2.3370686769485474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2376203015446663, + "step": 19874 + }, + { + "epoch": 0.39752, + "grad_norm": 1.890625, + "grad_norm_var": 0.018363189697265626, + "learning_rate": 0.0001, + "loss": 3.8472, + "loss/crossentropy": 1.7463279366493225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16485755145549774, + "step": 19876 + }, + { + "epoch": 0.39756, + "grad_norm": 2.015625, + "grad_norm_var": 0.017967732747395833, + "learning_rate": 0.0001, + "loss": 4.4039, + "loss/crossentropy": 2.1171544194221497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631500333547592, + "step": 19878 + }, + { + "epoch": 0.3976, + "grad_norm": 1.84375, + "grad_norm_var": 0.016733551025390626, + "learning_rate": 0.0001, + "loss": 3.7242, + "loss/crossentropy": 2.1109176874160767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18512246757745743, + "step": 19880 + }, + { + "epoch": 0.39764, + "grad_norm": 1.9375, + "grad_norm_var": 0.013618977864583333, + "learning_rate": 0.0001, + "loss": 3.6843, + "loss/crossentropy": 1.927691638469696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19118069112300873, + "step": 19882 + }, + { + "epoch": 0.39768, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 3.9539, + "loss/crossentropy": 2.041896104812622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21281379461288452, + "step": 19884 + }, + { + "epoch": 0.39772, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013272857666015625, + "learning_rate": 0.0001, + "loss": 3.9146, + "loss/crossentropy": 1.8779407739639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180915005505085, + "step": 19886 + }, + { + "epoch": 0.39776, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019001261393229166, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 1.9899646639823914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102467596530914, + "step": 19888 + }, + { + "epoch": 0.3978, + "grad_norm": 2.0, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 4.1822, + "loss/crossentropy": 2.3294299840927124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739557594060898, + "step": 19890 + }, + { + "epoch": 0.39784, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011864980061848959, + "learning_rate": 0.0001, + "loss": 4.2829, + "loss/crossentropy": 2.2126539945602417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066286742687225, + "step": 19892 + }, + { + "epoch": 0.39788, + "grad_norm": 2.1875, + "grad_norm_var": 0.016454060872395832, + "learning_rate": 0.0001, + "loss": 3.8665, + "loss/crossentropy": 1.9949330687522888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172564148902893, + "step": 19894 + }, + { + "epoch": 0.39792, + "grad_norm": 2.0625, + "grad_norm_var": 0.02158177693684896, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 2.0896310210227966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19692128896713257, + "step": 19896 + }, + { + "epoch": 0.39796, + "grad_norm": 1.921875, + "grad_norm_var": 0.020310211181640624, + "learning_rate": 0.0001, + "loss": 4.1856, + "loss/crossentropy": 2.191789746284485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073511779308319, + "step": 19898 + }, + { + "epoch": 0.398, + "grad_norm": 1.921875, + "grad_norm_var": 0.022415924072265624, + "learning_rate": 0.0001, + "loss": 3.9965, + "loss/crossentropy": 1.9925153255462646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20123805850744247, + "step": 19900 + }, + { + "epoch": 0.39804, + "grad_norm": 1.8125, + "grad_norm_var": 0.021476236979166667, + "learning_rate": 0.0001, + "loss": 3.7917, + "loss/crossentropy": 1.8903232216835022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18577788770198822, + "step": 19902 + }, + { + "epoch": 0.39808, + "grad_norm": 2.078125, + "grad_norm_var": 0.01817804972330729, + "learning_rate": 0.0001, + "loss": 4.2818, + "loss/crossentropy": 2.055288314819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19745147973299026, + "step": 19904 + }, + { + "epoch": 0.39812, + "grad_norm": 2.0625, + "grad_norm_var": 0.018570709228515624, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.7709746360778809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17147859930992126, + "step": 19906 + }, + { + "epoch": 0.39816, + "grad_norm": 2.0, + "grad_norm_var": 0.01843846638997396, + "learning_rate": 0.0001, + "loss": 4.182, + "loss/crossentropy": 2.3295364379882812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680927485227585, + "step": 19908 + }, + { + "epoch": 0.3982, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014414215087890625, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 1.9828099608421326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202943354845047, + "step": 19910 + }, + { + "epoch": 0.39824, + "grad_norm": 2.046875, + "grad_norm_var": 0.009352366129557291, + "learning_rate": 0.0001, + "loss": 4.2294, + "loss/crossentropy": 2.140046715736389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1967788189649582, + "step": 19912 + }, + { + "epoch": 0.39828, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008736165364583333, + "learning_rate": 0.0001, + "loss": 4.1673, + "loss/crossentropy": 2.3365899324417114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2261614426970482, + "step": 19914 + }, + { + "epoch": 0.39832, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0075642903645833336, + "learning_rate": 0.0001, + "loss": 3.8716, + "loss/crossentropy": 2.101326584815979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668899476528168, + "step": 19916 + }, + { + "epoch": 0.39836, + "grad_norm": 1.96875, + "grad_norm_var": 0.005037180582682292, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 1.9804238080978394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117360234260559, + "step": 19918 + }, + { + "epoch": 0.3984, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0037676493326822915, + "learning_rate": 0.0001, + "loss": 3.945, + "loss/crossentropy": 1.858399510383606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20013604313135147, + "step": 19920 + }, + { + "epoch": 0.39844, + "grad_norm": 1.90625, + "grad_norm_var": 0.005832672119140625, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 2.087529957294464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19001878052949905, + "step": 19922 + }, + { + "epoch": 0.39848, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0068267822265625, + "learning_rate": 0.0001, + "loss": 3.9425, + "loss/crossentropy": 2.2521530389785767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182852447032928, + "step": 19924 + }, + { + "epoch": 0.39852, + "grad_norm": 2.109375, + "grad_norm_var": 0.00784912109375, + "learning_rate": 0.0001, + "loss": 4.3362, + "loss/crossentropy": 2.370557188987732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27234383672475815, + "step": 19926 + }, + { + "epoch": 0.39856, + "grad_norm": 13.0625, + "grad_norm_var": 7.6871192932128904, + "learning_rate": 0.0001, + "loss": 4.0383, + "loss/crossentropy": 2.18235445022583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20694056898355484, + "step": 19928 + }, + { + "epoch": 0.3986, + "grad_norm": 2.125, + "grad_norm_var": 7.659780883789063, + "learning_rate": 0.0001, + "loss": 3.8946, + "loss/crossentropy": 1.9220272898674011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21540776640176773, + "step": 19930 + }, + { + "epoch": 0.39864, + "grad_norm": 2.015625, + "grad_norm_var": 7.6551513671875, + "learning_rate": 0.0001, + "loss": 4.015, + "loss/crossentropy": 2.064575970172882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20172829926013947, + "step": 19932 + }, + { + "epoch": 0.39868, + "grad_norm": 2.296875, + "grad_norm_var": 7.620402018229167, + "learning_rate": 0.0001, + "loss": 4.5782, + "loss/crossentropy": 2.1824090480804443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23097260296344757, + "step": 19934 + }, + { + "epoch": 0.39872, + "grad_norm": 2.0, + "grad_norm_var": 7.594489542643229, + "learning_rate": 0.0001, + "loss": 3.9227, + "loss/crossentropy": 2.211961567401886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150636538863182, + "step": 19936 + }, + { + "epoch": 0.39876, + "grad_norm": 2.4375, + "grad_norm_var": 7.586128743489583, + "learning_rate": 0.0001, + "loss": 4.1724, + "loss/crossentropy": 1.8789254426956177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855369836091995, + "step": 19938 + }, + { + "epoch": 0.3988, + "grad_norm": 2.03125, + "grad_norm_var": 7.576968383789063, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.322842240333557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432100653648376, + "step": 19940 + }, + { + "epoch": 0.39884, + "grad_norm": 1.9375, + "grad_norm_var": 7.590311686197917, + "learning_rate": 0.0001, + "loss": 4.2201, + "loss/crossentropy": 2.1524535417556763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19931253790855408, + "step": 19942 + }, + { + "epoch": 0.39888, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0280670166015625, + "learning_rate": 0.0001, + "loss": 4.0166, + "loss/crossentropy": 2.0835599303245544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18674355000257492, + "step": 19944 + }, + { + "epoch": 0.39892, + "grad_norm": 1.96875, + "grad_norm_var": 0.026718902587890624, + "learning_rate": 0.0001, + "loss": 3.9398, + "loss/crossentropy": 1.8204763531684875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970055252313614, + "step": 19946 + }, + { + "epoch": 0.39896, + "grad_norm": 1.9375, + "grad_norm_var": 0.02769953409830729, + "learning_rate": 0.0001, + "loss": 3.8403, + "loss/crossentropy": 2.1432100534439087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960032731294632, + "step": 19948 + }, + { + "epoch": 0.399, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0227294921875, + "learning_rate": 0.0001, + "loss": 4.1183, + "loss/crossentropy": 1.9959533214569092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18608924746513367, + "step": 19950 + }, + { + "epoch": 0.39904, + "grad_norm": 2.015625, + "grad_norm_var": 0.02399266560872396, + "learning_rate": 0.0001, + "loss": 4.0235, + "loss/crossentropy": 1.9874342679977417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18600600212812424, + "step": 19952 + }, + { + "epoch": 0.39908, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00849609375, + "learning_rate": 0.0001, + "loss": 4.2266, + "loss/crossentropy": 2.5082361698150635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22386416047811508, + "step": 19954 + }, + { + "epoch": 0.39912, + "grad_norm": 1.90625, + "grad_norm_var": 0.008153279622395834, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 2.1550523042678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169199213385582, + "step": 19956 + }, + { + "epoch": 0.39916, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008172353108723959, + "learning_rate": 0.0001, + "loss": 3.8834, + "loss/crossentropy": 2.079386830329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080224633216858, + "step": 19958 + }, + { + "epoch": 0.3992, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006030019124348958, + "learning_rate": 0.0001, + "loss": 3.973, + "loss/crossentropy": 1.8762348890304565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1813136711716652, + "step": 19960 + }, + { + "epoch": 0.39924, + "grad_norm": 1.84375, + "grad_norm_var": 0.006455230712890625, + "learning_rate": 0.0001, + "loss": 4.0965, + "loss/crossentropy": 1.789110004901886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17166541516780853, + "step": 19962 + }, + { + "epoch": 0.39928, + "grad_norm": 1.859375, + "grad_norm_var": 0.009056599934895833, + "learning_rate": 0.0001, + "loss": 4.0912, + "loss/crossentropy": 2.0196239948272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20005135238170624, + "step": 19964 + }, + { + "epoch": 0.39932, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 2.192206025123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979481726884842, + "step": 19966 + }, + { + "epoch": 0.39936, + "grad_norm": 2.0625, + "grad_norm_var": 0.007157135009765625, + "learning_rate": 0.0001, + "loss": 4.3155, + "loss/crossentropy": 2.263342499732971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2187722995877266, + "step": 19968 + }, + { + "epoch": 0.3994, + "grad_norm": 2.109375, + "grad_norm_var": 0.008161417643229167, + "learning_rate": 0.0001, + "loss": 4.2727, + "loss/crossentropy": 2.3387625217437744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375280618667603, + "step": 19970 + }, + { + "epoch": 0.39944, + "grad_norm": 1.96875, + "grad_norm_var": 0.008733876546223958, + "learning_rate": 0.0001, + "loss": 4.0442, + "loss/crossentropy": 1.5837730765342712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17303457856178284, + "step": 19972 + }, + { + "epoch": 0.39948, + "grad_norm": 1.984375, + "grad_norm_var": 0.008678944905598958, + "learning_rate": 0.0001, + "loss": 3.8398, + "loss/crossentropy": 1.9090858697891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838761642575264, + "step": 19974 + }, + { + "epoch": 0.39952, + "grad_norm": 1.875, + "grad_norm_var": 0.008780924479166667, + "learning_rate": 0.0001, + "loss": 3.9345, + "loss/crossentropy": 2.107620596885681, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19132380187511444, + "step": 19976 + }, + { + "epoch": 0.39956, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009200032552083333, + "learning_rate": 0.0001, + "loss": 4.0709, + "loss/crossentropy": 2.1706109046936035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19275012612342834, + "step": 19978 + }, + { + "epoch": 0.3996, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007356516520182292, + "learning_rate": 0.0001, + "loss": 3.8486, + "loss/crossentropy": 2.0092907547950745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18047627061605453, + "step": 19980 + }, + { + "epoch": 0.39964, + "grad_norm": 2.046875, + "grad_norm_var": 0.025349934895833332, + "learning_rate": 0.0001, + "loss": 4.2226, + "loss/crossentropy": 2.1216511726379395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20591023564338684, + "step": 19982 + }, + { + "epoch": 0.39968, + "grad_norm": 2.046875, + "grad_norm_var": 0.02671076456705729, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 2.3002817630767822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734255760908127, + "step": 19984 + }, + { + "epoch": 0.39972, + "grad_norm": 2.09375, + "grad_norm_var": 0.02664972941080729, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 2.3097801208496094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194757729768753, + "step": 19986 + }, + { + "epoch": 0.39976, + "grad_norm": 2.15625, + "grad_norm_var": 0.029130045572916666, + "learning_rate": 0.0001, + "loss": 3.9706, + "loss/crossentropy": 2.2645580768585205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866093575954437, + "step": 19988 + }, + { + "epoch": 0.3998, + "grad_norm": 1.875, + "grad_norm_var": 0.03004150390625, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 2.224185347557068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19011163711547852, + "step": 19990 + }, + { + "epoch": 0.39984, + "grad_norm": 1.859375, + "grad_norm_var": 0.030248006184895832, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 1.958588182926178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18359722197055817, + "step": 19992 + }, + { + "epoch": 0.39988, + "grad_norm": 1.9375, + "grad_norm_var": 0.029361724853515625, + "learning_rate": 0.0001, + "loss": 4.0449, + "loss/crossentropy": 2.5081194639205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22576630860567093, + "step": 19994 + }, + { + "epoch": 0.39992, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0306549072265625, + "learning_rate": 0.0001, + "loss": 3.8833, + "loss/crossentropy": 1.9870144724845886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19262682646512985, + "step": 19996 + }, + { + "epoch": 0.39996, + "grad_norm": 1.828125, + "grad_norm_var": 0.0118560791015625, + "learning_rate": 0.0001, + "loss": 4.1321, + "loss/crossentropy": 2.227096438407898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056111991405487, + "step": 19998 + }, + { + "epoch": 0.4, + "grad_norm": 1.796875, + "grad_norm_var": 0.012837473551432292, + "learning_rate": 0.0001, + "loss": 4.1659, + "loss/crossentropy": 1.8458907008171082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18857256323099136, + "step": 20000 + }, + { + "epoch": 0.40004, + "grad_norm": 1.90625, + "grad_norm_var": 0.010536448160807291, + "learning_rate": 0.0001, + "loss": 4.1497, + "loss/crossentropy": 2.184568405151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20918434113264084, + "step": 20002 + }, + { + "epoch": 0.40008, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007136027018229167, + "learning_rate": 0.0001, + "loss": 3.8604, + "loss/crossentropy": 2.046003818511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17669742554426193, + "step": 20004 + }, + { + "epoch": 0.40012, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006151326497395833, + "learning_rate": 0.0001, + "loss": 4.1618, + "loss/crossentropy": 2.330680012702942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159157246351242, + "step": 20006 + }, + { + "epoch": 0.40016, + "grad_norm": 1.796875, + "grad_norm_var": 0.005973052978515625, + "learning_rate": 0.0001, + "loss": 3.643, + "loss/crossentropy": 1.9141735434532166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17788738757371902, + "step": 20008 + }, + { + "epoch": 0.4002, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006589508056640625, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 2.3000227212905884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21896196901798248, + "step": 20010 + }, + { + "epoch": 0.40024, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0067535400390625, + "learning_rate": 0.0001, + "loss": 4.2057, + "loss/crossentropy": 2.151344060897827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495032519102097, + "step": 20012 + }, + { + "epoch": 0.40028, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007911936442057291, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 2.1465260982513428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1846243441104889, + "step": 20014 + }, + { + "epoch": 0.40032, + "grad_norm": 1.9375, + "grad_norm_var": 0.006453450520833333, + "learning_rate": 0.0001, + "loss": 4.0024, + "loss/crossentropy": 1.662541925907135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223944306373596, + "step": 20016 + }, + { + "epoch": 0.40036, + "grad_norm": 1.921875, + "grad_norm_var": 0.006418609619140625, + "learning_rate": 0.0001, + "loss": 4.0052, + "loss/crossentropy": 1.9482674598693848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18713432550430298, + "step": 20018 + }, + { + "epoch": 0.4004, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0058837890625, + "learning_rate": 0.0001, + "loss": 3.7834, + "loss/crossentropy": 1.7436261773109436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18203437328338623, + "step": 20020 + }, + { + "epoch": 0.40044, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005716705322265625, + "learning_rate": 0.0001, + "loss": 3.8905, + "loss/crossentropy": 2.0031047463417053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20406989008188248, + "step": 20022 + }, + { + "epoch": 0.40048, + "grad_norm": 2.03125, + "grad_norm_var": 0.005415852864583333, + "learning_rate": 0.0001, + "loss": 3.8527, + "loss/crossentropy": 1.9428189992904663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989446222782135, + "step": 20024 + }, + { + "epoch": 0.40052, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005208333333333333, + "learning_rate": 0.0001, + "loss": 3.9674, + "loss/crossentropy": 2.021029829978943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908131018280983, + "step": 20026 + }, + { + "epoch": 0.40056, + "grad_norm": 2.0, + "grad_norm_var": 0.02068049112955729, + "learning_rate": 0.0001, + "loss": 4.3471, + "loss/crossentropy": 1.8795804381370544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17839516699314117, + "step": 20028 + }, + { + "epoch": 0.4006, + "grad_norm": 2.296875, + "grad_norm_var": 0.022823079427083334, + "learning_rate": 0.0001, + "loss": 4.4826, + "loss/crossentropy": 1.9421144723892212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18655938655138016, + "step": 20030 + }, + { + "epoch": 0.40064, + "grad_norm": 2.0625, + "grad_norm_var": 0.022809855143229165, + "learning_rate": 0.0001, + "loss": 3.9615, + "loss/crossentropy": 2.1828919649124146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21237139403820038, + "step": 20032 + }, + { + "epoch": 0.40068, + "grad_norm": 1.984375, + "grad_norm_var": 0.02103271484375, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 1.9509565830230713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19763849675655365, + "step": 20034 + }, + { + "epoch": 0.40072, + "grad_norm": 1.84375, + "grad_norm_var": 0.022507476806640624, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 2.1327446699142456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048783376812935, + "step": 20036 + }, + { + "epoch": 0.40076, + "grad_norm": 2.140625, + "grad_norm_var": 0.022904205322265624, + "learning_rate": 0.0001, + "loss": 4.2094, + "loss/crossentropy": 2.3295645713806152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22039268165826797, + "step": 20038 + }, + { + "epoch": 0.4008, + "grad_norm": 2.203125, + "grad_norm_var": 0.03260879516601563, + "learning_rate": 0.0001, + "loss": 3.7464, + "loss/crossentropy": 1.786357820034027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1681244671344757, + "step": 20040 + }, + { + "epoch": 0.40084, + "grad_norm": 2.140625, + "grad_norm_var": 0.033113352457682294, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.0342337489128113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19485876709222794, + "step": 20042 + }, + { + "epoch": 0.40088, + "grad_norm": 1.96875, + "grad_norm_var": 0.027795155843098957, + "learning_rate": 0.0001, + "loss": 3.8838, + "loss/crossentropy": 2.23150634765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20408733934164047, + "step": 20044 + }, + { + "epoch": 0.40092, + "grad_norm": 1.8671875, + "grad_norm_var": 0.026805623372395834, + "learning_rate": 0.0001, + "loss": 3.9708, + "loss/crossentropy": 2.086575150489807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20418858528137207, + "step": 20046 + }, + { + "epoch": 0.40096, + "grad_norm": 1.9453125, + "grad_norm_var": 0.026741536458333333, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 2.2025340795516968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21201904118061066, + "step": 20048 + }, + { + "epoch": 0.401, + "grad_norm": 2.09375, + "grad_norm_var": 0.02859471638997396, + "learning_rate": 0.0001, + "loss": 4.2056, + "loss/crossentropy": 2.175204277038574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20766476541757584, + "step": 20050 + }, + { + "epoch": 0.40104, + "grad_norm": 1.9375, + "grad_norm_var": 0.027581532796223957, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 2.159320116043091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013096585869789, + "step": 20052 + }, + { + "epoch": 0.40108, + "grad_norm": 2.09375, + "grad_norm_var": 0.028452301025390626, + "learning_rate": 0.0001, + "loss": 3.8412, + "loss/crossentropy": 1.5438128113746643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1641307920217514, + "step": 20054 + }, + { + "epoch": 0.40112, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0180084228515625, + "learning_rate": 0.0001, + "loss": 3.8949, + "loss/crossentropy": 2.0549490451812744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2176165133714676, + "step": 20056 + }, + { + "epoch": 0.40116, + "grad_norm": 1.90625, + "grad_norm_var": 0.01363525390625, + "learning_rate": 0.0001, + "loss": 4.1048, + "loss/crossentropy": 1.8520516753196716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19935637712478638, + "step": 20058 + }, + { + "epoch": 0.4012, + "grad_norm": 1.90625, + "grad_norm_var": 0.007112375895182292, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 2.061866581439972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928197741508484, + "step": 20060 + }, + { + "epoch": 0.40124, + "grad_norm": 1.859375, + "grad_norm_var": 0.006780751546223958, + "learning_rate": 0.0001, + "loss": 3.9205, + "loss/crossentropy": 2.0396493673324585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19679231941699982, + "step": 20062 + }, + { + "epoch": 0.40128, + "grad_norm": 2.109375, + "grad_norm_var": 0.008231353759765626, + "learning_rate": 0.0001, + "loss": 4.2929, + "loss/crossentropy": 1.8019860982894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17440176755189896, + "step": 20064 + }, + { + "epoch": 0.40132, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011027018229166666, + "learning_rate": 0.0001, + "loss": 4.1233, + "loss/crossentropy": 2.087724268436432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18905764818191528, + "step": 20066 + }, + { + "epoch": 0.40136, + "grad_norm": 1.921875, + "grad_norm_var": 0.0110504150390625, + "learning_rate": 0.0001, + "loss": 3.9207, + "loss/crossentropy": 1.899372398853302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20874439179897308, + "step": 20068 + }, + { + "epoch": 0.4014, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009073893229166666, + "learning_rate": 0.0001, + "loss": 3.9521, + "loss/crossentropy": 1.740720272064209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21033668518066406, + "step": 20070 + }, + { + "epoch": 0.40144, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 3.9399, + "loss/crossentropy": 2.043474793434143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20285866409540176, + "step": 20072 + }, + { + "epoch": 0.40148, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010114542643229167, + "learning_rate": 0.0001, + "loss": 3.8251, + "loss/crossentropy": 1.907653033733368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814437627792358, + "step": 20074 + }, + { + "epoch": 0.40152, + "grad_norm": 1.953125, + "grad_norm_var": 0.0099029541015625, + "learning_rate": 0.0001, + "loss": 4.004, + "loss/crossentropy": 1.803492784500122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19028827548027039, + "step": 20076 + }, + { + "epoch": 0.40156, + "grad_norm": 1.8671875, + "grad_norm_var": 0.017463175455729167, + "learning_rate": 0.0001, + "loss": 4.0304, + "loss/crossentropy": 2.2277488708496094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069825902581215, + "step": 20078 + }, + { + "epoch": 0.4016, + "grad_norm": 1.9375, + "grad_norm_var": 0.01710205078125, + "learning_rate": 0.0001, + "loss": 4.0578, + "loss/crossentropy": 1.9414427280426025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182416632771492, + "step": 20080 + }, + { + "epoch": 0.40164, + "grad_norm": 1.7890625, + "grad_norm_var": 0.014086659749348958, + "learning_rate": 0.0001, + "loss": 3.9105, + "loss/crossentropy": 1.9231160879135132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17756588757038116, + "step": 20082 + }, + { + "epoch": 0.40168, + "grad_norm": 2.046875, + "grad_norm_var": 0.01566136678059896, + "learning_rate": 0.0001, + "loss": 4.2187, + "loss/crossentropy": 2.1724199056625366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21560527384281158, + "step": 20084 + }, + { + "epoch": 0.40172, + "grad_norm": 2.125, + "grad_norm_var": 0.017365519205729166, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 1.7942206859588623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1705407276749611, + "step": 20086 + }, + { + "epoch": 0.40176, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017183430989583335, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 2.1846182346343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2066875621676445, + "step": 20088 + }, + { + "epoch": 0.4018, + "grad_norm": 1.828125, + "grad_norm_var": 0.0183746337890625, + "learning_rate": 0.0001, + "loss": 3.6378, + "loss/crossentropy": 1.9378133416175842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18584155291318893, + "step": 20090 + }, + { + "epoch": 0.40184, + "grad_norm": 1.984375, + "grad_norm_var": 0.018379720052083333, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 1.9822070598602295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22574876248836517, + "step": 20092 + }, + { + "epoch": 0.40188, + "grad_norm": 1.90625, + "grad_norm_var": 0.009626261393229167, + "learning_rate": 0.0001, + "loss": 3.8348, + "loss/crossentropy": 2.133267104625702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20023657381534576, + "step": 20094 + }, + { + "epoch": 0.40192, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012465159098307291, + "learning_rate": 0.0001, + "loss": 3.6025, + "loss/crossentropy": 1.9899320602416992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17226950079202652, + "step": 20096 + }, + { + "epoch": 0.40196, + "grad_norm": 2.625, + "grad_norm_var": 0.042699940999348956, + "learning_rate": 0.0001, + "loss": 4.6337, + "loss/crossentropy": 2.3320037126541138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.33343885838985443, + "step": 20098 + }, + { + "epoch": 0.402, + "grad_norm": 2.015625, + "grad_norm_var": 0.04210586547851562, + "learning_rate": 0.0001, + "loss": 4.1064, + "loss/crossentropy": 2.2026573419570923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648924261331558, + "step": 20100 + }, + { + "epoch": 0.40204, + "grad_norm": 1.859375, + "grad_norm_var": 0.04067789713541667, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 2.1894484758377075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947491317987442, + "step": 20102 + }, + { + "epoch": 0.40208, + "grad_norm": 1.8984375, + "grad_norm_var": 0.040421549479166666, + "learning_rate": 0.0001, + "loss": 4.1506, + "loss/crossentropy": 2.102599799633026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19991370290517807, + "step": 20104 + }, + { + "epoch": 0.40212, + "grad_norm": 1.8984375, + "grad_norm_var": 0.038590494791666666, + "learning_rate": 0.0001, + "loss": 4.0476, + "loss/crossentropy": 2.0289117097854614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18713568150997162, + "step": 20106 + }, + { + "epoch": 0.40216, + "grad_norm": 1.859375, + "grad_norm_var": 0.039098866780598956, + "learning_rate": 0.0001, + "loss": 4.0656, + "loss/crossentropy": 2.0425156950950623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19697192311286926, + "step": 20108 + }, + { + "epoch": 0.4022, + "grad_norm": 1.8125, + "grad_norm_var": 0.0391265869140625, + "learning_rate": 0.0001, + "loss": 3.7796, + "loss/crossentropy": 1.9846921563148499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20306336879730225, + "step": 20110 + }, + { + "epoch": 0.40224, + "grad_norm": 1.859375, + "grad_norm_var": 0.035166168212890626, + "learning_rate": 0.0001, + "loss": 3.8168, + "loss/crossentropy": 1.846974492073059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19833409041166306, + "step": 20112 + }, + { + "epoch": 0.40228, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005052693684895833, + "learning_rate": 0.0001, + "loss": 4.0727, + "loss/crossentropy": 2.3157109022140503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2169334515929222, + "step": 20114 + }, + { + "epoch": 0.40232, + "grad_norm": 1.9375, + "grad_norm_var": 0.004349772135416667, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 2.28099262714386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20791316777467728, + "step": 20116 + }, + { + "epoch": 0.40236, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0042073567708333336, + "learning_rate": 0.0001, + "loss": 4.0327, + "loss/crossentropy": 2.3558366298675537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205863893032074, + "step": 20118 + }, + { + "epoch": 0.4024, + "grad_norm": 1.984375, + "grad_norm_var": 0.0045074462890625, + "learning_rate": 0.0001, + "loss": 4.2792, + "loss/crossentropy": 2.0106826424598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982533410191536, + "step": 20120 + }, + { + "epoch": 0.40244, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005773671468098958, + "learning_rate": 0.0001, + "loss": 4.2738, + "loss/crossentropy": 2.321291446685791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20979785174131393, + "step": 20122 + }, + { + "epoch": 0.40248, + "grad_norm": 2.375, + "grad_norm_var": 0.016658528645833334, + "learning_rate": 0.0001, + "loss": 4.3524, + "loss/crossentropy": 2.3632254600524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2184157818555832, + "step": 20124 + }, + { + "epoch": 0.40252, + "grad_norm": 2.078125, + "grad_norm_var": 0.015144856770833333, + "learning_rate": 0.0001, + "loss": 3.959, + "loss/crossentropy": 2.1488123536109924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031005695462227, + "step": 20126 + }, + { + "epoch": 0.40256, + "grad_norm": 2.203125, + "grad_norm_var": 0.015533192952473959, + "learning_rate": 0.0001, + "loss": 4.3341, + "loss/crossentropy": 1.8036405444145203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172652810811996, + "step": 20128 + }, + { + "epoch": 0.4026, + "grad_norm": 1.984375, + "grad_norm_var": 0.014925130208333333, + "learning_rate": 0.0001, + "loss": 4.1131, + "loss/crossentropy": 2.0299129486083984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20876885205507278, + "step": 20130 + }, + { + "epoch": 0.40264, + "grad_norm": 2.0, + "grad_norm_var": 0.014399973551432292, + "learning_rate": 0.0001, + "loss": 3.9618, + "loss/crossentropy": 1.9177062511444092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19626032561063766, + "step": 20132 + }, + { + "epoch": 0.40268, + "grad_norm": 1.859375, + "grad_norm_var": 0.015897623697916665, + "learning_rate": 0.0001, + "loss": 4.1608, + "loss/crossentropy": 2.2073251008987427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061092108488083, + "step": 20134 + }, + { + "epoch": 0.40272, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01739476521809896, + "learning_rate": 0.0001, + "loss": 3.7592, + "loss/crossentropy": 2.0243424773216248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109317108988762, + "step": 20136 + }, + { + "epoch": 0.40276, + "grad_norm": 1.8515625, + "grad_norm_var": 0.02050959269205729, + "learning_rate": 0.0001, + "loss": 3.845, + "loss/crossentropy": 2.0052929520606995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17854592204093933, + "step": 20138 + }, + { + "epoch": 0.4028, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010790761311848958, + "learning_rate": 0.0001, + "loss": 4.0656, + "loss/crossentropy": 2.067071557044983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18235646188259125, + "step": 20140 + }, + { + "epoch": 0.40284, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009354400634765624, + "learning_rate": 0.0001, + "loss": 4.0343, + "loss/crossentropy": 2.050750732421875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19028235971927643, + "step": 20142 + }, + { + "epoch": 0.40288, + "grad_norm": 2.109375, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 4.2734, + "loss/crossentropy": 2.1326886415481567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158709168434143, + "step": 20144 + }, + { + "epoch": 0.40292, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006571451822916667, + "learning_rate": 0.0001, + "loss": 4.1029, + "loss/crossentropy": 1.9970600605010986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20081285387277603, + "step": 20146 + }, + { + "epoch": 0.40296, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006178538004557292, + "learning_rate": 0.0001, + "loss": 3.8243, + "loss/crossentropy": 2.042023479938507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022870033979416, + "step": 20148 + }, + { + "epoch": 0.403, + "grad_norm": 1.921875, + "grad_norm_var": 0.006349436442057292, + "learning_rate": 0.0001, + "loss": 4.3217, + "loss/crossentropy": 2.3379902839660645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2027904912829399, + "step": 20150 + }, + { + "epoch": 0.40304, + "grad_norm": 2.03125, + "grad_norm_var": 0.007080078125, + "learning_rate": 0.0001, + "loss": 4.0321, + "loss/crossentropy": 1.804058849811554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18429075926542282, + "step": 20152 + }, + { + "epoch": 0.40308, + "grad_norm": 1.984375, + "grad_norm_var": 0.005716705322265625, + "learning_rate": 0.0001, + "loss": 4.1405, + "loss/crossentropy": 1.93051016330719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912556290626526, + "step": 20154 + }, + { + "epoch": 0.40312, + "grad_norm": 1.890625, + "grad_norm_var": 0.004634348551432291, + "learning_rate": 0.0001, + "loss": 4.0571, + "loss/crossentropy": 2.3044906854629517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22676818072795868, + "step": 20156 + }, + { + "epoch": 0.40316, + "grad_norm": 2.0, + "grad_norm_var": 0.0047686258951822914, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 2.1352078914642334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985031336545944, + "step": 20158 + }, + { + "epoch": 0.4032, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0031565348307291668, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.0967469811439514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917915642261505, + "step": 20160 + }, + { + "epoch": 0.40324, + "grad_norm": 1.875, + "grad_norm_var": 0.0033078511555989583, + "learning_rate": 0.0001, + "loss": 4.2332, + "loss/crossentropy": 2.1989063024520874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067013531923294, + "step": 20162 + }, + { + "epoch": 0.40328, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0032623291015625, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 2.047918140888214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19593901932239532, + "step": 20164 + }, + { + "epoch": 0.40332, + "grad_norm": 2.046875, + "grad_norm_var": 0.003824615478515625, + "learning_rate": 0.0001, + "loss": 4.1674, + "loss/crossentropy": 2.258637487888336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108382403850555, + "step": 20166 + }, + { + "epoch": 0.40336, + "grad_norm": 2.03125, + "grad_norm_var": 0.003574371337890625, + "learning_rate": 0.0001, + "loss": 4.3006, + "loss/crossentropy": 2.4297776222229004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21948332339525223, + "step": 20168 + }, + { + "epoch": 0.4034, + "grad_norm": 2.046875, + "grad_norm_var": 0.007413482666015625, + "learning_rate": 0.0001, + "loss": 4.1809, + "loss/crossentropy": 2.029311180114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19954024255275726, + "step": 20170 + }, + { + "epoch": 0.40344, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008123524983723958, + "learning_rate": 0.0001, + "loss": 4.0891, + "loss/crossentropy": 2.099509835243225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19811799377202988, + "step": 20172 + }, + { + "epoch": 0.40348, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009732818603515625, + "learning_rate": 0.0001, + "loss": 3.996, + "loss/crossentropy": 1.9699227809906006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16898275911808014, + "step": 20174 + }, + { + "epoch": 0.40352, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010668690999348958, + "learning_rate": 0.0001, + "loss": 4.0507, + "loss/crossentropy": 2.0661654472351074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182892680168152, + "step": 20176 + }, + { + "epoch": 0.40356, + "grad_norm": 1.859375, + "grad_norm_var": 0.0108306884765625, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 2.0732903480529785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912563294172287, + "step": 20178 + }, + { + "epoch": 0.4036, + "grad_norm": 1.96875, + "grad_norm_var": 0.010578409830729166, + "learning_rate": 0.0001, + "loss": 4.0337, + "loss/crossentropy": 1.9951130747795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17538709938526154, + "step": 20180 + }, + { + "epoch": 0.40364, + "grad_norm": 1.921875, + "grad_norm_var": 0.010117340087890624, + "learning_rate": 0.0001, + "loss": 3.994, + "loss/crossentropy": 1.8298532366752625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18598610162734985, + "step": 20182 + }, + { + "epoch": 0.40368, + "grad_norm": 2.5, + "grad_norm_var": 0.028742472330729168, + "learning_rate": 0.0001, + "loss": 4.3272, + "loss/crossentropy": 2.19102144241333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25994810461997986, + "step": 20184 + }, + { + "epoch": 0.40372, + "grad_norm": 2.03125, + "grad_norm_var": 0.02552490234375, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 2.021101176738739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19094721227884293, + "step": 20186 + }, + { + "epoch": 0.40376, + "grad_norm": 1.96875, + "grad_norm_var": 0.02437108357747396, + "learning_rate": 0.0001, + "loss": 4.0057, + "loss/crossentropy": 1.9469704627990723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19880877435207367, + "step": 20188 + }, + { + "epoch": 0.4038, + "grad_norm": 1.9375, + "grad_norm_var": 0.023374176025390624, + "learning_rate": 0.0001, + "loss": 4.1993, + "loss/crossentropy": 2.0968725085258484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23307666182518005, + "step": 20190 + }, + { + "epoch": 0.40384, + "grad_norm": 1.828125, + "grad_norm_var": 0.0236572265625, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 1.7614120244979858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854785978794098, + "step": 20192 + }, + { + "epoch": 0.40388, + "grad_norm": 1.875, + "grad_norm_var": 0.021714019775390624, + "learning_rate": 0.0001, + "loss": 3.9778, + "loss/crossentropy": 2.1977179050445557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19997768104076385, + "step": 20194 + }, + { + "epoch": 0.40392, + "grad_norm": 1.984375, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 3.6169, + "loss/crossentropy": 1.6404736638069153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18520022183656693, + "step": 20196 + }, + { + "epoch": 0.40396, + "grad_norm": 1.9375, + "grad_norm_var": 0.022345987955729167, + "learning_rate": 0.0001, + "loss": 4.3767, + "loss/crossentropy": 2.4365806579589844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23998292535543442, + "step": 20198 + }, + { + "epoch": 0.404, + "grad_norm": 1.84375, + "grad_norm_var": 0.005098215738932292, + "learning_rate": 0.0001, + "loss": 4.026, + "loss/crossentropy": 2.191064238548279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20327729731798172, + "step": 20200 + }, + { + "epoch": 0.40404, + "grad_norm": 1.875, + "grad_norm_var": 0.004992421468098958, + "learning_rate": 0.0001, + "loss": 3.9392, + "loss/crossentropy": 2.1551318764686584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.226949080824852, + "step": 20202 + }, + { + "epoch": 0.40408, + "grad_norm": 2.390625, + "grad_norm_var": 0.017463175455729167, + "learning_rate": 0.0001, + "loss": 4.4831, + "loss/crossentropy": 2.4735565185546875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20860131084918976, + "step": 20204 + }, + { + "epoch": 0.40412, + "grad_norm": 2.046875, + "grad_norm_var": 0.017288970947265624, + "learning_rate": 0.0001, + "loss": 4.1105, + "loss/crossentropy": 2.1407764554023743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044227048754692, + "step": 20206 + }, + { + "epoch": 0.40416, + "grad_norm": 2.046875, + "grad_norm_var": 0.016471099853515626, + "learning_rate": 0.0001, + "loss": 4.3293, + "loss/crossentropy": 2.1775436401367188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21017960458993912, + "step": 20208 + }, + { + "epoch": 0.4042, + "grad_norm": 2.015625, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 3.9514, + "loss/crossentropy": 1.8976858854293823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17403876036405563, + "step": 20210 + }, + { + "epoch": 0.40424, + "grad_norm": 2.046875, + "grad_norm_var": 0.018143463134765624, + "learning_rate": 0.0001, + "loss": 3.7933, + "loss/crossentropy": 1.7951022386550903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805388182401657, + "step": 20212 + }, + { + "epoch": 0.40428, + "grad_norm": 1.984375, + "grad_norm_var": 0.01911188761393229, + "learning_rate": 0.0001, + "loss": 4.0215, + "loss/crossentropy": 2.0954891443252563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20514150708913803, + "step": 20214 + }, + { + "epoch": 0.40432, + "grad_norm": 2.046875, + "grad_norm_var": 0.018212636311848957, + "learning_rate": 0.0001, + "loss": 4.1753, + "loss/crossentropy": 2.125544309616089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19341818243265152, + "step": 20216 + }, + { + "epoch": 0.40436, + "grad_norm": 2.125, + "grad_norm_var": 0.01802546183268229, + "learning_rate": 0.0001, + "loss": 4.1361, + "loss/crossentropy": 1.7649177312850952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18554037809371948, + "step": 20218 + }, + { + "epoch": 0.4044, + "grad_norm": 2.0, + "grad_norm_var": 0.007529449462890625, + "learning_rate": 0.0001, + "loss": 4.1548, + "loss/crossentropy": 2.1141446232795715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219147227704525, + "step": 20220 + }, + { + "epoch": 0.40444, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 3.803, + "loss/crossentropy": 1.9603995084762573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19251137226819992, + "step": 20222 + }, + { + "epoch": 0.40448, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006941731770833333, + "learning_rate": 0.0001, + "loss": 4.1023, + "loss/crossentropy": 2.1824519634246826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134433463215828, + "step": 20224 + }, + { + "epoch": 0.40452, + "grad_norm": 1.84375, + "grad_norm_var": 0.005796051025390625, + "learning_rate": 0.0001, + "loss": 4.0501, + "loss/crossentropy": 2.0491157174110413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19854432344436646, + "step": 20226 + }, + { + "epoch": 0.40456, + "grad_norm": 1.828125, + "grad_norm_var": 0.007108561197916667, + "learning_rate": 0.0001, + "loss": 3.9262, + "loss/crossentropy": 2.3731456995010376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2264491394162178, + "step": 20228 + }, + { + "epoch": 0.4046, + "grad_norm": 1.859375, + "grad_norm_var": 0.006150054931640625, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 1.9901636242866516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17958952486515045, + "step": 20230 + }, + { + "epoch": 0.40464, + "grad_norm": 2.09375, + "grad_norm_var": 0.0072265625, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 1.9541369080543518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19508858770132065, + "step": 20232 + }, + { + "epoch": 0.40468, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0056149800618489586, + "learning_rate": 0.0001, + "loss": 3.8702, + "loss/crossentropy": 1.9723637700080872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17709529399871826, + "step": 20234 + }, + { + "epoch": 0.40472, + "grad_norm": 2.046875, + "grad_norm_var": 0.006669108072916667, + "learning_rate": 0.0001, + "loss": 4.1318, + "loss/crossentropy": 2.2354401350021362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117457002401352, + "step": 20236 + }, + { + "epoch": 0.40476, + "grad_norm": 1.90625, + "grad_norm_var": 0.017787424723307292, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 2.1230952739715576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19268101453781128, + "step": 20238 + }, + { + "epoch": 0.4048, + "grad_norm": 1.9140625, + "grad_norm_var": 0.024057769775390626, + "learning_rate": 0.0001, + "loss": 4.1973, + "loss/crossentropy": 2.250411033630371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24458947777748108, + "step": 20240 + }, + { + "epoch": 0.40484, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021882120768229166, + "learning_rate": 0.0001, + "loss": 4.0774, + "loss/crossentropy": 2.0546218752861023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20313245803117752, + "step": 20242 + }, + { + "epoch": 0.40488, + "grad_norm": 1.9140625, + "grad_norm_var": 0.020808664957682292, + "learning_rate": 0.0001, + "loss": 3.827, + "loss/crossentropy": 1.8805654644966125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19259311258792877, + "step": 20244 + }, + { + "epoch": 0.40492, + "grad_norm": 1.8828125, + "grad_norm_var": 0.023209381103515624, + "learning_rate": 0.0001, + "loss": 3.7371, + "loss/crossentropy": 1.9676281809806824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1743411347270012, + "step": 20246 + }, + { + "epoch": 0.40496, + "grad_norm": 1.8984375, + "grad_norm_var": 0.024894205729166667, + "learning_rate": 0.0001, + "loss": 3.7542, + "loss/crossentropy": 2.2166742086410522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19619230180978775, + "step": 20248 + }, + { + "epoch": 0.405, + "grad_norm": 1.828125, + "grad_norm_var": 0.027103424072265625, + "learning_rate": 0.0001, + "loss": 4.2001, + "loss/crossentropy": 2.2599674463272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19451096653938293, + "step": 20250 + }, + { + "epoch": 0.40504, + "grad_norm": 1.8828125, + "grad_norm_var": 0.027296702067057293, + "learning_rate": 0.0001, + "loss": 3.7535, + "loss/crossentropy": 2.206246018409729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20522233098745346, + "step": 20252 + }, + { + "epoch": 0.40508, + "grad_norm": 2.171875, + "grad_norm_var": 0.01790135701497396, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 2.0229114294052124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21543244272470474, + "step": 20254 + }, + { + "epoch": 0.40512, + "grad_norm": 2.0, + "grad_norm_var": 0.00960693359375, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 2.245228886604309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20587065815925598, + "step": 20256 + }, + { + "epoch": 0.40516, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009368642171223959, + "learning_rate": 0.0001, + "loss": 3.934, + "loss/crossentropy": 2.030737340450287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20048178732395172, + "step": 20258 + }, + { + "epoch": 0.4052, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010027821858723958, + "learning_rate": 0.0001, + "loss": 4.0323, + "loss/crossentropy": 2.084582805633545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19962488114833832, + "step": 20260 + }, + { + "epoch": 0.40524, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009075673421223958, + "learning_rate": 0.0001, + "loss": 3.8496, + "loss/crossentropy": 2.2461657524108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21189837902784348, + "step": 20262 + }, + { + "epoch": 0.40528, + "grad_norm": 2.03125, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.0405, + "loss/crossentropy": 1.9522897005081177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18683286011219025, + "step": 20264 + }, + { + "epoch": 0.40532, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008104451497395833, + "learning_rate": 0.0001, + "loss": 3.9797, + "loss/crossentropy": 1.9156713485717773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18374722450971603, + "step": 20266 + }, + { + "epoch": 0.40536, + "grad_norm": 2.96875, + "grad_norm_var": 0.07459208170572916, + "learning_rate": 0.0001, + "loss": 4.2083, + "loss/crossentropy": 2.2467408180236816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2305024415254593, + "step": 20268 + }, + { + "epoch": 0.4054, + "grad_norm": 1.921875, + "grad_norm_var": 0.07486063639322917, + "learning_rate": 0.0001, + "loss": 3.9249, + "loss/crossentropy": 2.0827468633651733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20297909528017044, + "step": 20270 + }, + { + "epoch": 0.40544, + "grad_norm": 2.0625, + "grad_norm_var": 0.07484130859375, + "learning_rate": 0.0001, + "loss": 4.1911, + "loss/crossentropy": 2.1979604959487915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19396250694990158, + "step": 20272 + }, + { + "epoch": 0.40548, + "grad_norm": 1.953125, + "grad_norm_var": 0.0729156494140625, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 2.0157305002212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022785097360611, + "step": 20274 + }, + { + "epoch": 0.40552, + "grad_norm": 2.109375, + "grad_norm_var": 0.06889012654622396, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 1.838208019733429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19091250747442245, + "step": 20276 + }, + { + "epoch": 0.40556, + "grad_norm": 1.8828125, + "grad_norm_var": 0.06746800740559895, + "learning_rate": 0.0001, + "loss": 4.1206, + "loss/crossentropy": 2.1762728691101074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19559895247220993, + "step": 20278 + }, + { + "epoch": 0.4056, + "grad_norm": 2.03125, + "grad_norm_var": 0.06892903645833333, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 2.2119935154914856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954266056418419, + "step": 20280 + }, + { + "epoch": 0.40564, + "grad_norm": 2.0, + "grad_norm_var": 0.0665728251139323, + "learning_rate": 0.0001, + "loss": 3.9296, + "loss/crossentropy": 1.9655035138130188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19419729709625244, + "step": 20282 + }, + { + "epoch": 0.40568, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009501139322916666, + "learning_rate": 0.0001, + "loss": 3.8007, + "loss/crossentropy": 1.7471181750297546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15437456965446472, + "step": 20284 + }, + { + "epoch": 0.40572, + "grad_norm": 1.96875, + "grad_norm_var": 0.006799062093098958, + "learning_rate": 0.0001, + "loss": 3.9379, + "loss/crossentropy": 1.8075406551361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18546781688928604, + "step": 20286 + }, + { + "epoch": 0.40576, + "grad_norm": 2.015625, + "grad_norm_var": 0.006925455729166667, + "learning_rate": 0.0001, + "loss": 4.2841, + "loss/crossentropy": 2.220236897468567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20516446232795715, + "step": 20288 + }, + { + "epoch": 0.4058, + "grad_norm": 1.828125, + "grad_norm_var": 0.006965128580729166, + "learning_rate": 0.0001, + "loss": 4.0263, + "loss/crossentropy": 1.887015163898468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859249323606491, + "step": 20290 + }, + { + "epoch": 0.40584, + "grad_norm": 1.9375, + "grad_norm_var": 0.004776763916015625, + "learning_rate": 0.0001, + "loss": 4.1448, + "loss/crossentropy": 2.2080272436141968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920713245868683, + "step": 20292 + }, + { + "epoch": 0.40588, + "grad_norm": 2.109375, + "grad_norm_var": 0.005722808837890625, + "learning_rate": 0.0001, + "loss": 4.2826, + "loss/crossentropy": 1.9298865795135498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954272910952568, + "step": 20294 + }, + { + "epoch": 0.40592, + "grad_norm": 1.984375, + "grad_norm_var": 0.0064208984375, + "learning_rate": 0.0001, + "loss": 3.9118, + "loss/crossentropy": 1.914223849773407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010154277086258, + "step": 20296 + }, + { + "epoch": 0.40596, + "grad_norm": 1.9375, + "grad_norm_var": 0.005782063802083333, + "learning_rate": 0.0001, + "loss": 3.7257, + "loss/crossentropy": 1.7863758206367493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19244936853647232, + "step": 20298 + }, + { + "epoch": 0.406, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005782063802083333, + "learning_rate": 0.0001, + "loss": 3.8874, + "loss/crossentropy": 2.0487464666366577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20337236672639847, + "step": 20300 + }, + { + "epoch": 0.40604, + "grad_norm": 2.0625, + "grad_norm_var": 0.007281239827473958, + "learning_rate": 0.0001, + "loss": 3.8666, + "loss/crossentropy": 1.8796595335006714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18166716396808624, + "step": 20302 + }, + { + "epoch": 0.40608, + "grad_norm": 2.03125, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.0397, + "loss/crossentropy": 2.0738844871520996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000334084033966, + "step": 20304 + }, + { + "epoch": 0.40612, + "grad_norm": 1.953125, + "grad_norm_var": 0.0070709228515625, + "learning_rate": 0.0001, + "loss": 3.8526, + "loss/crossentropy": 2.392453908920288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19829116761684418, + "step": 20306 + }, + { + "epoch": 0.40616, + "grad_norm": 2.03125, + "grad_norm_var": 0.010117340087890624, + "learning_rate": 0.0001, + "loss": 4.0658, + "loss/crossentropy": 2.137113928794861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19673822075128555, + "step": 20308 + }, + { + "epoch": 0.4062, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008593495686848958, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 2.0520911812782288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19579464942216873, + "step": 20310 + }, + { + "epoch": 0.40624, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007303873697916667, + "learning_rate": 0.0001, + "loss": 4.2045, + "loss/crossentropy": 1.9204559922218323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19526654481887817, + "step": 20312 + }, + { + "epoch": 0.40628, + "grad_norm": 1.96875, + "grad_norm_var": 0.009822336832682292, + "learning_rate": 0.0001, + "loss": 3.9996, + "loss/crossentropy": 2.0489402413368225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19304056465625763, + "step": 20314 + }, + { + "epoch": 0.40632, + "grad_norm": 2.4375, + "grad_norm_var": 10.828910319010417, + "learning_rate": 0.0001, + "loss": 5.057, + "loss/crossentropy": 2.127828896045685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21591445803642273, + "step": 20316 + }, + { + "epoch": 0.40636, + "grad_norm": 1.9453125, + "grad_norm_var": 10.844437408447266, + "learning_rate": 0.0001, + "loss": 4.0046, + "loss/crossentropy": 1.9867295026779175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19691205769777298, + "step": 20318 + }, + { + "epoch": 0.4064, + "grad_norm": 2.0, + "grad_norm_var": 10.848514811197917, + "learning_rate": 0.0001, + "loss": 4.1028, + "loss/crossentropy": 2.324278950691223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2344008907675743, + "step": 20320 + }, + { + "epoch": 0.40644, + "grad_norm": 2.15625, + "grad_norm_var": 10.786188761393229, + "learning_rate": 0.0001, + "loss": 4.3306, + "loss/crossentropy": 2.2904698848724365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21537532657384872, + "step": 20322 + }, + { + "epoch": 0.40648, + "grad_norm": 1.96875, + "grad_norm_var": 10.77518081665039, + "learning_rate": 0.0001, + "loss": 3.9667, + "loss/crossentropy": 2.0894381999969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012424886226654, + "step": 20324 + }, + { + "epoch": 0.40652, + "grad_norm": 1.984375, + "grad_norm_var": 10.77937723795573, + "learning_rate": 0.0001, + "loss": 4.0753, + "loss/crossentropy": 2.109618663787842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21001552045345306, + "step": 20326 + }, + { + "epoch": 0.40656, + "grad_norm": 2.078125, + "grad_norm_var": 10.745448557535807, + "learning_rate": 0.0001, + "loss": 4.1139, + "loss/crossentropy": 2.048314094543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19623054563999176, + "step": 20328 + }, + { + "epoch": 0.4066, + "grad_norm": 2.09375, + "grad_norm_var": 10.683837636311848, + "learning_rate": 0.0001, + "loss": 4.3341, + "loss/crossentropy": 2.2704076766967773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24112576246261597, + "step": 20330 + }, + { + "epoch": 0.40664, + "grad_norm": 2.015625, + "grad_norm_var": 0.010057576497395833, + "learning_rate": 0.0001, + "loss": 3.9603, + "loss/crossentropy": 2.1316241025924683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20026037096977234, + "step": 20332 + }, + { + "epoch": 0.40668, + "grad_norm": 2.34375, + "grad_norm_var": 0.015740712483723957, + "learning_rate": 0.0001, + "loss": 4.2839, + "loss/crossentropy": 2.1923957467079163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25954584032297134, + "step": 20334 + }, + { + "epoch": 0.40672, + "grad_norm": 2.015625, + "grad_norm_var": 0.01502685546875, + "learning_rate": 0.0001, + "loss": 4.0627, + "loss/crossentropy": 2.228965699672699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21487966179847717, + "step": 20336 + }, + { + "epoch": 0.40676, + "grad_norm": 1.8125, + "grad_norm_var": 0.01531982421875, + "learning_rate": 0.0001, + "loss": 3.8906, + "loss/crossentropy": 1.9097226858139038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1784234344959259, + "step": 20338 + }, + { + "epoch": 0.4068, + "grad_norm": 1.8046875, + "grad_norm_var": 0.020646158854166666, + "learning_rate": 0.0001, + "loss": 4.1256, + "loss/crossentropy": 2.125716209411621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19827620685100555, + "step": 20340 + }, + { + "epoch": 0.40684, + "grad_norm": 1.90625, + "grad_norm_var": 0.02122802734375, + "learning_rate": 0.0001, + "loss": 4.1436, + "loss/crossentropy": 2.025933086872101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20766756683588028, + "step": 20342 + }, + { + "epoch": 0.40688, + "grad_norm": 1.9453125, + "grad_norm_var": 0.022638956705729168, + "learning_rate": 0.0001, + "loss": 3.7289, + "loss/crossentropy": 2.0030741095542908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19484290480613708, + "step": 20344 + }, + { + "epoch": 0.40692, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02196019490559896, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.402723550796509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21893122792243958, + "step": 20346 + }, + { + "epoch": 0.40696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0221343994140625, + "learning_rate": 0.0001, + "loss": 3.8234, + "loss/crossentropy": 2.077975869178772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20634907484054565, + "step": 20348 + }, + { + "epoch": 0.407, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015306599934895833, + "learning_rate": 0.0001, + "loss": 3.7596, + "loss/crossentropy": 1.86312997341156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18826880306005478, + "step": 20350 + }, + { + "epoch": 0.40704, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015225982666015625, + "learning_rate": 0.0001, + "loss": 3.982, + "loss/crossentropy": 2.090232253074646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20021233707666397, + "step": 20352 + }, + { + "epoch": 0.40708, + "grad_norm": 1.7890625, + "grad_norm_var": 0.015851847330729165, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.056272864341736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975172832608223, + "step": 20354 + }, + { + "epoch": 0.40712, + "grad_norm": 2.03125, + "grad_norm_var": 0.008689117431640626, + "learning_rate": 0.0001, + "loss": 4.2957, + "loss/crossentropy": 2.0955827832221985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951812580227852, + "step": 20356 + }, + { + "epoch": 0.40716, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007950846354166667, + "learning_rate": 0.0001, + "loss": 4.0598, + "loss/crossentropy": 1.6103880405426025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17600858211517334, + "step": 20358 + }, + { + "epoch": 0.4072, + "grad_norm": 1.9375, + "grad_norm_var": 0.00782470703125, + "learning_rate": 0.0001, + "loss": 4.1568, + "loss/crossentropy": 2.2216384410858154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22146118432283401, + "step": 20360 + }, + { + "epoch": 0.40724, + "grad_norm": 2.0625, + "grad_norm_var": 0.007033030192057292, + "learning_rate": 0.0001, + "loss": 4.3917, + "loss/crossentropy": 2.1781840324401855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19006534665822983, + "step": 20362 + }, + { + "epoch": 0.40728, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0074371337890625, + "learning_rate": 0.0001, + "loss": 3.9049, + "loss/crossentropy": 1.8517277240753174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17870192229747772, + "step": 20364 + }, + { + "epoch": 0.40732, + "grad_norm": 1.984375, + "grad_norm_var": 0.006105295817057292, + "learning_rate": 0.0001, + "loss": 4.1469, + "loss/crossentropy": 2.0296677947044373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20260153710842133, + "step": 20366 + }, + { + "epoch": 0.40736, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006046295166015625, + "learning_rate": 0.0001, + "loss": 4.0516, + "loss/crossentropy": 2.088002324104309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896708756685257, + "step": 20368 + }, + { + "epoch": 0.4074, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005122629801432291, + "learning_rate": 0.0001, + "loss": 4.0888, + "loss/crossentropy": 2.0305171608924866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1964031085371971, + "step": 20370 + }, + { + "epoch": 0.40744, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0046770731608072914, + "learning_rate": 0.0001, + "loss": 4.0663, + "loss/crossentropy": 1.9005139470100403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981399655342102, + "step": 20372 + }, + { + "epoch": 0.40748, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0036333719889322918, + "learning_rate": 0.0001, + "loss": 4.0646, + "loss/crossentropy": 2.1551105976104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21034018695354462, + "step": 20374 + }, + { + "epoch": 0.40752, + "grad_norm": 1.875, + "grad_norm_var": 0.00372314453125, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 2.0319311022758484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935891956090927, + "step": 20376 + }, + { + "epoch": 0.40756, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052530924479166664, + "learning_rate": 0.0001, + "loss": 4.028, + "loss/crossentropy": 2.06658136844635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19683608412742615, + "step": 20378 + }, + { + "epoch": 0.4076, + "grad_norm": 2.328125, + "grad_norm_var": 0.01574885050455729, + "learning_rate": 0.0001, + "loss": 4.5496, + "loss/crossentropy": 2.667450428009033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24288231134414673, + "step": 20380 + }, + { + "epoch": 0.40764, + "grad_norm": 1.7734375, + "grad_norm_var": 0.021061197916666666, + "learning_rate": 0.0001, + "loss": 3.6242, + "loss/crossentropy": 1.9372497200965881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18491743505001068, + "step": 20382 + }, + { + "epoch": 0.40768, + "grad_norm": 1.8359375, + "grad_norm_var": 0.021512858072916665, + "learning_rate": 0.0001, + "loss": 3.9358, + "loss/crossentropy": 1.96478271484375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18305277079343796, + "step": 20384 + }, + { + "epoch": 0.40772, + "grad_norm": 1.984375, + "grad_norm_var": 0.020096842447916666, + "learning_rate": 0.0001, + "loss": 4.0033, + "loss/crossentropy": 1.813466727733612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827232465147972, + "step": 20386 + }, + { + "epoch": 0.40776, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020420074462890625, + "learning_rate": 0.0001, + "loss": 3.9655, + "loss/crossentropy": 1.6925603151321411, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17452678084373474, + "step": 20388 + }, + { + "epoch": 0.4078, + "grad_norm": 2.046875, + "grad_norm_var": 0.02113622029622396, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.8859028220176697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187887504696846, + "step": 20390 + }, + { + "epoch": 0.40784, + "grad_norm": 2.046875, + "grad_norm_var": 0.02066218058268229, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 2.0689834356307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20971451699733734, + "step": 20392 + }, + { + "epoch": 0.40788, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0203033447265625, + "learning_rate": 0.0001, + "loss": 4.2332, + "loss/crossentropy": 2.237601161003113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20401395112276077, + "step": 20394 + }, + { + "epoch": 0.40792, + "grad_norm": 1.859375, + "grad_norm_var": 0.008560943603515624, + "learning_rate": 0.0001, + "loss": 3.9913, + "loss/crossentropy": 1.8701192736625671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192036435008049, + "step": 20396 + }, + { + "epoch": 0.40796, + "grad_norm": 1.890625, + "grad_norm_var": 0.005680084228515625, + "learning_rate": 0.0001, + "loss": 4.2942, + "loss/crossentropy": 2.3522990942001343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2178252711892128, + "step": 20398 + }, + { + "epoch": 0.408, + "grad_norm": 2.03125, + "grad_norm_var": 0.005069732666015625, + "learning_rate": 0.0001, + "loss": 4.245, + "loss/crossentropy": 2.267996072769165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22347165644168854, + "step": 20400 + }, + { + "epoch": 0.40804, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005208333333333333, + "learning_rate": 0.0001, + "loss": 4.079, + "loss/crossentropy": 2.274789035320282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143096998333931, + "step": 20402 + }, + { + "epoch": 0.40808, + "grad_norm": 2.015625, + "grad_norm_var": 0.004797108968098958, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 2.0331249237060547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19468723237514496, + "step": 20404 + }, + { + "epoch": 0.40812, + "grad_norm": 1.875, + "grad_norm_var": 0.004980214436848958, + "learning_rate": 0.0001, + "loss": 3.9698, + "loss/crossentropy": 1.9442223906517029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19499845802783966, + "step": 20406 + }, + { + "epoch": 0.40816, + "grad_norm": 1.96875, + "grad_norm_var": 0.00697021484375, + "learning_rate": 0.0001, + "loss": 3.6943, + "loss/crossentropy": 1.9122044444084167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19391249865293503, + "step": 20408 + }, + { + "epoch": 0.4082, + "grad_norm": 1.890625, + "grad_norm_var": 0.005651601155598958, + "learning_rate": 0.0001, + "loss": 3.9492, + "loss/crossentropy": 1.9939789175987244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18136083334684372, + "step": 20410 + }, + { + "epoch": 0.40824, + "grad_norm": 1.828125, + "grad_norm_var": 0.006036122639973958, + "learning_rate": 0.0001, + "loss": 3.7378, + "loss/crossentropy": 2.212642192840576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18931927531957626, + "step": 20412 + }, + { + "epoch": 0.40828, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006990559895833333, + "learning_rate": 0.0001, + "loss": 4.1713, + "loss/crossentropy": 1.8282644152641296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190102256834507, + "step": 20414 + }, + { + "epoch": 0.40832, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00745849609375, + "learning_rate": 0.0001, + "loss": 3.8334, + "loss/crossentropy": 2.176365852355957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19906923174858093, + "step": 20416 + }, + { + "epoch": 0.40836, + "grad_norm": 1.90625, + "grad_norm_var": 0.0073626200358072914, + "learning_rate": 0.0001, + "loss": 4.1297, + "loss/crossentropy": 2.1126151084899902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.180545412003994, + "step": 20418 + }, + { + "epoch": 0.4084, + "grad_norm": 1.8984375, + "grad_norm_var": 0.00784912109375, + "learning_rate": 0.0001, + "loss": 4.1878, + "loss/crossentropy": 1.834926426410675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1794508993625641, + "step": 20420 + }, + { + "epoch": 0.40844, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007746378580729167, + "learning_rate": 0.0001, + "loss": 4.0114, + "loss/crossentropy": 2.2909014225006104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21252857148647308, + "step": 20422 + }, + { + "epoch": 0.40848, + "grad_norm": 2.015625, + "grad_norm_var": 0.005866495768229166, + "learning_rate": 0.0001, + "loss": 4.3771, + "loss/crossentropy": 2.0966813564300537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197954960167408, + "step": 20424 + }, + { + "epoch": 0.40852, + "grad_norm": 1.890625, + "grad_norm_var": 0.005228424072265625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.0520461201667786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19458714127540588, + "step": 20426 + }, + { + "epoch": 0.40856, + "grad_norm": 2.046875, + "grad_norm_var": 0.006400553385416666, + "learning_rate": 0.0001, + "loss": 4.4592, + "loss/crossentropy": 2.2313653230667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.227764293551445, + "step": 20428 + }, + { + "epoch": 0.4086, + "grad_norm": 1.921875, + "grad_norm_var": 0.006956990559895833, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 2.2874799966812134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21484653651714325, + "step": 20430 + }, + { + "epoch": 0.40864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006400553385416666, + "learning_rate": 0.0001, + "loss": 4.1991, + "loss/crossentropy": 2.2390183806419373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21978556364774704, + "step": 20432 + }, + { + "epoch": 0.40868, + "grad_norm": 1.8125, + "grad_norm_var": 0.008742014567057291, + "learning_rate": 0.0001, + "loss": 4.0057, + "loss/crossentropy": 2.2040212154388428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21448630839586258, + "step": 20434 + }, + { + "epoch": 0.40872, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008356730143229166, + "learning_rate": 0.0001, + "loss": 4.1285, + "loss/crossentropy": 2.2119942903518677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19829177856445312, + "step": 20436 + }, + { + "epoch": 0.40876, + "grad_norm": 2.015625, + "grad_norm_var": 0.007897694905598959, + "learning_rate": 0.0001, + "loss": 3.819, + "loss/crossentropy": 1.769053339958191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17577596008777618, + "step": 20438 + }, + { + "epoch": 0.4088, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009226226806640625, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 2.1607295274734497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20044966787099838, + "step": 20440 + }, + { + "epoch": 0.40884, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009228515625, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 1.4168038368225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18440894782543182, + "step": 20442 + }, + { + "epoch": 0.40888, + "grad_norm": 2.0, + "grad_norm_var": 0.005490875244140625, + "learning_rate": 0.0001, + "loss": 4.2437, + "loss/crossentropy": 2.1797818541526794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22209269553422928, + "step": 20444 + }, + { + "epoch": 0.40892, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005086008707682292, + "learning_rate": 0.0001, + "loss": 4.1425, + "loss/crossentropy": 2.255184292793274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536670833826065, + "step": 20446 + }, + { + "epoch": 0.40896, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005191802978515625, + "learning_rate": 0.0001, + "loss": 4.1206, + "loss/crossentropy": 1.9633015990257263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20300059020519257, + "step": 20448 + }, + { + "epoch": 0.409, + "grad_norm": 1.8984375, + "grad_norm_var": 0.002685292561848958, + "learning_rate": 0.0001, + "loss": 3.9409, + "loss/crossentropy": 2.0943238735198975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932680904865265, + "step": 20450 + }, + { + "epoch": 0.40904, + "grad_norm": 2.1875, + "grad_norm_var": 0.0073321024576822914, + "learning_rate": 0.0001, + "loss": 4.3528, + "loss/crossentropy": 2.3319740295410156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24838463962078094, + "step": 20452 + }, + { + "epoch": 0.40908, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0081787109375, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 2.0098442435264587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818133860826492, + "step": 20454 + }, + { + "epoch": 0.40912, + "grad_norm": 2.40625, + "grad_norm_var": 0.020344034830729166, + "learning_rate": 0.0001, + "loss": 4.048, + "loss/crossentropy": 1.9752068519592285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243104547262192, + "step": 20456 + }, + { + "epoch": 0.40916, + "grad_norm": 2.015625, + "grad_norm_var": 0.024006144205729166, + "learning_rate": 0.0001, + "loss": 4.2057, + "loss/crossentropy": 2.2999367713928223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22019093483686447, + "step": 20458 + }, + { + "epoch": 0.4092, + "grad_norm": 1.8828125, + "grad_norm_var": 0.026395416259765624, + "learning_rate": 0.0001, + "loss": 3.9169, + "loss/crossentropy": 1.770751178264618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787494421005249, + "step": 20460 + }, + { + "epoch": 0.40924, + "grad_norm": 1.953125, + "grad_norm_var": 0.026667277018229168, + "learning_rate": 0.0001, + "loss": 3.9317, + "loss/crossentropy": 2.0412501096725464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063281610608101, + "step": 20462 + }, + { + "epoch": 0.40928, + "grad_norm": 1.9140625, + "grad_norm_var": 0.027733357747395833, + "learning_rate": 0.0001, + "loss": 3.9852, + "loss/crossentropy": 2.3771307468414307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21928168088197708, + "step": 20464 + }, + { + "epoch": 0.40932, + "grad_norm": 1.9765625, + "grad_norm_var": 0.027551015218098957, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 1.91280996799469, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17027543485164642, + "step": 20466 + }, + { + "epoch": 0.40936, + "grad_norm": 1.8671875, + "grad_norm_var": 0.028831990559895833, + "learning_rate": 0.0001, + "loss": 3.9003, + "loss/crossentropy": 2.024249494075775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20218047499656677, + "step": 20468 + }, + { + "epoch": 0.4094, + "grad_norm": 2.28125, + "grad_norm_var": 0.03514989217122396, + "learning_rate": 0.0001, + "loss": 4.0608, + "loss/crossentropy": 1.787394940853119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18133901804685593, + "step": 20470 + }, + { + "epoch": 0.40944, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02319920857747396, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.085790276527405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18643641471862793, + "step": 20472 + }, + { + "epoch": 0.40948, + "grad_norm": 1.953125, + "grad_norm_var": 0.017929840087890624, + "learning_rate": 0.0001, + "loss": 4.2251, + "loss/crossentropy": 2.1670111417770386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19420959055423737, + "step": 20474 + }, + { + "epoch": 0.40952, + "grad_norm": 1.875, + "grad_norm_var": 0.01706720987955729, + "learning_rate": 0.0001, + "loss": 4.0588, + "loss/crossentropy": 2.0304930210113525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18586499989032745, + "step": 20476 + }, + { + "epoch": 0.40956, + "grad_norm": 2.0625, + "grad_norm_var": 0.017832183837890626, + "learning_rate": 0.0001, + "loss": 4.2371, + "loss/crossentropy": 2.0918440222740173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831007301807404, + "step": 20478 + }, + { + "epoch": 0.4096, + "grad_norm": 1.8671875, + "grad_norm_var": 0.017765045166015625, + "learning_rate": 0.0001, + "loss": 3.8786, + "loss/crossentropy": 2.027481257915497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19717323035001755, + "step": 20480 + }, + { + "epoch": 0.40964, + "grad_norm": 1.890625, + "grad_norm_var": 0.017333984375, + "learning_rate": 0.0001, + "loss": 4.3524, + "loss/crossentropy": 2.227196455001831, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21321603655815125, + "step": 20482 + }, + { + "epoch": 0.40968, + "grad_norm": 2.078125, + "grad_norm_var": 0.015329742431640625, + "learning_rate": 0.0001, + "loss": 3.9722, + "loss/crossentropy": 1.8985567688941956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271692633628845, + "step": 20484 + }, + { + "epoch": 0.40972, + "grad_norm": 2.203125, + "grad_norm_var": 0.011498006184895833, + "learning_rate": 0.0001, + "loss": 4.1364, + "loss/crossentropy": 1.973772943019867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18885072320699692, + "step": 20486 + }, + { + "epoch": 0.40976, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01171875, + "learning_rate": 0.0001, + "loss": 3.8451, + "loss/crossentropy": 2.002311408519745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053375542163849, + "step": 20488 + }, + { + "epoch": 0.4098, + "grad_norm": 2.09375, + "grad_norm_var": 0.0702301025390625, + "learning_rate": 0.0001, + "loss": 3.8449, + "loss/crossentropy": 1.7301526069641113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16717776656150818, + "step": 20490 + }, + { + "epoch": 0.40984, + "grad_norm": 2.015625, + "grad_norm_var": 0.0685546875, + "learning_rate": 0.0001, + "loss": 4.1871, + "loss/crossentropy": 1.86936616897583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944454312324524, + "step": 20492 + }, + { + "epoch": 0.40988, + "grad_norm": 1.9765625, + "grad_norm_var": 0.06875381469726563, + "learning_rate": 0.0001, + "loss": 4.1013, + "loss/crossentropy": 1.5966055393218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19996708631515503, + "step": 20494 + }, + { + "epoch": 0.40992, + "grad_norm": 2.125, + "grad_norm_var": 0.0671783447265625, + "learning_rate": 0.0001, + "loss": 3.9716, + "loss/crossentropy": 1.8741289377212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18652669340372086, + "step": 20496 + }, + { + "epoch": 0.40996, + "grad_norm": 1.921875, + "grad_norm_var": 0.06702372233072916, + "learning_rate": 0.0001, + "loss": 4.0857, + "loss/crossentropy": 1.958758294582367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19158867746591568, + "step": 20498 + }, + { + "epoch": 0.41, + "grad_norm": 1.953125, + "grad_norm_var": 0.06591771443684896, + "learning_rate": 0.0001, + "loss": 3.7938, + "loss/crossentropy": 1.8465643525123596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774417608976364, + "step": 20500 + }, + { + "epoch": 0.41004, + "grad_norm": 2.015625, + "grad_norm_var": 0.06303609212239583, + "learning_rate": 0.0001, + "loss": 4.0265, + "loss/crossentropy": 1.9578893780708313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19635465741157532, + "step": 20502 + }, + { + "epoch": 0.41008, + "grad_norm": 1.875, + "grad_norm_var": 0.06118977864583333, + "learning_rate": 0.0001, + "loss": 3.7317, + "loss/crossentropy": 1.8893834352493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1778636872768402, + "step": 20504 + }, + { + "epoch": 0.41012, + "grad_norm": 1.8125, + "grad_norm_var": 0.011189778645833334, + "learning_rate": 0.0001, + "loss": 4.1129, + "loss/crossentropy": 2.2161590456962585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19613313674926758, + "step": 20506 + }, + { + "epoch": 0.41016, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011368560791015624, + "learning_rate": 0.0001, + "loss": 4.2363, + "loss/crossentropy": 2.1272462606430054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562685281038284, + "step": 20508 + }, + { + "epoch": 0.4102, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011319732666015625, + "learning_rate": 0.0001, + "loss": 3.9087, + "loss/crossentropy": 1.8443754315376282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20733944326639175, + "step": 20510 + }, + { + "epoch": 0.41024, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012007649739583333, + "learning_rate": 0.0001, + "loss": 3.8766, + "loss/crossentropy": 1.8213927149772644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17855563014745712, + "step": 20512 + }, + { + "epoch": 0.41028, + "grad_norm": 2.03125, + "grad_norm_var": 0.009787750244140626, + "learning_rate": 0.0001, + "loss": 4.0914, + "loss/crossentropy": 1.969380497932434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20201627910137177, + "step": 20514 + }, + { + "epoch": 0.41032, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009479777018229166, + "learning_rate": 0.0001, + "loss": 4.0491, + "loss/crossentropy": 1.8974932432174683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20217570662498474, + "step": 20516 + }, + { + "epoch": 0.41036, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010312652587890625, + "learning_rate": 0.0001, + "loss": 4.0019, + "loss/crossentropy": 1.9362964630126953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1832883059978485, + "step": 20518 + }, + { + "epoch": 0.4104, + "grad_norm": 2.078125, + "grad_norm_var": 0.010754140218098958, + "learning_rate": 0.0001, + "loss": 4.2071, + "loss/crossentropy": 2.239969849586487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22399117052555084, + "step": 20520 + }, + { + "epoch": 0.41044, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004987589518229167, + "learning_rate": 0.0001, + "loss": 3.9606, + "loss/crossentropy": 1.927116334438324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194569431245327, + "step": 20522 + }, + { + "epoch": 0.41048, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00474853515625, + "learning_rate": 0.0001, + "loss": 4.0993, + "loss/crossentropy": 1.921772539615631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19974206387996674, + "step": 20524 + }, + { + "epoch": 0.41052, + "grad_norm": 2.046875, + "grad_norm_var": 0.00509033203125, + "learning_rate": 0.0001, + "loss": 3.9591, + "loss/crossentropy": 2.2263039350509644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088855803012848, + "step": 20526 + }, + { + "epoch": 0.41056, + "grad_norm": 1.90625, + "grad_norm_var": 0.00343017578125, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 1.9309766292572021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984400451183319, + "step": 20528 + }, + { + "epoch": 0.4106, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 4.3564, + "loss/crossentropy": 2.2890199422836304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210438072681427, + "step": 20530 + }, + { + "epoch": 0.41064, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007199859619140625, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 1.961566150188446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20430031418800354, + "step": 20532 + }, + { + "epoch": 0.41068, + "grad_norm": 2.03125, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.0714, + "loss/crossentropy": 1.9776958227157593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20376411080360413, + "step": 20534 + }, + { + "epoch": 0.41072, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005417633056640625, + "learning_rate": 0.0001, + "loss": 3.9352, + "loss/crossentropy": 2.386608600616455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20900936424732208, + "step": 20536 + }, + { + "epoch": 0.41076, + "grad_norm": 1.828125, + "grad_norm_var": 0.007233683268229167, + "learning_rate": 0.0001, + "loss": 3.8075, + "loss/crossentropy": 2.0592793226242065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19810590893030167, + "step": 20538 + }, + { + "epoch": 0.4108, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 4.2639, + "loss/crossentropy": 2.176212787628174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20612400770187378, + "step": 20540 + }, + { + "epoch": 0.41084, + "grad_norm": 1.96875, + "grad_norm_var": 0.007134755452473958, + "learning_rate": 0.0001, + "loss": 4.0386, + "loss/crossentropy": 1.920596957206726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19120831042528152, + "step": 20542 + }, + { + "epoch": 0.41088, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0086578369140625, + "learning_rate": 0.0001, + "loss": 4.0088, + "loss/crossentropy": 1.8587758541107178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1729830950498581, + "step": 20544 + }, + { + "epoch": 0.41092, + "grad_norm": 1.96875, + "grad_norm_var": 0.004538726806640625, + "learning_rate": 0.0001, + "loss": 4.1263, + "loss/crossentropy": 2.115676999092102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130855768918991, + "step": 20546 + }, + { + "epoch": 0.41096, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0043332417805989586, + "learning_rate": 0.0001, + "loss": 4.1933, + "loss/crossentropy": 2.3977283239364624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074846476316452, + "step": 20548 + }, + { + "epoch": 0.411, + "grad_norm": 1.7421875, + "grad_norm_var": 0.005700429280598958, + "learning_rate": 0.0001, + "loss": 3.9028, + "loss/crossentropy": 2.091398298740387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975005343556404, + "step": 20550 + }, + { + "epoch": 0.41104, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005861155192057292, + "learning_rate": 0.0001, + "loss": 4.1509, + "loss/crossentropy": 2.1864093542099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2201247364282608, + "step": 20552 + }, + { + "epoch": 0.41108, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005206044514973958, + "learning_rate": 0.0001, + "loss": 3.8462, + "loss/crossentropy": 1.9154713153839111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18139449506998062, + "step": 20554 + }, + { + "epoch": 0.41112, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005682118733723958, + "learning_rate": 0.0001, + "loss": 4.0527, + "loss/crossentropy": 2.392449378967285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069767862558365, + "step": 20556 + }, + { + "epoch": 0.41116, + "grad_norm": 2.09375, + "grad_norm_var": 0.0071604410807291664, + "learning_rate": 0.0001, + "loss": 4.0525, + "loss/crossentropy": 1.9300146102905273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20329315215349197, + "step": 20558 + }, + { + "epoch": 0.4112, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007313791910807292, + "learning_rate": 0.0001, + "loss": 3.6521, + "loss/crossentropy": 1.9400085806846619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855112686753273, + "step": 20560 + }, + { + "epoch": 0.41124, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008321126302083334, + "learning_rate": 0.0001, + "loss": 4.1355, + "loss/crossentropy": 2.1216121912002563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1966063678264618, + "step": 20562 + }, + { + "epoch": 0.41128, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008154042561848958, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 2.136322498321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19735725224018097, + "step": 20564 + }, + { + "epoch": 0.41132, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0060943603515625, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 2.1143118143081665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21534033119678497, + "step": 20566 + }, + { + "epoch": 0.41136, + "grad_norm": 2.046875, + "grad_norm_var": 0.006577301025390625, + "learning_rate": 0.0001, + "loss": 3.9451, + "loss/crossentropy": 1.8363113403320312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17482534050941467, + "step": 20568 + }, + { + "epoch": 0.4114, + "grad_norm": 1.9453125, + "grad_norm_var": 0.00645751953125, + "learning_rate": 0.0001, + "loss": 4.0373, + "loss/crossentropy": 1.8840075135231018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19201497733592987, + "step": 20570 + }, + { + "epoch": 0.41144, + "grad_norm": 2.0, + "grad_norm_var": 0.006270090738932292, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.0200312733650208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130940854549408, + "step": 20572 + }, + { + "epoch": 0.41148, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005078125, + "learning_rate": 0.0001, + "loss": 3.8886, + "loss/crossentropy": 2.136338949203491, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22277235239744186, + "step": 20574 + }, + { + "epoch": 0.41152, + "grad_norm": 2.078125, + "grad_norm_var": 0.004168446858723958, + "learning_rate": 0.0001, + "loss": 4.2314, + "loss/crossentropy": 2.2381343841552734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20784687995910645, + "step": 20576 + }, + { + "epoch": 0.41156, + "grad_norm": 2.0, + "grad_norm_var": 0.003780110677083333, + "learning_rate": 0.0001, + "loss": 4.0749, + "loss/crossentropy": 2.224187970161438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20546535402536392, + "step": 20578 + }, + { + "epoch": 0.4116, + "grad_norm": 1.875, + "grad_norm_var": 0.004117838541666667, + "learning_rate": 0.0001, + "loss": 3.5968, + "loss/crossentropy": 1.966045081615448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19925507158041, + "step": 20580 + }, + { + "epoch": 0.41164, + "grad_norm": 1.984375, + "grad_norm_var": 0.0036041259765625, + "learning_rate": 0.0001, + "loss": 3.6756, + "loss/crossentropy": 1.7995998859405518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1794610470533371, + "step": 20582 + }, + { + "epoch": 0.41168, + "grad_norm": 2.3125, + "grad_norm_var": 0.011474609375, + "learning_rate": 0.0001, + "loss": 4.2112, + "loss/crossentropy": 1.874330759048462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23802601546049118, + "step": 20584 + }, + { + "epoch": 0.41172, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011582183837890624, + "learning_rate": 0.0001, + "loss": 3.9275, + "loss/crossentropy": 2.1704328060150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792975068092346, + "step": 20586 + }, + { + "epoch": 0.41176, + "grad_norm": 2.046875, + "grad_norm_var": 0.012648264567057291, + "learning_rate": 0.0001, + "loss": 4.0073, + "loss/crossentropy": 1.9361745119094849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19227440655231476, + "step": 20588 + }, + { + "epoch": 0.4118, + "grad_norm": 2.0625, + "grad_norm_var": 0.0129058837890625, + "learning_rate": 0.0001, + "loss": 4.297, + "loss/crossentropy": 2.19529128074646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23097511380910873, + "step": 20590 + }, + { + "epoch": 0.41184, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012621815999348958, + "learning_rate": 0.0001, + "loss": 4.1085, + "loss/crossentropy": 2.0319225788116455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19555176049470901, + "step": 20592 + }, + { + "epoch": 0.41188, + "grad_norm": 1.890625, + "grad_norm_var": 0.012393951416015625, + "learning_rate": 0.0001, + "loss": 4.0767, + "loss/crossentropy": 2.395743250846863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20953691750764847, + "step": 20594 + }, + { + "epoch": 0.41192, + "grad_norm": 1.796875, + "grad_norm_var": 0.013901519775390624, + "learning_rate": 0.0001, + "loss": 3.8942, + "loss/crossentropy": 1.7772584557533264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19968461245298386, + "step": 20596 + }, + { + "epoch": 0.41196, + "grad_norm": 1.90625, + "grad_norm_var": 0.0158843994140625, + "learning_rate": 0.0001, + "loss": 3.7776, + "loss/crossentropy": 1.89765065908432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18085069954395294, + "step": 20598 + }, + { + "epoch": 0.412, + "grad_norm": 1.796875, + "grad_norm_var": 0.007846832275390625, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 2.1154285073280334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126362890005112, + "step": 20600 + }, + { + "epoch": 0.41204, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008259073893229166, + "learning_rate": 0.0001, + "loss": 4.2659, + "loss/crossentropy": 2.15169358253479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20982082933187485, + "step": 20602 + }, + { + "epoch": 0.41208, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007824452718098958, + "learning_rate": 0.0001, + "loss": 3.9542, + "loss/crossentropy": 1.8561811447143555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18480198085308075, + "step": 20604 + }, + { + "epoch": 0.41212, + "grad_norm": 1.96875, + "grad_norm_var": 0.008766428629557291, + "learning_rate": 0.0001, + "loss": 3.8989, + "loss/crossentropy": 1.8246020078659058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18010297417640686, + "step": 20606 + }, + { + "epoch": 0.41216, + "grad_norm": 2.015625, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 4.1666, + "loss/crossentropy": 2.3279634714126587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224753275513649, + "step": 20608 + }, + { + "epoch": 0.4122, + "grad_norm": 2.0, + "grad_norm_var": 0.009171295166015624, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.122196078300476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060379832983017, + "step": 20610 + }, + { + "epoch": 0.41224, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009666951497395833, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 2.0729124546051025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20537251234054565, + "step": 20612 + }, + { + "epoch": 0.41228, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008443196614583334, + "learning_rate": 0.0001, + "loss": 4.2078, + "loss/crossentropy": 2.1832374930381775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19204121828079224, + "step": 20614 + }, + { + "epoch": 0.41232, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007982381184895833, + "learning_rate": 0.0001, + "loss": 3.8745, + "loss/crossentropy": 1.6739779114723206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15309840440750122, + "step": 20616 + }, + { + "epoch": 0.41236, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008754221598307292, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 2.1585883498191833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20271800458431244, + "step": 20618 + }, + { + "epoch": 0.4124, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008235422770182292, + "learning_rate": 0.0001, + "loss": 4.4289, + "loss/crossentropy": 2.391141653060913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2225441113114357, + "step": 20620 + }, + { + "epoch": 0.41244, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0053619384765625, + "learning_rate": 0.0001, + "loss": 4.1355, + "loss/crossentropy": 2.055308997631073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839761957526207, + "step": 20622 + }, + { + "epoch": 0.41248, + "grad_norm": 2.015625, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 4.1143, + "loss/crossentropy": 2.1222954988479614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18773587048053741, + "step": 20624 + }, + { + "epoch": 0.41252, + "grad_norm": 2.0625, + "grad_norm_var": 0.005647532145182292, + "learning_rate": 0.0001, + "loss": 4.1506, + "loss/crossentropy": 2.011850595474243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19865088164806366, + "step": 20626 + }, + { + "epoch": 0.41256, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0039784749348958336, + "learning_rate": 0.0001, + "loss": 4.0122, + "loss/crossentropy": 1.841840922832489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18899217247962952, + "step": 20628 + }, + { + "epoch": 0.4126, + "grad_norm": 1.953125, + "grad_norm_var": 0.004313151041666667, + "learning_rate": 0.0001, + "loss": 4.2477, + "loss/crossentropy": 2.335289478302002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22496683150529861, + "step": 20630 + }, + { + "epoch": 0.41264, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004487864176432292, + "learning_rate": 0.0001, + "loss": 3.9252, + "loss/crossentropy": 2.0089204907417297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18921320140361786, + "step": 20632 + }, + { + "epoch": 0.41268, + "grad_norm": 2.0625, + "grad_norm_var": 0.004146067301432291, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 2.115709662437439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033844143152237, + "step": 20634 + }, + { + "epoch": 0.41272, + "grad_norm": 1.921875, + "grad_norm_var": 0.004282379150390625, + "learning_rate": 0.0001, + "loss": 4.0311, + "loss/crossentropy": 2.175198018550873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20709756761789322, + "step": 20636 + }, + { + "epoch": 0.41276, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.128443717956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18793444335460663, + "step": 20638 + }, + { + "epoch": 0.4128, + "grad_norm": 1.90625, + "grad_norm_var": 0.005830891927083333, + "learning_rate": 0.0001, + "loss": 3.9456, + "loss/crossentropy": 1.890213668346405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17856723070144653, + "step": 20640 + }, + { + "epoch": 0.41284, + "grad_norm": 2.125, + "grad_norm_var": 0.008283487955729167, + "learning_rate": 0.0001, + "loss": 4.1055, + "loss/crossentropy": 1.925938606262207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19046395272016525, + "step": 20642 + }, + { + "epoch": 0.41288, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008442942301432292, + "learning_rate": 0.0001, + "loss": 4.2369, + "loss/crossentropy": 2.0149936079978943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20876548439264297, + "step": 20644 + }, + { + "epoch": 0.41292, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008174641927083334, + "learning_rate": 0.0001, + "loss": 4.1904, + "loss/crossentropy": 2.237368941307068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20562848448753357, + "step": 20646 + }, + { + "epoch": 0.41296, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008451080322265625, + "learning_rate": 0.0001, + "loss": 3.8332, + "loss/crossentropy": 2.0298978090286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20426444709300995, + "step": 20648 + }, + { + "epoch": 0.413, + "grad_norm": 2.09375, + "grad_norm_var": 0.0090240478515625, + "learning_rate": 0.0001, + "loss": 3.872, + "loss/crossentropy": 1.8883816599845886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18203142285346985, + "step": 20650 + }, + { + "epoch": 0.41304, + "grad_norm": 2.25, + "grad_norm_var": 0.014288075764973958, + "learning_rate": 0.0001, + "loss": 4.0831, + "loss/crossentropy": 2.051727771759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19955651462078094, + "step": 20652 + }, + { + "epoch": 0.41308, + "grad_norm": 2.015625, + "grad_norm_var": 0.013335927327473959, + "learning_rate": 0.0001, + "loss": 4.1298, + "loss/crossentropy": 2.3192938566207886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23360159993171692, + "step": 20654 + }, + { + "epoch": 0.41312, + "grad_norm": 1.859375, + "grad_norm_var": 0.014647420247395833, + "learning_rate": 0.0001, + "loss": 3.8486, + "loss/crossentropy": 2.2148059606552124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20121797919273376, + "step": 20656 + }, + { + "epoch": 0.41316, + "grad_norm": 2.015625, + "grad_norm_var": 0.011787923177083333, + "learning_rate": 0.0001, + "loss": 4.2523, + "loss/crossentropy": 2.0665117502212524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18279288709163666, + "step": 20658 + }, + { + "epoch": 0.4132, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013060506184895833, + "learning_rate": 0.0001, + "loss": 3.8782, + "loss/crossentropy": 2.0858163833618164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19623829424381256, + "step": 20660 + }, + { + "epoch": 0.41324, + "grad_norm": 1.875, + "grad_norm_var": 0.013630167643229166, + "learning_rate": 0.0001, + "loss": 3.8134, + "loss/crossentropy": 2.238997220993042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20264007151126862, + "step": 20662 + }, + { + "epoch": 0.41328, + "grad_norm": 2.0625, + "grad_norm_var": 0.016605377197265625, + "learning_rate": 0.0001, + "loss": 4.2317, + "loss/crossentropy": 2.2540037631988525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315360069274902, + "step": 20664 + }, + { + "epoch": 0.41332, + "grad_norm": 2.03125, + "grad_norm_var": 0.014975738525390626, + "learning_rate": 0.0001, + "loss": 4.1657, + "loss/crossentropy": 2.3957645893096924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319154515862465, + "step": 20666 + }, + { + "epoch": 0.41336, + "grad_norm": 2.015625, + "grad_norm_var": 0.009527333577473958, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 2.046703338623047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992144137620926, + "step": 20668 + }, + { + "epoch": 0.4134, + "grad_norm": 1.9375, + "grad_norm_var": 0.010677083333333334, + "learning_rate": 0.0001, + "loss": 3.7664, + "loss/crossentropy": 1.7997825145721436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1673276051878929, + "step": 20670 + }, + { + "epoch": 0.41344, + "grad_norm": 2.03125, + "grad_norm_var": 0.009069569905598958, + "learning_rate": 0.0001, + "loss": 3.9432, + "loss/crossentropy": 2.102105975151062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19461503624916077, + "step": 20672 + }, + { + "epoch": 0.41348, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009557851155598958, + "learning_rate": 0.0001, + "loss": 3.8547, + "loss/crossentropy": 1.7070594429969788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17999255657196045, + "step": 20674 + }, + { + "epoch": 0.41352, + "grad_norm": 1.890625, + "grad_norm_var": 0.00869140625, + "learning_rate": 0.0001, + "loss": 4.0361, + "loss/crossentropy": 2.1908987760543823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010250836610794, + "step": 20676 + }, + { + "epoch": 0.41356, + "grad_norm": 2.046875, + "grad_norm_var": 0.007582346598307292, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.1943222284317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21789705008268356, + "step": 20678 + }, + { + "epoch": 0.4136, + "grad_norm": 1.9609375, + "grad_norm_var": 0.003842926025390625, + "learning_rate": 0.0001, + "loss": 4.2444, + "loss/crossentropy": 2.1216484904289246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20987947285175323, + "step": 20680 + }, + { + "epoch": 0.41364, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038889567057291668, + "learning_rate": 0.0001, + "loss": 4.0256, + "loss/crossentropy": 2.3570865392684937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23686359077692032, + "step": 20682 + }, + { + "epoch": 0.41368, + "grad_norm": 1.953125, + "grad_norm_var": 0.0040771484375, + "learning_rate": 0.0001, + "loss": 4.1448, + "loss/crossentropy": 1.834035336971283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17401690781116486, + "step": 20684 + }, + { + "epoch": 0.41372, + "grad_norm": 2.109375, + "grad_norm_var": 0.0040891011555989586, + "learning_rate": 0.0001, + "loss": 4.2049, + "loss/crossentropy": 2.251484513282776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22543959319591522, + "step": 20686 + }, + { + "epoch": 0.41376, + "grad_norm": 1.9375, + "grad_norm_var": 0.004571278889973958, + "learning_rate": 0.0001, + "loss": 4.3775, + "loss/crossentropy": 2.233001708984375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18968196213245392, + "step": 20688 + }, + { + "epoch": 0.4138, + "grad_norm": 1.96875, + "grad_norm_var": 0.004198964436848958, + "learning_rate": 0.0001, + "loss": 3.9482, + "loss/crossentropy": 1.6730775833129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740739718079567, + "step": 20690 + }, + { + "epoch": 0.41384, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0100341796875, + "learning_rate": 0.0001, + "loss": 3.7717, + "loss/crossentropy": 2.3700443506240845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20850200206041336, + "step": 20692 + }, + { + "epoch": 0.41388, + "grad_norm": 2.0, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 4.2457, + "loss/crossentropy": 2.253411650657654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19234959036111832, + "step": 20694 + }, + { + "epoch": 0.41392, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011836751302083334, + "learning_rate": 0.0001, + "loss": 3.9595, + "loss/crossentropy": 2.414412260055542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20320549607276917, + "step": 20696 + }, + { + "epoch": 0.41396, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012562815348307292, + "learning_rate": 0.0001, + "loss": 3.8084, + "loss/crossentropy": 1.8252301216125488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211787730455399, + "step": 20698 + }, + { + "epoch": 0.414, + "grad_norm": 2.015625, + "grad_norm_var": 0.012387847900390625, + "learning_rate": 0.0001, + "loss": 4.123, + "loss/crossentropy": 2.210301160812378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21196797490119934, + "step": 20700 + }, + { + "epoch": 0.41404, + "grad_norm": 2.15625, + "grad_norm_var": 0.013480377197265626, + "learning_rate": 0.0001, + "loss": 4.0629, + "loss/crossentropy": 1.6429123878479004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15947452187538147, + "step": 20702 + }, + { + "epoch": 0.41408, + "grad_norm": 1.84375, + "grad_norm_var": 0.014218902587890625, + "learning_rate": 0.0001, + "loss": 3.9954, + "loss/crossentropy": 1.8459800481796265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18668671697378159, + "step": 20704 + }, + { + "epoch": 0.41412, + "grad_norm": 1.921875, + "grad_norm_var": 0.014147694905598958, + "learning_rate": 0.0001, + "loss": 3.9468, + "loss/crossentropy": 2.20779287815094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2034989595413208, + "step": 20706 + }, + { + "epoch": 0.41416, + "grad_norm": 1.9140625, + "grad_norm_var": 0.03316218058268229, + "learning_rate": 0.0001, + "loss": 4.0503, + "loss/crossentropy": 1.9622939229011536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19208793342113495, + "step": 20708 + }, + { + "epoch": 0.4142, + "grad_norm": 1.9140625, + "grad_norm_var": 0.032990519205729166, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 1.9145439267158508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17929863929748535, + "step": 20710 + }, + { + "epoch": 0.41424, + "grad_norm": 1.6953125, + "grad_norm_var": 0.03706029256184896, + "learning_rate": 0.0001, + "loss": 3.9202, + "loss/crossentropy": 1.9035375714302063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21188101917505264, + "step": 20712 + }, + { + "epoch": 0.41428, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0380767822265625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.0500373244285583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2162824273109436, + "step": 20714 + }, + { + "epoch": 0.41432, + "grad_norm": 2.0625, + "grad_norm_var": 0.03862075805664063, + "learning_rate": 0.0001, + "loss": 4.1101, + "loss/crossentropy": 1.9774840474128723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070379972457886, + "step": 20716 + }, + { + "epoch": 0.41436, + "grad_norm": 2.125, + "grad_norm_var": 0.03785985310872396, + "learning_rate": 0.0001, + "loss": 4.3021, + "loss/crossentropy": 1.9861173629760742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038750231266022, + "step": 20718 + }, + { + "epoch": 0.4144, + "grad_norm": 1.875, + "grad_norm_var": 0.053098297119140624, + "learning_rate": 0.0001, + "loss": 3.669, + "loss/crossentropy": 1.9744009375572205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013763219118118, + "step": 20720 + }, + { + "epoch": 0.41444, + "grad_norm": 1.8515625, + "grad_norm_var": 0.054441070556640624, + "learning_rate": 0.0001, + "loss": 3.9343, + "loss/crossentropy": 1.7110218405723572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1715855970978737, + "step": 20722 + }, + { + "epoch": 0.41448, + "grad_norm": 2.015625, + "grad_norm_var": 0.031556955973307294, + "learning_rate": 0.0001, + "loss": 3.9484, + "loss/crossentropy": 1.8336329460144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19409292936325073, + "step": 20724 + }, + { + "epoch": 0.41452, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03246027628580729, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.8363453149795532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16922436654567719, + "step": 20726 + }, + { + "epoch": 0.41456, + "grad_norm": 2.09375, + "grad_norm_var": 0.025699869791666666, + "learning_rate": 0.0001, + "loss": 4.1121, + "loss/crossentropy": 1.8911715745925903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19849538803100586, + "step": 20728 + }, + { + "epoch": 0.4146, + "grad_norm": 1.8359375, + "grad_norm_var": 0.02716064453125, + "learning_rate": 0.0001, + "loss": 3.8365, + "loss/crossentropy": 1.7989731431007385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18458950519561768, + "step": 20730 + }, + { + "epoch": 0.41464, + "grad_norm": 1.921875, + "grad_norm_var": 0.02692235310872396, + "learning_rate": 0.0001, + "loss": 3.9896, + "loss/crossentropy": 1.9423270225524902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18612459301948547, + "step": 20732 + }, + { + "epoch": 0.41468, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06059137980143229, + "learning_rate": 0.0001, + "loss": 3.7741, + "loss/crossentropy": 1.7587624788284302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1545337438583374, + "step": 20734 + }, + { + "epoch": 0.41472, + "grad_norm": 1.8984375, + "grad_norm_var": 0.04381510416666667, + "learning_rate": 0.0001, + "loss": 3.9651, + "loss/crossentropy": 2.314169406890869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22612107545137405, + "step": 20736 + }, + { + "epoch": 0.41476, + "grad_norm": 2.421875, + "grad_norm_var": 0.0535308837890625, + "learning_rate": 0.0001, + "loss": 4.272, + "loss/crossentropy": 2.088030219078064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19648675620555878, + "step": 20738 + }, + { + "epoch": 0.4148, + "grad_norm": 2.03125, + "grad_norm_var": 0.05324071248372396, + "learning_rate": 0.0001, + "loss": 3.8286, + "loss/crossentropy": 1.6665831208229065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17022767663002014, + "step": 20740 + }, + { + "epoch": 0.41484, + "grad_norm": 1.9765625, + "grad_norm_var": 0.05115458170572917, + "learning_rate": 0.0001, + "loss": 4.1142, + "loss/crossentropy": 2.425256609916687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20156945288181305, + "step": 20742 + }, + { + "epoch": 0.41488, + "grad_norm": 2.0, + "grad_norm_var": 0.05110244750976563, + "learning_rate": 0.0001, + "loss": 3.8343, + "loss/crossentropy": 1.6360740661621094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16550948470830917, + "step": 20744 + }, + { + "epoch": 0.41492, + "grad_norm": 2.078125, + "grad_norm_var": 0.044960276285807295, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.0676532983779907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21137626469135284, + "step": 20746 + }, + { + "epoch": 0.41496, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04678141276041667, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 2.059161067008972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20899715274572372, + "step": 20748 + }, + { + "epoch": 0.415, + "grad_norm": 1.9296875, + "grad_norm_var": 0.017268625895182292, + "learning_rate": 0.0001, + "loss": 4.1438, + "loss/crossentropy": 2.1778156757354736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495261996984482, + "step": 20750 + }, + { + "epoch": 0.41504, + "grad_norm": 2.125, + "grad_norm_var": 0.016877237955729166, + "learning_rate": 0.0001, + "loss": 4.2809, + "loss/crossentropy": 2.368219017982483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21667053550481796, + "step": 20752 + }, + { + "epoch": 0.41508, + "grad_norm": 1.828125, + "grad_norm_var": 0.006703440348307292, + "learning_rate": 0.0001, + "loss": 3.7872, + "loss/crossentropy": 1.9143288731575012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1782715767621994, + "step": 20754 + }, + { + "epoch": 0.41512, + "grad_norm": 2.1875, + "grad_norm_var": 0.009388987223307292, + "learning_rate": 0.0001, + "loss": 4.2937, + "loss/crossentropy": 1.7776128649711609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1746697723865509, + "step": 20756 + }, + { + "epoch": 0.41516, + "grad_norm": 1.90625, + "grad_norm_var": 0.009824371337890625, + "learning_rate": 0.0001, + "loss": 3.9991, + "loss/crossentropy": 2.057144343852997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18555855751037598, + "step": 20758 + }, + { + "epoch": 0.4152, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012048085530598959, + "learning_rate": 0.0001, + "loss": 4.0996, + "loss/crossentropy": 1.9628003239631653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057315558195114, + "step": 20760 + }, + { + "epoch": 0.41524, + "grad_norm": 2.109375, + "grad_norm_var": 0.01282958984375, + "learning_rate": 0.0001, + "loss": 3.9163, + "loss/crossentropy": 1.8570061326026917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19530560076236725, + "step": 20762 + }, + { + "epoch": 0.41528, + "grad_norm": 2.015625, + "grad_norm_var": 0.0115386962890625, + "learning_rate": 0.0001, + "loss": 3.9201, + "loss/crossentropy": 1.939897060394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979508325457573, + "step": 20764 + }, + { + "epoch": 0.41532, + "grad_norm": 1.921875, + "grad_norm_var": 0.0116607666015625, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.15025132894516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2082752287387848, + "step": 20766 + }, + { + "epoch": 0.41536, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010713704427083333, + "learning_rate": 0.0001, + "loss": 4.0037, + "loss/crossentropy": 2.202960252761841, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20040277391672134, + "step": 20768 + }, + { + "epoch": 0.4154, + "grad_norm": 1.96875, + "grad_norm_var": 0.008208974202473959, + "learning_rate": 0.0001, + "loss": 4.05, + "loss/crossentropy": 2.107069969177246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995261162519455, + "step": 20770 + }, + { + "epoch": 0.41544, + "grad_norm": 1.84375, + "grad_norm_var": 0.0071103413899739586, + "learning_rate": 0.0001, + "loss": 3.8939, + "loss/crossentropy": 2.287596106529236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20532196760177612, + "step": 20772 + }, + { + "epoch": 0.41548, + "grad_norm": 2.0625, + "grad_norm_var": 0.013236236572265626, + "learning_rate": 0.0001, + "loss": 3.7519, + "loss/crossentropy": 1.8234007954597473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17727909982204437, + "step": 20774 + }, + { + "epoch": 0.41552, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010581207275390626, + "learning_rate": 0.0001, + "loss": 3.8238, + "loss/crossentropy": 1.9223560690879822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918986812233925, + "step": 20776 + }, + { + "epoch": 0.41556, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009769694010416666, + "learning_rate": 0.0001, + "loss": 3.8487, + "loss/crossentropy": 2.0554555654525757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17795050144195557, + "step": 20778 + }, + { + "epoch": 0.4156, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009427642822265625, + "learning_rate": 0.0001, + "loss": 3.9899, + "loss/crossentropy": 2.170193314552307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20256754010915756, + "step": 20780 + }, + { + "epoch": 0.41564, + "grad_norm": 2.0, + "grad_norm_var": 0.009732818603515625, + "learning_rate": 0.0001, + "loss": 3.8443, + "loss/crossentropy": 1.704530119895935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18020445108413696, + "step": 20782 + }, + { + "epoch": 0.41568, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009915924072265625, + "learning_rate": 0.0001, + "loss": 4.0143, + "loss/crossentropy": 2.1245445013046265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20647235959768295, + "step": 20784 + }, + { + "epoch": 0.41572, + "grad_norm": 1.890625, + "grad_norm_var": 0.009895579020182291, + "learning_rate": 0.0001, + "loss": 4.1806, + "loss/crossentropy": 2.4078409671783447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126438021659851, + "step": 20786 + }, + { + "epoch": 0.41576, + "grad_norm": 1.84375, + "grad_norm_var": 0.011732737223307291, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 2.0577614307403564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19302819669246674, + "step": 20788 + }, + { + "epoch": 0.4158, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0061686197916666664, + "learning_rate": 0.0001, + "loss": 4.0234, + "loss/crossentropy": 2.0301398038864136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947554647922516, + "step": 20790 + }, + { + "epoch": 0.41584, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0062558492024739586, + "learning_rate": 0.0001, + "loss": 3.9092, + "loss/crossentropy": 2.276142120361328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21211084723472595, + "step": 20792 + }, + { + "epoch": 0.41588, + "grad_norm": 1.859375, + "grad_norm_var": 0.006125640869140625, + "learning_rate": 0.0001, + "loss": 4.1577, + "loss/crossentropy": 2.050966262817383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20556103438138962, + "step": 20794 + }, + { + "epoch": 0.41592, + "grad_norm": 1.859375, + "grad_norm_var": 0.009218088785807292, + "learning_rate": 0.0001, + "loss": 4.1697, + "loss/crossentropy": 2.135987937450409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21866349130868912, + "step": 20796 + }, + { + "epoch": 0.41596, + "grad_norm": 1.90625, + "grad_norm_var": 0.0089263916015625, + "learning_rate": 0.0001, + "loss": 4.0574, + "loss/crossentropy": 2.0885995030403137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148696705698967, + "step": 20798 + }, + { + "epoch": 0.416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008471425374348958, + "learning_rate": 0.0001, + "loss": 3.9976, + "loss/crossentropy": 1.8934147953987122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19176077097654343, + "step": 20800 + }, + { + "epoch": 0.41604, + "grad_norm": 2.0, + "grad_norm_var": 0.007970937093098958, + "learning_rate": 0.0001, + "loss": 4.3367, + "loss/crossentropy": 2.355128526687622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21396907418966293, + "step": 20802 + }, + { + "epoch": 0.41608, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005619049072265625, + "learning_rate": 0.0001, + "loss": 4.1109, + "loss/crossentropy": 2.2145700454711914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21031766384840012, + "step": 20804 + }, + { + "epoch": 0.41612, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006048329671223958, + "learning_rate": 0.0001, + "loss": 3.8883, + "loss/crossentropy": 2.17924165725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19039995223283768, + "step": 20806 + }, + { + "epoch": 0.41616, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006017812093098958, + "learning_rate": 0.0001, + "loss": 4.0044, + "loss/crossentropy": 2.2674453258514404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18743296712636948, + "step": 20808 + }, + { + "epoch": 0.4162, + "grad_norm": 2.0625, + "grad_norm_var": 0.052711741129557295, + "learning_rate": 0.0001, + "loss": 4.1857, + "loss/crossentropy": 2.331398367881775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20658230036497116, + "step": 20810 + }, + { + "epoch": 0.41624, + "grad_norm": 1.9609375, + "grad_norm_var": 0.05022354125976562, + "learning_rate": 0.0001, + "loss": 4.0019, + "loss/crossentropy": 1.6092209815979004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16996397078037262, + "step": 20812 + }, + { + "epoch": 0.41628, + "grad_norm": 1.953125, + "grad_norm_var": 0.04944661458333333, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.094128370285034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843773543834686, + "step": 20814 + }, + { + "epoch": 0.41632, + "grad_norm": 2.046875, + "grad_norm_var": 0.050842030843098955, + "learning_rate": 0.0001, + "loss": 3.9735, + "loss/crossentropy": 1.9059642553329468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20919224619865417, + "step": 20816 + }, + { + "epoch": 0.41636, + "grad_norm": 2.015625, + "grad_norm_var": 0.050966135660807294, + "learning_rate": 0.0001, + "loss": 3.9026, + "loss/crossentropy": 2.0733126401901245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132449895143509, + "step": 20818 + }, + { + "epoch": 0.4164, + "grad_norm": 1.9765625, + "grad_norm_var": 0.05125223795572917, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.09629487991333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21850410103797913, + "step": 20820 + }, + { + "epoch": 0.41644, + "grad_norm": 1.8984375, + "grad_norm_var": 0.05614802042643229, + "learning_rate": 0.0001, + "loss": 3.6648, + "loss/crossentropy": 1.7917864322662354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16684360802173615, + "step": 20822 + }, + { + "epoch": 0.41648, + "grad_norm": 2.03125, + "grad_norm_var": 0.0561187744140625, + "learning_rate": 0.0001, + "loss": 3.7587, + "loss/crossentropy": 1.629518985748291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16551091521978378, + "step": 20824 + }, + { + "epoch": 0.41652, + "grad_norm": 2.03125, + "grad_norm_var": 0.009476725260416667, + "learning_rate": 0.0001, + "loss": 4.2593, + "loss/crossentropy": 2.2774561643600464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21336688101291656, + "step": 20826 + }, + { + "epoch": 0.41656, + "grad_norm": 1.9375, + "grad_norm_var": 0.010684967041015625, + "learning_rate": 0.0001, + "loss": 3.7644, + "loss/crossentropy": 2.0319623947143555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19169747829437256, + "step": 20828 + }, + { + "epoch": 0.4166, + "grad_norm": 1.921875, + "grad_norm_var": 0.010261789957682291, + "learning_rate": 0.0001, + "loss": 3.7815, + "loss/crossentropy": 1.8104961514472961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863522008061409, + "step": 20830 + }, + { + "epoch": 0.41664, + "grad_norm": 2.09375, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 3.815, + "loss/crossentropy": 1.729806661605835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17772196978330612, + "step": 20832 + }, + { + "epoch": 0.41668, + "grad_norm": 2.03125, + "grad_norm_var": 0.012116495768229167, + "learning_rate": 0.0001, + "loss": 3.9823, + "loss/crossentropy": 1.9465582966804504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193453811109066, + "step": 20834 + }, + { + "epoch": 0.41672, + "grad_norm": 1.953125, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 4.1078, + "loss/crossentropy": 1.895602285861969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18032599240541458, + "step": 20836 + }, + { + "epoch": 0.41676, + "grad_norm": 1.84375, + "grad_norm_var": 0.009924062093098958, + "learning_rate": 0.0001, + "loss": 3.7889, + "loss/crossentropy": 2.082605481147766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21816352754831314, + "step": 20838 + }, + { + "epoch": 0.4168, + "grad_norm": 1.8125, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 3.9431, + "loss/crossentropy": 1.7751468420028687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18468675762414932, + "step": 20840 + }, + { + "epoch": 0.41684, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007838694254557292, + "learning_rate": 0.0001, + "loss": 4.1319, + "loss/crossentropy": 2.2611895203590393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24100882560014725, + "step": 20842 + }, + { + "epoch": 0.41688, + "grad_norm": 1.953125, + "grad_norm_var": 0.0078125, + "learning_rate": 0.0001, + "loss": 4.0711, + "loss/crossentropy": 2.1387221813201904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980198249220848, + "step": 20844 + }, + { + "epoch": 0.41692, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008206939697265625, + "learning_rate": 0.0001, + "loss": 4.0165, + "loss/crossentropy": 2.385176658630371, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22007980942726135, + "step": 20846 + }, + { + "epoch": 0.41696, + "grad_norm": 1.859375, + "grad_norm_var": 0.005273183186848958, + "learning_rate": 0.0001, + "loss": 3.9463, + "loss/crossentropy": 1.9382571578025818, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19266389310359955, + "step": 20848 + }, + { + "epoch": 0.417, + "grad_norm": 2.046875, + "grad_norm_var": 0.006583404541015625, + "learning_rate": 0.0001, + "loss": 4.3108, + "loss/crossentropy": 1.91634601354599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147107571363449, + "step": 20850 + }, + { + "epoch": 0.41704, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012532297770182292, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 1.9752032160758972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20748814195394516, + "step": 20852 + }, + { + "epoch": 0.41708, + "grad_norm": 1.890625, + "grad_norm_var": 0.011818186442057291, + "learning_rate": 0.0001, + "loss": 3.9149, + "loss/crossentropy": 1.844580054283142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17454466968774796, + "step": 20854 + }, + { + "epoch": 0.41712, + "grad_norm": 1.875, + "grad_norm_var": 0.010652669270833333, + "learning_rate": 0.0001, + "loss": 4.0963, + "loss/crossentropy": 2.18678879737854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2004237025976181, + "step": 20856 + }, + { + "epoch": 0.41716, + "grad_norm": 1.90625, + "grad_norm_var": 0.010583241780598959, + "learning_rate": 0.0001, + "loss": 3.855, + "loss/crossentropy": 1.7560098767280579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17802055180072784, + "step": 20858 + }, + { + "epoch": 0.4172, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009911092122395833, + "learning_rate": 0.0001, + "loss": 4.1007, + "loss/crossentropy": 1.8778411746025085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892327517271042, + "step": 20860 + }, + { + "epoch": 0.41724, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009220123291015625, + "learning_rate": 0.0001, + "loss": 3.9034, + "loss/crossentropy": 1.8801356554031372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18983188271522522, + "step": 20862 + }, + { + "epoch": 0.41728, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010448201497395834, + "learning_rate": 0.0001, + "loss": 3.974, + "loss/crossentropy": 2.0934754014015198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17912424355745316, + "step": 20864 + }, + { + "epoch": 0.41732, + "grad_norm": 2.0625, + "grad_norm_var": 0.010247548421223959, + "learning_rate": 0.0001, + "loss": 4.0854, + "loss/crossentropy": 2.115522801876068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20105450600385666, + "step": 20866 + }, + { + "epoch": 0.41736, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004622141520182292, + "learning_rate": 0.0001, + "loss": 4.0041, + "loss/crossentropy": 2.1903135776519775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19662026315927505, + "step": 20868 + }, + { + "epoch": 0.4174, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005794016520182291, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 2.1521800756454468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20861530303955078, + "step": 20870 + }, + { + "epoch": 0.41744, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006145985921223959, + "learning_rate": 0.0001, + "loss": 4.4442, + "loss/crossentropy": 2.0802451968193054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19014900922775269, + "step": 20872 + }, + { + "epoch": 0.41748, + "grad_norm": 1.890625, + "grad_norm_var": 0.006091054280598958, + "learning_rate": 0.0001, + "loss": 4.008, + "loss/crossentropy": 1.8639289140701294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17143192887306213, + "step": 20874 + }, + { + "epoch": 0.41752, + "grad_norm": 2.03125, + "grad_norm_var": 0.006525675455729167, + "learning_rate": 0.0001, + "loss": 4.2595, + "loss/crossentropy": 2.191206693649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013956382870674, + "step": 20876 + }, + { + "epoch": 0.41756, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006754557291666667, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.8131752610206604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932450830936432, + "step": 20878 + }, + { + "epoch": 0.4176, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005018870035807292, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 2.1427782773971558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21525658667087555, + "step": 20880 + }, + { + "epoch": 0.41764, + "grad_norm": 2.015625, + "grad_norm_var": 0.005956013997395833, + "learning_rate": 0.0001, + "loss": 4.1872, + "loss/crossentropy": 1.997750997543335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22109957039356232, + "step": 20882 + }, + { + "epoch": 0.41768, + "grad_norm": 1.90625, + "grad_norm_var": 0.007700347900390625, + "learning_rate": 0.0001, + "loss": 4.0445, + "loss/crossentropy": 2.2799811363220215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18671181797981262, + "step": 20884 + }, + { + "epoch": 0.41772, + "grad_norm": 1.921875, + "grad_norm_var": 0.006886545817057292, + "learning_rate": 0.0001, + "loss": 3.7884, + "loss/crossentropy": 1.887694001197815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19071876257658005, + "step": 20886 + }, + { + "epoch": 0.41776, + "grad_norm": 2.03125, + "grad_norm_var": 0.008235677083333334, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 2.0238420367240906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2015087753534317, + "step": 20888 + }, + { + "epoch": 0.4178, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008481597900390625, + "learning_rate": 0.0001, + "loss": 4.3148, + "loss/crossentropy": 2.5183900594711304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474501490592957, + "step": 20890 + }, + { + "epoch": 0.41784, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0083160400390625, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 2.0730000734329224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18937845528125763, + "step": 20892 + }, + { + "epoch": 0.41788, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008101145426432291, + "learning_rate": 0.0001, + "loss": 3.9845, + "loss/crossentropy": 2.083210587501526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19441235065460205, + "step": 20894 + }, + { + "epoch": 0.41792, + "grad_norm": 1.7734375, + "grad_norm_var": 0.010961659749348958, + "learning_rate": 0.0001, + "loss": 3.9813, + "loss/crossentropy": 1.7236113548278809, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931980401277542, + "step": 20896 + }, + { + "epoch": 0.41796, + "grad_norm": 1.984375, + "grad_norm_var": 0.008796183268229167, + "learning_rate": 0.0001, + "loss": 4.0757, + "loss/crossentropy": 2.108364462852478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20710121095180511, + "step": 20898 + }, + { + "epoch": 0.418, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010179646809895833, + "learning_rate": 0.0001, + "loss": 3.8073, + "loss/crossentropy": 2.10055810213089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20653685182332993, + "step": 20900 + }, + { + "epoch": 0.41804, + "grad_norm": 1.96875, + "grad_norm_var": 0.011517079671223958, + "learning_rate": 0.0001, + "loss": 4.0977, + "loss/crossentropy": 2.254370093345642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069462597370148, + "step": 20902 + }, + { + "epoch": 0.41808, + "grad_norm": 1.8125, + "grad_norm_var": 0.010107421875, + "learning_rate": 0.0001, + "loss": 3.8429, + "loss/crossentropy": 2.053595006465912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19040333479642868, + "step": 20904 + }, + { + "epoch": 0.41812, + "grad_norm": 2.1875, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 4.3406, + "loss/crossentropy": 2.1739301681518555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19576483964920044, + "step": 20906 + }, + { + "epoch": 0.41816, + "grad_norm": 2.140625, + "grad_norm_var": 0.01635920206705729, + "learning_rate": 0.0001, + "loss": 4.2794, + "loss/crossentropy": 1.9707902073860168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20553794503211975, + "step": 20908 + }, + { + "epoch": 0.4182, + "grad_norm": 1.9375, + "grad_norm_var": 0.017097981770833333, + "learning_rate": 0.0001, + "loss": 4.0596, + "loss/crossentropy": 1.901296079158783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18205050379037857, + "step": 20910 + }, + { + "epoch": 0.41824, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015388997395833333, + "learning_rate": 0.0001, + "loss": 4.3012, + "loss/crossentropy": 2.2531062364578247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18042850494384766, + "step": 20912 + }, + { + "epoch": 0.41828, + "grad_norm": 1.90625, + "grad_norm_var": 0.0159088134765625, + "learning_rate": 0.0001, + "loss": 3.8839, + "loss/crossentropy": 1.959227204322815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045928120613098, + "step": 20914 + }, + { + "epoch": 0.41832, + "grad_norm": 1.828125, + "grad_norm_var": 0.014207967122395833, + "learning_rate": 0.0001, + "loss": 3.9595, + "loss/crossentropy": 2.1432504057884216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991618126630783, + "step": 20916 + }, + { + "epoch": 0.41836, + "grad_norm": 1.96875, + "grad_norm_var": 0.013423665364583334, + "learning_rate": 0.0001, + "loss": 4.0369, + "loss/crossentropy": 2.090354800224304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19416044652462006, + "step": 20918 + }, + { + "epoch": 0.4184, + "grad_norm": 2.015625, + "grad_norm_var": 0.011889394124348958, + "learning_rate": 0.0001, + "loss": 4.0133, + "loss/crossentropy": 2.0165509581565857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20892751216888428, + "step": 20920 + }, + { + "epoch": 0.41844, + "grad_norm": 1.890625, + "grad_norm_var": 0.009273020426432292, + "learning_rate": 0.0001, + "loss": 4.0852, + "loss/crossentropy": 2.2627620697021484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20449956506490707, + "step": 20922 + }, + { + "epoch": 0.41848, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007249959309895833, + "learning_rate": 0.0001, + "loss": 3.8586, + "loss/crossentropy": 1.9771518111228943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20680005848407745, + "step": 20924 + }, + { + "epoch": 0.41852, + "grad_norm": 1.921875, + "grad_norm_var": 0.006982421875, + "learning_rate": 0.0001, + "loss": 4.1715, + "loss/crossentropy": 2.2706029415130615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22013718634843826, + "step": 20926 + }, + { + "epoch": 0.41856, + "grad_norm": 1.9375, + "grad_norm_var": 0.0060503641764322914, + "learning_rate": 0.0001, + "loss": 4.1293, + "loss/crossentropy": 2.071012258529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21389423310756683, + "step": 20928 + }, + { + "epoch": 0.4186, + "grad_norm": 1.890625, + "grad_norm_var": 0.0071044921875, + "learning_rate": 0.0001, + "loss": 4.3878, + "loss/crossentropy": 2.084509491920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20820914208889008, + "step": 20930 + }, + { + "epoch": 0.41864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005855051676432291, + "learning_rate": 0.0001, + "loss": 3.7038, + "loss/crossentropy": 2.0449500679969788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20955926179885864, + "step": 20932 + }, + { + "epoch": 0.41868, + "grad_norm": 2.046875, + "grad_norm_var": 0.005881500244140625, + "learning_rate": 0.0001, + "loss": 4.0819, + "loss/crossentropy": 2.0860745310783386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21544789522886276, + "step": 20934 + }, + { + "epoch": 0.41872, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0065877278645833336, + "learning_rate": 0.0001, + "loss": 3.9627, + "loss/crossentropy": 2.042892038822174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20232345163822174, + "step": 20936 + }, + { + "epoch": 0.41876, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006298828125, + "learning_rate": 0.0001, + "loss": 4.3137, + "loss/crossentropy": 2.3840891122817993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21144797652959824, + "step": 20938 + }, + { + "epoch": 0.4188, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005210113525390625, + "learning_rate": 0.0001, + "loss": 4.1256, + "loss/crossentropy": 1.8122249245643616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1800767481327057, + "step": 20940 + }, + { + "epoch": 0.41884, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0052073160807291664, + "learning_rate": 0.0001, + "loss": 4.2324, + "loss/crossentropy": 2.168284773826599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116834670305252, + "step": 20942 + }, + { + "epoch": 0.41888, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006005859375, + "learning_rate": 0.0001, + "loss": 4.0847, + "loss/crossentropy": 2.1386263370513916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18317822366952896, + "step": 20944 + }, + { + "epoch": 0.41892, + "grad_norm": 1.921875, + "grad_norm_var": 0.004829915364583334, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.134578227996826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20232250541448593, + "step": 20946 + }, + { + "epoch": 0.41896, + "grad_norm": 1.875, + "grad_norm_var": 0.005116526285807292, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.138009011745453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20763013511896133, + "step": 20948 + }, + { + "epoch": 0.419, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004881795247395833, + "learning_rate": 0.0001, + "loss": 3.9959, + "loss/crossentropy": 2.1796361207962036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887281984090805, + "step": 20950 + }, + { + "epoch": 0.41904, + "grad_norm": 2.015625, + "grad_norm_var": 0.004369099934895833, + "learning_rate": 0.0001, + "loss": 3.9328, + "loss/crossentropy": 1.9848375916481018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374072134494781, + "step": 20952 + }, + { + "epoch": 0.41908, + "grad_norm": 2.015625, + "grad_norm_var": 0.003985341389973958, + "learning_rate": 0.0001, + "loss": 4.1199, + "loss/crossentropy": 1.9993594288825989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19865376502275467, + "step": 20954 + }, + { + "epoch": 0.41912, + "grad_norm": 1.984375, + "grad_norm_var": 0.003883616129557292, + "learning_rate": 0.0001, + "loss": 4.0882, + "loss/crossentropy": 1.841040551662445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984993740916252, + "step": 20956 + }, + { + "epoch": 0.41916, + "grad_norm": 2.203125, + "grad_norm_var": 0.007472483317057291, + "learning_rate": 0.0001, + "loss": 4.3558, + "loss/crossentropy": 2.4816187620162964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072821408510208, + "step": 20958 + }, + { + "epoch": 0.4192, + "grad_norm": 2.140625, + "grad_norm_var": 0.009114329020182292, + "learning_rate": 0.0001, + "loss": 4.3989, + "loss/crossentropy": 2.1488978266716003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2619112655520439, + "step": 20960 + }, + { + "epoch": 0.41924, + "grad_norm": 2.046875, + "grad_norm_var": 0.009251912434895834, + "learning_rate": 0.0001, + "loss": 3.8774, + "loss/crossentropy": 1.8684781193733215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915435940027237, + "step": 20962 + }, + { + "epoch": 0.41928, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008599599202473959, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 2.2308766841888428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091555058956146, + "step": 20964 + }, + { + "epoch": 0.41932, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 4.016, + "loss/crossentropy": 2.0642316341400146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18509763479232788, + "step": 20966 + }, + { + "epoch": 0.41936, + "grad_norm": 2.078125, + "grad_norm_var": 0.008929189046223958, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 2.4577912092208862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22347646206617355, + "step": 20968 + }, + { + "epoch": 0.4194, + "grad_norm": 2.125, + "grad_norm_var": 0.009563954671223958, + "learning_rate": 0.0001, + "loss": 4.4522, + "loss/crossentropy": 2.0276389122009277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18190696835517883, + "step": 20970 + }, + { + "epoch": 0.41944, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011171213785807292, + "learning_rate": 0.0001, + "loss": 3.7112, + "loss/crossentropy": 1.920120656490326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18268628418445587, + "step": 20972 + }, + { + "epoch": 0.41948, + "grad_norm": 1.8125, + "grad_norm_var": 0.010074615478515625, + "learning_rate": 0.0001, + "loss": 3.671, + "loss/crossentropy": 2.0709950923919678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20330249518156052, + "step": 20974 + }, + { + "epoch": 0.41952, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0092926025390625, + "learning_rate": 0.0001, + "loss": 4.1188, + "loss/crossentropy": 2.059907555580139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23229123651981354, + "step": 20976 + }, + { + "epoch": 0.41956, + "grad_norm": 1.96875, + "grad_norm_var": 0.012190755208333333, + "learning_rate": 0.0001, + "loss": 3.8434, + "loss/crossentropy": 1.9516863226890564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19371748715639114, + "step": 20978 + }, + { + "epoch": 0.4196, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012181599934895834, + "learning_rate": 0.0001, + "loss": 3.8149, + "loss/crossentropy": 1.8882723450660706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1830422580242157, + "step": 20980 + }, + { + "epoch": 0.41964, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011864980061848959, + "learning_rate": 0.0001, + "loss": 4.2085, + "loss/crossentropy": 2.3770724534988403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055138498544693, + "step": 20982 + }, + { + "epoch": 0.41968, + "grad_norm": 1.90625, + "grad_norm_var": 0.010221354166666667, + "learning_rate": 0.0001, + "loss": 4.0082, + "loss/crossentropy": 2.302065849304199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21600044518709183, + "step": 20984 + }, + { + "epoch": 0.41972, + "grad_norm": 1.953125, + "grad_norm_var": 0.00772705078125, + "learning_rate": 0.0001, + "loss": 4.2794, + "loss/crossentropy": 2.1097174286842346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151128277182579, + "step": 20986 + }, + { + "epoch": 0.41976, + "grad_norm": 1.875, + "grad_norm_var": 0.007765452067057292, + "learning_rate": 0.0001, + "loss": 4.0175, + "loss/crossentropy": 2.1860098838806152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798291265964508, + "step": 20988 + }, + { + "epoch": 0.4198, + "grad_norm": 1.90625, + "grad_norm_var": 0.007258097330729167, + "learning_rate": 0.0001, + "loss": 3.7789, + "loss/crossentropy": 2.0685555934906006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20412929356098175, + "step": 20990 + }, + { + "epoch": 0.41984, + "grad_norm": 2.0, + "grad_norm_var": 0.0064046223958333336, + "learning_rate": 0.0001, + "loss": 3.9187, + "loss/crossentropy": 1.886826515197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17476174980401993, + "step": 20992 + }, + { + "epoch": 0.41988, + "grad_norm": 1.9296875, + "grad_norm_var": 0.003446197509765625, + "learning_rate": 0.0001, + "loss": 3.853, + "loss/crossentropy": 1.8504652380943298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18863234668970108, + "step": 20994 + }, + { + "epoch": 0.41992, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0035308837890625, + "learning_rate": 0.0001, + "loss": 4.0992, + "loss/crossentropy": 2.396555781364441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21075168251991272, + "step": 20996 + }, + { + "epoch": 0.41996, + "grad_norm": 2.203125, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 4.0947, + "loss/crossentropy": 2.005809009075165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158650904893875, + "step": 20998 + }, + { + "epoch": 0.42, + "grad_norm": 1.9375, + "grad_norm_var": 0.008660634358723959, + "learning_rate": 0.0001, + "loss": 3.8813, + "loss/crossentropy": 1.8755770325660706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19334855675697327, + "step": 21000 + }, + { + "epoch": 0.42004, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007989247639973959, + "learning_rate": 0.0001, + "loss": 4.1168, + "loss/crossentropy": 2.4014203548431396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22790876030921936, + "step": 21002 + }, + { + "epoch": 0.42008, + "grad_norm": 1.96875, + "grad_norm_var": 0.008983357747395834, + "learning_rate": 0.0001, + "loss": 3.7571, + "loss/crossentropy": 1.959916114807129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19611585140228271, + "step": 21004 + }, + { + "epoch": 0.42012, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010212961832682292, + "learning_rate": 0.0001, + "loss": 4.0505, + "loss/crossentropy": 2.077211081981659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19385989010334015, + "step": 21006 + }, + { + "epoch": 0.42016, + "grad_norm": 2.328125, + "grad_norm_var": 0.017878214518229168, + "learning_rate": 0.0001, + "loss": 4.2484, + "loss/crossentropy": 2.1105176210403442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.303642213344574, + "step": 21008 + }, + { + "epoch": 0.4202, + "grad_norm": 1.921875, + "grad_norm_var": 0.01786066691080729, + "learning_rate": 0.0001, + "loss": 4.0644, + "loss/crossentropy": 2.0568641424179077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2137533351778984, + "step": 21010 + }, + { + "epoch": 0.42024, + "grad_norm": 1.921875, + "grad_norm_var": 0.018424224853515626, + "learning_rate": 0.0001, + "loss": 3.9592, + "loss/crossentropy": 2.185602903366089, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20309215039014816, + "step": 21012 + }, + { + "epoch": 0.42028, + "grad_norm": 2.0, + "grad_norm_var": 0.014646148681640625, + "learning_rate": 0.0001, + "loss": 4.1891, + "loss/crossentropy": 1.9848479628562927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918111816048622, + "step": 21014 + }, + { + "epoch": 0.42032, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014731597900390626, + "learning_rate": 0.0001, + "loss": 4.167, + "loss/crossentropy": 2.315872311592102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22417553514242172, + "step": 21016 + }, + { + "epoch": 0.42036, + "grad_norm": 1.90625, + "grad_norm_var": 0.014798736572265625, + "learning_rate": 0.0001, + "loss": 3.9912, + "loss/crossentropy": 2.1230897903442383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19763804227113724, + "step": 21018 + }, + { + "epoch": 0.4204, + "grad_norm": 2.015625, + "grad_norm_var": 0.012947336832682291, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 1.9737728834152222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18397405743598938, + "step": 21020 + }, + { + "epoch": 0.42044, + "grad_norm": 1.9375, + "grad_norm_var": 0.012064615885416666, + "learning_rate": 0.0001, + "loss": 4.0749, + "loss/crossentropy": 2.1349278688430786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923152655363083, + "step": 21022 + }, + { + "epoch": 0.42048, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007242584228515625, + "learning_rate": 0.0001, + "loss": 4.1186, + "loss/crossentropy": 2.110077440738678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115027979016304, + "step": 21024 + }, + { + "epoch": 0.42052, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 4.0164, + "loss/crossentropy": 2.0237202048301697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17709830403327942, + "step": 21026 + }, + { + "epoch": 0.42056, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006990559895833333, + "learning_rate": 0.0001, + "loss": 4.2527, + "loss/crossentropy": 2.1712347269058228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20721115171909332, + "step": 21028 + }, + { + "epoch": 0.4206, + "grad_norm": 1.890625, + "grad_norm_var": 0.007054646809895833, + "learning_rate": 0.0001, + "loss": 4.087, + "loss/crossentropy": 2.1889408826828003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2252693474292755, + "step": 21030 + }, + { + "epoch": 0.42064, + "grad_norm": 1.890625, + "grad_norm_var": 0.009051259358723958, + "learning_rate": 0.0001, + "loss": 4.2847, + "loss/crossentropy": 1.9189648032188416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265216380357742, + "step": 21032 + }, + { + "epoch": 0.42068, + "grad_norm": 2.046875, + "grad_norm_var": 0.009008534749348958, + "learning_rate": 0.0001, + "loss": 4.2777, + "loss/crossentropy": 2.335593581199646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21812127530574799, + "step": 21034 + }, + { + "epoch": 0.42072, + "grad_norm": 1.90625, + "grad_norm_var": 0.010723622639973958, + "learning_rate": 0.0001, + "loss": 3.9306, + "loss/crossentropy": 2.0845232605934143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974613294005394, + "step": 21036 + }, + { + "epoch": 0.42076, + "grad_norm": 1.96875, + "grad_norm_var": 0.010528310139973959, + "learning_rate": 0.0001, + "loss": 4.2117, + "loss/crossentropy": 2.0062036514282227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18701466172933578, + "step": 21038 + }, + { + "epoch": 0.4208, + "grad_norm": 1.96875, + "grad_norm_var": 0.006520334879557292, + "learning_rate": 0.0001, + "loss": 4.1506, + "loss/crossentropy": 2.30399227142334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108677700161934, + "step": 21040 + }, + { + "epoch": 0.42084, + "grad_norm": 2.046875, + "grad_norm_var": 0.006036122639973958, + "learning_rate": 0.0001, + "loss": 4.4162, + "loss/crossentropy": 2.0211732387542725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925763636827469, + "step": 21042 + }, + { + "epoch": 0.42088, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006986490885416667, + "learning_rate": 0.0001, + "loss": 4.2366, + "loss/crossentropy": 2.259310483932495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19453367590904236, + "step": 21044 + }, + { + "epoch": 0.42092, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006818644205729167, + "learning_rate": 0.0001, + "loss": 3.8528, + "loss/crossentropy": 2.225229859352112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21418282389640808, + "step": 21046 + }, + { + "epoch": 0.42096, + "grad_norm": 1.953125, + "grad_norm_var": 0.004667154947916667, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.007095217704773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860651671886444, + "step": 21048 + }, + { + "epoch": 0.421, + "grad_norm": 1.921875, + "grad_norm_var": 0.004117838541666667, + "learning_rate": 0.0001, + "loss": 3.8646, + "loss/crossentropy": 2.2098451256752014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123607099056244, + "step": 21050 + }, + { + "epoch": 0.42104, + "grad_norm": 1.96875, + "grad_norm_var": 0.0024861653645833334, + "learning_rate": 0.0001, + "loss": 3.9383, + "loss/crossentropy": 2.038383424282074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19423578679561615, + "step": 21052 + }, + { + "epoch": 0.42108, + "grad_norm": 1.828125, + "grad_norm_var": 0.0047910054524739586, + "learning_rate": 0.0001, + "loss": 3.5858, + "loss/crossentropy": 1.5344518423080444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15087325125932693, + "step": 21054 + }, + { + "epoch": 0.42112, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0050351460774739586, + "learning_rate": 0.0001, + "loss": 3.7398, + "loss/crossentropy": 2.0475123524665833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19996189326047897, + "step": 21056 + }, + { + "epoch": 0.42116, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0047604878743489586, + "learning_rate": 0.0001, + "loss": 3.8581, + "loss/crossentropy": 1.9760377407073975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20170825719833374, + "step": 21058 + }, + { + "epoch": 0.4212, + "grad_norm": 2.25, + "grad_norm_var": 0.009871419270833333, + "learning_rate": 0.0001, + "loss": 4.0876, + "loss/crossentropy": 2.0718571543693542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987539902329445, + "step": 21060 + }, + { + "epoch": 0.42124, + "grad_norm": 1.84375, + "grad_norm_var": 0.010908762613932291, + "learning_rate": 0.0001, + "loss": 3.7226, + "loss/crossentropy": 1.694058895111084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15808547288179398, + "step": 21062 + }, + { + "epoch": 0.42128, + "grad_norm": 2.0, + "grad_norm_var": 0.010827382405598959, + "learning_rate": 0.0001, + "loss": 3.9852, + "loss/crossentropy": 2.024729013442993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920113682746887, + "step": 21064 + }, + { + "epoch": 0.42132, + "grad_norm": 2.28125, + "grad_norm_var": 0.01858495076497396, + "learning_rate": 0.0001, + "loss": 4.3691, + "loss/crossentropy": 2.2473320960998535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2366955801844597, + "step": 21066 + }, + { + "epoch": 0.42136, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020230865478515624, + "learning_rate": 0.0001, + "loss": 4.014, + "loss/crossentropy": 1.8571689128875732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198233962059021, + "step": 21068 + }, + { + "epoch": 0.4214, + "grad_norm": 2.0625, + "grad_norm_var": 0.018314361572265625, + "learning_rate": 0.0001, + "loss": 4.066, + "loss/crossentropy": 1.7178888320922852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756814494729042, + "step": 21070 + }, + { + "epoch": 0.42144, + "grad_norm": 2.453125, + "grad_norm_var": 0.031840006510416664, + "learning_rate": 0.0001, + "loss": 4.0847, + "loss/crossentropy": 2.5577595233917236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22383909672498703, + "step": 21072 + }, + { + "epoch": 0.42148, + "grad_norm": 1.8828125, + "grad_norm_var": 0.030926259358723958, + "learning_rate": 0.0001, + "loss": 4.1138, + "loss/crossentropy": 1.8517940640449524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17597733438014984, + "step": 21074 + }, + { + "epoch": 0.42152, + "grad_norm": 1.7265625, + "grad_norm_var": 0.033934529622395834, + "learning_rate": 0.0001, + "loss": 4.1207, + "loss/crossentropy": 2.2587300539016724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20134828239679337, + "step": 21076 + }, + { + "epoch": 0.42156, + "grad_norm": 2.109375, + "grad_norm_var": 0.04946263631184896, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 1.9288156032562256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2372005432844162, + "step": 21078 + }, + { + "epoch": 0.4216, + "grad_norm": 2.109375, + "grad_norm_var": 0.051454416910807294, + "learning_rate": 0.0001, + "loss": 3.9205, + "loss/crossentropy": 1.7192566990852356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19468504190444946, + "step": 21080 + }, + { + "epoch": 0.42164, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0494293212890625, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.2686651945114136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18164453655481339, + "step": 21082 + }, + { + "epoch": 0.42168, + "grad_norm": 1.8359375, + "grad_norm_var": 0.048685455322265626, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 2.090322732925415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19734591990709305, + "step": 21084 + }, + { + "epoch": 0.42172, + "grad_norm": 2.0, + "grad_norm_var": 0.047761027018229166, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 2.3203768730163574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18960122764110565, + "step": 21086 + }, + { + "epoch": 0.42176, + "grad_norm": 1.9296875, + "grad_norm_var": 0.04411392211914063, + "learning_rate": 0.0001, + "loss": 4.1253, + "loss/crossentropy": 2.1767812967300415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20625658333301544, + "step": 21088 + }, + { + "epoch": 0.4218, + "grad_norm": 1.875, + "grad_norm_var": 0.0453125, + "learning_rate": 0.0001, + "loss": 3.9737, + "loss/crossentropy": 2.1308469772338867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404084771871567, + "step": 21090 + }, + { + "epoch": 0.42184, + "grad_norm": 2.109375, + "grad_norm_var": 0.03766988118489583, + "learning_rate": 0.0001, + "loss": 4.0643, + "loss/crossentropy": 2.34736967086792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22817441076040268, + "step": 21092 + }, + { + "epoch": 0.42188, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019730631510416666, + "learning_rate": 0.0001, + "loss": 3.7592, + "loss/crossentropy": 1.9515716433525085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18946269899606705, + "step": 21094 + }, + { + "epoch": 0.42192, + "grad_norm": 1.90625, + "grad_norm_var": 0.019181315104166666, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.1175013184547424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25575874000787735, + "step": 21096 + }, + { + "epoch": 0.42196, + "grad_norm": 1.9375, + "grad_norm_var": 0.019465128580729168, + "learning_rate": 0.0001, + "loss": 3.9934, + "loss/crossentropy": 1.8839566707611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1840147227048874, + "step": 21098 + }, + { + "epoch": 0.422, + "grad_norm": 2.015625, + "grad_norm_var": 0.01761042277018229, + "learning_rate": 0.0001, + "loss": 3.9215, + "loss/crossentropy": 1.8690025806427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20930250734090805, + "step": 21100 + }, + { + "epoch": 0.42204, + "grad_norm": 2.015625, + "grad_norm_var": 0.01693700154622396, + "learning_rate": 0.0001, + "loss": 4.0877, + "loss/crossentropy": 2.0368083119392395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19566239416599274, + "step": 21102 + }, + { + "epoch": 0.42208, + "grad_norm": 2.078125, + "grad_norm_var": 0.009384918212890624, + "learning_rate": 0.0001, + "loss": 3.9941, + "loss/crossentropy": 1.5155547261238098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755489930510521, + "step": 21104 + }, + { + "epoch": 0.42212, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006259918212890625, + "learning_rate": 0.0001, + "loss": 3.9463, + "loss/crossentropy": 1.850695788860321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17723418772220612, + "step": 21106 + }, + { + "epoch": 0.42216, + "grad_norm": 1.828125, + "grad_norm_var": 0.0071441650390625, + "learning_rate": 0.0001, + "loss": 3.8903, + "loss/crossentropy": 2.0554863810539246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18637049198150635, + "step": 21108 + }, + { + "epoch": 0.4222, + "grad_norm": 1.953125, + "grad_norm_var": 0.0073883056640625, + "learning_rate": 0.0001, + "loss": 3.9719, + "loss/crossentropy": 1.6186088919639587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17206687480211258, + "step": 21110 + }, + { + "epoch": 0.42224, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006131744384765625, + "learning_rate": 0.0001, + "loss": 4.1193, + "loss/crossentropy": 2.326562762260437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168086469173431, + "step": 21112 + }, + { + "epoch": 0.42228, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007670084635416667, + "learning_rate": 0.0001, + "loss": 3.8155, + "loss/crossentropy": 1.8151599764823914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16791456937789917, + "step": 21114 + }, + { + "epoch": 0.42232, + "grad_norm": 2.03125, + "grad_norm_var": 0.007249959309895833, + "learning_rate": 0.0001, + "loss": 4.2537, + "loss/crossentropy": 1.8597796559333801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19767651706933975, + "step": 21116 + }, + { + "epoch": 0.42236, + "grad_norm": 1.9375, + "grad_norm_var": 0.0056884765625, + "learning_rate": 0.0001, + "loss": 4.0409, + "loss/crossentropy": 1.9591187238693237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18850377202033997, + "step": 21118 + }, + { + "epoch": 0.4224, + "grad_norm": 2.09375, + "grad_norm_var": 0.007684071858723958, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.0987173318862915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087177261710167, + "step": 21120 + }, + { + "epoch": 0.42244, + "grad_norm": 2.015625, + "grad_norm_var": 0.011010487874348959, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 1.8050614595413208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19238876551389694, + "step": 21122 + }, + { + "epoch": 0.42248, + "grad_norm": 2.015625, + "grad_norm_var": 0.010487620035807292, + "learning_rate": 0.0001, + "loss": 4.1689, + "loss/crossentropy": 2.0409955978393555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230370223522186, + "step": 21124 + }, + { + "epoch": 0.42252, + "grad_norm": 1.890625, + "grad_norm_var": 0.009942372639973959, + "learning_rate": 0.0001, + "loss": 4.2033, + "loss/crossentropy": 2.2344201803207397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21372146904468536, + "step": 21126 + }, + { + "epoch": 0.42256, + "grad_norm": 2.015625, + "grad_norm_var": 0.010033162434895833, + "learning_rate": 0.0001, + "loss": 3.9102, + "loss/crossentropy": 1.905364751815796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17828336358070374, + "step": 21128 + }, + { + "epoch": 0.4226, + "grad_norm": 8.6875, + "grad_norm_var": 2.8468658447265627, + "learning_rate": 0.0001, + "loss": 4.4863, + "loss/crossentropy": 2.118414044380188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183245092630386, + "step": 21130 + }, + { + "epoch": 0.42264, + "grad_norm": 2.421875, + "grad_norm_var": 2.826851399739583, + "learning_rate": 0.0001, + "loss": 3.9736, + "loss/crossentropy": 1.8499796390533447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906417697668076, + "step": 21132 + }, + { + "epoch": 0.42268, + "grad_norm": 2.0625, + "grad_norm_var": 2.808918253580729, + "learning_rate": 0.0001, + "loss": 3.9669, + "loss/crossentropy": 2.046007513999939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20209533721208572, + "step": 21134 + }, + { + "epoch": 0.42272, + "grad_norm": 2.078125, + "grad_norm_var": 2.7881795247395833, + "learning_rate": 0.0001, + "loss": 4.4089, + "loss/crossentropy": 2.4684035778045654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23784233629703522, + "step": 21136 + }, + { + "epoch": 0.42276, + "grad_norm": 1.9296875, + "grad_norm_var": 2.781150054931641, + "learning_rate": 0.0001, + "loss": 3.9599, + "loss/crossentropy": 2.2614521980285645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20271501690149307, + "step": 21138 + }, + { + "epoch": 0.4228, + "grad_norm": 1.9375, + "grad_norm_var": 2.778649648030599, + "learning_rate": 0.0001, + "loss": 3.995, + "loss/crossentropy": 1.9676395058631897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1826155036687851, + "step": 21140 + }, + { + "epoch": 0.42284, + "grad_norm": 2.015625, + "grad_norm_var": 2.77156982421875, + "learning_rate": 0.0001, + "loss": 4.0974, + "loss/crossentropy": 1.9320645928382874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518838077783585, + "step": 21142 + }, + { + "epoch": 0.42288, + "grad_norm": 2.546875, + "grad_norm_var": 2.751301066080729, + "learning_rate": 0.0001, + "loss": 4.2261, + "loss/crossentropy": 2.0839805603027344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20529074221849442, + "step": 21144 + }, + { + "epoch": 0.42292, + "grad_norm": 1.8671875, + "grad_norm_var": 0.04561538696289062, + "learning_rate": 0.0001, + "loss": 3.6995, + "loss/crossentropy": 1.7191408276557922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19476453214883804, + "step": 21146 + }, + { + "epoch": 0.42296, + "grad_norm": 2.0625, + "grad_norm_var": 0.026151275634765624, + "learning_rate": 0.0001, + "loss": 4.3019, + "loss/crossentropy": 2.3838316202163696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2231682687997818, + "step": 21148 + }, + { + "epoch": 0.423, + "grad_norm": 2.078125, + "grad_norm_var": 0.02588678995768229, + "learning_rate": 0.0001, + "loss": 4.3851, + "loss/crossentropy": 2.5376522541046143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22086318582296371, + "step": 21150 + }, + { + "epoch": 0.42304, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0267486572265625, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 2.0386282801628113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19398655742406845, + "step": 21152 + }, + { + "epoch": 0.42308, + "grad_norm": 1.984375, + "grad_norm_var": 0.025047810872395833, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 1.8140466213226318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843537762761116, + "step": 21154 + }, + { + "epoch": 0.42312, + "grad_norm": 2.078125, + "grad_norm_var": 0.026839192708333334, + "learning_rate": 0.0001, + "loss": 3.8526, + "loss/crossentropy": 1.7746369242668152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17255257815122604, + "step": 21156 + }, + { + "epoch": 0.42316, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0270904541015625, + "learning_rate": 0.0001, + "loss": 3.9869, + "loss/crossentropy": 1.9952461123466492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18875698745250702, + "step": 21158 + }, + { + "epoch": 0.4232, + "grad_norm": 2.125, + "grad_norm_var": 0.0079742431640625, + "learning_rate": 0.0001, + "loss": 4.1292, + "loss/crossentropy": 2.1353421211242676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20545977354049683, + "step": 21160 + }, + { + "epoch": 0.42324, + "grad_norm": 1.90625, + "grad_norm_var": 0.0072662353515625, + "learning_rate": 0.0001, + "loss": 3.8585, + "loss/crossentropy": 1.93438321352005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2099536955356598, + "step": 21162 + }, + { + "epoch": 0.42328, + "grad_norm": 2.125, + "grad_norm_var": 0.008714803059895833, + "learning_rate": 0.0001, + "loss": 4.1526, + "loss/crossentropy": 2.085531711578369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19908609241247177, + "step": 21164 + }, + { + "epoch": 0.42332, + "grad_norm": 1.890625, + "grad_norm_var": 0.010716756184895834, + "learning_rate": 0.0001, + "loss": 3.7828, + "loss/crossentropy": 1.7960018515586853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17612508684396744, + "step": 21166 + }, + { + "epoch": 0.42336, + "grad_norm": 2.03125, + "grad_norm_var": 0.011982981363932292, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 2.3417539596557617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124033421278, + "step": 21168 + }, + { + "epoch": 0.4234, + "grad_norm": 2.375, + "grad_norm_var": 0.02384211222330729, + "learning_rate": 0.0001, + "loss": 4.0768, + "loss/crossentropy": 1.7829700708389282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17676468938589096, + "step": 21170 + }, + { + "epoch": 0.42344, + "grad_norm": 1.9375, + "grad_norm_var": 0.027815500895182293, + "learning_rate": 0.0001, + "loss": 4.0011, + "loss/crossentropy": 1.8243988156318665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1778729408979416, + "step": 21172 + }, + { + "epoch": 0.42348, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02774225870768229, + "learning_rate": 0.0001, + "loss": 4.1254, + "loss/crossentropy": 2.0906782150268555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19730936735868454, + "step": 21174 + }, + { + "epoch": 0.42352, + "grad_norm": 1.9140625, + "grad_norm_var": 0.026041666666666668, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.0885696411132812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19815433770418167, + "step": 21176 + }, + { + "epoch": 0.42356, + "grad_norm": 1.8203125, + "grad_norm_var": 0.027513631184895835, + "learning_rate": 0.0001, + "loss": 3.7571, + "loss/crossentropy": 1.9133245944976807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18601036816835403, + "step": 21178 + }, + { + "epoch": 0.4236, + "grad_norm": 1.890625, + "grad_norm_var": 0.026071929931640626, + "learning_rate": 0.0001, + "loss": 3.9254, + "loss/crossentropy": 2.3077515363693237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21091386675834656, + "step": 21180 + }, + { + "epoch": 0.42364, + "grad_norm": 1.796875, + "grad_norm_var": 0.0247222900390625, + "learning_rate": 0.0001, + "loss": 3.9337, + "loss/crossentropy": 2.170205235481262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22082476317882538, + "step": 21182 + }, + { + "epoch": 0.42368, + "grad_norm": 1.984375, + "grad_norm_var": 0.0234619140625, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 2.053133964538574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19210617244243622, + "step": 21184 + }, + { + "epoch": 0.42372, + "grad_norm": 1.984375, + "grad_norm_var": 0.011930084228515625, + "learning_rate": 0.0001, + "loss": 4.1882, + "loss/crossentropy": 2.246233820915222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2110241949558258, + "step": 21186 + }, + { + "epoch": 0.42376, + "grad_norm": 2.015625, + "grad_norm_var": 0.005378214518229166, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 2.088374972343445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20536644011735916, + "step": 21188 + }, + { + "epoch": 0.4238, + "grad_norm": 2.03125, + "grad_norm_var": 0.006105295817057292, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 2.2355328798294067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208805114030838, + "step": 21190 + }, + { + "epoch": 0.42384, + "grad_norm": 2.125, + "grad_norm_var": 0.008707682291666666, + "learning_rate": 0.0001, + "loss": 4.3974, + "loss/crossentropy": 2.3286044001579285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21351350098848343, + "step": 21192 + }, + { + "epoch": 0.42388, + "grad_norm": 1.9375, + "grad_norm_var": 0.008131663004557291, + "learning_rate": 0.0001, + "loss": 4.1763, + "loss/crossentropy": 2.188312590122223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2166001871228218, + "step": 21194 + }, + { + "epoch": 0.42392, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007873280843098959, + "learning_rate": 0.0001, + "loss": 3.9396, + "loss/crossentropy": 2.1229456663131714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442371279001236, + "step": 21196 + }, + { + "epoch": 0.42396, + "grad_norm": 12.0, + "grad_norm_var": 6.295235188802083, + "learning_rate": 0.0001, + "loss": 4.0976, + "loss/crossentropy": 2.000286102294922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20243800431489944, + "step": 21198 + }, + { + "epoch": 0.424, + "grad_norm": 2.078125, + "grad_norm_var": 6.265840657552084, + "learning_rate": 0.0001, + "loss": 4.204, + "loss/crossentropy": 2.0628392100334167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828108489513397, + "step": 21200 + }, + { + "epoch": 0.42404, + "grad_norm": 2.09375, + "grad_norm_var": 6.274006144205729, + "learning_rate": 0.0001, + "loss": 4.0036, + "loss/crossentropy": 1.6969141364097595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20169565826654434, + "step": 21202 + }, + { + "epoch": 0.42408, + "grad_norm": 1.9609375, + "grad_norm_var": 6.258829752604167, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 2.078153133392334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974920853972435, + "step": 21204 + }, + { + "epoch": 0.42412, + "grad_norm": 2.03125, + "grad_norm_var": 6.2670237223307295, + "learning_rate": 0.0001, + "loss": 3.7707, + "loss/crossentropy": 1.9656822681427002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20239949226379395, + "step": 21206 + }, + { + "epoch": 0.42416, + "grad_norm": 1.890625, + "grad_norm_var": 6.289948527018229, + "learning_rate": 0.0001, + "loss": 4.0624, + "loss/crossentropy": 1.721840500831604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1692550852894783, + "step": 21208 + }, + { + "epoch": 0.4242, + "grad_norm": 1.9375, + "grad_norm_var": 6.2876942952473955, + "learning_rate": 0.0001, + "loss": 4.0437, + "loss/crossentropy": 2.093570113182068, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127639278769493, + "step": 21210 + }, + { + "epoch": 0.42424, + "grad_norm": 2.09375, + "grad_norm_var": 6.257684071858724, + "learning_rate": 0.0001, + "loss": 4.1692, + "loss/crossentropy": 2.476475954055786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22028189152479172, + "step": 21212 + }, + { + "epoch": 0.42428, + "grad_norm": 2.3125, + "grad_norm_var": 0.015653228759765624, + "learning_rate": 0.0001, + "loss": 3.8084, + "loss/crossentropy": 2.1308672428131104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2215510457754135, + "step": 21214 + }, + { + "epoch": 0.42432, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014709218343098959, + "learning_rate": 0.0001, + "loss": 3.9183, + "loss/crossentropy": 1.9697207808494568, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23525278270244598, + "step": 21216 + }, + { + "epoch": 0.42436, + "grad_norm": 2.03125, + "grad_norm_var": 0.012943267822265625, + "learning_rate": 0.0001, + "loss": 4.3044, + "loss/crossentropy": 2.3466382026672363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21148498356342316, + "step": 21218 + }, + { + "epoch": 0.4244, + "grad_norm": 2.0, + "grad_norm_var": 0.013421376546223959, + "learning_rate": 0.0001, + "loss": 4.0281, + "loss/crossentropy": 1.9455206990242004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18886880576610565, + "step": 21220 + }, + { + "epoch": 0.42444, + "grad_norm": 2.09375, + "grad_norm_var": 0.010949452718098959, + "learning_rate": 0.0001, + "loss": 4.113, + "loss/crossentropy": 2.030101954936981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19099503010511398, + "step": 21222 + }, + { + "epoch": 0.42448, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0103912353515625, + "learning_rate": 0.0001, + "loss": 3.9738, + "loss/crossentropy": 2.0859753489494324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600532740354538, + "step": 21224 + }, + { + "epoch": 0.42452, + "grad_norm": 2.03125, + "grad_norm_var": 0.009749348958333333, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 1.9598749279975891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913735449314117, + "step": 21226 + }, + { + "epoch": 0.42456, + "grad_norm": 1.8203125, + "grad_norm_var": 0.011791737874348958, + "learning_rate": 0.0001, + "loss": 3.8938, + "loss/crossentropy": 1.8889980912208557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892652064561844, + "step": 21228 + }, + { + "epoch": 0.4246, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005224355061848958, + "learning_rate": 0.0001, + "loss": 4.0116, + "loss/crossentropy": 2.0479432940483093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084280520677567, + "step": 21230 + }, + { + "epoch": 0.42464, + "grad_norm": 2.046875, + "grad_norm_var": 0.005407460530598958, + "learning_rate": 0.0001, + "loss": 4.1743, + "loss/crossentropy": 2.2049208879470825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21012280881404877, + "step": 21232 + }, + { + "epoch": 0.42468, + "grad_norm": 1.8125, + "grad_norm_var": 0.006945546468098958, + "learning_rate": 0.0001, + "loss": 3.8626, + "loss/crossentropy": 2.1863686442375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19185375422239304, + "step": 21234 + }, + { + "epoch": 0.42472, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010564931233723958, + "learning_rate": 0.0001, + "loss": 4.0699, + "loss/crossentropy": 2.144856631755829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018456757068634, + "step": 21236 + }, + { + "epoch": 0.42476, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0170806884765625, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.0092588663101196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18666698783636093, + "step": 21238 + }, + { + "epoch": 0.4248, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018365224202473957, + "learning_rate": 0.0001, + "loss": 3.8831, + "loss/crossentropy": 1.9071536660194397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1925549954175949, + "step": 21240 + }, + { + "epoch": 0.42484, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01822509765625, + "learning_rate": 0.0001, + "loss": 3.7471, + "loss/crossentropy": 1.7945414185523987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18291736394166946, + "step": 21242 + }, + { + "epoch": 0.42488, + "grad_norm": 1.8984375, + "grad_norm_var": 0.017252604166666668, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 2.0980335474014282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030297338962555, + "step": 21244 + }, + { + "epoch": 0.42492, + "grad_norm": 1.84375, + "grad_norm_var": 0.01800715128580729, + "learning_rate": 0.0001, + "loss": 3.8988, + "loss/crossentropy": 1.8301831483840942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18173757195472717, + "step": 21246 + }, + { + "epoch": 0.42496, + "grad_norm": 2.015625, + "grad_norm_var": 0.020734659830729165, + "learning_rate": 0.0001, + "loss": 3.8851, + "loss/crossentropy": 2.1006619930267334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21144358813762665, + "step": 21248 + }, + { + "epoch": 0.425, + "grad_norm": 1.90625, + "grad_norm_var": 0.019220987955729168, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 2.2240719199180603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21180906891822815, + "step": 21250 + }, + { + "epoch": 0.42504, + "grad_norm": 1.8984375, + "grad_norm_var": 0.024079386393229166, + "learning_rate": 0.0001, + "loss": 4.2143, + "loss/crossentropy": 2.187661051750183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19673307240009308, + "step": 21252 + }, + { + "epoch": 0.42508, + "grad_norm": 1.8515625, + "grad_norm_var": 0.07988993326822917, + "learning_rate": 0.0001, + "loss": 3.9913, + "loss/crossentropy": 2.17081356048584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19375403225421906, + "step": 21254 + }, + { + "epoch": 0.42512, + "grad_norm": 2.03125, + "grad_norm_var": 0.0834673563639323, + "learning_rate": 0.0001, + "loss": 3.7945, + "loss/crossentropy": 1.6309250593185425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17680996656417847, + "step": 21256 + }, + { + "epoch": 0.42516, + "grad_norm": 2.171875, + "grad_norm_var": 0.08495992024739583, + "learning_rate": 0.0001, + "loss": 3.97, + "loss/crossentropy": 1.9581794142723083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18848375976085663, + "step": 21258 + }, + { + "epoch": 0.4252, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0839508056640625, + "learning_rate": 0.0001, + "loss": 4.1796, + "loss/crossentropy": 1.9206833839416504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18920007348060608, + "step": 21260 + }, + { + "epoch": 0.42524, + "grad_norm": 1.9765625, + "grad_norm_var": 0.08524169921875, + "learning_rate": 0.0001, + "loss": 3.8909, + "loss/crossentropy": 1.736355721950531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17645896971225739, + "step": 21262 + }, + { + "epoch": 0.42528, + "grad_norm": 2.328125, + "grad_norm_var": 0.0872711181640625, + "learning_rate": 0.0001, + "loss": 4.0856, + "loss/crossentropy": 1.9830248355865479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18551607429981232, + "step": 21264 + }, + { + "epoch": 0.42532, + "grad_norm": 1.8359375, + "grad_norm_var": 0.08831558227539063, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.8477718234062195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18887290358543396, + "step": 21266 + }, + { + "epoch": 0.42536, + "grad_norm": 1.921875, + "grad_norm_var": 0.08277359008789062, + "learning_rate": 0.0001, + "loss": 3.8833, + "loss/crossentropy": 2.024181544780731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20996837317943573, + "step": 21268 + }, + { + "epoch": 0.4254, + "grad_norm": 1.984375, + "grad_norm_var": 0.021930948893229166, + "learning_rate": 0.0001, + "loss": 4.0265, + "loss/crossentropy": 2.153610110282898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962708830833435, + "step": 21270 + }, + { + "epoch": 0.42544, + "grad_norm": 1.8359375, + "grad_norm_var": 0.019359334309895834, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 2.142420172691345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167380526661873, + "step": 21272 + }, + { + "epoch": 0.42548, + "grad_norm": 1.984375, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.1983, + "loss/crossentropy": 2.091115117073059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20703484117984772, + "step": 21274 + }, + { + "epoch": 0.42552, + "grad_norm": 2.015625, + "grad_norm_var": 0.017235310872395833, + "learning_rate": 0.0001, + "loss": 4.1653, + "loss/crossentropy": 2.0014833211898804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172148898243904, + "step": 21276 + }, + { + "epoch": 0.42556, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015340169270833334, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 1.8580491542816162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861124113202095, + "step": 21278 + }, + { + "epoch": 0.4256, + "grad_norm": 1.859375, + "grad_norm_var": 0.0071980794270833336, + "learning_rate": 0.0001, + "loss": 4.1013, + "loss/crossentropy": 2.1323113441467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839580088853836, + "step": 21280 + }, + { + "epoch": 0.42564, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005952707926432292, + "learning_rate": 0.0001, + "loss": 4.0295, + "loss/crossentropy": 2.107018530368805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19513815641403198, + "step": 21282 + }, + { + "epoch": 0.42568, + "grad_norm": 2.0, + "grad_norm_var": 0.005882771809895834, + "learning_rate": 0.0001, + "loss": 3.989, + "loss/crossentropy": 2.025463044643402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19870947301387787, + "step": 21284 + }, + { + "epoch": 0.42572, + "grad_norm": 2.125, + "grad_norm_var": 0.007429758707682292, + "learning_rate": 0.0001, + "loss": 4.2469, + "loss/crossentropy": 2.348206877708435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326420098543167, + "step": 21286 + }, + { + "epoch": 0.42576, + "grad_norm": 1.90625, + "grad_norm_var": 0.005037434895833333, + "learning_rate": 0.0001, + "loss": 4.1699, + "loss/crossentropy": 2.2780548334121704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905437633395195, + "step": 21288 + }, + { + "epoch": 0.4258, + "grad_norm": 1.953125, + "grad_norm_var": 0.004400380452473958, + "learning_rate": 0.0001, + "loss": 4.1373, + "loss/crossentropy": 2.1915369629859924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19397883117198944, + "step": 21290 + }, + { + "epoch": 0.42584, + "grad_norm": 2.015625, + "grad_norm_var": 0.004353586832682292, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 2.132679283618927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21279992163181305, + "step": 21292 + }, + { + "epoch": 0.42588, + "grad_norm": 2.0625, + "grad_norm_var": 0.006058756510416667, + "learning_rate": 0.0001, + "loss": 3.9847, + "loss/crossentropy": 1.860496699810028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1666220724582672, + "step": 21294 + }, + { + "epoch": 0.42592, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005535634358723959, + "learning_rate": 0.0001, + "loss": 4.1154, + "loss/crossentropy": 2.031413435935974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919916793704033, + "step": 21296 + }, + { + "epoch": 0.42596, + "grad_norm": 1.953125, + "grad_norm_var": 0.005296834309895833, + "learning_rate": 0.0001, + "loss": 3.7703, + "loss/crossentropy": 1.6095055937767029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1592901274561882, + "step": 21298 + }, + { + "epoch": 0.426, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0052886962890625, + "learning_rate": 0.0001, + "loss": 4.1652, + "loss/crossentropy": 2.002205550670624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18984580785036087, + "step": 21300 + }, + { + "epoch": 0.42604, + "grad_norm": 1.9375, + "grad_norm_var": 0.0029856363932291665, + "learning_rate": 0.0001, + "loss": 4.1755, + "loss/crossentropy": 2.022138476371765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904059648513794, + "step": 21302 + }, + { + "epoch": 0.42608, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0036801656087239584, + "learning_rate": 0.0001, + "loss": 3.8417, + "loss/crossentropy": 1.917612910270691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18372280895709991, + "step": 21304 + }, + { + "epoch": 0.42612, + "grad_norm": 1.96875, + "grad_norm_var": 0.004426829020182292, + "learning_rate": 0.0001, + "loss": 3.8031, + "loss/crossentropy": 1.827630877494812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18420201539993286, + "step": 21306 + }, + { + "epoch": 0.42616, + "grad_norm": 1.875, + "grad_norm_var": 0.004115549723307291, + "learning_rate": 0.0001, + "loss": 3.8427, + "loss/crossentropy": 1.7498629689216614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1638321429491043, + "step": 21308 + }, + { + "epoch": 0.4262, + "grad_norm": 2.0625, + "grad_norm_var": 0.0036374409993489582, + "learning_rate": 0.0001, + "loss": 4.0028, + "loss/crossentropy": 1.7890136241912842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18237848579883575, + "step": 21310 + }, + { + "epoch": 0.42624, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0033518473307291665, + "learning_rate": 0.0001, + "loss": 4.1649, + "loss/crossentropy": 2.17901873588562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104976773262024, + "step": 21312 + }, + { + "epoch": 0.42628, + "grad_norm": 1.9140625, + "grad_norm_var": 0.003987630208333333, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.322218656539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20789335668087006, + "step": 21314 + }, + { + "epoch": 0.42632, + "grad_norm": 2.015625, + "grad_norm_var": 0.004515584309895833, + "learning_rate": 0.0001, + "loss": 4.2208, + "loss/crossentropy": 2.4203622341156006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21352334320545197, + "step": 21316 + }, + { + "epoch": 0.42636, + "grad_norm": 2.125, + "grad_norm_var": 0.008388010660807292, + "learning_rate": 0.0001, + "loss": 3.7956, + "loss/crossentropy": 2.2349472045898438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009701356291771, + "step": 21318 + }, + { + "epoch": 0.4264, + "grad_norm": 2.078125, + "grad_norm_var": 0.009511057535807292, + "learning_rate": 0.0001, + "loss": 3.9733, + "loss/crossentropy": 1.8173826336860657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736140102148056, + "step": 21320 + }, + { + "epoch": 0.42644, + "grad_norm": 1.7421875, + "grad_norm_var": 0.011315663655598959, + "learning_rate": 0.0001, + "loss": 3.6665, + "loss/crossentropy": 2.099972426891327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19424229115247726, + "step": 21322 + }, + { + "epoch": 0.42648, + "grad_norm": 1.9375, + "grad_norm_var": 0.06534830729166667, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.8005958795547485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17232687771320343, + "step": 21324 + }, + { + "epoch": 0.42652, + "grad_norm": 2.09375, + "grad_norm_var": 0.06697184244791667, + "learning_rate": 0.0001, + "loss": 4.1434, + "loss/crossentropy": 2.0184829235076904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19896141439676285, + "step": 21326 + }, + { + "epoch": 0.42656, + "grad_norm": 1.7421875, + "grad_norm_var": 0.07361551920572916, + "learning_rate": 0.0001, + "loss": 3.7018, + "loss/crossentropy": 2.0207254886627197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863861232995987, + "step": 21328 + }, + { + "epoch": 0.4266, + "grad_norm": 1.9609375, + "grad_norm_var": 0.07342020670572917, + "learning_rate": 0.0001, + "loss": 4.2953, + "loss/crossentropy": 2.1894538402557373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21097677946090698, + "step": 21330 + }, + { + "epoch": 0.42664, + "grad_norm": 1.8125, + "grad_norm_var": 0.07691650390625, + "learning_rate": 0.0001, + "loss": 4.1044, + "loss/crossentropy": 2.1939873695373535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935359239578247, + "step": 21332 + }, + { + "epoch": 0.42668, + "grad_norm": 1.9375, + "grad_norm_var": 0.07349624633789062, + "learning_rate": 0.0001, + "loss": 4.1346, + "loss/crossentropy": 2.131330966949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001090720295906, + "step": 21334 + }, + { + "epoch": 0.42672, + "grad_norm": 1.828125, + "grad_norm_var": 0.083154296875, + "learning_rate": 0.0001, + "loss": 3.9866, + "loss/crossentropy": 1.9254841208457947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20036878436803818, + "step": 21336 + }, + { + "epoch": 0.42676, + "grad_norm": 1.8828125, + "grad_norm_var": 0.07875874837239584, + "learning_rate": 0.0001, + "loss": 4.0641, + "loss/crossentropy": 1.9841803312301636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805853068828583, + "step": 21338 + }, + { + "epoch": 0.4268, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02767918904622396, + "learning_rate": 0.0001, + "loss": 3.6806, + "loss/crossentropy": 1.6822729110717773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18236130475997925, + "step": 21340 + }, + { + "epoch": 0.42684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.023789215087890624, + "learning_rate": 0.0001, + "loss": 4.1128, + "loss/crossentropy": 2.0904295444488525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326913356781006, + "step": 21342 + }, + { + "epoch": 0.42688, + "grad_norm": 1.953125, + "grad_norm_var": 0.01951878865559896, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 2.111131250858307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202858105301857, + "step": 21344 + }, + { + "epoch": 0.42692, + "grad_norm": 1.890625, + "grad_norm_var": 0.019197591145833335, + "learning_rate": 0.0001, + "loss": 4.2061, + "loss/crossentropy": 2.2793352603912354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20112870633602142, + "step": 21346 + }, + { + "epoch": 0.42696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.016112263997395834, + "learning_rate": 0.0001, + "loss": 4.1173, + "loss/crossentropy": 2.253583312034607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21494006365537643, + "step": 21348 + }, + { + "epoch": 0.427, + "grad_norm": 1.9140625, + "grad_norm_var": 0.016532389322916667, + "learning_rate": 0.0001, + "loss": 4.2371, + "loss/crossentropy": 1.9753515124320984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828732267022133, + "step": 21350 + }, + { + "epoch": 0.42704, + "grad_norm": 1.921875, + "grad_norm_var": 0.0023455301920572917, + "learning_rate": 0.0001, + "loss": 4.0961, + "loss/crossentropy": 2.110904037952423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640487432479858, + "step": 21352 + }, + { + "epoch": 0.42708, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0022905985514322918, + "learning_rate": 0.0001, + "loss": 4.1961, + "loss/crossentropy": 2.323657512664795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206008955836296, + "step": 21354 + }, + { + "epoch": 0.42712, + "grad_norm": 1.8984375, + "grad_norm_var": 0.003507232666015625, + "learning_rate": 0.0001, + "loss": 4.1613, + "loss/crossentropy": 2.113715887069702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18136650323867798, + "step": 21356 + }, + { + "epoch": 0.42716, + "grad_norm": 1.96875, + "grad_norm_var": 0.004207102457682291, + "learning_rate": 0.0001, + "loss": 3.9951, + "loss/crossentropy": 2.096550941467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20972034335136414, + "step": 21358 + }, + { + "epoch": 0.4272, + "grad_norm": 1.9375, + "grad_norm_var": 0.004449208577473958, + "learning_rate": 0.0001, + "loss": 4.0398, + "loss/crossentropy": 2.1959888339042664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19834591448307037, + "step": 21360 + }, + { + "epoch": 0.42724, + "grad_norm": 1.8125, + "grad_norm_var": 0.0054351806640625, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 2.146053671836853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18752741813659668, + "step": 21362 + }, + { + "epoch": 0.42728, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005728912353515625, + "learning_rate": 0.0001, + "loss": 3.6564, + "loss/crossentropy": 1.9178010821342468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413734018802643, + "step": 21364 + }, + { + "epoch": 0.42732, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005283355712890625, + "learning_rate": 0.0001, + "loss": 3.884, + "loss/crossentropy": 1.8283003568649292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882033348083496, + "step": 21366 + }, + { + "epoch": 0.42736, + "grad_norm": 2.1875, + "grad_norm_var": 0.0103759765625, + "learning_rate": 0.0001, + "loss": 4.362, + "loss/crossentropy": 2.1956799030303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325727552175522, + "step": 21368 + }, + { + "epoch": 0.4274, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010381825764973958, + "learning_rate": 0.0001, + "loss": 3.9304, + "loss/crossentropy": 1.8081734776496887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818179190158844, + "step": 21370 + }, + { + "epoch": 0.42744, + "grad_norm": 1.921875, + "grad_norm_var": 0.008699544270833333, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 2.283393979072571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20135364681482315, + "step": 21372 + }, + { + "epoch": 0.42748, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013038889567057291, + "learning_rate": 0.0001, + "loss": 3.7118, + "loss/crossentropy": 1.9896268844604492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17712793499231339, + "step": 21374 + }, + { + "epoch": 0.42752, + "grad_norm": 2.0, + "grad_norm_var": 0.017289225260416666, + "learning_rate": 0.0001, + "loss": 4.2804, + "loss/crossentropy": 2.1862595081329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19685917347669601, + "step": 21376 + }, + { + "epoch": 0.42756, + "grad_norm": 1.890625, + "grad_norm_var": 0.016355133056640624, + "learning_rate": 0.0001, + "loss": 3.9852, + "loss/crossentropy": 2.097273588180542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204267680644989, + "step": 21378 + }, + { + "epoch": 0.4276, + "grad_norm": 1.9140625, + "grad_norm_var": 0.016031646728515626, + "learning_rate": 0.0001, + "loss": 4.0112, + "loss/crossentropy": 2.0905693769454956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17958275228738785, + "step": 21380 + }, + { + "epoch": 0.42764, + "grad_norm": 2.0, + "grad_norm_var": 0.016502888997395833, + "learning_rate": 0.0001, + "loss": 4.2815, + "loss/crossentropy": 2.1066064834594727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18819285184144974, + "step": 21382 + }, + { + "epoch": 0.42768, + "grad_norm": 2.109375, + "grad_norm_var": 0.0134429931640625, + "learning_rate": 0.0001, + "loss": 4.2721, + "loss/crossentropy": 1.9919481873512268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19579682499170303, + "step": 21384 + }, + { + "epoch": 0.42772, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01337890625, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 1.8611761927604675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929011344909668, + "step": 21386 + }, + { + "epoch": 0.42776, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015592193603515625, + "learning_rate": 0.0001, + "loss": 3.9346, + "loss/crossentropy": 2.232940196990967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934458464384079, + "step": 21388 + }, + { + "epoch": 0.4278, + "grad_norm": 2.046875, + "grad_norm_var": 0.0073883056640625, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.2017111778259277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22515927255153656, + "step": 21390 + }, + { + "epoch": 0.42784, + "grad_norm": 2.1875, + "grad_norm_var": 0.0079498291015625, + "learning_rate": 0.0001, + "loss": 4.3437, + "loss/crossentropy": 2.4802298545837402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20159347355365753, + "step": 21392 + }, + { + "epoch": 0.42788, + "grad_norm": 2.03125, + "grad_norm_var": 0.0065093994140625, + "learning_rate": 0.0001, + "loss": 4.1095, + "loss/crossentropy": 2.1320372819900513, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19364330172538757, + "step": 21394 + }, + { + "epoch": 0.42792, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0069488525390625, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 2.115567684173584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150493860244751, + "step": 21396 + }, + { + "epoch": 0.42796, + "grad_norm": 2.1875, + "grad_norm_var": 0.008259073893229166, + "learning_rate": 0.0001, + "loss": 4.4131, + "loss/crossentropy": 2.3852893114089966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21371107548475266, + "step": 21398 + }, + { + "epoch": 0.428, + "grad_norm": 2.0625, + "grad_norm_var": 0.010204060872395834, + "learning_rate": 0.0001, + "loss": 3.9377, + "loss/crossentropy": 1.872315526008606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17347048223018646, + "step": 21400 + }, + { + "epoch": 0.42804, + "grad_norm": 3.9375, + "grad_norm_var": 0.23815816243489582, + "learning_rate": 0.0001, + "loss": 4.0048, + "loss/crossentropy": 2.064896881580353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20905783772468567, + "step": 21402 + }, + { + "epoch": 0.42808, + "grad_norm": 1.90625, + "grad_norm_var": 0.24268290201822917, + "learning_rate": 0.0001, + "loss": 3.6606, + "loss/crossentropy": 1.90069580078125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19062762707471848, + "step": 21404 + }, + { + "epoch": 0.42812, + "grad_norm": 1.84375, + "grad_norm_var": 0.24911473592122396, + "learning_rate": 0.0001, + "loss": 4.1083, + "loss/crossentropy": 1.9533473253250122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17531338334083557, + "step": 21406 + }, + { + "epoch": 0.42816, + "grad_norm": 2.03125, + "grad_norm_var": 0.25371475219726564, + "learning_rate": 0.0001, + "loss": 3.9376, + "loss/crossentropy": 1.9835584163665771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894006848335266, + "step": 21408 + }, + { + "epoch": 0.4282, + "grad_norm": 1.8359375, + "grad_norm_var": 0.26023661295572914, + "learning_rate": 0.0001, + "loss": 3.6515, + "loss/crossentropy": 2.2370243072509766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20410479605197906, + "step": 21410 + }, + { + "epoch": 0.42824, + "grad_norm": 2.09375, + "grad_norm_var": 0.2575152079264323, + "learning_rate": 0.0001, + "loss": 3.8597, + "loss/crossentropy": 2.0012041330337524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18770772218704224, + "step": 21412 + }, + { + "epoch": 0.42828, + "grad_norm": 1.8984375, + "grad_norm_var": 0.2589312235514323, + "learning_rate": 0.0001, + "loss": 4.1147, + "loss/crossentropy": 2.0049465894699097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19158685207366943, + "step": 21414 + }, + { + "epoch": 0.42832, + "grad_norm": 2.046875, + "grad_norm_var": 0.2571207682291667, + "learning_rate": 0.0001, + "loss": 3.9892, + "loss/crossentropy": 2.1261045932769775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21888310462236404, + "step": 21416 + }, + { + "epoch": 0.42836, + "grad_norm": 2.046875, + "grad_norm_var": 0.007771809895833333, + "learning_rate": 0.0001, + "loss": 4.2261, + "loss/crossentropy": 2.3438754081726074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22312359511852264, + "step": 21418 + }, + { + "epoch": 0.4284, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0075457255045572914, + "learning_rate": 0.0001, + "loss": 4.0333, + "loss/crossentropy": 1.970156729221344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17968790978193283, + "step": 21420 + }, + { + "epoch": 0.42844, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007356516520182292, + "learning_rate": 0.0001, + "loss": 4.1106, + "loss/crossentropy": 2.0128689408302307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190870463848114, + "step": 21422 + }, + { + "epoch": 0.42848, + "grad_norm": 2.734375, + "grad_norm_var": 0.044755045572916666, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 1.766348421573639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758265122771263, + "step": 21424 + }, + { + "epoch": 0.42852, + "grad_norm": 2.125, + "grad_norm_var": 0.041757965087890626, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 1.9646649360656738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052430659532547, + "step": 21426 + }, + { + "epoch": 0.42856, + "grad_norm": 1.921875, + "grad_norm_var": 0.042281087239583334, + "learning_rate": 0.0001, + "loss": 4.0817, + "loss/crossentropy": 1.8953626155853271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974174976348877, + "step": 21428 + }, + { + "epoch": 0.4286, + "grad_norm": 1.765625, + "grad_norm_var": 0.04601008097330729, + "learning_rate": 0.0001, + "loss": 3.7752, + "loss/crossentropy": 1.9713319540023804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18385013192892075, + "step": 21430 + }, + { + "epoch": 0.42864, + "grad_norm": 1.875, + "grad_norm_var": 0.047501627604166666, + "learning_rate": 0.0001, + "loss": 4.1555, + "loss/crossentropy": 2.2564018964767456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19498983025550842, + "step": 21432 + }, + { + "epoch": 0.42868, + "grad_norm": 2.21875, + "grad_norm_var": 0.05091044108072917, + "learning_rate": 0.0001, + "loss": 4.1351, + "loss/crossentropy": 2.1691616773605347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22087589651346207, + "step": 21434 + }, + { + "epoch": 0.42872, + "grad_norm": 9.1875, + "grad_norm_var": 3.257096354166667, + "learning_rate": 0.0001, + "loss": 4.4825, + "loss/crossentropy": 1.9714866280555725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19079908728599548, + "step": 21436 + }, + { + "epoch": 0.42876, + "grad_norm": 2.0, + "grad_norm_var": 3.2328834533691406, + "learning_rate": 0.0001, + "loss": 4.0687, + "loss/crossentropy": 2.2311829328536987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19929265975952148, + "step": 21438 + }, + { + "epoch": 0.4288, + "grad_norm": 1.984375, + "grad_norm_var": 3.231501261393229, + "learning_rate": 0.0001, + "loss": 4.0397, + "loss/crossentropy": 1.9644780158996582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1768122911453247, + "step": 21440 + }, + { + "epoch": 0.42884, + "grad_norm": 1.796875, + "grad_norm_var": 3.253226725260417, + "learning_rate": 0.0001, + "loss": 3.9993, + "loss/crossentropy": 1.93829345703125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19605614244937897, + "step": 21442 + }, + { + "epoch": 0.42888, + "grad_norm": 1.9140625, + "grad_norm_var": 3.2553304036458335, + "learning_rate": 0.0001, + "loss": 3.8987, + "loss/crossentropy": 1.9488537907600403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282328873872757, + "step": 21444 + }, + { + "epoch": 0.42892, + "grad_norm": 1.75, + "grad_norm_var": 3.254095204671224, + "learning_rate": 0.0001, + "loss": 3.8275, + "loss/crossentropy": 1.93411123752594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18476436287164688, + "step": 21446 + }, + { + "epoch": 0.42896, + "grad_norm": 1.8828125, + "grad_norm_var": 3.24100341796875, + "learning_rate": 0.0001, + "loss": 4.1485, + "loss/crossentropy": 2.4150885343551636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21736841648817062, + "step": 21448 + }, + { + "epoch": 0.429, + "grad_norm": 2.078125, + "grad_norm_var": 3.2536610921223956, + "learning_rate": 0.0001, + "loss": 4.227, + "loss/crossentropy": 2.117435574531555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20417648553848267, + "step": 21450 + }, + { + "epoch": 0.42904, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 3.9162, + "loss/crossentropy": 2.0840513706207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20294595509767532, + "step": 21452 + }, + { + "epoch": 0.42908, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0087066650390625, + "learning_rate": 0.0001, + "loss": 3.9088, + "loss/crossentropy": 2.1203198432922363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19258832931518555, + "step": 21454 + }, + { + "epoch": 0.42912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007503000895182291, + "learning_rate": 0.0001, + "loss": 4.1205, + "loss/crossentropy": 2.1161458492279053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18999946117401123, + "step": 21456 + }, + { + "epoch": 0.42916, + "grad_norm": 1.828125, + "grad_norm_var": 0.006876373291015625, + "learning_rate": 0.0001, + "loss": 3.9288, + "loss/crossentropy": 1.7208130955696106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20022748410701752, + "step": 21458 + }, + { + "epoch": 0.4292, + "grad_norm": 2.046875, + "grad_norm_var": 0.007940419514973958, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.0148571729660034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059563249349594, + "step": 21460 + }, + { + "epoch": 0.42924, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006615193684895834, + "learning_rate": 0.0001, + "loss": 4.0039, + "loss/crossentropy": 1.985020637512207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18583452701568604, + "step": 21462 + }, + { + "epoch": 0.42928, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0056955973307291664, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 1.8767080903053284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18215155601501465, + "step": 21464 + }, + { + "epoch": 0.42932, + "grad_norm": 1.9375, + "grad_norm_var": 0.0037638346354166665, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 2.162920832633972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20505908131599426, + "step": 21466 + }, + { + "epoch": 0.42936, + "grad_norm": 2.203125, + "grad_norm_var": 0.00985107421875, + "learning_rate": 0.0001, + "loss": 4.0253, + "loss/crossentropy": 2.329980731010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23726581782102585, + "step": 21468 + }, + { + "epoch": 0.4294, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01512451171875, + "learning_rate": 0.0001, + "loss": 4.0204, + "loss/crossentropy": 2.020410180091858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389280676841736, + "step": 21470 + }, + { + "epoch": 0.42944, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01613133748372396, + "learning_rate": 0.0001, + "loss": 4.0849, + "loss/crossentropy": 2.157116711139679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991463601589203, + "step": 21472 + }, + { + "epoch": 0.42948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014992014567057291, + "learning_rate": 0.0001, + "loss": 4.0913, + "loss/crossentropy": 2.0108723640441895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19546864926815033, + "step": 21474 + }, + { + "epoch": 0.42952, + "grad_norm": 1.953125, + "grad_norm_var": 0.0141998291015625, + "learning_rate": 0.0001, + "loss": 4.2555, + "loss/crossentropy": 2.154988646507263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19204716384410858, + "step": 21476 + }, + { + "epoch": 0.42956, + "grad_norm": 1.890625, + "grad_norm_var": 0.013549550374348959, + "learning_rate": 0.0001, + "loss": 3.7924, + "loss/crossentropy": 1.9463204145431519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041161060333252, + "step": 21478 + }, + { + "epoch": 0.4296, + "grad_norm": 2.015625, + "grad_norm_var": 0.013459269205729167, + "learning_rate": 0.0001, + "loss": 4.1798, + "loss/crossentropy": 2.3123443126678467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649713814258575, + "step": 21480 + }, + { + "epoch": 0.42964, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015160115559895833, + "learning_rate": 0.0001, + "loss": 3.8571, + "loss/crossentropy": 1.658549726009369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16132741421461105, + "step": 21482 + }, + { + "epoch": 0.42968, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010282389322916667, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 1.9251441955566406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17892278730869293, + "step": 21484 + }, + { + "epoch": 0.42972, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004523722330729166, + "learning_rate": 0.0001, + "loss": 4.002, + "loss/crossentropy": 1.7956182956695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17938754707574844, + "step": 21486 + }, + { + "epoch": 0.42976, + "grad_norm": 2.046875, + "grad_norm_var": 0.004312896728515625, + "learning_rate": 0.0001, + "loss": 4.0373, + "loss/crossentropy": 1.941636562347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159031555056572, + "step": 21488 + }, + { + "epoch": 0.4298, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0047108968098958336, + "learning_rate": 0.0001, + "loss": 4.2861, + "loss/crossentropy": 2.4158248901367188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2346770316362381, + "step": 21490 + }, + { + "epoch": 0.42984, + "grad_norm": 2.171875, + "grad_norm_var": 0.009110260009765624, + "learning_rate": 0.0001, + "loss": 4.0334, + "loss/crossentropy": 1.749859094619751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17030366510152817, + "step": 21492 + }, + { + "epoch": 0.42988, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008622233072916667, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.0735827684402466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986066773533821, + "step": 21494 + }, + { + "epoch": 0.42992, + "grad_norm": 1.859375, + "grad_norm_var": 0.008556874593098958, + "learning_rate": 0.0001, + "loss": 3.8175, + "loss/crossentropy": 1.744678020477295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16986121982336044, + "step": 21496 + }, + { + "epoch": 0.42996, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007559967041015625, + "learning_rate": 0.0001, + "loss": 4.116, + "loss/crossentropy": 2.060591220855713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20758916437625885, + "step": 21498 + }, + { + "epoch": 0.43, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0074045817057291664, + "learning_rate": 0.0001, + "loss": 3.967, + "loss/crossentropy": 2.2836644649505615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20620425045490265, + "step": 21500 + }, + { + "epoch": 0.43004, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010252888997395833, + "learning_rate": 0.0001, + "loss": 3.9827, + "loss/crossentropy": 2.1133594512939453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954261213541031, + "step": 21502 + }, + { + "epoch": 0.43008, + "grad_norm": 2.03125, + "grad_norm_var": 0.0100250244140625, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 2.5492414236068726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23168637603521347, + "step": 21504 + }, + { + "epoch": 0.43012, + "grad_norm": 1.78125, + "grad_norm_var": 0.0113922119140625, + "learning_rate": 0.0001, + "loss": 3.6322, + "loss/crossentropy": 1.7122142314910889, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16849397122859955, + "step": 21506 + }, + { + "epoch": 0.43016, + "grad_norm": 2.015625, + "grad_norm_var": 0.0077288309733072914, + "learning_rate": 0.0001, + "loss": 4.0675, + "loss/crossentropy": 2.3052932024002075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21252303570508957, + "step": 21508 + }, + { + "epoch": 0.4302, + "grad_norm": 2.046875, + "grad_norm_var": 0.009570058186848958, + "learning_rate": 0.0001, + "loss": 3.9751, + "loss/crossentropy": 2.5156666040420532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26235880702733994, + "step": 21510 + }, + { + "epoch": 0.43024, + "grad_norm": 1.9375, + "grad_norm_var": 0.009419504801432292, + "learning_rate": 0.0001, + "loss": 3.8061, + "loss/crossentropy": 1.884554922580719, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1867503896355629, + "step": 21512 + }, + { + "epoch": 0.43028, + "grad_norm": 2.25, + "grad_norm_var": 0.01619440714518229, + "learning_rate": 0.0001, + "loss": 4.1092, + "loss/crossentropy": 2.1004000902175903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19210458546876907, + "step": 21514 + }, + { + "epoch": 0.43032, + "grad_norm": 1.890625, + "grad_norm_var": 0.01565526326497396, + "learning_rate": 0.0001, + "loss": 4.0146, + "loss/crossentropy": 1.606240451335907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15947996824979782, + "step": 21516 + }, + { + "epoch": 0.43036, + "grad_norm": 1.90625, + "grad_norm_var": 0.013130696614583333, + "learning_rate": 0.0001, + "loss": 4.1195, + "loss/crossentropy": 1.9417288303375244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19736050814390182, + "step": 21518 + }, + { + "epoch": 0.4304, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014204915364583333, + "learning_rate": 0.0001, + "loss": 3.8322, + "loss/crossentropy": 1.9607113599777222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17349773645401, + "step": 21520 + }, + { + "epoch": 0.43044, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012105305989583334, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 1.9606893062591553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726546436548233, + "step": 21522 + }, + { + "epoch": 0.43048, + "grad_norm": 2.03125, + "grad_norm_var": 0.011915842692057291, + "learning_rate": 0.0001, + "loss": 4.1203, + "loss/crossentropy": 1.9426813125610352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19272266328334808, + "step": 21524 + }, + { + "epoch": 0.43052, + "grad_norm": 1.78125, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 3.8158, + "loss/crossentropy": 2.1843879222869873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986662521958351, + "step": 21526 + }, + { + "epoch": 0.43056, + "grad_norm": 1.90625, + "grad_norm_var": 0.011146799723307291, + "learning_rate": 0.0001, + "loss": 3.8314, + "loss/crossentropy": 1.7259829640388489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1759045571088791, + "step": 21528 + }, + { + "epoch": 0.4306, + "grad_norm": 2.015625, + "grad_norm_var": 0.004670969645182292, + "learning_rate": 0.0001, + "loss": 3.7475, + "loss/crossentropy": 1.7714526653289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20490919053554535, + "step": 21530 + }, + { + "epoch": 0.43064, + "grad_norm": 2.046875, + "grad_norm_var": 0.005631256103515625, + "learning_rate": 0.0001, + "loss": 3.905, + "loss/crossentropy": 1.9385235905647278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19307191669940948, + "step": 21532 + }, + { + "epoch": 0.43068, + "grad_norm": 1.9375, + "grad_norm_var": 0.006034088134765625, + "learning_rate": 0.0001, + "loss": 4.2582, + "loss/crossentropy": 1.956704020500183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24551841616630554, + "step": 21534 + }, + { + "epoch": 0.43072, + "grad_norm": 2.015625, + "grad_norm_var": 0.0048736572265625, + "learning_rate": 0.0001, + "loss": 4.0436, + "loss/crossentropy": 2.0809181332588196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20348872244358063, + "step": 21536 + }, + { + "epoch": 0.43076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004589589436848959, + "learning_rate": 0.0001, + "loss": 4.1488, + "loss/crossentropy": 1.9148198366165161, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19790956377983093, + "step": 21538 + }, + { + "epoch": 0.4308, + "grad_norm": 2.171875, + "grad_norm_var": 0.007207997639973958, + "learning_rate": 0.0001, + "loss": 3.9439, + "loss/crossentropy": 2.0833089351654053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19176144897937775, + "step": 21540 + }, + { + "epoch": 0.43084, + "grad_norm": 1.828125, + "grad_norm_var": 0.006376139322916667, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 2.1066776514053345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20604360103607178, + "step": 21542 + }, + { + "epoch": 0.43088, + "grad_norm": 1.921875, + "grad_norm_var": 0.006248982747395834, + "learning_rate": 0.0001, + "loss": 3.9294, + "loss/crossentropy": 2.125930666923523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22426339238882065, + "step": 21544 + }, + { + "epoch": 0.43092, + "grad_norm": 1.90625, + "grad_norm_var": 0.0062978108723958336, + "learning_rate": 0.0001, + "loss": 3.8652, + "loss/crossentropy": 1.743666172027588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1681881994009018, + "step": 21546 + }, + { + "epoch": 0.43096, + "grad_norm": 2.078125, + "grad_norm_var": 0.00712890625, + "learning_rate": 0.0001, + "loss": 3.8952, + "loss/crossentropy": 1.882837951183319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18062414973974228, + "step": 21548 + }, + { + "epoch": 0.431, + "grad_norm": 1.78125, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 3.6019, + "loss/crossentropy": 1.8259800672531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19174453616142273, + "step": 21550 + }, + { + "epoch": 0.43104, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009199778238932291, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 2.3379745483398438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22536730766296387, + "step": 21552 + }, + { + "epoch": 0.43108, + "grad_norm": 2.09375, + "grad_norm_var": 0.010560862223307292, + "learning_rate": 0.0001, + "loss": 3.9447, + "loss/crossentropy": 2.123952627182007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21830175817012787, + "step": 21554 + }, + { + "epoch": 0.43112, + "grad_norm": 1.9375, + "grad_norm_var": 0.006849924723307292, + "learning_rate": 0.0001, + "loss": 4.1353, + "loss/crossentropy": 2.286398410797119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20413320511579514, + "step": 21556 + }, + { + "epoch": 0.43116, + "grad_norm": 2.015625, + "grad_norm_var": 0.006044260660807292, + "learning_rate": 0.0001, + "loss": 3.9117, + "loss/crossentropy": 2.0898618698120117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105274796485901, + "step": 21558 + }, + { + "epoch": 0.4312, + "grad_norm": 2.015625, + "grad_norm_var": 0.0067789713541666664, + "learning_rate": 0.0001, + "loss": 4.1431, + "loss/crossentropy": 1.940380573272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18282227218151093, + "step": 21560 + }, + { + "epoch": 0.43124, + "grad_norm": 2.078125, + "grad_norm_var": 0.007657623291015625, + "learning_rate": 0.0001, + "loss": 3.9952, + "loss/crossentropy": 1.6752060055732727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20306075364351273, + "step": 21562 + }, + { + "epoch": 0.43128, + "grad_norm": 2.234375, + "grad_norm_var": 0.011091105143229167, + "learning_rate": 0.0001, + "loss": 4.3745, + "loss/crossentropy": 2.234758734703064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19758003950119019, + "step": 21564 + }, + { + "epoch": 0.43132, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00819091796875, + "learning_rate": 0.0001, + "loss": 4.2822, + "loss/crossentropy": 2.126620352268219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20688238739967346, + "step": 21566 + }, + { + "epoch": 0.43136, + "grad_norm": 1.984375, + "grad_norm_var": 0.008084869384765625, + "learning_rate": 0.0001, + "loss": 4.1531, + "loss/crossentropy": 1.9397594332695007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18199825286865234, + "step": 21568 + }, + { + "epoch": 0.4314, + "grad_norm": 2.0625, + "grad_norm_var": 0.0080322265625, + "learning_rate": 0.0001, + "loss": 4.1644, + "loss/crossentropy": 2.1395343542099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074909508228302, + "step": 21570 + }, + { + "epoch": 0.43144, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010827382405598959, + "learning_rate": 0.0001, + "loss": 3.7397, + "loss/crossentropy": 1.9207513332366943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881982833147049, + "step": 21572 + }, + { + "epoch": 0.43148, + "grad_norm": 2.34375, + "grad_norm_var": 0.01856053670247396, + "learning_rate": 0.0001, + "loss": 4.1079, + "loss/crossentropy": 2.0409696102142334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21291138231754303, + "step": 21574 + }, + { + "epoch": 0.43152, + "grad_norm": 1.953125, + "grad_norm_var": 0.02153498331705729, + "learning_rate": 0.0001, + "loss": 4.0419, + "loss/crossentropy": 2.0753902792930603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20471004396677017, + "step": 21576 + }, + { + "epoch": 0.43156, + "grad_norm": 1.875, + "grad_norm_var": 0.022321573893229165, + "learning_rate": 0.0001, + "loss": 4.1617, + "loss/crossentropy": 2.412545084953308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21189319342374802, + "step": 21578 + }, + { + "epoch": 0.4316, + "grad_norm": 1.875, + "grad_norm_var": 0.019850413004557293, + "learning_rate": 0.0001, + "loss": 3.9468, + "loss/crossentropy": 1.9271405339241028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18710750341415405, + "step": 21580 + }, + { + "epoch": 0.43164, + "grad_norm": 2.0, + "grad_norm_var": 0.019321441650390625, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.000905692577362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17429713159799576, + "step": 21582 + }, + { + "epoch": 0.43168, + "grad_norm": 1.9140625, + "grad_norm_var": 0.019334920247395835, + "learning_rate": 0.0001, + "loss": 3.8559, + "loss/crossentropy": 1.9592137932777405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189944788813591, + "step": 21584 + }, + { + "epoch": 0.43172, + "grad_norm": 1.953125, + "grad_norm_var": 0.0188385009765625, + "learning_rate": 0.0001, + "loss": 4.1447, + "loss/crossentropy": 2.283234119415283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18164613842964172, + "step": 21586 + }, + { + "epoch": 0.43176, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0160064697265625, + "learning_rate": 0.0001, + "loss": 4.0019, + "loss/crossentropy": 2.1789008378982544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087143510580063, + "step": 21588 + }, + { + "epoch": 0.4318, + "grad_norm": 1.875, + "grad_norm_var": 0.009000396728515625, + "learning_rate": 0.0001, + "loss": 4.0039, + "loss/crossentropy": 2.192038893699646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432441890239716, + "step": 21590 + }, + { + "epoch": 0.43184, + "grad_norm": 1.84375, + "grad_norm_var": 0.005130767822265625, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 1.8618733286857605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18207471072673798, + "step": 21592 + }, + { + "epoch": 0.43188, + "grad_norm": 2.078125, + "grad_norm_var": 0.006186676025390625, + "learning_rate": 0.0001, + "loss": 4.3056, + "loss/crossentropy": 2.423651933670044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21819397807121277, + "step": 21594 + }, + { + "epoch": 0.43192, + "grad_norm": 1.984375, + "grad_norm_var": 0.005765533447265625, + "learning_rate": 0.0001, + "loss": 4.0506, + "loss/crossentropy": 1.7313976287841797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16011521220207214, + "step": 21596 + }, + { + "epoch": 0.43196, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0051043192545572914, + "learning_rate": 0.0001, + "loss": 4.0915, + "loss/crossentropy": 2.089974284172058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20138490200042725, + "step": 21598 + }, + { + "epoch": 0.432, + "grad_norm": 2.078125, + "grad_norm_var": 0.018895467122395832, + "learning_rate": 0.0001, + "loss": 4.3367, + "loss/crossentropy": 1.9697930812835693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22859974950551987, + "step": 21600 + }, + { + "epoch": 0.43204, + "grad_norm": 1.8671875, + "grad_norm_var": 0.019832356770833334, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 1.7605210542678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17675022035837173, + "step": 21602 + }, + { + "epoch": 0.43208, + "grad_norm": 1.953125, + "grad_norm_var": 0.019755045572916668, + "learning_rate": 0.0001, + "loss": 3.9467, + "loss/crossentropy": 1.9884806275367737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19690094888210297, + "step": 21604 + }, + { + "epoch": 0.43212, + "grad_norm": 1.9375, + "grad_norm_var": 0.018521881103515624, + "learning_rate": 0.0001, + "loss": 3.7705, + "loss/crossentropy": 1.987808346748352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20262162387371063, + "step": 21606 + }, + { + "epoch": 0.43216, + "grad_norm": 1.7578125, + "grad_norm_var": 0.026105753580729165, + "learning_rate": 0.0001, + "loss": 3.683, + "loss/crossentropy": 2.220315098762512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899535208940506, + "step": 21608 + }, + { + "epoch": 0.4322, + "grad_norm": 1.90625, + "grad_norm_var": 0.024347941080729168, + "learning_rate": 0.0001, + "loss": 4.0442, + "loss/crossentropy": 2.245596408843994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21112707257270813, + "step": 21610 + }, + { + "epoch": 0.43224, + "grad_norm": 1.90625, + "grad_norm_var": 0.0252838134765625, + "learning_rate": 0.0001, + "loss": 4.2442, + "loss/crossentropy": 2.3593950271606445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22111155092716217, + "step": 21612 + }, + { + "epoch": 0.43228, + "grad_norm": 1.9453125, + "grad_norm_var": 0.025721995035807292, + "learning_rate": 0.0001, + "loss": 3.7807, + "loss/crossentropy": 1.7467412948608398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16833512485027313, + "step": 21614 + }, + { + "epoch": 0.43232, + "grad_norm": 1.7578125, + "grad_norm_var": 0.011328125, + "learning_rate": 0.0001, + "loss": 3.815, + "loss/crossentropy": 2.160380005836487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22670412063598633, + "step": 21616 + }, + { + "epoch": 0.43236, + "grad_norm": 1.8125, + "grad_norm_var": 0.012007395426432291, + "learning_rate": 0.0001, + "loss": 3.9665, + "loss/crossentropy": 2.1650896072387695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19781000167131424, + "step": 21618 + }, + { + "epoch": 0.4324, + "grad_norm": 1.875, + "grad_norm_var": 0.011987050374348959, + "learning_rate": 0.0001, + "loss": 3.8044, + "loss/crossentropy": 2.028198480606079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905156373977661, + "step": 21620 + }, + { + "epoch": 0.43244, + "grad_norm": 1.859375, + "grad_norm_var": 0.014241282145182292, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 2.30538147687912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21474938094615936, + "step": 21622 + }, + { + "epoch": 0.43248, + "grad_norm": 1.90625, + "grad_norm_var": 0.009450022379557292, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 1.8054092526435852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1670922413468361, + "step": 21624 + }, + { + "epoch": 0.43252, + "grad_norm": 1.921875, + "grad_norm_var": 0.011578114827473958, + "learning_rate": 0.0001, + "loss": 4.2168, + "loss/crossentropy": 1.9821457862854004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19581317901611328, + "step": 21626 + }, + { + "epoch": 0.43256, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011197916666666667, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 2.2404789328575134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19181591272354126, + "step": 21628 + }, + { + "epoch": 0.4326, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013557942708333333, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.0229339003562927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2553243041038513, + "step": 21630 + }, + { + "epoch": 0.43264, + "grad_norm": 1.984375, + "grad_norm_var": 0.011122385660807291, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.013065457344055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24048484861850739, + "step": 21632 + }, + { + "epoch": 0.43268, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009173329671223958, + "learning_rate": 0.0001, + "loss": 3.9129, + "loss/crossentropy": 1.9411469101905823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19341549277305603, + "step": 21634 + }, + { + "epoch": 0.43272, + "grad_norm": 1.953125, + "grad_norm_var": 0.008558909098307291, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 2.1277971267700195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19953395426273346, + "step": 21636 + }, + { + "epoch": 0.43276, + "grad_norm": 2.015625, + "grad_norm_var": 0.0071795145670572914, + "learning_rate": 0.0001, + "loss": 4.2077, + "loss/crossentropy": 2.1296733617782593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117133647203445, + "step": 21638 + }, + { + "epoch": 0.4328, + "grad_norm": 2.109375, + "grad_norm_var": 0.009376780192057291, + "learning_rate": 0.0001, + "loss": 3.8594, + "loss/crossentropy": 1.6910184025764465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17567047476768494, + "step": 21640 + }, + { + "epoch": 0.43284, + "grad_norm": 1.8046875, + "grad_norm_var": 0.009845987955729166, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 1.9889350533485413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18229827284812927, + "step": 21642 + }, + { + "epoch": 0.43288, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008931223551432292, + "learning_rate": 0.0001, + "loss": 3.8053, + "loss/crossentropy": 1.885926902294159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18047264218330383, + "step": 21644 + }, + { + "epoch": 0.43292, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006471506754557292, + "learning_rate": 0.0001, + "loss": 3.9383, + "loss/crossentropy": 1.9706445336341858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18135320395231247, + "step": 21646 + }, + { + "epoch": 0.43296, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007127888997395833, + "learning_rate": 0.0001, + "loss": 3.8972, + "loss/crossentropy": 1.7751979231834412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1693567857146263, + "step": 21648 + }, + { + "epoch": 0.433, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006884511311848958, + "learning_rate": 0.0001, + "loss": 4.0094, + "loss/crossentropy": 2.025897264480591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943700194358826, + "step": 21650 + }, + { + "epoch": 0.43304, + "grad_norm": 1.84375, + "grad_norm_var": 0.007627105712890625, + "learning_rate": 0.0001, + "loss": 3.8503, + "loss/crossentropy": 1.7019765973091125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1738589182496071, + "step": 21652 + }, + { + "epoch": 0.43308, + "grad_norm": 2.140625, + "grad_norm_var": 0.010225168863932292, + "learning_rate": 0.0001, + "loss": 4.1418, + "loss/crossentropy": 1.8927075266838074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1616983264684677, + "step": 21654 + }, + { + "epoch": 0.43312, + "grad_norm": 2.09375, + "grad_norm_var": 0.012336222330729167, + "learning_rate": 0.0001, + "loss": 4.3299, + "loss/crossentropy": 2.2224843502044678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21432733535766602, + "step": 21656 + }, + { + "epoch": 0.43316, + "grad_norm": 2.21875, + "grad_norm_var": 0.01502685546875, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 2.1968295574188232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20150140672922134, + "step": 21658 + }, + { + "epoch": 0.4332, + "grad_norm": 2.046875, + "grad_norm_var": 0.015057118733723958, + "learning_rate": 0.0001, + "loss": 4.0046, + "loss/crossentropy": 2.1984314918518066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520624309778214, + "step": 21660 + }, + { + "epoch": 0.43324, + "grad_norm": 2.015625, + "grad_norm_var": 0.01613133748372396, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 2.183030843734741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994212418794632, + "step": 21662 + }, + { + "epoch": 0.43328, + "grad_norm": 1.984375, + "grad_norm_var": 0.016353352864583334, + "learning_rate": 0.0001, + "loss": 4.108, + "loss/crossentropy": 2.0716358423233032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18969116359949112, + "step": 21664 + }, + { + "epoch": 0.43332, + "grad_norm": 1.921875, + "grad_norm_var": 0.016043853759765626, + "learning_rate": 0.0001, + "loss": 4.0988, + "loss/crossentropy": 1.8427326679229736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19842079281806946, + "step": 21666 + }, + { + "epoch": 0.43336, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014058430989583334, + "learning_rate": 0.0001, + "loss": 4.0744, + "loss/crossentropy": 2.3021219968795776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21595575660467148, + "step": 21668 + }, + { + "epoch": 0.4334, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01593195597330729, + "learning_rate": 0.0001, + "loss": 3.8216, + "loss/crossentropy": 2.2828067541122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21141938865184784, + "step": 21670 + }, + { + "epoch": 0.43344, + "grad_norm": 2.0625, + "grad_norm_var": 0.014955393473307292, + "learning_rate": 0.0001, + "loss": 4.0502, + "loss/crossentropy": 2.167429566383362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22402016073465347, + "step": 21672 + }, + { + "epoch": 0.43348, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010632069905598958, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 2.02812397480011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061637043952942, + "step": 21674 + }, + { + "epoch": 0.43352, + "grad_norm": 1.7109375, + "grad_norm_var": 0.013724517822265626, + "learning_rate": 0.0001, + "loss": 3.6925, + "loss/crossentropy": 1.96470308303833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.168222576379776, + "step": 21676 + }, + { + "epoch": 0.43356, + "grad_norm": 2.03125, + "grad_norm_var": 0.013163248697916666, + "learning_rate": 0.0001, + "loss": 4.1567, + "loss/crossentropy": 2.640696406364441, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22549600154161453, + "step": 21678 + }, + { + "epoch": 0.4336, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010058339436848958, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.140254855155945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973073035478592, + "step": 21680 + }, + { + "epoch": 0.43364, + "grad_norm": 1.921875, + "grad_norm_var": 0.0110595703125, + "learning_rate": 0.0001, + "loss": 4.1028, + "loss/crossentropy": 2.3025078773498535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433188438415527, + "step": 21682 + }, + { + "epoch": 0.43368, + "grad_norm": 2.015625, + "grad_norm_var": 0.012452952067057292, + "learning_rate": 0.0001, + "loss": 4.344, + "loss/crossentropy": 2.1558977365493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20599104464054108, + "step": 21684 + }, + { + "epoch": 0.43372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014780426025390625, + "learning_rate": 0.0001, + "loss": 3.7918, + "loss/crossentropy": 1.8071665167808533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18443845212459564, + "step": 21686 + }, + { + "epoch": 0.43376, + "grad_norm": 1.984375, + "grad_norm_var": 0.018477121988932293, + "learning_rate": 0.0001, + "loss": 4.0928, + "loss/crossentropy": 2.41480815410614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22098398208618164, + "step": 21688 + }, + { + "epoch": 0.4338, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019551595052083332, + "learning_rate": 0.0001, + "loss": 3.9046, + "loss/crossentropy": 2.0506592988967896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17161494493484497, + "step": 21690 + }, + { + "epoch": 0.43384, + "grad_norm": 1.9453125, + "grad_norm_var": 0.018187459309895834, + "learning_rate": 0.0001, + "loss": 3.9579, + "loss/crossentropy": 2.170476496219635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19826646149158478, + "step": 21692 + }, + { + "epoch": 0.43388, + "grad_norm": 1.7890625, + "grad_norm_var": 0.019665273030598958, + "learning_rate": 0.0001, + "loss": 3.9356, + "loss/crossentropy": 1.6374267935752869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1581001877784729, + "step": 21694 + }, + { + "epoch": 0.43392, + "grad_norm": 1.90625, + "grad_norm_var": 0.018944295247395833, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 1.9059698581695557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18373095244169235, + "step": 21696 + }, + { + "epoch": 0.43396, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01762669881184896, + "learning_rate": 0.0001, + "loss": 4.2217, + "loss/crossentropy": 2.238908290863037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044297158718109, + "step": 21698 + }, + { + "epoch": 0.434, + "grad_norm": 2.015625, + "grad_norm_var": 0.01877415974934896, + "learning_rate": 0.0001, + "loss": 3.6318, + "loss/crossentropy": 1.7991122007369995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18948530405759811, + "step": 21700 + }, + { + "epoch": 0.43404, + "grad_norm": 1.828125, + "grad_norm_var": 0.01668268839518229, + "learning_rate": 0.0001, + "loss": 3.754, + "loss/crossentropy": 1.773992896080017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16768701374530792, + "step": 21702 + }, + { + "epoch": 0.43408, + "grad_norm": 2.09375, + "grad_norm_var": 0.013051096598307292, + "learning_rate": 0.0001, + "loss": 4.325, + "loss/crossentropy": 2.021883547306061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20071732252836227, + "step": 21704 + }, + { + "epoch": 0.43412, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0167144775390625, + "learning_rate": 0.0001, + "loss": 4.3162, + "loss/crossentropy": 2.2198556661605835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20410272479057312, + "step": 21706 + }, + { + "epoch": 0.43416, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013986968994140625, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 2.239061653614044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614881813526154, + "step": 21708 + }, + { + "epoch": 0.4342, + "grad_norm": 2.25, + "grad_norm_var": 0.01849543253580729, + "learning_rate": 0.0001, + "loss": 4.0531, + "loss/crossentropy": 1.720937430858612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19312319159507751, + "step": 21710 + }, + { + "epoch": 0.43424, + "grad_norm": 1.84375, + "grad_norm_var": 0.01881103515625, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.577925205230713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20975439995527267, + "step": 21712 + }, + { + "epoch": 0.43428, + "grad_norm": 2.109375, + "grad_norm_var": 0.019634755452473958, + "learning_rate": 0.0001, + "loss": 4.2833, + "loss/crossentropy": 2.3273919820785522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.203931525349617, + "step": 21714 + }, + { + "epoch": 0.43432, + "grad_norm": 1.8671875, + "grad_norm_var": 0.017316691080729165, + "learning_rate": 0.0001, + "loss": 3.7922, + "loss/crossentropy": 1.6773480772972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1546436920762062, + "step": 21716 + }, + { + "epoch": 0.43436, + "grad_norm": 1.875, + "grad_norm_var": 0.015986887613932292, + "learning_rate": 0.0001, + "loss": 4.0342, + "loss/crossentropy": 2.4566088914871216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2312512993812561, + "step": 21718 + }, + { + "epoch": 0.4344, + "grad_norm": 1.859375, + "grad_norm_var": 0.015998331705729167, + "learning_rate": 0.0001, + "loss": 3.5272, + "loss/crossentropy": 1.8777849674224854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18316396325826645, + "step": 21720 + }, + { + "epoch": 0.43444, + "grad_norm": 1.9375, + "grad_norm_var": 0.011744944254557292, + "learning_rate": 0.0001, + "loss": 4.0373, + "loss/crossentropy": 2.131688177585602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945742592215538, + "step": 21722 + }, + { + "epoch": 0.43448, + "grad_norm": 2.0, + "grad_norm_var": 0.0122222900390625, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 1.9808812141418457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051435261964798, + "step": 21724 + }, + { + "epoch": 0.43452, + "grad_norm": 2.171875, + "grad_norm_var": 0.009037017822265625, + "learning_rate": 0.0001, + "loss": 3.9247, + "loss/crossentropy": 2.1240362524986267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22029808908700943, + "step": 21726 + }, + { + "epoch": 0.43456, + "grad_norm": 1.9375, + "grad_norm_var": 0.0087554931640625, + "learning_rate": 0.0001, + "loss": 4.1359, + "loss/crossentropy": 2.2637228965759277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24003028869628906, + "step": 21728 + }, + { + "epoch": 0.4346, + "grad_norm": 1.84375, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 3.7561, + "loss/crossentropy": 2.0770394802093506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.185364231467247, + "step": 21730 + }, + { + "epoch": 0.43464, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0071197509765625, + "learning_rate": 0.0001, + "loss": 4.1621, + "loss/crossentropy": 2.1420366764068604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21707475185394287, + "step": 21732 + }, + { + "epoch": 0.43468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006940714518229167, + "learning_rate": 0.0001, + "loss": 4.0836, + "loss/crossentropy": 1.8017431497573853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1667446494102478, + "step": 21734 + }, + { + "epoch": 0.43472, + "grad_norm": 1.84375, + "grad_norm_var": 0.007490793863932292, + "learning_rate": 0.0001, + "loss": 4.0098, + "loss/crossentropy": 2.215123176574707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24778465926647186, + "step": 21736 + }, + { + "epoch": 0.43476, + "grad_norm": 1.8046875, + "grad_norm_var": 0.009708658854166666, + "learning_rate": 0.0001, + "loss": 3.7596, + "loss/crossentropy": 1.7275863289833069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18431701511144638, + "step": 21738 + }, + { + "epoch": 0.4348, + "grad_norm": 2.0, + "grad_norm_var": 0.009643300374348959, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.0645371675491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19622839987277985, + "step": 21740 + }, + { + "epoch": 0.43484, + "grad_norm": 1.84375, + "grad_norm_var": 0.006563059488932292, + "learning_rate": 0.0001, + "loss": 3.9028, + "loss/crossentropy": 1.8695591688156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18317092955112457, + "step": 21742 + }, + { + "epoch": 0.43488, + "grad_norm": 2.046875, + "grad_norm_var": 0.010239410400390624, + "learning_rate": 0.0001, + "loss": 4.1963, + "loss/crossentropy": 2.090248942375183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20329977571964264, + "step": 21744 + }, + { + "epoch": 0.43492, + "grad_norm": 1.875, + "grad_norm_var": 0.011307779947916667, + "learning_rate": 0.0001, + "loss": 3.6347, + "loss/crossentropy": 1.943733274936676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20422999560832977, + "step": 21746 + }, + { + "epoch": 0.43496, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012483723958333333, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.2485480308532715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20576569437980652, + "step": 21748 + }, + { + "epoch": 0.435, + "grad_norm": 1.953125, + "grad_norm_var": 0.012569173177083334, + "learning_rate": 0.0001, + "loss": 4.1053, + "loss/crossentropy": 2.184234142303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19477711617946625, + "step": 21750 + }, + { + "epoch": 0.43504, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011311848958333334, + "learning_rate": 0.0001, + "loss": 4.1796, + "loss/crossentropy": 2.1742812991142273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19085688889026642, + "step": 21752 + }, + { + "epoch": 0.43508, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010985310872395833, + "learning_rate": 0.0001, + "loss": 4.1924, + "loss/crossentropy": 2.44822895526886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128421813249588, + "step": 21754 + }, + { + "epoch": 0.43512, + "grad_norm": 1.96875, + "grad_norm_var": 0.01138916015625, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 1.8242409229278564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19045840948820114, + "step": 21756 + }, + { + "epoch": 0.43516, + "grad_norm": 2.0, + "grad_norm_var": 0.012474568684895833, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 2.1707061529159546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2152707874774933, + "step": 21758 + }, + { + "epoch": 0.4352, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010672760009765626, + "learning_rate": 0.0001, + "loss": 4.045, + "loss/crossentropy": 2.0790776014328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19175638258457184, + "step": 21760 + }, + { + "epoch": 0.43524, + "grad_norm": 1.828125, + "grad_norm_var": 0.0095947265625, + "learning_rate": 0.0001, + "loss": 3.8814, + "loss/crossentropy": 1.9847996830940247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18776872754096985, + "step": 21762 + }, + { + "epoch": 0.43528, + "grad_norm": 2.125, + "grad_norm_var": 0.0096435546875, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.122888207435608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20312964171171188, + "step": 21764 + }, + { + "epoch": 0.43532, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0095855712890625, + "learning_rate": 0.0001, + "loss": 4.035, + "loss/crossentropy": 2.1922764778137207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057497799396515, + "step": 21766 + }, + { + "epoch": 0.43536, + "grad_norm": 1.7421875, + "grad_norm_var": 0.013321685791015624, + "learning_rate": 0.0001, + "loss": 3.8096, + "loss/crossentropy": 1.392991840839386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1425289735198021, + "step": 21768 + }, + { + "epoch": 0.4354, + "grad_norm": 2.03125, + "grad_norm_var": 0.012115224202473959, + "learning_rate": 0.0001, + "loss": 4.1758, + "loss/crossentropy": 1.9627132415771484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007637917995453, + "step": 21770 + }, + { + "epoch": 0.43544, + "grad_norm": 1.96875, + "grad_norm_var": 0.011374664306640626, + "learning_rate": 0.0001, + "loss": 4.0094, + "loss/crossentropy": 2.04905503988266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18171894550323486, + "step": 21772 + }, + { + "epoch": 0.43548, + "grad_norm": 1.828125, + "grad_norm_var": 0.0086090087890625, + "learning_rate": 0.0001, + "loss": 3.7465, + "loss/crossentropy": 2.09994637966156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924375221133232, + "step": 21774 + }, + { + "epoch": 0.43552, + "grad_norm": 1.875, + "grad_norm_var": 0.008565012613932292, + "learning_rate": 0.0001, + "loss": 4.2903, + "loss/crossentropy": 2.381268620491028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22655388712882996, + "step": 21776 + }, + { + "epoch": 0.43556, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008161417643229167, + "learning_rate": 0.0001, + "loss": 4.151, + "loss/crossentropy": 2.189077138900757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19720469415187836, + "step": 21778 + }, + { + "epoch": 0.4356, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005381011962890625, + "learning_rate": 0.0001, + "loss": 3.7407, + "loss/crossentropy": 1.691848337650299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1633242815732956, + "step": 21780 + }, + { + "epoch": 0.43564, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007054646809895833, + "learning_rate": 0.0001, + "loss": 4.0392, + "loss/crossentropy": 2.170955538749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20698139071464539, + "step": 21782 + }, + { + "epoch": 0.43568, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00499267578125, + "learning_rate": 0.0001, + "loss": 3.9417, + "loss/crossentropy": 2.0300655364990234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1849035769701004, + "step": 21784 + }, + { + "epoch": 0.43572, + "grad_norm": 1.984375, + "grad_norm_var": 0.004443359375, + "learning_rate": 0.0001, + "loss": 3.9352, + "loss/crossentropy": 1.998863697052002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871778443455696, + "step": 21786 + }, + { + "epoch": 0.43576, + "grad_norm": 1.8984375, + "grad_norm_var": 0.004257965087890625, + "learning_rate": 0.0001, + "loss": 3.9628, + "loss/crossentropy": 1.7537881731987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.178309828042984, + "step": 21788 + }, + { + "epoch": 0.4358, + "grad_norm": 1.8828125, + "grad_norm_var": 0.003916168212890625, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 1.9158823490142822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18895266950130463, + "step": 21790 + }, + { + "epoch": 0.43584, + "grad_norm": 2.0, + "grad_norm_var": 0.0035845438639322915, + "learning_rate": 0.0001, + "loss": 4.0875, + "loss/crossentropy": 2.177749514579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20137280225753784, + "step": 21792 + }, + { + "epoch": 0.43588, + "grad_norm": 1.953125, + "grad_norm_var": 0.004788970947265625, + "learning_rate": 0.0001, + "loss": 4.1789, + "loss/crossentropy": 2.132981538772583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20100867003202438, + "step": 21794 + }, + { + "epoch": 0.43592, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004564412434895833, + "learning_rate": 0.0001, + "loss": 3.9301, + "loss/crossentropy": 1.926209807395935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19080301374197006, + "step": 21796 + }, + { + "epoch": 0.43596, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0040891011555989586, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.1029208302497864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002335861325264, + "step": 21798 + }, + { + "epoch": 0.436, + "grad_norm": 1.9375, + "grad_norm_var": 0.0036333719889322918, + "learning_rate": 0.0001, + "loss": 4.1378, + "loss/crossentropy": 2.0101813673973083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301408112049103, + "step": 21800 + }, + { + "epoch": 0.43604, + "grad_norm": 1.796875, + "grad_norm_var": 0.005250803629557292, + "learning_rate": 0.0001, + "loss": 3.8085, + "loss/crossentropy": 2.2096749544143677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20425067842006683, + "step": 21802 + }, + { + "epoch": 0.43608, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005236562093098958, + "learning_rate": 0.0001, + "loss": 3.9631, + "loss/crossentropy": 1.7085874676704407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989665269851685, + "step": 21804 + }, + { + "epoch": 0.43612, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006086222330729167, + "learning_rate": 0.0001, + "loss": 4.2035, + "loss/crossentropy": 2.082730233669281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19857076555490494, + "step": 21806 + }, + { + "epoch": 0.43616, + "grad_norm": 1.96875, + "grad_norm_var": 0.0062164306640625, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 2.0145097970962524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956789717078209, + "step": 21808 + }, + { + "epoch": 0.4362, + "grad_norm": 1.953125, + "grad_norm_var": 0.0051666259765625, + "learning_rate": 0.0001, + "loss": 4.31, + "loss/crossentropy": 2.0774065256118774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20803070068359375, + "step": 21810 + }, + { + "epoch": 0.43624, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004839833577473958, + "learning_rate": 0.0001, + "loss": 3.9442, + "loss/crossentropy": 2.045994222164154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481085240840912, + "step": 21812 + }, + { + "epoch": 0.43628, + "grad_norm": 2.0, + "grad_norm_var": 0.005570220947265625, + "learning_rate": 0.0001, + "loss": 4.119, + "loss/crossentropy": 1.9371901154518127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20281973481178284, + "step": 21814 + }, + { + "epoch": 0.43632, + "grad_norm": 2.046875, + "grad_norm_var": 0.00648193359375, + "learning_rate": 0.0001, + "loss": 3.8379, + "loss/crossentropy": 2.1048339009284973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919357031583786, + "step": 21816 + }, + { + "epoch": 0.43636, + "grad_norm": 1.921875, + "grad_norm_var": 0.005116526285807292, + "learning_rate": 0.0001, + "loss": 4.0696, + "loss/crossentropy": 2.162278175354004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19293204694986343, + "step": 21818 + }, + { + "epoch": 0.4364, + "grad_norm": 1.859375, + "grad_norm_var": 0.006034342447916666, + "learning_rate": 0.0001, + "loss": 4.0294, + "loss/crossentropy": 1.9012067317962646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19044938683509827, + "step": 21820 + }, + { + "epoch": 0.43644, + "grad_norm": 1.8671875, + "grad_norm_var": 0.004583485921223958, + "learning_rate": 0.0001, + "loss": 3.8538, + "loss/crossentropy": 1.617782175540924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1641436219215393, + "step": 21822 + }, + { + "epoch": 0.43648, + "grad_norm": 2.109375, + "grad_norm_var": 0.006048329671223958, + "learning_rate": 0.0001, + "loss": 4.2229, + "loss/crossentropy": 2.136145055294037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20208875089883804, + "step": 21824 + }, + { + "epoch": 0.43652, + "grad_norm": 2.0625, + "grad_norm_var": 0.006475575764973958, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 2.1240354776382446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052341103553772, + "step": 21826 + }, + { + "epoch": 0.43656, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007020823160807292, + "learning_rate": 0.0001, + "loss": 4.0295, + "loss/crossentropy": 1.8771272897720337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17846016585826874, + "step": 21828 + }, + { + "epoch": 0.4366, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005985260009765625, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 1.8061035871505737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1792590245604515, + "step": 21830 + }, + { + "epoch": 0.43664, + "grad_norm": 1.953125, + "grad_norm_var": 0.0055084228515625, + "learning_rate": 0.0001, + "loss": 4.0204, + "loss/crossentropy": 2.0216365456581116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18977319449186325, + "step": 21832 + }, + { + "epoch": 0.43668, + "grad_norm": 1.75, + "grad_norm_var": 0.008056386311848959, + "learning_rate": 0.0001, + "loss": 3.7667, + "loss/crossentropy": 1.81255304813385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740209013223648, + "step": 21834 + }, + { + "epoch": 0.43672, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0087310791015625, + "learning_rate": 0.0001, + "loss": 4.0626, + "loss/crossentropy": 2.2982383966445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20780136436223984, + "step": 21836 + }, + { + "epoch": 0.43676, + "grad_norm": 2.078125, + "grad_norm_var": 0.010037994384765625, + "learning_rate": 0.0001, + "loss": 3.9891, + "loss/crossentropy": 2.0461641550064087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21413680911064148, + "step": 21838 + }, + { + "epoch": 0.4368, + "grad_norm": 1.921875, + "grad_norm_var": 0.008341217041015625, + "learning_rate": 0.0001, + "loss": 4.1413, + "loss/crossentropy": 2.1272863149642944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22310787439346313, + "step": 21840 + }, + { + "epoch": 0.43684, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008133951822916667, + "learning_rate": 0.0001, + "loss": 3.9467, + "loss/crossentropy": 1.9026488661766052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18013561517000198, + "step": 21842 + }, + { + "epoch": 0.43688, + "grad_norm": 1.921875, + "grad_norm_var": 0.007635243733723958, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 2.1403396129608154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20329253375530243, + "step": 21844 + }, + { + "epoch": 0.43692, + "grad_norm": 1.9375, + "grad_norm_var": 0.0075103759765625, + "learning_rate": 0.0001, + "loss": 4.1922, + "loss/crossentropy": 2.041237533092499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238220691680908, + "step": 21846 + }, + { + "epoch": 0.43696, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008011881510416667, + "learning_rate": 0.0001, + "loss": 4.2067, + "loss/crossentropy": 1.860413134098053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19062548130750656, + "step": 21848 + }, + { + "epoch": 0.437, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007721964518229167, + "learning_rate": 0.0001, + "loss": 3.962, + "loss/crossentropy": 2.1432868242263794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947617009282112, + "step": 21850 + }, + { + "epoch": 0.43704, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007834625244140626, + "learning_rate": 0.0001, + "loss": 3.667, + "loss/crossentropy": 1.5292840003967285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1465652659535408, + "step": 21852 + }, + { + "epoch": 0.43708, + "grad_norm": 1.90625, + "grad_norm_var": 0.00523681640625, + "learning_rate": 0.0001, + "loss": 3.7604, + "loss/crossentropy": 1.8826366066932678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19239384680986404, + "step": 21854 + }, + { + "epoch": 0.43712, + "grad_norm": 1.890625, + "grad_norm_var": 0.005086008707682292, + "learning_rate": 0.0001, + "loss": 4.0817, + "loss/crossentropy": 1.7915772199630737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18841900676488876, + "step": 21856 + }, + { + "epoch": 0.43716, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0048868815104166664, + "learning_rate": 0.0001, + "loss": 3.7986, + "loss/crossentropy": 1.9425716400146484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191939383745193, + "step": 21858 + }, + { + "epoch": 0.4372, + "grad_norm": 1.84375, + "grad_norm_var": 0.006131744384765625, + "learning_rate": 0.0001, + "loss": 3.8735, + "loss/crossentropy": 2.073697865009308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19094493240118027, + "step": 21860 + }, + { + "epoch": 0.43724, + "grad_norm": 1.9453125, + "grad_norm_var": 0.046575673421223956, + "learning_rate": 0.0001, + "loss": 4.0953, + "loss/crossentropy": 2.023917257785797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2218250036239624, + "step": 21862 + }, + { + "epoch": 0.43728, + "grad_norm": 1.96875, + "grad_norm_var": 0.04965413411458333, + "learning_rate": 0.0001, + "loss": 4.1522, + "loss/crossentropy": 1.9710112810134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21274644136428833, + "step": 21864 + }, + { + "epoch": 0.43732, + "grad_norm": 1.953125, + "grad_norm_var": 0.04736328125, + "learning_rate": 0.0001, + "loss": 4.0579, + "loss/crossentropy": 1.9758725762367249, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855839416384697, + "step": 21866 + }, + { + "epoch": 0.43736, + "grad_norm": 2.15625, + "grad_norm_var": 0.04616597493489583, + "learning_rate": 0.0001, + "loss": 4.0144, + "loss/crossentropy": 2.0672810077667236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20418523997068405, + "step": 21868 + }, + { + "epoch": 0.4374, + "grad_norm": 1.921875, + "grad_norm_var": 0.046000162760416664, + "learning_rate": 0.0001, + "loss": 3.6892, + "loss/crossentropy": 1.8937869668006897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18653447926044464, + "step": 21870 + }, + { + "epoch": 0.43744, + "grad_norm": 1.8515625, + "grad_norm_var": 0.049006144205729164, + "learning_rate": 0.0001, + "loss": 3.8814, + "loss/crossentropy": 1.6613503694534302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16891123354434967, + "step": 21872 + }, + { + "epoch": 0.43748, + "grad_norm": 2.0, + "grad_norm_var": 0.04822489420572917, + "learning_rate": 0.0001, + "loss": 4.2159, + "loss/crossentropy": 2.007514178752899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2078787311911583, + "step": 21874 + }, + { + "epoch": 0.43752, + "grad_norm": 1.953125, + "grad_norm_var": 0.043603515625, + "learning_rate": 0.0001, + "loss": 3.831, + "loss/crossentropy": 1.913162350654602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20729011297225952, + "step": 21876 + }, + { + "epoch": 0.43756, + "grad_norm": 2.03125, + "grad_norm_var": 0.012849680582682292, + "learning_rate": 0.0001, + "loss": 3.9612, + "loss/crossentropy": 2.1534116864204407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183125138282776, + "step": 21878 + }, + { + "epoch": 0.4376, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012841542561848959, + "learning_rate": 0.0001, + "loss": 3.8101, + "loss/crossentropy": 1.9248425960540771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19415219128131866, + "step": 21880 + }, + { + "epoch": 0.43764, + "grad_norm": 1.921875, + "grad_norm_var": 0.012790679931640625, + "learning_rate": 0.0001, + "loss": 3.7379, + "loss/crossentropy": 2.095816135406494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181143820285797, + "step": 21882 + }, + { + "epoch": 0.43768, + "grad_norm": 2.25, + "grad_norm_var": 0.015405019124348959, + "learning_rate": 0.0001, + "loss": 4.6839, + "loss/crossentropy": 2.0167580246925354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21327386051416397, + "step": 21884 + }, + { + "epoch": 0.43772, + "grad_norm": 2.15625, + "grad_norm_var": 0.019038899739583334, + "learning_rate": 0.0001, + "loss": 4.3355, + "loss/crossentropy": 2.2425005435943604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20246200263500214, + "step": 21886 + }, + { + "epoch": 0.43776, + "grad_norm": 2.0625, + "grad_norm_var": 0.017232259114583332, + "learning_rate": 0.0001, + "loss": 4.3023, + "loss/crossentropy": 1.9543325901031494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18824229389429092, + "step": 21888 + }, + { + "epoch": 0.4378, + "grad_norm": 1.890625, + "grad_norm_var": 0.017365519205729166, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 2.2031290531158447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20949723571538925, + "step": 21890 + }, + { + "epoch": 0.43784, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01901219685872396, + "learning_rate": 0.0001, + "loss": 3.728, + "loss/crossentropy": 1.97504061460495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19441132247447968, + "step": 21892 + }, + { + "epoch": 0.43788, + "grad_norm": 1.78125, + "grad_norm_var": 0.018888346354166665, + "learning_rate": 0.0001, + "loss": 3.8922, + "loss/crossentropy": 1.8496057391166687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18915118277072906, + "step": 21894 + }, + { + "epoch": 0.43792, + "grad_norm": 2.0, + "grad_norm_var": 0.015116373697916666, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 1.9244802594184875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20679102092981339, + "step": 21896 + }, + { + "epoch": 0.43796, + "grad_norm": 1.7890625, + "grad_norm_var": 0.016902414957682292, + "learning_rate": 0.0001, + "loss": 3.9855, + "loss/crossentropy": 2.1038661003112793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014932855963707, + "step": 21898 + }, + { + "epoch": 0.438, + "grad_norm": 1.84375, + "grad_norm_var": 0.011286417643229166, + "learning_rate": 0.0001, + "loss": 4.0416, + "loss/crossentropy": 2.202482581138611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20515280216932297, + "step": 21900 + }, + { + "epoch": 0.43804, + "grad_norm": 1.84375, + "grad_norm_var": 0.008353678385416667, + "learning_rate": 0.0001, + "loss": 4.0346, + "loss/crossentropy": 2.1215643286705017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18430311977863312, + "step": 21902 + }, + { + "epoch": 0.43808, + "grad_norm": 2.078125, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 1.585806667804718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1615592986345291, + "step": 21904 + }, + { + "epoch": 0.43812, + "grad_norm": 1.96875, + "grad_norm_var": 0.008831532796223958, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 2.0006843209266663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003154665231705, + "step": 21906 + }, + { + "epoch": 0.43816, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008571116129557292, + "learning_rate": 0.0001, + "loss": 3.8523, + "loss/crossentropy": 1.9077118635177612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881355568766594, + "step": 21908 + }, + { + "epoch": 0.4382, + "grad_norm": 1.890625, + "grad_norm_var": 0.0080810546875, + "learning_rate": 0.0001, + "loss": 3.7186, + "loss/crossentropy": 1.752367615699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17893216013908386, + "step": 21910 + }, + { + "epoch": 0.43824, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0078765869140625, + "learning_rate": 0.0001, + "loss": 4.171, + "loss/crossentropy": 2.1325889825820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21129589527845383, + "step": 21912 + }, + { + "epoch": 0.43828, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006949615478515625, + "learning_rate": 0.0001, + "loss": 4.3083, + "loss/crossentropy": 2.630311131477356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.224892295897007, + "step": 21914 + }, + { + "epoch": 0.43832, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007549794514973959, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.002493679523468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19148597121238708, + "step": 21916 + }, + { + "epoch": 0.43836, + "grad_norm": 2.0625, + "grad_norm_var": 0.006571451822916667, + "learning_rate": 0.0001, + "loss": 4.1209, + "loss/crossentropy": 1.968815267086029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172769993543625, + "step": 21918 + }, + { + "epoch": 0.4384, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005402628580729167, + "learning_rate": 0.0001, + "loss": 4.1595, + "loss/crossentropy": 2.2519482374191284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22669588029384613, + "step": 21920 + }, + { + "epoch": 0.43844, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0059315999348958336, + "learning_rate": 0.0001, + "loss": 3.9382, + "loss/crossentropy": 1.9276898503303528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18415841460227966, + "step": 21922 + }, + { + "epoch": 0.43848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004941558837890625, + "learning_rate": 0.0001, + "loss": 4.1358, + "loss/crossentropy": 2.1866860389709473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2306595891714096, + "step": 21924 + }, + { + "epoch": 0.43852, + "grad_norm": 1.875, + "grad_norm_var": 0.004378000895182292, + "learning_rate": 0.0001, + "loss": 3.9259, + "loss/crossentropy": 1.8013821840286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17407341301441193, + "step": 21926 + }, + { + "epoch": 0.43856, + "grad_norm": 1.8359375, + "grad_norm_var": 0.005110422770182292, + "learning_rate": 0.0001, + "loss": 3.9644, + "loss/crossentropy": 2.116545557975769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19539258629083633, + "step": 21928 + }, + { + "epoch": 0.4386, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006009928385416667, + "learning_rate": 0.0001, + "loss": 3.8005, + "loss/crossentropy": 2.039593815803528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793600618839264, + "step": 21930 + }, + { + "epoch": 0.43864, + "grad_norm": 1.796875, + "grad_norm_var": 0.008385976155598959, + "learning_rate": 0.0001, + "loss": 3.9727, + "loss/crossentropy": 2.0602275133132935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17620772868394852, + "step": 21932 + }, + { + "epoch": 0.43868, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007024892171223958, + "learning_rate": 0.0001, + "loss": 3.7552, + "loss/crossentropy": 1.9316160082817078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026260495185852, + "step": 21934 + }, + { + "epoch": 0.43872, + "grad_norm": 1.984375, + "grad_norm_var": 0.007004547119140625, + "learning_rate": 0.0001, + "loss": 4.0254, + "loss/crossentropy": 2.152816414833069, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18020687997341156, + "step": 21936 + }, + { + "epoch": 0.43876, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0067626953125, + "learning_rate": 0.0001, + "loss": 3.8211, + "loss/crossentropy": 1.8688368201255798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17652328312397003, + "step": 21938 + }, + { + "epoch": 0.4388, + "grad_norm": 1.96875, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 4.162, + "loss/crossentropy": 1.8889214992523193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18263526260852814, + "step": 21940 + }, + { + "epoch": 0.43884, + "grad_norm": 1.7578125, + "grad_norm_var": 0.007869466145833334, + "learning_rate": 0.0001, + "loss": 3.8644, + "loss/crossentropy": 1.9699294567108154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170155718922615, + "step": 21942 + }, + { + "epoch": 0.43888, + "grad_norm": 1.9375, + "grad_norm_var": 0.009642537434895833, + "learning_rate": 0.0001, + "loss": 4.1007, + "loss/crossentropy": 1.7749728560447693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18438361585140228, + "step": 21944 + }, + { + "epoch": 0.43892, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008910115559895833, + "learning_rate": 0.0001, + "loss": 4.0266, + "loss/crossentropy": 2.1438393592834473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22740208357572556, + "step": 21946 + }, + { + "epoch": 0.43896, + "grad_norm": 1.9375, + "grad_norm_var": 0.005663045247395833, + "learning_rate": 0.0001, + "loss": 4.0983, + "loss/crossentropy": 2.1517677307128906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089010894298553, + "step": 21948 + }, + { + "epoch": 0.439, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006931304931640625, + "learning_rate": 0.0001, + "loss": 4.0286, + "loss/crossentropy": 2.283053159713745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19754114747047424, + "step": 21950 + }, + { + "epoch": 0.43904, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008438873291015624, + "learning_rate": 0.0001, + "loss": 4.3633, + "loss/crossentropy": 2.121833860874176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20339152216911316, + "step": 21952 + }, + { + "epoch": 0.43908, + "grad_norm": 2.0, + "grad_norm_var": 0.007926177978515626, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 2.206274390220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19716881215572357, + "step": 21954 + }, + { + "epoch": 0.43912, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012749989827473959, + "learning_rate": 0.0001, + "loss": 4.185, + "loss/crossentropy": 2.1501349210739136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20559798926115036, + "step": 21956 + }, + { + "epoch": 0.43916, + "grad_norm": 2.0625, + "grad_norm_var": 0.012088775634765625, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 2.0854570269584656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22488616406917572, + "step": 21958 + }, + { + "epoch": 0.4392, + "grad_norm": 3.265625, + "grad_norm_var": 0.11881103515625, + "learning_rate": 0.0001, + "loss": 3.651, + "loss/crossentropy": 1.840592384338379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934354454278946, + "step": 21960 + }, + { + "epoch": 0.43924, + "grad_norm": 1.8203125, + "grad_norm_var": 0.12266006469726562, + "learning_rate": 0.0001, + "loss": 4.013, + "loss/crossentropy": 2.062163829803467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19524814933538437, + "step": 21962 + }, + { + "epoch": 0.43928, + "grad_norm": 1.765625, + "grad_norm_var": 0.1280413309733073, + "learning_rate": 0.0001, + "loss": 3.8329, + "loss/crossentropy": 1.7060028910636902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17902052402496338, + "step": 21964 + }, + { + "epoch": 0.43932, + "grad_norm": 1.8046875, + "grad_norm_var": 0.12742919921875, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 2.246550440788269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088882103562355, + "step": 21966 + }, + { + "epoch": 0.43936, + "grad_norm": 1.9375, + "grad_norm_var": 0.12764053344726561, + "learning_rate": 0.0001, + "loss": 4.2095, + "loss/crossentropy": 2.249360680580139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600482821464539, + "step": 21968 + }, + { + "epoch": 0.4394, + "grad_norm": 1.8125, + "grad_norm_var": 0.13187840779622395, + "learning_rate": 0.0001, + "loss": 4.2291, + "loss/crossentropy": 2.029528558254242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18438928574323654, + "step": 21970 + }, + { + "epoch": 0.43944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.1287249247233073, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.0363243222236633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402872025966644, + "step": 21972 + }, + { + "epoch": 0.43948, + "grad_norm": 2.015625, + "grad_norm_var": 0.12544657389322916, + "learning_rate": 0.0001, + "loss": 4.2188, + "loss/crossentropy": 2.1816266775131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20053647458553314, + "step": 21974 + }, + { + "epoch": 0.43952, + "grad_norm": 2.015625, + "grad_norm_var": 0.01646703084309896, + "learning_rate": 0.0001, + "loss": 4.2621, + "loss/crossentropy": 2.452013850212097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2117229700088501, + "step": 21976 + }, + { + "epoch": 0.43956, + "grad_norm": 1.96875, + "grad_norm_var": 0.0123779296875, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 2.1460453271865845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046569585800171, + "step": 21978 + }, + { + "epoch": 0.4396, + "grad_norm": 1.953125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 4.1805, + "loss/crossentropy": 2.152291774749756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19845052808523178, + "step": 21980 + }, + { + "epoch": 0.43964, + "grad_norm": 2.03125, + "grad_norm_var": 0.007950592041015624, + "learning_rate": 0.0001, + "loss": 4.3282, + "loss/crossentropy": 2.280704140663147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20648686587810516, + "step": 21982 + }, + { + "epoch": 0.43968, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009004465738932292, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.2323286533355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1949191614985466, + "step": 21984 + }, + { + "epoch": 0.43972, + "grad_norm": 2.046875, + "grad_norm_var": 0.006135813395182292, + "learning_rate": 0.0001, + "loss": 3.8289, + "loss/crossentropy": 2.105699062347412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19488361477851868, + "step": 21986 + }, + { + "epoch": 0.43976, + "grad_norm": 1.6875, + "grad_norm_var": 0.009834543863932291, + "learning_rate": 0.0001, + "loss": 3.731, + "loss/crossentropy": 1.7268863916397095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16756988316774368, + "step": 21988 + }, + { + "epoch": 0.4398, + "grad_norm": 1.953125, + "grad_norm_var": 0.009458160400390625, + "learning_rate": 0.0001, + "loss": 4.0178, + "loss/crossentropy": 2.1496203541755676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20237596333026886, + "step": 21990 + }, + { + "epoch": 0.43984, + "grad_norm": 1.984375, + "grad_norm_var": 0.009350331624348958, + "learning_rate": 0.0001, + "loss": 4.1835, + "loss/crossentropy": 2.196579158306122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20089206099510193, + "step": 21992 + }, + { + "epoch": 0.43988, + "grad_norm": 2.140625, + "grad_norm_var": 0.011958821614583334, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 2.1145724654197693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21221736818552017, + "step": 21994 + }, + { + "epoch": 0.43992, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012230428059895833, + "learning_rate": 0.0001, + "loss": 4.1096, + "loss/crossentropy": 1.8034029603004456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18355921655893326, + "step": 21996 + }, + { + "epoch": 0.43996, + "grad_norm": 2.015625, + "grad_norm_var": 0.012262980143229166, + "learning_rate": 0.0001, + "loss": 4.1805, + "loss/crossentropy": 1.8520742654800415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18945766985416412, + "step": 21998 + }, + { + "epoch": 0.44, + "grad_norm": 2.0625, + "grad_norm_var": 0.012962849934895833, + "learning_rate": 0.0001, + "loss": 4.4938, + "loss/crossentropy": 2.0837312936782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22002845257520676, + "step": 22000 + }, + { + "epoch": 0.44004, + "grad_norm": 1.859375, + "grad_norm_var": 0.011470540364583334, + "learning_rate": 0.0001, + "loss": 4.0409, + "loss/crossentropy": 1.9202563166618347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19138525426387787, + "step": 22002 + }, + { + "epoch": 0.44008, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006506093343098958, + "learning_rate": 0.0001, + "loss": 3.9965, + "loss/crossentropy": 2.0986216068267822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18914636224508286, + "step": 22004 + }, + { + "epoch": 0.44012, + "grad_norm": 1.953125, + "grad_norm_var": 0.008955637613932291, + "learning_rate": 0.0001, + "loss": 3.5102, + "loss/crossentropy": 1.675586223602295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17793113738298416, + "step": 22006 + }, + { + "epoch": 0.44016, + "grad_norm": 2.0625, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 4.0252, + "loss/crossentropy": 1.7633379697799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17190124094486237, + "step": 22008 + }, + { + "epoch": 0.4402, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008796183268229167, + "learning_rate": 0.0001, + "loss": 3.925, + "loss/crossentropy": 2.0000420212745667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18909277766942978, + "step": 22010 + }, + { + "epoch": 0.44024, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009691365559895833, + "learning_rate": 0.0001, + "loss": 3.8831, + "loss/crossentropy": 2.2526057958602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19274260103702545, + "step": 22012 + }, + { + "epoch": 0.44028, + "grad_norm": 1.859375, + "grad_norm_var": 0.010261027018229167, + "learning_rate": 0.0001, + "loss": 3.9836, + "loss/crossentropy": 2.1151334643363953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992139369249344, + "step": 22014 + }, + { + "epoch": 0.44032, + "grad_norm": 1.84375, + "grad_norm_var": 0.008546702067057292, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 2.188230037689209, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205021433532238, + "step": 22016 + }, + { + "epoch": 0.44036, + "grad_norm": 1.984375, + "grad_norm_var": 0.008329264322916667, + "learning_rate": 0.0001, + "loss": 3.9484, + "loss/crossentropy": 2.015773594379425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19697444140911102, + "step": 22018 + }, + { + "epoch": 0.4404, + "grad_norm": 1.7890625, + "grad_norm_var": 0.008882649739583333, + "learning_rate": 0.0001, + "loss": 3.7165, + "loss/crossentropy": 1.724815011024475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17810098081827164, + "step": 22020 + }, + { + "epoch": 0.44044, + "grad_norm": 2.09375, + "grad_norm_var": 0.010091145833333334, + "learning_rate": 0.0001, + "loss": 4.3719, + "loss/crossentropy": 2.471455454826355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20882685482501984, + "step": 22022 + }, + { + "epoch": 0.44048, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009212239583333334, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 2.2879083156585693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20954442769289017, + "step": 22024 + }, + { + "epoch": 0.44052, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008440907796223958, + "learning_rate": 0.0001, + "loss": 4.1305, + "loss/crossentropy": 2.352377772331238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21374861150979996, + "step": 22026 + }, + { + "epoch": 0.44056, + "grad_norm": 1.875, + "grad_norm_var": 0.010823313395182292, + "learning_rate": 0.0001, + "loss": 3.889, + "loss/crossentropy": 1.5395016074180603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17564409226179123, + "step": 22028 + }, + { + "epoch": 0.4406, + "grad_norm": 1.890625, + "grad_norm_var": 0.009440104166666666, + "learning_rate": 0.0001, + "loss": 3.997, + "loss/crossentropy": 2.079974055290222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886643022298813, + "step": 22030 + }, + { + "epoch": 0.44064, + "grad_norm": 2.21875, + "grad_norm_var": 0.013110097249348958, + "learning_rate": 0.0001, + "loss": 4.3671, + "loss/crossentropy": 2.0840702056884766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21841587871313095, + "step": 22032 + }, + { + "epoch": 0.44068, + "grad_norm": 2.125, + "grad_norm_var": 0.014168294270833333, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.574951171875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21359464526176453, + "step": 22034 + }, + { + "epoch": 0.44072, + "grad_norm": 2.234375, + "grad_norm_var": 0.011519368489583333, + "learning_rate": 0.0001, + "loss": 4.0261, + "loss/crossentropy": 1.8530864715576172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19658754020929337, + "step": 22036 + }, + { + "epoch": 0.44076, + "grad_norm": 1.7109375, + "grad_norm_var": 0.018289947509765626, + "learning_rate": 0.0001, + "loss": 3.7077, + "loss/crossentropy": 1.9564177989959717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19047683477401733, + "step": 22038 + }, + { + "epoch": 0.4408, + "grad_norm": 2.078125, + "grad_norm_var": 0.02148615519205729, + "learning_rate": 0.0001, + "loss": 4.2303, + "loss/crossentropy": 2.2482646703720093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21484995633363724, + "step": 22040 + }, + { + "epoch": 0.44084, + "grad_norm": 2.640625, + "grad_norm_var": 0.04890925089518229, + "learning_rate": 0.0001, + "loss": 3.9753, + "loss/crossentropy": 1.9064926505088806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042565643787384, + "step": 22042 + }, + { + "epoch": 0.44088, + "grad_norm": 1.921875, + "grad_norm_var": 0.048164621988932295, + "learning_rate": 0.0001, + "loss": 3.8492, + "loss/crossentropy": 1.9772136211395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18002107739448547, + "step": 22044 + }, + { + "epoch": 0.44092, + "grad_norm": 1.9609375, + "grad_norm_var": 0.047240193684895834, + "learning_rate": 0.0001, + "loss": 4.0417, + "loss/crossentropy": 1.8454242944717407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727011024951935, + "step": 22046 + }, + { + "epoch": 0.44096, + "grad_norm": 1.8984375, + "grad_norm_var": 0.047584788004557295, + "learning_rate": 0.0001, + "loss": 3.8247, + "loss/crossentropy": 1.815159022808075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17201630771160126, + "step": 22048 + }, + { + "epoch": 0.441, + "grad_norm": 2.1875, + "grad_norm_var": 0.05176976521809896, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.298068404197693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187184751033783, + "step": 22050 + }, + { + "epoch": 0.44104, + "grad_norm": 2.046875, + "grad_norm_var": 0.04794108072916667, + "learning_rate": 0.0001, + "loss": 3.9931, + "loss/crossentropy": 2.4030654430389404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233939692378044, + "step": 22052 + }, + { + "epoch": 0.44108, + "grad_norm": 1.859375, + "grad_norm_var": 0.043981679280598956, + "learning_rate": 0.0001, + "loss": 3.924, + "loss/crossentropy": 2.1413121223449707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19442930072546005, + "step": 22054 + }, + { + "epoch": 0.44112, + "grad_norm": 1.9375, + "grad_norm_var": 0.0421875, + "learning_rate": 0.0001, + "loss": 4.0048, + "loss/crossentropy": 2.029422342777252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19413675367832184, + "step": 22056 + }, + { + "epoch": 0.44116, + "grad_norm": 2.0625, + "grad_norm_var": 0.012010701497395833, + "learning_rate": 0.0001, + "loss": 4.3637, + "loss/crossentropy": 2.3010586500167847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198517844080925, + "step": 22058 + }, + { + "epoch": 0.4412, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012353261311848959, + "learning_rate": 0.0001, + "loss": 4.0769, + "loss/crossentropy": 2.0910425782203674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19917423278093338, + "step": 22060 + }, + { + "epoch": 0.44124, + "grad_norm": 2.4375, + "grad_norm_var": 0.027766672770182292, + "learning_rate": 0.0001, + "loss": 3.7656, + "loss/crossentropy": 1.948616862297058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20419400930404663, + "step": 22062 + }, + { + "epoch": 0.44128, + "grad_norm": 1.8515625, + "grad_norm_var": 0.02814509073893229, + "learning_rate": 0.0001, + "loss": 4.1349, + "loss/crossentropy": 2.0931553840637207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19765160977840424, + "step": 22064 + }, + { + "epoch": 0.44132, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0232421875, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.0905882120132446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20447614789009094, + "step": 22066 + }, + { + "epoch": 0.44136, + "grad_norm": 1.90625, + "grad_norm_var": 0.022861480712890625, + "learning_rate": 0.0001, + "loss": 4.1407, + "loss/crossentropy": 2.365593433380127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21229872107505798, + "step": 22068 + }, + { + "epoch": 0.4414, + "grad_norm": 1.9453125, + "grad_norm_var": 0.022093709309895834, + "learning_rate": 0.0001, + "loss": 4.2306, + "loss/crossentropy": 2.286802887916565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844397693872452, + "step": 22070 + }, + { + "epoch": 0.44144, + "grad_norm": 1.984375, + "grad_norm_var": 0.021507771809895833, + "learning_rate": 0.0001, + "loss": 4.2739, + "loss/crossentropy": 2.1805503368377686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20301498472690582, + "step": 22072 + }, + { + "epoch": 0.44148, + "grad_norm": 2.03125, + "grad_norm_var": 0.020702870686848958, + "learning_rate": 0.0001, + "loss": 4.0185, + "loss/crossentropy": 2.025053381919861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959461346268654, + "step": 22074 + }, + { + "epoch": 0.44152, + "grad_norm": 1.828125, + "grad_norm_var": 0.02149658203125, + "learning_rate": 0.0001, + "loss": 4.1134, + "loss/crossentropy": 2.1158708930015564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19305311143398285, + "step": 22076 + }, + { + "epoch": 0.44156, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0063059488932291664, + "learning_rate": 0.0001, + "loss": 3.8749, + "loss/crossentropy": 1.746639907360077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18400797247886658, + "step": 22078 + }, + { + "epoch": 0.4416, + "grad_norm": 1.9375, + "grad_norm_var": 0.0041168212890625, + "learning_rate": 0.0001, + "loss": 4.3071, + "loss/crossentropy": 2.3301517963409424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19662386924028397, + "step": 22080 + }, + { + "epoch": 0.44164, + "grad_norm": 1.90625, + "grad_norm_var": 0.0036272684733072917, + "learning_rate": 0.0001, + "loss": 4.1136, + "loss/crossentropy": 1.9772114753723145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18933425843715668, + "step": 22082 + }, + { + "epoch": 0.44168, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0036069234212239582, + "learning_rate": 0.0001, + "loss": 4.0125, + "loss/crossentropy": 1.9511193633079529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19440795481204987, + "step": 22084 + }, + { + "epoch": 0.44172, + "grad_norm": 1.7890625, + "grad_norm_var": 0.005785115559895833, + "learning_rate": 0.0001, + "loss": 3.6898, + "loss/crossentropy": 1.7232664823532104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16435310244560242, + "step": 22086 + }, + { + "epoch": 0.44176, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005549875895182291, + "learning_rate": 0.0001, + "loss": 4.0631, + "loss/crossentropy": 2.1858248710632324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20494847744703293, + "step": 22088 + }, + { + "epoch": 0.4418, + "grad_norm": 1.921875, + "grad_norm_var": 0.01435546875, + "learning_rate": 0.0001, + "loss": 3.9496, + "loss/crossentropy": 2.4218143224716187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21809446811676025, + "step": 22090 + }, + { + "epoch": 0.44184, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014168039957682291, + "learning_rate": 0.0001, + "loss": 4.0209, + "loss/crossentropy": 2.133104920387268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930398792028427, + "step": 22092 + }, + { + "epoch": 0.44188, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013348134358723958, + "learning_rate": 0.0001, + "loss": 4.0959, + "loss/crossentropy": 2.113961935043335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2095598503947258, + "step": 22094 + }, + { + "epoch": 0.44192, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0151763916015625, + "learning_rate": 0.0001, + "loss": 3.7976, + "loss/crossentropy": 1.984375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18821635842323303, + "step": 22096 + }, + { + "epoch": 0.44196, + "grad_norm": 1.953125, + "grad_norm_var": 0.0153717041015625, + "learning_rate": 0.0001, + "loss": 3.8215, + "loss/crossentropy": 1.8913013339042664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1868671551346779, + "step": 22098 + }, + { + "epoch": 0.442, + "grad_norm": 1.84375, + "grad_norm_var": 0.01610081990559896, + "learning_rate": 0.0001, + "loss": 3.8635, + "loss/crossentropy": 1.9196743369102478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18776098638772964, + "step": 22100 + }, + { + "epoch": 0.44204, + "grad_norm": 1.921875, + "grad_norm_var": 0.013134511311848958, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 1.929152011871338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17069555073976517, + "step": 22102 + }, + { + "epoch": 0.44208, + "grad_norm": 1.921875, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 4.0115, + "loss/crossentropy": 2.0993363857269287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19678544998168945, + "step": 22104 + }, + { + "epoch": 0.44212, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004559071858723959, + "learning_rate": 0.0001, + "loss": 3.9802, + "loss/crossentropy": 1.8447982668876648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18432488292455673, + "step": 22106 + }, + { + "epoch": 0.44216, + "grad_norm": 1.96875, + "grad_norm_var": 0.0041544596354166664, + "learning_rate": 0.0001, + "loss": 4.2359, + "loss/crossentropy": 2.04882550239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2129831686615944, + "step": 22108 + }, + { + "epoch": 0.4422, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0047686258951822914, + "learning_rate": 0.0001, + "loss": 3.8738, + "loss/crossentropy": 1.7422301769256592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266545236110687, + "step": 22110 + }, + { + "epoch": 0.44224, + "grad_norm": 1.96875, + "grad_norm_var": 0.006664784749348959, + "learning_rate": 0.0001, + "loss": 4.2107, + "loss/crossentropy": 2.1211363077163696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22843683511018753, + "step": 22112 + }, + { + "epoch": 0.44228, + "grad_norm": 2.09375, + "grad_norm_var": 0.008097076416015625, + "learning_rate": 0.0001, + "loss": 3.8527, + "loss/crossentropy": 2.2478936910629272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21724677085876465, + "step": 22114 + }, + { + "epoch": 0.44232, + "grad_norm": 1.84375, + "grad_norm_var": 0.0153228759765625, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.3345694541931152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20749730616807938, + "step": 22116 + }, + { + "epoch": 0.44236, + "grad_norm": 1.7265625, + "grad_norm_var": 0.020076243082682292, + "learning_rate": 0.0001, + "loss": 3.947, + "loss/crossentropy": 2.0583457946777344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18861139565706253, + "step": 22118 + }, + { + "epoch": 0.4424, + "grad_norm": 2.015625, + "grad_norm_var": 0.0246734619140625, + "learning_rate": 0.0001, + "loss": 3.7534, + "loss/crossentropy": 2.06750625371933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18891629576683044, + "step": 22120 + }, + { + "epoch": 0.44244, + "grad_norm": 2.109375, + "grad_norm_var": 0.025272623697916666, + "learning_rate": 0.0001, + "loss": 4.2192, + "loss/crossentropy": 1.8823494911193848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20006988942623138, + "step": 22122 + }, + { + "epoch": 0.44248, + "grad_norm": 2.15625, + "grad_norm_var": 0.026920572916666666, + "learning_rate": 0.0001, + "loss": 4.1285, + "loss/crossentropy": 2.0374066829681396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20347502827644348, + "step": 22124 + }, + { + "epoch": 0.44252, + "grad_norm": 1.9140625, + "grad_norm_var": 0.026195271809895834, + "learning_rate": 0.0001, + "loss": 3.8961, + "loss/crossentropy": 1.4977151155471802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15512564033269882, + "step": 22126 + }, + { + "epoch": 0.44256, + "grad_norm": 1.984375, + "grad_norm_var": 0.024346669514973957, + "learning_rate": 0.0001, + "loss": 3.9462, + "loss/crossentropy": 1.8102646470069885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884431689977646, + "step": 22128 + }, + { + "epoch": 0.4426, + "grad_norm": 1.8671875, + "grad_norm_var": 0.024079386393229166, + "learning_rate": 0.0001, + "loss": 4.2244, + "loss/crossentropy": 2.1330573558807373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19625350832939148, + "step": 22130 + }, + { + "epoch": 0.44264, + "grad_norm": 1.90625, + "grad_norm_var": 0.0181396484375, + "learning_rate": 0.0001, + "loss": 3.7348, + "loss/crossentropy": 2.1866488456726074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21527475118637085, + "step": 22132 + }, + { + "epoch": 0.44268, + "grad_norm": 2.0, + "grad_norm_var": 0.013985188802083333, + "learning_rate": 0.0001, + "loss": 4.0073, + "loss/crossentropy": 2.0211732387542725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20512627810239792, + "step": 22134 + }, + { + "epoch": 0.44272, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010277303059895833, + "learning_rate": 0.0001, + "loss": 3.9338, + "loss/crossentropy": 1.798271358013153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809278130531311, + "step": 22136 + }, + { + "epoch": 0.44276, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008927408854166667, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 1.7850923538208008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16607706993818283, + "step": 22138 + }, + { + "epoch": 0.4428, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012361399332682292, + "learning_rate": 0.0001, + "loss": 4.0488, + "loss/crossentropy": 2.1693010330200195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18788384646177292, + "step": 22140 + }, + { + "epoch": 0.44284, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013242340087890625, + "learning_rate": 0.0001, + "loss": 4.0582, + "loss/crossentropy": 2.124464750289917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21157250553369522, + "step": 22142 + }, + { + "epoch": 0.44288, + "grad_norm": 1.8984375, + "grad_norm_var": 0.018416086832682293, + "learning_rate": 0.0001, + "loss": 4.2624, + "loss/crossentropy": 2.1086031794548035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727767795324326, + "step": 22144 + }, + { + "epoch": 0.44292, + "grad_norm": 1.890625, + "grad_norm_var": 0.018000284830729168, + "learning_rate": 0.0001, + "loss": 3.8286, + "loss/crossentropy": 1.8891723155975342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20070044696331024, + "step": 22146 + }, + { + "epoch": 0.44296, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015173085530598958, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.1405081748962402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19909673184156418, + "step": 22148 + }, + { + "epoch": 0.443, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0155181884765625, + "learning_rate": 0.0001, + "loss": 4.0455, + "loss/crossentropy": 2.2116858959198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1970309540629387, + "step": 22150 + }, + { + "epoch": 0.44304, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015183258056640624, + "learning_rate": 0.0001, + "loss": 3.8698, + "loss/crossentropy": 1.9010364413261414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18096549808979034, + "step": 22152 + }, + { + "epoch": 0.44308, + "grad_norm": 1.875, + "grad_norm_var": 0.015607706705729167, + "learning_rate": 0.0001, + "loss": 4.0256, + "loss/crossentropy": 2.030808746814728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20662673562765121, + "step": 22154 + }, + { + "epoch": 0.44312, + "grad_norm": 2.265625, + "grad_norm_var": 0.018138631184895834, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 1.9940236806869507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2860369235277176, + "step": 22156 + }, + { + "epoch": 0.44316, + "grad_norm": 1.828125, + "grad_norm_var": 0.01758397420247396, + "learning_rate": 0.0001, + "loss": 4.1577, + "loss/crossentropy": 2.3639464378356934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355366170406342, + "step": 22158 + }, + { + "epoch": 0.4432, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013199615478515624, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 2.247707962989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2214345633983612, + "step": 22160 + }, + { + "epoch": 0.44324, + "grad_norm": 2.109375, + "grad_norm_var": 0.014696248372395833, + "learning_rate": 0.0001, + "loss": 4.3906, + "loss/crossentropy": 2.462310314178467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2597040832042694, + "step": 22162 + }, + { + "epoch": 0.44328, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014890289306640625, + "learning_rate": 0.0001, + "loss": 4.0932, + "loss/crossentropy": 1.9838054180145264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21189701557159424, + "step": 22164 + }, + { + "epoch": 0.44332, + "grad_norm": 2.0625, + "grad_norm_var": 0.015169270833333333, + "learning_rate": 0.0001, + "loss": 3.988, + "loss/crossentropy": 2.086707830429077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23454946279525757, + "step": 22166 + }, + { + "epoch": 0.44336, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014924875895182292, + "learning_rate": 0.0001, + "loss": 3.8447, + "loss/crossentropy": 1.9489662051200867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19790088385343552, + "step": 22168 + }, + { + "epoch": 0.4434, + "grad_norm": 1.8046875, + "grad_norm_var": 0.016532389322916667, + "learning_rate": 0.0001, + "loss": 3.7006, + "loss/crossentropy": 1.9342001676559448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17832304537296295, + "step": 22170 + }, + { + "epoch": 0.44344, + "grad_norm": 2.171875, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 4.353, + "loss/crossentropy": 2.248290777206421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21428222954273224, + "step": 22172 + }, + { + "epoch": 0.44348, + "grad_norm": 2.015625, + "grad_norm_var": 0.009865061442057291, + "learning_rate": 0.0001, + "loss": 4.1017, + "loss/crossentropy": 1.8474311232566833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1795397698879242, + "step": 22174 + }, + { + "epoch": 0.44352, + "grad_norm": 2.03125, + "grad_norm_var": 0.009187571207682292, + "learning_rate": 0.0001, + "loss": 4.1804, + "loss/crossentropy": 1.9687572121620178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062787041068077, + "step": 22176 + }, + { + "epoch": 0.44356, + "grad_norm": 1.7421875, + "grad_norm_var": 0.011625162760416667, + "learning_rate": 0.0001, + "loss": 3.8598, + "loss/crossentropy": 2.4766552448272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046542540192604, + "step": 22178 + }, + { + "epoch": 0.4436, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011618804931640626, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.2163573503494263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187163889408112, + "step": 22180 + }, + { + "epoch": 0.44364, + "grad_norm": 2.0, + "grad_norm_var": 0.010813140869140625, + "learning_rate": 0.0001, + "loss": 4.1702, + "loss/crossentropy": 2.180974006652832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20932868123054504, + "step": 22182 + }, + { + "epoch": 0.44368, + "grad_norm": 1.96875, + "grad_norm_var": 0.014957682291666666, + "learning_rate": 0.0001, + "loss": 4.2237, + "loss/crossentropy": 1.97287255525589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17987515777349472, + "step": 22184 + }, + { + "epoch": 0.44372, + "grad_norm": 2.0, + "grad_norm_var": 0.013679758707682291, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 1.9375402927398682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994931995868683, + "step": 22186 + }, + { + "epoch": 0.44376, + "grad_norm": 1.859375, + "grad_norm_var": 0.0117340087890625, + "learning_rate": 0.0001, + "loss": 3.9908, + "loss/crossentropy": 2.152758836746216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19482193887233734, + "step": 22188 + }, + { + "epoch": 0.4438, + "grad_norm": 1.953125, + "grad_norm_var": 0.012450917561848959, + "learning_rate": 0.0001, + "loss": 4.223, + "loss/crossentropy": 2.29486083984375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24136198312044144, + "step": 22190 + }, + { + "epoch": 0.44384, + "grad_norm": 2.15625, + "grad_norm_var": 0.015363566080729167, + "learning_rate": 0.0001, + "loss": 3.9714, + "loss/crossentropy": 2.143070697784424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122703492641449, + "step": 22192 + }, + { + "epoch": 0.44388, + "grad_norm": 2.046875, + "grad_norm_var": 0.011229451497395833, + "learning_rate": 0.0001, + "loss": 3.8691, + "loss/crossentropy": 2.0414949655532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614571869373322, + "step": 22194 + }, + { + "epoch": 0.44392, + "grad_norm": 2.296875, + "grad_norm_var": 0.016951497395833334, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.137809634208679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22674410045146942, + "step": 22196 + }, + { + "epoch": 0.44396, + "grad_norm": 2.171875, + "grad_norm_var": 0.01871312459309896, + "learning_rate": 0.0001, + "loss": 4.4088, + "loss/crossentropy": 2.292213559150696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18514274060726166, + "step": 22198 + }, + { + "epoch": 0.444, + "grad_norm": 2.015625, + "grad_norm_var": 0.016839345296223957, + "learning_rate": 0.0001, + "loss": 4.0419, + "loss/crossentropy": 2.3552767038345337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21818525344133377, + "step": 22200 + }, + { + "epoch": 0.44404, + "grad_norm": 2.015625, + "grad_norm_var": 0.01602961222330729, + "learning_rate": 0.0001, + "loss": 4.2912, + "loss/crossentropy": 2.1532652378082275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19477012753486633, + "step": 22202 + }, + { + "epoch": 0.44408, + "grad_norm": 1.921875, + "grad_norm_var": 0.012938435872395833, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 1.921963095664978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16913262009620667, + "step": 22204 + }, + { + "epoch": 0.44412, + "grad_norm": 1.828125, + "grad_norm_var": 0.015063222249348958, + "learning_rate": 0.0001, + "loss": 3.8596, + "loss/crossentropy": 1.9791225790977478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18458476662635803, + "step": 22206 + }, + { + "epoch": 0.44416, + "grad_norm": 1.8125, + "grad_norm_var": 0.014595540364583333, + "learning_rate": 0.0001, + "loss": 4.0334, + "loss/crossentropy": 2.0733718276023865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2146601676940918, + "step": 22208 + }, + { + "epoch": 0.4442, + "grad_norm": 1.875, + "grad_norm_var": 0.015830230712890626, + "learning_rate": 0.0001, + "loss": 3.7326, + "loss/crossentropy": 1.5904502272605896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16351275146007538, + "step": 22210 + }, + { + "epoch": 0.44424, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009563954671223958, + "learning_rate": 0.0001, + "loss": 4.0214, + "loss/crossentropy": 2.0840989351272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21245559304952621, + "step": 22212 + }, + { + "epoch": 0.44428, + "grad_norm": 2.046875, + "grad_norm_var": 0.008025868733723959, + "learning_rate": 0.0001, + "loss": 3.9995, + "loss/crossentropy": 2.0513384342193604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19362515211105347, + "step": 22214 + }, + { + "epoch": 0.44432, + "grad_norm": 1.96875, + "grad_norm_var": 0.010284169514973959, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.0588608980178833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839647963643074, + "step": 22216 + }, + { + "epoch": 0.44436, + "grad_norm": 1.8046875, + "grad_norm_var": 0.011742146809895833, + "learning_rate": 0.0001, + "loss": 4.0668, + "loss/crossentropy": 2.0614622831344604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19217892736196518, + "step": 22218 + }, + { + "epoch": 0.4444, + "grad_norm": 1.828125, + "grad_norm_var": 0.013061269124348959, + "learning_rate": 0.0001, + "loss": 4.0823, + "loss/crossentropy": 1.980837643146515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963345631957054, + "step": 22220 + }, + { + "epoch": 0.44444, + "grad_norm": 1.7890625, + "grad_norm_var": 0.014562733968098958, + "learning_rate": 0.0001, + "loss": 3.8748, + "loss/crossentropy": 1.9071148037910461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20523779094219208, + "step": 22222 + }, + { + "epoch": 0.44448, + "grad_norm": 2.078125, + "grad_norm_var": 0.014020792643229167, + "learning_rate": 0.0001, + "loss": 4.2035, + "loss/crossentropy": 2.0757131576538086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1940036118030548, + "step": 22224 + }, + { + "epoch": 0.44452, + "grad_norm": 1.8671875, + "grad_norm_var": 0.015105946858723959, + "learning_rate": 0.0001, + "loss": 3.8287, + "loss/crossentropy": 1.8984442353248596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18971017003059387, + "step": 22226 + }, + { + "epoch": 0.44456, + "grad_norm": 1.9375, + "grad_norm_var": 0.014281972249348959, + "learning_rate": 0.0001, + "loss": 4.1821, + "loss/crossentropy": 2.2503433227539062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21632683277130127, + "step": 22228 + }, + { + "epoch": 0.4446, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011771647135416667, + "learning_rate": 0.0001, + "loss": 4.2548, + "loss/crossentropy": 2.264014482498169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061793953180313, + "step": 22230 + }, + { + "epoch": 0.44464, + "grad_norm": 2.015625, + "grad_norm_var": 0.009279123942057292, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 2.2728497982025146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20043949782848358, + "step": 22232 + }, + { + "epoch": 0.44468, + "grad_norm": 2.125, + "grad_norm_var": 0.0100250244140625, + "learning_rate": 0.0001, + "loss": 4.199, + "loss/crossentropy": 2.074263334274292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2182644009590149, + "step": 22234 + }, + { + "epoch": 0.44472, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008780670166015626, + "learning_rate": 0.0001, + "loss": 3.9973, + "loss/crossentropy": 2.1185666918754578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989990770816803, + "step": 22236 + }, + { + "epoch": 0.44476, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007028961181640625, + "learning_rate": 0.0001, + "loss": 3.9992, + "loss/crossentropy": 1.6533132791519165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17203578352928162, + "step": 22238 + }, + { + "epoch": 0.4448, + "grad_norm": 2.078125, + "grad_norm_var": 0.008231353759765626, + "learning_rate": 0.0001, + "loss": 3.948, + "loss/crossentropy": 2.2107443809509277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20056415349245071, + "step": 22240 + }, + { + "epoch": 0.44484, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007938385009765625, + "learning_rate": 0.0001, + "loss": 3.793, + "loss/crossentropy": 1.9579968452453613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1899353489279747, + "step": 22242 + }, + { + "epoch": 0.44488, + "grad_norm": 2.140625, + "grad_norm_var": 0.010945638020833334, + "learning_rate": 0.0001, + "loss": 4.012, + "loss/crossentropy": 2.074744164943695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2323092818260193, + "step": 22244 + }, + { + "epoch": 0.44492, + "grad_norm": 1.8125, + "grad_norm_var": 0.012308756510416666, + "learning_rate": 0.0001, + "loss": 3.8848, + "loss/crossentropy": 2.2182846069335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18586912751197815, + "step": 22246 + }, + { + "epoch": 0.44496, + "grad_norm": 2.09375, + "grad_norm_var": 0.01761042277018229, + "learning_rate": 0.0001, + "loss": 4.3021, + "loss/crossentropy": 2.2342774868011475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20816949754953384, + "step": 22248 + }, + { + "epoch": 0.445, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0157135009765625, + "learning_rate": 0.0001, + "loss": 3.8011, + "loss/crossentropy": 2.103771924972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282650738954544, + "step": 22250 + }, + { + "epoch": 0.44504, + "grad_norm": 2.0625, + "grad_norm_var": 0.017350006103515624, + "learning_rate": 0.0001, + "loss": 4.1103, + "loss/crossentropy": 2.010720193386078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18631578236818314, + "step": 22252 + }, + { + "epoch": 0.44508, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0169097900390625, + "learning_rate": 0.0001, + "loss": 3.92, + "loss/crossentropy": 1.8840174674987793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19280597567558289, + "step": 22254 + }, + { + "epoch": 0.44512, + "grad_norm": 2.015625, + "grad_norm_var": 0.0151031494140625, + "learning_rate": 0.0001, + "loss": 4.0503, + "loss/crossentropy": 2.404397130012512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23199105262756348, + "step": 22256 + }, + { + "epoch": 0.44516, + "grad_norm": 1.7578125, + "grad_norm_var": 0.018021392822265624, + "learning_rate": 0.0001, + "loss": 3.7078, + "loss/crossentropy": 2.11636883020401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20003244280815125, + "step": 22258 + }, + { + "epoch": 0.4452, + "grad_norm": 1.8046875, + "grad_norm_var": 0.018195597330729167, + "learning_rate": 0.0001, + "loss": 4.1396, + "loss/crossentropy": 1.8522619009017944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16992035508155823, + "step": 22260 + }, + { + "epoch": 0.44524, + "grad_norm": 1.859375, + "grad_norm_var": 0.01738459269205729, + "learning_rate": 0.0001, + "loss": 3.923, + "loss/crossentropy": 2.243737578392029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175471931695938, + "step": 22262 + }, + { + "epoch": 0.44528, + "grad_norm": 1.8046875, + "grad_norm_var": 0.01102294921875, + "learning_rate": 0.0001, + "loss": 3.9973, + "loss/crossentropy": 2.231359362602234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19454128295183182, + "step": 22264 + }, + { + "epoch": 0.44532, + "grad_norm": 1.984375, + "grad_norm_var": 0.0114898681640625, + "learning_rate": 0.0001, + "loss": 4.0373, + "loss/crossentropy": 2.08256071805954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158442661166191, + "step": 22266 + }, + { + "epoch": 0.44536, + "grad_norm": 1.796875, + "grad_norm_var": 0.010520172119140626, + "learning_rate": 0.0001, + "loss": 3.9418, + "loss/crossentropy": 2.089757025241852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029227539896965, + "step": 22268 + }, + { + "epoch": 0.4454, + "grad_norm": 1.8046875, + "grad_norm_var": 0.011017862955729167, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 2.0893616676330566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958554983139038, + "step": 22270 + }, + { + "epoch": 0.44544, + "grad_norm": 2.0625, + "grad_norm_var": 0.0145751953125, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 1.9625884890556335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560869574546814, + "step": 22272 + }, + { + "epoch": 0.44548, + "grad_norm": 2.015625, + "grad_norm_var": 0.012360636393229167, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.4028927087783813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21448686718940735, + "step": 22274 + }, + { + "epoch": 0.44552, + "grad_norm": 2.109375, + "grad_norm_var": 0.010890452067057292, + "learning_rate": 0.0001, + "loss": 4.1805, + "loss/crossentropy": 2.16468608379364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987794190645218, + "step": 22276 + }, + { + "epoch": 0.44556, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010326131184895834, + "learning_rate": 0.0001, + "loss": 3.9558, + "loss/crossentropy": 1.992479383945465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19877834618091583, + "step": 22278 + }, + { + "epoch": 0.4456, + "grad_norm": 1.9375, + "grad_norm_var": 0.008473459879557292, + "learning_rate": 0.0001, + "loss": 3.912, + "loss/crossentropy": 2.0368640422821045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20028480142354965, + "step": 22280 + }, + { + "epoch": 0.44564, + "grad_norm": 2.0, + "grad_norm_var": 0.017753092447916667, + "learning_rate": 0.0001, + "loss": 4.4026, + "loss/crossentropy": 2.252933144569397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19834356755018234, + "step": 22282 + }, + { + "epoch": 0.44568, + "grad_norm": 1.84375, + "grad_norm_var": 0.01628392537434896, + "learning_rate": 0.0001, + "loss": 3.9787, + "loss/crossentropy": 1.8675458431243896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1786389946937561, + "step": 22284 + }, + { + "epoch": 0.44572, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01455078125, + "learning_rate": 0.0001, + "loss": 4.254, + "loss/crossentropy": 2.0105971097946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18405906856060028, + "step": 22286 + }, + { + "epoch": 0.44576, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014598592122395834, + "learning_rate": 0.0001, + "loss": 3.8389, + "loss/crossentropy": 1.8162254095077515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18923819810152054, + "step": 22288 + }, + { + "epoch": 0.4458, + "grad_norm": 1.890625, + "grad_norm_var": 0.01656061808268229, + "learning_rate": 0.0001, + "loss": 3.8374, + "loss/crossentropy": 1.7614133954048157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19005076587200165, + "step": 22290 + }, + { + "epoch": 0.44584, + "grad_norm": 2.0625, + "grad_norm_var": 0.1847551981608073, + "learning_rate": 0.0001, + "loss": 4.135, + "loss/crossentropy": 2.135009288787842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20304158329963684, + "step": 22292 + }, + { + "epoch": 0.44588, + "grad_norm": 1.890625, + "grad_norm_var": 0.18526611328125, + "learning_rate": 0.0001, + "loss": 3.9944, + "loss/crossentropy": 1.897695004940033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19377534836530685, + "step": 22294 + }, + { + "epoch": 0.44592, + "grad_norm": 2.03125, + "grad_norm_var": 0.182763671875, + "learning_rate": 0.0001, + "loss": 4.1663, + "loss/crossentropy": 2.0558266043663025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20582793653011322, + "step": 22296 + }, + { + "epoch": 0.44596, + "grad_norm": 2.015625, + "grad_norm_var": 0.18102188110351564, + "learning_rate": 0.0001, + "loss": 3.9538, + "loss/crossentropy": 2.329292058944702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20754558593034744, + "step": 22298 + }, + { + "epoch": 0.446, + "grad_norm": 1.8203125, + "grad_norm_var": 0.18606338500976563, + "learning_rate": 0.0001, + "loss": 3.8853, + "loss/crossentropy": 2.2229275703430176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20154273509979248, + "step": 22300 + }, + { + "epoch": 0.44604, + "grad_norm": 2.09375, + "grad_norm_var": 0.18578465779622397, + "learning_rate": 0.0001, + "loss": 3.992, + "loss/crossentropy": 1.9050685167312622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17552578449249268, + "step": 22302 + }, + { + "epoch": 0.44608, + "grad_norm": 1.84375, + "grad_norm_var": 0.18623021443684895, + "learning_rate": 0.0001, + "loss": 4.2241, + "loss/crossentropy": 2.139374792575836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19087333232164383, + "step": 22304 + }, + { + "epoch": 0.44612, + "grad_norm": 2.0, + "grad_norm_var": 0.18642578125, + "learning_rate": 0.0001, + "loss": 4.1013, + "loss/crossentropy": 1.8093907237052917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805548369884491, + "step": 22306 + }, + { + "epoch": 0.44616, + "grad_norm": 1.859375, + "grad_norm_var": 0.030443318684895835, + "learning_rate": 0.0001, + "loss": 3.7124, + "loss/crossentropy": 1.8334048390388489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16753460466861725, + "step": 22308 + }, + { + "epoch": 0.4462, + "grad_norm": 2.046875, + "grad_norm_var": 0.030301920572916665, + "learning_rate": 0.0001, + "loss": 4.1018, + "loss/crossentropy": 2.1181896924972534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17999915778636932, + "step": 22310 + }, + { + "epoch": 0.44624, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02830785115559896, + "learning_rate": 0.0001, + "loss": 3.9938, + "loss/crossentropy": 2.3377938270568848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20874116569757462, + "step": 22312 + }, + { + "epoch": 0.44628, + "grad_norm": 1.921875, + "grad_norm_var": 0.028173828125, + "learning_rate": 0.0001, + "loss": 4.3027, + "loss/crossentropy": 2.2557464838027954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2159905582666397, + "step": 22314 + }, + { + "epoch": 0.44632, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02478612263997396, + "learning_rate": 0.0001, + "loss": 4.1486, + "loss/crossentropy": 1.690861165523529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1791190207004547, + "step": 22316 + }, + { + "epoch": 0.44636, + "grad_norm": 1.921875, + "grad_norm_var": 0.02339452107747396, + "learning_rate": 0.0001, + "loss": 3.9995, + "loss/crossentropy": 2.282811760902405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2278990000486374, + "step": 22318 + }, + { + "epoch": 0.4464, + "grad_norm": 2.046875, + "grad_norm_var": 0.023021443684895834, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 2.087849497795105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19879884272813797, + "step": 22320 + }, + { + "epoch": 0.44644, + "grad_norm": 2.0, + "grad_norm_var": 0.024181874593098958, + "learning_rate": 0.0001, + "loss": 4.2878, + "loss/crossentropy": 1.7355778217315674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1880098134279251, + "step": 22322 + }, + { + "epoch": 0.44648, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006556955973307291, + "learning_rate": 0.0001, + "loss": 3.935, + "loss/crossentropy": 1.9930670857429504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195980504155159, + "step": 22324 + }, + { + "epoch": 0.44652, + "grad_norm": 1.890625, + "grad_norm_var": 0.006990305582682292, + "learning_rate": 0.0001, + "loss": 4.058, + "loss/crossentropy": 2.2104332447052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003258973360062, + "step": 22326 + }, + { + "epoch": 0.44656, + "grad_norm": 2.15625, + "grad_norm_var": 0.009208170572916667, + "learning_rate": 0.0001, + "loss": 3.876, + "loss/crossentropy": 1.812508225440979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18001863360404968, + "step": 22328 + }, + { + "epoch": 0.4466, + "grad_norm": 2.09375, + "grad_norm_var": 0.009992472330729167, + "learning_rate": 0.0001, + "loss": 4.2515, + "loss/crossentropy": 2.0790328979492188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21385060250759125, + "step": 22330 + }, + { + "epoch": 0.44664, + "grad_norm": 2.09375, + "grad_norm_var": 0.010206858317057291, + "learning_rate": 0.0001, + "loss": 3.9865, + "loss/crossentropy": 1.778535783290863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18019723147153854, + "step": 22332 + }, + { + "epoch": 0.44668, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011714680989583334, + "learning_rate": 0.0001, + "loss": 4.1055, + "loss/crossentropy": 2.1208608746528625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2075667828321457, + "step": 22334 + }, + { + "epoch": 0.44672, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013523101806640625, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 1.9015109539031982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18012915551662445, + "step": 22336 + }, + { + "epoch": 0.44676, + "grad_norm": 1.921875, + "grad_norm_var": 0.01219482421875, + "learning_rate": 0.0001, + "loss": 3.9193, + "loss/crossentropy": 1.844041883945465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18436504900455475, + "step": 22338 + }, + { + "epoch": 0.4468, + "grad_norm": 1.90625, + "grad_norm_var": 0.012889607747395834, + "learning_rate": 0.0001, + "loss": 3.8859, + "loss/crossentropy": 1.730314016342163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16672005504369736, + "step": 22340 + }, + { + "epoch": 0.44684, + "grad_norm": 2.03125, + "grad_norm_var": 0.06477762858072916, + "learning_rate": 0.0001, + "loss": 3.9309, + "loss/crossentropy": 2.385637402534485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20245376229286194, + "step": 22342 + }, + { + "epoch": 0.44688, + "grad_norm": 1.9765625, + "grad_norm_var": 0.06330337524414062, + "learning_rate": 0.0001, + "loss": 4.0168, + "loss/crossentropy": 2.033573269844055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19226811826229095, + "step": 22344 + }, + { + "epoch": 0.44692, + "grad_norm": 2.03125, + "grad_norm_var": 0.06444498697916666, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 2.018698275089264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19148993492126465, + "step": 22346 + }, + { + "epoch": 0.44696, + "grad_norm": 1.8671875, + "grad_norm_var": 0.06483739217122396, + "learning_rate": 0.0001, + "loss": 4.0311, + "loss/crossentropy": 1.7709164023399353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18092583864927292, + "step": 22348 + }, + { + "epoch": 0.447, + "grad_norm": 2.15625, + "grad_norm_var": 0.06496480305989584, + "learning_rate": 0.0001, + "loss": 4.1857, + "loss/crossentropy": 1.9011916518211365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18185202777385712, + "step": 22350 + }, + { + "epoch": 0.44704, + "grad_norm": 1.875, + "grad_norm_var": 0.0629547119140625, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.9298787117004395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18417780846357346, + "step": 22352 + }, + { + "epoch": 0.44708, + "grad_norm": 1.96875, + "grad_norm_var": 0.0615875244140625, + "learning_rate": 0.0001, + "loss": 4.0872, + "loss/crossentropy": 2.3658339977264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167125567793846, + "step": 22354 + }, + { + "epoch": 0.44712, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06006571451822917, + "learning_rate": 0.0001, + "loss": 4.092, + "loss/crossentropy": 2.150742769241333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21436245739459991, + "step": 22356 + }, + { + "epoch": 0.44716, + "grad_norm": 1.796875, + "grad_norm_var": 0.007879384358723958, + "learning_rate": 0.0001, + "loss": 4.099, + "loss/crossentropy": 2.197741746902466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714494049549103, + "step": 22358 + }, + { + "epoch": 0.4472, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008286285400390624, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 2.11471688747406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19025350362062454, + "step": 22360 + }, + { + "epoch": 0.44724, + "grad_norm": 2.015625, + "grad_norm_var": 0.0075724283854166664, + "learning_rate": 0.0001, + "loss": 3.9815, + "loss/crossentropy": 2.2634165287017822, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038334459066391, + "step": 22362 + }, + { + "epoch": 0.44728, + "grad_norm": 1.984375, + "grad_norm_var": 0.007181549072265625, + "learning_rate": 0.0001, + "loss": 4.1587, + "loss/crossentropy": 2.3305805921554565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2206403762102127, + "step": 22364 + }, + { + "epoch": 0.44732, + "grad_norm": 1.921875, + "grad_norm_var": 0.0032867431640625, + "learning_rate": 0.0001, + "loss": 3.9196, + "loss/crossentropy": 1.8015141487121582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1901993378996849, + "step": 22366 + }, + { + "epoch": 0.44736, + "grad_norm": 2.40625, + "grad_norm_var": 0.017256673177083334, + "learning_rate": 0.0001, + "loss": 4.3285, + "loss/crossentropy": 2.325510263442993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2076701521873474, + "step": 22368 + }, + { + "epoch": 0.4474, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0187652587890625, + "learning_rate": 0.0001, + "loss": 4.0401, + "loss/crossentropy": 1.9470626711845398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854722648859024, + "step": 22370 + }, + { + "epoch": 0.44744, + "grad_norm": 1.78125, + "grad_norm_var": 0.020442708333333334, + "learning_rate": 0.0001, + "loss": 3.9825, + "loss/crossentropy": 1.7524075508117676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17917660623788834, + "step": 22372 + }, + { + "epoch": 0.44748, + "grad_norm": 1.921875, + "grad_norm_var": 0.018480428059895835, + "learning_rate": 0.0001, + "loss": 3.5927, + "loss/crossentropy": 2.0228232741355896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19089559465646744, + "step": 22374 + }, + { + "epoch": 0.44752, + "grad_norm": 1.9375, + "grad_norm_var": 0.019846343994140626, + "learning_rate": 0.0001, + "loss": 4.0478, + "loss/crossentropy": 2.289384961128235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21693934500217438, + "step": 22376 + }, + { + "epoch": 0.44756, + "grad_norm": 2.125, + "grad_norm_var": 0.021996053059895833, + "learning_rate": 0.0001, + "loss": 3.6972, + "loss/crossentropy": 1.9497195482254028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409277498722076, + "step": 22378 + }, + { + "epoch": 0.4476, + "grad_norm": 1.953125, + "grad_norm_var": 0.022004191080729166, + "learning_rate": 0.0001, + "loss": 4.2471, + "loss/crossentropy": 2.184138000011444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19307169318199158, + "step": 22380 + }, + { + "epoch": 0.44764, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02208226521809896, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.0050132274627686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19592129439115524, + "step": 22382 + }, + { + "epoch": 0.44768, + "grad_norm": 1.75, + "grad_norm_var": 0.012475331624348959, + "learning_rate": 0.0001, + "loss": 3.8577, + "loss/crossentropy": 1.725416898727417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17788676172494888, + "step": 22384 + }, + { + "epoch": 0.44772, + "grad_norm": 2.140625, + "grad_norm_var": 0.015103912353515625, + "learning_rate": 0.0001, + "loss": 3.8859, + "loss/crossentropy": 1.8454214930534363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18375547230243683, + "step": 22386 + }, + { + "epoch": 0.44776, + "grad_norm": 2.0, + "grad_norm_var": 0.013166300455729167, + "learning_rate": 0.0001, + "loss": 4.1856, + "loss/crossentropy": 2.090963661670685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20424450188875198, + "step": 22388 + }, + { + "epoch": 0.4478, + "grad_norm": 1.875, + "grad_norm_var": 0.014296213785807291, + "learning_rate": 0.0001, + "loss": 3.5071, + "loss/crossentropy": 1.7257133722305298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18389710783958435, + "step": 22390 + }, + { + "epoch": 0.44784, + "grad_norm": 2.015625, + "grad_norm_var": 0.012153879801432291, + "learning_rate": 0.0001, + "loss": 4.2002, + "loss/crossentropy": 2.0955962538719177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20340529829263687, + "step": 22392 + }, + { + "epoch": 0.44788, + "grad_norm": 2.046875, + "grad_norm_var": 0.010103098551432292, + "learning_rate": 0.0001, + "loss": 3.965, + "loss/crossentropy": 1.8837090730667114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188998244702816, + "step": 22394 + }, + { + "epoch": 0.44792, + "grad_norm": 1.875, + "grad_norm_var": 0.010092926025390626, + "learning_rate": 0.0001, + "loss": 4.0777, + "loss/crossentropy": 2.351726531982422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22628787904977798, + "step": 22396 + }, + { + "epoch": 0.44796, + "grad_norm": 1.90625, + "grad_norm_var": 0.010107421875, + "learning_rate": 0.0001, + "loss": 3.8849, + "loss/crossentropy": 2.012966811656952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19502227008342743, + "step": 22398 + }, + { + "epoch": 0.448, + "grad_norm": 1.890625, + "grad_norm_var": 0.0080230712890625, + "learning_rate": 0.0001, + "loss": 3.9682, + "loss/crossentropy": 1.6796467900276184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17925339192152023, + "step": 22400 + }, + { + "epoch": 0.44804, + "grad_norm": 1.75, + "grad_norm_var": 0.006696573893229167, + "learning_rate": 0.0001, + "loss": 3.7172, + "loss/crossentropy": 2.1229045391082764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038531184196472, + "step": 22402 + }, + { + "epoch": 0.44808, + "grad_norm": 2.046875, + "grad_norm_var": 0.007372792561848958, + "learning_rate": 0.0001, + "loss": 4.1287, + "loss/crossentropy": 2.251511335372925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037583664059639, + "step": 22404 + }, + { + "epoch": 0.44812, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007096099853515625, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 1.768878161907196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192366823554039, + "step": 22406 + }, + { + "epoch": 0.44816, + "grad_norm": 1.78125, + "grad_norm_var": 0.008835601806640624, + "learning_rate": 0.0001, + "loss": 3.7432, + "loss/crossentropy": 1.9983880519866943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903810203075409, + "step": 22408 + }, + { + "epoch": 0.4482, + "grad_norm": 1.859375, + "grad_norm_var": 0.007783762613932292, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 2.0608294010162354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20538168400526047, + "step": 22410 + }, + { + "epoch": 0.44824, + "grad_norm": 1.921875, + "grad_norm_var": 0.0072509765625, + "learning_rate": 0.0001, + "loss": 4.0167, + "loss/crossentropy": 2.137459099292755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062518671154976, + "step": 22412 + }, + { + "epoch": 0.44828, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007437896728515625, + "learning_rate": 0.0001, + "loss": 3.8843, + "loss/crossentropy": 2.090251922607422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19352033734321594, + "step": 22414 + }, + { + "epoch": 0.44832, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006359608968098959, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 1.8260875344276428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446549236774445, + "step": 22416 + }, + { + "epoch": 0.44836, + "grad_norm": 1.96875, + "grad_norm_var": 0.00504150390625, + "learning_rate": 0.0001, + "loss": 4.1228, + "loss/crossentropy": 2.4598742723464966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21792489290237427, + "step": 22418 + }, + { + "epoch": 0.4484, + "grad_norm": 1.7734375, + "grad_norm_var": 0.006113433837890625, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 2.120418429374695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20342590659856796, + "step": 22420 + }, + { + "epoch": 0.44844, + "grad_norm": 1.90625, + "grad_norm_var": 0.007795969645182292, + "learning_rate": 0.0001, + "loss": 4.2376, + "loss/crossentropy": 2.0946252942085266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902334362268448, + "step": 22422 + }, + { + "epoch": 0.44848, + "grad_norm": 2.015625, + "grad_norm_var": 0.007112375895182292, + "learning_rate": 0.0001, + "loss": 4.4449, + "loss/crossentropy": 2.395688056945801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2158007100224495, + "step": 22424 + }, + { + "epoch": 0.44852, + "grad_norm": 1.796875, + "grad_norm_var": 0.0090087890625, + "learning_rate": 0.0001, + "loss": 4.0946, + "loss/crossentropy": 2.1429702043533325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2130357325077057, + "step": 22426 + }, + { + "epoch": 0.44856, + "grad_norm": 2.078125, + "grad_norm_var": 0.010050201416015625, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 2.0070658922195435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955135241150856, + "step": 22428 + }, + { + "epoch": 0.4486, + "grad_norm": 1.9375, + "grad_norm_var": 0.009308878580729167, + "learning_rate": 0.0001, + "loss": 4.0902, + "loss/crossentropy": 2.1505234241485596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20378431677818298, + "step": 22430 + }, + { + "epoch": 0.44864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0091705322265625, + "learning_rate": 0.0001, + "loss": 4.3029, + "loss/crossentropy": 2.4345964193344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19363048672676086, + "step": 22432 + }, + { + "epoch": 0.44868, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010811360677083333, + "learning_rate": 0.0001, + "loss": 3.9744, + "loss/crossentropy": 1.922882616519928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18355388939380646, + "step": 22434 + }, + { + "epoch": 0.44872, + "grad_norm": 1.96875, + "grad_norm_var": 0.008369954427083333, + "learning_rate": 0.0001, + "loss": 3.9232, + "loss/crossentropy": 2.051816701889038, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039457619190216, + "step": 22436 + }, + { + "epoch": 0.44876, + "grad_norm": 2.15625, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 4.0486, + "loss/crossentropy": 2.375608444213867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21171333640813828, + "step": 22438 + }, + { + "epoch": 0.4488, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008937327067057292, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 2.02259361743927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2248649150133133, + "step": 22440 + }, + { + "epoch": 0.44884, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007193756103515625, + "learning_rate": 0.0001, + "loss": 4.1429, + "loss/crossentropy": 2.010511100292206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19157880544662476, + "step": 22442 + }, + { + "epoch": 0.44888, + "grad_norm": 2.0625, + "grad_norm_var": 0.009030914306640625, + "learning_rate": 0.0001, + "loss": 4.4214, + "loss/crossentropy": 2.3076666593551636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2125840112566948, + "step": 22444 + }, + { + "epoch": 0.44892, + "grad_norm": 1.8125, + "grad_norm_var": 0.010227203369140625, + "learning_rate": 0.0001, + "loss": 3.772, + "loss/crossentropy": 1.780324101448059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16909868270158768, + "step": 22446 + }, + { + "epoch": 0.44896, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0105621337890625, + "learning_rate": 0.0001, + "loss": 4.2386, + "loss/crossentropy": 2.1713266372680664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884608194231987, + "step": 22448 + }, + { + "epoch": 0.449, + "grad_norm": 1.890625, + "grad_norm_var": 0.009444173177083333, + "learning_rate": 0.0001, + "loss": 4.2591, + "loss/crossentropy": 2.1273884773254395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.181084506213665, + "step": 22450 + }, + { + "epoch": 0.44904, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009850819905598959, + "learning_rate": 0.0001, + "loss": 4.0999, + "loss/crossentropy": 1.8685917258262634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984790861606598, + "step": 22452 + }, + { + "epoch": 0.44908, + "grad_norm": 1.953125, + "grad_norm_var": 0.006689453125, + "learning_rate": 0.0001, + "loss": 3.9955, + "loss/crossentropy": 1.8099998831748962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186032235622406, + "step": 22454 + }, + { + "epoch": 0.44912, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007063802083333333, + "learning_rate": 0.0001, + "loss": 3.8906, + "loss/crossentropy": 2.111889600753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19286638498306274, + "step": 22456 + }, + { + "epoch": 0.44916, + "grad_norm": 1.8125, + "grad_norm_var": 0.008984120686848958, + "learning_rate": 0.0001, + "loss": 3.7561, + "loss/crossentropy": 2.041996479034424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18484222888946533, + "step": 22458 + }, + { + "epoch": 0.4492, + "grad_norm": 2.0625, + "grad_norm_var": 0.0056874593098958336, + "learning_rate": 0.0001, + "loss": 3.8578, + "loss/crossentropy": 1.755624771118164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1807885393500328, + "step": 22460 + }, + { + "epoch": 0.44924, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0054840087890625, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 2.272615075111389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22077642381191254, + "step": 22462 + }, + { + "epoch": 0.44928, + "grad_norm": 1.828125, + "grad_norm_var": 0.0053179423014322914, + "learning_rate": 0.0001, + "loss": 3.7631, + "loss/crossentropy": 2.1574344635009766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905694305896759, + "step": 22464 + }, + { + "epoch": 0.44932, + "grad_norm": 2.0, + "grad_norm_var": 0.008463287353515625, + "learning_rate": 0.0001, + "loss": 4.1746, + "loss/crossentropy": 2.3979402780532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21937933564186096, + "step": 22466 + }, + { + "epoch": 0.44936, + "grad_norm": 1.984375, + "grad_norm_var": 0.009250640869140625, + "learning_rate": 0.0001, + "loss": 4.0639, + "loss/crossentropy": 2.181613326072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.219841867685318, + "step": 22468 + }, + { + "epoch": 0.4494, + "grad_norm": 2.03125, + "grad_norm_var": 0.01041259765625, + "learning_rate": 0.0001, + "loss": 4.032, + "loss/crossentropy": 2.2507534623146057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193026959896088, + "step": 22470 + }, + { + "epoch": 0.44944, + "grad_norm": 1.984375, + "grad_norm_var": 0.013278961181640625, + "learning_rate": 0.0001, + "loss": 4.3495, + "loss/crossentropy": 2.288329839706421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20958562195301056, + "step": 22472 + }, + { + "epoch": 0.44948, + "grad_norm": 1.96875, + "grad_norm_var": 0.00838623046875, + "learning_rate": 0.0001, + "loss": 3.8597, + "loss/crossentropy": 2.0730547308921814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21679005771875381, + "step": 22474 + }, + { + "epoch": 0.44952, + "grad_norm": 1.96875, + "grad_norm_var": 0.008036041259765625, + "learning_rate": 0.0001, + "loss": 4.2922, + "loss/crossentropy": 2.1609703302383423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996021345257759, + "step": 22476 + }, + { + "epoch": 0.44956, + "grad_norm": 2.078125, + "grad_norm_var": 0.007738240559895833, + "learning_rate": 0.0001, + "loss": 4.048, + "loss/crossentropy": 1.857498288154602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019329339265823, + "step": 22478 + }, + { + "epoch": 0.4496, + "grad_norm": 2.046875, + "grad_norm_var": 0.0049468994140625, + "learning_rate": 0.0001, + "loss": 4.1772, + "loss/crossentropy": 1.9688559174537659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18978293240070343, + "step": 22480 + }, + { + "epoch": 0.44964, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005391184488932292, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.126068413257599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19416333734989166, + "step": 22482 + }, + { + "epoch": 0.44968, + "grad_norm": 1.9375, + "grad_norm_var": 0.0064361572265625, + "learning_rate": 0.0001, + "loss": 3.9915, + "loss/crossentropy": 2.0684805512428284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21337899565696716, + "step": 22484 + }, + { + "epoch": 0.44972, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007669830322265625, + "learning_rate": 0.0001, + "loss": 4.0065, + "loss/crossentropy": 1.965733289718628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928843855857849, + "step": 22486 + }, + { + "epoch": 0.44976, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0059478759765625, + "learning_rate": 0.0001, + "loss": 4.1346, + "loss/crossentropy": 1.80375075340271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18274405598640442, + "step": 22488 + }, + { + "epoch": 0.4498, + "grad_norm": 2.0, + "grad_norm_var": 0.008284505208333333, + "learning_rate": 0.0001, + "loss": 3.8954, + "loss/crossentropy": 1.939602553844452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18628022074699402, + "step": 22490 + }, + { + "epoch": 0.44984, + "grad_norm": 1.890625, + "grad_norm_var": 0.009276326497395833, + "learning_rate": 0.0001, + "loss": 4.0224, + "loss/crossentropy": 2.408365488052368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21072806417942047, + "step": 22492 + }, + { + "epoch": 0.44988, + "grad_norm": 1.953125, + "grad_norm_var": 0.008961741129557292, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 2.1751617789268494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21095305681228638, + "step": 22494 + }, + { + "epoch": 0.44992, + "grad_norm": 1.9375, + "grad_norm_var": 0.03264745076497396, + "learning_rate": 0.0001, + "loss": 4.2429, + "loss/crossentropy": 2.3348854780197144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21335425972938538, + "step": 22496 + }, + { + "epoch": 0.44996, + "grad_norm": 1.9375, + "grad_norm_var": 0.03258031209309896, + "learning_rate": 0.0001, + "loss": 3.9947, + "loss/crossentropy": 2.096919059753418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042403250932693, + "step": 22498 + }, + { + "epoch": 0.45, + "grad_norm": 1.875, + "grad_norm_var": 0.033056640625, + "learning_rate": 0.0001, + "loss": 3.8055, + "loss/crossentropy": 1.950230062007904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19603446125984192, + "step": 22500 + }, + { + "epoch": 0.45004, + "grad_norm": 2.0, + "grad_norm_var": 0.032692209879557295, + "learning_rate": 0.0001, + "loss": 4.2118, + "loss/crossentropy": 2.3010233640670776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23052462935447693, + "step": 22502 + }, + { + "epoch": 0.45008, + "grad_norm": 2.03125, + "grad_norm_var": 0.03386408487955729, + "learning_rate": 0.0001, + "loss": 3.8238, + "loss/crossentropy": 2.143756926059723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19040528684854507, + "step": 22504 + }, + { + "epoch": 0.45012, + "grad_norm": 2.3125, + "grad_norm_var": 0.03865941365559896, + "learning_rate": 0.0001, + "loss": 4.194, + "loss/crossentropy": 1.9510119557380676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19400011003017426, + "step": 22506 + }, + { + "epoch": 0.45016, + "grad_norm": 2.203125, + "grad_norm_var": 0.03951822916666667, + "learning_rate": 0.0001, + "loss": 4.0304, + "loss/crossentropy": 2.2680559158325195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22951483726501465, + "step": 22508 + }, + { + "epoch": 0.4502, + "grad_norm": 2.140625, + "grad_norm_var": 0.19219741821289063, + "learning_rate": 0.0001, + "loss": 4.0042, + "loss/crossentropy": 2.3191086053848267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179061621427536, + "step": 22510 + }, + { + "epoch": 0.45024, + "grad_norm": 2.125, + "grad_norm_var": 0.17602310180664063, + "learning_rate": 0.0001, + "loss": 4.1409, + "loss/crossentropy": 2.2202601432800293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21782507002353668, + "step": 22512 + }, + { + "epoch": 0.45028, + "grad_norm": 2.078125, + "grad_norm_var": 0.17064208984375, + "learning_rate": 0.0001, + "loss": 4.1591, + "loss/crossentropy": 1.9649160504341125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19763009250164032, + "step": 22514 + }, + { + "epoch": 0.45032, + "grad_norm": 2.125, + "grad_norm_var": 0.1665435791015625, + "learning_rate": 0.0001, + "loss": 4.4044, + "loss/crossentropy": 2.0819711089134216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008282020688057, + "step": 22516 + }, + { + "epoch": 0.45036, + "grad_norm": 1.890625, + "grad_norm_var": 0.26408589680989586, + "learning_rate": 0.0001, + "loss": 4.2564, + "loss/crossentropy": 2.2396440505981445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21240604668855667, + "step": 22518 + }, + { + "epoch": 0.4504, + "grad_norm": 1.9296875, + "grad_norm_var": 0.268542226155599, + "learning_rate": 0.0001, + "loss": 3.9298, + "loss/crossentropy": 2.21976238489151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20444615930318832, + "step": 22520 + }, + { + "epoch": 0.45044, + "grad_norm": 2.015625, + "grad_norm_var": 0.2693072001139323, + "learning_rate": 0.0001, + "loss": 4.075, + "loss/crossentropy": 2.2542803287506104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2205461636185646, + "step": 22522 + }, + { + "epoch": 0.45048, + "grad_norm": 2.0, + "grad_norm_var": 0.2702555338541667, + "learning_rate": 0.0001, + "loss": 3.9809, + "loss/crossentropy": 1.7875661253929138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17205239832401276, + "step": 22524 + }, + { + "epoch": 0.45052, + "grad_norm": 1.9609375, + "grad_norm_var": 0.13923238118489584, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 2.0036060214042664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21126192808151245, + "step": 22526 + }, + { + "epoch": 0.45056, + "grad_norm": 2.125, + "grad_norm_var": 0.13862279256184895, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 1.965367078781128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19563131779432297, + "step": 22528 + }, + { + "epoch": 0.4506, + "grad_norm": 1.8671875, + "grad_norm_var": 0.14239273071289063, + "learning_rate": 0.0001, + "loss": 4.158, + "loss/crossentropy": 1.9605298042297363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932133436203003, + "step": 22530 + }, + { + "epoch": 0.45064, + "grad_norm": 1.8515625, + "grad_norm_var": 0.14579671223958332, + "learning_rate": 0.0001, + "loss": 3.9343, + "loss/crossentropy": 2.1047326922416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19184139370918274, + "step": 22532 + }, + { + "epoch": 0.45068, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007104237874348958, + "learning_rate": 0.0001, + "loss": 4.299, + "loss/crossentropy": 2.173476457595825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1873651072382927, + "step": 22534 + }, + { + "epoch": 0.45072, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006379191080729167, + "learning_rate": 0.0001, + "loss": 4.0107, + "loss/crossentropy": 1.928326666355133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937912181019783, + "step": 22536 + }, + { + "epoch": 0.45076, + "grad_norm": 1.953125, + "grad_norm_var": 0.00718994140625, + "learning_rate": 0.0001, + "loss": 3.9559, + "loss/crossentropy": 2.2145062685012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21003177762031555, + "step": 22538 + }, + { + "epoch": 0.4508, + "grad_norm": 1.96875, + "grad_norm_var": 0.008430989583333333, + "learning_rate": 0.0001, + "loss": 3.9536, + "loss/crossentropy": 1.9980989694595337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18583524972200394, + "step": 22540 + }, + { + "epoch": 0.45084, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008552805582682291, + "learning_rate": 0.0001, + "loss": 4.0124, + "loss/crossentropy": 2.0646828413009644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20016716420650482, + "step": 22542 + }, + { + "epoch": 0.45088, + "grad_norm": 1.890625, + "grad_norm_var": 0.006306711832682292, + "learning_rate": 0.0001, + "loss": 3.9468, + "loss/crossentropy": 2.4558991193771362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985267549753189, + "step": 22544 + }, + { + "epoch": 0.45092, + "grad_norm": 2.0, + "grad_norm_var": 0.006589508056640625, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 2.140208601951599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20745448023080826, + "step": 22546 + }, + { + "epoch": 0.45096, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0058176676432291664, + "learning_rate": 0.0001, + "loss": 4.0322, + "loss/crossentropy": 2.0344385504722595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20116931200027466, + "step": 22548 + }, + { + "epoch": 0.451, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007306925455729167, + "learning_rate": 0.0001, + "loss": 4.1927, + "loss/crossentropy": 2.213254153728485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145981639623642, + "step": 22550 + }, + { + "epoch": 0.45104, + "grad_norm": 1.96875, + "grad_norm_var": 0.007260894775390625, + "learning_rate": 0.0001, + "loss": 4.0203, + "loss/crossentropy": 2.2077749967575073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086869403719902, + "step": 22552 + }, + { + "epoch": 0.45108, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0064208984375, + "learning_rate": 0.0001, + "loss": 3.9152, + "loss/crossentropy": 2.131449520587921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20110997557640076, + "step": 22554 + }, + { + "epoch": 0.45112, + "grad_norm": 1.921875, + "grad_norm_var": 0.004107411702473958, + "learning_rate": 0.0001, + "loss": 4.0194, + "loss/crossentropy": 1.7281222343444824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18489576131105423, + "step": 22556 + }, + { + "epoch": 0.45116, + "grad_norm": 3.4375, + "grad_norm_var": 0.14352188110351563, + "learning_rate": 0.0001, + "loss": 3.9235, + "loss/crossentropy": 1.9907403588294983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19740281254053116, + "step": 22558 + }, + { + "epoch": 0.4512, + "grad_norm": 2.046875, + "grad_norm_var": 0.14071451822916667, + "learning_rate": 0.0001, + "loss": 4.2471, + "loss/crossentropy": 2.042698383331299, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20195024460554123, + "step": 22560 + }, + { + "epoch": 0.45124, + "grad_norm": 1.921875, + "grad_norm_var": 0.14180272420247395, + "learning_rate": 0.0001, + "loss": 3.9868, + "loss/crossentropy": 2.267351269721985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20345290750265121, + "step": 22562 + }, + { + "epoch": 0.45128, + "grad_norm": 1.8515625, + "grad_norm_var": 0.1418413798014323, + "learning_rate": 0.0001, + "loss": 3.8913, + "loss/crossentropy": 2.031484067440033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18091221153736115, + "step": 22564 + }, + { + "epoch": 0.45132, + "grad_norm": 2.125, + "grad_norm_var": 0.14130757649739584, + "learning_rate": 0.0001, + "loss": 4.3266, + "loss/crossentropy": 2.327489733695984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21997347474098206, + "step": 22566 + }, + { + "epoch": 0.45136, + "grad_norm": 2.046875, + "grad_norm_var": 0.1402099609375, + "learning_rate": 0.0001, + "loss": 4.3752, + "loss/crossentropy": 2.117951452732086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1775515079498291, + "step": 22568 + }, + { + "epoch": 0.4514, + "grad_norm": 1.90625, + "grad_norm_var": 0.1423906962076823, + "learning_rate": 0.0001, + "loss": 3.9752, + "loss/crossentropy": 1.9014524817466736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19188529253005981, + "step": 22570 + }, + { + "epoch": 0.45144, + "grad_norm": 2.09375, + "grad_norm_var": 0.13876851399739584, + "learning_rate": 0.0001, + "loss": 4.2163, + "loss/crossentropy": 2.0534915924072266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20141670107841492, + "step": 22572 + }, + { + "epoch": 0.45148, + "grad_norm": 2.203125, + "grad_norm_var": 0.009810384114583333, + "learning_rate": 0.0001, + "loss": 4.3861, + "loss/crossentropy": 2.018342673778534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101028680801392, + "step": 22574 + }, + { + "epoch": 0.45152, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011755116780598958, + "learning_rate": 0.0001, + "loss": 4.0395, + "loss/crossentropy": 2.000533401966095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18452423810958862, + "step": 22576 + }, + { + "epoch": 0.45156, + "grad_norm": 2.015625, + "grad_norm_var": 0.012015533447265626, + "learning_rate": 0.0001, + "loss": 3.9407, + "loss/crossentropy": 2.109315812587738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20570579916238785, + "step": 22578 + }, + { + "epoch": 0.4516, + "grad_norm": 1.9375, + "grad_norm_var": 0.011057281494140625, + "learning_rate": 0.0001, + "loss": 4.1289, + "loss/crossentropy": 2.2056689262390137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195539653301239, + "step": 22580 + }, + { + "epoch": 0.45164, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010162099202473959, + "learning_rate": 0.0001, + "loss": 4.0627, + "loss/crossentropy": 2.246178388595581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1960083767771721, + "step": 22582 + }, + { + "epoch": 0.45168, + "grad_norm": 1.90625, + "grad_norm_var": 0.010176340738932291, + "learning_rate": 0.0001, + "loss": 4.1413, + "loss/crossentropy": 1.8758829832077026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19170167297124863, + "step": 22584 + }, + { + "epoch": 0.45172, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0093994140625, + "learning_rate": 0.0001, + "loss": 3.9841, + "loss/crossentropy": 1.9648401141166687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863914281129837, + "step": 22586 + }, + { + "epoch": 0.45176, + "grad_norm": 2.015625, + "grad_norm_var": 0.008339182535807291, + "learning_rate": 0.0001, + "loss": 4.2261, + "loss/crossentropy": 2.2547377347946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003096491098404, + "step": 22588 + }, + { + "epoch": 0.4518, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004626210530598958, + "learning_rate": 0.0001, + "loss": 3.9569, + "loss/crossentropy": 1.9784765839576721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822790503501892, + "step": 22590 + }, + { + "epoch": 0.45184, + "grad_norm": 1.96875, + "grad_norm_var": 0.0026730855305989584, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 1.947198748588562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18320369720458984, + "step": 22592 + }, + { + "epoch": 0.45188, + "grad_norm": 1.7578125, + "grad_norm_var": 0.007625071207682291, + "learning_rate": 0.0001, + "loss": 3.5234, + "loss/crossentropy": 1.934649407863617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1809309870004654, + "step": 22594 + }, + { + "epoch": 0.45192, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009439849853515625, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 2.064886689186096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17977996170520782, + "step": 22596 + }, + { + "epoch": 0.45196, + "grad_norm": 1.9375, + "grad_norm_var": 0.011228179931640625, + "learning_rate": 0.0001, + "loss": 3.8852, + "loss/crossentropy": 1.7243138551712036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16666193306446075, + "step": 22598 + }, + { + "epoch": 0.452, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01080322265625, + "learning_rate": 0.0001, + "loss": 3.9807, + "loss/crossentropy": 2.0566282272338867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19942068308591843, + "step": 22600 + }, + { + "epoch": 0.45204, + "grad_norm": 1.765625, + "grad_norm_var": 0.012078603108723959, + "learning_rate": 0.0001, + "loss": 3.9323, + "loss/crossentropy": 2.108401298522949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068444862961769, + "step": 22602 + }, + { + "epoch": 0.45208, + "grad_norm": 1.96875, + "grad_norm_var": 0.011333974202473958, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 2.281686544418335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20834453403949738, + "step": 22604 + }, + { + "epoch": 0.45212, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012324778238932292, + "learning_rate": 0.0001, + "loss": 4.0902, + "loss/crossentropy": 1.8627066016197205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18476207554340363, + "step": 22606 + }, + { + "epoch": 0.45216, + "grad_norm": 1.921875, + "grad_norm_var": 0.011567942301432292, + "learning_rate": 0.0001, + "loss": 4.097, + "loss/crossentropy": 2.46126925945282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410762190818787, + "step": 22608 + }, + { + "epoch": 0.4522, + "grad_norm": 1.7890625, + "grad_norm_var": 0.009106190999348958, + "learning_rate": 0.0001, + "loss": 3.9222, + "loss/crossentropy": 1.8764225244522095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21187759190797806, + "step": 22610 + }, + { + "epoch": 0.45224, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007092030843098959, + "learning_rate": 0.0001, + "loss": 3.9213, + "loss/crossentropy": 1.8535012602806091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1783105507493019, + "step": 22612 + }, + { + "epoch": 0.45228, + "grad_norm": 1.96875, + "grad_norm_var": 0.0054929097493489586, + "learning_rate": 0.0001, + "loss": 4.1948, + "loss/crossentropy": 1.9735658764839172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031656950712204, + "step": 22614 + }, + { + "epoch": 0.45232, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0064389546712239586, + "learning_rate": 0.0001, + "loss": 3.8466, + "loss/crossentropy": 2.005250871181488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18143242597579956, + "step": 22616 + }, + { + "epoch": 0.45236, + "grad_norm": 4.84375, + "grad_norm_var": 0.5445149739583334, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 2.1731566786766052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042900174856186, + "step": 22618 + }, + { + "epoch": 0.4524, + "grad_norm": 1.921875, + "grad_norm_var": 0.5424496968587239, + "learning_rate": 0.0001, + "loss": 3.867, + "loss/crossentropy": 2.2009552717208862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20621322095394135, + "step": 22620 + }, + { + "epoch": 0.45244, + "grad_norm": 2.09375, + "grad_norm_var": 0.5397664388020833, + "learning_rate": 0.0001, + "loss": 3.9673, + "loss/crossentropy": 2.2014458179473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910918429493904, + "step": 22622 + }, + { + "epoch": 0.45248, + "grad_norm": 1.921875, + "grad_norm_var": 0.538287099202474, + "learning_rate": 0.0001, + "loss": 3.8493, + "loss/crossentropy": 1.9113351702690125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981104016304016, + "step": 22624 + }, + { + "epoch": 0.45252, + "grad_norm": 1.8515625, + "grad_norm_var": 0.5362223307291667, + "learning_rate": 0.0001, + "loss": 3.9984, + "loss/crossentropy": 1.7521483302116394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18573005497455597, + "step": 22626 + }, + { + "epoch": 0.45256, + "grad_norm": 1.8046875, + "grad_norm_var": 0.5408111572265625, + "learning_rate": 0.0001, + "loss": 3.8505, + "loss/crossentropy": 2.003620207309723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799101158976555, + "step": 22628 + }, + { + "epoch": 0.4526, + "grad_norm": 2.171875, + "grad_norm_var": 0.5576894124348958, + "learning_rate": 0.0001, + "loss": 4.0585, + "loss/crossentropy": 2.17133104801178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21434936672449112, + "step": 22630 + }, + { + "epoch": 0.45264, + "grad_norm": 1.7890625, + "grad_norm_var": 0.5505777994791666, + "learning_rate": 0.0001, + "loss": 3.8865, + "loss/crossentropy": 2.113120198249817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21246708929538727, + "step": 22632 + }, + { + "epoch": 0.45268, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04535319010416667, + "learning_rate": 0.0001, + "loss": 3.9273, + "loss/crossentropy": 1.9400765299797058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19067071378231049, + "step": 22634 + }, + { + "epoch": 0.45272, + "grad_norm": 1.921875, + "grad_norm_var": 0.04595133463541667, + "learning_rate": 0.0001, + "loss": 4.1057, + "loss/crossentropy": 2.107842206954956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903919279575348, + "step": 22636 + }, + { + "epoch": 0.45276, + "grad_norm": 1.8515625, + "grad_norm_var": 0.046529134114583336, + "learning_rate": 0.0001, + "loss": 3.969, + "loss/crossentropy": 2.093311131000519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854369342327118, + "step": 22638 + }, + { + "epoch": 0.4528, + "grad_norm": 1.9921875, + "grad_norm_var": 0.04711278279622396, + "learning_rate": 0.0001, + "loss": 4.321, + "loss/crossentropy": 2.2625142335891724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128172516822815, + "step": 22640 + }, + { + "epoch": 0.45284, + "grad_norm": 1.890625, + "grad_norm_var": 0.04737726847330729, + "learning_rate": 0.0001, + "loss": 3.9444, + "loss/crossentropy": 2.13210928440094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18905573338270187, + "step": 22642 + }, + { + "epoch": 0.45288, + "grad_norm": 1.75, + "grad_norm_var": 0.04838841756184896, + "learning_rate": 0.0001, + "loss": 3.8324, + "loss/crossentropy": 2.1068539023399353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21476633101701736, + "step": 22644 + }, + { + "epoch": 0.45292, + "grad_norm": 1.921875, + "grad_norm_var": 0.009903971354166667, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 1.9843144416809082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885334774851799, + "step": 22646 + }, + { + "epoch": 0.45296, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 4.0819, + "loss/crossentropy": 2.3613446950912476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2183755785226822, + "step": 22648 + }, + { + "epoch": 0.453, + "grad_norm": 1.9375, + "grad_norm_var": 0.0125152587890625, + "learning_rate": 0.0001, + "loss": 3.9229, + "loss/crossentropy": 2.2134616374969482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22959928214550018, + "step": 22650 + }, + { + "epoch": 0.45304, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012532552083333334, + "learning_rate": 0.0001, + "loss": 4.0039, + "loss/crossentropy": 2.332140803337097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20412423461675644, + "step": 22652 + }, + { + "epoch": 0.45308, + "grad_norm": 2.03125, + "grad_norm_var": 0.012601470947265625, + "learning_rate": 0.0001, + "loss": 4.1368, + "loss/crossentropy": 1.7886313199996948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998751536011696, + "step": 22654 + }, + { + "epoch": 0.45312, + "grad_norm": 1.7578125, + "grad_norm_var": 0.012733713785807291, + "learning_rate": 0.0001, + "loss": 3.8701, + "loss/crossentropy": 1.8666648864746094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16398675739765167, + "step": 22656 + }, + { + "epoch": 0.45316, + "grad_norm": 2.015625, + "grad_norm_var": 0.015077463785807292, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 1.7308924794197083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17301977425813675, + "step": 22658 + }, + { + "epoch": 0.4532, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013459269205729167, + "learning_rate": 0.0001, + "loss": 4.0125, + "loss/crossentropy": 2.0344366431236267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18964581936597824, + "step": 22660 + }, + { + "epoch": 0.45324, + "grad_norm": 1.984375, + "grad_norm_var": 0.013991038004557291, + "learning_rate": 0.0001, + "loss": 4.0698, + "loss/crossentropy": 2.1261265873908997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20520442724227905, + "step": 22662 + }, + { + "epoch": 0.45328, + "grad_norm": 2.046875, + "grad_norm_var": 0.008641560872395834, + "learning_rate": 0.0001, + "loss": 4.169, + "loss/crossentropy": 2.163342595100403, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064770758152008, + "step": 22664 + }, + { + "epoch": 0.45332, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00960693359375, + "learning_rate": 0.0001, + "loss": 4.0267, + "loss/crossentropy": 2.293798089027405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21860123425722122, + "step": 22666 + }, + { + "epoch": 0.45336, + "grad_norm": 1.9375, + "grad_norm_var": 0.009891764322916666, + "learning_rate": 0.0001, + "loss": 3.8624, + "loss/crossentropy": 2.1405990719795227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475476443767548, + "step": 22668 + }, + { + "epoch": 0.4534, + "grad_norm": 1.921875, + "grad_norm_var": 0.0090240478515625, + "learning_rate": 0.0001, + "loss": 4.0807, + "loss/crossentropy": 2.4012043476104736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21024177968502045, + "step": 22670 + }, + { + "epoch": 0.45344, + "grad_norm": 2.015625, + "grad_norm_var": 0.008182525634765625, + "learning_rate": 0.0001, + "loss": 4.1348, + "loss/crossentropy": 1.8321769833564758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21026848256587982, + "step": 22672 + }, + { + "epoch": 0.45348, + "grad_norm": 1.9375, + "grad_norm_var": 0.005832672119140625, + "learning_rate": 0.0001, + "loss": 3.9761, + "loss/crossentropy": 1.7738409042358398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17715871334075928, + "step": 22674 + }, + { + "epoch": 0.45352, + "grad_norm": 1.828125, + "grad_norm_var": 0.0066070556640625, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 2.406354308128357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20188595354557037, + "step": 22676 + }, + { + "epoch": 0.45356, + "grad_norm": 1.71875, + "grad_norm_var": 0.0100006103515625, + "learning_rate": 0.0001, + "loss": 3.8723, + "loss/crossentropy": 1.977246344089508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766221970319748, + "step": 22678 + }, + { + "epoch": 0.4536, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007922108968098958, + "learning_rate": 0.0001, + "loss": 3.9496, + "loss/crossentropy": 2.2606882452964783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21319438517093658, + "step": 22680 + }, + { + "epoch": 0.45364, + "grad_norm": 2.21875, + "grad_norm_var": 0.013313547770182291, + "learning_rate": 0.0001, + "loss": 4.3151, + "loss/crossentropy": 2.212466239929199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21307764202356339, + "step": 22682 + }, + { + "epoch": 0.45368, + "grad_norm": 1.90625, + "grad_norm_var": 0.012804921468098958, + "learning_rate": 0.0001, + "loss": 4.0279, + "loss/crossentropy": 1.8246173858642578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18758808821439743, + "step": 22684 + }, + { + "epoch": 0.45372, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013946278889973959, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 2.0831995010375977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112889140844345, + "step": 22686 + }, + { + "epoch": 0.45376, + "grad_norm": 1.953125, + "grad_norm_var": 0.013262685139973958, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 1.81759911775589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17542830109596252, + "step": 22688 + }, + { + "epoch": 0.4538, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014574178059895833, + "learning_rate": 0.0001, + "loss": 4.3459, + "loss/crossentropy": 1.99278324842453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18871523439884186, + "step": 22690 + }, + { + "epoch": 0.45384, + "grad_norm": 2.359375, + "grad_norm_var": 0.02394383748372396, + "learning_rate": 0.0001, + "loss": 4.2162, + "loss/crossentropy": 2.050497889518738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21048897504806519, + "step": 22692 + }, + { + "epoch": 0.45388, + "grad_norm": 1.84375, + "grad_norm_var": 0.01898981730143229, + "learning_rate": 0.0001, + "loss": 3.9088, + "loss/crossentropy": 1.7781237959861755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762016862630844, + "step": 22694 + }, + { + "epoch": 0.45392, + "grad_norm": 1.921875, + "grad_norm_var": 0.01878840128580729, + "learning_rate": 0.0001, + "loss": 3.8821, + "loss/crossentropy": 1.8609422445297241, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18131760507822037, + "step": 22696 + }, + { + "epoch": 0.45396, + "grad_norm": 2.1875, + "grad_norm_var": 0.018553670247395834, + "learning_rate": 0.0001, + "loss": 4.3945, + "loss/crossentropy": 2.274345874786377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22067608684301376, + "step": 22698 + }, + { + "epoch": 0.454, + "grad_norm": 2.109375, + "grad_norm_var": 0.021036529541015626, + "learning_rate": 0.0001, + "loss": 4.1889, + "loss/crossentropy": 2.0518780946731567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190098837018013, + "step": 22700 + }, + { + "epoch": 0.45404, + "grad_norm": 1.8125, + "grad_norm_var": 0.020654296875, + "learning_rate": 0.0001, + "loss": 4.0623, + "loss/crossentropy": 1.879017412662506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17030316591262817, + "step": 22702 + }, + { + "epoch": 0.45408, + "grad_norm": 1.8515625, + "grad_norm_var": 0.022944895426432292, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.090358793735504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893768459558487, + "step": 22704 + }, + { + "epoch": 0.45412, + "grad_norm": 1.9375, + "grad_norm_var": 0.024095662434895835, + "learning_rate": 0.0001, + "loss": 4.0252, + "loss/crossentropy": 1.8985764980316162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20475559681653976, + "step": 22706 + }, + { + "epoch": 0.45416, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014662424723307291, + "learning_rate": 0.0001, + "loss": 4.0392, + "loss/crossentropy": 2.1758298873901367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19116609543561935, + "step": 22708 + }, + { + "epoch": 0.4542, + "grad_norm": 1.828125, + "grad_norm_var": 0.014741770426432292, + "learning_rate": 0.0001, + "loss": 3.8379, + "loss/crossentropy": 1.7963250279426575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838328242301941, + "step": 22710 + }, + { + "epoch": 0.45424, + "grad_norm": 2.015625, + "grad_norm_var": 0.015046183268229167, + "learning_rate": 0.0001, + "loss": 4.2924, + "loss/crossentropy": 2.4492361545562744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21029194444417953, + "step": 22712 + }, + { + "epoch": 0.45428, + "grad_norm": 1.921875, + "grad_norm_var": 0.0115386962890625, + "learning_rate": 0.0001, + "loss": 3.9314, + "loss/crossentropy": 2.10919725894928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20362409204244614, + "step": 22714 + }, + { + "epoch": 0.45432, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008713531494140624, + "learning_rate": 0.0001, + "loss": 4.0962, + "loss/crossentropy": 2.132240355014801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18763454258441925, + "step": 22716 + }, + { + "epoch": 0.45436, + "grad_norm": 2.0625, + "grad_norm_var": 0.008048502604166667, + "learning_rate": 0.0001, + "loss": 4.0287, + "loss/crossentropy": 2.1648387908935547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098705694079399, + "step": 22718 + }, + { + "epoch": 0.4544, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008109283447265626, + "learning_rate": 0.0001, + "loss": 3.908, + "loss/crossentropy": 2.2334399223327637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21508647501468658, + "step": 22720 + }, + { + "epoch": 0.45444, + "grad_norm": 1.84375, + "grad_norm_var": 0.006101226806640625, + "learning_rate": 0.0001, + "loss": 3.8521, + "loss/crossentropy": 1.8207709193229675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1902608647942543, + "step": 22722 + }, + { + "epoch": 0.45448, + "grad_norm": 1.84375, + "grad_norm_var": 0.007191721598307292, + "learning_rate": 0.0001, + "loss": 4.0834, + "loss/crossentropy": 2.094780683517456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20928193628787994, + "step": 22724 + }, + { + "epoch": 0.45452, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006791178385416667, + "learning_rate": 0.0001, + "loss": 3.8648, + "loss/crossentropy": 2.164665937423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19287051260471344, + "step": 22726 + }, + { + "epoch": 0.45456, + "grad_norm": 1.890625, + "grad_norm_var": 0.0062164306640625, + "learning_rate": 0.0001, + "loss": 3.9342, + "loss/crossentropy": 2.2440634965896606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18155057728290558, + "step": 22728 + }, + { + "epoch": 0.4546, + "grad_norm": 2.15625, + "grad_norm_var": 0.008190663655598958, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.012277126312256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760402619838715, + "step": 22730 + }, + { + "epoch": 0.45464, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01033935546875, + "learning_rate": 0.0001, + "loss": 4.0072, + "loss/crossentropy": 2.1217371225357056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944856494665146, + "step": 22732 + }, + { + "epoch": 0.45468, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010300445556640624, + "learning_rate": 0.0001, + "loss": 3.9806, + "loss/crossentropy": 2.3193390369415283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21992851048707962, + "step": 22734 + }, + { + "epoch": 0.45472, + "grad_norm": 2.09375, + "grad_norm_var": 0.012996164957682292, + "learning_rate": 0.0001, + "loss": 4.3389, + "loss/crossentropy": 2.520304322242737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.229971744120121, + "step": 22736 + }, + { + "epoch": 0.45476, + "grad_norm": 1.953125, + "grad_norm_var": 0.012898763020833334, + "learning_rate": 0.0001, + "loss": 4.0665, + "loss/crossentropy": 2.270485758781433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22266070544719696, + "step": 22738 + }, + { + "epoch": 0.4548, + "grad_norm": 1.96875, + "grad_norm_var": 0.011945597330729167, + "learning_rate": 0.0001, + "loss": 4.1006, + "loss/crossentropy": 2.046397566795349, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19661714136600494, + "step": 22740 + }, + { + "epoch": 0.45484, + "grad_norm": 1.828125, + "grad_norm_var": 0.012788645426432292, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 1.925121009349823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19327251613140106, + "step": 22742 + }, + { + "epoch": 0.45488, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01253662109375, + "learning_rate": 0.0001, + "loss": 4.0178, + "loss/crossentropy": 2.1238789558410645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18549078702926636, + "step": 22744 + }, + { + "epoch": 0.45492, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01082763671875, + "learning_rate": 0.0001, + "loss": 3.8122, + "loss/crossentropy": 1.8756769299507141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1634177565574646, + "step": 22746 + }, + { + "epoch": 0.45496, + "grad_norm": 2.171875, + "grad_norm_var": 0.012555948893229167, + "learning_rate": 0.0001, + "loss": 4.1419, + "loss/crossentropy": 2.017587959766388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187566876411438, + "step": 22748 + }, + { + "epoch": 0.455, + "grad_norm": 1.84375, + "grad_norm_var": 0.011735026041666667, + "learning_rate": 0.0001, + "loss": 3.8677, + "loss/crossentropy": 1.9583166241645813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20180843770503998, + "step": 22750 + }, + { + "epoch": 0.45504, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008259836832682292, + "learning_rate": 0.0001, + "loss": 3.8475, + "loss/crossentropy": 1.9483489990234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19634968787431717, + "step": 22752 + }, + { + "epoch": 0.45508, + "grad_norm": 1.890625, + "grad_norm_var": 0.008270009358723959, + "learning_rate": 0.0001, + "loss": 3.8734, + "loss/crossentropy": 1.727650761604309, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17284797132015228, + "step": 22754 + }, + { + "epoch": 0.45512, + "grad_norm": 1.765625, + "grad_norm_var": 0.009262847900390624, + "learning_rate": 0.0001, + "loss": 3.9471, + "loss/crossentropy": 2.306153416633606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535488426685333, + "step": 22756 + }, + { + "epoch": 0.45516, + "grad_norm": 1.90625, + "grad_norm_var": 0.007755279541015625, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 1.8752743601799011, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19387705624103546, + "step": 22758 + }, + { + "epoch": 0.4552, + "grad_norm": 1.953125, + "grad_norm_var": 0.007941691080729167, + "learning_rate": 0.0001, + "loss": 4.2708, + "loss/crossentropy": 2.4626048803329468, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21318693459033966, + "step": 22760 + }, + { + "epoch": 0.45524, + "grad_norm": 1.796875, + "grad_norm_var": 0.008503977457682292, + "learning_rate": 0.0001, + "loss": 3.7236, + "loss/crossentropy": 1.968520700931549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19532684981822968, + "step": 22762 + }, + { + "epoch": 0.45528, + "grad_norm": 2.015625, + "grad_norm_var": 0.0135162353515625, + "learning_rate": 0.0001, + "loss": 4.0531, + "loss/crossentropy": 1.75477135181427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17470379173755646, + "step": 22764 + }, + { + "epoch": 0.45532, + "grad_norm": 2.0, + "grad_norm_var": 0.017325846354166667, + "learning_rate": 0.0001, + "loss": 4.0837, + "loss/crossentropy": 2.125413417816162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21318642050027847, + "step": 22766 + }, + { + "epoch": 0.45536, + "grad_norm": 2.109375, + "grad_norm_var": 0.01915257771809896, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 2.2569944858551025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204448863863945, + "step": 22768 + }, + { + "epoch": 0.4554, + "grad_norm": 1.8828125, + "grad_norm_var": 0.018379720052083333, + "learning_rate": 0.0001, + "loss": 3.943, + "loss/crossentropy": 2.219534397125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20692889392375946, + "step": 22770 + }, + { + "epoch": 0.45544, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015372467041015626, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.1480907797813416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20739484578371048, + "step": 22772 + }, + { + "epoch": 0.45548, + "grad_norm": 1.8359375, + "grad_norm_var": 0.016379547119140626, + "learning_rate": 0.0001, + "loss": 3.8875, + "loss/crossentropy": 1.9118182063102722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17766325920820236, + "step": 22774 + }, + { + "epoch": 0.45552, + "grad_norm": 2.0, + "grad_norm_var": 0.018900299072265626, + "learning_rate": 0.0001, + "loss": 3.9723, + "loss/crossentropy": 2.0928712487220764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19869665801525116, + "step": 22776 + }, + { + "epoch": 0.45556, + "grad_norm": 2.21875, + "grad_norm_var": 0.022031402587890624, + "learning_rate": 0.0001, + "loss": 4.4284, + "loss/crossentropy": 2.1542623043060303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22383734583854675, + "step": 22778 + }, + { + "epoch": 0.4556, + "grad_norm": 1.9453125, + "grad_norm_var": 0.019327545166015626, + "learning_rate": 0.0001, + "loss": 4.0714, + "loss/crossentropy": 2.1210632920265198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19574230164289474, + "step": 22780 + }, + { + "epoch": 0.45564, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01756566365559896, + "learning_rate": 0.0001, + "loss": 3.8178, + "loss/crossentropy": 1.975026547908783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18365998566150665, + "step": 22782 + }, + { + "epoch": 0.45568, + "grad_norm": 1.765625, + "grad_norm_var": 0.017549641927083335, + "learning_rate": 0.0001, + "loss": 3.4904, + "loss/crossentropy": 1.5320480465888977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15212352573871613, + "step": 22784 + }, + { + "epoch": 0.45572, + "grad_norm": 1.953125, + "grad_norm_var": 0.01838963826497396, + "learning_rate": 0.0001, + "loss": 3.9335, + "loss/crossentropy": 1.8474896550178528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18486780673265457, + "step": 22786 + }, + { + "epoch": 0.45576, + "grad_norm": 1.9765625, + "grad_norm_var": 0.018553670247395834, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 1.9554376006126404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18804097175598145, + "step": 22788 + }, + { + "epoch": 0.4558, + "grad_norm": 2.0625, + "grad_norm_var": 0.018968709309895835, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 2.110785663127899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21371331810951233, + "step": 22790 + }, + { + "epoch": 0.45584, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016471099853515626, + "learning_rate": 0.0001, + "loss": 4.1691, + "loss/crossentropy": 2.100754976272583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21454931795597076, + "step": 22792 + }, + { + "epoch": 0.45588, + "grad_norm": 1.9375, + "grad_norm_var": 0.006845855712890625, + "learning_rate": 0.0001, + "loss": 4.0916, + "loss/crossentropy": 2.1975361704826355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2107257917523384, + "step": 22794 + }, + { + "epoch": 0.45592, + "grad_norm": 2.046875, + "grad_norm_var": 0.006961822509765625, + "learning_rate": 0.0001, + "loss": 4.1367, + "loss/crossentropy": 2.216245174407959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22839964926242828, + "step": 22796 + }, + { + "epoch": 0.45596, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007798004150390625, + "learning_rate": 0.0001, + "loss": 3.8334, + "loss/crossentropy": 1.9227718710899353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18401919305324554, + "step": 22798 + }, + { + "epoch": 0.456, + "grad_norm": 2.078125, + "grad_norm_var": 0.007348378499348958, + "learning_rate": 0.0001, + "loss": 3.6818, + "loss/crossentropy": 1.916443407535553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20578119158744812, + "step": 22800 + }, + { + "epoch": 0.45604, + "grad_norm": 2.046875, + "grad_norm_var": 0.006947580973307292, + "learning_rate": 0.0001, + "loss": 3.9813, + "loss/crossentropy": 1.9856197834014893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20684893429279327, + "step": 22802 + }, + { + "epoch": 0.45608, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00740966796875, + "learning_rate": 0.0001, + "loss": 4.2125, + "loss/crossentropy": 2.284825623035431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19741405546665192, + "step": 22804 + }, + { + "epoch": 0.45612, + "grad_norm": 1.859375, + "grad_norm_var": 0.00848388671875, + "learning_rate": 0.0001, + "loss": 3.9399, + "loss/crossentropy": 2.28286874294281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18636887520551682, + "step": 22806 + }, + { + "epoch": 0.45616, + "grad_norm": 1.859375, + "grad_norm_var": 0.008121490478515625, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 2.1754974722862244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18408793210983276, + "step": 22808 + }, + { + "epoch": 0.4562, + "grad_norm": 2.15625, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 3.9584, + "loss/crossentropy": 1.9059696793556213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17439302802085876, + "step": 22810 + }, + { + "epoch": 0.45624, + "grad_norm": 2.015625, + "grad_norm_var": 0.011075846354166667, + "learning_rate": 0.0001, + "loss": 4.0839, + "loss/crossentropy": 2.2676509618759155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20085486769676208, + "step": 22812 + }, + { + "epoch": 0.45628, + "grad_norm": 1.953125, + "grad_norm_var": 0.010788726806640624, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 2.1320562958717346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19100098311901093, + "step": 22814 + }, + { + "epoch": 0.45632, + "grad_norm": 2.109375, + "grad_norm_var": 0.0119293212890625, + "learning_rate": 0.0001, + "loss": 4.2211, + "loss/crossentropy": 2.1827250719070435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999506950378418, + "step": 22816 + }, + { + "epoch": 0.45636, + "grad_norm": 1.921875, + "grad_norm_var": 0.01083984375, + "learning_rate": 0.0001, + "loss": 3.7179, + "loss/crossentropy": 1.861267626285553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19007404148578644, + "step": 22818 + }, + { + "epoch": 0.4564, + "grad_norm": 2.1875, + "grad_norm_var": 0.014749908447265625, + "learning_rate": 0.0001, + "loss": 4.0594, + "loss/crossentropy": 2.0799012184143066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22177287936210632, + "step": 22820 + }, + { + "epoch": 0.45644, + "grad_norm": 2.125, + "grad_norm_var": 0.013742828369140625, + "learning_rate": 0.0001, + "loss": 4.2555, + "loss/crossentropy": 1.847994089126587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17684108018875122, + "step": 22822 + }, + { + "epoch": 0.45648, + "grad_norm": 1.90625, + "grad_norm_var": 0.011685943603515625, + "learning_rate": 0.0001, + "loss": 4.17, + "loss/crossentropy": 1.8111347556114197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1796310842037201, + "step": 22824 + }, + { + "epoch": 0.45652, + "grad_norm": 1.8125, + "grad_norm_var": 0.011971028645833333, + "learning_rate": 0.0001, + "loss": 3.7134, + "loss/crossentropy": 2.038852334022522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20186404883861542, + "step": 22826 + }, + { + "epoch": 0.45656, + "grad_norm": 2.078125, + "grad_norm_var": 0.09888509114583334, + "learning_rate": 0.0001, + "loss": 3.6857, + "loss/crossentropy": 1.6407727003097534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1637558490037918, + "step": 22828 + }, + { + "epoch": 0.4566, + "grad_norm": 2.046875, + "grad_norm_var": 0.09705174763997396, + "learning_rate": 0.0001, + "loss": 4.2396, + "loss/crossentropy": 2.240299344062805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22101856023073196, + "step": 22830 + }, + { + "epoch": 0.45664, + "grad_norm": 1.9140625, + "grad_norm_var": 0.10135269165039062, + "learning_rate": 0.0001, + "loss": 3.744, + "loss/crossentropy": 2.0920748114585876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953132450580597, + "step": 22832 + }, + { + "epoch": 0.45668, + "grad_norm": 2.015625, + "grad_norm_var": 0.09823404947916667, + "learning_rate": 0.0001, + "loss": 4.4837, + "loss/crossentropy": 2.5458264350891113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22436098754405975, + "step": 22834 + }, + { + "epoch": 0.45672, + "grad_norm": 1.9375, + "grad_norm_var": 0.1007232666015625, + "learning_rate": 0.0001, + "loss": 4.228, + "loss/crossentropy": 2.2389872074127197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19549836963415146, + "step": 22836 + }, + { + "epoch": 0.45676, + "grad_norm": 1.8828125, + "grad_norm_var": 0.10216852823893229, + "learning_rate": 0.0001, + "loss": 4.0639, + "loss/crossentropy": 1.943382740020752, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17616666853427887, + "step": 22838 + }, + { + "epoch": 0.4568, + "grad_norm": 1.7578125, + "grad_norm_var": 0.10617574055989583, + "learning_rate": 0.0001, + "loss": 3.8807, + "loss/crossentropy": 2.3038381338119507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248196810483932, + "step": 22840 + }, + { + "epoch": 0.45684, + "grad_norm": 1.7890625, + "grad_norm_var": 0.10474446614583334, + "learning_rate": 0.0001, + "loss": 4.0004, + "loss/crossentropy": 2.1889474391937256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21692103147506714, + "step": 22842 + }, + { + "epoch": 0.45688, + "grad_norm": 1.8125, + "grad_norm_var": 0.010959625244140625, + "learning_rate": 0.0001, + "loss": 3.8093, + "loss/crossentropy": 1.7324257493019104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752893403172493, + "step": 22844 + }, + { + "epoch": 0.45692, + "grad_norm": 1.953125, + "grad_norm_var": 0.009358469645182292, + "learning_rate": 0.0001, + "loss": 3.7886, + "loss/crossentropy": 1.5598450899124146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1739826500415802, + "step": 22846 + }, + { + "epoch": 0.45696, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012699381510416666, + "learning_rate": 0.0001, + "loss": 4.1388, + "loss/crossentropy": 2.629135251045227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23805684596300125, + "step": 22848 + }, + { + "epoch": 0.457, + "grad_norm": 2.03125, + "grad_norm_var": 0.0129058837890625, + "learning_rate": 0.0001, + "loss": 4.0862, + "loss/crossentropy": 2.2966033220291138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20307788252830505, + "step": 22850 + }, + { + "epoch": 0.45704, + "grad_norm": 2.109375, + "grad_norm_var": 0.015372467041015626, + "learning_rate": 0.0001, + "loss": 3.8963, + "loss/crossentropy": 2.111830711364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.220965214073658, + "step": 22852 + }, + { + "epoch": 0.45708, + "grad_norm": 2.078125, + "grad_norm_var": 0.016657511393229168, + "learning_rate": 0.0001, + "loss": 3.8466, + "loss/crossentropy": 1.9285584688186646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810559406876564, + "step": 22854 + }, + { + "epoch": 0.45712, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014798990885416667, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.059799551963806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19754227995872498, + "step": 22856 + }, + { + "epoch": 0.45716, + "grad_norm": 1.96875, + "grad_norm_var": 0.013330078125, + "learning_rate": 0.0001, + "loss": 4.1501, + "loss/crossentropy": 2.219936490058899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20215890556573868, + "step": 22858 + }, + { + "epoch": 0.4572, + "grad_norm": 2.078125, + "grad_norm_var": 0.015541330973307291, + "learning_rate": 0.0001, + "loss": 3.6917, + "loss/crossentropy": 1.639923632144928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15062974393367767, + "step": 22860 + }, + { + "epoch": 0.45724, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013331858317057292, + "learning_rate": 0.0001, + "loss": 3.9225, + "loss/crossentropy": 2.038732647895813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19826284795999527, + "step": 22862 + }, + { + "epoch": 0.45728, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012969716389973959, + "learning_rate": 0.0001, + "loss": 3.7399, + "loss/crossentropy": 1.9492093920707703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18503065407276154, + "step": 22864 + }, + { + "epoch": 0.45732, + "grad_norm": 1.890625, + "grad_norm_var": 0.011061350504557291, + "learning_rate": 0.0001, + "loss": 3.8676, + "loss/crossentropy": 2.1745853424072266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20653871446847916, + "step": 22866 + }, + { + "epoch": 0.45736, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008536529541015626, + "learning_rate": 0.0001, + "loss": 4.1377, + "loss/crossentropy": 2.3043514490127563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20302630215883255, + "step": 22868 + }, + { + "epoch": 0.4574, + "grad_norm": 1.953125, + "grad_norm_var": 0.006730143229166667, + "learning_rate": 0.0001, + "loss": 4.0241, + "loss/crossentropy": 2.032300651073456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972976177930832, + "step": 22870 + }, + { + "epoch": 0.45744, + "grad_norm": 1.75, + "grad_norm_var": 0.007783762613932292, + "learning_rate": 0.0001, + "loss": 3.9334, + "loss/crossentropy": 1.7591362595558167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877521052956581, + "step": 22872 + }, + { + "epoch": 0.45748, + "grad_norm": 2.109375, + "grad_norm_var": 0.010179646809895833, + "learning_rate": 0.0001, + "loss": 3.9741, + "loss/crossentropy": 1.8232863545417786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19946040213108063, + "step": 22874 + }, + { + "epoch": 0.45752, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007627105712890625, + "learning_rate": 0.0001, + "loss": 4.1899, + "loss/crossentropy": 2.3904630541801453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21164552122354507, + "step": 22876 + }, + { + "epoch": 0.45756, + "grad_norm": 1.9375, + "grad_norm_var": 0.007377115885416666, + "learning_rate": 0.0001, + "loss": 3.9767, + "loss/crossentropy": 2.0109705328941345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19774679094552994, + "step": 22878 + }, + { + "epoch": 0.4576, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006381988525390625, + "learning_rate": 0.0001, + "loss": 3.9764, + "loss/crossentropy": 2.049817442893982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18809376657009125, + "step": 22880 + }, + { + "epoch": 0.45764, + "grad_norm": 2.25, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 3.9405, + "loss/crossentropy": 1.9604635834693909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21582353115081787, + "step": 22882 + }, + { + "epoch": 0.45768, + "grad_norm": 2.046875, + "grad_norm_var": 0.021355946858723957, + "learning_rate": 0.0001, + "loss": 4.2544, + "loss/crossentropy": 2.0224910378456116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21511054039001465, + "step": 22884 + }, + { + "epoch": 0.45772, + "grad_norm": 2.046875, + "grad_norm_var": 0.022191365559895832, + "learning_rate": 0.0001, + "loss": 4.1981, + "loss/crossentropy": 2.1743668913841248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20319290459156036, + "step": 22886 + }, + { + "epoch": 0.45776, + "grad_norm": 1.9765625, + "grad_norm_var": 0.017160797119140626, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.0692490339279175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21043430268764496, + "step": 22888 + }, + { + "epoch": 0.4578, + "grad_norm": 1.9375, + "grad_norm_var": 0.01715672810872396, + "learning_rate": 0.0001, + "loss": 3.9975, + "loss/crossentropy": 2.2976402044296265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21817487478256226, + "step": 22890 + }, + { + "epoch": 0.45784, + "grad_norm": 1.828125, + "grad_norm_var": 0.0191162109375, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 1.9541950225830078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726100593805313, + "step": 22892 + }, + { + "epoch": 0.45788, + "grad_norm": 1.7421875, + "grad_norm_var": 0.022997029622395835, + "learning_rate": 0.0001, + "loss": 3.9245, + "loss/crossentropy": 1.8908615112304688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17845968157052994, + "step": 22894 + }, + { + "epoch": 0.45792, + "grad_norm": 1.8828125, + "grad_norm_var": 0.023522694905598957, + "learning_rate": 0.0001, + "loss": 4.1408, + "loss/crossentropy": 2.0752804279327393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17747191339731216, + "step": 22896 + }, + { + "epoch": 0.45796, + "grad_norm": 1.9375, + "grad_norm_var": 0.02047119140625, + "learning_rate": 0.0001, + "loss": 3.6761, + "loss/crossentropy": 1.7240750789642334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17918948084115982, + "step": 22898 + }, + { + "epoch": 0.458, + "grad_norm": 1.953125, + "grad_norm_var": 0.011519114176432291, + "learning_rate": 0.0001, + "loss": 4.0632, + "loss/crossentropy": 2.208250343799591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937680020928383, + "step": 22900 + }, + { + "epoch": 0.45804, + "grad_norm": 1.90625, + "grad_norm_var": 0.008478800455729166, + "learning_rate": 0.0001, + "loss": 4.1283, + "loss/crossentropy": 2.076810359954834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18243352323770523, + "step": 22902 + }, + { + "epoch": 0.45808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 4.2553, + "loss/crossentropy": 1.929943025112152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19618961960077286, + "step": 22904 + }, + { + "epoch": 0.45812, + "grad_norm": 1.984375, + "grad_norm_var": 0.0069976806640625, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.186367392539978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19272585958242416, + "step": 22906 + }, + { + "epoch": 0.45816, + "grad_norm": 1.890625, + "grad_norm_var": 0.0063385009765625, + "learning_rate": 0.0001, + "loss": 4.1156, + "loss/crossentropy": 2.1537517309188843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1961517035961151, + "step": 22908 + }, + { + "epoch": 0.4582, + "grad_norm": 2.0625, + "grad_norm_var": 0.004325103759765625, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 2.0141521096229553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909116804599762, + "step": 22910 + }, + { + "epoch": 0.45824, + "grad_norm": 1.90625, + "grad_norm_var": 0.004351552327473958, + "learning_rate": 0.0001, + "loss": 4.1536, + "loss/crossentropy": 1.9720463752746582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18549032509326935, + "step": 22912 + }, + { + "epoch": 0.45828, + "grad_norm": 1.90625, + "grad_norm_var": 0.002243804931640625, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 1.6521074771881104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172331303358078, + "step": 22914 + }, + { + "epoch": 0.45832, + "grad_norm": 1.78125, + "grad_norm_var": 0.013109334309895833, + "learning_rate": 0.0001, + "loss": 3.8738, + "loss/crossentropy": 2.1382288932800293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402942061424255, + "step": 22916 + }, + { + "epoch": 0.45836, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013688151041666667, + "learning_rate": 0.0001, + "loss": 4.0196, + "loss/crossentropy": 2.348258137702942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21549250930547714, + "step": 22918 + }, + { + "epoch": 0.4584, + "grad_norm": 2.296875, + "grad_norm_var": 0.02235107421875, + "learning_rate": 0.0001, + "loss": 4.0723, + "loss/crossentropy": 1.8907782435417175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18751342594623566, + "step": 22920 + }, + { + "epoch": 0.45844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0222808837890625, + "learning_rate": 0.0001, + "loss": 4.0556, + "loss/crossentropy": 2.189586043357849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954984813928604, + "step": 22922 + }, + { + "epoch": 0.45848, + "grad_norm": 1.984375, + "grad_norm_var": 0.021491495768229167, + "learning_rate": 0.0001, + "loss": 3.8171, + "loss/crossentropy": 1.8710416555404663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1760418340563774, + "step": 22924 + }, + { + "epoch": 0.45852, + "grad_norm": 1.90625, + "grad_norm_var": 0.0213775634765625, + "learning_rate": 0.0001, + "loss": 4.4274, + "loss/crossentropy": 2.3559741973876953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21578038483858109, + "step": 22926 + }, + { + "epoch": 0.45856, + "grad_norm": 1.9609375, + "grad_norm_var": 0.022739410400390625, + "learning_rate": 0.0001, + "loss": 3.7592, + "loss/crossentropy": 1.8588098883628845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18936273455619812, + "step": 22928 + }, + { + "epoch": 0.4586, + "grad_norm": 1.78125, + "grad_norm_var": 0.024544270833333333, + "learning_rate": 0.0001, + "loss": 4.0378, + "loss/crossentropy": 2.039194941520691, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917971894145012, + "step": 22930 + }, + { + "epoch": 0.45864, + "grad_norm": 1.828125, + "grad_norm_var": 0.016739654541015624, + "learning_rate": 0.0001, + "loss": 3.7675, + "loss/crossentropy": 1.7586663365364075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16597917675971985, + "step": 22932 + }, + { + "epoch": 0.45868, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01702880859375, + "learning_rate": 0.0001, + "loss": 3.7682, + "loss/crossentropy": 2.086498737335205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21375593543052673, + "step": 22934 + }, + { + "epoch": 0.45872, + "grad_norm": 2.0, + "grad_norm_var": 0.007853190104166666, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.14085054397583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136429250240326, + "step": 22936 + }, + { + "epoch": 0.45876, + "grad_norm": 2.671875, + "grad_norm_var": 0.04417724609375, + "learning_rate": 0.0001, + "loss": 4.064, + "loss/crossentropy": 1.7813395261764526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23206777125597, + "step": 22938 + }, + { + "epoch": 0.4588, + "grad_norm": 1.9140625, + "grad_norm_var": 0.04527587890625, + "learning_rate": 0.0001, + "loss": 3.8535, + "loss/crossentropy": 2.267697334289551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19420506060123444, + "step": 22940 + }, + { + "epoch": 0.45884, + "grad_norm": 2.015625, + "grad_norm_var": 0.045328776041666664, + "learning_rate": 0.0001, + "loss": 4.1133, + "loss/crossentropy": 1.9011476039886475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20060188323259354, + "step": 22942 + }, + { + "epoch": 0.45888, + "grad_norm": 1.9609375, + "grad_norm_var": 0.043822987874348955, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 2.0690804719924927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20410669595003128, + "step": 22944 + }, + { + "epoch": 0.45892, + "grad_norm": 2.03125, + "grad_norm_var": 0.0429443359375, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.05231511592865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200359046459198, + "step": 22946 + }, + { + "epoch": 0.45896, + "grad_norm": 1.921875, + "grad_norm_var": 0.04160741170247396, + "learning_rate": 0.0001, + "loss": 4.2998, + "loss/crossentropy": 2.167905569076538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20565193891525269, + "step": 22948 + }, + { + "epoch": 0.459, + "grad_norm": 1.921875, + "grad_norm_var": 0.041290028889973955, + "learning_rate": 0.0001, + "loss": 3.911, + "loss/crossentropy": 1.8297042846679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19305560737848282, + "step": 22950 + }, + { + "epoch": 0.45904, + "grad_norm": 2.03125, + "grad_norm_var": 0.042335764567057295, + "learning_rate": 0.0001, + "loss": 4.2231, + "loss/crossentropy": 2.1130523681640625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063598930835724, + "step": 22952 + }, + { + "epoch": 0.45908, + "grad_norm": 2.0, + "grad_norm_var": 0.011889394124348958, + "learning_rate": 0.0001, + "loss": 4.0074, + "loss/crossentropy": 1.8652898669242859, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195117056369781, + "step": 22954 + }, + { + "epoch": 0.45912, + "grad_norm": 2.078125, + "grad_norm_var": 0.009948476155598959, + "learning_rate": 0.0001, + "loss": 4.2163, + "loss/crossentropy": 2.283188223838806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21027832478284836, + "step": 22956 + }, + { + "epoch": 0.45916, + "grad_norm": 2.140625, + "grad_norm_var": 0.010990397135416666, + "learning_rate": 0.0001, + "loss": 4.1242, + "loss/crossentropy": 1.9148901104927063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20047999173402786, + "step": 22958 + }, + { + "epoch": 0.4592, + "grad_norm": 1.8203125, + "grad_norm_var": 0.013936360677083334, + "learning_rate": 0.0001, + "loss": 4.1457, + "loss/crossentropy": 2.0488327741622925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18601303547620773, + "step": 22960 + }, + { + "epoch": 0.45924, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014810943603515625, + "learning_rate": 0.0001, + "loss": 4.0482, + "loss/crossentropy": 2.316429853439331, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18690580874681473, + "step": 22962 + }, + { + "epoch": 0.45928, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013260650634765624, + "learning_rate": 0.0001, + "loss": 3.8958, + "loss/crossentropy": 2.2469218373298645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983572617173195, + "step": 22964 + }, + { + "epoch": 0.45932, + "grad_norm": 1.828125, + "grad_norm_var": 0.014964803059895834, + "learning_rate": 0.0001, + "loss": 3.7772, + "loss/crossentropy": 1.988997757434845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17721036076545715, + "step": 22966 + }, + { + "epoch": 0.45936, + "grad_norm": 1.8203125, + "grad_norm_var": 0.016259511311848957, + "learning_rate": 0.0001, + "loss": 3.6331, + "loss/crossentropy": 2.0831198692321777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20350365340709686, + "step": 22968 + }, + { + "epoch": 0.4594, + "grad_norm": 1.875, + "grad_norm_var": 0.016599273681640624, + "learning_rate": 0.0001, + "loss": 4.1053, + "loss/crossentropy": 1.9105250239372253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24999283254146576, + "step": 22970 + }, + { + "epoch": 0.45944, + "grad_norm": 1.859375, + "grad_norm_var": 0.014851633707682292, + "learning_rate": 0.0001, + "loss": 3.96, + "loss/crossentropy": 1.990349531173706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21390174329280853, + "step": 22972 + }, + { + "epoch": 0.45948, + "grad_norm": 1.8125, + "grad_norm_var": 0.011759440104166666, + "learning_rate": 0.0001, + "loss": 3.981, + "loss/crossentropy": 1.8016277551651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16551107168197632, + "step": 22974 + }, + { + "epoch": 0.45952, + "grad_norm": 1.921875, + "grad_norm_var": 0.008925120035807291, + "learning_rate": 0.0001, + "loss": 3.7665, + "loss/crossentropy": 2.0964725017547607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18953562527894974, + "step": 22976 + }, + { + "epoch": 0.45956, + "grad_norm": 1.90625, + "grad_norm_var": 0.008455149332682292, + "learning_rate": 0.0001, + "loss": 4.0136, + "loss/crossentropy": 1.8826870322227478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19081643223762512, + "step": 22978 + }, + { + "epoch": 0.4596, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009175618489583334, + "learning_rate": 0.0001, + "loss": 4.1068, + "loss/crossentropy": 2.3349568843841553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2133205682039261, + "step": 22980 + }, + { + "epoch": 0.45964, + "grad_norm": 2.0625, + "grad_norm_var": 0.009912109375, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 2.1611928939819336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18273824453353882, + "step": 22982 + }, + { + "epoch": 0.45968, + "grad_norm": 2.0625, + "grad_norm_var": 0.008969879150390625, + "learning_rate": 0.0001, + "loss": 4.1389, + "loss/crossentropy": 1.9757606387138367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463839918375015, + "step": 22984 + }, + { + "epoch": 0.45972, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008320871988932292, + "learning_rate": 0.0001, + "loss": 3.9488, + "loss/crossentropy": 2.075642466545105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19987941533327103, + "step": 22986 + }, + { + "epoch": 0.45976, + "grad_norm": 2.0, + "grad_norm_var": 0.009333292643229166, + "learning_rate": 0.0001, + "loss": 3.8588, + "loss/crossentropy": 1.805062711238861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16294736042618752, + "step": 22988 + }, + { + "epoch": 0.4598, + "grad_norm": 1.921875, + "grad_norm_var": 0.008907063802083334, + "learning_rate": 0.0001, + "loss": 3.9069, + "loss/crossentropy": 1.7681823372840881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16899758577346802, + "step": 22990 + }, + { + "epoch": 0.45984, + "grad_norm": 2.0625, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 2.2006375789642334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2165016457438469, + "step": 22992 + }, + { + "epoch": 0.45988, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009822591145833334, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 2.141044855117798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19580233097076416, + "step": 22994 + }, + { + "epoch": 0.45992, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007818349202473958, + "learning_rate": 0.0001, + "loss": 4.1228, + "loss/crossentropy": 1.8378351926803589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891382709145546, + "step": 22996 + }, + { + "epoch": 0.45996, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00626220703125, + "learning_rate": 0.0001, + "loss": 4.0411, + "loss/crossentropy": 1.9486631751060486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18766219913959503, + "step": 22998 + }, + { + "epoch": 0.46, + "grad_norm": 2.03125, + "grad_norm_var": 0.05397847493489583, + "learning_rate": 0.0001, + "loss": 4.0377, + "loss/crossentropy": 2.2083182334899902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19954963773488998, + "step": 23000 + }, + { + "epoch": 0.46004, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0544586181640625, + "learning_rate": 0.0001, + "loss": 3.8961, + "loss/crossentropy": 2.163831114768982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008308321237564, + "step": 23002 + }, + { + "epoch": 0.46008, + "grad_norm": 2.015625, + "grad_norm_var": 0.053138987223307295, + "learning_rate": 0.0001, + "loss": 4.1704, + "loss/crossentropy": 2.256836771965027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20936761796474457, + "step": 23004 + }, + { + "epoch": 0.46012, + "grad_norm": 2.03125, + "grad_norm_var": 0.051273345947265625, + "learning_rate": 0.0001, + "loss": 4.2633, + "loss/crossentropy": 2.2618257999420166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2167700231075287, + "step": 23006 + }, + { + "epoch": 0.46016, + "grad_norm": 2.734375, + "grad_norm_var": 0.08671875, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 1.7830750346183777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17645323276519775, + "step": 23008 + }, + { + "epoch": 0.4602, + "grad_norm": 2.046875, + "grad_norm_var": 0.08814264933268229, + "learning_rate": 0.0001, + "loss": 4.0131, + "loss/crossentropy": 2.027057111263275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191700220108032, + "step": 23010 + }, + { + "epoch": 0.46024, + "grad_norm": 1.875, + "grad_norm_var": 0.09068094889322917, + "learning_rate": 0.0001, + "loss": 3.9818, + "loss/crossentropy": 2.138069987297058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890278086066246, + "step": 23012 + }, + { + "epoch": 0.46028, + "grad_norm": 1.8359375, + "grad_norm_var": 0.09110107421875, + "learning_rate": 0.0001, + "loss": 3.928, + "loss/crossentropy": 1.8553322553634644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18419906497001648, + "step": 23014 + }, + { + "epoch": 0.46032, + "grad_norm": 1.78125, + "grad_norm_var": 0.049627431233723956, + "learning_rate": 0.0001, + "loss": 3.8015, + "loss/crossentropy": 2.029839515686035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17909887433052063, + "step": 23016 + }, + { + "epoch": 0.46036, + "grad_norm": 1.9140625, + "grad_norm_var": 0.048868815104166664, + "learning_rate": 0.0001, + "loss": 3.9726, + "loss/crossentropy": 2.2698041200637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202784962952137, + "step": 23018 + }, + { + "epoch": 0.4604, + "grad_norm": 2.203125, + "grad_norm_var": 0.05064264933268229, + "learning_rate": 0.0001, + "loss": 4.2589, + "loss/crossentropy": 2.1680142879486084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947874128818512, + "step": 23020 + }, + { + "epoch": 0.46044, + "grad_norm": 1.875, + "grad_norm_var": 0.05204976399739583, + "learning_rate": 0.0001, + "loss": 4.0885, + "loss/crossentropy": 2.1033952236175537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17692237347364426, + "step": 23022 + }, + { + "epoch": 0.46048, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0103912353515625, + "learning_rate": 0.0001, + "loss": 4.2974, + "loss/crossentropy": 2.3113337755203247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2220149040222168, + "step": 23024 + }, + { + "epoch": 0.46052, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009291330973307291, + "learning_rate": 0.0001, + "loss": 3.76, + "loss/crossentropy": 1.4061094522476196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16114307194948196, + "step": 23026 + }, + { + "epoch": 0.46056, + "grad_norm": 1.7890625, + "grad_norm_var": 0.010811106363932291, + "learning_rate": 0.0001, + "loss": 3.9877, + "loss/crossentropy": 2.14698326587677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20559164136648178, + "step": 23028 + }, + { + "epoch": 0.4606, + "grad_norm": 1.7734375, + "grad_norm_var": 0.011787668863932291, + "learning_rate": 0.0001, + "loss": 3.9314, + "loss/crossentropy": 2.255793571472168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18896941095590591, + "step": 23030 + }, + { + "epoch": 0.46064, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01043701171875, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.186546564102173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044816091656685, + "step": 23032 + }, + { + "epoch": 0.46068, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0108154296875, + "learning_rate": 0.0001, + "loss": 4.175, + "loss/crossentropy": 2.285028338432312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20862022787332535, + "step": 23034 + }, + { + "epoch": 0.46072, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0055908203125, + "learning_rate": 0.0001, + "loss": 4.3081, + "loss/crossentropy": 2.1346707344055176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030143365263939, + "step": 23036 + }, + { + "epoch": 0.46076, + "grad_norm": 1.96875, + "grad_norm_var": 0.0063168843587239586, + "learning_rate": 0.0001, + "loss": 4.043, + "loss/crossentropy": 1.8645261526107788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1957538053393364, + "step": 23038 + }, + { + "epoch": 0.4608, + "grad_norm": 1.84375, + "grad_norm_var": 0.006048329671223958, + "learning_rate": 0.0001, + "loss": 3.8239, + "loss/crossentropy": 1.9670527577400208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886385902762413, + "step": 23040 + }, + { + "epoch": 0.46084, + "grad_norm": 2.03125, + "grad_norm_var": 0.006644439697265625, + "learning_rate": 0.0001, + "loss": 3.9349, + "loss/crossentropy": 2.1273213624954224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952996477484703, + "step": 23042 + }, + { + "epoch": 0.46088, + "grad_norm": 1.921875, + "grad_norm_var": 0.0060546875, + "learning_rate": 0.0001, + "loss": 3.947, + "loss/crossentropy": 2.1550720930099487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20368672162294388, + "step": 23044 + }, + { + "epoch": 0.46092, + "grad_norm": 1.828125, + "grad_norm_var": 0.005830891927083333, + "learning_rate": 0.0001, + "loss": 4.054, + "loss/crossentropy": 2.0871312618255615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20684774219989777, + "step": 23046 + }, + { + "epoch": 0.46096, + "grad_norm": 1.859375, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 3.9013, + "loss/crossentropy": 2.1256298422813416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18924856930971146, + "step": 23048 + }, + { + "epoch": 0.461, + "grad_norm": 2.3125, + "grad_norm_var": 0.014422353108723958, + "learning_rate": 0.0001, + "loss": 4.3133, + "loss/crossentropy": 1.9934388399124146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2362150102853775, + "step": 23050 + }, + { + "epoch": 0.46104, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015478261311848958, + "learning_rate": 0.0001, + "loss": 4.0432, + "loss/crossentropy": 1.9070496559143066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198696069419384, + "step": 23052 + }, + { + "epoch": 0.46108, + "grad_norm": 1.890625, + "grad_norm_var": 0.016403961181640624, + "learning_rate": 0.0001, + "loss": 3.647, + "loss/crossentropy": 2.3180062770843506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19845713675022125, + "step": 23054 + }, + { + "epoch": 0.46112, + "grad_norm": 2.34375, + "grad_norm_var": 3.4797027587890623, + "learning_rate": 0.0001, + "loss": 3.9427, + "loss/crossentropy": 1.5877657532691956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17085347324609756, + "step": 23056 + }, + { + "epoch": 0.46116, + "grad_norm": 2.4375, + "grad_norm_var": 3.4481992085774738, + "learning_rate": 0.0001, + "loss": 4.1123, + "loss/crossentropy": 1.9232125282287598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18383993953466415, + "step": 23058 + }, + { + "epoch": 0.4612, + "grad_norm": 1.9765625, + "grad_norm_var": 3.442277018229167, + "learning_rate": 0.0001, + "loss": 4.0018, + "loss/crossentropy": 1.7219146490097046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18319600075483322, + "step": 23060 + }, + { + "epoch": 0.46124, + "grad_norm": 2.0625, + "grad_norm_var": 3.425248209635417, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 2.274755835533142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20030079036951065, + "step": 23062 + }, + { + "epoch": 0.46128, + "grad_norm": 2.015625, + "grad_norm_var": 3.420116170247396, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 2.1552868485450745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475380331277847, + "step": 23064 + }, + { + "epoch": 0.46132, + "grad_norm": 1.9140625, + "grad_norm_var": 3.4470842997233073, + "learning_rate": 0.0001, + "loss": 3.7861, + "loss/crossentropy": 1.7763307094573975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16649360209703445, + "step": 23066 + }, + { + "epoch": 0.46136, + "grad_norm": 2.0, + "grad_norm_var": 3.4487945556640627, + "learning_rate": 0.0001, + "loss": 4.1083, + "loss/crossentropy": 1.859923779964447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17723160982131958, + "step": 23068 + }, + { + "epoch": 0.4614, + "grad_norm": 1.9453125, + "grad_norm_var": 3.4543690999348957, + "learning_rate": 0.0001, + "loss": 3.9948, + "loss/crossentropy": 1.9055342078208923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18139074742794037, + "step": 23070 + }, + { + "epoch": 0.46144, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02443415323893229, + "learning_rate": 0.0001, + "loss": 4.1256, + "loss/crossentropy": 2.2504982948303223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21642805635929108, + "step": 23072 + }, + { + "epoch": 0.46148, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007469685872395834, + "learning_rate": 0.0001, + "loss": 4.2309, + "loss/crossentropy": 2.142970085144043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985180348157883, + "step": 23074 + }, + { + "epoch": 0.46152, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0076253255208333336, + "learning_rate": 0.0001, + "loss": 3.836, + "loss/crossentropy": 2.0464539527893066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20277106016874313, + "step": 23076 + }, + { + "epoch": 0.46156, + "grad_norm": 2.03125, + "grad_norm_var": 0.007741038004557292, + "learning_rate": 0.0001, + "loss": 3.9323, + "loss/crossentropy": 1.9818042516708374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055400162935257, + "step": 23078 + }, + { + "epoch": 0.4616, + "grad_norm": 1.984375, + "grad_norm_var": 0.0063555399576822914, + "learning_rate": 0.0001, + "loss": 3.9854, + "loss/crossentropy": 2.245171070098877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21404191851615906, + "step": 23080 + }, + { + "epoch": 0.46164, + "grad_norm": 2.0, + "grad_norm_var": 0.009336090087890625, + "learning_rate": 0.0001, + "loss": 3.8036, + "loss/crossentropy": 1.8196159601211548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19405725598335266, + "step": 23082 + }, + { + "epoch": 0.46168, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0571197509765625, + "learning_rate": 0.0001, + "loss": 4.4681, + "loss/crossentropy": 2.2700140476226807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2196115106344223, + "step": 23084 + }, + { + "epoch": 0.46172, + "grad_norm": 1.859375, + "grad_norm_var": 0.05880940755208333, + "learning_rate": 0.0001, + "loss": 3.7722, + "loss/crossentropy": 1.8162717819213867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19593419879674911, + "step": 23086 + }, + { + "epoch": 0.46176, + "grad_norm": 1.9375, + "grad_norm_var": 0.057920074462890624, + "learning_rate": 0.0001, + "loss": 3.8489, + "loss/crossentropy": 2.287248969078064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079424038529396, + "step": 23088 + }, + { + "epoch": 0.4618, + "grad_norm": 1.953125, + "grad_norm_var": 0.05727310180664062, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 2.004701316356659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19984427094459534, + "step": 23090 + }, + { + "epoch": 0.46184, + "grad_norm": 1.953125, + "grad_norm_var": 0.05663655598958333, + "learning_rate": 0.0001, + "loss": 4.3167, + "loss/crossentropy": 2.185683250427246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210247203707695, + "step": 23092 + }, + { + "epoch": 0.46188, + "grad_norm": 1.8359375, + "grad_norm_var": 0.055425771077473956, + "learning_rate": 0.0001, + "loss": 4.0199, + "loss/crossentropy": 2.050000488758087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19505897909402847, + "step": 23094 + }, + { + "epoch": 0.46192, + "grad_norm": 1.953125, + "grad_norm_var": 0.056703440348307294, + "learning_rate": 0.0001, + "loss": 3.9138, + "loss/crossentropy": 2.0964688062667847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19070538878440857, + "step": 23096 + }, + { + "epoch": 0.46196, + "grad_norm": 1.953125, + "grad_norm_var": 0.05205256144205729, + "learning_rate": 0.0001, + "loss": 4.1354, + "loss/crossentropy": 2.02596253156662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22631053626537323, + "step": 23098 + }, + { + "epoch": 0.462, + "grad_norm": 1.9296875, + "grad_norm_var": 0.004951985677083334, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 1.8759450912475586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18205147981643677, + "step": 23100 + }, + { + "epoch": 0.46204, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004231516520182292, + "learning_rate": 0.0001, + "loss": 3.8365, + "loss/crossentropy": 1.9460791945457458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17607779800891876, + "step": 23102 + }, + { + "epoch": 0.46208, + "grad_norm": 2.265625, + "grad_norm_var": 0.011031087239583333, + "learning_rate": 0.0001, + "loss": 3.9159, + "loss/crossentropy": 1.5419431328773499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15973693877458572, + "step": 23104 + }, + { + "epoch": 0.46212, + "grad_norm": 2.171875, + "grad_norm_var": 0.014019521077473958, + "learning_rate": 0.0001, + "loss": 4.2898, + "loss/crossentropy": 2.13347589969635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21076981723308563, + "step": 23106 + }, + { + "epoch": 0.46216, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013948567708333333, + "learning_rate": 0.0001, + "loss": 4.2812, + "loss/crossentropy": 2.199908971786499, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933242455124855, + "step": 23108 + }, + { + "epoch": 0.4622, + "grad_norm": 1.890625, + "grad_norm_var": 0.013425445556640625, + "learning_rate": 0.0001, + "loss": 3.8662, + "loss/crossentropy": 2.0715879797935486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933552771806717, + "step": 23110 + }, + { + "epoch": 0.46224, + "grad_norm": 2.0, + "grad_norm_var": 0.0129302978515625, + "learning_rate": 0.0001, + "loss": 3.9125, + "loss/crossentropy": 1.8359448909759521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17814192920923233, + "step": 23112 + }, + { + "epoch": 0.46228, + "grad_norm": 2.140625, + "grad_norm_var": 0.014987945556640625, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 1.9806716442108154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18513696640729904, + "step": 23114 + }, + { + "epoch": 0.46232, + "grad_norm": 1.9375, + "grad_norm_var": 0.014725494384765624, + "learning_rate": 0.0001, + "loss": 3.9427, + "loss/crossentropy": 1.647410809993744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.177697092294693, + "step": 23116 + }, + { + "epoch": 0.46236, + "grad_norm": 1.734375, + "grad_norm_var": 0.01885986328125, + "learning_rate": 0.0001, + "loss": 3.7102, + "loss/crossentropy": 2.05259370803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241985887289047, + "step": 23118 + }, + { + "epoch": 0.4624, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013874308268229166, + "learning_rate": 0.0001, + "loss": 3.9863, + "loss/crossentropy": 2.229410469532013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21042390167713165, + "step": 23120 + }, + { + "epoch": 0.46244, + "grad_norm": 1.875, + "grad_norm_var": 0.010163370768229167, + "learning_rate": 0.0001, + "loss": 3.7858, + "loss/crossentropy": 1.6477417945861816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15472649037837982, + "step": 23122 + }, + { + "epoch": 0.46248, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009089914957682292, + "learning_rate": 0.0001, + "loss": 4.1175, + "loss/crossentropy": 2.0088080167770386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18309780955314636, + "step": 23124 + }, + { + "epoch": 0.46252, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009129842122395834, + "learning_rate": 0.0001, + "loss": 3.8097, + "loss/crossentropy": 1.8413895964622498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828669160604477, + "step": 23126 + }, + { + "epoch": 0.46256, + "grad_norm": 1.8125, + "grad_norm_var": 0.009565989176432291, + "learning_rate": 0.0001, + "loss": 3.8763, + "loss/crossentropy": 2.0812554955482483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836593672633171, + "step": 23128 + }, + { + "epoch": 0.4626, + "grad_norm": 1.9375, + "grad_norm_var": 0.006224568684895833, + "learning_rate": 0.0001, + "loss": 4.1576, + "loss/crossentropy": 2.269462764263153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22045638412237167, + "step": 23130 + }, + { + "epoch": 0.46264, + "grad_norm": 2.09375, + "grad_norm_var": 0.008898671468098958, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 2.1176013946533203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19339244812726974, + "step": 23132 + }, + { + "epoch": 0.46268, + "grad_norm": 2.078125, + "grad_norm_var": 0.008983357747395834, + "learning_rate": 0.0001, + "loss": 3.889, + "loss/crossentropy": 1.8236181735992432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18049708008766174, + "step": 23134 + }, + { + "epoch": 0.46272, + "grad_norm": 1.90625, + "grad_norm_var": 0.007608795166015625, + "learning_rate": 0.0001, + "loss": 4.163, + "loss/crossentropy": 1.7525643706321716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17853398621082306, + "step": 23136 + }, + { + "epoch": 0.46276, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006135050455729167, + "learning_rate": 0.0001, + "loss": 4.3239, + "loss/crossentropy": 2.2958520650863647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21427057683467865, + "step": 23138 + }, + { + "epoch": 0.4628, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009016927083333333, + "learning_rate": 0.0001, + "loss": 3.6256, + "loss/crossentropy": 1.8358682990074158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18813011050224304, + "step": 23140 + }, + { + "epoch": 0.46284, + "grad_norm": 2.09375, + "grad_norm_var": 0.010667928059895833, + "learning_rate": 0.0001, + "loss": 4.3444, + "loss/crossentropy": 2.4191941022872925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2317579984664917, + "step": 23142 + }, + { + "epoch": 0.46288, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012452952067057292, + "learning_rate": 0.0001, + "loss": 3.9152, + "loss/crossentropy": 2.2656983137130737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20870012044906616, + "step": 23144 + }, + { + "epoch": 0.46292, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014347076416015625, + "learning_rate": 0.0001, + "loss": 4.0893, + "loss/crossentropy": 2.2222647666931152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072836235165596, + "step": 23146 + }, + { + "epoch": 0.46296, + "grad_norm": 2.015625, + "grad_norm_var": 0.012410481770833334, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 2.001325011253357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486044138669968, + "step": 23148 + }, + { + "epoch": 0.463, + "grad_norm": 1.84375, + "grad_norm_var": 0.013061269124348959, + "learning_rate": 0.0001, + "loss": 4.0521, + "loss/crossentropy": 2.3565926551818848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19294436275959015, + "step": 23150 + }, + { + "epoch": 0.46304, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0134033203125, + "learning_rate": 0.0001, + "loss": 3.9513, + "loss/crossentropy": 1.9006416201591492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20511633157730103, + "step": 23152 + }, + { + "epoch": 0.46308, + "grad_norm": 1.8203125, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 3.761, + "loss/crossentropy": 2.017968237400055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877719983458519, + "step": 23154 + }, + { + "epoch": 0.46312, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012833658854166667, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.093901038169861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863752081990242, + "step": 23156 + }, + { + "epoch": 0.46316, + "grad_norm": 2.015625, + "grad_norm_var": 0.011690266927083333, + "learning_rate": 0.0001, + "loss": 4.1145, + "loss/crossentropy": 2.3631211519241333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2350725531578064, + "step": 23158 + }, + { + "epoch": 0.4632, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007892862955729166, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.2482752799987793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18792778253555298, + "step": 23160 + }, + { + "epoch": 0.46324, + "grad_norm": 1.90625, + "grad_norm_var": 0.004115549723307291, + "learning_rate": 0.0001, + "loss": 4.0843, + "loss/crossentropy": 2.462800145149231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21113361418247223, + "step": 23162 + }, + { + "epoch": 0.46328, + "grad_norm": 1.9140625, + "grad_norm_var": 0.003639475504557292, + "learning_rate": 0.0001, + "loss": 4.1277, + "loss/crossentropy": 2.2522822618484497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163240611553192, + "step": 23164 + }, + { + "epoch": 0.46332, + "grad_norm": 2.3125, + "grad_norm_var": 0.013444010416666667, + "learning_rate": 0.0001, + "loss": 3.9615, + "loss/crossentropy": 2.0585132837295532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20750011503696442, + "step": 23166 + }, + { + "epoch": 0.46336, + "grad_norm": 2.09375, + "grad_norm_var": 0.018357086181640624, + "learning_rate": 0.0001, + "loss": 4.0761, + "loss/crossentropy": 2.0086065530776978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21273234486579895, + "step": 23168 + }, + { + "epoch": 0.4634, + "grad_norm": 2.125, + "grad_norm_var": 0.017040761311848958, + "learning_rate": 0.0001, + "loss": 3.7175, + "loss/crossentropy": 1.8222439289093018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188787043094635, + "step": 23170 + }, + { + "epoch": 0.46344, + "grad_norm": 2.046875, + "grad_norm_var": 0.015843709309895832, + "learning_rate": 0.0001, + "loss": 4.0286, + "loss/crossentropy": 1.9914604425430298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19186841696500778, + "step": 23172 + }, + { + "epoch": 0.46348, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01594823201497396, + "learning_rate": 0.0001, + "loss": 4.022, + "loss/crossentropy": 1.5708445310592651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16981858015060425, + "step": 23174 + }, + { + "epoch": 0.46352, + "grad_norm": 1.953125, + "grad_norm_var": 0.015900675455729166, + "learning_rate": 0.0001, + "loss": 3.981, + "loss/crossentropy": 2.170333504676819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20143602043390274, + "step": 23176 + }, + { + "epoch": 0.46356, + "grad_norm": 1.8515625, + "grad_norm_var": 0.016033681233723958, + "learning_rate": 0.0001, + "loss": 4.0689, + "loss/crossentropy": 1.8889789581298828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17962591350078583, + "step": 23178 + }, + { + "epoch": 0.4636, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016917928059895834, + "learning_rate": 0.0001, + "loss": 3.877, + "loss/crossentropy": 1.9203835129737854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20256394147872925, + "step": 23180 + }, + { + "epoch": 0.46364, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009715779622395834, + "learning_rate": 0.0001, + "loss": 4.1076, + "loss/crossentropy": 2.2325422763824463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23236138373613358, + "step": 23182 + }, + { + "epoch": 0.46368, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010133616129557292, + "learning_rate": 0.0001, + "loss": 3.6904, + "loss/crossentropy": 1.5864351987838745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15730705857276917, + "step": 23184 + }, + { + "epoch": 0.46372, + "grad_norm": 2.0625, + "grad_norm_var": 0.0185211181640625, + "learning_rate": 0.0001, + "loss": 4.4229, + "loss/crossentropy": 2.5526922941207886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22684484720230103, + "step": 23186 + }, + { + "epoch": 0.46376, + "grad_norm": 1.84375, + "grad_norm_var": 0.01954930623372396, + "learning_rate": 0.0001, + "loss": 3.7298, + "loss/crossentropy": 1.6752784252166748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16830473393201828, + "step": 23188 + }, + { + "epoch": 0.4638, + "grad_norm": 1.8671875, + "grad_norm_var": 0.019877115885416668, + "learning_rate": 0.0001, + "loss": 4.0933, + "loss/crossentropy": 1.992654800415039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198849655687809, + "step": 23190 + }, + { + "epoch": 0.46384, + "grad_norm": 2.03125, + "grad_norm_var": 0.021541341145833334, + "learning_rate": 0.0001, + "loss": 4.1588, + "loss/crossentropy": 1.932292878627777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19009747356176376, + "step": 23192 + }, + { + "epoch": 0.46388, + "grad_norm": 2.046875, + "grad_norm_var": 0.02211278279622396, + "learning_rate": 0.0001, + "loss": 4.2248, + "loss/crossentropy": 2.362654209136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087061107158661, + "step": 23194 + }, + { + "epoch": 0.46392, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02215754191080729, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 2.117887258529663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19435501843690872, + "step": 23196 + }, + { + "epoch": 0.46396, + "grad_norm": 1.859375, + "grad_norm_var": 0.0228912353515625, + "learning_rate": 0.0001, + "loss": 3.7937, + "loss/crossentropy": 2.164400637149811, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19678650051355362, + "step": 23198 + }, + { + "epoch": 0.464, + "grad_norm": 2.734375, + "grad_norm_var": 0.05366795857747396, + "learning_rate": 0.0001, + "loss": 4.1249, + "loss/crossentropy": 2.1662251949310303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19994695484638214, + "step": 23200 + }, + { + "epoch": 0.46404, + "grad_norm": 2.046875, + "grad_norm_var": 0.04720433553059896, + "learning_rate": 0.0001, + "loss": 4.246, + "loss/crossentropy": 2.082236111164093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19977488368749619, + "step": 23202 + }, + { + "epoch": 0.46408, + "grad_norm": 1.8515625, + "grad_norm_var": 0.04742838541666667, + "learning_rate": 0.0001, + "loss": 4.0368, + "loss/crossentropy": 2.12766033411026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18198081851005554, + "step": 23204 + }, + { + "epoch": 0.46412, + "grad_norm": 1.90625, + "grad_norm_var": 0.047907511393229164, + "learning_rate": 0.0001, + "loss": 3.7709, + "loss/crossentropy": 2.0322113633155823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872313991189003, + "step": 23206 + }, + { + "epoch": 0.46416, + "grad_norm": 1.8515625, + "grad_norm_var": 0.04896214803059896, + "learning_rate": 0.0001, + "loss": 4.0456, + "loss/crossentropy": 2.339892268180847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19740620255470276, + "step": 23208 + }, + { + "epoch": 0.4642, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04987691243489583, + "learning_rate": 0.0001, + "loss": 3.7924, + "loss/crossentropy": 1.989054560661316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098187357187271, + "step": 23210 + }, + { + "epoch": 0.46424, + "grad_norm": 1.90625, + "grad_norm_var": 0.050065104166666666, + "learning_rate": 0.0001, + "loss": 3.8527, + "loss/crossentropy": 2.0670509934425354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19233432412147522, + "step": 23212 + }, + { + "epoch": 0.46428, + "grad_norm": 2.078125, + "grad_norm_var": 0.04978815714518229, + "learning_rate": 0.0001, + "loss": 4.055, + "loss/crossentropy": 2.136350452899933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21299083530902863, + "step": 23214 + }, + { + "epoch": 0.46432, + "grad_norm": 1.9375, + "grad_norm_var": 0.008333079020182292, + "learning_rate": 0.0001, + "loss": 4.2758, + "loss/crossentropy": 2.1144350171089172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434542953968048, + "step": 23216 + }, + { + "epoch": 0.46436, + "grad_norm": 1.7578125, + "grad_norm_var": 0.008605702718098959, + "learning_rate": 0.0001, + "loss": 3.8981, + "loss/crossentropy": 2.307699203491211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19784536957740784, + "step": 23218 + }, + { + "epoch": 0.4644, + "grad_norm": 1.8203125, + "grad_norm_var": 0.00841064453125, + "learning_rate": 0.0001, + "loss": 3.7529, + "loss/crossentropy": 1.8954175114631653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18727902323007584, + "step": 23220 + }, + { + "epoch": 0.46444, + "grad_norm": 2.140625, + "grad_norm_var": 0.011207834879557291, + "learning_rate": 0.0001, + "loss": 4.2463, + "loss/crossentropy": 2.3020440340042114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126014232635498, + "step": 23222 + }, + { + "epoch": 0.46448, + "grad_norm": 2.015625, + "grad_norm_var": 0.010628255208333333, + "learning_rate": 0.0001, + "loss": 4.1424, + "loss/crossentropy": 2.1159419417381287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19804084300994873, + "step": 23224 + }, + { + "epoch": 0.46452, + "grad_norm": 1.859375, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 3.7152, + "loss/crossentropy": 1.7309923768043518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19001224637031555, + "step": 23226 + }, + { + "epoch": 0.46456, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009273274739583334, + "learning_rate": 0.0001, + "loss": 4.0433, + "loss/crossentropy": 2.2450767755508423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20024073868989944, + "step": 23228 + }, + { + "epoch": 0.4646, + "grad_norm": 1.96875, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.1005, + "loss/crossentropy": 1.8837137818336487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20254365354776382, + "step": 23230 + }, + { + "epoch": 0.46464, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010825347900390626, + "learning_rate": 0.0001, + "loss": 3.9611, + "loss/crossentropy": 1.773855447769165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17011478543281555, + "step": 23232 + }, + { + "epoch": 0.46468, + "grad_norm": 2.34375, + "grad_norm_var": 0.017479451497395833, + "learning_rate": 0.0001, + "loss": 4.1329, + "loss/crossentropy": 2.2123981714248657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2134469673037529, + "step": 23234 + }, + { + "epoch": 0.46472, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015449778238932291, + "learning_rate": 0.0001, + "loss": 3.8216, + "loss/crossentropy": 2.004405915737152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117921590805054, + "step": 23236 + }, + { + "epoch": 0.46476, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01505126953125, + "learning_rate": 0.0001, + "loss": 4.012, + "loss/crossentropy": 2.2839083671569824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20231828093528748, + "step": 23238 + }, + { + "epoch": 0.4648, + "grad_norm": 2.0, + "grad_norm_var": 0.014769490559895833, + "learning_rate": 0.0001, + "loss": 3.9148, + "loss/crossentropy": 2.196630358695984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992904543876648, + "step": 23240 + }, + { + "epoch": 0.46484, + "grad_norm": 1.9375, + "grad_norm_var": 0.013981119791666666, + "learning_rate": 0.0001, + "loss": 4.0372, + "loss/crossentropy": 1.907668113708496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16895314306020737, + "step": 23242 + }, + { + "epoch": 0.46488, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01436767578125, + "learning_rate": 0.0001, + "loss": 4.2587, + "loss/crossentropy": 2.1902048587799072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193833857774734, + "step": 23244 + }, + { + "epoch": 0.46492, + "grad_norm": 2.015625, + "grad_norm_var": 0.014427693684895833, + "learning_rate": 0.0001, + "loss": 4.084, + "loss/crossentropy": 2.088558316230774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21183707565069199, + "step": 23246 + }, + { + "epoch": 0.46496, + "grad_norm": 2.265625, + "grad_norm_var": 0.018390909830729166, + "learning_rate": 0.0001, + "loss": 4.0602, + "loss/crossentropy": 2.3056023120880127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21808549761772156, + "step": 23248 + }, + { + "epoch": 0.465, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009235636393229166, + "learning_rate": 0.0001, + "loss": 4.2253, + "loss/crossentropy": 2.255687952041626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19484181702136993, + "step": 23250 + }, + { + "epoch": 0.46504, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009406534830729167, + "learning_rate": 0.0001, + "loss": 4.0991, + "loss/crossentropy": 2.042704999446869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20347727835178375, + "step": 23252 + }, + { + "epoch": 0.46508, + "grad_norm": 1.828125, + "grad_norm_var": 0.009901682535807291, + "learning_rate": 0.0001, + "loss": 4.1159, + "loss/crossentropy": 2.0365681648254395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19337813556194305, + "step": 23254 + }, + { + "epoch": 0.46512, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010707346598307292, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 2.1936534643173218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2163967713713646, + "step": 23256 + }, + { + "epoch": 0.46516, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01199951171875, + "learning_rate": 0.0001, + "loss": 3.9913, + "loss/crossentropy": 2.116168260574341, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20038608461618423, + "step": 23258 + }, + { + "epoch": 0.4652, + "grad_norm": 1.859375, + "grad_norm_var": 0.012654622395833334, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 2.182048201560974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19984129816293716, + "step": 23260 + }, + { + "epoch": 0.46524, + "grad_norm": 2.171875, + "grad_norm_var": 0.018418121337890624, + "learning_rate": 0.0001, + "loss": 3.953, + "loss/crossentropy": 2.1267359256744385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19032182544469833, + "step": 23262 + }, + { + "epoch": 0.46528, + "grad_norm": 1.8125, + "grad_norm_var": 0.0109527587890625, + "learning_rate": 0.0001, + "loss": 3.8849, + "loss/crossentropy": 1.763411819934845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1661328673362732, + "step": 23264 + }, + { + "epoch": 0.46532, + "grad_norm": 2.015625, + "grad_norm_var": 0.011279042561848958, + "learning_rate": 0.0001, + "loss": 4.1665, + "loss/crossentropy": 1.908778965473175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274333655834198, + "step": 23266 + }, + { + "epoch": 0.46536, + "grad_norm": 1.890625, + "grad_norm_var": 0.010994466145833333, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 2.2249021530151367, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18010813742876053, + "step": 23268 + }, + { + "epoch": 0.4654, + "grad_norm": 2.03125, + "grad_norm_var": 0.013032786051432292, + "learning_rate": 0.0001, + "loss": 4.2546, + "loss/crossentropy": 2.4366761445999146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22475503385066986, + "step": 23270 + }, + { + "epoch": 0.46544, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013084920247395833, + "learning_rate": 0.0001, + "loss": 4.2797, + "loss/crossentropy": 2.2445744276046753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18397177010774612, + "step": 23272 + }, + { + "epoch": 0.46548, + "grad_norm": 2.109375, + "grad_norm_var": 0.0141021728515625, + "learning_rate": 0.0001, + "loss": 4.5025, + "loss/crossentropy": 2.1187288761138916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064574956893921, + "step": 23274 + }, + { + "epoch": 0.46552, + "grad_norm": 2.03125, + "grad_norm_var": 0.0139892578125, + "learning_rate": 0.0001, + "loss": 4.0211, + "loss/crossentropy": 1.9558793902397156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1832912713289261, + "step": 23276 + }, + { + "epoch": 0.46556, + "grad_norm": 2.140625, + "grad_norm_var": 0.009187571207682292, + "learning_rate": 0.0001, + "loss": 4.2, + "loss/crossentropy": 2.125900387763977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23090820014476776, + "step": 23278 + }, + { + "epoch": 0.4656, + "grad_norm": 1.7890625, + "grad_norm_var": 0.009860992431640625, + "learning_rate": 0.0001, + "loss": 4.146, + "loss/crossentropy": 2.337615966796875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21365046501159668, + "step": 23280 + }, + { + "epoch": 0.46564, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011482747395833333, + "learning_rate": 0.0001, + "loss": 3.8832, + "loss/crossentropy": 2.0574655532836914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987999752163887, + "step": 23282 + }, + { + "epoch": 0.46568, + "grad_norm": 1.890625, + "grad_norm_var": 0.011250813802083334, + "learning_rate": 0.0001, + "loss": 3.8743, + "loss/crossentropy": 1.8969943523406982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18804477155208588, + "step": 23284 + }, + { + "epoch": 0.46572, + "grad_norm": 1.984375, + "grad_norm_var": 0.010005442301432292, + "learning_rate": 0.0001, + "loss": 4.2682, + "loss/crossentropy": 2.218023180961609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202347069978714, + "step": 23286 + }, + { + "epoch": 0.46576, + "grad_norm": 2.0, + "grad_norm_var": 0.010019683837890625, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 2.2892422676086426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21481822431087494, + "step": 23288 + }, + { + "epoch": 0.4658, + "grad_norm": 1.96875, + "grad_norm_var": 0.0074859619140625, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 1.9342178106307983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18514516204595566, + "step": 23290 + }, + { + "epoch": 0.46584, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0075762430826822914, + "learning_rate": 0.0001, + "loss": 3.9764, + "loss/crossentropy": 2.134042203426361, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20004402846097946, + "step": 23292 + }, + { + "epoch": 0.46588, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006422678629557292, + "learning_rate": 0.0001, + "loss": 4.3574, + "loss/crossentropy": 2.075138568878174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18759854137897491, + "step": 23294 + }, + { + "epoch": 0.46592, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005260976155598959, + "learning_rate": 0.0001, + "loss": 4.0339, + "loss/crossentropy": 2.0511878728866577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19307290762662888, + "step": 23296 + }, + { + "epoch": 0.46596, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004484049479166667, + "learning_rate": 0.0001, + "loss": 4.31, + "loss/crossentropy": 2.2446112632751465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2207113802433014, + "step": 23298 + }, + { + "epoch": 0.466, + "grad_norm": 1.875, + "grad_norm_var": 0.0057769775390625, + "learning_rate": 0.0001, + "loss": 4.1328, + "loss/crossentropy": 2.0350372195243835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17964959889650345, + "step": 23300 + }, + { + "epoch": 0.46604, + "grad_norm": 2.234375, + "grad_norm_var": 0.011844635009765625, + "learning_rate": 0.0001, + "loss": 4.4, + "loss/crossentropy": 2.3412392139434814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23192551732063293, + "step": 23302 + }, + { + "epoch": 0.46608, + "grad_norm": 1.859375, + "grad_norm_var": 0.013456217447916667, + "learning_rate": 0.0001, + "loss": 4.1086, + "loss/crossentropy": 2.1255269050598145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22011695802211761, + "step": 23304 + }, + { + "epoch": 0.46612, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013818105061848959, + "learning_rate": 0.0001, + "loss": 4.1335, + "loss/crossentropy": 2.2372154593467712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20802992582321167, + "step": 23306 + }, + { + "epoch": 0.46616, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013051096598307292, + "learning_rate": 0.0001, + "loss": 3.9148, + "loss/crossentropy": 1.766309678554535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731080263853073, + "step": 23308 + }, + { + "epoch": 0.4662, + "grad_norm": 2.109375, + "grad_norm_var": 0.013354237874348958, + "learning_rate": 0.0001, + "loss": 4.2245, + "loss/crossentropy": 2.035153806209564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21127209067344666, + "step": 23310 + }, + { + "epoch": 0.46624, + "grad_norm": 1.859375, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 3.8472, + "loss/crossentropy": 2.161786377429962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2059953212738037, + "step": 23312 + }, + { + "epoch": 0.46628, + "grad_norm": 1.859375, + "grad_norm_var": 0.015195465087890625, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 2.2908066511154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20963146537542343, + "step": 23314 + }, + { + "epoch": 0.46632, + "grad_norm": 1.859375, + "grad_norm_var": 0.015885162353515624, + "learning_rate": 0.0001, + "loss": 3.9638, + "loss/crossentropy": 2.1423317193984985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051083892583847, + "step": 23316 + }, + { + "epoch": 0.46636, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009303538004557292, + "learning_rate": 0.0001, + "loss": 3.9683, + "loss/crossentropy": 1.953084647655487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18355616927146912, + "step": 23318 + }, + { + "epoch": 0.4664, + "grad_norm": 2.078125, + "grad_norm_var": 0.008348592122395833, + "learning_rate": 0.0001, + "loss": 4.4509, + "loss/crossentropy": 2.3963130712509155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21937239170074463, + "step": 23320 + }, + { + "epoch": 0.46644, + "grad_norm": 1.828125, + "grad_norm_var": 0.009419759114583334, + "learning_rate": 0.0001, + "loss": 3.8771, + "loss/crossentropy": 1.991708517074585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18753455579280853, + "step": 23322 + }, + { + "epoch": 0.46648, + "grad_norm": 2.125, + "grad_norm_var": 0.013142903645833334, + "learning_rate": 0.0001, + "loss": 4.4012, + "loss/crossentropy": 2.1105082035064697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21250775456428528, + "step": 23324 + }, + { + "epoch": 0.46652, + "grad_norm": 2.03125, + "grad_norm_var": 0.011885579427083333, + "learning_rate": 0.0001, + "loss": 4.2301, + "loss/crossentropy": 2.3817760944366455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22308632731437683, + "step": 23326 + }, + { + "epoch": 0.46656, + "grad_norm": 2.0, + "grad_norm_var": 0.010762532552083334, + "learning_rate": 0.0001, + "loss": 3.9596, + "loss/crossentropy": 1.9623343348503113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189848855137825, + "step": 23328 + }, + { + "epoch": 0.4666, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011156209309895833, + "learning_rate": 0.0001, + "loss": 4.0722, + "loss/crossentropy": 2.2103809118270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21060046553611755, + "step": 23330 + }, + { + "epoch": 0.46664, + "grad_norm": 2.046875, + "grad_norm_var": 0.0101715087890625, + "learning_rate": 0.0001, + "loss": 3.915, + "loss/crossentropy": 2.0628533959388733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19960719347000122, + "step": 23332 + }, + { + "epoch": 0.46668, + "grad_norm": 1.90625, + "grad_norm_var": 0.01181640625, + "learning_rate": 0.0001, + "loss": 3.9823, + "loss/crossentropy": 1.8410035371780396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17858797311782837, + "step": 23334 + }, + { + "epoch": 0.46672, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010786946614583333, + "learning_rate": 0.0001, + "loss": 4.0117, + "loss/crossentropy": 2.056548833847046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881648674607277, + "step": 23336 + }, + { + "epoch": 0.46676, + "grad_norm": 1.78125, + "grad_norm_var": 0.012962849934895833, + "learning_rate": 0.0001, + "loss": 3.8496, + "loss/crossentropy": 1.8516274094581604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19123712182044983, + "step": 23338 + }, + { + "epoch": 0.4668, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009137980143229167, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.1613941192626953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937795951962471, + "step": 23340 + }, + { + "epoch": 0.46684, + "grad_norm": 2.0, + "grad_norm_var": 0.010106404622395834, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 1.9474772810935974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17672593146562576, + "step": 23342 + }, + { + "epoch": 0.46688, + "grad_norm": 1.7734375, + "grad_norm_var": 0.01136474609375, + "learning_rate": 0.0001, + "loss": 3.9052, + "loss/crossentropy": 1.8008830547332764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1621333733201027, + "step": 23344 + }, + { + "epoch": 0.46692, + "grad_norm": 2.140625, + "grad_norm_var": 0.013768513997395834, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 2.505362868309021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21897228807210922, + "step": 23346 + }, + { + "epoch": 0.46696, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013702138264973959, + "learning_rate": 0.0001, + "loss": 3.9262, + "loss/crossentropy": 1.7621804475784302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18244817852973938, + "step": 23348 + }, + { + "epoch": 0.467, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012630208333333334, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 1.8438073992729187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19016395509243011, + "step": 23350 + }, + { + "epoch": 0.46704, + "grad_norm": 1.953125, + "grad_norm_var": 0.012630208333333334, + "learning_rate": 0.0001, + "loss": 3.7829, + "loss/crossentropy": 1.9118491411209106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326411187648773, + "step": 23352 + }, + { + "epoch": 0.46708, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009806315104166666, + "learning_rate": 0.0001, + "loss": 3.9878, + "loss/crossentropy": 1.911489188671112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17859935760498047, + "step": 23354 + }, + { + "epoch": 0.46712, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014235178629557291, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 2.107007384300232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20303775370121002, + "step": 23356 + }, + { + "epoch": 0.46716, + "grad_norm": 1.671875, + "grad_norm_var": 0.017659505208333332, + "learning_rate": 0.0001, + "loss": 3.7504, + "loss/crossentropy": 2.242035746574402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20925073325634003, + "step": 23358 + }, + { + "epoch": 0.4672, + "grad_norm": 1.796875, + "grad_norm_var": 0.016364542643229167, + "learning_rate": 0.0001, + "loss": 3.9543, + "loss/crossentropy": 1.9015487432479858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20830972492694855, + "step": 23360 + }, + { + "epoch": 0.46724, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 3.8229, + "loss/crossentropy": 1.8648836612701416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17992950975894928, + "step": 23362 + }, + { + "epoch": 0.46728, + "grad_norm": 1.96875, + "grad_norm_var": 0.012984212239583333, + "learning_rate": 0.0001, + "loss": 4.026, + "loss/crossentropy": 1.7606803178787231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17884241789579391, + "step": 23364 + }, + { + "epoch": 0.46732, + "grad_norm": 2.171875, + "grad_norm_var": 0.016635894775390625, + "learning_rate": 0.0001, + "loss": 3.9579, + "loss/crossentropy": 2.115469813346863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030588760972023, + "step": 23366 + }, + { + "epoch": 0.46736, + "grad_norm": 1.9375, + "grad_norm_var": 0.017814127604166667, + "learning_rate": 0.0001, + "loss": 3.9706, + "loss/crossentropy": 1.9341481924057007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17974573373794556, + "step": 23368 + }, + { + "epoch": 0.4674, + "grad_norm": 2.03125, + "grad_norm_var": 0.017317454020182293, + "learning_rate": 0.0001, + "loss": 4.0494, + "loss/crossentropy": 2.171218752861023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20550543069839478, + "step": 23370 + }, + { + "epoch": 0.46744, + "grad_norm": 2.015625, + "grad_norm_var": 0.013392893473307292, + "learning_rate": 0.0001, + "loss": 4.0207, + "loss/crossentropy": 2.0552384853363037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19348255544900894, + "step": 23372 + }, + { + "epoch": 0.46748, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 3.6074, + "loss/crossentropy": 1.7817274332046509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16803114861249924, + "step": 23374 + }, + { + "epoch": 0.46752, + "grad_norm": 2.015625, + "grad_norm_var": 0.010245513916015626, + "learning_rate": 0.0001, + "loss": 4.2458, + "loss/crossentropy": 2.0412665009498596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20734214782714844, + "step": 23376 + }, + { + "epoch": 0.46756, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012520090738932291, + "learning_rate": 0.0001, + "loss": 4.0561, + "loss/crossentropy": 2.0934815406799316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062682881951332, + "step": 23378 + }, + { + "epoch": 0.4676, + "grad_norm": 2.140625, + "grad_norm_var": 0.015720367431640625, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 2.076676905155182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20668631047010422, + "step": 23380 + }, + { + "epoch": 0.46764, + "grad_norm": 2.0625, + "grad_norm_var": 0.0129302978515625, + "learning_rate": 0.0001, + "loss": 4.2509, + "loss/crossentropy": 2.185506582260132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21427924185991287, + "step": 23382 + }, + { + "epoch": 0.46768, + "grad_norm": 1.984375, + "grad_norm_var": 0.011771392822265626, + "learning_rate": 0.0001, + "loss": 3.865, + "loss/crossentropy": 2.0467058420181274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19394882023334503, + "step": 23384 + }, + { + "epoch": 0.46772, + "grad_norm": 2.140625, + "grad_norm_var": 0.013304646809895833, + "learning_rate": 0.0001, + "loss": 4.2221, + "loss/crossentropy": 2.4391517639160156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22828804701566696, + "step": 23386 + }, + { + "epoch": 0.46776, + "grad_norm": 2.03125, + "grad_norm_var": 0.0128814697265625, + "learning_rate": 0.0001, + "loss": 4.1389, + "loss/crossentropy": 2.2490856647491455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21296796947717667, + "step": 23388 + }, + { + "epoch": 0.4678, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011810048421223959, + "learning_rate": 0.0001, + "loss": 3.7515, + "loss/crossentropy": 1.5810586214065552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17569316178560257, + "step": 23390 + }, + { + "epoch": 0.46784, + "grad_norm": 1.953125, + "grad_norm_var": 0.011805979410807292, + "learning_rate": 0.0001, + "loss": 3.9116, + "loss/crossentropy": 2.069303274154663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18829242885112762, + "step": 23392 + }, + { + "epoch": 0.46788, + "grad_norm": 2.09375, + "grad_norm_var": 0.011004384358723958, + "learning_rate": 0.0001, + "loss": 4.1711, + "loss/crossentropy": 1.9410834312438965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875254362821579, + "step": 23394 + }, + { + "epoch": 0.46792, + "grad_norm": 1.7421875, + "grad_norm_var": 0.011498769124348959, + "learning_rate": 0.0001, + "loss": 3.697, + "loss/crossentropy": 2.1052953004837036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19472668319940567, + "step": 23396 + }, + { + "epoch": 0.46796, + "grad_norm": 1.875, + "grad_norm_var": 0.011250813802083334, + "learning_rate": 0.0001, + "loss": 4.0819, + "loss/crossentropy": 2.1837843656539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20570768415927887, + "step": 23398 + }, + { + "epoch": 0.468, + "grad_norm": 1.71875, + "grad_norm_var": 0.014229075113932291, + "learning_rate": 0.0001, + "loss": 3.7475, + "loss/crossentropy": 1.980469524860382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19470104575157166, + "step": 23400 + }, + { + "epoch": 0.46804, + "grad_norm": 1.890625, + "grad_norm_var": 0.011472320556640625, + "learning_rate": 0.0001, + "loss": 3.9931, + "loss/crossentropy": 1.992879033088684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19029226154088974, + "step": 23402 + }, + { + "epoch": 0.46808, + "grad_norm": 1.953125, + "grad_norm_var": 0.010689036051432291, + "learning_rate": 0.0001, + "loss": 4.085, + "loss/crossentropy": 2.5265753269195557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238864421844482, + "step": 23404 + }, + { + "epoch": 0.46812, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008943430582682292, + "learning_rate": 0.0001, + "loss": 3.9541, + "loss/crossentropy": 2.1182003021240234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22061780095100403, + "step": 23406 + }, + { + "epoch": 0.46816, + "grad_norm": 1.90625, + "grad_norm_var": 0.008381144205729166, + "learning_rate": 0.0001, + "loss": 3.9054, + "loss/crossentropy": 1.8726989030838013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18806783854961395, + "step": 23408 + }, + { + "epoch": 0.4682, + "grad_norm": 2.46875, + "grad_norm_var": 0.026371256510416666, + "learning_rate": 0.0001, + "loss": 4.0764, + "loss/crossentropy": 2.3275599479675293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21061742305755615, + "step": 23410 + }, + { + "epoch": 0.46824, + "grad_norm": 2.140625, + "grad_norm_var": 0.02622044881184896, + "learning_rate": 0.0001, + "loss": 3.8391, + "loss/crossentropy": 1.9331459999084473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18973465263843536, + "step": 23412 + }, + { + "epoch": 0.46828, + "grad_norm": 1.765625, + "grad_norm_var": 0.028043365478515624, + "learning_rate": 0.0001, + "loss": 3.7524, + "loss/crossentropy": 1.9546124339103699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17560599744319916, + "step": 23414 + }, + { + "epoch": 0.46832, + "grad_norm": 2.21875, + "grad_norm_var": 0.03088353474934896, + "learning_rate": 0.0001, + "loss": 4.2547, + "loss/crossentropy": 2.5048669576644897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21813815087080002, + "step": 23416 + }, + { + "epoch": 0.46836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.029930623372395833, + "learning_rate": 0.0001, + "loss": 4.0534, + "loss/crossentropy": 1.8067769408226013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18293478339910507, + "step": 23418 + }, + { + "epoch": 0.4684, + "grad_norm": 1.828125, + "grad_norm_var": 0.031981404622395834, + "learning_rate": 0.0001, + "loss": 3.7527, + "loss/crossentropy": 1.6894381642341614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17101695388555527, + "step": 23420 + }, + { + "epoch": 0.46844, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03349177042643229, + "learning_rate": 0.0001, + "loss": 3.6362, + "loss/crossentropy": 2.3269423246383667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20582617074251175, + "step": 23422 + }, + { + "epoch": 0.46848, + "grad_norm": 1.9921875, + "grad_norm_var": 0.03337376912434896, + "learning_rate": 0.0001, + "loss": 3.9875, + "loss/crossentropy": 1.9424707293510437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18869344890117645, + "step": 23424 + }, + { + "epoch": 0.46852, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01929499308268229, + "learning_rate": 0.0001, + "loss": 3.927, + "loss/crossentropy": 1.8979859948158264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17844338715076447, + "step": 23426 + }, + { + "epoch": 0.46856, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0167236328125, + "learning_rate": 0.0001, + "loss": 4.3855, + "loss/crossentropy": 2.2464778423309326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18301833420991898, + "step": 23428 + }, + { + "epoch": 0.4686, + "grad_norm": 1.7109375, + "grad_norm_var": 0.02848078409830729, + "learning_rate": 0.0001, + "loss": 3.8957, + "loss/crossentropy": 1.9471614360809326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20326492190361023, + "step": 23430 + }, + { + "epoch": 0.46864, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02283299763997396, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 2.04762601852417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19779526442289352, + "step": 23432 + }, + { + "epoch": 0.46868, + "grad_norm": 1.8984375, + "grad_norm_var": 0.022607167561848957, + "learning_rate": 0.0001, + "loss": 4.1019, + "loss/crossentropy": 2.1162944436073303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19175396859645844, + "step": 23434 + }, + { + "epoch": 0.46872, + "grad_norm": 2.21875, + "grad_norm_var": 0.025567372639973957, + "learning_rate": 0.0001, + "loss": 3.9651, + "loss/crossentropy": 1.6130013465881348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16156645864248276, + "step": 23436 + }, + { + "epoch": 0.46876, + "grad_norm": 2.046875, + "grad_norm_var": 0.023423004150390624, + "learning_rate": 0.0001, + "loss": 3.9582, + "loss/crossentropy": 2.0393518805503845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20742176473140717, + "step": 23438 + }, + { + "epoch": 0.4688, + "grad_norm": 1.90625, + "grad_norm_var": 0.023811848958333333, + "learning_rate": 0.0001, + "loss": 3.7042, + "loss/crossentropy": 1.6487592458724976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16911844164133072, + "step": 23440 + }, + { + "epoch": 0.46884, + "grad_norm": 1.7890625, + "grad_norm_var": 0.023034413655598957, + "learning_rate": 0.0001, + "loss": 3.8824, + "loss/crossentropy": 1.8171139359474182, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18819218128919601, + "step": 23442 + }, + { + "epoch": 0.46888, + "grad_norm": 1.921875, + "grad_norm_var": 0.022981770833333335, + "learning_rate": 0.0001, + "loss": 4.0849, + "loss/crossentropy": 1.669542133808136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18253905326128006, + "step": 23444 + }, + { + "epoch": 0.46892, + "grad_norm": 1.96875, + "grad_norm_var": 0.008845011393229166, + "learning_rate": 0.0001, + "loss": 3.9415, + "loss/crossentropy": 1.8465211391448975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194318987429142, + "step": 23446 + }, + { + "epoch": 0.46896, + "grad_norm": 2.0625, + "grad_norm_var": 0.011742146809895833, + "learning_rate": 0.0001, + "loss": 4.0606, + "loss/crossentropy": 2.4016542434692383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081214338541031, + "step": 23448 + }, + { + "epoch": 0.469, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013692220052083334, + "learning_rate": 0.0001, + "loss": 4.062, + "loss/crossentropy": 2.2425581216812134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481270760297775, + "step": 23450 + }, + { + "epoch": 0.46904, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009590657552083333, + "learning_rate": 0.0001, + "loss": 3.9642, + "loss/crossentropy": 1.8975054621696472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20339980721473694, + "step": 23452 + }, + { + "epoch": 0.46908, + "grad_norm": 2.328125, + "grad_norm_var": 0.01871312459309896, + "learning_rate": 0.0001, + "loss": 4.1964, + "loss/crossentropy": 2.274298667907715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22013512253761292, + "step": 23454 + }, + { + "epoch": 0.46912, + "grad_norm": 1.9609375, + "grad_norm_var": 0.019724273681640626, + "learning_rate": 0.0001, + "loss": 3.7991, + "loss/crossentropy": 2.286523461341858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21875084191560745, + "step": 23456 + }, + { + "epoch": 0.46916, + "grad_norm": 1.953125, + "grad_norm_var": 0.017465972900390626, + "learning_rate": 0.0001, + "loss": 4.1091, + "loss/crossentropy": 2.199851155281067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20098759233951569, + "step": 23458 + }, + { + "epoch": 0.4692, + "grad_norm": 2.3125, + "grad_norm_var": 0.025780232747395833, + "learning_rate": 0.0001, + "loss": 3.76, + "loss/crossentropy": 1.896471917629242, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18724198639392853, + "step": 23460 + }, + { + "epoch": 0.46924, + "grad_norm": 1.8671875, + "grad_norm_var": 0.026879628499348957, + "learning_rate": 0.0001, + "loss": 4.0191, + "loss/crossentropy": 2.0206944942474365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1795445680618286, + "step": 23462 + }, + { + "epoch": 0.46928, + "grad_norm": 1.8046875, + "grad_norm_var": 0.025911458333333335, + "learning_rate": 0.0001, + "loss": 4.1516, + "loss/crossentropy": 2.230253279209137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20442913472652435, + "step": 23464 + }, + { + "epoch": 0.46932, + "grad_norm": 1.96875, + "grad_norm_var": 0.025055948893229166, + "learning_rate": 0.0001, + "loss": 4.0306, + "loss/crossentropy": 2.117598056793213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20652078092098236, + "step": 23466 + }, + { + "epoch": 0.46936, + "grad_norm": 1.953125, + "grad_norm_var": 0.0245758056640625, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 1.6677707433700562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15297261625528336, + "step": 23468 + }, + { + "epoch": 0.4694, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014875284830729167, + "learning_rate": 0.0001, + "loss": 4.0632, + "loss/crossentropy": 2.039306938648224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21778366714715958, + "step": 23470 + }, + { + "epoch": 0.46944, + "grad_norm": 1.9375, + "grad_norm_var": 0.013923136393229167, + "learning_rate": 0.0001, + "loss": 3.9302, + "loss/crossentropy": 1.8116675019264221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19272665679454803, + "step": 23472 + }, + { + "epoch": 0.46948, + "grad_norm": 2.015625, + "grad_norm_var": 0.015433502197265626, + "learning_rate": 0.0001, + "loss": 4.0605, + "loss/crossentropy": 2.378050446510315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21410302817821503, + "step": 23474 + }, + { + "epoch": 0.46952, + "grad_norm": 2.015625, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.2755852937698364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20388127863407135, + "step": 23476 + }, + { + "epoch": 0.46956, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005692545572916667, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 2.1453245282173157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20442672073841095, + "step": 23478 + }, + { + "epoch": 0.4696, + "grad_norm": 1.9375, + "grad_norm_var": 0.003543853759765625, + "learning_rate": 0.0001, + "loss": 3.7373, + "loss/crossentropy": 2.0084391236305237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18768452107906342, + "step": 23480 + }, + { + "epoch": 0.46964, + "grad_norm": 1.96875, + "grad_norm_var": 0.002872467041015625, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 1.9836713075637817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842040792107582, + "step": 23482 + }, + { + "epoch": 0.46968, + "grad_norm": 2.015625, + "grad_norm_var": 0.004923502604166667, + "learning_rate": 0.0001, + "loss": 3.8047, + "loss/crossentropy": 1.8174407482147217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1779571697115898, + "step": 23484 + }, + { + "epoch": 0.46972, + "grad_norm": 2.203125, + "grad_norm_var": 0.010835774739583333, + "learning_rate": 0.0001, + "loss": 3.9017, + "loss/crossentropy": 1.8971520066261292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19008546322584152, + "step": 23486 + }, + { + "epoch": 0.46976, + "grad_norm": 2.0, + "grad_norm_var": 0.011356608072916666, + "learning_rate": 0.0001, + "loss": 3.7841, + "loss/crossentropy": 1.9949323534965515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878834068775177, + "step": 23488 + }, + { + "epoch": 0.4698, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010212198893229166, + "learning_rate": 0.0001, + "loss": 4.0221, + "loss/crossentropy": 1.7825579047203064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1911696270108223, + "step": 23490 + }, + { + "epoch": 0.46984, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 4.0892, + "loss/crossentropy": 2.305335283279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20759187638759613, + "step": 23492 + }, + { + "epoch": 0.46988, + "grad_norm": 2.078125, + "grad_norm_var": 0.2226715087890625, + "learning_rate": 0.0001, + "loss": 3.894, + "loss/crossentropy": 2.0296601057052612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1965966895222664, + "step": 23494 + }, + { + "epoch": 0.46992, + "grad_norm": 1.921875, + "grad_norm_var": 0.22094319661458334, + "learning_rate": 0.0001, + "loss": 4.2948, + "loss/crossentropy": 2.5238767862319946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21435904502868652, + "step": 23496 + }, + { + "epoch": 0.46996, + "grad_norm": 1.9765625, + "grad_norm_var": 0.21852213541666668, + "learning_rate": 0.0001, + "loss": 4.1532, + "loss/crossentropy": 2.2085973024368286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983572095632553, + "step": 23498 + }, + { + "epoch": 0.47, + "grad_norm": 1.921875, + "grad_norm_var": 0.21304423014322918, + "learning_rate": 0.0001, + "loss": 4.2002, + "loss/crossentropy": 2.257534384727478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23111815005540848, + "step": 23500 + }, + { + "epoch": 0.47004, + "grad_norm": 1.8984375, + "grad_norm_var": 0.21055094401041666, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 2.3113789558410645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043687105178833, + "step": 23502 + }, + { + "epoch": 0.47008, + "grad_norm": 1.8359375, + "grad_norm_var": 0.2143267313639323, + "learning_rate": 0.0001, + "loss": 4.0727, + "loss/crossentropy": 2.2028011083602905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18799518048763275, + "step": 23504 + }, + { + "epoch": 0.47012, + "grad_norm": 2.03125, + "grad_norm_var": 0.21318257649739583, + "learning_rate": 0.0001, + "loss": 4.3289, + "loss/crossentropy": 2.1186457872390747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014438509941101, + "step": 23506 + }, + { + "epoch": 0.47016, + "grad_norm": 1.9921875, + "grad_norm_var": 0.21224136352539064, + "learning_rate": 0.0001, + "loss": 3.9821, + "loss/crossentropy": 2.0399728417396545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18888195604085922, + "step": 23508 + }, + { + "epoch": 0.4702, + "grad_norm": 2.03125, + "grad_norm_var": 0.011350250244140625, + "learning_rate": 0.0001, + "loss": 3.9898, + "loss/crossentropy": 1.943885326385498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20364457368850708, + "step": 23510 + }, + { + "epoch": 0.47024, + "grad_norm": 2.171875, + "grad_norm_var": 0.011812082926432292, + "learning_rate": 0.0001, + "loss": 4.0646, + "loss/crossentropy": 1.9351946115493774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18621300905942917, + "step": 23512 + }, + { + "epoch": 0.47028, + "grad_norm": 1.8125, + "grad_norm_var": 0.014351399739583333, + "learning_rate": 0.0001, + "loss": 3.873, + "loss/crossentropy": 2.0112447142601013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1884174421429634, + "step": 23514 + }, + { + "epoch": 0.47032, + "grad_norm": 1.9453125, + "grad_norm_var": 0.014227040608723958, + "learning_rate": 0.0001, + "loss": 3.9918, + "loss/crossentropy": 2.164724826812744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21057604998350143, + "step": 23516 + }, + { + "epoch": 0.47036, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0138092041015625, + "learning_rate": 0.0001, + "loss": 4.2016, + "loss/crossentropy": 2.198709011077881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20399244129657745, + "step": 23518 + }, + { + "epoch": 0.4704, + "grad_norm": 1.8828125, + "grad_norm_var": 0.014792633056640626, + "learning_rate": 0.0001, + "loss": 3.8945, + "loss/crossentropy": 2.3152371644973755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19689901918172836, + "step": 23520 + }, + { + "epoch": 0.47044, + "grad_norm": 1.90625, + "grad_norm_var": 0.015242258707682291, + "learning_rate": 0.0001, + "loss": 3.9598, + "loss/crossentropy": 1.8530511260032654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19973145425319672, + "step": 23522 + }, + { + "epoch": 0.47048, + "grad_norm": 1.890625, + "grad_norm_var": 0.010894521077473959, + "learning_rate": 0.0001, + "loss": 3.9725, + "loss/crossentropy": 1.6506844758987427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17004889249801636, + "step": 23524 + }, + { + "epoch": 0.47052, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00750732421875, + "learning_rate": 0.0001, + "loss": 4.0036, + "loss/crossentropy": 1.7448241710662842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921830832958221, + "step": 23526 + }, + { + "epoch": 0.47056, + "grad_norm": 1.90625, + "grad_norm_var": 0.0036944071451822918, + "learning_rate": 0.0001, + "loss": 3.7118, + "loss/crossentropy": 1.7023364305496216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18442079424858093, + "step": 23528 + }, + { + "epoch": 0.4706, + "grad_norm": 2.09375, + "grad_norm_var": 0.005909983317057292, + "learning_rate": 0.0001, + "loss": 3.9572, + "loss/crossentropy": 2.1476879119873047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205117367208004, + "step": 23530 + }, + { + "epoch": 0.47064, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006154123942057292, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 1.9776412844657898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1900978982448578, + "step": 23532 + }, + { + "epoch": 0.47068, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009437815348307291, + "learning_rate": 0.0001, + "loss": 4.2694, + "loss/crossentropy": 2.6421544551849365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21158932149410248, + "step": 23534 + }, + { + "epoch": 0.47072, + "grad_norm": 2.375, + "grad_norm_var": 0.020807902018229168, + "learning_rate": 0.0001, + "loss": 4.4728, + "loss/crossentropy": 2.3335670232772827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25099293887615204, + "step": 23536 + }, + { + "epoch": 0.47076, + "grad_norm": 2.015625, + "grad_norm_var": 0.02069091796875, + "learning_rate": 0.0001, + "loss": 4.2101, + "loss/crossentropy": 2.309471607208252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084770023822784, + "step": 23538 + }, + { + "epoch": 0.4708, + "grad_norm": 1.96875, + "grad_norm_var": 0.020799763997395835, + "learning_rate": 0.0001, + "loss": 3.6632, + "loss/crossentropy": 1.794599175453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17749344557523727, + "step": 23540 + }, + { + "epoch": 0.47084, + "grad_norm": 2.109375, + "grad_norm_var": 0.022440338134765626, + "learning_rate": 0.0001, + "loss": 4.3814, + "loss/crossentropy": 2.0325594544410706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19963493198156357, + "step": 23542 + }, + { + "epoch": 0.47088, + "grad_norm": 2.015625, + "grad_norm_var": 0.018863677978515625, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.0868377089500427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20961833745241165, + "step": 23544 + }, + { + "epoch": 0.47092, + "grad_norm": 1.7734375, + "grad_norm_var": 0.02393366495768229, + "learning_rate": 0.0001, + "loss": 3.7205, + "loss/crossentropy": 1.9807188510894775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892232447862625, + "step": 23546 + }, + { + "epoch": 0.47096, + "grad_norm": 1.8828125, + "grad_norm_var": 0.023872884114583333, + "learning_rate": 0.0001, + "loss": 4.2712, + "loss/crossentropy": 2.292481303215027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22137918323278427, + "step": 23548 + }, + { + "epoch": 0.471, + "grad_norm": 1.90625, + "grad_norm_var": 0.022240193684895833, + "learning_rate": 0.0001, + "loss": 4.0832, + "loss/crossentropy": 2.240318775177002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084667906165123, + "step": 23550 + }, + { + "epoch": 0.47104, + "grad_norm": 1.7890625, + "grad_norm_var": 0.019969685872395834, + "learning_rate": 0.0001, + "loss": 3.8889, + "loss/crossentropy": 2.0380293130874634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2129639983177185, + "step": 23552 + }, + { + "epoch": 0.47108, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01995849609375, + "learning_rate": 0.0001, + "loss": 3.9957, + "loss/crossentropy": 2.296157479286194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.214280404150486, + "step": 23554 + }, + { + "epoch": 0.47112, + "grad_norm": 2.1875, + "grad_norm_var": 0.022655995686848958, + "learning_rate": 0.0001, + "loss": 4.0575, + "loss/crossentropy": 2.308936357498169, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20904332399368286, + "step": 23556 + }, + { + "epoch": 0.47116, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01942723592122396, + "learning_rate": 0.0001, + "loss": 3.8463, + "loss/crossentropy": 1.935391128063202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19526035338640213, + "step": 23558 + }, + { + "epoch": 0.4712, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019022369384765626, + "learning_rate": 0.0001, + "loss": 4.0567, + "loss/crossentropy": 2.21670663356781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21146119385957718, + "step": 23560 + }, + { + "epoch": 0.47124, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01649958292643229, + "learning_rate": 0.0001, + "loss": 3.808, + "loss/crossentropy": 1.9948694705963135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18240909278392792, + "step": 23562 + }, + { + "epoch": 0.47128, + "grad_norm": 1.984375, + "grad_norm_var": 0.0165283203125, + "learning_rate": 0.0001, + "loss": 4.0053, + "loss/crossentropy": 2.0435580015182495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460917055606842, + "step": 23564 + }, + { + "epoch": 0.47132, + "grad_norm": 1.828125, + "grad_norm_var": 0.018381500244140626, + "learning_rate": 0.0001, + "loss": 3.4991, + "loss/crossentropy": 2.0021498799324036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17788298428058624, + "step": 23566 + }, + { + "epoch": 0.47136, + "grad_norm": 2.015625, + "grad_norm_var": 0.008845774332682292, + "learning_rate": 0.0001, + "loss": 3.9934, + "loss/crossentropy": 2.0119062066078186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1800990328192711, + "step": 23568 + }, + { + "epoch": 0.4714, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008967081705729166, + "learning_rate": 0.0001, + "loss": 3.8167, + "loss/crossentropy": 1.5521536469459534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15504960715770721, + "step": 23570 + }, + { + "epoch": 0.47144, + "grad_norm": 2.015625, + "grad_norm_var": 0.004400380452473958, + "learning_rate": 0.0001, + "loss": 3.954, + "loss/crossentropy": 1.9227730631828308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20652085542678833, + "step": 23572 + }, + { + "epoch": 0.47148, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006078084309895833, + "learning_rate": 0.0001, + "loss": 4.075, + "loss/crossentropy": 2.0273211002349854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18889201432466507, + "step": 23574 + }, + { + "epoch": 0.47152, + "grad_norm": 2.03125, + "grad_norm_var": 0.006788889567057292, + "learning_rate": 0.0001, + "loss": 3.8103, + "loss/crossentropy": 1.934963881969452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17783871293067932, + "step": 23576 + }, + { + "epoch": 0.47156, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007020823160807292, + "learning_rate": 0.0001, + "loss": 3.8893, + "loss/crossentropy": 1.9819161295890808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19646496325731277, + "step": 23578 + }, + { + "epoch": 0.4716, + "grad_norm": 2.21875, + "grad_norm_var": 0.014387003580729167, + "learning_rate": 0.0001, + "loss": 4.1545, + "loss/crossentropy": 1.9830252528190613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861895591020584, + "step": 23580 + }, + { + "epoch": 0.47164, + "grad_norm": 2.125, + "grad_norm_var": 0.011872355143229167, + "learning_rate": 0.0001, + "loss": 4.1332, + "loss/crossentropy": 1.9679479598999023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19719085842370987, + "step": 23582 + }, + { + "epoch": 0.47168, + "grad_norm": 1.859375, + "grad_norm_var": 0.0126708984375, + "learning_rate": 0.0001, + "loss": 3.9055, + "loss/crossentropy": 1.8915583491325378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777575984597206, + "step": 23584 + }, + { + "epoch": 0.47172, + "grad_norm": 1.9375, + "grad_norm_var": 0.011533355712890625, + "learning_rate": 0.0001, + "loss": 4.0302, + "loss/crossentropy": 2.126678943634033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122275292873383, + "step": 23586 + }, + { + "epoch": 0.47176, + "grad_norm": 1.859375, + "grad_norm_var": 0.012109375, + "learning_rate": 0.0001, + "loss": 3.8424, + "loss/crossentropy": 1.8739299774169922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19388508796691895, + "step": 23588 + }, + { + "epoch": 0.4718, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01207275390625, + "learning_rate": 0.0001, + "loss": 4.057, + "loss/crossentropy": 1.983488917350769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18411333858966827, + "step": 23590 + }, + { + "epoch": 0.47184, + "grad_norm": 2.03125, + "grad_norm_var": 0.016796875, + "learning_rate": 0.0001, + "loss": 4.4929, + "loss/crossentropy": 2.127811551094055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860193982720375, + "step": 23592 + }, + { + "epoch": 0.47188, + "grad_norm": 2.3125, + "grad_norm_var": 0.022696940104166667, + "learning_rate": 0.0001, + "loss": 4.0036, + "loss/crossentropy": 1.8028001189231873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18699438124895096, + "step": 23594 + }, + { + "epoch": 0.47192, + "grad_norm": 1.9609375, + "grad_norm_var": 0.02093683878580729, + "learning_rate": 0.0001, + "loss": 4.0754, + "loss/crossentropy": 2.0716105699539185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935330480337143, + "step": 23596 + }, + { + "epoch": 0.47196, + "grad_norm": 2.0, + "grad_norm_var": 0.020845540364583335, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 2.0923121571540833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20516054332256317, + "step": 23598 + }, + { + "epoch": 0.472, + "grad_norm": 1.8359375, + "grad_norm_var": 0.020357259114583335, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.244894862174988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21077310293912888, + "step": 23600 + }, + { + "epoch": 0.47204, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02054417928059896, + "learning_rate": 0.0001, + "loss": 3.9788, + "loss/crossentropy": 1.9484725594520569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16783934831619263, + "step": 23602 + }, + { + "epoch": 0.47208, + "grad_norm": 2.203125, + "grad_norm_var": 0.19378026326497397, + "learning_rate": 0.0001, + "loss": 3.8165, + "loss/crossentropy": 1.8027321100234985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21114902943372726, + "step": 23604 + }, + { + "epoch": 0.47212, + "grad_norm": 2.015625, + "grad_norm_var": 0.18945083618164063, + "learning_rate": 0.0001, + "loss": 3.8104, + "loss/crossentropy": 1.7768787741661072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848941519856453, + "step": 23606 + }, + { + "epoch": 0.47216, + "grad_norm": 2.125, + "grad_norm_var": 0.19102274576822917, + "learning_rate": 0.0001, + "loss": 3.8457, + "loss/crossentropy": 1.8786412477493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18932975828647614, + "step": 23608 + }, + { + "epoch": 0.4722, + "grad_norm": 1.828125, + "grad_norm_var": 0.1960113525390625, + "learning_rate": 0.0001, + "loss": 3.9519, + "loss/crossentropy": 2.0566998720169067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1795952171087265, + "step": 23610 + }, + { + "epoch": 0.47224, + "grad_norm": 2.03125, + "grad_norm_var": 0.19468154907226562, + "learning_rate": 0.0001, + "loss": 4.0255, + "loss/crossentropy": 1.9085100293159485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19264912605285645, + "step": 23612 + }, + { + "epoch": 0.47228, + "grad_norm": 1.8984375, + "grad_norm_var": 0.19574381510416666, + "learning_rate": 0.0001, + "loss": 4.0038, + "loss/crossentropy": 2.0626463294029236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947372406721115, + "step": 23614 + }, + { + "epoch": 0.47232, + "grad_norm": 2.09375, + "grad_norm_var": 0.1929278055826823, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 2.0861737728118896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039434522390366, + "step": 23616 + }, + { + "epoch": 0.47236, + "grad_norm": 2.046875, + "grad_norm_var": 0.19161783854166667, + "learning_rate": 0.0001, + "loss": 3.9677, + "loss/crossentropy": 1.9024608135223389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18196185678243637, + "step": 23618 + }, + { + "epoch": 0.4724, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006371815999348958, + "learning_rate": 0.0001, + "loss": 4.1103, + "loss/crossentropy": 2.209709107875824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21473529189825058, + "step": 23620 + }, + { + "epoch": 0.47244, + "grad_norm": 1.921875, + "grad_norm_var": 0.0069010416666666664, + "learning_rate": 0.0001, + "loss": 3.9269, + "loss/crossentropy": 2.1699984073638916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023090422153473, + "step": 23622 + }, + { + "epoch": 0.47248, + "grad_norm": 1.875, + "grad_norm_var": 0.006441243489583333, + "learning_rate": 0.0001, + "loss": 3.7869, + "loss/crossentropy": 1.7027064561843872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16561312228441238, + "step": 23624 + }, + { + "epoch": 0.47252, + "grad_norm": 2.109375, + "grad_norm_var": 0.008526357014973958, + "learning_rate": 0.0001, + "loss": 3.9753, + "loss/crossentropy": 2.2230480909347534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20173655450344086, + "step": 23626 + }, + { + "epoch": 0.47256, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008841705322265626, + "learning_rate": 0.0001, + "loss": 3.6368, + "loss/crossentropy": 1.9015594124794006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1708713322877884, + "step": 23628 + }, + { + "epoch": 0.4726, + "grad_norm": 2.09375, + "grad_norm_var": 0.010723622639973958, + "learning_rate": 0.0001, + "loss": 4.0118, + "loss/crossentropy": 2.051177144050598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878596916794777, + "step": 23630 + }, + { + "epoch": 0.47264, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013752237955729166, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 1.9841215014457703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841672658920288, + "step": 23632 + }, + { + "epoch": 0.47268, + "grad_norm": 2.0625, + "grad_norm_var": 0.0148590087890625, + "learning_rate": 0.0001, + "loss": 4.1022, + "loss/crossentropy": 2.0642913579940796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22168366611003876, + "step": 23634 + }, + { + "epoch": 0.47272, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014899698893229167, + "learning_rate": 0.0001, + "loss": 4.0448, + "loss/crossentropy": 2.149936318397522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954735815525055, + "step": 23636 + }, + { + "epoch": 0.47276, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014811197916666666, + "learning_rate": 0.0001, + "loss": 3.9646, + "loss/crossentropy": 2.0555055141448975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19528509676456451, + "step": 23638 + }, + { + "epoch": 0.4728, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013600413004557292, + "learning_rate": 0.0001, + "loss": 4.0768, + "loss/crossentropy": 1.9836488366127014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17974190413951874, + "step": 23640 + }, + { + "epoch": 0.47284, + "grad_norm": 2.21875, + "grad_norm_var": 0.014644368489583334, + "learning_rate": 0.0001, + "loss": 4.4381, + "loss/crossentropy": 2.455438733100891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21470309793949127, + "step": 23642 + }, + { + "epoch": 0.47288, + "grad_norm": 1.90625, + "grad_norm_var": 0.0148345947265625, + "learning_rate": 0.0001, + "loss": 3.9095, + "loss/crossentropy": 2.0290130376815796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19266076385974884, + "step": 23644 + }, + { + "epoch": 0.47292, + "grad_norm": 2.0625, + "grad_norm_var": 0.014312489827473959, + "learning_rate": 0.0001, + "loss": 3.8732, + "loss/crossentropy": 1.9230349659919739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17801092565059662, + "step": 23646 + }, + { + "epoch": 0.47296, + "grad_norm": 2.1875, + "grad_norm_var": 0.014452107747395833, + "learning_rate": 0.0001, + "loss": 4.1517, + "loss/crossentropy": 2.015698790550232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20072294026613235, + "step": 23648 + }, + { + "epoch": 0.473, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014357248942057291, + "learning_rate": 0.0001, + "loss": 3.8298, + "loss/crossentropy": 1.9033147096633911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18670127540826797, + "step": 23650 + }, + { + "epoch": 0.47304, + "grad_norm": 1.8359375, + "grad_norm_var": 0.014906565348307291, + "learning_rate": 0.0001, + "loss": 3.7137, + "loss/crossentropy": 1.7301848530769348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1701732650399208, + "step": 23652 + }, + { + "epoch": 0.47308, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016747029622395833, + "learning_rate": 0.0001, + "loss": 3.7671, + "loss/crossentropy": 2.0487022399902344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18679651618003845, + "step": 23654 + }, + { + "epoch": 0.47312, + "grad_norm": 2.03125, + "grad_norm_var": 0.017064412434895832, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 2.1262341737747192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21661578118801117, + "step": 23656 + }, + { + "epoch": 0.47316, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011466217041015626, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.8145031929016113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1787424087524414, + "step": 23658 + }, + { + "epoch": 0.4732, + "grad_norm": 1.96875, + "grad_norm_var": 0.0114013671875, + "learning_rate": 0.0001, + "loss": 3.9916, + "loss/crossentropy": 2.1354995369911194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19884371012449265, + "step": 23660 + }, + { + "epoch": 0.47324, + "grad_norm": 2.046875, + "grad_norm_var": 0.010920206705729166, + "learning_rate": 0.0001, + "loss": 3.8533, + "loss/crossentropy": 1.7008216381072998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1525547280907631, + "step": 23662 + }, + { + "epoch": 0.47328, + "grad_norm": 2.0, + "grad_norm_var": 0.006880442301432292, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.235612154006958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21687253564596176, + "step": 23664 + }, + { + "epoch": 0.47332, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008819325764973959, + "learning_rate": 0.0001, + "loss": 4.3348, + "loss/crossentropy": 2.375667631626129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21823541820049286, + "step": 23666 + }, + { + "epoch": 0.47336, + "grad_norm": 1.9375, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.2375, + "loss/crossentropy": 2.223970353603363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21631266921758652, + "step": 23668 + }, + { + "epoch": 0.4734, + "grad_norm": 1.7265625, + "grad_norm_var": 0.009261067708333333, + "learning_rate": 0.0001, + "loss": 3.7413, + "loss/crossentropy": 1.9337742328643799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18401625007390976, + "step": 23670 + }, + { + "epoch": 0.47344, + "grad_norm": 1.921875, + "grad_norm_var": 0.011400349934895833, + "learning_rate": 0.0001, + "loss": 4.1189, + "loss/crossentropy": 2.1500974893569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19867678731679916, + "step": 23672 + }, + { + "epoch": 0.47348, + "grad_norm": 2.1875, + "grad_norm_var": 0.015103912353515625, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 1.8808711171150208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18535293638706207, + "step": 23674 + }, + { + "epoch": 0.47352, + "grad_norm": 2.0, + "grad_norm_var": 0.0133544921875, + "learning_rate": 0.0001, + "loss": 4.1431, + "loss/crossentropy": 2.1114797592163086, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959507018327713, + "step": 23676 + }, + { + "epoch": 0.47356, + "grad_norm": 2.09375, + "grad_norm_var": 0.016141510009765624, + "learning_rate": 0.0001, + "loss": 3.8953, + "loss/crossentropy": 2.2539754509925842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20711452513933182, + "step": 23678 + }, + { + "epoch": 0.4736, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01567357381184896, + "learning_rate": 0.0001, + "loss": 3.9558, + "loss/crossentropy": 1.950667679309845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18103761970996857, + "step": 23680 + }, + { + "epoch": 0.47364, + "grad_norm": 1.8515625, + "grad_norm_var": 0.015413157145182292, + "learning_rate": 0.0001, + "loss": 3.9297, + "loss/crossentropy": 1.9214385747909546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19267746806144714, + "step": 23682 + }, + { + "epoch": 0.47368, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0181304931640625, + "learning_rate": 0.0001, + "loss": 3.7439, + "loss/crossentropy": 2.053990364074707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18003928661346436, + "step": 23684 + }, + { + "epoch": 0.47372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0151275634765625, + "learning_rate": 0.0001, + "loss": 3.951, + "loss/crossentropy": 1.6660608649253845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15546631067991257, + "step": 23686 + }, + { + "epoch": 0.47376, + "grad_norm": 1.8671875, + "grad_norm_var": 0.013250478108723958, + "learning_rate": 0.0001, + "loss": 3.8348, + "loss/crossentropy": 2.1377063989639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202220119535923, + "step": 23688 + }, + { + "epoch": 0.4738, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009293619791666667, + "learning_rate": 0.0001, + "loss": 3.8619, + "loss/crossentropy": 1.9241121411323547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17836976051330566, + "step": 23690 + }, + { + "epoch": 0.47384, + "grad_norm": 2.078125, + "grad_norm_var": 0.009159088134765625, + "learning_rate": 0.0001, + "loss": 4.2613, + "loss/crossentropy": 2.158058762550354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22955617308616638, + "step": 23692 + }, + { + "epoch": 0.47388, + "grad_norm": 2.0, + "grad_norm_var": 0.006870269775390625, + "learning_rate": 0.0001, + "loss": 4.24, + "loss/crossentropy": 2.138846278190613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2145211324095726, + "step": 23694 + }, + { + "epoch": 0.47392, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006341298421223958, + "learning_rate": 0.0001, + "loss": 3.9193, + "loss/crossentropy": 2.0770075917243958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009606510400772, + "step": 23696 + }, + { + "epoch": 0.47396, + "grad_norm": 1.953125, + "grad_norm_var": 0.006363932291666667, + "learning_rate": 0.0001, + "loss": 3.8684, + "loss/crossentropy": 1.5739121437072754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16019296646118164, + "step": 23698 + }, + { + "epoch": 0.474, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005078125, + "learning_rate": 0.0001, + "loss": 4.1262, + "loss/crossentropy": 1.898674726486206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1942930817604065, + "step": 23700 + }, + { + "epoch": 0.47404, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004776763916015625, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 2.2228487730026245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093074843287468, + "step": 23702 + }, + { + "epoch": 0.47408, + "grad_norm": 2.09375, + "grad_norm_var": 0.006390126546223959, + "learning_rate": 0.0001, + "loss": 4.1041, + "loss/crossentropy": 2.1111066341400146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743914157152176, + "step": 23704 + }, + { + "epoch": 0.47412, + "grad_norm": 3.359375, + "grad_norm_var": 0.1302886962890625, + "learning_rate": 0.0001, + "loss": 4.0682, + "loss/crossentropy": 1.780187964439392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18026559054851532, + "step": 23706 + }, + { + "epoch": 0.47416, + "grad_norm": 2.03125, + "grad_norm_var": 0.13217137654622396, + "learning_rate": 0.0001, + "loss": 3.8936, + "loss/crossentropy": 1.983467936515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188174769282341, + "step": 23708 + }, + { + "epoch": 0.4742, + "grad_norm": 1.8828125, + "grad_norm_var": 0.13393961588541667, + "learning_rate": 0.0001, + "loss": 4.0822, + "loss/crossentropy": 2.0928043723106384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048778161406517, + "step": 23710 + }, + { + "epoch": 0.47424, + "grad_norm": 1.8984375, + "grad_norm_var": 0.13646240234375, + "learning_rate": 0.0001, + "loss": 4.0171, + "loss/crossentropy": 1.8958263397216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17733225226402283, + "step": 23712 + }, + { + "epoch": 0.47428, + "grad_norm": 1.9765625, + "grad_norm_var": 0.1341461181640625, + "learning_rate": 0.0001, + "loss": 4.0589, + "loss/crossentropy": 1.8104780316352844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1762789711356163, + "step": 23714 + }, + { + "epoch": 0.47432, + "grad_norm": 1.9375, + "grad_norm_var": 0.1332415262858073, + "learning_rate": 0.0001, + "loss": 4.0282, + "loss/crossentropy": 2.167409896850586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20198846608400345, + "step": 23716 + }, + { + "epoch": 0.47436, + "grad_norm": 1.8984375, + "grad_norm_var": 0.13642756144205728, + "learning_rate": 0.0001, + "loss": 3.9456, + "loss/crossentropy": 1.8979802131652832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17944183200597763, + "step": 23718 + }, + { + "epoch": 0.4744, + "grad_norm": 1.8984375, + "grad_norm_var": 0.13585205078125, + "learning_rate": 0.0001, + "loss": 4.1639, + "loss/crossentropy": 2.3538358211517334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084932029247284, + "step": 23720 + }, + { + "epoch": 0.47444, + "grad_norm": 1.953125, + "grad_norm_var": 0.005028279622395834, + "learning_rate": 0.0001, + "loss": 4.1931, + "loss/crossentropy": 2.144857406616211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20259954035282135, + "step": 23722 + }, + { + "epoch": 0.47448, + "grad_norm": 2.0625, + "grad_norm_var": 0.0050771077473958336, + "learning_rate": 0.0001, + "loss": 4.1082, + "loss/crossentropy": 2.3802285194396973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301197499036789, + "step": 23724 + }, + { + "epoch": 0.47452, + "grad_norm": 1.890625, + "grad_norm_var": 0.006025950113932292, + "learning_rate": 0.0001, + "loss": 4.212, + "loss/crossentropy": 2.5175833702087402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22316204756498337, + "step": 23726 + }, + { + "epoch": 0.47456, + "grad_norm": 2.078125, + "grad_norm_var": 0.006514485677083333, + "learning_rate": 0.0001, + "loss": 4.3959, + "loss/crossentropy": 2.266839861869812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391426980495453, + "step": 23728 + }, + { + "epoch": 0.4746, + "grad_norm": 1.921875, + "grad_norm_var": 0.006526438395182291, + "learning_rate": 0.0001, + "loss": 3.9426, + "loss/crossentropy": 1.9993603229522705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19872139394283295, + "step": 23730 + }, + { + "epoch": 0.47464, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007321929931640625, + "learning_rate": 0.0001, + "loss": 4.0236, + "loss/crossentropy": 1.999217450618744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402128666639328, + "step": 23732 + }, + { + "epoch": 0.47468, + "grad_norm": 2.015625, + "grad_norm_var": 0.006894683837890625, + "learning_rate": 0.0001, + "loss": 4.0597, + "loss/crossentropy": 2.2356200218200684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19472184777259827, + "step": 23734 + }, + { + "epoch": 0.47472, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0072062174479166664, + "learning_rate": 0.0001, + "loss": 3.9894, + "loss/crossentropy": 1.6930708289146423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16877874732017517, + "step": 23736 + }, + { + "epoch": 0.47476, + "grad_norm": 1.921875, + "grad_norm_var": 0.007985178629557292, + "learning_rate": 0.0001, + "loss": 3.6418, + "loss/crossentropy": 2.0427737832069397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20055241882801056, + "step": 23738 + }, + { + "epoch": 0.4748, + "grad_norm": 2.0625, + "grad_norm_var": 0.007795206705729167, + "learning_rate": 0.0001, + "loss": 4.0536, + "loss/crossentropy": 2.11184823513031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939689666032791, + "step": 23740 + }, + { + "epoch": 0.47484, + "grad_norm": 1.7734375, + "grad_norm_var": 0.008778635660807292, + "learning_rate": 0.0001, + "loss": 3.8553, + "loss/crossentropy": 1.9833735823631287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16304892301559448, + "step": 23742 + }, + { + "epoch": 0.47488, + "grad_norm": 1.859375, + "grad_norm_var": 0.006754302978515625, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.0734696984291077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18283121287822723, + "step": 23744 + }, + { + "epoch": 0.47492, + "grad_norm": 2.015625, + "grad_norm_var": 0.00877685546875, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 1.7331766486167908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18056727200746536, + "step": 23746 + }, + { + "epoch": 0.47496, + "grad_norm": 2.046875, + "grad_norm_var": 0.0089752197265625, + "learning_rate": 0.0001, + "loss": 4.314, + "loss/crossentropy": 2.224283456802368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21115867048501968, + "step": 23748 + }, + { + "epoch": 0.475, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007734934488932292, + "learning_rate": 0.0001, + "loss": 3.9618, + "loss/crossentropy": 1.6858720183372498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16034550219774246, + "step": 23750 + }, + { + "epoch": 0.47504, + "grad_norm": 1.96875, + "grad_norm_var": 0.008304850260416666, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 2.038428485393524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020621821284294, + "step": 23752 + }, + { + "epoch": 0.47508, + "grad_norm": 1.859375, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.0219, + "loss/crossentropy": 2.076684832572937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903630793094635, + "step": 23754 + }, + { + "epoch": 0.47512, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007665761311848958, + "learning_rate": 0.0001, + "loss": 3.7804, + "loss/crossentropy": 1.7917174100875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1796514242887497, + "step": 23756 + }, + { + "epoch": 0.47516, + "grad_norm": 2.046875, + "grad_norm_var": 0.006859334309895834, + "learning_rate": 0.0001, + "loss": 4.0059, + "loss/crossentropy": 1.9619473814964294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18760067224502563, + "step": 23758 + }, + { + "epoch": 0.4752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007990519205729166, + "learning_rate": 0.0001, + "loss": 3.787, + "loss/crossentropy": 2.083077907562256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19316518306732178, + "step": 23760 + }, + { + "epoch": 0.47524, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007100168863932292, + "learning_rate": 0.0001, + "loss": 3.8462, + "loss/crossentropy": 1.8045600056648254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17447172850370407, + "step": 23762 + }, + { + "epoch": 0.47528, + "grad_norm": 2.21875, + "grad_norm_var": 0.013691965738932292, + "learning_rate": 0.0001, + "loss": 3.8125, + "loss/crossentropy": 2.2159335613250732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20490044355392456, + "step": 23764 + }, + { + "epoch": 0.47532, + "grad_norm": 1.796875, + "grad_norm_var": 0.014851633707682292, + "learning_rate": 0.0001, + "loss": 3.7541, + "loss/crossentropy": 2.0102903246879578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17014086246490479, + "step": 23766 + }, + { + "epoch": 0.47536, + "grad_norm": 1.953125, + "grad_norm_var": 0.012823232014973958, + "learning_rate": 0.0001, + "loss": 4.0154, + "loss/crossentropy": 2.0524433851242065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20510738343000412, + "step": 23768 + }, + { + "epoch": 0.4754, + "grad_norm": 1.921875, + "grad_norm_var": 0.012849934895833333, + "learning_rate": 0.0001, + "loss": 3.8245, + "loss/crossentropy": 1.9695302844047546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17639748752117157, + "step": 23770 + }, + { + "epoch": 0.47544, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014642079671223959, + "learning_rate": 0.0001, + "loss": 4.4496, + "loss/crossentropy": 2.3865933418273926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23802988976240158, + "step": 23772 + }, + { + "epoch": 0.47548, + "grad_norm": 1.96875, + "grad_norm_var": 0.013480631510416667, + "learning_rate": 0.0001, + "loss": 3.8708, + "loss/crossentropy": 1.6841627955436707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837886944413185, + "step": 23774 + }, + { + "epoch": 0.47552, + "grad_norm": 1.875, + "grad_norm_var": 0.0132232666015625, + "learning_rate": 0.0001, + "loss": 3.9562, + "loss/crossentropy": 1.9436610341072083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2144036665558815, + "step": 23776 + }, + { + "epoch": 0.47556, + "grad_norm": 2.03125, + "grad_norm_var": 0.013846842447916667, + "learning_rate": 0.0001, + "loss": 3.9312, + "loss/crossentropy": 1.8565305471420288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19765028357505798, + "step": 23778 + }, + { + "epoch": 0.4756, + "grad_norm": 1.859375, + "grad_norm_var": 0.007165273030598958, + "learning_rate": 0.0001, + "loss": 4.0102, + "loss/crossentropy": 2.119917571544647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1896904706954956, + "step": 23780 + }, + { + "epoch": 0.47564, + "grad_norm": 1.90625, + "grad_norm_var": 0.008892567952473958, + "learning_rate": 0.0001, + "loss": 4.0573, + "loss/crossentropy": 1.9241788387298584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17011697590351105, + "step": 23782 + }, + { + "epoch": 0.47568, + "grad_norm": 1.875, + "grad_norm_var": 0.010495758056640625, + "learning_rate": 0.0001, + "loss": 3.7454, + "loss/crossentropy": 1.7912859320640564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17273171246051788, + "step": 23784 + }, + { + "epoch": 0.47572, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0112945556640625, + "learning_rate": 0.0001, + "loss": 4.4069, + "loss/crossentropy": 2.4223393201828003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2194758951663971, + "step": 23786 + }, + { + "epoch": 0.47576, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010416412353515625, + "learning_rate": 0.0001, + "loss": 3.7825, + "loss/crossentropy": 1.9073758721351624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19918088614940643, + "step": 23788 + }, + { + "epoch": 0.4758, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 3.4428, + "loss/crossentropy": 1.5801246762275696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16602589190006256, + "step": 23790 + }, + { + "epoch": 0.47584, + "grad_norm": 1.890625, + "grad_norm_var": 0.010892740885416667, + "learning_rate": 0.0001, + "loss": 3.7883, + "loss/crossentropy": 1.847874402999878, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17299149185419083, + "step": 23792 + }, + { + "epoch": 0.47588, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010431925455729166, + "learning_rate": 0.0001, + "loss": 3.9211, + "loss/crossentropy": 2.272279739379883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19657636433839798, + "step": 23794 + }, + { + "epoch": 0.47592, + "grad_norm": 2.109375, + "grad_norm_var": 0.014387003580729167, + "learning_rate": 0.0001, + "loss": 4.422, + "loss/crossentropy": 1.9107636213302612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978672295808792, + "step": 23796 + }, + { + "epoch": 0.47596, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011374664306640626, + "learning_rate": 0.0001, + "loss": 4.2113, + "loss/crossentropy": 2.384741425514221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.213532455265522, + "step": 23798 + }, + { + "epoch": 0.476, + "grad_norm": 2.140625, + "grad_norm_var": 0.019207509358723958, + "learning_rate": 0.0001, + "loss": 4.2722, + "loss/crossentropy": 2.2380464673042297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984338015317917, + "step": 23800 + }, + { + "epoch": 0.47604, + "grad_norm": 1.921875, + "grad_norm_var": 0.019465891520182292, + "learning_rate": 0.0001, + "loss": 4.0274, + "loss/crossentropy": 2.044179916381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18904277682304382, + "step": 23802 + }, + { + "epoch": 0.47608, + "grad_norm": 2.015625, + "grad_norm_var": 0.019425455729166666, + "learning_rate": 0.0001, + "loss": 3.9933, + "loss/crossentropy": 2.3498952388763428, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2706408053636551, + "step": 23804 + }, + { + "epoch": 0.47612, + "grad_norm": 1.96875, + "grad_norm_var": 0.017710113525390626, + "learning_rate": 0.0001, + "loss": 3.7679, + "loss/crossentropy": 2.2029510736465454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19871965795755386, + "step": 23806 + }, + { + "epoch": 0.47616, + "grad_norm": 2.0, + "grad_norm_var": 0.0169830322265625, + "learning_rate": 0.0001, + "loss": 3.9891, + "loss/crossentropy": 1.7823997139930725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446641623973846, + "step": 23808 + }, + { + "epoch": 0.4762, + "grad_norm": 2.078125, + "grad_norm_var": 0.016414388020833334, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 1.9810225367546082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1739773526787758, + "step": 23810 + }, + { + "epoch": 0.47624, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015429433186848958, + "learning_rate": 0.0001, + "loss": 4.1135, + "loss/crossentropy": 2.0867974758148193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936039850115776, + "step": 23812 + }, + { + "epoch": 0.47628, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017752838134765626, + "learning_rate": 0.0001, + "loss": 3.7694, + "loss/crossentropy": 1.88890939950943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18023589998483658, + "step": 23814 + }, + { + "epoch": 0.47632, + "grad_norm": 1.859375, + "grad_norm_var": 0.007838694254557292, + "learning_rate": 0.0001, + "loss": 3.7677, + "loss/crossentropy": 2.1205832958221436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20733095705509186, + "step": 23816 + }, + { + "epoch": 0.47636, + "grad_norm": 2.109375, + "grad_norm_var": 0.009804026285807291, + "learning_rate": 0.0001, + "loss": 3.9361, + "loss/crossentropy": 2.1970152854919434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2084852010011673, + "step": 23818 + }, + { + "epoch": 0.4764, + "grad_norm": 2.09375, + "grad_norm_var": 0.011484527587890625, + "learning_rate": 0.0001, + "loss": 3.9324, + "loss/crossentropy": 1.9947285056114197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.181013822555542, + "step": 23820 + }, + { + "epoch": 0.47644, + "grad_norm": 1.875, + "grad_norm_var": 0.011205037434895834, + "learning_rate": 0.0001, + "loss": 3.7626, + "loss/crossentropy": 1.8331729769706726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18118757009506226, + "step": 23822 + }, + { + "epoch": 0.47648, + "grad_norm": 1.875, + "grad_norm_var": 0.0111083984375, + "learning_rate": 0.0001, + "loss": 4.0555, + "loss/crossentropy": 2.191226840019226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.202358178794384, + "step": 23824 + }, + { + "epoch": 0.47652, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009348297119140625, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 1.9327014088630676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17946144193410873, + "step": 23826 + }, + { + "epoch": 0.47656, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007482655843098958, + "learning_rate": 0.0001, + "loss": 3.8711, + "loss/crossentropy": 2.2243237495422363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19515040516853333, + "step": 23828 + }, + { + "epoch": 0.4766, + "grad_norm": 1.8984375, + "grad_norm_var": 0.00675048828125, + "learning_rate": 0.0001, + "loss": 4.1158, + "loss/crossentropy": 2.176102042198181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19702810794115067, + "step": 23830 + }, + { + "epoch": 0.47664, + "grad_norm": 1.875, + "grad_norm_var": 0.006197102864583333, + "learning_rate": 0.0001, + "loss": 4.0918, + "loss/crossentropy": 2.1210728883743286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.222815603017807, + "step": 23832 + }, + { + "epoch": 0.47668, + "grad_norm": 1.984375, + "grad_norm_var": 0.004134114583333333, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 1.9496826529502869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17903053760528564, + "step": 23834 + }, + { + "epoch": 0.47672, + "grad_norm": 1.890625, + "grad_norm_var": 0.0017575581868489584, + "learning_rate": 0.0001, + "loss": 3.9558, + "loss/crossentropy": 2.139560639858246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20509754866361618, + "step": 23836 + }, + { + "epoch": 0.47676, + "grad_norm": 1.875, + "grad_norm_var": 0.0016591389973958333, + "learning_rate": 0.0001, + "loss": 4.0937, + "loss/crossentropy": 2.102014422416687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19706308841705322, + "step": 23838 + }, + { + "epoch": 0.4768, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0017486572265625, + "learning_rate": 0.0001, + "loss": 3.8519, + "loss/crossentropy": 1.9479581117630005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007412880659103, + "step": 23840 + }, + { + "epoch": 0.47684, + "grad_norm": 1.921875, + "grad_norm_var": 0.003177897135416667, + "learning_rate": 0.0001, + "loss": 4.2284, + "loss/crossentropy": 1.8896448016166687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19973865151405334, + "step": 23842 + }, + { + "epoch": 0.47688, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0030995686848958332, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 2.057901084423065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19682063907384872, + "step": 23844 + }, + { + "epoch": 0.47692, + "grad_norm": 2.0, + "grad_norm_var": 0.007355753580729167, + "learning_rate": 0.0001, + "loss": 4.2305, + "loss/crossentropy": 1.9882251024246216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18287301808595657, + "step": 23846 + }, + { + "epoch": 0.47696, + "grad_norm": 1.953125, + "grad_norm_var": 0.008421834309895833, + "learning_rate": 0.0001, + "loss": 4.1016, + "loss/crossentropy": 1.9481773972511292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17307578027248383, + "step": 23848 + }, + { + "epoch": 0.477, + "grad_norm": 1.828125, + "grad_norm_var": 0.01718928019205729, + "learning_rate": 0.0001, + "loss": 3.8738, + "loss/crossentropy": 1.8696550130844116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18704131245613098, + "step": 23850 + }, + { + "epoch": 0.47704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0187652587890625, + "learning_rate": 0.0001, + "loss": 3.8426, + "loss/crossentropy": 2.0851826667785645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18556270003318787, + "step": 23852 + }, + { + "epoch": 0.47708, + "grad_norm": 2.09375, + "grad_norm_var": 0.018943023681640626, + "learning_rate": 0.0001, + "loss": 4.2501, + "loss/crossentropy": 2.3411136865615845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225515678524971, + "step": 23854 + }, + { + "epoch": 0.47712, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01817804972330729, + "learning_rate": 0.0001, + "loss": 3.7964, + "loss/crossentropy": 1.944790780544281, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195308655500412, + "step": 23856 + }, + { + "epoch": 0.47716, + "grad_norm": 2.328125, + "grad_norm_var": 0.026594034830729165, + "learning_rate": 0.0001, + "loss": 4.004, + "loss/crossentropy": 1.8672103881835938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18687407672405243, + "step": 23858 + }, + { + "epoch": 0.4772, + "grad_norm": 1.9453125, + "grad_norm_var": 0.026889801025390625, + "learning_rate": 0.0001, + "loss": 3.826, + "loss/crossentropy": 1.9624406099319458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19766812026500702, + "step": 23860 + }, + { + "epoch": 0.47724, + "grad_norm": 2.03125, + "grad_norm_var": 0.026609039306640624, + "learning_rate": 0.0001, + "loss": 3.997, + "loss/crossentropy": 2.1671148538589478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916585937142372, + "step": 23862 + }, + { + "epoch": 0.47728, + "grad_norm": 1.921875, + "grad_norm_var": 0.026668039957682292, + "learning_rate": 0.0001, + "loss": 4.0202, + "loss/crossentropy": 1.8473829627037048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16666045784950256, + "step": 23864 + }, + { + "epoch": 0.47732, + "grad_norm": 2.015625, + "grad_norm_var": 0.018182118733723957, + "learning_rate": 0.0001, + "loss": 4.0035, + "loss/crossentropy": 2.0888859033584595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19743109494447708, + "step": 23866 + }, + { + "epoch": 0.47736, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01679051717122396, + "learning_rate": 0.0001, + "loss": 3.8915, + "loss/crossentropy": 2.0148197412490845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2103613168001175, + "step": 23868 + }, + { + "epoch": 0.4774, + "grad_norm": 1.9375, + "grad_norm_var": 0.015550740559895833, + "learning_rate": 0.0001, + "loss": 3.9269, + "loss/crossentropy": 1.9780926704406738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18435538560152054, + "step": 23870 + }, + { + "epoch": 0.47744, + "grad_norm": 2.109375, + "grad_norm_var": 0.016068522135416666, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 2.026561677455902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19771291315555573, + "step": 23872 + }, + { + "epoch": 0.47748, + "grad_norm": 1.90625, + "grad_norm_var": 0.005606842041015625, + "learning_rate": 0.0001, + "loss": 3.9616, + "loss/crossentropy": 2.029239237308502, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19737554341554642, + "step": 23874 + }, + { + "epoch": 0.47752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005360666910807292, + "learning_rate": 0.0001, + "loss": 3.9622, + "loss/crossentropy": 2.32661509513855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728177577257156, + "step": 23876 + }, + { + "epoch": 0.47756, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005086008707682292, + "learning_rate": 0.0001, + "loss": 3.8965, + "loss/crossentropy": 2.1897658109664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915959268808365, + "step": 23878 + }, + { + "epoch": 0.4776, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004889933268229166, + "learning_rate": 0.0001, + "loss": 4.1431, + "loss/crossentropy": 2.3252129554748535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21492170542478561, + "step": 23880 + }, + { + "epoch": 0.47764, + "grad_norm": 1.8125, + "grad_norm_var": 0.005492146809895833, + "learning_rate": 0.0001, + "loss": 3.8796, + "loss/crossentropy": 1.6142144203186035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1473793387413025, + "step": 23882 + }, + { + "epoch": 0.47768, + "grad_norm": 2.0, + "grad_norm_var": 0.005702463785807291, + "learning_rate": 0.0001, + "loss": 4.2673, + "loss/crossentropy": 2.3027801513671875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21568017452955246, + "step": 23884 + }, + { + "epoch": 0.47772, + "grad_norm": 2.046875, + "grad_norm_var": 0.006791178385416667, + "learning_rate": 0.0001, + "loss": 3.9876, + "loss/crossentropy": 1.7805312871932983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376665353775024, + "step": 23886 + }, + { + "epoch": 0.47776, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00482177734375, + "learning_rate": 0.0001, + "loss": 4.0035, + "loss/crossentropy": 2.011800706386566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892775595188141, + "step": 23888 + }, + { + "epoch": 0.4778, + "grad_norm": 1.8125, + "grad_norm_var": 0.005649566650390625, + "learning_rate": 0.0001, + "loss": 4.0836, + "loss/crossentropy": 2.257364869117737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021905705332756, + "step": 23890 + }, + { + "epoch": 0.47784, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005641428629557291, + "learning_rate": 0.0001, + "loss": 4.0778, + "loss/crossentropy": 2.211492657661438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19960176944732666, + "step": 23892 + }, + { + "epoch": 0.47788, + "grad_norm": 2.046875, + "grad_norm_var": 0.006807454427083333, + "learning_rate": 0.0001, + "loss": 3.8243, + "loss/crossentropy": 2.250791549682617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19187939167022705, + "step": 23894 + }, + { + "epoch": 0.47792, + "grad_norm": 1.953125, + "grad_norm_var": 0.0075927734375, + "learning_rate": 0.0001, + "loss": 3.9676, + "loss/crossentropy": 1.8536748886108398, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1737053319811821, + "step": 23896 + }, + { + "epoch": 0.47796, + "grad_norm": 2.078125, + "grad_norm_var": 0.014509073893229167, + "learning_rate": 0.0001, + "loss": 4.5734, + "loss/crossentropy": 2.1692601442337036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785165041685104, + "step": 23898 + }, + { + "epoch": 0.478, + "grad_norm": 2.046875, + "grad_norm_var": 0.014826456705729166, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 2.229991912841797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22324570268392563, + "step": 23900 + }, + { + "epoch": 0.47804, + "grad_norm": 1.90625, + "grad_norm_var": 0.014045206705729167, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.2393418550491333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19389375299215317, + "step": 23902 + }, + { + "epoch": 0.47808, + "grad_norm": 2.015625, + "grad_norm_var": 0.014009348551432292, + "learning_rate": 0.0001, + "loss": 4.1504, + "loss/crossentropy": 2.072646141052246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844239741563797, + "step": 23904 + }, + { + "epoch": 0.47812, + "grad_norm": 1.8046875, + "grad_norm_var": 0.014202626546223958, + "learning_rate": 0.0001, + "loss": 4.1153, + "loss/crossentropy": 2.3049235343933105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123921662569046, + "step": 23906 + }, + { + "epoch": 0.47816, + "grad_norm": 1.84375, + "grad_norm_var": 0.014964803059895834, + "learning_rate": 0.0001, + "loss": 3.7974, + "loss/crossentropy": 1.9755294919013977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17440249770879745, + "step": 23908 + }, + { + "epoch": 0.4782, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013402303059895834, + "learning_rate": 0.0001, + "loss": 3.8354, + "loss/crossentropy": 1.8338207602500916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18320723623037338, + "step": 23910 + }, + { + "epoch": 0.47824, + "grad_norm": 2.453125, + "grad_norm_var": 0.026493326822916666, + "learning_rate": 0.0001, + "loss": 4.3086, + "loss/crossentropy": 2.079473555088043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2535192593932152, + "step": 23912 + }, + { + "epoch": 0.47828, + "grad_norm": 1.796875, + "grad_norm_var": 0.023119099934895835, + "learning_rate": 0.0001, + "loss": 3.6006, + "loss/crossentropy": 1.7429603934288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17608244717121124, + "step": 23914 + }, + { + "epoch": 0.47832, + "grad_norm": 1.875, + "grad_norm_var": 0.023435211181640624, + "learning_rate": 0.0001, + "loss": 3.9687, + "loss/crossentropy": 2.1863731145858765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18513252586126328, + "step": 23916 + }, + { + "epoch": 0.47836, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0236083984375, + "learning_rate": 0.0001, + "loss": 3.948, + "loss/crossentropy": 2.1431294679641724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22228983789682388, + "step": 23918 + }, + { + "epoch": 0.4784, + "grad_norm": 1.921875, + "grad_norm_var": 0.023827107747395833, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.8407437801361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16755413264036179, + "step": 23920 + }, + { + "epoch": 0.47844, + "grad_norm": 2.140625, + "grad_norm_var": 0.025229644775390626, + "learning_rate": 0.0001, + "loss": 4.2533, + "loss/crossentropy": 2.0921709537506104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20164211839437485, + "step": 23922 + }, + { + "epoch": 0.47848, + "grad_norm": 1.90625, + "grad_norm_var": 0.04986979166666667, + "learning_rate": 0.0001, + "loss": 3.9155, + "loss/crossentropy": 1.5495757460594177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741114780306816, + "step": 23924 + }, + { + "epoch": 0.47852, + "grad_norm": 2.015625, + "grad_norm_var": 0.050510406494140625, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 2.0054045915603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18482672423124313, + "step": 23926 + }, + { + "epoch": 0.47856, + "grad_norm": 3.015625, + "grad_norm_var": 0.10667724609375, + "learning_rate": 0.0001, + "loss": 4.3504, + "loss/crossentropy": 1.9925623536109924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19737054407596588, + "step": 23928 + }, + { + "epoch": 0.4786, + "grad_norm": 2.09375, + "grad_norm_var": 0.0996002197265625, + "learning_rate": 0.0001, + "loss": 4.1226, + "loss/crossentropy": 1.9019532203674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19580943882465363, + "step": 23930 + }, + { + "epoch": 0.47864, + "grad_norm": 1.8828125, + "grad_norm_var": 0.10124104817708333, + "learning_rate": 0.0001, + "loss": 3.969, + "loss/crossentropy": 1.9665276408195496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18561270087957382, + "step": 23932 + }, + { + "epoch": 0.47868, + "grad_norm": 2.046875, + "grad_norm_var": 0.10370992024739584, + "learning_rate": 0.0001, + "loss": 4.3574, + "loss/crossentropy": 2.390560507774353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.194176584482193, + "step": 23934 + }, + { + "epoch": 0.47872, + "grad_norm": 2.046875, + "grad_norm_var": 0.10062026977539062, + "learning_rate": 0.0001, + "loss": 4.2517, + "loss/crossentropy": 2.1132951974868774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915990635752678, + "step": 23936 + }, + { + "epoch": 0.47876, + "grad_norm": 1.9921875, + "grad_norm_var": 0.1031158447265625, + "learning_rate": 0.0001, + "loss": 4.0203, + "loss/crossentropy": 1.8993237018585205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18519867211580276, + "step": 23938 + }, + { + "epoch": 0.4788, + "grad_norm": 2.0, + "grad_norm_var": 0.08478190104166666, + "learning_rate": 0.0001, + "loss": 4.546, + "loss/crossentropy": 2.1111881732940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20509092509746552, + "step": 23940 + }, + { + "epoch": 0.47884, + "grad_norm": 1.84375, + "grad_norm_var": 0.08461812337239584, + "learning_rate": 0.0001, + "loss": 4.1067, + "loss/crossentropy": 2.3452861309051514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19262470304965973, + "step": 23942 + }, + { + "epoch": 0.47888, + "grad_norm": 1.859375, + "grad_norm_var": 0.014733632405598959, + "learning_rate": 0.0001, + "loss": 3.8591, + "loss/crossentropy": 2.3471847772598267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20432017743587494, + "step": 23944 + }, + { + "epoch": 0.47892, + "grad_norm": 1.875, + "grad_norm_var": 0.01253662109375, + "learning_rate": 0.0001, + "loss": 3.9816, + "loss/crossentropy": 1.9720887541770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19780997186899185, + "step": 23946 + }, + { + "epoch": 0.47896, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011124420166015624, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.311566114425659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22458520531654358, + "step": 23948 + }, + { + "epoch": 0.479, + "grad_norm": 1.859375, + "grad_norm_var": 0.011009724934895833, + "learning_rate": 0.0001, + "loss": 3.7254, + "loss/crossentropy": 1.9993168711662292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1858225166797638, + "step": 23950 + }, + { + "epoch": 0.47904, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010343170166015625, + "learning_rate": 0.0001, + "loss": 3.8156, + "loss/crossentropy": 2.0514512062072754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19098126888275146, + "step": 23952 + }, + { + "epoch": 0.47908, + "grad_norm": 2.140625, + "grad_norm_var": 0.013248443603515625, + "learning_rate": 0.0001, + "loss": 3.6999, + "loss/crossentropy": 1.6041250824928284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15559390932321548, + "step": 23954 + }, + { + "epoch": 0.47912, + "grad_norm": 1.875, + "grad_norm_var": 0.0064117431640625, + "learning_rate": 0.0001, + "loss": 3.946, + "loss/crossentropy": 1.6981446146965027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16395021975040436, + "step": 23956 + }, + { + "epoch": 0.47916, + "grad_norm": 1.875, + "grad_norm_var": 0.007368977864583333, + "learning_rate": 0.0001, + "loss": 3.9571, + "loss/crossentropy": 1.6894282102584839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18198157846927643, + "step": 23958 + }, + { + "epoch": 0.4792, + "grad_norm": 2.875, + "grad_norm_var": 0.06331278483072916, + "learning_rate": 0.0001, + "loss": 4.1842, + "loss/crossentropy": 2.1354450583457947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053254321217537, + "step": 23960 + }, + { + "epoch": 0.47924, + "grad_norm": 1.8125, + "grad_norm_var": 0.06447652180989584, + "learning_rate": 0.0001, + "loss": 3.6665, + "loss/crossentropy": 1.8701319098472595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1739087551832199, + "step": 23962 + }, + { + "epoch": 0.47928, + "grad_norm": 1.984375, + "grad_norm_var": 0.06447347005208333, + "learning_rate": 0.0001, + "loss": 4.0016, + "loss/crossentropy": 2.1540639400482178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914234757423401, + "step": 23964 + }, + { + "epoch": 0.47932, + "grad_norm": 2.0625, + "grad_norm_var": 0.062170155843098956, + "learning_rate": 0.0001, + "loss": 3.9495, + "loss/crossentropy": 1.6824876070022583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17504771798849106, + "step": 23966 + }, + { + "epoch": 0.47936, + "grad_norm": 1.921875, + "grad_norm_var": 0.06093114217122396, + "learning_rate": 0.0001, + "loss": 4.0193, + "loss/crossentropy": 1.893419086933136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18423987925052643, + "step": 23968 + }, + { + "epoch": 0.4794, + "grad_norm": 1.84375, + "grad_norm_var": 0.0600738525390625, + "learning_rate": 0.0001, + "loss": 4.1371, + "loss/crossentropy": 1.759018063545227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881185621023178, + "step": 23970 + }, + { + "epoch": 0.47944, + "grad_norm": 2.21875, + "grad_norm_var": 0.062263743082682295, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.1234898567199707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2142435535788536, + "step": 23972 + }, + { + "epoch": 0.47948, + "grad_norm": 1.8125, + "grad_norm_var": 0.06409403483072916, + "learning_rate": 0.0001, + "loss": 3.9597, + "loss/crossentropy": 2.260958671569824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19538094103336334, + "step": 23974 + }, + { + "epoch": 0.47952, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0109039306640625, + "learning_rate": 0.0001, + "loss": 4.0813, + "loss/crossentropy": 2.004380762577057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982579454779625, + "step": 23976 + }, + { + "epoch": 0.47956, + "grad_norm": 1.7421875, + "grad_norm_var": 0.012544759114583333, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 2.038970112800598, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289468228816986, + "step": 23978 + }, + { + "epoch": 0.4796, + "grad_norm": 1.8125, + "grad_norm_var": 0.013361612955729166, + "learning_rate": 0.0001, + "loss": 4.0707, + "loss/crossentropy": 2.291250705718994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19521041214466095, + "step": 23980 + }, + { + "epoch": 0.47964, + "grad_norm": 2.078125, + "grad_norm_var": 0.013537343343098958, + "learning_rate": 0.0001, + "loss": 4.2081, + "loss/crossentropy": 2.03319388628006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981820985674858, + "step": 23982 + }, + { + "epoch": 0.47968, + "grad_norm": 1.953125, + "grad_norm_var": 0.017899322509765624, + "learning_rate": 0.0001, + "loss": 4.259, + "loss/crossentropy": 2.3277642726898193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22071011364459991, + "step": 23984 + }, + { + "epoch": 0.47972, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017724609375, + "learning_rate": 0.0001, + "loss": 4.0444, + "loss/crossentropy": 2.2068026065826416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19094634801149368, + "step": 23986 + }, + { + "epoch": 0.47976, + "grad_norm": 1.953125, + "grad_norm_var": 0.012475331624348959, + "learning_rate": 0.0001, + "loss": 4.0603, + "loss/crossentropy": 2.3219540119171143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21838166564702988, + "step": 23988 + }, + { + "epoch": 0.4798, + "grad_norm": 1.8125, + "grad_norm_var": 0.012743123372395833, + "learning_rate": 0.0001, + "loss": 3.9701, + "loss/crossentropy": 1.9068449139595032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19176269322633743, + "step": 23990 + }, + { + "epoch": 0.47984, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01620661417643229, + "learning_rate": 0.0001, + "loss": 3.8448, + "loss/crossentropy": 1.8598286509513855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17854613065719604, + "step": 23992 + }, + { + "epoch": 0.47988, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013670857747395833, + "learning_rate": 0.0001, + "loss": 4.3274, + "loss/crossentropy": 2.3834491968154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21013744175434113, + "step": 23994 + }, + { + "epoch": 0.47992, + "grad_norm": 1.7890625, + "grad_norm_var": 0.014009602864583333, + "learning_rate": 0.0001, + "loss": 3.8976, + "loss/crossentropy": 1.9524520635604858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17633382230997086, + "step": 23996 + }, + { + "epoch": 0.47996, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0135162353515625, + "learning_rate": 0.0001, + "loss": 3.8024, + "loss/crossentropy": 1.8082820773124695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17721424251794815, + "step": 23998 + }, + { + "epoch": 0.48, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0076983133951822914, + "learning_rate": 0.0001, + "loss": 4.0181, + "loss/crossentropy": 2.2108744382858276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21027880907058716, + "step": 24000 + }, + { + "epoch": 0.48004, + "grad_norm": 1.921875, + "grad_norm_var": 0.007533518473307291, + "learning_rate": 0.0001, + "loss": 4.0885, + "loss/crossentropy": 2.097791016101837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928793340921402, + "step": 24002 + }, + { + "epoch": 0.48008, + "grad_norm": 1.875, + "grad_norm_var": 0.008429972330729167, + "learning_rate": 0.0001, + "loss": 4.1388, + "loss/crossentropy": 2.2565219402313232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21726776659488678, + "step": 24004 + }, + { + "epoch": 0.48012, + "grad_norm": 1.84375, + "grad_norm_var": 0.007706451416015625, + "learning_rate": 0.0001, + "loss": 4.0097, + "loss/crossentropy": 2.260462522506714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973881646990776, + "step": 24006 + }, + { + "epoch": 0.48016, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007713826497395834, + "learning_rate": 0.0001, + "loss": 4.4655, + "loss/crossentropy": 2.2929039001464844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804804146289825, + "step": 24008 + }, + { + "epoch": 0.4802, + "grad_norm": 1.890625, + "grad_norm_var": 0.008017730712890626, + "learning_rate": 0.0001, + "loss": 3.8324, + "loss/crossentropy": 2.1556389331817627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19846376031637192, + "step": 24010 + }, + { + "epoch": 0.48024, + "grad_norm": 2.078125, + "grad_norm_var": 0.008514149983723959, + "learning_rate": 0.0001, + "loss": 4.2922, + "loss/crossentropy": 2.2187989950180054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21181489527225494, + "step": 24012 + }, + { + "epoch": 0.48028, + "grad_norm": 1.890625, + "grad_norm_var": 0.007230631510416667, + "learning_rate": 0.0001, + "loss": 3.7714, + "loss/crossentropy": 1.7474397420883179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641416728496552, + "step": 24014 + }, + { + "epoch": 0.48032, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006388346354166667, + "learning_rate": 0.0001, + "loss": 4.2139, + "loss/crossentropy": 2.216716170310974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20577985048294067, + "step": 24016 + }, + { + "epoch": 0.48036, + "grad_norm": 1.8046875, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 3.9804, + "loss/crossentropy": 2.004529654979706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19065643101930618, + "step": 24018 + }, + { + "epoch": 0.4804, + "grad_norm": 2.0, + "grad_norm_var": 0.007054646809895833, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 2.1064136028289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20194996893405914, + "step": 24020 + }, + { + "epoch": 0.48044, + "grad_norm": 1.8125, + "grad_norm_var": 0.008208974202473959, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.9716034531593323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18451768904924393, + "step": 24022 + }, + { + "epoch": 0.48048, + "grad_norm": 2.03125, + "grad_norm_var": 0.0073394775390625, + "learning_rate": 0.0001, + "loss": 4.2538, + "loss/crossentropy": 1.9206894636154175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21303895115852356, + "step": 24024 + }, + { + "epoch": 0.48052, + "grad_norm": 1.859375, + "grad_norm_var": 0.00718994140625, + "learning_rate": 0.0001, + "loss": 4.0227, + "loss/crossentropy": 2.2184818983078003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.205161914229393, + "step": 24026 + }, + { + "epoch": 0.48056, + "grad_norm": 2.21875, + "grad_norm_var": 0.061962890625, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 1.6741513013839722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18154188245534897, + "step": 24028 + }, + { + "epoch": 0.4806, + "grad_norm": 2.96875, + "grad_norm_var": 0.11687418619791666, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 1.9249740839004517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18673353642225266, + "step": 24030 + }, + { + "epoch": 0.48064, + "grad_norm": 2.0625, + "grad_norm_var": 0.11699930826822917, + "learning_rate": 0.0001, + "loss": 3.9456, + "loss/crossentropy": 1.7694995403289795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18005912005901337, + "step": 24032 + }, + { + "epoch": 0.48068, + "grad_norm": 1.9375, + "grad_norm_var": 0.11251220703125, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 1.797115981578827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15852996706962585, + "step": 24034 + }, + { + "epoch": 0.48072, + "grad_norm": 1.8828125, + "grad_norm_var": 0.11373062133789062, + "learning_rate": 0.0001, + "loss": 4.0262, + "loss/crossentropy": 2.2001808881759644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20976591855287552, + "step": 24036 + }, + { + "epoch": 0.48076, + "grad_norm": 1.984375, + "grad_norm_var": 0.11187108357747395, + "learning_rate": 0.0001, + "loss": 4.1885, + "loss/crossentropy": 2.24720299243927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19293447583913803, + "step": 24038 + }, + { + "epoch": 0.4808, + "grad_norm": 1.84375, + "grad_norm_var": 0.11860249837239584, + "learning_rate": 0.0001, + "loss": 3.8017, + "loss/crossentropy": 2.0160406827926636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19429870694875717, + "step": 24040 + }, + { + "epoch": 0.48084, + "grad_norm": 2.078125, + "grad_norm_var": 0.11473770141601562, + "learning_rate": 0.0001, + "loss": 3.7577, + "loss/crossentropy": 1.6300169229507446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16134048998355865, + "step": 24042 + }, + { + "epoch": 0.48088, + "grad_norm": 1.8125, + "grad_norm_var": 0.0718505859375, + "learning_rate": 0.0001, + "loss": 3.7797, + "loss/crossentropy": 1.7445127367973328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16317147016525269, + "step": 24044 + }, + { + "epoch": 0.48092, + "grad_norm": 2.15625, + "grad_norm_var": 0.009417470296223958, + "learning_rate": 0.0001, + "loss": 3.6375, + "loss/crossentropy": 1.8674694299697876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713745266199112, + "step": 24046 + }, + { + "epoch": 0.48096, + "grad_norm": 1.75, + "grad_norm_var": 0.01416015625, + "learning_rate": 0.0001, + "loss": 3.709, + "loss/crossentropy": 1.9868064522743225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17037389427423477, + "step": 24048 + }, + { + "epoch": 0.481, + "grad_norm": 1.828125, + "grad_norm_var": 0.02504450480143229, + "learning_rate": 0.0001, + "loss": 3.9139, + "loss/crossentropy": 1.8376989960670471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17913921177387238, + "step": 24050 + }, + { + "epoch": 0.48104, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02818781534830729, + "learning_rate": 0.0001, + "loss": 3.9877, + "loss/crossentropy": 2.100782632827759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20613420754671097, + "step": 24052 + }, + { + "epoch": 0.48108, + "grad_norm": 1.96875, + "grad_norm_var": 0.033176422119140625, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.20209538936615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972164735198021, + "step": 24054 + }, + { + "epoch": 0.48112, + "grad_norm": 1.75, + "grad_norm_var": 0.0370361328125, + "learning_rate": 0.0001, + "loss": 4.1346, + "loss/crossentropy": 2.3585156202316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20321515947580338, + "step": 24056 + }, + { + "epoch": 0.48116, + "grad_norm": 1.90625, + "grad_norm_var": 0.03779678344726563, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 1.8413453698158264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18250074982643127, + "step": 24058 + }, + { + "epoch": 0.4812, + "grad_norm": 2.0, + "grad_norm_var": 0.03765055338541667, + "learning_rate": 0.0001, + "loss": 3.9894, + "loss/crossentropy": 1.975302815437317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21386513113975525, + "step": 24060 + }, + { + "epoch": 0.48124, + "grad_norm": 1.90625, + "grad_norm_var": 0.03631998697916667, + "learning_rate": 0.0001, + "loss": 4.0553, + "loss/crossentropy": 2.029150128364563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18910515308380127, + "step": 24062 + }, + { + "epoch": 0.48128, + "grad_norm": 1.8671875, + "grad_norm_var": 0.031648508707682294, + "learning_rate": 0.0001, + "loss": 3.9889, + "loss/crossentropy": 2.030007481575012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18395943939685822, + "step": 24064 + }, + { + "epoch": 0.48132, + "grad_norm": 2.109375, + "grad_norm_var": 0.026610310872395834, + "learning_rate": 0.0001, + "loss": 3.9652, + "loss/crossentropy": 2.0372042059898376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20342208445072174, + "step": 24066 + }, + { + "epoch": 0.48136, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0240386962890625, + "learning_rate": 0.0001, + "loss": 3.6604, + "loss/crossentropy": 2.1543315649032593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886550337076187, + "step": 24068 + }, + { + "epoch": 0.4814, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01649754842122396, + "learning_rate": 0.0001, + "loss": 3.8595, + "loss/crossentropy": 2.125926434993744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22634702920913696, + "step": 24070 + }, + { + "epoch": 0.48144, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009699503580729166, + "learning_rate": 0.0001, + "loss": 3.879, + "loss/crossentropy": 2.057066559791565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19772379100322723, + "step": 24072 + }, + { + "epoch": 0.48148, + "grad_norm": 1.9375, + "grad_norm_var": 0.010151926676432292, + "learning_rate": 0.0001, + "loss": 3.8897, + "loss/crossentropy": 1.6535254120826721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16015072911977768, + "step": 24074 + }, + { + "epoch": 0.48152, + "grad_norm": 1.859375, + "grad_norm_var": 0.009471638997395834, + "learning_rate": 0.0001, + "loss": 3.8221, + "loss/crossentropy": 1.9959819912910461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19117611646652222, + "step": 24076 + }, + { + "epoch": 0.48156, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008251698811848958, + "learning_rate": 0.0001, + "loss": 3.9646, + "loss/crossentropy": 1.8243607878684998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17835189402103424, + "step": 24078 + }, + { + "epoch": 0.4816, + "grad_norm": 2.09375, + "grad_norm_var": 0.009779612223307291, + "learning_rate": 0.0001, + "loss": 3.8842, + "loss/crossentropy": 2.0300013422966003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206968486309052, + "step": 24080 + }, + { + "epoch": 0.48164, + "grad_norm": 1.984375, + "grad_norm_var": 0.0050771077473958336, + "learning_rate": 0.0001, + "loss": 3.8706, + "loss/crossentropy": 1.9207960963249207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.195884570479393, + "step": 24082 + }, + { + "epoch": 0.48168, + "grad_norm": 1.84375, + "grad_norm_var": 0.004903157552083333, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.7780911922454834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16665860265493393, + "step": 24084 + }, + { + "epoch": 0.48172, + "grad_norm": 1.953125, + "grad_norm_var": 0.004811350504557292, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 2.028922200202942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19241196662187576, + "step": 24086 + }, + { + "epoch": 0.48176, + "grad_norm": 1.8671875, + "grad_norm_var": 0.004603830973307291, + "learning_rate": 0.0001, + "loss": 4.1312, + "loss/crossentropy": 2.4450284242630005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077808380126953, + "step": 24088 + }, + { + "epoch": 0.4818, + "grad_norm": 1.7578125, + "grad_norm_var": 0.006769816080729167, + "learning_rate": 0.0001, + "loss": 4.1001, + "loss/crossentropy": 2.0783454179763794, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609202980995178, + "step": 24090 + }, + { + "epoch": 0.48184, + "grad_norm": 1.90625, + "grad_norm_var": 0.006498209635416667, + "learning_rate": 0.0001, + "loss": 3.8235, + "loss/crossentropy": 1.4834896326065063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1598825305700302, + "step": 24092 + }, + { + "epoch": 0.48188, + "grad_norm": 2.25, + "grad_norm_var": 0.013226064046223958, + "learning_rate": 0.0001, + "loss": 3.9492, + "loss/crossentropy": 2.1462320685386658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19592846930027008, + "step": 24094 + }, + { + "epoch": 0.48192, + "grad_norm": 1.96875, + "grad_norm_var": 0.011735026041666667, + "learning_rate": 0.0001, + "loss": 4.1818, + "loss/crossentropy": 2.197389841079712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19548462331295013, + "step": 24096 + }, + { + "epoch": 0.48196, + "grad_norm": 1.953125, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 4.0234, + "loss/crossentropy": 2.1164830923080444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1923857405781746, + "step": 24098 + }, + { + "epoch": 0.482, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011234283447265625, + "learning_rate": 0.0001, + "loss": 3.9165, + "loss/crossentropy": 2.2651044130325317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23053671419620514, + "step": 24100 + }, + { + "epoch": 0.48204, + "grad_norm": 2.140625, + "grad_norm_var": 0.013155110677083333, + "learning_rate": 0.0001, + "loss": 4.0934, + "loss/crossentropy": 2.0696656107902527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19399120658636093, + "step": 24102 + }, + { + "epoch": 0.48208, + "grad_norm": 1.984375, + "grad_norm_var": 0.012741851806640624, + "learning_rate": 0.0001, + "loss": 3.9432, + "loss/crossentropy": 2.0894209146499634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19454246014356613, + "step": 24104 + }, + { + "epoch": 0.48212, + "grad_norm": 1.828125, + "grad_norm_var": 0.011844889322916666, + "learning_rate": 0.0001, + "loss": 3.7906, + "loss/crossentropy": 1.9821223020553589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19387958943843842, + "step": 24106 + }, + { + "epoch": 0.48216, + "grad_norm": 1.921875, + "grad_norm_var": 0.010949452718098959, + "learning_rate": 0.0001, + "loss": 4.2146, + "loss/crossentropy": 2.172493100166321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20457924157381058, + "step": 24108 + }, + { + "epoch": 0.4822, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009740956624348958, + "learning_rate": 0.0001, + "loss": 3.8386, + "loss/crossentropy": 2.0507744550704956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1748208999633789, + "step": 24110 + }, + { + "epoch": 0.48224, + "grad_norm": 1.859375, + "grad_norm_var": 0.01253662109375, + "learning_rate": 0.0001, + "loss": 3.6145, + "loss/crossentropy": 1.9859300255775452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1867031380534172, + "step": 24112 + }, + { + "epoch": 0.48228, + "grad_norm": 2.125, + "grad_norm_var": 0.0148193359375, + "learning_rate": 0.0001, + "loss": 4.089, + "loss/crossentropy": 2.1157150268554688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.208559550344944, + "step": 24114 + }, + { + "epoch": 0.48232, + "grad_norm": 1.90625, + "grad_norm_var": 0.015869140625, + "learning_rate": 0.0001, + "loss": 4.0411, + "loss/crossentropy": 2.0446948409080505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21388032287359238, + "step": 24116 + }, + { + "epoch": 0.48236, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0132232666015625, + "learning_rate": 0.0001, + "loss": 4.0558, + "loss/crossentropy": 2.1047087907791138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18818341195583344, + "step": 24118 + }, + { + "epoch": 0.4824, + "grad_norm": 2.03125, + "grad_norm_var": 0.013605753580729166, + "learning_rate": 0.0001, + "loss": 4.2172, + "loss/crossentropy": 2.235207200050354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22523081302642822, + "step": 24120 + }, + { + "epoch": 0.48244, + "grad_norm": 1.8125, + "grad_norm_var": 0.012068430582682291, + "learning_rate": 0.0001, + "loss": 4.0648, + "loss/crossentropy": 2.4851995706558228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19662421196699142, + "step": 24122 + }, + { + "epoch": 0.48248, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012980143229166666, + "learning_rate": 0.0001, + "loss": 3.9836, + "loss/crossentropy": 1.8986297249794006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18801461160182953, + "step": 24124 + }, + { + "epoch": 0.48252, + "grad_norm": 1.703125, + "grad_norm_var": 0.0140045166015625, + "learning_rate": 0.0001, + "loss": 3.8972, + "loss/crossentropy": 1.7900904417037964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16809770464897156, + "step": 24126 + }, + { + "epoch": 0.48256, + "grad_norm": 1.78125, + "grad_norm_var": 0.012914021809895834, + "learning_rate": 0.0001, + "loss": 3.8146, + "loss/crossentropy": 2.057170867919922, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19056524336338043, + "step": 24128 + }, + { + "epoch": 0.4826, + "grad_norm": 1.9375, + "grad_norm_var": 0.009600575764973958, + "learning_rate": 0.0001, + "loss": 4.0454, + "loss/crossentropy": 2.0708820819854736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18739734590053558, + "step": 24130 + }, + { + "epoch": 0.48264, + "grad_norm": 2.03125, + "grad_norm_var": 0.009989166259765625, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 1.9330047965049744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18113382160663605, + "step": 24132 + }, + { + "epoch": 0.48268, + "grad_norm": 2.0, + "grad_norm_var": 0.010178375244140624, + "learning_rate": 0.0001, + "loss": 3.9655, + "loss/crossentropy": 2.076119899749756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1978432536125183, + "step": 24134 + }, + { + "epoch": 0.48272, + "grad_norm": 2.015625, + "grad_norm_var": 0.010994211832682291, + "learning_rate": 0.0001, + "loss": 4.0518, + "loss/crossentropy": 1.9721894264221191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19195114821195602, + "step": 24136 + }, + { + "epoch": 0.48276, + "grad_norm": 1.90625, + "grad_norm_var": 0.010514322916666667, + "learning_rate": 0.0001, + "loss": 4.0326, + "loss/crossentropy": 2.2981468439102173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2356787919998169, + "step": 24138 + }, + { + "epoch": 0.4828, + "grad_norm": 2.203125, + "grad_norm_var": 0.014104970296223958, + "learning_rate": 0.0001, + "loss": 4.0693, + "loss/crossentropy": 2.1552239656448364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404700934886932, + "step": 24140 + }, + { + "epoch": 0.48284, + "grad_norm": 1.96875, + "grad_norm_var": 0.010379791259765625, + "learning_rate": 0.0001, + "loss": 4.0906, + "loss/crossentropy": 2.1359177231788635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20384428650140762, + "step": 24142 + }, + { + "epoch": 0.48288, + "grad_norm": 2.015625, + "grad_norm_var": 0.0084381103515625, + "learning_rate": 0.0001, + "loss": 4.2397, + "loss/crossentropy": 2.340220808982849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727048069238663, + "step": 24144 + }, + { + "epoch": 0.48292, + "grad_norm": 1.875, + "grad_norm_var": 0.009702301025390625, + "learning_rate": 0.0001, + "loss": 3.6822, + "loss/crossentropy": 1.7174909114837646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16191110014915466, + "step": 24146 + }, + { + "epoch": 0.48296, + "grad_norm": 1.7890625, + "grad_norm_var": 0.010876210530598958, + "learning_rate": 0.0001, + "loss": 3.8731, + "loss/crossentropy": 2.0727250576019287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.182908833026886, + "step": 24148 + }, + { + "epoch": 0.483, + "grad_norm": 2.125, + "grad_norm_var": 0.0141357421875, + "learning_rate": 0.0001, + "loss": 4.2179, + "loss/crossentropy": 2.073746085166931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22062638401985168, + "step": 24150 + }, + { + "epoch": 0.48304, + "grad_norm": 4.59375, + "grad_norm_var": 0.4433143615722656, + "learning_rate": 0.0001, + "loss": 3.8633, + "loss/crossentropy": 1.7323468327522278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16554639488458633, + "step": 24152 + }, + { + "epoch": 0.48308, + "grad_norm": 1.96875, + "grad_norm_var": 0.4394365946451823, + "learning_rate": 0.0001, + "loss": 4.0216, + "loss/crossentropy": 2.136180579662323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192221999168396, + "step": 24154 + }, + { + "epoch": 0.48312, + "grad_norm": 1.953125, + "grad_norm_var": 0.4390459696451823, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 1.8351938128471375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17663314193487167, + "step": 24156 + }, + { + "epoch": 0.48316, + "grad_norm": 1.96875, + "grad_norm_var": 0.44324722290039065, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 2.02448707818985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19494900107383728, + "step": 24158 + }, + { + "epoch": 0.4832, + "grad_norm": 1.9140625, + "grad_norm_var": 0.4480865478515625, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 2.185999810695648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869122013449669, + "step": 24160 + }, + { + "epoch": 0.48324, + "grad_norm": 2.0625, + "grad_norm_var": 0.4416168212890625, + "learning_rate": 0.0001, + "loss": 4.2421, + "loss/crossentropy": 2.16790235042572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19200150668621063, + "step": 24162 + }, + { + "epoch": 0.48328, + "grad_norm": 1.828125, + "grad_norm_var": 0.44467137654622396, + "learning_rate": 0.0001, + "loss": 3.7496, + "loss/crossentropy": 1.5323420763015747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14329544454813004, + "step": 24164 + }, + { + "epoch": 0.48332, + "grad_norm": 1.96875, + "grad_norm_var": 0.4490631103515625, + "learning_rate": 0.0001, + "loss": 4.1271, + "loss/crossentropy": 2.4106760025024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088209092617035, + "step": 24166 + }, + { + "epoch": 0.48336, + "grad_norm": 1.90625, + "grad_norm_var": 0.0195220947265625, + "learning_rate": 0.0001, + "loss": 4.1251, + "loss/crossentropy": 2.2519407272338867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21344104409217834, + "step": 24168 + }, + { + "epoch": 0.4834, + "grad_norm": 1.859375, + "grad_norm_var": 0.016771443684895835, + "learning_rate": 0.0001, + "loss": 4.129, + "loss/crossentropy": 2.4721094369888306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20769134163856506, + "step": 24170 + }, + { + "epoch": 0.48344, + "grad_norm": 2.03125, + "grad_norm_var": 0.01695734659830729, + "learning_rate": 0.0001, + "loss": 4.0163, + "loss/crossentropy": 1.794982135295868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18507220596075058, + "step": 24172 + }, + { + "epoch": 0.48348, + "grad_norm": 2.0, + "grad_norm_var": 0.01649958292643229, + "learning_rate": 0.0001, + "loss": 4.0596, + "loss/crossentropy": 2.086494565010071, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993468999862671, + "step": 24174 + }, + { + "epoch": 0.48352, + "grad_norm": 1.953125, + "grad_norm_var": 0.016112263997395834, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 1.9496687054634094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727850705385208, + "step": 24176 + }, + { + "epoch": 0.48356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005777740478515625, + "learning_rate": 0.0001, + "loss": 4.1632, + "loss/crossentropy": 2.0036423206329346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1781500205397606, + "step": 24178 + }, + { + "epoch": 0.4836, + "grad_norm": 2.0, + "grad_norm_var": 0.013263956705729166, + "learning_rate": 0.0001, + "loss": 3.9156, + "loss/crossentropy": 2.1094807386398315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822209656238556, + "step": 24180 + }, + { + "epoch": 0.48364, + "grad_norm": 2.015625, + "grad_norm_var": 0.013722483317057292, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 2.0273566246032715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21027785539627075, + "step": 24182 + }, + { + "epoch": 0.48368, + "grad_norm": 1.8203125, + "grad_norm_var": 0.014705149332682292, + "learning_rate": 0.0001, + "loss": 3.9781, + "loss/crossentropy": 1.9223063588142395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821991503238678, + "step": 24184 + }, + { + "epoch": 0.48372, + "grad_norm": 1.828125, + "grad_norm_var": 0.015067545572916667, + "learning_rate": 0.0001, + "loss": 4.0162, + "loss/crossentropy": 1.7469323873519897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376590102910995, + "step": 24186 + }, + { + "epoch": 0.48376, + "grad_norm": 1.765625, + "grad_norm_var": 0.017545572916666665, + "learning_rate": 0.0001, + "loss": 3.7141, + "loss/crossentropy": 1.7850856184959412, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17863896489143372, + "step": 24188 + }, + { + "epoch": 0.4838, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01910985310872396, + "learning_rate": 0.0001, + "loss": 4.009, + "loss/crossentropy": 1.7316967844963074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16793282330036163, + "step": 24190 + }, + { + "epoch": 0.48384, + "grad_norm": 2.09375, + "grad_norm_var": 0.02093480428059896, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 2.3408809900283813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21008720248937607, + "step": 24192 + }, + { + "epoch": 0.48388, + "grad_norm": 1.8515625, + "grad_norm_var": 0.02029393513997396, + "learning_rate": 0.0001, + "loss": 3.7861, + "loss/crossentropy": 2.0060762763023376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1912834793329239, + "step": 24194 + }, + { + "epoch": 0.48392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011189778645833334, + "learning_rate": 0.0001, + "loss": 4.1224, + "loss/crossentropy": 2.1604164838790894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1941056326031685, + "step": 24196 + }, + { + "epoch": 0.48396, + "grad_norm": 1.96875, + "grad_norm_var": 0.010371907552083334, + "learning_rate": 0.0001, + "loss": 4.2158, + "loss/crossentropy": 2.0909000635147095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20140909403562546, + "step": 24198 + }, + { + "epoch": 0.484, + "grad_norm": 1.9375, + "grad_norm_var": 0.009039052327473958, + "learning_rate": 0.0001, + "loss": 3.94, + "loss/crossentropy": 1.8877050876617432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18381226062774658, + "step": 24200 + }, + { + "epoch": 0.48404, + "grad_norm": 1.90625, + "grad_norm_var": 0.008721669514973959, + "learning_rate": 0.0001, + "loss": 3.9384, + "loss/crossentropy": 1.982464611530304, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18099818378686905, + "step": 24202 + }, + { + "epoch": 0.48408, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006481679280598959, + "learning_rate": 0.0001, + "loss": 4.1044, + "loss/crossentropy": 1.9599023461341858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18184300512075424, + "step": 24204 + }, + { + "epoch": 0.48412, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005110422770182292, + "learning_rate": 0.0001, + "loss": 4.1949, + "loss/crossentropy": 2.4100862741470337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156199961900711, + "step": 24206 + }, + { + "epoch": 0.48416, + "grad_norm": 1.921875, + "grad_norm_var": 0.0038653055826822916, + "learning_rate": 0.0001, + "loss": 4.4761, + "loss/crossentropy": 2.2121779322624207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23106029629707336, + "step": 24208 + }, + { + "epoch": 0.4842, + "grad_norm": 1.6875, + "grad_norm_var": 0.007331339518229166, + "learning_rate": 0.0001, + "loss": 3.8548, + "loss/crossentropy": 1.9747964143753052, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17455366998910904, + "step": 24210 + }, + { + "epoch": 0.48424, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010008748372395833, + "learning_rate": 0.0001, + "loss": 4.3493, + "loss/crossentropy": 2.3463014364242554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21605894714593887, + "step": 24212 + }, + { + "epoch": 0.48428, + "grad_norm": 2.15625, + "grad_norm_var": 0.012626139322916667, + "learning_rate": 0.0001, + "loss": 4.1674, + "loss/crossentropy": 2.1442391872406006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003452554345131, + "step": 24214 + }, + { + "epoch": 0.48432, + "grad_norm": 1.890625, + "grad_norm_var": 0.015803019205729168, + "learning_rate": 0.0001, + "loss": 4.2701, + "loss/crossentropy": 2.281853437423706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21563008427619934, + "step": 24216 + }, + { + "epoch": 0.48436, + "grad_norm": 1.984375, + "grad_norm_var": 0.01668065388997396, + "learning_rate": 0.0001, + "loss": 4.3137, + "loss/crossentropy": 2.303357243537903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23583707213401794, + "step": 24218 + }, + { + "epoch": 0.4844, + "grad_norm": 1.953125, + "grad_norm_var": 0.016649373372395835, + "learning_rate": 0.0001, + "loss": 4.1768, + "loss/crossentropy": 1.6764637231826782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820378378033638, + "step": 24220 + }, + { + "epoch": 0.48444, + "grad_norm": 1.921875, + "grad_norm_var": 0.01702855428059896, + "learning_rate": 0.0001, + "loss": 4.4549, + "loss/crossentropy": 2.3033594489097595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21953578293323517, + "step": 24222 + }, + { + "epoch": 0.48448, + "grad_norm": 2.0, + "grad_norm_var": 0.017354329427083332, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 2.1051379442214966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19566316902637482, + "step": 24224 + }, + { + "epoch": 0.48452, + "grad_norm": 2.0625, + "grad_norm_var": 0.009997304280598958, + "learning_rate": 0.0001, + "loss": 4.0489, + "loss/crossentropy": 2.122231125831604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20219532400369644, + "step": 24226 + }, + { + "epoch": 0.48456, + "grad_norm": 1.7265625, + "grad_norm_var": 0.013974761962890625, + "learning_rate": 0.0001, + "loss": 3.8343, + "loss/crossentropy": 1.6727787256240845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15022944658994675, + "step": 24228 + }, + { + "epoch": 0.4846, + "grad_norm": 1.90625, + "grad_norm_var": 0.012410227457682292, + "learning_rate": 0.0001, + "loss": 4.1034, + "loss/crossentropy": 2.0252939462661743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2014101818203926, + "step": 24230 + }, + { + "epoch": 0.48464, + "grad_norm": 1.9375, + "grad_norm_var": 0.011213175455729167, + "learning_rate": 0.0001, + "loss": 4.003, + "loss/crossentropy": 2.163161873817444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20025929063558578, + "step": 24232 + }, + { + "epoch": 0.48468, + "grad_norm": 1.75, + "grad_norm_var": 0.010895792643229167, + "learning_rate": 0.0001, + "loss": 3.7811, + "loss/crossentropy": 1.9188687205314636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916854828596115, + "step": 24234 + }, + { + "epoch": 0.48472, + "grad_norm": 1.953125, + "grad_norm_var": 0.009870402018229167, + "learning_rate": 0.0001, + "loss": 3.9804, + "loss/crossentropy": 1.820820927619934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1769949272274971, + "step": 24236 + }, + { + "epoch": 0.48476, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009297434488932292, + "learning_rate": 0.0001, + "loss": 3.9483, + "loss/crossentropy": 2.1655235290527344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005934715270996, + "step": 24238 + }, + { + "epoch": 0.4848, + "grad_norm": 1.9375, + "grad_norm_var": 0.009810384114583333, + "learning_rate": 0.0001, + "loss": 4.0458, + "loss/crossentropy": 1.9468095898628235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1830679327249527, + "step": 24240 + }, + { + "epoch": 0.48484, + "grad_norm": 2.0, + "grad_norm_var": 0.009535471598307291, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 2.062897562980652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2143079787492752, + "step": 24242 + }, + { + "epoch": 0.48488, + "grad_norm": 2.171875, + "grad_norm_var": 0.010389963785807291, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.0829350352287292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19584693014621735, + "step": 24244 + }, + { + "epoch": 0.48492, + "grad_norm": 1.890625, + "grad_norm_var": 0.0102447509765625, + "learning_rate": 0.0001, + "loss": 3.9724, + "loss/crossentropy": 2.0563968420028687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20057959854602814, + "step": 24246 + }, + { + "epoch": 0.48496, + "grad_norm": 2.171875, + "grad_norm_var": 0.013396962483723959, + "learning_rate": 0.0001, + "loss": 4.2274, + "loss/crossentropy": 2.1221665143966675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20288699865341187, + "step": 24248 + }, + { + "epoch": 0.485, + "grad_norm": 1.921875, + "grad_norm_var": 0.010374959309895833, + "learning_rate": 0.0001, + "loss": 4.2162, + "loss/crossentropy": 2.1751914024353027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20034758746623993, + "step": 24250 + }, + { + "epoch": 0.48504, + "grad_norm": 1.890625, + "grad_norm_var": 0.010872395833333333, + "learning_rate": 0.0001, + "loss": 3.8777, + "loss/crossentropy": 2.0292373299598694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18800581246614456, + "step": 24252 + }, + { + "epoch": 0.48508, + "grad_norm": 4.03125, + "grad_norm_var": 0.275439453125, + "learning_rate": 0.0001, + "loss": 4.0572, + "loss/crossentropy": 2.3077808618545532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23775576055049896, + "step": 24254 + }, + { + "epoch": 0.48512, + "grad_norm": 2.109375, + "grad_norm_var": 0.27331441243489585, + "learning_rate": 0.0001, + "loss": 4.0708, + "loss/crossentropy": 1.532023847103119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1606956273317337, + "step": 24256 + }, + { + "epoch": 0.48516, + "grad_norm": 2.03125, + "grad_norm_var": 0.26945699055989586, + "learning_rate": 0.0001, + "loss": 4.0351, + "loss/crossentropy": 2.5032970905303955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21631880104541779, + "step": 24258 + }, + { + "epoch": 0.4852, + "grad_norm": 1.9296875, + "grad_norm_var": 0.27319234212239585, + "learning_rate": 0.0001, + "loss": 3.9323, + "loss/crossentropy": 1.981030285358429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134312123060226, + "step": 24260 + }, + { + "epoch": 0.48524, + "grad_norm": 1.953125, + "grad_norm_var": 0.2728668212890625, + "learning_rate": 0.0001, + "loss": 3.7619, + "loss/crossentropy": 2.031146466732025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17859568446874619, + "step": 24262 + }, + { + "epoch": 0.48528, + "grad_norm": 1.8203125, + "grad_norm_var": 0.2775917053222656, + "learning_rate": 0.0001, + "loss": 4.0923, + "loss/crossentropy": 2.5168176889419556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21798798441886902, + "step": 24264 + }, + { + "epoch": 0.48532, + "grad_norm": 4.09375, + "grad_norm_var": 0.530810292561849, + "learning_rate": 0.0001, + "loss": 3.9951, + "loss/crossentropy": 2.037827789783478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2093580812215805, + "step": 24266 + }, + { + "epoch": 0.48536, + "grad_norm": 1.9453125, + "grad_norm_var": 0.5302874247233073, + "learning_rate": 0.0001, + "loss": 4.0575, + "loss/crossentropy": 2.17299222946167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18525302410125732, + "step": 24268 + }, + { + "epoch": 0.4854, + "grad_norm": 1.9296875, + "grad_norm_var": 0.2958106994628906, + "learning_rate": 0.0001, + "loss": 3.9348, + "loss/crossentropy": 2.147997260093689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097453892230988, + "step": 24270 + }, + { + "epoch": 0.48544, + "grad_norm": 1.921875, + "grad_norm_var": 0.298583984375, + "learning_rate": 0.0001, + "loss": 4.1208, + "loss/crossentropy": 2.220878005027771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18916144967079163, + "step": 24272 + }, + { + "epoch": 0.48548, + "grad_norm": 2.0625, + "grad_norm_var": 0.297265625, + "learning_rate": 0.0001, + "loss": 4.0689, + "loss/crossentropy": 2.3884263038635254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21515949070453644, + "step": 24274 + }, + { + "epoch": 0.48552, + "grad_norm": 1.8984375, + "grad_norm_var": 0.2965349833170573, + "learning_rate": 0.0001, + "loss": 4.091, + "loss/crossentropy": 2.0658962726593018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19635243713855743, + "step": 24276 + }, + { + "epoch": 0.48556, + "grad_norm": 1.8828125, + "grad_norm_var": 0.297857411702474, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 2.3192285299301147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22074441611766815, + "step": 24278 + }, + { + "epoch": 0.4856, + "grad_norm": 1.8671875, + "grad_norm_var": 0.30143941243489586, + "learning_rate": 0.0001, + "loss": 3.8625, + "loss/crossentropy": 2.082835614681244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18948380649089813, + "step": 24280 + }, + { + "epoch": 0.48564, + "grad_norm": 2.046875, + "grad_norm_var": 0.009039052327473958, + "learning_rate": 0.0001, + "loss": 3.8998, + "loss/crossentropy": 1.7844191193580627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17573316395282745, + "step": 24282 + }, + { + "epoch": 0.48568, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009161122639973958, + "learning_rate": 0.0001, + "loss": 3.8169, + "loss/crossentropy": 1.9186521768569946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1827767789363861, + "step": 24284 + }, + { + "epoch": 0.48572, + "grad_norm": 2.296875, + "grad_norm_var": 3.559098307291667, + "learning_rate": 0.0001, + "loss": 4.1294, + "loss/crossentropy": 2.0445513129234314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22182191163301468, + "step": 24286 + }, + { + "epoch": 0.48576, + "grad_norm": 2.140625, + "grad_norm_var": 3.535992177327474, + "learning_rate": 0.0001, + "loss": 3.9131, + "loss/crossentropy": 2.1303210258483887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20597820729017258, + "step": 24288 + }, + { + "epoch": 0.4858, + "grad_norm": 1.96875, + "grad_norm_var": 3.544976552327474, + "learning_rate": 0.0001, + "loss": 4.0554, + "loss/crossentropy": 1.9035282135009766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17603804916143417, + "step": 24290 + }, + { + "epoch": 0.48584, + "grad_norm": 2.109375, + "grad_norm_var": 3.5285723368326822, + "learning_rate": 0.0001, + "loss": 4.0373, + "loss/crossentropy": 2.101797640323639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19533278793096542, + "step": 24292 + }, + { + "epoch": 0.48588, + "grad_norm": 1.875, + "grad_norm_var": 3.5240071614583335, + "learning_rate": 0.0001, + "loss": 3.9883, + "loss/crossentropy": 1.9595980048179626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17600257694721222, + "step": 24294 + }, + { + "epoch": 0.48592, + "grad_norm": 1.84375, + "grad_norm_var": 3.506517537434896, + "learning_rate": 0.0001, + "loss": 3.8312, + "loss/crossentropy": 2.020717740058899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19629748165607452, + "step": 24296 + }, + { + "epoch": 0.48596, + "grad_norm": 1.875, + "grad_norm_var": 3.5107137044270833, + "learning_rate": 0.0001, + "loss": 4.4606, + "loss/crossentropy": 2.113112688064575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987774595618248, + "step": 24298 + }, + { + "epoch": 0.486, + "grad_norm": 1.984375, + "grad_norm_var": 3.4833943684895834, + "learning_rate": 0.0001, + "loss": 4.3874, + "loss/crossentropy": 2.3140907287597656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087773084640503, + "step": 24300 + }, + { + "epoch": 0.48604, + "grad_norm": 2.0, + "grad_norm_var": 0.012322743733723959, + "learning_rate": 0.0001, + "loss": 4.1962, + "loss/crossentropy": 2.137213349342346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433750957250595, + "step": 24302 + }, + { + "epoch": 0.48608, + "grad_norm": 1.765625, + "grad_norm_var": 0.015026601155598958, + "learning_rate": 0.0001, + "loss": 3.7098, + "loss/crossentropy": 1.8572564125061035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18543125689029694, + "step": 24304 + }, + { + "epoch": 0.48612, + "grad_norm": 1.7890625, + "grad_norm_var": 0.017389933268229168, + "learning_rate": 0.0001, + "loss": 3.8251, + "loss/crossentropy": 1.991189181804657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1831136792898178, + "step": 24306 + }, + { + "epoch": 0.48616, + "grad_norm": 2.015625, + "grad_norm_var": 0.016901652018229168, + "learning_rate": 0.0001, + "loss": 3.9742, + "loss/crossentropy": 1.8540424704551697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1730886772274971, + "step": 24308 + }, + { + "epoch": 0.4862, + "grad_norm": 1.875, + "grad_norm_var": 0.016747792561848957, + "learning_rate": 0.0001, + "loss": 3.668, + "loss/crossentropy": 2.0189526677131653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916787549853325, + "step": 24310 + }, + { + "epoch": 0.48624, + "grad_norm": 1.9375, + "grad_norm_var": 0.016071573893229166, + "learning_rate": 0.0001, + "loss": 4.0621, + "loss/crossentropy": 2.2053693532943726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20035792887210846, + "step": 24312 + }, + { + "epoch": 0.48628, + "grad_norm": 2.015625, + "grad_norm_var": 0.012189737955729167, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 2.2726696729660034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21893906593322754, + "step": 24314 + }, + { + "epoch": 0.48632, + "grad_norm": 1.890625, + "grad_norm_var": 0.006940714518229167, + "learning_rate": 0.0001, + "loss": 3.6041, + "loss/crossentropy": 1.9184311032295227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18804488331079483, + "step": 24316 + }, + { + "epoch": 0.48636, + "grad_norm": 1.9921875, + "grad_norm_var": 0.00714111328125, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.416733145713806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21675719320774078, + "step": 24318 + }, + { + "epoch": 0.4864, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00570068359375, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 2.2064080238342285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753097534179688, + "step": 24320 + }, + { + "epoch": 0.48644, + "grad_norm": 2.015625, + "grad_norm_var": 0.005324045817057292, + "learning_rate": 0.0001, + "loss": 4.1638, + "loss/crossentropy": 1.9959591031074524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18936610966920853, + "step": 24322 + }, + { + "epoch": 0.48648, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004613240559895833, + "learning_rate": 0.0001, + "loss": 4.0866, + "loss/crossentropy": 2.14048969745636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17960359901189804, + "step": 24324 + }, + { + "epoch": 0.48652, + "grad_norm": 2.109375, + "grad_norm_var": 0.006845855712890625, + "learning_rate": 0.0001, + "loss": 4.1387, + "loss/crossentropy": 1.6789081692695618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848226636648178, + "step": 24326 + }, + { + "epoch": 0.48656, + "grad_norm": 2.0, + "grad_norm_var": 0.0069620768229166664, + "learning_rate": 0.0001, + "loss": 4.2576, + "loss/crossentropy": 2.4790940284729004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22105763852596283, + "step": 24328 + }, + { + "epoch": 0.4866, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006981404622395834, + "learning_rate": 0.0001, + "loss": 4.1167, + "loss/crossentropy": 1.8698575496673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18414634466171265, + "step": 24330 + }, + { + "epoch": 0.48664, + "grad_norm": 2.078125, + "grad_norm_var": 0.0045969645182291664, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.1648661494255066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2052561342716217, + "step": 24332 + }, + { + "epoch": 0.48668, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0054107666015625, + "learning_rate": 0.0001, + "loss": 3.8961, + "loss/crossentropy": 1.9554332494735718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2595558241009712, + "step": 24334 + }, + { + "epoch": 0.48672, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005208079020182292, + "learning_rate": 0.0001, + "loss": 3.8727, + "loss/crossentropy": 2.120553970336914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20485319197177887, + "step": 24336 + }, + { + "epoch": 0.48676, + "grad_norm": 2.09375, + "grad_norm_var": 0.006046295166015625, + "learning_rate": 0.0001, + "loss": 3.849, + "loss/crossentropy": 1.7237398028373718, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2036430537700653, + "step": 24338 + }, + { + "epoch": 0.4868, + "grad_norm": 1.7734375, + "grad_norm_var": 0.0088043212890625, + "learning_rate": 0.0001, + "loss": 3.9799, + "loss/crossentropy": 1.7713853120803833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1712682917714119, + "step": 24340 + }, + { + "epoch": 0.48684, + "grad_norm": 1.828125, + "grad_norm_var": 0.00877685546875, + "learning_rate": 0.0001, + "loss": 3.6067, + "loss/crossentropy": 2.0136027932167053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18176132440567017, + "step": 24342 + }, + { + "epoch": 0.48688, + "grad_norm": 1.984375, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.0666, + "loss/crossentropy": 1.9793068766593933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973177120089531, + "step": 24344 + }, + { + "epoch": 0.48692, + "grad_norm": 2.140625, + "grad_norm_var": 0.011165364583333334, + "learning_rate": 0.0001, + "loss": 4.3935, + "loss/crossentropy": 2.4376041889190674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22222332656383514, + "step": 24346 + }, + { + "epoch": 0.48696, + "grad_norm": 1.921875, + "grad_norm_var": 0.010448201497395834, + "learning_rate": 0.0001, + "loss": 3.9815, + "loss/crossentropy": 1.8432115316390991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915820688009262, + "step": 24348 + }, + { + "epoch": 0.487, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010530344645182292, + "learning_rate": 0.0001, + "loss": 3.8947, + "loss/crossentropy": 1.8100075125694275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18175233900547028, + "step": 24350 + }, + { + "epoch": 0.48704, + "grad_norm": 2.015625, + "grad_norm_var": 0.0118072509765625, + "learning_rate": 0.0001, + "loss": 4.0207, + "loss/crossentropy": 2.0997090339660645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19269437342882156, + "step": 24352 + }, + { + "epoch": 0.48708, + "grad_norm": 2.125, + "grad_norm_var": 0.013529205322265625, + "learning_rate": 0.0001, + "loss": 3.9769, + "loss/crossentropy": 1.78102046251297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18288961052894592, + "step": 24354 + }, + { + "epoch": 0.48712, + "grad_norm": 1.8359375, + "grad_norm_var": 0.012227121988932292, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.006900370121002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19866903871297836, + "step": 24356 + }, + { + "epoch": 0.48716, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013036855061848958, + "learning_rate": 0.0001, + "loss": 3.7675, + "loss/crossentropy": 1.9674765467643738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17953628301620483, + "step": 24358 + }, + { + "epoch": 0.4872, + "grad_norm": 1.796875, + "grad_norm_var": 0.014772288004557292, + "learning_rate": 0.0001, + "loss": 3.8801, + "loss/crossentropy": 1.6254491806030273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16119451820850372, + "step": 24360 + }, + { + "epoch": 0.48724, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010749308268229167, + "learning_rate": 0.0001, + "loss": 4.0639, + "loss/crossentropy": 2.1395972967147827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20453596115112305, + "step": 24362 + }, + { + "epoch": 0.48728, + "grad_norm": 2.0, + "grad_norm_var": 0.011022694905598958, + "learning_rate": 0.0001, + "loss": 4.1345, + "loss/crossentropy": 2.0489049553871155, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140253186225891, + "step": 24364 + }, + { + "epoch": 0.48732, + "grad_norm": 2.046875, + "grad_norm_var": 0.011502838134765625, + "learning_rate": 0.0001, + "loss": 4.2531, + "loss/crossentropy": 2.061949372291565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2023036777973175, + "step": 24366 + }, + { + "epoch": 0.48736, + "grad_norm": 1.96875, + "grad_norm_var": 0.008658854166666667, + "learning_rate": 0.0001, + "loss": 4.0565, + "loss/crossentropy": 2.2293535470962524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20210447907447815, + "step": 24368 + }, + { + "epoch": 0.4874, + "grad_norm": 2.046875, + "grad_norm_var": 0.006525675455729167, + "learning_rate": 0.0001, + "loss": 4.0853, + "loss/crossentropy": 2.085119366645813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19112756848335266, + "step": 24370 + }, + { + "epoch": 0.48744, + "grad_norm": 2.234375, + "grad_norm_var": 0.012178548177083333, + "learning_rate": 0.0001, + "loss": 4.0025, + "loss/crossentropy": 1.8672300577163696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19416838884353638, + "step": 24372 + }, + { + "epoch": 0.48748, + "grad_norm": 1.7734375, + "grad_norm_var": 0.015378570556640625, + "learning_rate": 0.0001, + "loss": 4.0043, + "loss/crossentropy": 1.7550761699676514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23619835823774338, + "step": 24374 + }, + { + "epoch": 0.48752, + "grad_norm": 1.75, + "grad_norm_var": 0.017061360677083335, + "learning_rate": 0.0001, + "loss": 3.8778, + "loss/crossentropy": 2.199945092201233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198745958507061, + "step": 24376 + }, + { + "epoch": 0.48756, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01672948201497396, + "learning_rate": 0.0001, + "loss": 3.9917, + "loss/crossentropy": 2.005652666091919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20299483090639114, + "step": 24378 + }, + { + "epoch": 0.4876, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01756769816080729, + "learning_rate": 0.0001, + "loss": 3.8204, + "loss/crossentropy": 1.9981390237808228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19434207677841187, + "step": 24380 + }, + { + "epoch": 0.48764, + "grad_norm": 1.9296875, + "grad_norm_var": 0.017203776041666667, + "learning_rate": 0.0001, + "loss": 4.0997, + "loss/crossentropy": 1.9847465753555298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016771137714386, + "step": 24382 + }, + { + "epoch": 0.48768, + "grad_norm": 1.9296875, + "grad_norm_var": 0.017268880208333334, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 1.9587423205375671, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20353511720895767, + "step": 24384 + }, + { + "epoch": 0.48772, + "grad_norm": 1.796875, + "grad_norm_var": 0.017919921875, + "learning_rate": 0.0001, + "loss": 3.639, + "loss/crossentropy": 1.6634628176689148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18179689347743988, + "step": 24386 + }, + { + "epoch": 0.48776, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011164347330729166, + "learning_rate": 0.0001, + "loss": 4.0095, + "loss/crossentropy": 1.8678483963012695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18797649443149567, + "step": 24388 + }, + { + "epoch": 0.4878, + "grad_norm": 2.59375, + "grad_norm_var": 0.034024810791015624, + "learning_rate": 0.0001, + "loss": 4.245, + "loss/crossentropy": 2.2724742889404297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19732308387756348, + "step": 24390 + }, + { + "epoch": 0.48784, + "grad_norm": 2.09375, + "grad_norm_var": 0.030918121337890625, + "learning_rate": 0.0001, + "loss": 4.1437, + "loss/crossentropy": 2.2534377574920654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21449754387140274, + "step": 24392 + }, + { + "epoch": 0.48788, + "grad_norm": 2.03125, + "grad_norm_var": 0.03006159464518229, + "learning_rate": 0.0001, + "loss": 4.4295, + "loss/crossentropy": 2.3007233142852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21122832596302032, + "step": 24394 + }, + { + "epoch": 0.48792, + "grad_norm": 1.6796875, + "grad_norm_var": 0.036195627848307294, + "learning_rate": 0.0001, + "loss": 3.7791, + "loss/crossentropy": 2.0957794189453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196099787950516, + "step": 24396 + }, + { + "epoch": 0.48796, + "grad_norm": 1.9296875, + "grad_norm_var": 0.03947118123372396, + "learning_rate": 0.0001, + "loss": 4.2137, + "loss/crossentropy": 1.8763219118118286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841205433011055, + "step": 24398 + }, + { + "epoch": 0.488, + "grad_norm": 1.9453125, + "grad_norm_var": 0.03982747395833333, + "learning_rate": 0.0001, + "loss": 3.8093, + "loss/crossentropy": 2.2737534046173096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200559563934803, + "step": 24400 + }, + { + "epoch": 0.48804, + "grad_norm": 2.09375, + "grad_norm_var": 0.03672866821289063, + "learning_rate": 0.0001, + "loss": 3.9472, + "loss/crossentropy": 2.2588294744491577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21728847920894623, + "step": 24402 + }, + { + "epoch": 0.48808, + "grad_norm": 2.078125, + "grad_norm_var": 0.03683039347330729, + "learning_rate": 0.0001, + "loss": 4.052, + "loss/crossentropy": 2.1669589281082153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2279760167002678, + "step": 24404 + }, + { + "epoch": 0.48812, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014857737223307292, + "learning_rate": 0.0001, + "loss": 4.1939, + "loss/crossentropy": 2.229967176914215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20311905443668365, + "step": 24406 + }, + { + "epoch": 0.48816, + "grad_norm": 1.8046875, + "grad_norm_var": 0.016242472330729167, + "learning_rate": 0.0001, + "loss": 3.8149, + "loss/crossentropy": 1.7735026478767395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17489700764417648, + "step": 24408 + }, + { + "epoch": 0.4882, + "grad_norm": 2.09375, + "grad_norm_var": 0.016880035400390625, + "learning_rate": 0.0001, + "loss": 4.0679, + "loss/crossentropy": 1.6055407524108887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16866382956504822, + "step": 24410 + }, + { + "epoch": 0.48824, + "grad_norm": 2.265625, + "grad_norm_var": 0.017826080322265625, + "learning_rate": 0.0001, + "loss": 4.1063, + "loss/crossentropy": 2.0581588745117188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20155030488967896, + "step": 24412 + }, + { + "epoch": 0.48828, + "grad_norm": 2.046875, + "grad_norm_var": 0.022802734375, + "learning_rate": 0.0001, + "loss": 3.874, + "loss/crossentropy": 2.0038405656814575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854727864265442, + "step": 24414 + }, + { + "epoch": 0.48832, + "grad_norm": 2.0, + "grad_norm_var": 0.0213134765625, + "learning_rate": 0.0001, + "loss": 4.4175, + "loss/crossentropy": 2.5517786741256714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22493450343608856, + "step": 24416 + }, + { + "epoch": 0.48836, + "grad_norm": 1.96875, + "grad_norm_var": 0.021297200520833334, + "learning_rate": 0.0001, + "loss": 4.3037, + "loss/crossentropy": 1.963084876537323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878989413380623, + "step": 24418 + }, + { + "epoch": 0.4884, + "grad_norm": 2.28125, + "grad_norm_var": 0.025023396809895834, + "learning_rate": 0.0001, + "loss": 4.3639, + "loss/crossentropy": 2.093212604522705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20798558741807938, + "step": 24420 + }, + { + "epoch": 0.48844, + "grad_norm": 1.859375, + "grad_norm_var": 0.028537750244140625, + "learning_rate": 0.0001, + "loss": 3.9645, + "loss/crossentropy": 2.2910980582237244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19769180566072464, + "step": 24422 + }, + { + "epoch": 0.48848, + "grad_norm": 1.984375, + "grad_norm_var": 0.02447509765625, + "learning_rate": 0.0001, + "loss": 4.0026, + "loss/crossentropy": 1.7930738925933838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18997550755739212, + "step": 24424 + }, + { + "epoch": 0.48852, + "grad_norm": 1.9765625, + "grad_norm_var": 0.02550048828125, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 1.7596023678779602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17135074734687805, + "step": 24426 + }, + { + "epoch": 0.48856, + "grad_norm": 1.9296875, + "grad_norm_var": 0.019059244791666666, + "learning_rate": 0.0001, + "loss": 4.2215, + "loss/crossentropy": 1.6692591905593872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1735754758119583, + "step": 24428 + }, + { + "epoch": 0.4886, + "grad_norm": 2.28125, + "grad_norm_var": 0.016015625, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 1.9515159130096436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20981048792600632, + "step": 24430 + }, + { + "epoch": 0.48864, + "grad_norm": 1.7734375, + "grad_norm_var": 0.019451649983723958, + "learning_rate": 0.0001, + "loss": 3.9243, + "loss/crossentropy": 2.218432307243347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20877013355493546, + "step": 24432 + }, + { + "epoch": 0.48868, + "grad_norm": 1.984375, + "grad_norm_var": 0.01914240519205729, + "learning_rate": 0.0001, + "loss": 3.9418, + "loss/crossentropy": 1.6877312660217285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17738080769777298, + "step": 24434 + }, + { + "epoch": 0.48872, + "grad_norm": 1.90625, + "grad_norm_var": 0.013427480061848959, + "learning_rate": 0.0001, + "loss": 4.1303, + "loss/crossentropy": 2.204097032546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016325369477272, + "step": 24436 + }, + { + "epoch": 0.48876, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012330881754557292, + "learning_rate": 0.0001, + "loss": 3.8735, + "loss/crossentropy": 2.287162959575653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044740840792656, + "step": 24438 + }, + { + "epoch": 0.4888, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012743123372395833, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 2.0852694511413574, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2053818255662918, + "step": 24440 + }, + { + "epoch": 0.48884, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017170206705729166, + "learning_rate": 0.0001, + "loss": 4.075, + "loss/crossentropy": 2.070200562477112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872272714972496, + "step": 24442 + }, + { + "epoch": 0.48888, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01678466796875, + "learning_rate": 0.0001, + "loss": 4.2843, + "loss/crossentropy": 2.034148395061493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20018816739320755, + "step": 24444 + }, + { + "epoch": 0.48892, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010282389322916667, + "learning_rate": 0.0001, + "loss": 3.9187, + "loss/crossentropy": 2.2302210927009583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21468773484230042, + "step": 24446 + }, + { + "epoch": 0.48896, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009584299723307292, + "learning_rate": 0.0001, + "loss": 3.8918, + "loss/crossentropy": 1.9789615273475647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18996840715408325, + "step": 24448 + }, + { + "epoch": 0.489, + "grad_norm": 2.046875, + "grad_norm_var": 0.0099761962890625, + "learning_rate": 0.0001, + "loss": 4.2603, + "loss/crossentropy": 2.1200879216194153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20140299201011658, + "step": 24450 + }, + { + "epoch": 0.48904, + "grad_norm": 1.734375, + "grad_norm_var": 0.013475545247395833, + "learning_rate": 0.0001, + "loss": 3.6989, + "loss/crossentropy": 1.9175571203231812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18238398432731628, + "step": 24452 + }, + { + "epoch": 0.48908, + "grad_norm": 2.140625, + "grad_norm_var": 0.016527303059895835, + "learning_rate": 0.0001, + "loss": 4.1204, + "loss/crossentropy": 2.414350748062134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2311321198940277, + "step": 24454 + }, + { + "epoch": 0.48912, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016267903645833335, + "learning_rate": 0.0001, + "loss": 4.0023, + "loss/crossentropy": 2.113545060157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183447748422623, + "step": 24456 + }, + { + "epoch": 0.48916, + "grad_norm": 2.53125, + "grad_norm_var": 0.035123443603515624, + "learning_rate": 0.0001, + "loss": 4.0608, + "loss/crossentropy": 1.9957387447357178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183005228638649, + "step": 24458 + }, + { + "epoch": 0.4892, + "grad_norm": 1.9765625, + "grad_norm_var": 0.034970855712890624, + "learning_rate": 0.0001, + "loss": 4.0566, + "loss/crossentropy": 2.333039402961731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21214177459478378, + "step": 24460 + }, + { + "epoch": 0.48924, + "grad_norm": 1.8671875, + "grad_norm_var": 0.035302734375, + "learning_rate": 0.0001, + "loss": 3.876, + "loss/crossentropy": 2.0721434950828552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012626901268959, + "step": 24462 + }, + { + "epoch": 0.48928, + "grad_norm": 1.953125, + "grad_norm_var": 0.0350006103515625, + "learning_rate": 0.0001, + "loss": 3.9045, + "loss/crossentropy": 1.8990368843078613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19242222607135773, + "step": 24464 + }, + { + "epoch": 0.48932, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0349029541015625, + "learning_rate": 0.0001, + "loss": 4.0894, + "loss/crossentropy": 2.119647741317749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18154827505350113, + "step": 24466 + }, + { + "epoch": 0.48936, + "grad_norm": 1.9921875, + "grad_norm_var": 0.029679107666015624, + "learning_rate": 0.0001, + "loss": 3.9083, + "loss/crossentropy": 2.0843252539634705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2219039350748062, + "step": 24468 + }, + { + "epoch": 0.4894, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0286773681640625, + "learning_rate": 0.0001, + "loss": 3.979, + "loss/crossentropy": 2.023647725582123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18391972035169601, + "step": 24470 + }, + { + "epoch": 0.48944, + "grad_norm": 2.0, + "grad_norm_var": 0.03695475260416667, + "learning_rate": 0.0001, + "loss": 4.2427, + "loss/crossentropy": 2.1299458742141724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20914346724748611, + "step": 24472 + }, + { + "epoch": 0.48948, + "grad_norm": 1.9296875, + "grad_norm_var": 0.016690826416015624, + "learning_rate": 0.0001, + "loss": 3.734, + "loss/crossentropy": 1.7355242371559143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1707276776432991, + "step": 24474 + }, + { + "epoch": 0.48952, + "grad_norm": 2.1875, + "grad_norm_var": 0.020888010660807293, + "learning_rate": 0.0001, + "loss": 3.9611, + "loss/crossentropy": 1.7911944389343262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432906806468964, + "step": 24476 + }, + { + "epoch": 0.48956, + "grad_norm": 1.921875, + "grad_norm_var": 0.0211090087890625, + "learning_rate": 0.0001, + "loss": 3.7178, + "loss/crossentropy": 1.824588656425476, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1889827474951744, + "step": 24478 + }, + { + "epoch": 0.4896, + "grad_norm": 1.875, + "grad_norm_var": 0.020457967122395834, + "learning_rate": 0.0001, + "loss": 3.9154, + "loss/crossentropy": 1.9896376132965088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033599317073822, + "step": 24480 + }, + { + "epoch": 0.48964, + "grad_norm": 1.921875, + "grad_norm_var": 0.020213826497395834, + "learning_rate": 0.0001, + "loss": 4.0084, + "loss/crossentropy": 2.1408406496047974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20912234485149384, + "step": 24482 + }, + { + "epoch": 0.48968, + "grad_norm": 1.796875, + "grad_norm_var": 0.02083307902018229, + "learning_rate": 0.0001, + "loss": 4.0378, + "loss/crossentropy": 2.083206295967102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20860081911087036, + "step": 24484 + }, + { + "epoch": 0.48972, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02238947550455729, + "learning_rate": 0.0001, + "loss": 3.676, + "loss/crossentropy": 1.706899344921112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741488128900528, + "step": 24486 + }, + { + "epoch": 0.48976, + "grad_norm": 1.90625, + "grad_norm_var": 0.011774698893229166, + "learning_rate": 0.0001, + "loss": 4.0392, + "loss/crossentropy": 1.7981711030006409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1577363833785057, + "step": 24488 + }, + { + "epoch": 0.4898, + "grad_norm": 2.015625, + "grad_norm_var": 0.010716756184895834, + "learning_rate": 0.0001, + "loss": 4.0149, + "loss/crossentropy": 2.3241711854934692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463860034942627, + "step": 24490 + }, + { + "epoch": 0.48984, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006810506184895833, + "learning_rate": 0.0001, + "loss": 3.8504, + "loss/crossentropy": 1.8822330832481384, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18153046071529388, + "step": 24492 + }, + { + "epoch": 0.48988, + "grad_norm": 1.890625, + "grad_norm_var": 0.006406402587890625, + "learning_rate": 0.0001, + "loss": 4.2485, + "loss/crossentropy": 2.4287012815475464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21238531172275543, + "step": 24494 + }, + { + "epoch": 0.48992, + "grad_norm": 1.828125, + "grad_norm_var": 0.005783843994140625, + "learning_rate": 0.0001, + "loss": 3.898, + "loss/crossentropy": 2.1026824712753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19364942610263824, + "step": 24496 + }, + { + "epoch": 0.48996, + "grad_norm": 1.875, + "grad_norm_var": 0.00570068359375, + "learning_rate": 0.0001, + "loss": 3.8818, + "loss/crossentropy": 2.045943260192871, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19146864116191864, + "step": 24498 + }, + { + "epoch": 0.49, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004587554931640625, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 2.0596802830696106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18570110201835632, + "step": 24500 + }, + { + "epoch": 0.49004, + "grad_norm": 2.015625, + "grad_norm_var": 0.005020904541015625, + "learning_rate": 0.0001, + "loss": 4.1919, + "loss/crossentropy": 2.0923487544059753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211189448833466, + "step": 24502 + }, + { + "epoch": 0.49008, + "grad_norm": 1.7578125, + "grad_norm_var": 0.007564036051432291, + "learning_rate": 0.0001, + "loss": 3.8132, + "loss/crossentropy": 2.2522445917129517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19882844388484955, + "step": 24504 + }, + { + "epoch": 0.49012, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0066487630208333336, + "learning_rate": 0.0001, + "loss": 4.0853, + "loss/crossentropy": 1.9081300497055054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18056107312440872, + "step": 24506 + }, + { + "epoch": 0.49016, + "grad_norm": 2.015625, + "grad_norm_var": 0.0062978108723958336, + "learning_rate": 0.0001, + "loss": 4.0405, + "loss/crossentropy": 2.100565791130066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18689429759979248, + "step": 24508 + }, + { + "epoch": 0.4902, + "grad_norm": 2.015625, + "grad_norm_var": 0.007562001546223958, + "learning_rate": 0.0001, + "loss": 3.9123, + "loss/crossentropy": 1.9120057821273804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958208829164505, + "step": 24510 + }, + { + "epoch": 0.49024, + "grad_norm": 1.859375, + "grad_norm_var": 0.008888498942057291, + "learning_rate": 0.0001, + "loss": 3.9378, + "loss/crossentropy": 1.812345802783966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17729748785495758, + "step": 24512 + }, + { + "epoch": 0.49028, + "grad_norm": 2.015625, + "grad_norm_var": 0.009797922770182292, + "learning_rate": 0.0001, + "loss": 3.9652, + "loss/crossentropy": 1.9347732067108154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.207149900496006, + "step": 24514 + }, + { + "epoch": 0.49032, + "grad_norm": 1.734375, + "grad_norm_var": 0.011944325764973958, + "learning_rate": 0.0001, + "loss": 3.8159, + "loss/crossentropy": 1.8915638327598572, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16346803307533264, + "step": 24516 + }, + { + "epoch": 0.49036, + "grad_norm": 1.7265625, + "grad_norm_var": 0.011934407552083333, + "learning_rate": 0.0001, + "loss": 3.7906, + "loss/crossentropy": 2.2307119369506836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432727247476578, + "step": 24518 + }, + { + "epoch": 0.4904, + "grad_norm": 2.1875, + "grad_norm_var": 0.015240224202473958, + "learning_rate": 0.0001, + "loss": 4.012, + "loss/crossentropy": 2.002243995666504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2070493996143341, + "step": 24520 + }, + { + "epoch": 0.49044, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017210896809895834, + "learning_rate": 0.0001, + "loss": 3.8478, + "loss/crossentropy": 2.166561722755432, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18929924815893173, + "step": 24522 + }, + { + "epoch": 0.49048, + "grad_norm": 2.03125, + "grad_norm_var": 0.018619537353515625, + "learning_rate": 0.0001, + "loss": 4.1177, + "loss/crossentropy": 2.1195461750030518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19457821547985077, + "step": 24524 + }, + { + "epoch": 0.49052, + "grad_norm": 1.9921875, + "grad_norm_var": 0.017878977457682292, + "learning_rate": 0.0001, + "loss": 4.195, + "loss/crossentropy": 2.2752838134765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21283380687236786, + "step": 24526 + }, + { + "epoch": 0.49056, + "grad_norm": 1.984375, + "grad_norm_var": 0.015822092692057293, + "learning_rate": 0.0001, + "loss": 4.1894, + "loss/crossentropy": 2.362849473953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2148214504122734, + "step": 24528 + }, + { + "epoch": 0.4906, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01643651326497396, + "learning_rate": 0.0001, + "loss": 3.965, + "loss/crossentropy": 1.9928752779960632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204628124833107, + "step": 24530 + }, + { + "epoch": 0.49064, + "grad_norm": 1.90625, + "grad_norm_var": 0.012206013997395833, + "learning_rate": 0.0001, + "loss": 4.1465, + "loss/crossentropy": 2.2063074111938477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20204229652881622, + "step": 24532 + }, + { + "epoch": 0.49068, + "grad_norm": 2.015625, + "grad_norm_var": 0.008457183837890625, + "learning_rate": 0.0001, + "loss": 4.2816, + "loss/crossentropy": 2.1019493341445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18352027982473373, + "step": 24534 + }, + { + "epoch": 0.49072, + "grad_norm": 1.8125, + "grad_norm_var": 0.006949615478515625, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 1.9460110664367676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18813221156597137, + "step": 24536 + }, + { + "epoch": 0.49076, + "grad_norm": 1.8125, + "grad_norm_var": 0.006912994384765625, + "learning_rate": 0.0001, + "loss": 3.9377, + "loss/crossentropy": 1.8915529251098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198774054646492, + "step": 24538 + }, + { + "epoch": 0.4908, + "grad_norm": 2.078125, + "grad_norm_var": 0.0067860921223958336, + "learning_rate": 0.0001, + "loss": 4.2914, + "loss/crossentropy": 2.0600665807724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067568153142929, + "step": 24540 + }, + { + "epoch": 0.49084, + "grad_norm": 1.8046875, + "grad_norm_var": 0.007347615559895834, + "learning_rate": 0.0001, + "loss": 3.8895, + "loss/crossentropy": 2.3356127738952637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21175508201122284, + "step": 24542 + }, + { + "epoch": 0.49088, + "grad_norm": 1.890625, + "grad_norm_var": 0.006626129150390625, + "learning_rate": 0.0001, + "loss": 3.7529, + "loss/crossentropy": 1.7713094353675842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16650734096765518, + "step": 24544 + }, + { + "epoch": 0.49092, + "grad_norm": 2.03125, + "grad_norm_var": 0.006514231363932292, + "learning_rate": 0.0001, + "loss": 4.1357, + "loss/crossentropy": 2.307682991027832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20595747232437134, + "step": 24546 + }, + { + "epoch": 0.49096, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0071044921875, + "learning_rate": 0.0001, + "loss": 3.7078, + "loss/crossentropy": 2.0465540289878845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16928144544363022, + "step": 24548 + }, + { + "epoch": 0.491, + "grad_norm": 1.859375, + "grad_norm_var": 0.006422678629557292, + "learning_rate": 0.0001, + "loss": 3.7281, + "loss/crossentropy": 1.6479852795600891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1531681939959526, + "step": 24550 + }, + { + "epoch": 0.49104, + "grad_norm": 2.0625, + "grad_norm_var": 0.007600911458333333, + "learning_rate": 0.0001, + "loss": 3.9933, + "loss/crossentropy": 2.2218964099884033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1819760948419571, + "step": 24552 + }, + { + "epoch": 0.49108, + "grad_norm": 1.75, + "grad_norm_var": 0.009098307291666666, + "learning_rate": 0.0001, + "loss": 3.6829, + "loss/crossentropy": 1.7314581274986267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1850220039486885, + "step": 24554 + }, + { + "epoch": 0.49112, + "grad_norm": 1.953125, + "grad_norm_var": 0.007059733072916667, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.1683263778686523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18814556300640106, + "step": 24556 + }, + { + "epoch": 0.49116, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007098134358723958, + "learning_rate": 0.0001, + "loss": 4.2667, + "loss/crossentropy": 2.364906072616577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846372097730637, + "step": 24558 + }, + { + "epoch": 0.4912, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007710520426432292, + "learning_rate": 0.0001, + "loss": 4.1047, + "loss/crossentropy": 2.4396650791168213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21760626137256622, + "step": 24560 + }, + { + "epoch": 0.49124, + "grad_norm": 2.375, + "grad_norm_var": 0.059915924072265626, + "learning_rate": 0.0001, + "loss": 4.1424, + "loss/crossentropy": 2.2420458793640137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711492210626602, + "step": 24562 + }, + { + "epoch": 0.49128, + "grad_norm": 1.84375, + "grad_norm_var": 0.05835367838541667, + "learning_rate": 0.0001, + "loss": 3.8802, + "loss/crossentropy": 1.8962340354919434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20642152428627014, + "step": 24564 + }, + { + "epoch": 0.49132, + "grad_norm": 2.0, + "grad_norm_var": 0.05555597941080729, + "learning_rate": 0.0001, + "loss": 3.9291, + "loss/crossentropy": 2.024625241756439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574413985013962, + "step": 24566 + }, + { + "epoch": 0.49136, + "grad_norm": 2.03125, + "grad_norm_var": 0.05478515625, + "learning_rate": 0.0001, + "loss": 4.2441, + "loss/crossentropy": 2.2790380716323853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19655412435531616, + "step": 24568 + }, + { + "epoch": 0.4914, + "grad_norm": 1.96875, + "grad_norm_var": 0.04713923136393229, + "learning_rate": 0.0001, + "loss": 4.0206, + "loss/crossentropy": 2.1089435815811157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18229448795318604, + "step": 24570 + }, + { + "epoch": 0.49144, + "grad_norm": 2.1875, + "grad_norm_var": 0.04691162109375, + "learning_rate": 0.0001, + "loss": 4.2213, + "loss/crossentropy": 2.242396593093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1903126761317253, + "step": 24572 + }, + { + "epoch": 0.49148, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0496002197265625, + "learning_rate": 0.0001, + "loss": 3.9656, + "loss/crossentropy": 2.3436553478240967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20461376011371613, + "step": 24574 + }, + { + "epoch": 0.49152, + "grad_norm": 2.375, + "grad_norm_var": 0.05408910115559896, + "learning_rate": 0.0001, + "loss": 3.9294, + "loss/crossentropy": 2.020145893096924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20228110253810883, + "step": 24576 + }, + { + "epoch": 0.49156, + "grad_norm": 2.03125, + "grad_norm_var": 0.01663386027018229, + "learning_rate": 0.0001, + "loss": 4.1186, + "loss/crossentropy": 2.0371296405792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19508864730596542, + "step": 24578 + }, + { + "epoch": 0.4916, + "grad_norm": 2.03125, + "grad_norm_var": 0.014544423421223958, + "learning_rate": 0.0001, + "loss": 4.2877, + "loss/crossentropy": 2.078322470188141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19366976618766785, + "step": 24580 + }, + { + "epoch": 0.49164, + "grad_norm": 2.015625, + "grad_norm_var": 0.014349110921223958, + "learning_rate": 0.0001, + "loss": 4.222, + "loss/crossentropy": 2.3686896562576294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21069607138633728, + "step": 24582 + }, + { + "epoch": 0.49168, + "grad_norm": 1.828125, + "grad_norm_var": 0.01715876261393229, + "learning_rate": 0.0001, + "loss": 3.9882, + "loss/crossentropy": 2.0332735180854797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21546462178230286, + "step": 24584 + }, + { + "epoch": 0.49172, + "grad_norm": 1.984375, + "grad_norm_var": 0.01779759724934896, + "learning_rate": 0.0001, + "loss": 4.1297, + "loss/crossentropy": 2.0070220232009888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17601517587900162, + "step": 24586 + }, + { + "epoch": 0.49176, + "grad_norm": 1.96875, + "grad_norm_var": 0.014888254801432292, + "learning_rate": 0.0001, + "loss": 4.0979, + "loss/crossentropy": 2.1559245586395264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19782397896051407, + "step": 24588 + }, + { + "epoch": 0.4918, + "grad_norm": 1.859375, + "grad_norm_var": 0.0160308837890625, + "learning_rate": 0.0001, + "loss": 3.8771, + "loss/crossentropy": 2.173780679702759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20206089317798615, + "step": 24590 + }, + { + "epoch": 0.49184, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008215077718098958, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 1.609294056892395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16182265430688858, + "step": 24592 + }, + { + "epoch": 0.49188, + "grad_norm": 2.171875, + "grad_norm_var": 0.08147557576497395, + "learning_rate": 0.0001, + "loss": 4.2665, + "loss/crossentropy": 2.2137110233306885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27386993169784546, + "step": 24594 + }, + { + "epoch": 0.49192, + "grad_norm": 2.015625, + "grad_norm_var": 0.08635838826497395, + "learning_rate": 0.0001, + "loss": 3.7846, + "loss/crossentropy": 1.9553529024124146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18842827528715134, + "step": 24596 + }, + { + "epoch": 0.49196, + "grad_norm": 1.8203125, + "grad_norm_var": 0.08915176391601562, + "learning_rate": 0.0001, + "loss": 4.0381, + "loss/crossentropy": 1.8976882696151733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18534697592258453, + "step": 24598 + }, + { + "epoch": 0.492, + "grad_norm": 2.25, + "grad_norm_var": 0.09050064086914063, + "learning_rate": 0.0001, + "loss": 4.3902, + "loss/crossentropy": 1.9944769740104675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278682559728622, + "step": 24600 + }, + { + "epoch": 0.49204, + "grad_norm": 1.765625, + "grad_norm_var": 0.094580078125, + "learning_rate": 0.0001, + "loss": 3.8038, + "loss/crossentropy": 2.3396860361099243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19831668585538864, + "step": 24602 + }, + { + "epoch": 0.49208, + "grad_norm": 2.015625, + "grad_norm_var": 0.09804280598958333, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 1.8764755725860596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19239525496959686, + "step": 24604 + }, + { + "epoch": 0.49212, + "grad_norm": 2.015625, + "grad_norm_var": 0.0951080322265625, + "learning_rate": 0.0001, + "loss": 4.0428, + "loss/crossentropy": 1.9992756843566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18613643199205399, + "step": 24606 + }, + { + "epoch": 0.49216, + "grad_norm": 1.90625, + "grad_norm_var": 0.09273681640625, + "learning_rate": 0.0001, + "loss": 4.1538, + "loss/crossentropy": 2.0907077193260193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1898733228445053, + "step": 24608 + }, + { + "epoch": 0.4922, + "grad_norm": 1.953125, + "grad_norm_var": 0.017183430989583335, + "learning_rate": 0.0001, + "loss": 3.9356, + "loss/crossentropy": 1.9387467503547668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077641263604164, + "step": 24610 + }, + { + "epoch": 0.49224, + "grad_norm": 1.859375, + "grad_norm_var": 0.015561676025390625, + "learning_rate": 0.0001, + "loss": 3.6729, + "loss/crossentropy": 1.8329110741615295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17758768796920776, + "step": 24612 + }, + { + "epoch": 0.49228, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01461181640625, + "learning_rate": 0.0001, + "loss": 3.9334, + "loss/crossentropy": 1.8766810894012451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18651552498340607, + "step": 24614 + }, + { + "epoch": 0.49232, + "grad_norm": 1.828125, + "grad_norm_var": 0.009284464518229167, + "learning_rate": 0.0001, + "loss": 3.6679, + "loss/crossentropy": 1.7456438541412354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16719914227724075, + "step": 24616 + }, + { + "epoch": 0.49236, + "grad_norm": 1.890625, + "grad_norm_var": 0.007920074462890624, + "learning_rate": 0.0001, + "loss": 4.0292, + "loss/crossentropy": 2.0994815826416016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20065009593963623, + "step": 24618 + }, + { + "epoch": 0.4924, + "grad_norm": 1.90625, + "grad_norm_var": 0.007218170166015625, + "learning_rate": 0.0001, + "loss": 4.0588, + "loss/crossentropy": 2.3052616119384766, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2254004403948784, + "step": 24620 + }, + { + "epoch": 0.49244, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006516265869140625, + "learning_rate": 0.0001, + "loss": 3.9487, + "loss/crossentropy": 1.7560802102088928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17277851700782776, + "step": 24622 + }, + { + "epoch": 0.49248, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007142893473307292, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 1.862656593322754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19756756722927094, + "step": 24624 + }, + { + "epoch": 0.49252, + "grad_norm": 1.796875, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 3.9022, + "loss/crossentropy": 2.0374228954315186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19410685449838638, + "step": 24626 + }, + { + "epoch": 0.49256, + "grad_norm": 1.90625, + "grad_norm_var": 0.006315104166666667, + "learning_rate": 0.0001, + "loss": 3.9468, + "loss/crossentropy": 2.2796911001205444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21644171327352524, + "step": 24628 + }, + { + "epoch": 0.4926, + "grad_norm": 2.59375, + "grad_norm_var": 0.03423436482747396, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 1.9718595743179321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18669331073760986, + "step": 24630 + }, + { + "epoch": 0.49264, + "grad_norm": 1.9453125, + "grad_norm_var": 0.030427042643229166, + "learning_rate": 0.0001, + "loss": 4.3174, + "loss/crossentropy": 2.0071244835853577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994020640850067, + "step": 24632 + }, + { + "epoch": 0.49268, + "grad_norm": 1.953125, + "grad_norm_var": 0.029759724934895832, + "learning_rate": 0.0001, + "loss": 4.3168, + "loss/crossentropy": 2.2078527212142944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2190861478447914, + "step": 24634 + }, + { + "epoch": 0.49272, + "grad_norm": 2.015625, + "grad_norm_var": 0.0288482666015625, + "learning_rate": 0.0001, + "loss": 4.1319, + "loss/crossentropy": 2.374260663986206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121455818414688, + "step": 24636 + }, + { + "epoch": 0.49276, + "grad_norm": 2.046875, + "grad_norm_var": 0.02938232421875, + "learning_rate": 0.0001, + "loss": 4.0958, + "loss/crossentropy": 2.139094114303589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20233627408742905, + "step": 24638 + }, + { + "epoch": 0.4928, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0288970947265625, + "learning_rate": 0.0001, + "loss": 4.1157, + "loss/crossentropy": 1.9579105377197266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005467787384987, + "step": 24640 + }, + { + "epoch": 0.49284, + "grad_norm": 2.03125, + "grad_norm_var": 0.025199381510416667, + "learning_rate": 0.0001, + "loss": 4.3003, + "loss/crossentropy": 2.122314691543579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005690187215805, + "step": 24642 + }, + { + "epoch": 0.49288, + "grad_norm": 1.9921875, + "grad_norm_var": 0.024094390869140624, + "learning_rate": 0.0001, + "loss": 4.3163, + "loss/crossentropy": 1.9626818299293518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.196880042552948, + "step": 24644 + }, + { + "epoch": 0.49292, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0026446024576822916, + "learning_rate": 0.0001, + "loss": 4.1957, + "loss/crossentropy": 1.99526047706604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18154115974903107, + "step": 24646 + }, + { + "epoch": 0.49296, + "grad_norm": 2.015625, + "grad_norm_var": 0.0028310139973958333, + "learning_rate": 0.0001, + "loss": 4.1079, + "loss/crossentropy": 2.2288442850112915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19869591295719147, + "step": 24648 + }, + { + "epoch": 0.493, + "grad_norm": 1.9453125, + "grad_norm_var": 0.002756500244140625, + "learning_rate": 0.0001, + "loss": 4.0832, + "loss/crossentropy": 1.8555628061294556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18529972434043884, + "step": 24650 + }, + { + "epoch": 0.49304, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0029042561848958335, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 2.1587546467781067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936108022928238, + "step": 24652 + }, + { + "epoch": 0.49308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0025469462076822915, + "learning_rate": 0.0001, + "loss": 4.2219, + "loss/crossentropy": 1.8942645192146301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18769075721502304, + "step": 24654 + }, + { + "epoch": 0.49312, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0025266011555989585, + "learning_rate": 0.0001, + "loss": 4.0054, + "loss/crossentropy": 2.334362268447876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21441897749900818, + "step": 24656 + }, + { + "epoch": 0.49316, + "grad_norm": 2.25, + "grad_norm_var": 0.011063639322916667, + "learning_rate": 0.0001, + "loss": 3.945, + "loss/crossentropy": 1.9287384748458862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2295522838830948, + "step": 24658 + }, + { + "epoch": 0.4932, + "grad_norm": 2.015625, + "grad_norm_var": 0.012111155192057292, + "learning_rate": 0.0001, + "loss": 4.1852, + "loss/crossentropy": 2.212242364883423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2138296589255333, + "step": 24660 + }, + { + "epoch": 0.49324, + "grad_norm": 2.125, + "grad_norm_var": 0.014831288655598959, + "learning_rate": 0.0001, + "loss": 3.9479, + "loss/crossentropy": 2.17527437210083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837603896856308, + "step": 24662 + }, + { + "epoch": 0.49328, + "grad_norm": 1.875, + "grad_norm_var": 0.01660741170247396, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 2.1768887042999268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20551389455795288, + "step": 24664 + }, + { + "epoch": 0.49332, + "grad_norm": 1.984375, + "grad_norm_var": 0.016646321614583334, + "learning_rate": 0.0001, + "loss": 3.9299, + "loss/crossentropy": 1.987557053565979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1999402493238449, + "step": 24666 + }, + { + "epoch": 0.49336, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0241851806640625, + "learning_rate": 0.0001, + "loss": 3.5654, + "loss/crossentropy": 1.7189387083053589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16583546251058578, + "step": 24668 + }, + { + "epoch": 0.4934, + "grad_norm": 1.8515625, + "grad_norm_var": 0.026021321614583332, + "learning_rate": 0.0001, + "loss": 3.9805, + "loss/crossentropy": 2.0449089407920837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923594176769257, + "step": 24670 + }, + { + "epoch": 0.49344, + "grad_norm": 4.21875, + "grad_norm_var": 0.34633763631184894, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 1.922966718673706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19492915272712708, + "step": 24672 + }, + { + "epoch": 0.49348, + "grad_norm": 1.84375, + "grad_norm_var": 0.34245198567708335, + "learning_rate": 0.0001, + "loss": 4.1132, + "loss/crossentropy": 1.8697016835212708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420104801654816, + "step": 24674 + }, + { + "epoch": 0.49352, + "grad_norm": 2.0625, + "grad_norm_var": 0.34196370442708335, + "learning_rate": 0.0001, + "loss": 4.2348, + "loss/crossentropy": 2.2170732021331787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20673636347055435, + "step": 24676 + }, + { + "epoch": 0.49356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.34133707682291664, + "learning_rate": 0.0001, + "loss": 3.7056, + "loss/crossentropy": 2.0645321011543274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2113899141550064, + "step": 24678 + }, + { + "epoch": 0.4936, + "grad_norm": 1.859375, + "grad_norm_var": 0.3439776102701823, + "learning_rate": 0.0001, + "loss": 4.0543, + "loss/crossentropy": 2.0374088883399963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20146184414625168, + "step": 24680 + }, + { + "epoch": 0.49364, + "grad_norm": 1.9140625, + "grad_norm_var": 0.34632975260416665, + "learning_rate": 0.0001, + "loss": 4.0364, + "loss/crossentropy": 1.9996783137321472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19109730422496796, + "step": 24682 + }, + { + "epoch": 0.49368, + "grad_norm": 1.90625, + "grad_norm_var": 0.329974110921224, + "learning_rate": 0.0001, + "loss": 4.1988, + "loss/crossentropy": 2.2255775928497314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2078489512205124, + "step": 24684 + }, + { + "epoch": 0.49372, + "grad_norm": 1.859375, + "grad_norm_var": 0.3313385009765625, + "learning_rate": 0.0001, + "loss": 4.2153, + "loss/crossentropy": 1.9860047698020935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21092459559440613, + "step": 24686 + }, + { + "epoch": 0.49376, + "grad_norm": 1.984375, + "grad_norm_var": 0.020475260416666665, + "learning_rate": 0.0001, + "loss": 4.0701, + "loss/crossentropy": 1.9905331134796143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1982455775141716, + "step": 24688 + }, + { + "epoch": 0.4938, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012786610921223959, + "learning_rate": 0.0001, + "loss": 4.1729, + "loss/crossentropy": 2.254474639892578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20714619755744934, + "step": 24690 + }, + { + "epoch": 0.49384, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011376698811848959, + "learning_rate": 0.0001, + "loss": 3.8576, + "loss/crossentropy": 2.264926791191101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1946386992931366, + "step": 24692 + }, + { + "epoch": 0.49388, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01116943359375, + "learning_rate": 0.0001, + "loss": 3.847, + "loss/crossentropy": 2.071332335472107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2170526310801506, + "step": 24694 + }, + { + "epoch": 0.49392, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010746002197265625, + "learning_rate": 0.0001, + "loss": 3.8254, + "loss/crossentropy": 1.973964512348175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20519066601991653, + "step": 24696 + }, + { + "epoch": 0.49396, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011644490559895833, + "learning_rate": 0.0001, + "loss": 3.848, + "loss/crossentropy": 1.6099820137023926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15943622589111328, + "step": 24698 + }, + { + "epoch": 0.494, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010542551676432291, + "learning_rate": 0.0001, + "loss": 4.1036, + "loss/crossentropy": 2.1428415775299072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20248957723379135, + "step": 24700 + }, + { + "epoch": 0.49404, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003742472330729167, + "learning_rate": 0.0001, + "loss": 4.3402, + "loss/crossentropy": 2.3973593711853027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21721762418746948, + "step": 24702 + }, + { + "epoch": 0.49408, + "grad_norm": 2.046875, + "grad_norm_var": 0.004219563802083334, + "learning_rate": 0.0001, + "loss": 4.2233, + "loss/crossentropy": 2.184257686138153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20263968408107758, + "step": 24704 + }, + { + "epoch": 0.49412, + "grad_norm": 1.984375, + "grad_norm_var": 0.0042111714680989586, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 1.953734815120697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18011564016342163, + "step": 24706 + }, + { + "epoch": 0.49416, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00406494140625, + "learning_rate": 0.0001, + "loss": 4.0393, + "loss/crossentropy": 2.0036890506744385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21608620882034302, + "step": 24708 + }, + { + "epoch": 0.4942, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0047686258951822914, + "learning_rate": 0.0001, + "loss": 3.9851, + "loss/crossentropy": 2.016548752784729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18857601284980774, + "step": 24710 + }, + { + "epoch": 0.49424, + "grad_norm": 1.875, + "grad_norm_var": 0.0053179423014322914, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 2.0665934085845947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18890216201543808, + "step": 24712 + }, + { + "epoch": 0.49428, + "grad_norm": 1.8359375, + "grad_norm_var": 0.005216471354166667, + "learning_rate": 0.0001, + "loss": 4.2576, + "loss/crossentropy": 2.102554202079773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19079755991697311, + "step": 24714 + }, + { + "epoch": 0.49432, + "grad_norm": 1.8125, + "grad_norm_var": 0.007893625895182292, + "learning_rate": 0.0001, + "loss": 4.2018, + "loss/crossentropy": 2.326322555541992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21783028542995453, + "step": 24716 + }, + { + "epoch": 0.49436, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0073811848958333336, + "learning_rate": 0.0001, + "loss": 4.2362, + "loss/crossentropy": 2.2604973316192627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19831713289022446, + "step": 24718 + }, + { + "epoch": 0.4944, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00660400390625, + "learning_rate": 0.0001, + "loss": 3.859, + "loss/crossentropy": 2.2417763471603394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19713596254587173, + "step": 24720 + }, + { + "epoch": 0.49444, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009533437093098958, + "learning_rate": 0.0001, + "loss": 4.5059, + "loss/crossentropy": 2.21281498670578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067965790629387, + "step": 24722 + }, + { + "epoch": 0.49448, + "grad_norm": 1.921875, + "grad_norm_var": 0.009696451822916667, + "learning_rate": 0.0001, + "loss": 4.0386, + "loss/crossentropy": 2.288776397705078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19350187480449677, + "step": 24724 + }, + { + "epoch": 0.49452, + "grad_norm": 1.953125, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 1.8925097584724426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952025145292282, + "step": 24726 + }, + { + "epoch": 0.49456, + "grad_norm": 1.96875, + "grad_norm_var": 0.009287261962890625, + "learning_rate": 0.0001, + "loss": 3.8005, + "loss/crossentropy": 2.2274699211120605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19862639904022217, + "step": 24728 + }, + { + "epoch": 0.4946, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008671061197916666, + "learning_rate": 0.0001, + "loss": 3.9947, + "loss/crossentropy": 2.028487980365753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19470887631177902, + "step": 24730 + }, + { + "epoch": 0.49464, + "grad_norm": 1.75, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 4.2435, + "loss/crossentropy": 2.413878560066223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20918180793523788, + "step": 24732 + }, + { + "epoch": 0.49468, + "grad_norm": 1.9375, + "grad_norm_var": 0.007478841145833333, + "learning_rate": 0.0001, + "loss": 4.0814, + "loss/crossentropy": 2.2201125621795654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19350255280733109, + "step": 24734 + }, + { + "epoch": 0.49472, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006859334309895834, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 2.4426982402801514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20905796438455582, + "step": 24736 + }, + { + "epoch": 0.49476, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0044667561848958336, + "learning_rate": 0.0001, + "loss": 3.811, + "loss/crossentropy": 1.586302399635315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1673523262143135, + "step": 24738 + }, + { + "epoch": 0.4948, + "grad_norm": 2.203125, + "grad_norm_var": 0.009822591145833334, + "learning_rate": 0.0001, + "loss": 3.8759, + "loss/crossentropy": 2.0435025691986084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17974509298801422, + "step": 24740 + }, + { + "epoch": 0.49484, + "grad_norm": 2.015625, + "grad_norm_var": 0.009916178385416667, + "learning_rate": 0.0001, + "loss": 4.288, + "loss/crossentropy": 2.2075798511505127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002267688512802, + "step": 24742 + }, + { + "epoch": 0.49488, + "grad_norm": 1.84375, + "grad_norm_var": 0.011563873291015625, + "learning_rate": 0.0001, + "loss": 3.6645, + "loss/crossentropy": 1.8993717432022095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17360415309667587, + "step": 24744 + }, + { + "epoch": 0.49492, + "grad_norm": 1.7734375, + "grad_norm_var": 0.013024648030598959, + "learning_rate": 0.0001, + "loss": 3.7726, + "loss/crossentropy": 1.8296250700950623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821255162358284, + "step": 24746 + }, + { + "epoch": 0.49496, + "grad_norm": 2.125, + "grad_norm_var": 0.013911692301432292, + "learning_rate": 0.0001, + "loss": 3.9835, + "loss/crossentropy": 1.8268097043037415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1724962294101715, + "step": 24748 + }, + { + "epoch": 0.495, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014411417643229167, + "learning_rate": 0.0001, + "loss": 3.9835, + "loss/crossentropy": 1.9587016701698303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193455271422863, + "step": 24750 + }, + { + "epoch": 0.49504, + "grad_norm": 1.890625, + "grad_norm_var": 0.014811197916666666, + "learning_rate": 0.0001, + "loss": 4.0863, + "loss/crossentropy": 2.1292134523391724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238783955574036, + "step": 24752 + }, + { + "epoch": 0.49508, + "grad_norm": 2.03125, + "grad_norm_var": 0.01566136678059896, + "learning_rate": 0.0001, + "loss": 3.9682, + "loss/crossentropy": 2.0145905017852783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2276977151632309, + "step": 24754 + }, + { + "epoch": 0.49512, + "grad_norm": 2.015625, + "grad_norm_var": 0.011028798421223958, + "learning_rate": 0.0001, + "loss": 4.0566, + "loss/crossentropy": 1.9815504550933838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814296990633011, + "step": 24756 + }, + { + "epoch": 0.49516, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012532552083333334, + "learning_rate": 0.0001, + "loss": 3.7894, + "loss/crossentropy": 1.8275295495986938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16582081466913223, + "step": 24758 + }, + { + "epoch": 0.4952, + "grad_norm": 1.90625, + "grad_norm_var": 0.009859212239583333, + "learning_rate": 0.0001, + "loss": 3.9488, + "loss/crossentropy": 1.9509209990501404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19137832522392273, + "step": 24760 + }, + { + "epoch": 0.49524, + "grad_norm": 2.09375, + "grad_norm_var": 0.007433827718098958, + "learning_rate": 0.0001, + "loss": 4.405, + "loss/crossentropy": 2.255669593811035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20520294457674026, + "step": 24762 + }, + { + "epoch": 0.49528, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006624348958333333, + "learning_rate": 0.0001, + "loss": 3.9608, + "loss/crossentropy": 1.902245819568634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1721034198999405, + "step": 24764 + }, + { + "epoch": 0.49532, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007085927327473958, + "learning_rate": 0.0001, + "loss": 4.033, + "loss/crossentropy": 2.0003857016563416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1873132660984993, + "step": 24766 + }, + { + "epoch": 0.49536, + "grad_norm": 2.1875, + "grad_norm_var": 0.01024169921875, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 2.3280882835388184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21413114666938782, + "step": 24768 + }, + { + "epoch": 0.4954, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008882649739583333, + "learning_rate": 0.0001, + "loss": 4.0517, + "loss/crossentropy": 2.0396437644958496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18958552181720734, + "step": 24770 + }, + { + "epoch": 0.49544, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009012603759765625, + "learning_rate": 0.0001, + "loss": 3.9254, + "loss/crossentropy": 2.247615098953247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21501318365335464, + "step": 24772 + }, + { + "epoch": 0.49548, + "grad_norm": 1.953125, + "grad_norm_var": 0.008479817708333334, + "learning_rate": 0.0001, + "loss": 3.8623, + "loss/crossentropy": 2.0479390621185303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20071326941251755, + "step": 24774 + }, + { + "epoch": 0.49552, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008591461181640624, + "learning_rate": 0.0001, + "loss": 4.2585, + "loss/crossentropy": 2.129696846008301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128293514251709, + "step": 24776 + }, + { + "epoch": 0.49556, + "grad_norm": 2.046875, + "grad_norm_var": 0.008805338541666667, + "learning_rate": 0.0001, + "loss": 4.1479, + "loss/crossentropy": 2.0912004709243774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24846868962049484, + "step": 24778 + }, + { + "epoch": 0.4956, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008934529622395833, + "learning_rate": 0.0001, + "loss": 4.1595, + "loss/crossentropy": 2.310052990913391, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025884911417961, + "step": 24780 + }, + { + "epoch": 0.49564, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010886383056640626, + "learning_rate": 0.0001, + "loss": 4.2532, + "loss/crossentropy": 2.3318560123443604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21703048795461655, + "step": 24782 + }, + { + "epoch": 0.49568, + "grad_norm": 2.0, + "grad_norm_var": 0.009791819254557292, + "learning_rate": 0.0001, + "loss": 3.7083, + "loss/crossentropy": 1.9909458756446838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18664265424013138, + "step": 24784 + }, + { + "epoch": 0.49572, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009814198811848958, + "learning_rate": 0.0001, + "loss": 4.34, + "loss/crossentropy": 2.530202269554138, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007390528917313, + "step": 24786 + }, + { + "epoch": 0.49576, + "grad_norm": 1.953125, + "grad_norm_var": 0.009382120768229167, + "learning_rate": 0.0001, + "loss": 4.1865, + "loss/crossentropy": 2.286345958709717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20716369152069092, + "step": 24788 + }, + { + "epoch": 0.4958, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008768717447916666, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.105964183807373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934148371219635, + "step": 24790 + }, + { + "epoch": 0.49584, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009618123372395834, + "learning_rate": 0.0001, + "loss": 3.8057, + "loss/crossentropy": 1.8528069853782654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16931568086147308, + "step": 24792 + }, + { + "epoch": 0.49588, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008699544270833333, + "learning_rate": 0.0001, + "loss": 3.9729, + "loss/crossentropy": 1.8847370743751526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16615931689739227, + "step": 24794 + }, + { + "epoch": 0.49592, + "grad_norm": 2.046875, + "grad_norm_var": 0.009040323893229167, + "learning_rate": 0.0001, + "loss": 4.2483, + "loss/crossentropy": 2.159093201160431, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20933274924755096, + "step": 24796 + }, + { + "epoch": 0.49596, + "grad_norm": 2.109375, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 4.2635, + "loss/crossentropy": 2.1289132833480835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19792458415031433, + "step": 24798 + }, + { + "epoch": 0.496, + "grad_norm": 1.796875, + "grad_norm_var": 0.0071523030598958336, + "learning_rate": 0.0001, + "loss": 4.0548, + "loss/crossentropy": 2.0613357424736023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18525776267051697, + "step": 24800 + }, + { + "epoch": 0.49604, + "grad_norm": 1.96875, + "grad_norm_var": 0.0070953369140625, + "learning_rate": 0.0001, + "loss": 4.0807, + "loss/crossentropy": 1.7627623081207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16891766339540482, + "step": 24802 + }, + { + "epoch": 0.49608, + "grad_norm": 1.90625, + "grad_norm_var": 0.008446248372395833, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 1.77890944480896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24872711300849915, + "step": 24804 + }, + { + "epoch": 0.49612, + "grad_norm": 1.953125, + "grad_norm_var": 0.009291330973307291, + "learning_rate": 0.0001, + "loss": 4.0468, + "loss/crossentropy": 2.240827202796936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20014363527297974, + "step": 24806 + }, + { + "epoch": 0.49616, + "grad_norm": 1.90625, + "grad_norm_var": 0.009968058268229166, + "learning_rate": 0.0001, + "loss": 3.8452, + "loss/crossentropy": 1.7963212132453918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16260015219449997, + "step": 24808 + }, + { + "epoch": 0.4962, + "grad_norm": 2.125, + "grad_norm_var": 0.011681874593098959, + "learning_rate": 0.0001, + "loss": 4.0605, + "loss/crossentropy": 2.1720168590545654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18297959119081497, + "step": 24810 + }, + { + "epoch": 0.49624, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010849761962890624, + "learning_rate": 0.0001, + "loss": 3.8772, + "loss/crossentropy": 2.0715248584747314, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20910069346427917, + "step": 24812 + }, + { + "epoch": 0.49628, + "grad_norm": 2.0, + "grad_norm_var": 0.009203084309895833, + "learning_rate": 0.0001, + "loss": 3.8442, + "loss/crossentropy": 1.8676977157592773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19553124904632568, + "step": 24814 + }, + { + "epoch": 0.49632, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008809407552083334, + "learning_rate": 0.0001, + "loss": 4.1918, + "loss/crossentropy": 2.383392572402954, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20431752502918243, + "step": 24816 + }, + { + "epoch": 0.49636, + "grad_norm": 1.921875, + "grad_norm_var": 0.013744099934895834, + "learning_rate": 0.0001, + "loss": 4.1956, + "loss/crossentropy": 1.8039666414260864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17365024238824844, + "step": 24818 + }, + { + "epoch": 0.4964, + "grad_norm": 1.8515625, + "grad_norm_var": 0.012967936197916667, + "learning_rate": 0.0001, + "loss": 3.7321, + "loss/crossentropy": 1.803081214427948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188148595392704, + "step": 24820 + }, + { + "epoch": 0.49644, + "grad_norm": 1.7421875, + "grad_norm_var": 0.014137522379557291, + "learning_rate": 0.0001, + "loss": 3.8016, + "loss/crossentropy": 1.9158729910850525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17063472419977188, + "step": 24822 + }, + { + "epoch": 0.49648, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01593805948893229, + "learning_rate": 0.0001, + "loss": 3.8044, + "loss/crossentropy": 2.013060450553894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192637138068676, + "step": 24824 + }, + { + "epoch": 0.49652, + "grad_norm": 1.9609375, + "grad_norm_var": 0.015126291910807292, + "learning_rate": 0.0001, + "loss": 3.8618, + "loss/crossentropy": 2.2182289361953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003488838672638, + "step": 24826 + }, + { + "epoch": 0.49656, + "grad_norm": 1.8125, + "grad_norm_var": 0.0152008056640625, + "learning_rate": 0.0001, + "loss": 3.7418, + "loss/crossentropy": 1.9512917399406433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17393754422664642, + "step": 24828 + }, + { + "epoch": 0.4966, + "grad_norm": 2.140625, + "grad_norm_var": 0.017986806233723958, + "learning_rate": 0.0001, + "loss": 4.2311, + "loss/crossentropy": 2.3564621210098267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22488048672676086, + "step": 24830 + }, + { + "epoch": 0.49664, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0183502197265625, + "learning_rate": 0.0001, + "loss": 4.0941, + "loss/crossentropy": 2.3347015380859375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045806273818016, + "step": 24832 + }, + { + "epoch": 0.49668, + "grad_norm": 1.921875, + "grad_norm_var": 0.0120513916015625, + "learning_rate": 0.0001, + "loss": 4.0312, + "loss/crossentropy": 1.8157051801681519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16906943917274475, + "step": 24834 + }, + { + "epoch": 0.49672, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011903635660807292, + "learning_rate": 0.0001, + "loss": 4.1453, + "loss/crossentropy": 2.068196475505829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20066522806882858, + "step": 24836 + }, + { + "epoch": 0.49676, + "grad_norm": 1.875, + "grad_norm_var": 0.010198720296223958, + "learning_rate": 0.0001, + "loss": 3.9633, + "loss/crossentropy": 1.771648347377777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15898138284683228, + "step": 24838 + }, + { + "epoch": 0.4968, + "grad_norm": 2.03125, + "grad_norm_var": 0.008267974853515625, + "learning_rate": 0.0001, + "loss": 3.8167, + "loss/crossentropy": 2.206678628921509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22059056907892227, + "step": 24840 + }, + { + "epoch": 0.49684, + "grad_norm": 2.046875, + "grad_norm_var": 0.006502278645833333, + "learning_rate": 0.0001, + "loss": 4.0938, + "loss/crossentropy": 2.3368433713912964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20108889043331146, + "step": 24842 + }, + { + "epoch": 0.49688, + "grad_norm": 1.890625, + "grad_norm_var": 0.007039388020833333, + "learning_rate": 0.0001, + "loss": 3.6263, + "loss/crossentropy": 2.1412705183029175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19969124346971512, + "step": 24844 + }, + { + "epoch": 0.49692, + "grad_norm": 2.03125, + "grad_norm_var": 0.0048215230305989586, + "learning_rate": 0.0001, + "loss": 4.0444, + "loss/crossentropy": 1.9018915295600891, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17174704372882843, + "step": 24846 + }, + { + "epoch": 0.49696, + "grad_norm": 1.84375, + "grad_norm_var": 0.0062652587890625, + "learning_rate": 0.0001, + "loss": 4.3229, + "loss/crossentropy": 2.3042339086532593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182086527347565, + "step": 24848 + }, + { + "epoch": 0.497, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009948476155598959, + "learning_rate": 0.0001, + "loss": 3.899, + "loss/crossentropy": 2.0412787199020386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640880823135376, + "step": 24850 + }, + { + "epoch": 0.49704, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010065714518229166, + "learning_rate": 0.0001, + "loss": 3.76, + "loss/crossentropy": 1.9754884243011475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17998956143856049, + "step": 24852 + }, + { + "epoch": 0.49708, + "grad_norm": 2.015625, + "grad_norm_var": 0.010038248697916667, + "learning_rate": 0.0001, + "loss": 4.1763, + "loss/crossentropy": 2.060430645942688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21152494102716446, + "step": 24854 + }, + { + "epoch": 0.49712, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010375722249348959, + "learning_rate": 0.0001, + "loss": 3.7536, + "loss/crossentropy": 1.6876602172851562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1664302945137024, + "step": 24856 + }, + { + "epoch": 0.49716, + "grad_norm": 1.984375, + "grad_norm_var": 0.009663645426432292, + "learning_rate": 0.0001, + "loss": 3.9341, + "loss/crossentropy": 1.9389417171478271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19122107326984406, + "step": 24858 + }, + { + "epoch": 0.4972, + "grad_norm": 1.859375, + "grad_norm_var": 0.010752105712890625, + "learning_rate": 0.0001, + "loss": 3.905, + "loss/crossentropy": 2.069887936115265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20993375778198242, + "step": 24860 + }, + { + "epoch": 0.49724, + "grad_norm": 1.953125, + "grad_norm_var": 0.0125640869140625, + "learning_rate": 0.0001, + "loss": 4.2935, + "loss/crossentropy": 2.1173367500305176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19083526730537415, + "step": 24862 + }, + { + "epoch": 0.49728, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011138661702473959, + "learning_rate": 0.0001, + "loss": 4.2694, + "loss/crossentropy": 2.0459975004196167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20265580713748932, + "step": 24864 + }, + { + "epoch": 0.49732, + "grad_norm": 2.0625, + "grad_norm_var": 0.009471638997395834, + "learning_rate": 0.0001, + "loss": 3.9095, + "loss/crossentropy": 1.9144663214683533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18860550224781036, + "step": 24866 + }, + { + "epoch": 0.49736, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010762278238932292, + "learning_rate": 0.0001, + "loss": 3.8107, + "loss/crossentropy": 1.6619080901145935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17097505927085876, + "step": 24868 + }, + { + "epoch": 0.4974, + "grad_norm": 2.1875, + "grad_norm_var": 0.014192708333333333, + "learning_rate": 0.0001, + "loss": 4.2346, + "loss/crossentropy": 2.031949281692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21831347048282623, + "step": 24870 + }, + { + "epoch": 0.49744, + "grad_norm": 2.109375, + "grad_norm_var": 0.014869944254557291, + "learning_rate": 0.0001, + "loss": 4.098, + "loss/crossentropy": 2.330108165740967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21315570175647736, + "step": 24872 + }, + { + "epoch": 0.49748, + "grad_norm": 1.859375, + "grad_norm_var": 0.014876302083333333, + "learning_rate": 0.0001, + "loss": 3.8719, + "loss/crossentropy": 2.301952600479126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18241943418979645, + "step": 24874 + }, + { + "epoch": 0.49752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013634999593098959, + "learning_rate": 0.0001, + "loss": 4.3927, + "loss/crossentropy": 2.3587260246276855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19822461903095245, + "step": 24876 + }, + { + "epoch": 0.49756, + "grad_norm": 1.84375, + "grad_norm_var": 0.012729644775390625, + "learning_rate": 0.0001, + "loss": 3.7801, + "loss/crossentropy": 1.8675512671470642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19025489687919617, + "step": 24878 + }, + { + "epoch": 0.4976, + "grad_norm": 2.390625, + "grad_norm_var": 0.027367146809895833, + "learning_rate": 0.0001, + "loss": 3.8605, + "loss/crossentropy": 1.9007975459098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18459592014551163, + "step": 24880 + }, + { + "epoch": 0.49764, + "grad_norm": 2.03125, + "grad_norm_var": 0.026009114583333333, + "learning_rate": 0.0001, + "loss": 4.1207, + "loss/crossentropy": 1.9351030588150024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17839118093252182, + "step": 24882 + }, + { + "epoch": 0.49768, + "grad_norm": 1.890625, + "grad_norm_var": 0.02781956990559896, + "learning_rate": 0.0001, + "loss": 4.2804, + "loss/crossentropy": 2.2661246061325073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518450647592545, + "step": 24884 + }, + { + "epoch": 0.49772, + "grad_norm": 2.0625, + "grad_norm_var": 0.025333658854166666, + "learning_rate": 0.0001, + "loss": 4.2568, + "loss/crossentropy": 2.1555867791175842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20517798513174057, + "step": 24886 + }, + { + "epoch": 0.49776, + "grad_norm": 1.953125, + "grad_norm_var": 0.023067220052083334, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 2.1638144850730896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18970053642988205, + "step": 24888 + }, + { + "epoch": 0.4978, + "grad_norm": 2.015625, + "grad_norm_var": 0.023280588785807292, + "learning_rate": 0.0001, + "loss": 3.9658, + "loss/crossentropy": 2.230544328689575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20187361538410187, + "step": 24890 + }, + { + "epoch": 0.49784, + "grad_norm": 1.9296875, + "grad_norm_var": 0.02308349609375, + "learning_rate": 0.0001, + "loss": 4.1406, + "loss/crossentropy": 2.1005473136901855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20453108847141266, + "step": 24892 + }, + { + "epoch": 0.49788, + "grad_norm": 1.953125, + "grad_norm_var": 0.0233642578125, + "learning_rate": 0.0001, + "loss": 3.898, + "loss/crossentropy": 2.056541621685028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18511568009853363, + "step": 24894 + }, + { + "epoch": 0.49792, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011407216389973959, + "learning_rate": 0.0001, + "loss": 4.2019, + "loss/crossentropy": 2.151825726032257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19390379637479782, + "step": 24896 + }, + { + "epoch": 0.49796, + "grad_norm": 2.015625, + "grad_norm_var": 0.011726633707682291, + "learning_rate": 0.0001, + "loss": 4.159, + "loss/crossentropy": 2.2062729597091675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21959953010082245, + "step": 24898 + }, + { + "epoch": 0.498, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007835896809895833, + "learning_rate": 0.0001, + "loss": 4.0506, + "loss/crossentropy": 2.11991286277771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495782047510147, + "step": 24900 + }, + { + "epoch": 0.49804, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007785797119140625, + "learning_rate": 0.0001, + "loss": 3.8555, + "loss/crossentropy": 1.9586367011070251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644800782203674, + "step": 24902 + }, + { + "epoch": 0.49808, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008665974934895833, + "learning_rate": 0.0001, + "loss": 3.987, + "loss/crossentropy": 2.030204951763153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938316822052002, + "step": 24904 + }, + { + "epoch": 0.49812, + "grad_norm": 1.953125, + "grad_norm_var": 0.00999755859375, + "learning_rate": 0.0001, + "loss": 4.2644, + "loss/crossentropy": 2.063572645187378, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19029874354600906, + "step": 24906 + }, + { + "epoch": 0.49816, + "grad_norm": 1.921875, + "grad_norm_var": 0.010081990559895834, + "learning_rate": 0.0001, + "loss": 3.8608, + "loss/crossentropy": 1.8356643319129944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18317610025405884, + "step": 24908 + }, + { + "epoch": 0.4982, + "grad_norm": 2.0625, + "grad_norm_var": 0.08252665201822916, + "learning_rate": 0.0001, + "loss": 4.1617, + "loss/crossentropy": 1.8856282830238342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754850372672081, + "step": 24910 + }, + { + "epoch": 0.49824, + "grad_norm": 1.9296875, + "grad_norm_var": 0.08241780598958333, + "learning_rate": 0.0001, + "loss": 3.9649, + "loss/crossentropy": 1.8674440383911133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805490255355835, + "step": 24912 + }, + { + "epoch": 0.49828, + "grad_norm": 1.84375, + "grad_norm_var": 0.08478978474934896, + "learning_rate": 0.0001, + "loss": 3.839, + "loss/crossentropy": 1.8774593472480774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18495067209005356, + "step": 24914 + }, + { + "epoch": 0.49832, + "grad_norm": 1.71875, + "grad_norm_var": 0.08945210774739583, + "learning_rate": 0.0001, + "loss": 3.9121, + "loss/crossentropy": 2.059852659702301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19449050724506378, + "step": 24916 + }, + { + "epoch": 0.49836, + "grad_norm": 1.9765625, + "grad_norm_var": 0.08824869791666666, + "learning_rate": 0.0001, + "loss": 4.2468, + "loss/crossentropy": 2.3840794563293457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23490352928638458, + "step": 24918 + }, + { + "epoch": 0.4984, + "grad_norm": 1.8671875, + "grad_norm_var": 0.08859049479166667, + "learning_rate": 0.0001, + "loss": 4.1537, + "loss/crossentropy": 2.136437773704529, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19557444751262665, + "step": 24920 + }, + { + "epoch": 0.49844, + "grad_norm": 2.140625, + "grad_norm_var": 0.08821614583333333, + "learning_rate": 0.0001, + "loss": 4.3658, + "loss/crossentropy": 2.4357622861862183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21140636503696442, + "step": 24922 + }, + { + "epoch": 0.49848, + "grad_norm": 1.859375, + "grad_norm_var": 0.08875223795572916, + "learning_rate": 0.0001, + "loss": 3.9085, + "loss/crossentropy": 2.1498345136642456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18728043884038925, + "step": 24924 + }, + { + "epoch": 0.49852, + "grad_norm": 1.890625, + "grad_norm_var": 0.012580362955729167, + "learning_rate": 0.0001, + "loss": 4.0771, + "loss/crossentropy": 2.131583571434021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21287915110588074, + "step": 24926 + }, + { + "epoch": 0.49856, + "grad_norm": 1.9375, + "grad_norm_var": 0.012505849202473959, + "learning_rate": 0.0001, + "loss": 4.1492, + "loss/crossentropy": 2.434916377067566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21438215672969818, + "step": 24928 + }, + { + "epoch": 0.4986, + "grad_norm": 1.6875, + "grad_norm_var": 0.016397857666015626, + "learning_rate": 0.0001, + "loss": 3.8951, + "loss/crossentropy": 1.9909663796424866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16714239120483398, + "step": 24930 + }, + { + "epoch": 0.49864, + "grad_norm": 1.859375, + "grad_norm_var": 0.01343994140625, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 1.9362955689430237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18765835464000702, + "step": 24932 + }, + { + "epoch": 0.49868, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011527252197265626, + "learning_rate": 0.0001, + "loss": 3.888, + "loss/crossentropy": 1.909186065196991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19517959654331207, + "step": 24934 + }, + { + "epoch": 0.49872, + "grad_norm": 1.7890625, + "grad_norm_var": 0.010350545247395834, + "learning_rate": 0.0001, + "loss": 4.0523, + "loss/crossentropy": 1.9746766686439514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19376099109649658, + "step": 24936 + }, + { + "epoch": 0.49876, + "grad_norm": 2.0, + "grad_norm_var": 0.006844075520833334, + "learning_rate": 0.0001, + "loss": 4.0004, + "loss/crossentropy": 2.0698294639587402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993079036474228, + "step": 24938 + }, + { + "epoch": 0.4988, + "grad_norm": 1.921875, + "grad_norm_var": 0.007453409830729166, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 1.8996255993843079, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822480708360672, + "step": 24940 + }, + { + "epoch": 0.49884, + "grad_norm": 2.125, + "grad_norm_var": 0.0114898681640625, + "learning_rate": 0.0001, + "loss": 4.4206, + "loss/crossentropy": 1.9174728989601135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18220385164022446, + "step": 24942 + }, + { + "epoch": 0.49888, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0126953125, + "learning_rate": 0.0001, + "loss": 3.6162, + "loss/crossentropy": 1.827277660369873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17845244705677032, + "step": 24944 + }, + { + "epoch": 0.49892, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009264882405598958, + "learning_rate": 0.0001, + "loss": 4.1165, + "loss/crossentropy": 2.0837597846984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19145962595939636, + "step": 24946 + }, + { + "epoch": 0.49896, + "grad_norm": 1.90625, + "grad_norm_var": 0.008915201822916666, + "learning_rate": 0.0001, + "loss": 3.969, + "loss/crossentropy": 2.417944073677063, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22484175115823746, + "step": 24948 + }, + { + "epoch": 0.499, + "grad_norm": 2.03125, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 4.0408, + "loss/crossentropy": 2.338744640350342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132393717765808, + "step": 24950 + }, + { + "epoch": 0.49904, + "grad_norm": 2.09375, + "grad_norm_var": 0.009559885660807291, + "learning_rate": 0.0001, + "loss": 3.9549, + "loss/crossentropy": 2.1383684277534485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19005178660154343, + "step": 24952 + }, + { + "epoch": 0.49908, + "grad_norm": 1.859375, + "grad_norm_var": 0.010731760660807292, + "learning_rate": 0.0001, + "loss": 4.1188, + "loss/crossentropy": 1.8223644495010376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17134226113557816, + "step": 24954 + }, + { + "epoch": 0.49912, + "grad_norm": 2.265625, + "grad_norm_var": 0.017228190104166666, + "learning_rate": 0.0001, + "loss": 3.9886, + "loss/crossentropy": 2.09305202960968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19109848141670227, + "step": 24956 + }, + { + "epoch": 0.49916, + "grad_norm": 2.03125, + "grad_norm_var": 0.017844390869140626, + "learning_rate": 0.0001, + "loss": 3.7533, + "loss/crossentropy": 1.9903115034103394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18018385022878647, + "step": 24958 + }, + { + "epoch": 0.4992, + "grad_norm": 2.0, + "grad_norm_var": 0.02005182902018229, + "learning_rate": 0.0001, + "loss": 4.0391, + "loss/crossentropy": 1.9048819541931152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18057531118392944, + "step": 24960 + }, + { + "epoch": 0.49924, + "grad_norm": 1.9609375, + "grad_norm_var": 0.019505818684895832, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.045142412185669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18374620378017426, + "step": 24962 + }, + { + "epoch": 0.49928, + "grad_norm": 1.9140625, + "grad_norm_var": 0.019798787434895833, + "learning_rate": 0.0001, + "loss": 4.0365, + "loss/crossentropy": 2.094703733921051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21248038858175278, + "step": 24964 + }, + { + "epoch": 0.49932, + "grad_norm": 1.8671875, + "grad_norm_var": 0.020359039306640625, + "learning_rate": 0.0001, + "loss": 3.7111, + "loss/crossentropy": 2.113132894039154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19428254663944244, + "step": 24966 + }, + { + "epoch": 0.49936, + "grad_norm": 2.015625, + "grad_norm_var": 0.019123331705729166, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 2.0182060599327087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22048161923885345, + "step": 24968 + }, + { + "epoch": 0.4994, + "grad_norm": 1.84375, + "grad_norm_var": 0.01933568318684896, + "learning_rate": 0.0001, + "loss": 4.0755, + "loss/crossentropy": 2.141752541065216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.198372982442379, + "step": 24970 + }, + { + "epoch": 0.49944, + "grad_norm": 2.03125, + "grad_norm_var": 0.012737782796223958, + "learning_rate": 0.0001, + "loss": 4.0035, + "loss/crossentropy": 1.9773234724998474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18223125487565994, + "step": 24972 + }, + { + "epoch": 0.49948, + "grad_norm": 2.015625, + "grad_norm_var": 0.009464518229166666, + "learning_rate": 0.0001, + "loss": 3.9405, + "loss/crossentropy": 1.7909184098243713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17887402325868607, + "step": 24974 + }, + { + "epoch": 0.49952, + "grad_norm": 1.921875, + "grad_norm_var": 0.0048411051432291664, + "learning_rate": 0.0001, + "loss": 4.0024, + "loss/crossentropy": 1.9185696840286255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19022425264120102, + "step": 24976 + }, + { + "epoch": 0.49956, + "grad_norm": 2.03125, + "grad_norm_var": 0.005472819010416667, + "learning_rate": 0.0001, + "loss": 4.2593, + "loss/crossentropy": 1.7053416967391968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18504460155963898, + "step": 24978 + }, + { + "epoch": 0.4996, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006156158447265625, + "learning_rate": 0.0001, + "loss": 4.2626, + "loss/crossentropy": 2.1260892152786255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958574280142784, + "step": 24980 + }, + { + "epoch": 0.49964, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005537923177083333, + "learning_rate": 0.0001, + "loss": 3.9854, + "loss/crossentropy": 1.9965519905090332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814893394708633, + "step": 24982 + }, + { + "epoch": 0.49968, + "grad_norm": 1.8671875, + "grad_norm_var": 0.3696734110514323, + "learning_rate": 0.0001, + "loss": 4.1443, + "loss/crossentropy": 1.882089376449585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23063261061906815, + "step": 24984 + }, + { + "epoch": 0.49972, + "grad_norm": 2.03125, + "grad_norm_var": 0.36444498697916666, + "learning_rate": 0.0001, + "loss": 3.9484, + "loss/crossentropy": 2.0816534757614136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818612888455391, + "step": 24986 + }, + { + "epoch": 0.49976, + "grad_norm": 1.875, + "grad_norm_var": 0.3668365478515625, + "learning_rate": 0.0001, + "loss": 3.794, + "loss/crossentropy": 1.7514638304710388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18518543243408203, + "step": 24988 + }, + { + "epoch": 0.4998, + "grad_norm": 1.96875, + "grad_norm_var": 0.37027969360351565, + "learning_rate": 0.0001, + "loss": 4.095, + "loss/crossentropy": 2.438727021217346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2042185142636299, + "step": 24990 + }, + { + "epoch": 0.49984, + "grad_norm": 2.1875, + "grad_norm_var": 0.3688087463378906, + "learning_rate": 0.0001, + "loss": 4.1072, + "loss/crossentropy": 2.0413439869880676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19952382892370224, + "step": 24992 + }, + { + "epoch": 0.49988, + "grad_norm": 1.859375, + "grad_norm_var": 0.3713612874348958, + "learning_rate": 0.0001, + "loss": 4.1332, + "loss/crossentropy": 2.1857372522354126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18842580169439316, + "step": 24994 + }, + { + "epoch": 0.49992, + "grad_norm": 1.96875, + "grad_norm_var": 0.36927261352539065, + "learning_rate": 0.0001, + "loss": 4.1855, + "loss/crossentropy": 1.9340556859970093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24346761405467987, + "step": 24996 + }, + { + "epoch": 0.49996, + "grad_norm": 1.859375, + "grad_norm_var": 0.3735389709472656, + "learning_rate": 0.0001, + "loss": 3.9177, + "loss/crossentropy": 1.9736055731773376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20419831573963165, + "step": 24998 + }, + { + "epoch": 0.5, + "grad_norm": 1.96875, + "grad_norm_var": 0.0122314453125, + "learning_rate": 0.0001, + "loss": 4.0167, + "loss/crossentropy": 1.8731598258018494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20324288308620453, + "step": 25000 + }, + { + "epoch": 0.50004, + "grad_norm": 2.0, + "grad_norm_var": 0.012645467122395834, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 1.9114505648612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19572290033102036, + "step": 25002 + }, + { + "epoch": 0.50008, + "grad_norm": 2.1875, + "grad_norm_var": 0.016022745768229166, + "learning_rate": 0.0001, + "loss": 4.0389, + "loss/crossentropy": 2.0608550310134888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20759478956460953, + "step": 25004 + }, + { + "epoch": 0.50012, + "grad_norm": 1.8828125, + "grad_norm_var": 0.015529123942057292, + "learning_rate": 0.0001, + "loss": 4.0434, + "loss/crossentropy": 2.0759811401367188, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895374357700348, + "step": 25006 + }, + { + "epoch": 0.50016, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012859853108723958, + "learning_rate": 0.0001, + "loss": 3.7772, + "loss/crossentropy": 2.139121353626251, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19550149887800217, + "step": 25008 + }, + { + "epoch": 0.5002, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013410441080729167, + "learning_rate": 0.0001, + "loss": 3.9163, + "loss/crossentropy": 1.8311315774917603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17756005376577377, + "step": 25010 + }, + { + "epoch": 0.50024, + "grad_norm": 2.03125, + "grad_norm_var": 0.012311808268229167, + "learning_rate": 0.0001, + "loss": 3.9897, + "loss/crossentropy": 2.0824968814849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21674959361553192, + "step": 25012 + }, + { + "epoch": 0.50028, + "grad_norm": 2.0, + "grad_norm_var": 0.018317667643229167, + "learning_rate": 0.0001, + "loss": 4.4713, + "loss/crossentropy": 2.3479214906692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22297757118940353, + "step": 25014 + }, + { + "epoch": 0.50032, + "grad_norm": 2.03125, + "grad_norm_var": 0.017682902018229165, + "learning_rate": 0.0001, + "loss": 4.2473, + "loss/crossentropy": 2.477561354637146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2347298264503479, + "step": 25016 + }, + { + "epoch": 0.50036, + "grad_norm": 1.875, + "grad_norm_var": 0.020369466145833334, + "learning_rate": 0.0001, + "loss": 3.797, + "loss/crossentropy": 1.7736787796020508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1656038835644722, + "step": 25018 + }, + { + "epoch": 0.5004, + "grad_norm": 1.90625, + "grad_norm_var": 0.017130533854166668, + "learning_rate": 0.0001, + "loss": 4.1537, + "loss/crossentropy": 2.2568663358688354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19755709171295166, + "step": 25020 + }, + { + "epoch": 0.50044, + "grad_norm": 2.078125, + "grad_norm_var": 0.018668619791666667, + "learning_rate": 0.0001, + "loss": 3.9279, + "loss/crossentropy": 1.961386263370514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079472467303276, + "step": 25022 + }, + { + "epoch": 0.50048, + "grad_norm": 1.78125, + "grad_norm_var": 0.01898981730143229, + "learning_rate": 0.0001, + "loss": 3.7884, + "loss/crossentropy": 1.75395268201828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16586285829544067, + "step": 25024 + }, + { + "epoch": 0.50052, + "grad_norm": 1.90625, + "grad_norm_var": 0.0171142578125, + "learning_rate": 0.0001, + "loss": 3.9074, + "loss/crossentropy": 2.1727951169013977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21714362502098083, + "step": 25026 + }, + { + "epoch": 0.50056, + "grad_norm": 1.953125, + "grad_norm_var": 0.017765299479166666, + "learning_rate": 0.0001, + "loss": 4.276, + "loss/crossentropy": 2.147822380065918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19961202144622803, + "step": 25028 + }, + { + "epoch": 0.5006, + "grad_norm": 2.078125, + "grad_norm_var": 0.012775675455729166, + "learning_rate": 0.0001, + "loss": 4.1055, + "loss/crossentropy": 2.0300870537757874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19732433557510376, + "step": 25030 + }, + { + "epoch": 0.50064, + "grad_norm": 1.875, + "grad_norm_var": 0.019828287760416667, + "learning_rate": 0.0001, + "loss": 4.0727, + "loss/crossentropy": 1.840386688709259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18403710424900055, + "step": 25032 + }, + { + "epoch": 0.50068, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01875, + "learning_rate": 0.0001, + "loss": 3.8442, + "loss/crossentropy": 1.9718754291534424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19007521867752075, + "step": 25034 + }, + { + "epoch": 0.50072, + "grad_norm": 1.8984375, + "grad_norm_var": 0.022468058268229167, + "learning_rate": 0.0001, + "loss": 3.8137, + "loss/crossentropy": 1.9550666809082031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944015771150589, + "step": 25036 + }, + { + "epoch": 0.50076, + "grad_norm": 1.75, + "grad_norm_var": 0.024397786458333334, + "learning_rate": 0.0001, + "loss": 3.7467, + "loss/crossentropy": 1.9608187079429626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774805188179016, + "step": 25038 + }, + { + "epoch": 0.5008, + "grad_norm": 1.96875, + "grad_norm_var": 0.0225494384765625, + "learning_rate": 0.0001, + "loss": 3.7607, + "loss/crossentropy": 1.6980991959571838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16579563170671463, + "step": 25040 + }, + { + "epoch": 0.50084, + "grad_norm": 1.890625, + "grad_norm_var": 0.02269261678059896, + "learning_rate": 0.0001, + "loss": 3.9008, + "loss/crossentropy": 1.8796940445899963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857006549835205, + "step": 25042 + }, + { + "epoch": 0.50088, + "grad_norm": 1.953125, + "grad_norm_var": 0.02159398396809896, + "learning_rate": 0.0001, + "loss": 3.7862, + "loss/crossentropy": 1.8667701482772827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920718252658844, + "step": 25044 + }, + { + "epoch": 0.50092, + "grad_norm": 1.9140625, + "grad_norm_var": 0.019676717122395833, + "learning_rate": 0.0001, + "loss": 4.11, + "loss/crossentropy": 2.142220616340637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20785054564476013, + "step": 25046 + }, + { + "epoch": 0.50096, + "grad_norm": 1.953125, + "grad_norm_var": 0.011185709635416667, + "learning_rate": 0.0001, + "loss": 4.1763, + "loss/crossentropy": 2.1965246200561523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2001591995358467, + "step": 25048 + }, + { + "epoch": 0.501, + "grad_norm": 2.09375, + "grad_norm_var": 0.01103515625, + "learning_rate": 0.0001, + "loss": 3.9254, + "loss/crossentropy": 2.1812084913253784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21115607023239136, + "step": 25050 + }, + { + "epoch": 0.50104, + "grad_norm": 1.921875, + "grad_norm_var": 0.00670166015625, + "learning_rate": 0.0001, + "loss": 4.1107, + "loss/crossentropy": 1.8218246698379517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20282388478517532, + "step": 25052 + }, + { + "epoch": 0.50108, + "grad_norm": 2.015625, + "grad_norm_var": 0.0037750244140625, + "learning_rate": 0.0001, + "loss": 3.9292, + "loss/crossentropy": 2.2070316076278687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21164244413375854, + "step": 25054 + }, + { + "epoch": 0.50112, + "grad_norm": 2.09375, + "grad_norm_var": 0.00509033203125, + "learning_rate": 0.0001, + "loss": 4.0604, + "loss/crossentropy": 2.1380701065063477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20642387121915817, + "step": 25056 + }, + { + "epoch": 0.50116, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004671223958333333, + "learning_rate": 0.0001, + "loss": 4.188, + "loss/crossentropy": 2.2019251585006714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20543986558914185, + "step": 25058 + }, + { + "epoch": 0.5012, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006583658854166666, + "learning_rate": 0.0001, + "loss": 3.9042, + "loss/crossentropy": 2.0563461780548096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18677344918251038, + "step": 25060 + }, + { + "epoch": 0.50124, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 4.0295, + "loss/crossentropy": 1.788195252418518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18303587287664413, + "step": 25062 + }, + { + "epoch": 0.50128, + "grad_norm": 2.0, + "grad_norm_var": 0.0079254150390625, + "learning_rate": 0.0001, + "loss": 3.7124, + "loss/crossentropy": 1.888840913772583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736098676919937, + "step": 25064 + }, + { + "epoch": 0.50132, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006139882405598958, + "learning_rate": 0.0001, + "loss": 4.0185, + "loss/crossentropy": 2.0963358283042908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944149062037468, + "step": 25066 + }, + { + "epoch": 0.50136, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006322224934895833, + "learning_rate": 0.0001, + "loss": 4.0357, + "loss/crossentropy": 2.3116443157196045, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22342295944690704, + "step": 25068 + }, + { + "epoch": 0.5014, + "grad_norm": 2.03125, + "grad_norm_var": 0.006266276041666667, + "learning_rate": 0.0001, + "loss": 4.2353, + "loss/crossentropy": 2.251197099685669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25324299186468124, + "step": 25070 + }, + { + "epoch": 0.50144, + "grad_norm": 1.890625, + "grad_norm_var": 0.004613240559895833, + "learning_rate": 0.0001, + "loss": 3.7142, + "loss/crossentropy": 2.2114810943603516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18877273797988892, + "step": 25072 + }, + { + "epoch": 0.50148, + "grad_norm": 1.9921875, + "grad_norm_var": 0.004923248291015625, + "learning_rate": 0.0001, + "loss": 4.1174, + "loss/crossentropy": 2.266746401786804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522346138954163, + "step": 25074 + }, + { + "epoch": 0.50152, + "grad_norm": 1.84375, + "grad_norm_var": 0.004809315999348958, + "learning_rate": 0.0001, + "loss": 4.2369, + "loss/crossentropy": 1.922059416770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906102538108826, + "step": 25076 + }, + { + "epoch": 0.50156, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005891672770182292, + "learning_rate": 0.0001, + "loss": 4.0695, + "loss/crossentropy": 2.077875077724457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18431179225444794, + "step": 25078 + }, + { + "epoch": 0.5016, + "grad_norm": 1.96875, + "grad_norm_var": 0.006135050455729167, + "learning_rate": 0.0001, + "loss": 3.7519, + "loss/crossentropy": 1.8861806988716125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17609698325395584, + "step": 25080 + }, + { + "epoch": 0.50164, + "grad_norm": 2.078125, + "grad_norm_var": 0.008302561442057292, + "learning_rate": 0.0001, + "loss": 4.0371, + "loss/crossentropy": 2.1382817029953003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21308781206607819, + "step": 25082 + }, + { + "epoch": 0.50168, + "grad_norm": 2.015625, + "grad_norm_var": 0.008656565348307292, + "learning_rate": 0.0001, + "loss": 3.9133, + "loss/crossentropy": 1.9955537915229797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20031572878360748, + "step": 25084 + }, + { + "epoch": 0.50172, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0108551025390625, + "learning_rate": 0.0001, + "loss": 4.0816, + "loss/crossentropy": 2.2564213275909424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19504930824041367, + "step": 25086 + }, + { + "epoch": 0.50176, + "grad_norm": 2.203125, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.2432, + "loss/crossentropy": 2.388614296913147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021147906780243, + "step": 25088 + }, + { + "epoch": 0.5018, + "grad_norm": 2.015625, + "grad_norm_var": 0.016764322916666668, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 2.5410468578338623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23654652386903763, + "step": 25090 + }, + { + "epoch": 0.50184, + "grad_norm": 1.90625, + "grad_norm_var": 0.016255696614583332, + "learning_rate": 0.0001, + "loss": 4.0248, + "loss/crossentropy": 1.9850627183914185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1861669197678566, + "step": 25092 + }, + { + "epoch": 0.50188, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01573486328125, + "learning_rate": 0.0001, + "loss": 4.2318, + "loss/crossentropy": 2.3069719076156616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22062674164772034, + "step": 25094 + }, + { + "epoch": 0.50192, + "grad_norm": 1.875, + "grad_norm_var": 0.013960520426432291, + "learning_rate": 0.0001, + "loss": 3.9187, + "loss/crossentropy": 1.7462975978851318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17929460108280182, + "step": 25096 + }, + { + "epoch": 0.50196, + "grad_norm": 1.84375, + "grad_norm_var": 0.012870025634765626, + "learning_rate": 0.0001, + "loss": 4.0647, + "loss/crossentropy": 1.8277402520179749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17240776866674423, + "step": 25098 + }, + { + "epoch": 0.502, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013978830973307292, + "learning_rate": 0.0001, + "loss": 4.0331, + "loss/crossentropy": 2.2530910968780518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20934444665908813, + "step": 25100 + }, + { + "epoch": 0.50204, + "grad_norm": 1.9375, + "grad_norm_var": 0.0118316650390625, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 1.8995028138160706, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18956587463617325, + "step": 25102 + }, + { + "epoch": 0.50208, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0044553120930989586, + "learning_rate": 0.0001, + "loss": 3.8491, + "loss/crossentropy": 1.9500588774681091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18196535110473633, + "step": 25104 + }, + { + "epoch": 0.50212, + "grad_norm": 1.875, + "grad_norm_var": 0.0037750244140625, + "learning_rate": 0.0001, + "loss": 3.8702, + "loss/crossentropy": 1.794252336025238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17875805497169495, + "step": 25106 + }, + { + "epoch": 0.50216, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004124959309895833, + "learning_rate": 0.0001, + "loss": 3.9752, + "loss/crossentropy": 2.282618999481201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18928487598896027, + "step": 25108 + }, + { + "epoch": 0.5022, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0036173502604166666, + "learning_rate": 0.0001, + "loss": 4.1439, + "loss/crossentropy": 1.8668266534805298, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754981055855751, + "step": 25110 + }, + { + "epoch": 0.50224, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0038859049479166665, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 2.416601300239563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21391690522432327, + "step": 25112 + }, + { + "epoch": 0.50228, + "grad_norm": 1.921875, + "grad_norm_var": 0.003311920166015625, + "learning_rate": 0.0001, + "loss": 3.914, + "loss/crossentropy": 1.9013903141021729, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16768474876880646, + "step": 25114 + }, + { + "epoch": 0.50232, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0035807291666666665, + "learning_rate": 0.0001, + "loss": 3.8587, + "loss/crossentropy": 2.090600550174713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963217630982399, + "step": 25116 + }, + { + "epoch": 0.50236, + "grad_norm": 1.9765625, + "grad_norm_var": 0.003929646809895834, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 1.938132882118225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17899858951568604, + "step": 25118 + }, + { + "epoch": 0.5024, + "grad_norm": 2.0, + "grad_norm_var": 0.0031613667805989584, + "learning_rate": 0.0001, + "loss": 4.1818, + "loss/crossentropy": 2.2119863033294678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20339767634868622, + "step": 25120 + }, + { + "epoch": 0.50244, + "grad_norm": 2.0, + "grad_norm_var": 0.0031809488932291667, + "learning_rate": 0.0001, + "loss": 4.2186, + "loss/crossentropy": 2.35983407497406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063140720129013, + "step": 25122 + }, + { + "epoch": 0.50248, + "grad_norm": 1.8515625, + "grad_norm_var": 0.003830718994140625, + "learning_rate": 0.0001, + "loss": 4.2016, + "loss/crossentropy": 2.0203242897987366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19055970013141632, + "step": 25124 + }, + { + "epoch": 0.50252, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0050961812337239586, + "learning_rate": 0.0001, + "loss": 3.6522, + "loss/crossentropy": 1.7058457732200623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857994720339775, + "step": 25126 + }, + { + "epoch": 0.50256, + "grad_norm": 1.828125, + "grad_norm_var": 0.0054514567057291664, + "learning_rate": 0.0001, + "loss": 3.7193, + "loss/crossentropy": 1.720770239830017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1738799437880516, + "step": 25128 + }, + { + "epoch": 0.5026, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006247711181640625, + "learning_rate": 0.0001, + "loss": 3.8967, + "loss/crossentropy": 1.677247405052185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18985359370708466, + "step": 25130 + }, + { + "epoch": 0.50264, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0058977762858072914, + "learning_rate": 0.0001, + "loss": 4.0737, + "loss/crossentropy": 2.1808619499206543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19232448935508728, + "step": 25132 + }, + { + "epoch": 0.50268, + "grad_norm": 1.6953125, + "grad_norm_var": 0.008512369791666667, + "learning_rate": 0.0001, + "loss": 3.8655, + "loss/crossentropy": 1.8475935459136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18105845898389816, + "step": 25134 + }, + { + "epoch": 0.50272, + "grad_norm": 2.03125, + "grad_norm_var": 0.012052154541015625, + "learning_rate": 0.0001, + "loss": 3.8568, + "loss/crossentropy": 2.0687568187713623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997617706656456, + "step": 25136 + }, + { + "epoch": 0.50276, + "grad_norm": 2.015625, + "grad_norm_var": 0.01256103515625, + "learning_rate": 0.0001, + "loss": 4.29, + "loss/crossentropy": 2.405083179473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22054974734783173, + "step": 25138 + }, + { + "epoch": 0.5028, + "grad_norm": 2.078125, + "grad_norm_var": 0.014037068684895833, + "learning_rate": 0.0001, + "loss": 4.0613, + "loss/crossentropy": 1.917975127696991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20000945031642914, + "step": 25140 + }, + { + "epoch": 0.50284, + "grad_norm": 2.09375, + "grad_norm_var": 0.017010243733723958, + "learning_rate": 0.0001, + "loss": 4.1259, + "loss/crossentropy": 2.270500898361206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21464426815509796, + "step": 25142 + }, + { + "epoch": 0.50288, + "grad_norm": 1.875, + "grad_norm_var": 0.017145792643229168, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 2.0085031390190125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20128140598535538, + "step": 25144 + }, + { + "epoch": 0.50292, + "grad_norm": 1.96875, + "grad_norm_var": 0.0150299072265625, + "learning_rate": 0.0001, + "loss": 4.1969, + "loss/crossentropy": 2.206787347793579, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19804110378026962, + "step": 25146 + }, + { + "epoch": 0.50296, + "grad_norm": 1.875, + "grad_norm_var": 0.015428670247395833, + "learning_rate": 0.0001, + "loss": 3.8519, + "loss/crossentropy": 1.7760102152824402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18732204288244247, + "step": 25148 + }, + { + "epoch": 0.503, + "grad_norm": 2.046875, + "grad_norm_var": 0.011439768473307292, + "learning_rate": 0.0001, + "loss": 4.1334, + "loss/crossentropy": 1.9429696202278137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21402037143707275, + "step": 25150 + }, + { + "epoch": 0.50304, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007315826416015625, + "learning_rate": 0.0001, + "loss": 3.8488, + "loss/crossentropy": 1.6982505321502686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16420165449380875, + "step": 25152 + }, + { + "epoch": 0.50308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0076324462890625, + "learning_rate": 0.0001, + "loss": 4.0041, + "loss/crossentropy": 1.998267948627472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.193763867020607, + "step": 25154 + }, + { + "epoch": 0.50312, + "grad_norm": 2.1875, + "grad_norm_var": 0.009917958577473959, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 2.4127193689346313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2271365001797676, + "step": 25156 + }, + { + "epoch": 0.50316, + "grad_norm": 1.859375, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 3.8656, + "loss/crossentropy": 1.5740959644317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17892491817474365, + "step": 25158 + }, + { + "epoch": 0.5032, + "grad_norm": 2.03125, + "grad_norm_var": 0.010205078125, + "learning_rate": 0.0001, + "loss": 4.079, + "loss/crossentropy": 2.1531224250793457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20502899587154388, + "step": 25160 + }, + { + "epoch": 0.50324, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0105621337890625, + "learning_rate": 0.0001, + "loss": 3.9895, + "loss/crossentropy": 2.024933695793152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21745888888835907, + "step": 25162 + }, + { + "epoch": 0.50328, + "grad_norm": 1.890625, + "grad_norm_var": 0.010465240478515625, + "learning_rate": 0.0001, + "loss": 3.7801, + "loss/crossentropy": 1.7188761234283447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754877120256424, + "step": 25164 + }, + { + "epoch": 0.50332, + "grad_norm": 1.96875, + "grad_norm_var": 0.009642537434895833, + "learning_rate": 0.0001, + "loss": 3.9803, + "loss/crossentropy": 2.1078373193740845, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812133714556694, + "step": 25166 + }, + { + "epoch": 0.50336, + "grad_norm": 2.046875, + "grad_norm_var": 0.010949452718098959, + "learning_rate": 0.0001, + "loss": 3.753, + "loss/crossentropy": 1.793417751789093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1709349974989891, + "step": 25168 + }, + { + "epoch": 0.5034, + "grad_norm": 1.734375, + "grad_norm_var": 0.014115142822265624, + "learning_rate": 0.0001, + "loss": 3.2689, + "loss/crossentropy": 1.357922375202179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14589300006628036, + "step": 25170 + }, + { + "epoch": 0.50344, + "grad_norm": 2.109375, + "grad_norm_var": 0.014247385660807292, + "learning_rate": 0.0001, + "loss": 4.4387, + "loss/crossentropy": 2.3469592332839966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20105785876512527, + "step": 25172 + }, + { + "epoch": 0.50348, + "grad_norm": 1.890625, + "grad_norm_var": 0.014159138997395833, + "learning_rate": 0.0001, + "loss": 3.8041, + "loss/crossentropy": 2.1891257762908936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21113110333681107, + "step": 25174 + }, + { + "epoch": 0.50352, + "grad_norm": 1.890625, + "grad_norm_var": 0.012878163655598959, + "learning_rate": 0.0001, + "loss": 3.9236, + "loss/crossentropy": 2.201194167137146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21105756610631943, + "step": 25176 + }, + { + "epoch": 0.50356, + "grad_norm": 1.859375, + "grad_norm_var": 0.012702433268229167, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 1.8287039995193481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18074452877044678, + "step": 25178 + }, + { + "epoch": 0.5036, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012813313802083334, + "learning_rate": 0.0001, + "loss": 4.1655, + "loss/crossentropy": 1.8915085792541504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997976452112198, + "step": 25180 + }, + { + "epoch": 0.50364, + "grad_norm": 1.7109375, + "grad_norm_var": 0.014509836832682291, + "learning_rate": 0.0001, + "loss": 3.8989, + "loss/crossentropy": 2.102576494216919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18678200989961624, + "step": 25182 + }, + { + "epoch": 0.50368, + "grad_norm": 2.078125, + "grad_norm_var": 0.014898427327473958, + "learning_rate": 0.0001, + "loss": 4.3278, + "loss/crossentropy": 2.1603941917419434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20021572709083557, + "step": 25184 + }, + { + "epoch": 0.50372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011263020833333333, + "learning_rate": 0.0001, + "loss": 3.6356, + "loss/crossentropy": 1.8718876838684082, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18388725817203522, + "step": 25186 + }, + { + "epoch": 0.50376, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0069272359212239586, + "learning_rate": 0.0001, + "loss": 4.1273, + "loss/crossentropy": 2.2126933336257935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20829670876264572, + "step": 25188 + }, + { + "epoch": 0.5038, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0072743733723958336, + "learning_rate": 0.0001, + "loss": 4.0426, + "loss/crossentropy": 2.1955376863479614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19484563171863556, + "step": 25190 + }, + { + "epoch": 0.50384, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0067535400390625, + "learning_rate": 0.0001, + "loss": 4.0132, + "loss/crossentropy": 1.9294677376747131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266455084085464, + "step": 25192 + }, + { + "epoch": 0.50388, + "grad_norm": 2.265625, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 4.1586, + "loss/crossentropy": 1.867956280708313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2331952303647995, + "step": 25194 + }, + { + "epoch": 0.50392, + "grad_norm": 1.90625, + "grad_norm_var": 0.012984212239583333, + "learning_rate": 0.0001, + "loss": 3.8322, + "loss/crossentropy": 1.7810456156730652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834517866373062, + "step": 25196 + }, + { + "epoch": 0.50396, + "grad_norm": 1.890625, + "grad_norm_var": 0.010711415608723959, + "learning_rate": 0.0001, + "loss": 3.7718, + "loss/crossentropy": 2.148942291736603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065306007862091, + "step": 25198 + }, + { + "epoch": 0.504, + "grad_norm": 1.84375, + "grad_norm_var": 0.011291249593098959, + "learning_rate": 0.0001, + "loss": 3.9163, + "loss/crossentropy": 1.730657696723938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18555008620023727, + "step": 25200 + }, + { + "epoch": 0.50404, + "grad_norm": 2.078125, + "grad_norm_var": 0.014190419514973959, + "learning_rate": 0.0001, + "loss": 3.9587, + "loss/crossentropy": 2.2561585903167725, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22614946961402893, + "step": 25202 + }, + { + "epoch": 0.50408, + "grad_norm": 1.90625, + "grad_norm_var": 0.014324696858723958, + "learning_rate": 0.0001, + "loss": 4.1299, + "loss/crossentropy": 2.2798372507095337, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19820939004421234, + "step": 25204 + }, + { + "epoch": 0.50412, + "grad_norm": 2.078125, + "grad_norm_var": 0.014582316080729166, + "learning_rate": 0.0001, + "loss": 4.1232, + "loss/crossentropy": 2.2171024680137634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22784201800823212, + "step": 25206 + }, + { + "epoch": 0.50416, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016283162434895835, + "learning_rate": 0.0001, + "loss": 3.8819, + "loss/crossentropy": 1.867311179637909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19023532420396805, + "step": 25208 + }, + { + "epoch": 0.5042, + "grad_norm": 1.953125, + "grad_norm_var": 0.009506988525390624, + "learning_rate": 0.0001, + "loss": 3.843, + "loss/crossentropy": 2.0274672508239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19995202869176865, + "step": 25210 + }, + { + "epoch": 0.50424, + "grad_norm": 1.96875, + "grad_norm_var": 0.009663899739583334, + "learning_rate": 0.0001, + "loss": 3.7862, + "loss/crossentropy": 1.88859623670578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1842176765203476, + "step": 25212 + }, + { + "epoch": 0.50428, + "grad_norm": 1.953125, + "grad_norm_var": 0.008786773681640625, + "learning_rate": 0.0001, + "loss": 3.9548, + "loss/crossentropy": 1.917616069316864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19126974791288376, + "step": 25214 + }, + { + "epoch": 0.50432, + "grad_norm": 1.921875, + "grad_norm_var": 0.0074615478515625, + "learning_rate": 0.0001, + "loss": 3.928, + "loss/crossentropy": 2.111130714416504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20154303312301636, + "step": 25216 + }, + { + "epoch": 0.50436, + "grad_norm": 1.96875, + "grad_norm_var": 0.004857381184895833, + "learning_rate": 0.0001, + "loss": 3.9316, + "loss/crossentropy": 2.0559436678886414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823289915919304, + "step": 25218 + }, + { + "epoch": 0.5044, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007225545247395834, + "learning_rate": 0.0001, + "loss": 4.2238, + "loss/crossentropy": 1.8830538392066956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17736117541790009, + "step": 25220 + }, + { + "epoch": 0.50444, + "grad_norm": 2.140625, + "grad_norm_var": 0.009488932291666667, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.9821715354919434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20550980418920517, + "step": 25222 + }, + { + "epoch": 0.50448, + "grad_norm": 2.03125, + "grad_norm_var": 0.008670806884765625, + "learning_rate": 0.0001, + "loss": 4.0223, + "loss/crossentropy": 2.182245671749115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2306300848722458, + "step": 25224 + }, + { + "epoch": 0.50452, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0091217041015625, + "learning_rate": 0.0001, + "loss": 3.9751, + "loss/crossentropy": 1.6160125136375427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15688229352235794, + "step": 25226 + }, + { + "epoch": 0.50456, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008988189697265624, + "learning_rate": 0.0001, + "loss": 4.037, + "loss/crossentropy": 1.8484386801719666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19160201400518417, + "step": 25228 + }, + { + "epoch": 0.5046, + "grad_norm": 2.0, + "grad_norm_var": 0.008438873291015624, + "learning_rate": 0.0001, + "loss": 4.2143, + "loss/crossentropy": 2.153999447822571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077987790107727, + "step": 25230 + }, + { + "epoch": 0.50464, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008487701416015625, + "learning_rate": 0.0001, + "loss": 4.0174, + "loss/crossentropy": 2.1414119601249695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2047003135085106, + "step": 25232 + }, + { + "epoch": 0.50468, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00701904296875, + "learning_rate": 0.0001, + "loss": 4.1841, + "loss/crossentropy": 2.128756880760193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19793596118688583, + "step": 25234 + }, + { + "epoch": 0.50472, + "grad_norm": 1.8125, + "grad_norm_var": 0.008160146077473958, + "learning_rate": 0.0001, + "loss": 4.0356, + "loss/crossentropy": 1.9649195075035095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19532814621925354, + "step": 25236 + }, + { + "epoch": 0.50476, + "grad_norm": 1.921875, + "grad_norm_var": 0.006158192952473958, + "learning_rate": 0.0001, + "loss": 4.3162, + "loss/crossentropy": 2.165442705154419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19320187717676163, + "step": 25238 + }, + { + "epoch": 0.5048, + "grad_norm": 1.828125, + "grad_norm_var": 0.005909983317057292, + "learning_rate": 0.0001, + "loss": 4.0444, + "loss/crossentropy": 2.172460913658142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18116996437311172, + "step": 25240 + }, + { + "epoch": 0.50484, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0055501302083333336, + "learning_rate": 0.0001, + "loss": 3.9613, + "loss/crossentropy": 1.9772083163261414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840450048446655, + "step": 25242 + }, + { + "epoch": 0.50488, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005952962239583333, + "learning_rate": 0.0001, + "loss": 3.8147, + "loss/crossentropy": 1.9985793828964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19273578375577927, + "step": 25244 + }, + { + "epoch": 0.50492, + "grad_norm": 1.953125, + "grad_norm_var": 0.00513916015625, + "learning_rate": 0.0001, + "loss": 4.0365, + "loss/crossentropy": 2.339000701904297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2071395143866539, + "step": 25246 + }, + { + "epoch": 0.50496, + "grad_norm": 1.90625, + "grad_norm_var": 0.006056467692057292, + "learning_rate": 0.0001, + "loss": 4.2748, + "loss/crossentropy": 2.1699774861335754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179349735379219, + "step": 25248 + }, + { + "epoch": 0.505, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0066569010416666664, + "learning_rate": 0.0001, + "loss": 3.9062, + "loss/crossentropy": 1.9445035457611084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17794214189052582, + "step": 25250 + }, + { + "epoch": 0.50504, + "grad_norm": 2.109375, + "grad_norm_var": 0.0077626546223958336, + "learning_rate": 0.0001, + "loss": 4.3328, + "loss/crossentropy": 2.325773239135742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460912585258484, + "step": 25252 + }, + { + "epoch": 0.50508, + "grad_norm": 21.75, + "grad_norm_var": 24.616644032796223, + "learning_rate": 0.0001, + "loss": 4.5896, + "loss/crossentropy": 2.308157444000244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063561975955963, + "step": 25254 + }, + { + "epoch": 0.50512, + "grad_norm": 2.1875, + "grad_norm_var": 24.524991607666017, + "learning_rate": 0.0001, + "loss": 4.1727, + "loss/crossentropy": 2.1321988105773926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19514495134353638, + "step": 25256 + }, + { + "epoch": 0.50516, + "grad_norm": 2.03125, + "grad_norm_var": 24.486148834228516, + "learning_rate": 0.0001, + "loss": 3.9135, + "loss/crossentropy": 1.7947803735733032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18287408351898193, + "step": 25258 + }, + { + "epoch": 0.5052, + "grad_norm": 1.8046875, + "grad_norm_var": 24.50101318359375, + "learning_rate": 0.0001, + "loss": 4.0195, + "loss/crossentropy": 2.0636664628982544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20408915728330612, + "step": 25260 + }, + { + "epoch": 0.50524, + "grad_norm": 1.9296875, + "grad_norm_var": 24.510210927327474, + "learning_rate": 0.0001, + "loss": 3.904, + "loss/crossentropy": 2.113875925540924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19412121921777725, + "step": 25262 + }, + { + "epoch": 0.50528, + "grad_norm": 1.8671875, + "grad_norm_var": 24.563250478108724, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 1.9069438576698303, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18107398599386215, + "step": 25264 + }, + { + "epoch": 0.50532, + "grad_norm": 1.90625, + "grad_norm_var": 24.5382687886556, + "learning_rate": 0.0001, + "loss": 3.7792, + "loss/crossentropy": 1.9241713881492615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18617494404315948, + "step": 25266 + }, + { + "epoch": 0.50536, + "grad_norm": 1.984375, + "grad_norm_var": 24.575275675455728, + "learning_rate": 0.0001, + "loss": 3.8934, + "loss/crossentropy": 1.9778280854225159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19652489572763443, + "step": 25268 + }, + { + "epoch": 0.5054, + "grad_norm": 1.9140625, + "grad_norm_var": 0.014729817708333334, + "learning_rate": 0.0001, + "loss": 3.7332, + "loss/crossentropy": 2.0369997024536133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18344805389642715, + "step": 25270 + }, + { + "epoch": 0.50544, + "grad_norm": 2.015625, + "grad_norm_var": 0.007176717122395833, + "learning_rate": 0.0001, + "loss": 4.0336, + "loss/crossentropy": 1.9891296029090881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19600480049848557, + "step": 25272 + }, + { + "epoch": 0.50548, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005619049072265625, + "learning_rate": 0.0001, + "loss": 4.024, + "loss/crossentropy": 2.325801730155945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21493100374937057, + "step": 25274 + }, + { + "epoch": 0.50552, + "grad_norm": 1.953125, + "grad_norm_var": 0.005736287434895833, + "learning_rate": 0.0001, + "loss": 4.067, + "loss/crossentropy": 1.9277620315551758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140570506453514, + "step": 25276 + }, + { + "epoch": 0.50556, + "grad_norm": 1.875, + "grad_norm_var": 0.005763498942057291, + "learning_rate": 0.0001, + "loss": 3.9703, + "loss/crossentropy": 2.3552772998809814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21981871128082275, + "step": 25278 + }, + { + "epoch": 0.5056, + "grad_norm": 1.8046875, + "grad_norm_var": 0.005338287353515625, + "learning_rate": 0.0001, + "loss": 3.7174, + "loss/crossentropy": 1.7559251189231873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1645895466208458, + "step": 25280 + }, + { + "epoch": 0.50564, + "grad_norm": 1.921875, + "grad_norm_var": 0.005149078369140625, + "learning_rate": 0.0001, + "loss": 4.0222, + "loss/crossentropy": 1.8345605731010437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18997152149677277, + "step": 25282 + }, + { + "epoch": 0.50568, + "grad_norm": 2.03125, + "grad_norm_var": 0.005150349934895834, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 2.297994017601013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2172359898686409, + "step": 25284 + }, + { + "epoch": 0.50572, + "grad_norm": 1.9375, + "grad_norm_var": 0.004239908854166667, + "learning_rate": 0.0001, + "loss": 3.9462, + "loss/crossentropy": 1.9640920758247375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1819777637720108, + "step": 25286 + }, + { + "epoch": 0.50576, + "grad_norm": 1.7734375, + "grad_norm_var": 0.004957834879557292, + "learning_rate": 0.0001, + "loss": 3.9544, + "loss/crossentropy": 2.0045265555381775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20263119041919708, + "step": 25288 + }, + { + "epoch": 0.5058, + "grad_norm": 1.75, + "grad_norm_var": 0.006058756510416667, + "learning_rate": 0.0001, + "loss": 3.8583, + "loss/crossentropy": 2.1333796977996826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355874300003052, + "step": 25290 + }, + { + "epoch": 0.50584, + "grad_norm": 2.0, + "grad_norm_var": 0.006827799479166666, + "learning_rate": 0.0001, + "loss": 4.3003, + "loss/crossentropy": 2.1814208030700684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19890426099300385, + "step": 25292 + }, + { + "epoch": 0.50588, + "grad_norm": 2.0, + "grad_norm_var": 0.007283528645833333, + "learning_rate": 0.0001, + "loss": 4.4133, + "loss/crossentropy": 2.1642476320266724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955343708395958, + "step": 25294 + }, + { + "epoch": 0.50592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006498209635416667, + "learning_rate": 0.0001, + "loss": 4.1446, + "loss/crossentropy": 2.126586079597473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19216856360435486, + "step": 25296 + }, + { + "epoch": 0.50596, + "grad_norm": 1.84375, + "grad_norm_var": 0.007094065348307292, + "learning_rate": 0.0001, + "loss": 4.2278, + "loss/crossentropy": 2.4168988466262817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21824205666780472, + "step": 25298 + }, + { + "epoch": 0.506, + "grad_norm": 1.96875, + "grad_norm_var": 0.005887858072916667, + "learning_rate": 0.0001, + "loss": 4.103, + "loss/crossentropy": 2.013881802558899, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2011483684182167, + "step": 25300 + }, + { + "epoch": 0.50604, + "grad_norm": 1.953125, + "grad_norm_var": 0.007964833577473959, + "learning_rate": 0.0001, + "loss": 4.0249, + "loss/crossentropy": 2.18127703666687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19816970825195312, + "step": 25302 + }, + { + "epoch": 0.50608, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006249745686848958, + "learning_rate": 0.0001, + "loss": 4.0909, + "loss/crossentropy": 1.8370496034622192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18162214756011963, + "step": 25304 + }, + { + "epoch": 0.50612, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0038998921712239585, + "learning_rate": 0.0001, + "loss": 4.0421, + "loss/crossentropy": 2.12531840801239, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21469102054834366, + "step": 25306 + }, + { + "epoch": 0.50616, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004961903889973958, + "learning_rate": 0.0001, + "loss": 3.95, + "loss/crossentropy": 1.956727385520935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18601331859827042, + "step": 25308 + }, + { + "epoch": 0.5062, + "grad_norm": 2.359375, + "grad_norm_var": 0.016161092122395835, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 2.232389807701111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19958854466676712, + "step": 25310 + }, + { + "epoch": 0.50624, + "grad_norm": 1.921875, + "grad_norm_var": 0.016280110677083334, + "learning_rate": 0.0001, + "loss": 4.0164, + "loss/crossentropy": 2.0312620997428894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19724415987730026, + "step": 25312 + }, + { + "epoch": 0.50628, + "grad_norm": 2.09375, + "grad_norm_var": 0.01625544230143229, + "learning_rate": 0.0001, + "loss": 4.232, + "loss/crossentropy": 2.344046115875244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20940567553043365, + "step": 25314 + }, + { + "epoch": 0.50632, + "grad_norm": 1.84375, + "grad_norm_var": 0.017014312744140624, + "learning_rate": 0.0001, + "loss": 4.046, + "loss/crossentropy": 1.99763023853302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835399493575096, + "step": 25316 + }, + { + "epoch": 0.50636, + "grad_norm": 1.90625, + "grad_norm_var": 0.017044830322265624, + "learning_rate": 0.0001, + "loss": 3.6584, + "loss/crossentropy": 1.9649406671524048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19432833045721054, + "step": 25318 + }, + { + "epoch": 0.5064, + "grad_norm": 1.9453125, + "grad_norm_var": 0.016434478759765624, + "learning_rate": 0.0001, + "loss": 3.9027, + "loss/crossentropy": 1.8670902848243713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731145828962326, + "step": 25320 + }, + { + "epoch": 0.50644, + "grad_norm": 1.984375, + "grad_norm_var": 0.016706339518229165, + "learning_rate": 0.0001, + "loss": 4.0066, + "loss/crossentropy": 2.0195581912994385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19497420638799667, + "step": 25322 + }, + { + "epoch": 0.50648, + "grad_norm": 2.078125, + "grad_norm_var": 0.016290028889973957, + "learning_rate": 0.0001, + "loss": 4.1156, + "loss/crossentropy": 2.1221953630447388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20753790438175201, + "step": 25324 + }, + { + "epoch": 0.50652, + "grad_norm": 2.09375, + "grad_norm_var": 0.007169596354166667, + "learning_rate": 0.0001, + "loss": 4.2117, + "loss/crossentropy": 2.1407066583633423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934654265642166, + "step": 25326 + }, + { + "epoch": 0.50656, + "grad_norm": 2.015625, + "grad_norm_var": 0.007917277018229167, + "learning_rate": 0.0001, + "loss": 3.9074, + "loss/crossentropy": 1.956154465675354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854771375656128, + "step": 25328 + }, + { + "epoch": 0.5066, + "grad_norm": 1.921875, + "grad_norm_var": 0.007450103759765625, + "learning_rate": 0.0001, + "loss": 4.0652, + "loss/crossentropy": 2.0948599576950073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326548278331757, + "step": 25330 + }, + { + "epoch": 0.50664, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006705729166666666, + "learning_rate": 0.0001, + "loss": 4.0609, + "loss/crossentropy": 2.137898027896881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20896823704242706, + "step": 25332 + }, + { + "epoch": 0.50668, + "grad_norm": 1.984375, + "grad_norm_var": 0.005859375, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 1.6471683382987976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814308613538742, + "step": 25334 + }, + { + "epoch": 0.50672, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007165273030598958, + "learning_rate": 0.0001, + "loss": 3.7461, + "loss/crossentropy": 1.861536979675293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17766699194908142, + "step": 25336 + }, + { + "epoch": 0.50676, + "grad_norm": 1.90625, + "grad_norm_var": 0.006974029541015625, + "learning_rate": 0.0001, + "loss": 3.9319, + "loss/crossentropy": 2.312160909175873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20586217939853668, + "step": 25338 + }, + { + "epoch": 0.5068, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005293528238932292, + "learning_rate": 0.0001, + "loss": 4.1102, + "loss/crossentropy": 1.8415343165397644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20047077536582947, + "step": 25340 + }, + { + "epoch": 0.50684, + "grad_norm": 2.046875, + "grad_norm_var": 0.0033925374348958335, + "learning_rate": 0.0001, + "loss": 4.0579, + "loss/crossentropy": 1.94148850440979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19869551062583923, + "step": 25342 + }, + { + "epoch": 0.50688, + "grad_norm": 1.9296875, + "grad_norm_var": 0.002559407552083333, + "learning_rate": 0.0001, + "loss": 4.0379, + "loss/crossentropy": 2.280241370201111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19969766587018967, + "step": 25344 + }, + { + "epoch": 0.50692, + "grad_norm": 1.859375, + "grad_norm_var": 0.0026140848795572916, + "learning_rate": 0.0001, + "loss": 4.1569, + "loss/crossentropy": 2.0140087604522705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20252882689237595, + "step": 25346 + }, + { + "epoch": 0.50696, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0073201497395833336, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 2.186760902404785, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20365024358034134, + "step": 25348 + }, + { + "epoch": 0.507, + "grad_norm": 1.90625, + "grad_norm_var": 0.007161458333333333, + "learning_rate": 0.0001, + "loss": 4.2392, + "loss/crossentropy": 2.324553370475769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20509789884090424, + "step": 25350 + }, + { + "epoch": 0.50704, + "grad_norm": 2.0, + "grad_norm_var": 0.006638336181640625, + "learning_rate": 0.0001, + "loss": 4.255, + "loss/crossentropy": 2.2305015325546265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21707269549369812, + "step": 25352 + }, + { + "epoch": 0.50708, + "grad_norm": 1.953125, + "grad_norm_var": 0.006864166259765625, + "learning_rate": 0.0001, + "loss": 4.0548, + "loss/crossentropy": 2.1571671962738037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20072072744369507, + "step": 25354 + }, + { + "epoch": 0.50712, + "grad_norm": 1.953125, + "grad_norm_var": 0.0072825113932291664, + "learning_rate": 0.0001, + "loss": 4.0577, + "loss/crossentropy": 2.014027178287506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1823829561471939, + "step": 25356 + }, + { + "epoch": 0.50716, + "grad_norm": 1.9375, + "grad_norm_var": 0.008304595947265625, + "learning_rate": 0.0001, + "loss": 3.7066, + "loss/crossentropy": 2.306033492088318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2326786294579506, + "step": 25358 + }, + { + "epoch": 0.5072, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008335113525390625, + "learning_rate": 0.0001, + "loss": 3.9867, + "loss/crossentropy": 2.1029749512672424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578313827514648, + "step": 25360 + }, + { + "epoch": 0.50724, + "grad_norm": 1.921875, + "grad_norm_var": 0.009242502848307292, + "learning_rate": 0.0001, + "loss": 3.9696, + "loss/crossentropy": 2.244466543197632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20059070736169815, + "step": 25362 + }, + { + "epoch": 0.50728, + "grad_norm": 2.109375, + "grad_norm_var": 0.006858062744140625, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.3815391063690186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21556073427200317, + "step": 25364 + }, + { + "epoch": 0.50732, + "grad_norm": 2.0, + "grad_norm_var": 0.013767242431640625, + "learning_rate": 0.0001, + "loss": 4.0822, + "loss/crossentropy": 1.997973620891571, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20311636477708817, + "step": 25366 + }, + { + "epoch": 0.50736, + "grad_norm": 1.765625, + "grad_norm_var": 0.016845703125, + "learning_rate": 0.0001, + "loss": 3.8043, + "loss/crossentropy": 2.2644081115722656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022029459476471, + "step": 25368 + }, + { + "epoch": 0.5074, + "grad_norm": 1.921875, + "grad_norm_var": 0.01632868448893229, + "learning_rate": 0.0001, + "loss": 3.9491, + "loss/crossentropy": 1.7542709112167358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15834683179855347, + "step": 25370 + }, + { + "epoch": 0.50744, + "grad_norm": 1.90625, + "grad_norm_var": 0.016502888997395833, + "learning_rate": 0.0001, + "loss": 4.0981, + "loss/crossentropy": 2.2310311794281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21090705692768097, + "step": 25372 + }, + { + "epoch": 0.50748, + "grad_norm": 1.921875, + "grad_norm_var": 0.014977773030598959, + "learning_rate": 0.0001, + "loss": 3.9221, + "loss/crossentropy": 1.9018943905830383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821354478597641, + "step": 25374 + }, + { + "epoch": 0.50752, + "grad_norm": 1.8125, + "grad_norm_var": 0.01602961222330729, + "learning_rate": 0.0001, + "loss": 4.0671, + "loss/crossentropy": 2.0701186656951904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19244033098220825, + "step": 25376 + }, + { + "epoch": 0.50756, + "grad_norm": 1.890625, + "grad_norm_var": 0.01553955078125, + "learning_rate": 0.0001, + "loss": 3.7639, + "loss/crossentropy": 2.0421605110168457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1847202628850937, + "step": 25378 + }, + { + "epoch": 0.5076, + "grad_norm": 1.953125, + "grad_norm_var": 0.014314524332682292, + "learning_rate": 0.0001, + "loss": 3.9923, + "loss/crossentropy": 1.9375264048576355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17147281020879745, + "step": 25380 + }, + { + "epoch": 0.50764, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0062408447265625, + "learning_rate": 0.0001, + "loss": 4.0086, + "loss/crossentropy": 1.9593093395233154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151991605758667, + "step": 25382 + }, + { + "epoch": 0.50768, + "grad_norm": 1.7734375, + "grad_norm_var": 0.007500966389973958, + "learning_rate": 0.0001, + "loss": 3.9897, + "loss/crossentropy": 1.9283559322357178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18128316849470139, + "step": 25384 + }, + { + "epoch": 0.50772, + "grad_norm": 2.015625, + "grad_norm_var": 0.010970052083333333, + "learning_rate": 0.0001, + "loss": 4.385, + "loss/crossentropy": 1.912258267402649, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18579021841287613, + "step": 25386 + }, + { + "epoch": 0.50776, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010619099934895833, + "learning_rate": 0.0001, + "loss": 4.0037, + "loss/crossentropy": 2.161900043487549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025725245475769, + "step": 25388 + }, + { + "epoch": 0.5078, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010733795166015626, + "learning_rate": 0.0001, + "loss": 4.2077, + "loss/crossentropy": 2.1753687858581543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21103990823030472, + "step": 25390 + }, + { + "epoch": 0.50784, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010581207275390626, + "learning_rate": 0.0001, + "loss": 4.2122, + "loss/crossentropy": 1.869983971118927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18909727036952972, + "step": 25392 + }, + { + "epoch": 0.50788, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0122711181640625, + "learning_rate": 0.0001, + "loss": 4.0602, + "loss/crossentropy": 2.0557621121406555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18932487815618515, + "step": 25394 + }, + { + "epoch": 0.50792, + "grad_norm": 2.015625, + "grad_norm_var": 0.011946360270182291, + "learning_rate": 0.0001, + "loss": 4.0104, + "loss/crossentropy": 1.865005910396576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2251267060637474, + "step": 25396 + }, + { + "epoch": 0.50796, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011503092447916667, + "learning_rate": 0.0001, + "loss": 3.836, + "loss/crossentropy": 1.7247771620750427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16510912030935287, + "step": 25398 + }, + { + "epoch": 0.508, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009266916910807292, + "learning_rate": 0.0001, + "loss": 4.0276, + "loss/crossentropy": 2.110623359680176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18747511506080627, + "step": 25400 + }, + { + "epoch": 0.50804, + "grad_norm": 2.09375, + "grad_norm_var": 0.008058420817057292, + "learning_rate": 0.0001, + "loss": 4.1451, + "loss/crossentropy": 2.0501877069473267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19226385653018951, + "step": 25402 + }, + { + "epoch": 0.50808, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008119455973307292, + "learning_rate": 0.0001, + "loss": 3.8899, + "loss/crossentropy": 2.479986786842346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23350444436073303, + "step": 25404 + }, + { + "epoch": 0.50812, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008133951822916667, + "learning_rate": 0.0001, + "loss": 4.0184, + "loss/crossentropy": 2.215437889099121, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19325988739728928, + "step": 25406 + }, + { + "epoch": 0.50816, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008485666910807292, + "learning_rate": 0.0001, + "loss": 3.8584, + "loss/crossentropy": 2.1168838143348694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18737324327230453, + "step": 25408 + }, + { + "epoch": 0.5082, + "grad_norm": 2.0, + "grad_norm_var": 0.007173411051432292, + "learning_rate": 0.0001, + "loss": 3.9339, + "loss/crossentropy": 1.9043878316879272, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19250840693712234, + "step": 25410 + }, + { + "epoch": 0.50824, + "grad_norm": 2.15625, + "grad_norm_var": 0.010158030192057292, + "learning_rate": 0.0001, + "loss": 4.2541, + "loss/crossentropy": 2.411430239677429, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22225366532802582, + "step": 25412 + }, + { + "epoch": 0.50828, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009091949462890625, + "learning_rate": 0.0001, + "loss": 3.9909, + "loss/crossentropy": 1.843587577342987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18295426666736603, + "step": 25414 + }, + { + "epoch": 0.50832, + "grad_norm": 1.9375, + "grad_norm_var": 0.008733876546223958, + "learning_rate": 0.0001, + "loss": 3.94, + "loss/crossentropy": 1.8655884861946106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17755136638879776, + "step": 25416 + }, + { + "epoch": 0.50836, + "grad_norm": 1.859375, + "grad_norm_var": 0.007176717122395833, + "learning_rate": 0.0001, + "loss": 3.9523, + "loss/crossentropy": 1.8799603581428528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18243838846683502, + "step": 25418 + }, + { + "epoch": 0.5084, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007226308186848958, + "learning_rate": 0.0001, + "loss": 3.9171, + "loss/crossentropy": 2.2428990602493286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21759599447250366, + "step": 25420 + }, + { + "epoch": 0.50844, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008265940348307292, + "learning_rate": 0.0001, + "loss": 3.9152, + "loss/crossentropy": 1.9947530031204224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956867277622223, + "step": 25422 + }, + { + "epoch": 0.50848, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006359608968098959, + "learning_rate": 0.0001, + "loss": 3.9553, + "loss/crossentropy": 2.0679984092712402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21864020824432373, + "step": 25424 + }, + { + "epoch": 0.50852, + "grad_norm": 1.984375, + "grad_norm_var": 0.005712890625, + "learning_rate": 0.0001, + "loss": 4.0127, + "loss/crossentropy": 1.9529247879981995, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1891709715127945, + "step": 25426 + }, + { + "epoch": 0.50856, + "grad_norm": 1.859375, + "grad_norm_var": 0.006308746337890625, + "learning_rate": 0.0001, + "loss": 4.0049, + "loss/crossentropy": 2.1328742504119873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21699222922325134, + "step": 25428 + }, + { + "epoch": 0.5086, + "grad_norm": 2.015625, + "grad_norm_var": 0.00760498046875, + "learning_rate": 0.0001, + "loss": 4.0057, + "loss/crossentropy": 1.8275007009506226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1621478796005249, + "step": 25430 + }, + { + "epoch": 0.50864, + "grad_norm": 1.7265625, + "grad_norm_var": 0.010374959309895833, + "learning_rate": 0.0001, + "loss": 3.7843, + "loss/crossentropy": 2.134227454662323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19917849451303482, + "step": 25432 + }, + { + "epoch": 0.50868, + "grad_norm": 2.015625, + "grad_norm_var": 0.010499827067057292, + "learning_rate": 0.0001, + "loss": 4.099, + "loss/crossentropy": 2.040915012359619, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758517250418663, + "step": 25434 + }, + { + "epoch": 0.50872, + "grad_norm": 2.09375, + "grad_norm_var": 0.012247721354166666, + "learning_rate": 0.0001, + "loss": 4.0379, + "loss/crossentropy": 1.9219905734062195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26709431409835815, + "step": 25436 + }, + { + "epoch": 0.50876, + "grad_norm": 1.9375, + "grad_norm_var": 0.011683909098307292, + "learning_rate": 0.0001, + "loss": 3.8866, + "loss/crossentropy": 1.8520461320877075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17154939472675323, + "step": 25438 + }, + { + "epoch": 0.5088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.011891428629557292, + "learning_rate": 0.0001, + "loss": 3.9908, + "loss/crossentropy": 2.091896414756775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19294072687625885, + "step": 25440 + }, + { + "epoch": 0.50884, + "grad_norm": 1.890625, + "grad_norm_var": 0.012029774983723958, + "learning_rate": 0.0001, + "loss": 3.8595, + "loss/crossentropy": 2.209986925125122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19535083323717117, + "step": 25442 + }, + { + "epoch": 0.50888, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010643513997395833, + "learning_rate": 0.0001, + "loss": 3.5984, + "loss/crossentropy": 1.7103394865989685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17907221615314484, + "step": 25444 + }, + { + "epoch": 0.50892, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0099365234375, + "learning_rate": 0.0001, + "loss": 4.0083, + "loss/crossentropy": 2.185406744480133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19615823030471802, + "step": 25446 + }, + { + "epoch": 0.50896, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009317779541015625, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.7030528783798218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19345925003290176, + "step": 25448 + }, + { + "epoch": 0.509, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0098541259765625, + "learning_rate": 0.0001, + "loss": 3.8739, + "loss/crossentropy": 1.9207965731620789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18073637783527374, + "step": 25450 + }, + { + "epoch": 0.50904, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008307902018229167, + "learning_rate": 0.0001, + "loss": 3.8217, + "loss/crossentropy": 1.8818482160568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18431610614061356, + "step": 25452 + }, + { + "epoch": 0.50908, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009965006510416667, + "learning_rate": 0.0001, + "loss": 3.8117, + "loss/crossentropy": 2.2623773217201233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20106880366802216, + "step": 25454 + }, + { + "epoch": 0.50912, + "grad_norm": 1.921875, + "grad_norm_var": 0.010782623291015625, + "learning_rate": 0.0001, + "loss": 4.1155, + "loss/crossentropy": 2.303765892982483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19978389143943787, + "step": 25456 + }, + { + "epoch": 0.50916, + "grad_norm": 2.09375, + "grad_norm_var": 0.012824503580729167, + "learning_rate": 0.0001, + "loss": 4.0437, + "loss/crossentropy": 2.1355656385421753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279624313116074, + "step": 25458 + }, + { + "epoch": 0.5092, + "grad_norm": 2.078125, + "grad_norm_var": 0.012621815999348958, + "learning_rate": 0.0001, + "loss": 3.9021, + "loss/crossentropy": 1.7096583843231201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16103574633598328, + "step": 25460 + }, + { + "epoch": 0.50924, + "grad_norm": 1.90625, + "grad_norm_var": 0.012889607747395834, + "learning_rate": 0.0001, + "loss": 3.9125, + "loss/crossentropy": 1.7230717539787292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16039150953292847, + "step": 25462 + }, + { + "epoch": 0.50928, + "grad_norm": 1.9375, + "grad_norm_var": 0.024662017822265625, + "learning_rate": 0.0001, + "loss": 4.0199, + "loss/crossentropy": 1.983295977115631, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20290112495422363, + "step": 25464 + }, + { + "epoch": 0.50932, + "grad_norm": 1.9140625, + "grad_norm_var": 0.024812825520833335, + "learning_rate": 0.0001, + "loss": 3.8669, + "loss/crossentropy": 1.7826906442642212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1826622262597084, + "step": 25466 + }, + { + "epoch": 0.50936, + "grad_norm": 1.8125, + "grad_norm_var": 0.024472808837890624, + "learning_rate": 0.0001, + "loss": 3.6992, + "loss/crossentropy": 1.7718060612678528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1686256304383278, + "step": 25468 + }, + { + "epoch": 0.5094, + "grad_norm": 1.8515625, + "grad_norm_var": 0.021605428059895834, + "learning_rate": 0.0001, + "loss": 3.7736, + "loss/crossentropy": 1.9546560645103455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890355497598648, + "step": 25470 + }, + { + "epoch": 0.50944, + "grad_norm": 1.7578125, + "grad_norm_var": 0.024265289306640625, + "learning_rate": 0.0001, + "loss": 3.7519, + "loss/crossentropy": 2.2464581727981567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21081877499818802, + "step": 25472 + }, + { + "epoch": 0.50948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.022810872395833334, + "learning_rate": 0.0001, + "loss": 4.0546, + "loss/crossentropy": 2.150168299674988, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20787308365106583, + "step": 25474 + }, + { + "epoch": 0.50952, + "grad_norm": 1.828125, + "grad_norm_var": 0.020169830322265624, + "learning_rate": 0.0001, + "loss": 3.8762, + "loss/crossentropy": 2.08556866645813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18290850520133972, + "step": 25476 + }, + { + "epoch": 0.50956, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0203125, + "learning_rate": 0.0001, + "loss": 4.1563, + "loss/crossentropy": 2.174665331840515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2028878778219223, + "step": 25478 + }, + { + "epoch": 0.5096, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0035764058430989582, + "learning_rate": 0.0001, + "loss": 3.9699, + "loss/crossentropy": 1.9720661640167236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19616466760635376, + "step": 25480 + }, + { + "epoch": 0.50964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0045562744140625, + "learning_rate": 0.0001, + "loss": 3.9704, + "loss/crossentropy": 2.2741299867630005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2136249840259552, + "step": 25482 + }, + { + "epoch": 0.50968, + "grad_norm": 1.875, + "grad_norm_var": 0.004620107014973959, + "learning_rate": 0.0001, + "loss": 3.9, + "loss/crossentropy": 1.8212140798568726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16650617122650146, + "step": 25484 + }, + { + "epoch": 0.50972, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072174072265625, + "learning_rate": 0.0001, + "loss": 3.9479, + "loss/crossentropy": 2.25445818901062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21185942739248276, + "step": 25486 + }, + { + "epoch": 0.50976, + "grad_norm": 1.890625, + "grad_norm_var": 0.0061838785807291664, + "learning_rate": 0.0001, + "loss": 4.273, + "loss/crossentropy": 2.198422074317932, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19693002849817276, + "step": 25488 + }, + { + "epoch": 0.5098, + "grad_norm": 2.03125, + "grad_norm_var": 0.007177480061848958, + "learning_rate": 0.0001, + "loss": 4.0788, + "loss/crossentropy": 2.0654536485671997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20444564521312714, + "step": 25490 + }, + { + "epoch": 0.50984, + "grad_norm": 1.84375, + "grad_norm_var": 0.007889811197916667, + "learning_rate": 0.0001, + "loss": 4.1771, + "loss/crossentropy": 1.9605732560157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1785297840833664, + "step": 25492 + }, + { + "epoch": 0.50988, + "grad_norm": 2.09375, + "grad_norm_var": 0.009968821207682292, + "learning_rate": 0.0001, + "loss": 4.2828, + "loss/crossentropy": 2.128088355064392, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063232958316803, + "step": 25494 + }, + { + "epoch": 0.50992, + "grad_norm": 1.921875, + "grad_norm_var": 0.011039225260416667, + "learning_rate": 0.0001, + "loss": 3.7328, + "loss/crossentropy": 1.9981979727745056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17392293363809586, + "step": 25496 + }, + { + "epoch": 0.50996, + "grad_norm": 2.140625, + "grad_norm_var": 0.014243316650390626, + "learning_rate": 0.0001, + "loss": 4.3475, + "loss/crossentropy": 2.4065998792648315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20214569568634033, + "step": 25498 + }, + { + "epoch": 0.51, + "grad_norm": 1.8515625, + "grad_norm_var": 0.01361083984375, + "learning_rate": 0.0001, + "loss": 3.9476, + "loss/crossentropy": 2.154718041419983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19910430163145065, + "step": 25500 + }, + { + "epoch": 0.51004, + "grad_norm": 2.03125, + "grad_norm_var": 0.011042277018229166, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 2.22087424993515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20276758074760437, + "step": 25502 + }, + { + "epoch": 0.51008, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014336903889973959, + "learning_rate": 0.0001, + "loss": 3.6317, + "loss/crossentropy": 1.9433262944221497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1835595816373825, + "step": 25504 + }, + { + "epoch": 0.51012, + "grad_norm": 1.96875, + "grad_norm_var": 0.014625803629557291, + "learning_rate": 0.0001, + "loss": 3.8672, + "loss/crossentropy": 1.814663290977478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1932072937488556, + "step": 25506 + }, + { + "epoch": 0.51016, + "grad_norm": 2.078125, + "grad_norm_var": 0.0143463134765625, + "learning_rate": 0.0001, + "loss": 3.9603, + "loss/crossentropy": 1.8567258715629578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20248033851385117, + "step": 25508 + }, + { + "epoch": 0.5102, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013232167561848958, + "learning_rate": 0.0001, + "loss": 4.1419, + "loss/crossentropy": 2.056326985359192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645985960960388, + "step": 25510 + }, + { + "epoch": 0.51024, + "grad_norm": 1.9375, + "grad_norm_var": 0.010884348551432292, + "learning_rate": 0.0001, + "loss": 4.2054, + "loss/crossentropy": 2.068827450275421, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381123036146164, + "step": 25512 + }, + { + "epoch": 0.51028, + "grad_norm": 1.90625, + "grad_norm_var": 0.008898671468098958, + "learning_rate": 0.0001, + "loss": 3.9707, + "loss/crossentropy": 2.4085018634796143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20001942664384842, + "step": 25514 + }, + { + "epoch": 0.51032, + "grad_norm": 1.921875, + "grad_norm_var": 0.008056386311848959, + "learning_rate": 0.0001, + "loss": 4.0348, + "loss/crossentropy": 1.9481959342956543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17734962701797485, + "step": 25516 + }, + { + "epoch": 0.51036, + "grad_norm": 1.7265625, + "grad_norm_var": 0.011722819010416666, + "learning_rate": 0.0001, + "loss": 3.6203, + "loss/crossentropy": 1.9250916838645935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881168857216835, + "step": 25518 + }, + { + "epoch": 0.5104, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0101715087890625, + "learning_rate": 0.0001, + "loss": 4.2853, + "loss/crossentropy": 2.4334945678710938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22089337557554245, + "step": 25520 + }, + { + "epoch": 0.51044, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012629954020182292, + "learning_rate": 0.0001, + "loss": 3.9618, + "loss/crossentropy": 1.8872219324111938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17582690715789795, + "step": 25522 + }, + { + "epoch": 0.51048, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011156209309895833, + "learning_rate": 0.0001, + "loss": 4.0339, + "loss/crossentropy": 2.0595308542251587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20405854284763336, + "step": 25524 + }, + { + "epoch": 0.51052, + "grad_norm": 2.015625, + "grad_norm_var": 0.012819163004557292, + "learning_rate": 0.0001, + "loss": 4.0668, + "loss/crossentropy": 1.7874937653541565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21105076372623444, + "step": 25526 + }, + { + "epoch": 0.51056, + "grad_norm": 2.03125, + "grad_norm_var": 0.014076487223307291, + "learning_rate": 0.0001, + "loss": 3.8664, + "loss/crossentropy": 1.866211712360382, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18459182977676392, + "step": 25528 + }, + { + "epoch": 0.5106, + "grad_norm": 2.015625, + "grad_norm_var": 0.015641021728515624, + "learning_rate": 0.0001, + "loss": 4.0479, + "loss/crossentropy": 1.8424429893493652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17495467513799667, + "step": 25530 + }, + { + "epoch": 0.51064, + "grad_norm": 2.046875, + "grad_norm_var": 0.015632120768229167, + "learning_rate": 0.0001, + "loss": 4.1848, + "loss/crossentropy": 2.3827977180480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2094314694404602, + "step": 25532 + }, + { + "epoch": 0.51068, + "grad_norm": 1.8359375, + "grad_norm_var": 0.00950927734375, + "learning_rate": 0.0001, + "loss": 3.9321, + "loss/crossentropy": 2.0699292421340942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574011653661728, + "step": 25534 + }, + { + "epoch": 0.51072, + "grad_norm": 2.015625, + "grad_norm_var": 0.0102447509765625, + "learning_rate": 0.0001, + "loss": 3.7784, + "loss/crossentropy": 2.058907687664032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1945437341928482, + "step": 25536 + }, + { + "epoch": 0.51076, + "grad_norm": 1.90625, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 4.0229, + "loss/crossentropy": 2.138782501220703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19527214765548706, + "step": 25538 + }, + { + "epoch": 0.5108, + "grad_norm": 1.890625, + "grad_norm_var": 0.00936279296875, + "learning_rate": 0.0001, + "loss": 3.9339, + "loss/crossentropy": 1.9123243689537048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18704041838645935, + "step": 25540 + }, + { + "epoch": 0.51084, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008056640625, + "learning_rate": 0.0001, + "loss": 4.2846, + "loss/crossentropy": 2.5264101028442383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23050238192081451, + "step": 25542 + }, + { + "epoch": 0.51088, + "grad_norm": 1.875, + "grad_norm_var": 0.005826822916666667, + "learning_rate": 0.0001, + "loss": 4.0812, + "loss/crossentropy": 1.9210030436515808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19104420393705368, + "step": 25544 + }, + { + "epoch": 0.51092, + "grad_norm": 1.7734375, + "grad_norm_var": 0.006809234619140625, + "learning_rate": 0.0001, + "loss": 3.6386, + "loss/crossentropy": 1.7989648580551147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17732901126146317, + "step": 25546 + }, + { + "epoch": 0.51096, + "grad_norm": 1.9375, + "grad_norm_var": 0.006086985270182292, + "learning_rate": 0.0001, + "loss": 4.0115, + "loss/crossentropy": 1.7673185467720032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19276081770658493, + "step": 25548 + }, + { + "epoch": 0.511, + "grad_norm": 1.8359375, + "grad_norm_var": 0.005586496988932292, + "learning_rate": 0.0001, + "loss": 3.9068, + "loss/crossentropy": 2.451088309288025, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21039804071187973, + "step": 25550 + }, + { + "epoch": 0.51104, + "grad_norm": 1.8671875, + "grad_norm_var": 0.004813385009765625, + "learning_rate": 0.0001, + "loss": 3.9767, + "loss/crossentropy": 2.0937219858169556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21383897960186005, + "step": 25552 + }, + { + "epoch": 0.51108, + "grad_norm": 1.7890625, + "grad_norm_var": 0.007010650634765625, + "learning_rate": 0.0001, + "loss": 3.8091, + "loss/crossentropy": 1.7688068747520447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17263571172952652, + "step": 25554 + }, + { + "epoch": 0.51112, + "grad_norm": 2.046875, + "grad_norm_var": 0.008137003580729166, + "learning_rate": 0.0001, + "loss": 4.1427, + "loss/crossentropy": 2.165677785873413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19934290647506714, + "step": 25556 + }, + { + "epoch": 0.51116, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009197743733723958, + "learning_rate": 0.0001, + "loss": 3.8542, + "loss/crossentropy": 1.7860172986984253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18735045939683914, + "step": 25558 + }, + { + "epoch": 0.5112, + "grad_norm": 2.15625, + "grad_norm_var": 0.012165323893229166, + "learning_rate": 0.0001, + "loss": 4.0364, + "loss/crossentropy": 2.2124353647232056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27785858511924744, + "step": 25560 + }, + { + "epoch": 0.51124, + "grad_norm": 2.265625, + "grad_norm_var": 0.01620457967122396, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 1.8342975974082947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1646983027458191, + "step": 25562 + }, + { + "epoch": 0.51128, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01895319620768229, + "learning_rate": 0.0001, + "loss": 3.7718, + "loss/crossentropy": 2.0271248817443848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18964418768882751, + "step": 25564 + }, + { + "epoch": 0.51132, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01892267862955729, + "learning_rate": 0.0001, + "loss": 3.8739, + "loss/crossentropy": 2.1983633041381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2039782777428627, + "step": 25566 + }, + { + "epoch": 0.51136, + "grad_norm": 1.8125, + "grad_norm_var": 0.01982421875, + "learning_rate": 0.0001, + "loss": 3.9194, + "loss/crossentropy": 1.804263949394226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17312633991241455, + "step": 25568 + }, + { + "epoch": 0.5114, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019624837239583335, + "learning_rate": 0.0001, + "loss": 4.2465, + "loss/crossentropy": 1.7486448287963867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16695599257946014, + "step": 25570 + }, + { + "epoch": 0.51144, + "grad_norm": 1.765625, + "grad_norm_var": 0.02077204386393229, + "learning_rate": 0.0001, + "loss": 3.7398, + "loss/crossentropy": 1.9466286301612854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805291324853897, + "step": 25572 + }, + { + "epoch": 0.51148, + "grad_norm": 1.9921875, + "grad_norm_var": 0.020703125, + "learning_rate": 0.0001, + "loss": 4.1362, + "loss/crossentropy": 2.203175187110901, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21553437411785126, + "step": 25574 + }, + { + "epoch": 0.51152, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01708958943684896, + "learning_rate": 0.0001, + "loss": 4.1116, + "loss/crossentropy": 1.8743736743927002, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245517253875732, + "step": 25576 + }, + { + "epoch": 0.51156, + "grad_norm": 1.90625, + "grad_norm_var": 0.008211008707682292, + "learning_rate": 0.0001, + "loss": 3.8857, + "loss/crossentropy": 1.866178572177887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18233061581850052, + "step": 25578 + }, + { + "epoch": 0.5116, + "grad_norm": 1.984375, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 3.8674, + "loss/crossentropy": 2.0799529552459717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19485916197299957, + "step": 25580 + }, + { + "epoch": 0.51164, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007222493489583333, + "learning_rate": 0.0001, + "loss": 3.9318, + "loss/crossentropy": 1.7319077253341675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16157583892345428, + "step": 25582 + }, + { + "epoch": 0.51168, + "grad_norm": 1.84375, + "grad_norm_var": 0.0073893229166666664, + "learning_rate": 0.0001, + "loss": 3.7334, + "loss/crossentropy": 2.0670501589775085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18853918462991714, + "step": 25584 + }, + { + "epoch": 0.51172, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004671223958333333, + "learning_rate": 0.0001, + "loss": 4.0737, + "loss/crossentropy": 2.0605591535568237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799873411655426, + "step": 25586 + }, + { + "epoch": 0.51176, + "grad_norm": 1.796875, + "grad_norm_var": 0.004288482666015625, + "learning_rate": 0.0001, + "loss": 3.9416, + "loss/crossentropy": 1.9844827055931091, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19314195960760117, + "step": 25588 + }, + { + "epoch": 0.5118, + "grad_norm": 1.84375, + "grad_norm_var": 0.0035906473795572916, + "learning_rate": 0.0001, + "loss": 3.9044, + "loss/crossentropy": 2.292548894882202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19697485864162445, + "step": 25590 + }, + { + "epoch": 0.51184, + "grad_norm": 1.953125, + "grad_norm_var": 0.004400380452473958, + "learning_rate": 0.0001, + "loss": 4.1964, + "loss/crossentropy": 2.472001791000366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994122326374054, + "step": 25592 + }, + { + "epoch": 0.51188, + "grad_norm": 2.25, + "grad_norm_var": 0.011283365885416667, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 1.3704800009727478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1426696479320526, + "step": 25594 + }, + { + "epoch": 0.51192, + "grad_norm": 1.875, + "grad_norm_var": 0.0113922119140625, + "learning_rate": 0.0001, + "loss": 3.9129, + "loss/crossentropy": 1.6338598132133484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765308916568756, + "step": 25596 + }, + { + "epoch": 0.51196, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011717732747395833, + "learning_rate": 0.0001, + "loss": 4.1234, + "loss/crossentropy": 1.8394725322723389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996341496706009, + "step": 25598 + }, + { + "epoch": 0.512, + "grad_norm": 1.7578125, + "grad_norm_var": 0.012443033854166667, + "learning_rate": 0.0001, + "loss": 3.9978, + "loss/crossentropy": 1.9061697125434875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18420706689357758, + "step": 25600 + }, + { + "epoch": 0.51204, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012426503499348958, + "learning_rate": 0.0001, + "loss": 4.0949, + "loss/crossentropy": 2.3584840297698975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21293433010578156, + "step": 25602 + }, + { + "epoch": 0.51208, + "grad_norm": 1.8984375, + "grad_norm_var": 0.012251536051432291, + "learning_rate": 0.0001, + "loss": 3.8624, + "loss/crossentropy": 1.923392653465271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18330930918455124, + "step": 25604 + }, + { + "epoch": 0.51212, + "grad_norm": 1.890625, + "grad_norm_var": 0.011934153238932292, + "learning_rate": 0.0001, + "loss": 4.027, + "loss/crossentropy": 2.2249737977981567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061079815030098, + "step": 25606 + }, + { + "epoch": 0.51216, + "grad_norm": 1.7578125, + "grad_norm_var": 0.013505045572916667, + "learning_rate": 0.0001, + "loss": 3.8274, + "loss/crossentropy": 1.7640107870101929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722632348537445, + "step": 25608 + }, + { + "epoch": 0.5122, + "grad_norm": 2.0, + "grad_norm_var": 0.006278483072916666, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 2.0654106736183167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959560066461563, + "step": 25610 + }, + { + "epoch": 0.51224, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006685384114583333, + "learning_rate": 0.0001, + "loss": 3.857, + "loss/crossentropy": 1.8277021646499634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1689283475279808, + "step": 25612 + }, + { + "epoch": 0.51228, + "grad_norm": 1.953125, + "grad_norm_var": 0.0066802978515625, + "learning_rate": 0.0001, + "loss": 3.9565, + "loss/crossentropy": 2.2842466831207275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116750180721283, + "step": 25614 + }, + { + "epoch": 0.51232, + "grad_norm": 2.34375, + "grad_norm_var": 0.01710205078125, + "learning_rate": 0.0001, + "loss": 3.7537, + "loss/crossentropy": 1.8136274218559265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18518541753292084, + "step": 25616 + }, + { + "epoch": 0.51236, + "grad_norm": 1.90625, + "grad_norm_var": 0.017277018229166666, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 2.0048798322677612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975068524479866, + "step": 25618 + }, + { + "epoch": 0.5124, + "grad_norm": 1.9375, + "grad_norm_var": 0.016745758056640626, + "learning_rate": 0.0001, + "loss": 3.7449, + "loss/crossentropy": 1.7246285676956177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17372817546129227, + "step": 25620 + }, + { + "epoch": 0.51244, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0177978515625, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.160382390022278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20257163047790527, + "step": 25622 + }, + { + "epoch": 0.51248, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015604400634765625, + "learning_rate": 0.0001, + "loss": 4.0976, + "loss/crossentropy": 2.1341055631637573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18860094994306564, + "step": 25624 + }, + { + "epoch": 0.51252, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01613337198893229, + "learning_rate": 0.0001, + "loss": 3.9408, + "loss/crossentropy": 2.1212183237075806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820258975028992, + "step": 25626 + }, + { + "epoch": 0.51256, + "grad_norm": 1.84375, + "grad_norm_var": 0.017045084635416666, + "learning_rate": 0.0001, + "loss": 3.88, + "loss/crossentropy": 2.0193417072296143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18116050213575363, + "step": 25628 + }, + { + "epoch": 0.5126, + "grad_norm": 1.8359375, + "grad_norm_var": 0.017472330729166666, + "learning_rate": 0.0001, + "loss": 4.0049, + "loss/crossentropy": 2.2681472301483154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21637655794620514, + "step": 25630 + }, + { + "epoch": 0.51264, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006444295247395833, + "learning_rate": 0.0001, + "loss": 4.1065, + "loss/crossentropy": 1.879043698310852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1792997345328331, + "step": 25632 + }, + { + "epoch": 0.51268, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005980428059895833, + "learning_rate": 0.0001, + "loss": 3.9104, + "loss/crossentropy": 1.8325838446617126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18299763649702072, + "step": 25634 + }, + { + "epoch": 0.51272, + "grad_norm": 1.859375, + "grad_norm_var": 0.006241607666015625, + "learning_rate": 0.0001, + "loss": 3.9663, + "loss/crossentropy": 2.1776668429374695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21143290400505066, + "step": 25636 + }, + { + "epoch": 0.51276, + "grad_norm": 1.8515625, + "grad_norm_var": 0.003932444254557291, + "learning_rate": 0.0001, + "loss": 3.7604, + "loss/crossentropy": 2.010956645011902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18642369657754898, + "step": 25638 + }, + { + "epoch": 0.5128, + "grad_norm": 1.796875, + "grad_norm_var": 0.0038401285807291665, + "learning_rate": 0.0001, + "loss": 3.9791, + "loss/crossentropy": 2.3246684074401855, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20379284024238586, + "step": 25640 + }, + { + "epoch": 0.51284, + "grad_norm": 2.140625, + "grad_norm_var": 0.008740234375, + "learning_rate": 0.0001, + "loss": 3.9221, + "loss/crossentropy": 2.1043150424957275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939583420753479, + "step": 25642 + }, + { + "epoch": 0.51288, + "grad_norm": 1.8125, + "grad_norm_var": 0.008308664957682291, + "learning_rate": 0.0001, + "loss": 3.8916, + "loss/crossentropy": 1.947311520576477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19082298129796982, + "step": 25644 + }, + { + "epoch": 0.51292, + "grad_norm": 1.84375, + "grad_norm_var": 0.008581288655598958, + "learning_rate": 0.0001, + "loss": 3.991, + "loss/crossentropy": 2.189740777015686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475838541984558, + "step": 25646 + }, + { + "epoch": 0.51296, + "grad_norm": 1.875, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 4.0033, + "loss/crossentropy": 1.6541009545326233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16929760575294495, + "step": 25648 + }, + { + "epoch": 0.513, + "grad_norm": 2.125, + "grad_norm_var": 0.011130523681640626, + "learning_rate": 0.0001, + "loss": 4.1202, + "loss/crossentropy": 2.102361261844635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20186372101306915, + "step": 25650 + }, + { + "epoch": 0.51304, + "grad_norm": 2.109375, + "grad_norm_var": 0.016169230143229168, + "learning_rate": 0.0001, + "loss": 4.3193, + "loss/crossentropy": 2.1970421075820923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2268364280462265, + "step": 25652 + }, + { + "epoch": 0.51308, + "grad_norm": 1.921875, + "grad_norm_var": 0.01594823201497396, + "learning_rate": 0.0001, + "loss": 4.0125, + "loss/crossentropy": 1.9391701817512512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19213733822107315, + "step": 25654 + }, + { + "epoch": 0.51312, + "grad_norm": 1.828125, + "grad_norm_var": 0.015453084309895834, + "learning_rate": 0.0001, + "loss": 4.1499, + "loss/crossentropy": 1.9313429594039917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.184333935379982, + "step": 25656 + }, + { + "epoch": 0.51316, + "grad_norm": 2.046875, + "grad_norm_var": 0.03003107706705729, + "learning_rate": 0.0001, + "loss": 4.0926, + "loss/crossentropy": 2.221387028694153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127813696861267, + "step": 25658 + }, + { + "epoch": 0.5132, + "grad_norm": 2.21875, + "grad_norm_var": 0.0320220947265625, + "learning_rate": 0.0001, + "loss": 4.2681, + "loss/crossentropy": 2.0020928978919983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1746811792254448, + "step": 25660 + }, + { + "epoch": 0.51324, + "grad_norm": 3.421875, + "grad_norm_var": 0.15138320922851561, + "learning_rate": 0.0001, + "loss": 3.8316, + "loss/crossentropy": 1.960661768913269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19446442276239395, + "step": 25662 + }, + { + "epoch": 0.51328, + "grad_norm": 2.125, + "grad_norm_var": 0.1433428446451823, + "learning_rate": 0.0001, + "loss": 3.7593, + "loss/crossentropy": 1.6914892792701721, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17224331945180893, + "step": 25664 + }, + { + "epoch": 0.51332, + "grad_norm": 1.78125, + "grad_norm_var": 0.14908447265625, + "learning_rate": 0.0001, + "loss": 3.7995, + "loss/crossentropy": 1.8427329063415527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18707136064767838, + "step": 25666 + }, + { + "epoch": 0.51336, + "grad_norm": 2.453125, + "grad_norm_var": 0.1630938212076823, + "learning_rate": 0.0001, + "loss": 3.7455, + "loss/crossentropy": 1.7277203798294067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1778879463672638, + "step": 25668 + }, + { + "epoch": 0.5134, + "grad_norm": 1.9765625, + "grad_norm_var": 0.16569417317708332, + "learning_rate": 0.0001, + "loss": 3.817, + "loss/crossentropy": 1.9606621861457825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271928071975708, + "step": 25670 + }, + { + "epoch": 0.51344, + "grad_norm": 2.171875, + "grad_norm_var": 0.1670000712076823, + "learning_rate": 0.0001, + "loss": 3.7844, + "loss/crossentropy": 2.004323959350586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18009979277849197, + "step": 25672 + }, + { + "epoch": 0.51348, + "grad_norm": 2.59375, + "grad_norm_var": 0.1731035868326823, + "learning_rate": 0.0001, + "loss": 4.1429, + "loss/crossentropy": 2.0127062797546387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20746992528438568, + "step": 25674 + }, + { + "epoch": 0.51352, + "grad_norm": 1.828125, + "grad_norm_var": 0.181298828125, + "learning_rate": 0.0001, + "loss": 3.9813, + "loss/crossentropy": 2.240972399711609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19375959038734436, + "step": 25676 + }, + { + "epoch": 0.51356, + "grad_norm": 1.9921875, + "grad_norm_var": 0.06014404296875, + "learning_rate": 0.0001, + "loss": 4.1652, + "loss/crossentropy": 2.1942732334136963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21413519978523254, + "step": 25678 + }, + { + "epoch": 0.5136, + "grad_norm": 1.8125, + "grad_norm_var": 0.061522420247395834, + "learning_rate": 0.0001, + "loss": 4.0474, + "loss/crossentropy": 2.0388938188552856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887573003768921, + "step": 25680 + }, + { + "epoch": 0.51364, + "grad_norm": 1.8515625, + "grad_norm_var": 0.059458160400390626, + "learning_rate": 0.0001, + "loss": 4.0529, + "loss/crossentropy": 2.147862672805786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947481632232666, + "step": 25682 + }, + { + "epoch": 0.51368, + "grad_norm": 1.9140625, + "grad_norm_var": 0.052308909098307294, + "learning_rate": 0.0001, + "loss": 4.0428, + "loss/crossentropy": 1.7891934514045715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200366035103798, + "step": 25684 + }, + { + "epoch": 0.51372, + "grad_norm": 1.9921875, + "grad_norm_var": 0.047973378499348955, + "learning_rate": 0.0001, + "loss": 4.1498, + "loss/crossentropy": 2.0402196645736694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20540674775838852, + "step": 25686 + }, + { + "epoch": 0.51376, + "grad_norm": 1.8046875, + "grad_norm_var": 0.044535064697265626, + "learning_rate": 0.0001, + "loss": 3.809, + "loss/crossentropy": 1.9355202913284302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19018185883760452, + "step": 25688 + }, + { + "epoch": 0.5138, + "grad_norm": 1.765625, + "grad_norm_var": 0.021648915608723958, + "learning_rate": 0.0001, + "loss": 3.8918, + "loss/crossentropy": 2.140886068344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20067927986383438, + "step": 25690 + }, + { + "epoch": 0.51384, + "grad_norm": 2.03125, + "grad_norm_var": 0.021149698893229166, + "learning_rate": 0.0001, + "loss": 3.7235, + "loss/crossentropy": 1.9624481201171875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.191593699157238, + "step": 25692 + }, + { + "epoch": 0.51388, + "grad_norm": 1.9921875, + "grad_norm_var": 0.021834055582682293, + "learning_rate": 0.0001, + "loss": 4.1363, + "loss/crossentropy": 2.2475873231887817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987176612019539, + "step": 25694 + }, + { + "epoch": 0.51392, + "grad_norm": 2.078125, + "grad_norm_var": 0.020857747395833334, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.336251735687256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216811552643776, + "step": 25696 + }, + { + "epoch": 0.51396, + "grad_norm": 2.015625, + "grad_norm_var": 0.019893391927083334, + "learning_rate": 0.0001, + "loss": 3.9574, + "loss/crossentropy": 1.9058191776275635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20057585835456848, + "step": 25698 + }, + { + "epoch": 0.514, + "grad_norm": 1.828125, + "grad_norm_var": 0.009749094645182291, + "learning_rate": 0.0001, + "loss": 3.8912, + "loss/crossentropy": 1.8164226412773132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772766411304474, + "step": 25700 + }, + { + "epoch": 0.51404, + "grad_norm": 2.078125, + "grad_norm_var": 0.010578409830729166, + "learning_rate": 0.0001, + "loss": 4.1467, + "loss/crossentropy": 1.9770846962928772, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19005891680717468, + "step": 25702 + }, + { + "epoch": 0.51408, + "grad_norm": 3.5625, + "grad_norm_var": 0.1743242899576823, + "learning_rate": 0.0001, + "loss": 3.859, + "loss/crossentropy": 1.9907479286193848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19942636042833328, + "step": 25704 + }, + { + "epoch": 0.51412, + "grad_norm": 1.8125, + "grad_norm_var": 0.17293472290039064, + "learning_rate": 0.0001, + "loss": 3.7331, + "loss/crossentropy": 1.803969383239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17413488030433655, + "step": 25706 + }, + { + "epoch": 0.51416, + "grad_norm": 1.84375, + "grad_norm_var": 0.1740386962890625, + "learning_rate": 0.0001, + "loss": 4.0005, + "loss/crossentropy": 1.9947617053985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18278180807828903, + "step": 25708 + }, + { + "epoch": 0.5142, + "grad_norm": 2.0625, + "grad_norm_var": 0.17277399698893228, + "learning_rate": 0.0001, + "loss": 3.9845, + "loss/crossentropy": 2.102198004722595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19085385650396347, + "step": 25710 + }, + { + "epoch": 0.51424, + "grad_norm": 1.90625, + "grad_norm_var": 0.1716631571451823, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.087872624397278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18291640281677246, + "step": 25712 + }, + { + "epoch": 0.51428, + "grad_norm": 1.984375, + "grad_norm_var": 0.17193094889322916, + "learning_rate": 0.0001, + "loss": 4.1705, + "loss/crossentropy": 2.1515949964523315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19207611680030823, + "step": 25714 + }, + { + "epoch": 0.51432, + "grad_norm": 1.984375, + "grad_norm_var": 0.1703814188639323, + "learning_rate": 0.0001, + "loss": 4.1645, + "loss/crossentropy": 2.2736594676971436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21448197960853577, + "step": 25716 + }, + { + "epoch": 0.51436, + "grad_norm": 2.0, + "grad_norm_var": 0.16901219685872396, + "learning_rate": 0.0001, + "loss": 4.3332, + "loss/crossentropy": 1.8625124096870422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1784079223871231, + "step": 25718 + }, + { + "epoch": 0.5144, + "grad_norm": 1.7890625, + "grad_norm_var": 0.005600738525390625, + "learning_rate": 0.0001, + "loss": 3.9551, + "loss/crossentropy": 2.2445143461227417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20068903267383575, + "step": 25720 + }, + { + "epoch": 0.51444, + "grad_norm": 1.8125, + "grad_norm_var": 0.005924224853515625, + "learning_rate": 0.0001, + "loss": 3.7845, + "loss/crossentropy": 1.7019882202148438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17819605767726898, + "step": 25722 + }, + { + "epoch": 0.51448, + "grad_norm": 2.34375, + "grad_norm_var": 0.016523996988932293, + "learning_rate": 0.0001, + "loss": 3.8062, + "loss/crossentropy": 2.077397406101227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20366767793893814, + "step": 25724 + }, + { + "epoch": 0.51452, + "grad_norm": 1.96875, + "grad_norm_var": 0.0159423828125, + "learning_rate": 0.0001, + "loss": 4.2385, + "loss/crossentropy": 2.1213592290878296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402803480625153, + "step": 25726 + }, + { + "epoch": 0.51456, + "grad_norm": 1.8828125, + "grad_norm_var": 0.016110992431640624, + "learning_rate": 0.0001, + "loss": 3.6229, + "loss/crossentropy": 1.7625560760498047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17147068679332733, + "step": 25728 + }, + { + "epoch": 0.5146, + "grad_norm": 1.7734375, + "grad_norm_var": 0.02178955078125, + "learning_rate": 0.0001, + "loss": 3.8784, + "loss/crossentropy": 2.108244776725769, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20061994343996048, + "step": 25730 + }, + { + "epoch": 0.51464, + "grad_norm": 1.84375, + "grad_norm_var": 0.023787180582682293, + "learning_rate": 0.0001, + "loss": 3.7433, + "loss/crossentropy": 2.0263237953186035, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17062099277973175, + "step": 25732 + }, + { + "epoch": 0.51468, + "grad_norm": 1.921875, + "grad_norm_var": 0.023337554931640626, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 2.3399864435195923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22683630138635635, + "step": 25734 + }, + { + "epoch": 0.51472, + "grad_norm": 2.109375, + "grad_norm_var": 0.024761708577473958, + "learning_rate": 0.0001, + "loss": 3.9724, + "loss/crossentropy": 2.179167151451111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2077057957649231, + "step": 25736 + }, + { + "epoch": 0.51476, + "grad_norm": 1.8984375, + "grad_norm_var": 0.027428944905598957, + "learning_rate": 0.0001, + "loss": 3.5788, + "loss/crossentropy": 1.7649011611938477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765354365110397, + "step": 25738 + }, + { + "epoch": 0.5148, + "grad_norm": 1.9140625, + "grad_norm_var": 0.03184789021809896, + "learning_rate": 0.0001, + "loss": 4.0858, + "loss/crossentropy": 2.01451712846756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20021513849496841, + "step": 25740 + }, + { + "epoch": 0.51484, + "grad_norm": 2.078125, + "grad_norm_var": 0.03407567342122396, + "learning_rate": 0.0001, + "loss": 3.8967, + "loss/crossentropy": 2.043434739112854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18407747894525528, + "step": 25742 + }, + { + "epoch": 0.51488, + "grad_norm": 1.9375, + "grad_norm_var": 0.0338531494140625, + "learning_rate": 0.0001, + "loss": 3.9306, + "loss/crossentropy": 1.7236329913139343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16566471755504608, + "step": 25744 + }, + { + "epoch": 0.51492, + "grad_norm": 1.9375, + "grad_norm_var": 0.028107706705729166, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 1.9381142258644104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18159441649913788, + "step": 25746 + }, + { + "epoch": 0.51496, + "grad_norm": 1.96875, + "grad_norm_var": 0.025341542561848958, + "learning_rate": 0.0001, + "loss": 4.1442, + "loss/crossentropy": 2.262045383453369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20239067822694778, + "step": 25748 + }, + { + "epoch": 0.515, + "grad_norm": 2.0, + "grad_norm_var": 0.026682281494140626, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 1.980510652065277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910887509584427, + "step": 25750 + }, + { + "epoch": 0.51504, + "grad_norm": 1.9765625, + "grad_norm_var": 0.024494425455729166, + "learning_rate": 0.0001, + "loss": 3.9608, + "loss/crossentropy": 2.1935293674468994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19832123070955276, + "step": 25752 + }, + { + "epoch": 0.51508, + "grad_norm": 1.890625, + "grad_norm_var": 0.023835245768229166, + "learning_rate": 0.0001, + "loss": 4.1082, + "loss/crossentropy": 1.912952721118927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726488322019577, + "step": 25754 + }, + { + "epoch": 0.51512, + "grad_norm": 1.734375, + "grad_norm_var": 0.0124664306640625, + "learning_rate": 0.0001, + "loss": 3.6737, + "loss/crossentropy": 1.7230817675590515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16161160171031952, + "step": 25756 + }, + { + "epoch": 0.51516, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 3.8654, + "loss/crossentropy": 1.7760237455368042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1803196743130684, + "step": 25758 + }, + { + "epoch": 0.5152, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011126454671223958, + "learning_rate": 0.0001, + "loss": 4.0545, + "loss/crossentropy": 2.3100990056991577, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2151026725769043, + "step": 25760 + }, + { + "epoch": 0.51524, + "grad_norm": 1.890625, + "grad_norm_var": 0.010957845052083333, + "learning_rate": 0.0001, + "loss": 4.2432, + "loss/crossentropy": 2.1021112203598022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18987688422203064, + "step": 25762 + }, + { + "epoch": 0.51528, + "grad_norm": 1.890625, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 3.798, + "loss/crossentropy": 1.9850506782531738, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19890005141496658, + "step": 25764 + }, + { + "epoch": 0.51532, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010253651936848959, + "learning_rate": 0.0001, + "loss": 4.2499, + "loss/crossentropy": 2.37569797039032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20863597840070724, + "step": 25766 + }, + { + "epoch": 0.51536, + "grad_norm": 2.015625, + "grad_norm_var": 0.011994425455729167, + "learning_rate": 0.0001, + "loss": 4.202, + "loss/crossentropy": 2.3777748346328735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22870129346847534, + "step": 25768 + }, + { + "epoch": 0.5154, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009114329020182292, + "learning_rate": 0.0001, + "loss": 4.2291, + "loss/crossentropy": 2.071221649646759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19465549290180206, + "step": 25770 + }, + { + "epoch": 0.51544, + "grad_norm": 1.7265625, + "grad_norm_var": 0.009650675455729167, + "learning_rate": 0.0001, + "loss": 3.8449, + "loss/crossentropy": 2.2113492488861084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19282811880111694, + "step": 25772 + }, + { + "epoch": 0.51548, + "grad_norm": 2.28125, + "grad_norm_var": 0.0171539306640625, + "learning_rate": 0.0001, + "loss": 4.0252, + "loss/crossentropy": 1.8800004124641418, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772722378373146, + "step": 25774 + }, + { + "epoch": 0.51552, + "grad_norm": 2.015625, + "grad_norm_var": 0.01837946573893229, + "learning_rate": 0.0001, + "loss": 3.8032, + "loss/crossentropy": 1.9569828510284424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836630329489708, + "step": 25776 + }, + { + "epoch": 0.51556, + "grad_norm": 2.125, + "grad_norm_var": 0.020114898681640625, + "learning_rate": 0.0001, + "loss": 4.2328, + "loss/crossentropy": 2.035541355609894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203660637140274, + "step": 25778 + }, + { + "epoch": 0.5156, + "grad_norm": 1.84375, + "grad_norm_var": 0.0198638916015625, + "learning_rate": 0.0001, + "loss": 3.9283, + "loss/crossentropy": 2.2393122911453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18675968796014786, + "step": 25780 + }, + { + "epoch": 0.51564, + "grad_norm": 1.9140625, + "grad_norm_var": 0.019632975260416668, + "learning_rate": 0.0001, + "loss": 4.0009, + "loss/crossentropy": 2.0540746450424194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20924384146928787, + "step": 25782 + }, + { + "epoch": 0.51568, + "grad_norm": 2.015625, + "grad_norm_var": 0.01869481404622396, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 2.168312907218933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21717043220996857, + "step": 25784 + }, + { + "epoch": 0.51572, + "grad_norm": 1.96875, + "grad_norm_var": 0.017252349853515626, + "learning_rate": 0.0001, + "loss": 3.8948, + "loss/crossentropy": 1.9122300744056702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18530257046222687, + "step": 25786 + }, + { + "epoch": 0.51576, + "grad_norm": 2.046875, + "grad_norm_var": 0.018314615885416666, + "learning_rate": 0.0001, + "loss": 3.7462, + "loss/crossentropy": 1.813349425792694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17834128439426422, + "step": 25788 + }, + { + "epoch": 0.5158, + "grad_norm": 2.046875, + "grad_norm_var": 0.011604563395182291, + "learning_rate": 0.0001, + "loss": 4.197, + "loss/crossentropy": 2.217802047729492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857040449976921, + "step": 25790 + }, + { + "epoch": 0.51584, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011132558186848959, + "learning_rate": 0.0001, + "loss": 3.8139, + "loss/crossentropy": 2.0986403822898865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917191743850708, + "step": 25792 + }, + { + "epoch": 0.51588, + "grad_norm": 2.125, + "grad_norm_var": 0.011250813802083334, + "learning_rate": 0.0001, + "loss": 4.1166, + "loss/crossentropy": 1.7962473034858704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1671363189816475, + "step": 25794 + }, + { + "epoch": 0.51592, + "grad_norm": 1.84375, + "grad_norm_var": 0.012215169270833333, + "learning_rate": 0.0001, + "loss": 3.7758, + "loss/crossentropy": 2.001879632472992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19479414075613022, + "step": 25796 + }, + { + "epoch": 0.51596, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012018839518229166, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 2.228678584098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20541846752166748, + "step": 25798 + }, + { + "epoch": 0.516, + "grad_norm": 2.109375, + "grad_norm_var": 0.013679758707682291, + "learning_rate": 0.0001, + "loss": 4.1654, + "loss/crossentropy": 2.283624291419983, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23651224374771118, + "step": 25800 + }, + { + "epoch": 0.51604, + "grad_norm": 2.078125, + "grad_norm_var": 0.015006510416666667, + "learning_rate": 0.0001, + "loss": 3.9617, + "loss/crossentropy": 1.9860345125198364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878475695848465, + "step": 25802 + }, + { + "epoch": 0.51608, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01031494140625, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 2.1123459935188293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22042006999254227, + "step": 25804 + }, + { + "epoch": 0.51612, + "grad_norm": 2.03125, + "grad_norm_var": 0.009886678059895833, + "learning_rate": 0.0001, + "loss": 3.9532, + "loss/crossentropy": 2.092814803123474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060900628566742, + "step": 25806 + }, + { + "epoch": 0.51616, + "grad_norm": 1.90625, + "grad_norm_var": 0.0083404541015625, + "learning_rate": 0.0001, + "loss": 3.8515, + "loss/crossentropy": 1.9071126580238342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18779786676168442, + "step": 25808 + }, + { + "epoch": 0.5162, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006819661458333333, + "learning_rate": 0.0001, + "loss": 3.9823, + "loss/crossentropy": 1.9593830704689026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17265263199806213, + "step": 25810 + }, + { + "epoch": 0.51624, + "grad_norm": 1.9140625, + "grad_norm_var": 0.00438232421875, + "learning_rate": 0.0001, + "loss": 4.0247, + "loss/crossentropy": 1.8603730201721191, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18761274218559265, + "step": 25812 + }, + { + "epoch": 0.51628, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007425689697265625, + "learning_rate": 0.0001, + "loss": 3.7386, + "loss/crossentropy": 1.8858801126480103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186118483543396, + "step": 25814 + }, + { + "epoch": 0.51632, + "grad_norm": 2.796875, + "grad_norm_var": 0.051513671875, + "learning_rate": 0.0001, + "loss": 4.0368, + "loss/crossentropy": 2.1853126287460327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23933759331703186, + "step": 25816 + }, + { + "epoch": 0.51636, + "grad_norm": 1.9453125, + "grad_norm_var": 0.05208104451497396, + "learning_rate": 0.0001, + "loss": 4.123, + "loss/crossentropy": 2.0599172115325928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19681133329868317, + "step": 25818 + }, + { + "epoch": 0.5164, + "grad_norm": 2.234375, + "grad_norm_var": 0.055663045247395834, + "learning_rate": 0.0001, + "loss": 4.2483, + "loss/crossentropy": 2.371991515159607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135465070605278, + "step": 25820 + }, + { + "epoch": 0.51644, + "grad_norm": 1.9921875, + "grad_norm_var": 0.05572077433268229, + "learning_rate": 0.0001, + "loss": 4.2967, + "loss/crossentropy": 2.1175169944763184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994188129901886, + "step": 25822 + }, + { + "epoch": 0.51648, + "grad_norm": 1.8515625, + "grad_norm_var": 0.059357706705729166, + "learning_rate": 0.0001, + "loss": 3.735, + "loss/crossentropy": 2.074811100959778, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20690523087978363, + "step": 25824 + }, + { + "epoch": 0.51652, + "grad_norm": 1.8984375, + "grad_norm_var": 0.05923258463541667, + "learning_rate": 0.0001, + "loss": 3.9436, + "loss/crossentropy": 2.041485607624054, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222356379032135, + "step": 25826 + }, + { + "epoch": 0.51656, + "grad_norm": 1.84375, + "grad_norm_var": 0.060212961832682294, + "learning_rate": 0.0001, + "loss": 3.9312, + "loss/crossentropy": 1.9386745691299438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20328567177057266, + "step": 25828 + }, + { + "epoch": 0.5166, + "grad_norm": 1.9765625, + "grad_norm_var": 0.055692291259765624, + "learning_rate": 0.0001, + "loss": 4.1459, + "loss/crossentropy": 1.8701601028442383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17632923275232315, + "step": 25830 + }, + { + "epoch": 0.51664, + "grad_norm": 2.125, + "grad_norm_var": 0.012333170572916666, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 2.258900284767151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2108033150434494, + "step": 25832 + }, + { + "epoch": 0.51668, + "grad_norm": 2.03125, + "grad_norm_var": 0.016806793212890626, + "learning_rate": 0.0001, + "loss": 4.4649, + "loss/crossentropy": 2.10329806804657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19907134026288986, + "step": 25834 + }, + { + "epoch": 0.51672, + "grad_norm": 2.109375, + "grad_norm_var": 0.014208984375, + "learning_rate": 0.0001, + "loss": 3.9527, + "loss/crossentropy": 1.692655324935913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17038661241531372, + "step": 25836 + }, + { + "epoch": 0.51676, + "grad_norm": 2.109375, + "grad_norm_var": 0.01628392537434896, + "learning_rate": 0.0001, + "loss": 4.122, + "loss/crossentropy": 2.0541951060295105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20199306309223175, + "step": 25838 + }, + { + "epoch": 0.5168, + "grad_norm": 2.171875, + "grad_norm_var": 0.014721425374348958, + "learning_rate": 0.0001, + "loss": 4.2206, + "loss/crossentropy": 2.0732097029685974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21288836747407913, + "step": 25840 + }, + { + "epoch": 0.51684, + "grad_norm": 2.015625, + "grad_norm_var": 0.013939412434895833, + "learning_rate": 0.0001, + "loss": 4.2017, + "loss/crossentropy": 2.282121777534485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21497543156147003, + "step": 25842 + }, + { + "epoch": 0.51688, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0136383056640625, + "learning_rate": 0.0001, + "loss": 3.978, + "loss/crossentropy": 1.8445320129394531, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17342890799045563, + "step": 25844 + }, + { + "epoch": 0.51692, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0162109375, + "learning_rate": 0.0001, + "loss": 3.9638, + "loss/crossentropy": 1.9874401092529297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18226970732212067, + "step": 25846 + }, + { + "epoch": 0.51696, + "grad_norm": 1.9296875, + "grad_norm_var": 0.015290323893229167, + "learning_rate": 0.0001, + "loss": 4.0861, + "loss/crossentropy": 2.0648937225341797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19047797471284866, + "step": 25848 + }, + { + "epoch": 0.517, + "grad_norm": 9.125, + "grad_norm_var": 3.220908355712891, + "learning_rate": 0.0001, + "loss": 4.2993, + "loss/crossentropy": 2.3565629720687866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20815564692020416, + "step": 25850 + }, + { + "epoch": 0.51704, + "grad_norm": 2.15625, + "grad_norm_var": 3.200935872395833, + "learning_rate": 0.0001, + "loss": 4.2516, + "loss/crossentropy": 1.9872604608535767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20761053264141083, + "step": 25852 + }, + { + "epoch": 0.51708, + "grad_norm": 1.921875, + "grad_norm_var": 3.206274159749349, + "learning_rate": 0.0001, + "loss": 3.8383, + "loss/crossentropy": 1.6661525964736938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16651833057403564, + "step": 25854 + }, + { + "epoch": 0.51712, + "grad_norm": 2.03125, + "grad_norm_var": 3.218633778889974, + "learning_rate": 0.0001, + "loss": 4.2274, + "loss/crossentropy": 2.043734908103943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980709508061409, + "step": 25856 + }, + { + "epoch": 0.51716, + "grad_norm": 1.921875, + "grad_norm_var": 3.2231727600097657, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 1.897942066192627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18658392876386642, + "step": 25858 + }, + { + "epoch": 0.5172, + "grad_norm": 1.9609375, + "grad_norm_var": 3.216633097330729, + "learning_rate": 0.0001, + "loss": 4.1149, + "loss/crossentropy": 2.1246655583381653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21123212575912476, + "step": 25860 + }, + { + "epoch": 0.51724, + "grad_norm": 2.03125, + "grad_norm_var": 3.201690419514974, + "learning_rate": 0.0001, + "loss": 4.0062, + "loss/crossentropy": 2.000667631626129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007341906428337, + "step": 25862 + }, + { + "epoch": 0.51728, + "grad_norm": 2.015625, + "grad_norm_var": 3.202071126302083, + "learning_rate": 0.0001, + "loss": 3.987, + "loss/crossentropy": 2.09254252910614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19557519257068634, + "step": 25864 + }, + { + "epoch": 0.51732, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012505849202473959, + "learning_rate": 0.0001, + "loss": 4.1372, + "loss/crossentropy": 2.0801188945770264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193548500537872, + "step": 25866 + }, + { + "epoch": 0.51736, + "grad_norm": 1.8359375, + "grad_norm_var": 0.004166412353515625, + "learning_rate": 0.0001, + "loss": 3.9061, + "loss/crossentropy": 2.0977261662483215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2003898024559021, + "step": 25868 + }, + { + "epoch": 0.5174, + "grad_norm": 1.953125, + "grad_norm_var": 0.005246734619140625, + "learning_rate": 0.0001, + "loss": 3.7979, + "loss/crossentropy": 1.5647385120391846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14602012932300568, + "step": 25870 + }, + { + "epoch": 0.51744, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0049631754557291664, + "learning_rate": 0.0001, + "loss": 3.8804, + "loss/crossentropy": 1.9746862649917603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19539950788021088, + "step": 25872 + }, + { + "epoch": 0.51748, + "grad_norm": 1.9375, + "grad_norm_var": 0.005721028645833333, + "learning_rate": 0.0001, + "loss": 4.143, + "loss/crossentropy": 2.08547705411911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1976427659392357, + "step": 25874 + }, + { + "epoch": 0.51752, + "grad_norm": 2.0625, + "grad_norm_var": 0.007647450764973958, + "learning_rate": 0.0001, + "loss": 4.017, + "loss/crossentropy": 2.040421783924103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20465777814388275, + "step": 25876 + }, + { + "epoch": 0.51756, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008272043863932292, + "learning_rate": 0.0001, + "loss": 3.7525, + "loss/crossentropy": 1.8793463706970215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17017312347888947, + "step": 25878 + }, + { + "epoch": 0.5176, + "grad_norm": 1.984375, + "grad_norm_var": 0.007903798421223959, + "learning_rate": 0.0001, + "loss": 4.0791, + "loss/crossentropy": 2.263822913169861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2253909707069397, + "step": 25880 + }, + { + "epoch": 0.51764, + "grad_norm": 2.34375, + "grad_norm_var": 0.01856689453125, + "learning_rate": 0.0001, + "loss": 4.2281, + "loss/crossentropy": 1.8976882100105286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1699293926358223, + "step": 25882 + }, + { + "epoch": 0.51768, + "grad_norm": 1.953125, + "grad_norm_var": 0.016641998291015626, + "learning_rate": 0.0001, + "loss": 4.0898, + "loss/crossentropy": 2.102656662464142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189642496407032, + "step": 25884 + }, + { + "epoch": 0.51772, + "grad_norm": 2.28125, + "grad_norm_var": 0.0206451416015625, + "learning_rate": 0.0001, + "loss": 4.0169, + "loss/crossentropy": 2.1792502403259277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2173495665192604, + "step": 25886 + }, + { + "epoch": 0.51776, + "grad_norm": 2.0625, + "grad_norm_var": 0.021286773681640624, + "learning_rate": 0.0001, + "loss": 4.2587, + "loss/crossentropy": 2.3405139446258545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20445296168327332, + "step": 25888 + }, + { + "epoch": 0.5178, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0225830078125, + "learning_rate": 0.0001, + "loss": 3.8529, + "loss/crossentropy": 2.116323173046112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.204654723405838, + "step": 25890 + }, + { + "epoch": 0.51784, + "grad_norm": 1.8515625, + "grad_norm_var": 0.021628570556640626, + "learning_rate": 0.0001, + "loss": 3.9595, + "loss/crossentropy": 1.9406288266181946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19154663383960724, + "step": 25892 + }, + { + "epoch": 0.51788, + "grad_norm": 2.484375, + "grad_norm_var": 0.0339263916015625, + "learning_rate": 0.0001, + "loss": 4.0066, + "loss/crossentropy": 2.008155882358551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18465732783079147, + "step": 25894 + }, + { + "epoch": 0.51792, + "grad_norm": 2.0625, + "grad_norm_var": 0.03385594685872396, + "learning_rate": 0.0001, + "loss": 4.2731, + "loss/crossentropy": 2.2472530007362366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22532539069652557, + "step": 25896 + }, + { + "epoch": 0.51796, + "grad_norm": 1.90625, + "grad_norm_var": 0.028238932291666668, + "learning_rate": 0.0001, + "loss": 3.9856, + "loss/crossentropy": 2.0776681900024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18579304963350296, + "step": 25898 + }, + { + "epoch": 0.518, + "grad_norm": 1.8984375, + "grad_norm_var": 0.029670206705729167, + "learning_rate": 0.0001, + "loss": 3.8088, + "loss/crossentropy": 2.053288221359253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885019615292549, + "step": 25900 + }, + { + "epoch": 0.51804, + "grad_norm": 1.84375, + "grad_norm_var": 0.02557551066080729, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 1.6223698258399963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21323902904987335, + "step": 25902 + }, + { + "epoch": 0.51808, + "grad_norm": 1.71875, + "grad_norm_var": 0.03012873331705729, + "learning_rate": 0.0001, + "loss": 3.8248, + "loss/crossentropy": 2.012439727783203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17938166856765747, + "step": 25904 + }, + { + "epoch": 0.51812, + "grad_norm": 1.8828125, + "grad_norm_var": 0.02967096964518229, + "learning_rate": 0.0001, + "loss": 4.0141, + "loss/crossentropy": 1.975797176361084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2021034210920334, + "step": 25906 + }, + { + "epoch": 0.51816, + "grad_norm": 2.15625, + "grad_norm_var": 0.03150126139322917, + "learning_rate": 0.0001, + "loss": 4.0165, + "loss/crossentropy": 2.1858623027801514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22130713611841202, + "step": 25908 + }, + { + "epoch": 0.5182, + "grad_norm": 1.84375, + "grad_norm_var": 0.0142486572265625, + "learning_rate": 0.0001, + "loss": 3.9008, + "loss/crossentropy": 1.8949698209762573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013772159814835, + "step": 25910 + }, + { + "epoch": 0.51824, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013266754150390626, + "learning_rate": 0.0001, + "loss": 3.9872, + "loss/crossentropy": 1.9657301902770996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19463873654603958, + "step": 25912 + }, + { + "epoch": 0.51828, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0107818603515625, + "learning_rate": 0.0001, + "loss": 3.9089, + "loss/crossentropy": 1.9067611694335938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1767619624733925, + "step": 25914 + }, + { + "epoch": 0.51832, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012032063802083333, + "learning_rate": 0.0001, + "loss": 3.8141, + "loss/crossentropy": 1.9040088653564453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19490251690149307, + "step": 25916 + }, + { + "epoch": 0.51836, + "grad_norm": 1.6796875, + "grad_norm_var": 0.014776357014973958, + "learning_rate": 0.0001, + "loss": 3.8649, + "loss/crossentropy": 1.9055203795433044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863974630832672, + "step": 25918 + }, + { + "epoch": 0.5184, + "grad_norm": 1.828125, + "grad_norm_var": 0.012507883707682292, + "learning_rate": 0.0001, + "loss": 3.8156, + "loss/crossentropy": 2.01656973361969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1821809709072113, + "step": 25920 + }, + { + "epoch": 0.51844, + "grad_norm": 1.859375, + "grad_norm_var": 0.012629191080729166, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 1.9760606288909912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18721874803304672, + "step": 25922 + }, + { + "epoch": 0.51848, + "grad_norm": 1.9375, + "grad_norm_var": 0.008579254150390625, + "learning_rate": 0.0001, + "loss": 3.8369, + "loss/crossentropy": 2.002072513103485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2038237452507019, + "step": 25924 + }, + { + "epoch": 0.51852, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008454386393229167, + "learning_rate": 0.0001, + "loss": 4.0081, + "loss/crossentropy": 2.329070210456848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19514091312885284, + "step": 25926 + }, + { + "epoch": 0.51856, + "grad_norm": 1.953125, + "grad_norm_var": 0.008243815104166666, + "learning_rate": 0.0001, + "loss": 3.8548, + "loss/crossentropy": 2.008804202079773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18007662147283554, + "step": 25928 + }, + { + "epoch": 0.5186, + "grad_norm": 1.8359375, + "grad_norm_var": 0.00882568359375, + "learning_rate": 0.0001, + "loss": 3.8927, + "loss/crossentropy": 1.7008295059204102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17658023536205292, + "step": 25930 + }, + { + "epoch": 0.51864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009455362955729166, + "learning_rate": 0.0001, + "loss": 4.2386, + "loss/crossentropy": 2.215006470680237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486676692962646, + "step": 25932 + }, + { + "epoch": 0.51868, + "grad_norm": 1.890625, + "grad_norm_var": 0.006404368082682291, + "learning_rate": 0.0001, + "loss": 3.6252, + "loss/crossentropy": 1.5741249322891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15962044149637222, + "step": 25934 + }, + { + "epoch": 0.51872, + "grad_norm": 1.7421875, + "grad_norm_var": 0.0077545166015625, + "learning_rate": 0.0001, + "loss": 3.8169, + "loss/crossentropy": 1.8923559784889221, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1873473823070526, + "step": 25936 + }, + { + "epoch": 0.51876, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0076067606608072914, + "learning_rate": 0.0001, + "loss": 3.8542, + "loss/crossentropy": 1.7315371632575989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17056572437286377, + "step": 25938 + }, + { + "epoch": 0.5188, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007469685872395834, + "learning_rate": 0.0001, + "loss": 3.9419, + "loss/crossentropy": 1.9019699096679688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19772474467754364, + "step": 25940 + }, + { + "epoch": 0.51884, + "grad_norm": 1.953125, + "grad_norm_var": 0.006068674723307291, + "learning_rate": 0.0001, + "loss": 3.9864, + "loss/crossentropy": 2.0523456931114197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19947312027215958, + "step": 25942 + }, + { + "epoch": 0.51888, + "grad_norm": 2.0, + "grad_norm_var": 0.005785115559895833, + "learning_rate": 0.0001, + "loss": 4.046, + "loss/crossentropy": 1.9278483390808105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19414179772138596, + "step": 25944 + }, + { + "epoch": 0.51892, + "grad_norm": 1.9375, + "grad_norm_var": 0.004874674479166666, + "learning_rate": 0.0001, + "loss": 3.8827, + "loss/crossentropy": 2.059605300426483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402428179979324, + "step": 25946 + }, + { + "epoch": 0.51896, + "grad_norm": 2.46875, + "grad_norm_var": 0.023219553629557292, + "learning_rate": 0.0001, + "loss": 4.34, + "loss/crossentropy": 2.2166699171066284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21071894466876984, + "step": 25948 + }, + { + "epoch": 0.519, + "grad_norm": 1.9765625, + "grad_norm_var": 0.03566665649414062, + "learning_rate": 0.0001, + "loss": 4.3082, + "loss/crossentropy": 1.8669118881225586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2712375670671463, + "step": 25950 + }, + { + "epoch": 0.51904, + "grad_norm": 1.953125, + "grad_norm_var": 0.03169530232747396, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 1.9831582307815552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18969787657260895, + "step": 25952 + }, + { + "epoch": 0.51908, + "grad_norm": 3.421875, + "grad_norm_var": 0.15215835571289063, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.092381715774536, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.31603462994098663, + "step": 25954 + }, + { + "epoch": 0.51912, + "grad_norm": 1.90625, + "grad_norm_var": 0.15208231608072917, + "learning_rate": 0.0001, + "loss": 3.7941, + "loss/crossentropy": 2.083071291446686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18124651163816452, + "step": 25956 + }, + { + "epoch": 0.51916, + "grad_norm": 2.046875, + "grad_norm_var": 0.1500017801920573, + "learning_rate": 0.0001, + "loss": 4.2498, + "loss/crossentropy": 2.280423641204834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21027068048715591, + "step": 25958 + }, + { + "epoch": 0.5192, + "grad_norm": 1.78125, + "grad_norm_var": 0.15642471313476564, + "learning_rate": 0.0001, + "loss": 4.0311, + "loss/crossentropy": 2.176509916782379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987079679965973, + "step": 25960 + }, + { + "epoch": 0.51924, + "grad_norm": 1.7734375, + "grad_norm_var": 0.16028416951497396, + "learning_rate": 0.0001, + "loss": 3.9031, + "loss/crossentropy": 1.918852150440216, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17551065981388092, + "step": 25962 + }, + { + "epoch": 0.51928, + "grad_norm": 1.9296875, + "grad_norm_var": 0.15350748697916666, + "learning_rate": 0.0001, + "loss": 3.8734, + "loss/crossentropy": 2.238860845565796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2066703960299492, + "step": 25964 + }, + { + "epoch": 0.51932, + "grad_norm": 2.09375, + "grad_norm_var": 0.1447771708170573, + "learning_rate": 0.0001, + "loss": 4.261, + "loss/crossentropy": 2.1192798614501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21037639677524567, + "step": 25966 + }, + { + "epoch": 0.51936, + "grad_norm": 1.84375, + "grad_norm_var": 0.14727554321289063, + "learning_rate": 0.0001, + "loss": 3.9908, + "loss/crossentropy": 2.0698190927505493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19720114767551422, + "step": 25968 + }, + { + "epoch": 0.5194, + "grad_norm": 2.15625, + "grad_norm_var": 0.0139801025390625, + "learning_rate": 0.0001, + "loss": 4.3106, + "loss/crossentropy": 2.273471713066101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21805588901042938, + "step": 25970 + }, + { + "epoch": 0.51944, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0140869140625, + "learning_rate": 0.0001, + "loss": 3.9038, + "loss/crossentropy": 2.1107550263404846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19864343851804733, + "step": 25972 + }, + { + "epoch": 0.51948, + "grad_norm": 1.9375, + "grad_norm_var": 0.013566080729166667, + "learning_rate": 0.0001, + "loss": 4.0438, + "loss/crossentropy": 1.9644930362701416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18844127655029297, + "step": 25974 + }, + { + "epoch": 0.51952, + "grad_norm": 1.953125, + "grad_norm_var": 0.0092926025390625, + "learning_rate": 0.0001, + "loss": 4.1062, + "loss/crossentropy": 2.120868682861328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19970117509365082, + "step": 25976 + }, + { + "epoch": 0.51956, + "grad_norm": 1.90625, + "grad_norm_var": 0.007201131184895833, + "learning_rate": 0.0001, + "loss": 3.8065, + "loss/crossentropy": 1.6433513164520264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15953880548477173, + "step": 25978 + }, + { + "epoch": 0.5196, + "grad_norm": 1.9375, + "grad_norm_var": 0.009043121337890625, + "learning_rate": 0.0001, + "loss": 4.1572, + "loss/crossentropy": 2.0936105847358704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19382526725530624, + "step": 25980 + }, + { + "epoch": 0.51964, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008234659830729166, + "learning_rate": 0.0001, + "loss": 3.9524, + "loss/crossentropy": 2.3531426191329956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20965352654457092, + "step": 25982 + }, + { + "epoch": 0.51968, + "grad_norm": 1.984375, + "grad_norm_var": 0.006493123372395834, + "learning_rate": 0.0001, + "loss": 4.1327, + "loss/crossentropy": 2.342974007129669, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20701958984136581, + "step": 25984 + }, + { + "epoch": 0.51972, + "grad_norm": 2.09375, + "grad_norm_var": 0.0052154541015625, + "learning_rate": 0.0001, + "loss": 4.3142, + "loss/crossentropy": 2.193662643432617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072567343711853, + "step": 25986 + }, + { + "epoch": 0.51976, + "grad_norm": 1.9375, + "grad_norm_var": 0.004809315999348958, + "learning_rate": 0.0001, + "loss": 4.1056, + "loss/crossentropy": 1.8900989294052124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910807117819786, + "step": 25988 + }, + { + "epoch": 0.5198, + "grad_norm": 1.984375, + "grad_norm_var": 0.005387369791666667, + "learning_rate": 0.0001, + "loss": 4.3945, + "loss/crossentropy": 2.243148624897003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20235490798950195, + "step": 25990 + }, + { + "epoch": 0.51984, + "grad_norm": 1.796875, + "grad_norm_var": 0.007513173421223958, + "learning_rate": 0.0001, + "loss": 3.9088, + "loss/crossentropy": 2.105653166770935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20262588560581207, + "step": 25992 + }, + { + "epoch": 0.51988, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0073811848958333336, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 1.9967975616455078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18423616141080856, + "step": 25994 + }, + { + "epoch": 0.51992, + "grad_norm": 2.203125, + "grad_norm_var": 0.009565989176432291, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 2.357354164123535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2126748189330101, + "step": 25996 + }, + { + "epoch": 0.51996, + "grad_norm": 1.671875, + "grad_norm_var": 0.0246002197265625, + "learning_rate": 0.0001, + "loss": 3.6427, + "loss/crossentropy": 2.22074818611145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187141053378582, + "step": 25998 + }, + { + "epoch": 0.52, + "grad_norm": 1.984375, + "grad_norm_var": 0.0246002197265625, + "learning_rate": 0.0001, + "loss": 4.1297, + "loss/crossentropy": 2.0902404189109802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18575181812047958, + "step": 26000 + }, + { + "epoch": 0.52004, + "grad_norm": 1.9921875, + "grad_norm_var": 0.025172678629557292, + "learning_rate": 0.0001, + "loss": 4.2075, + "loss/crossentropy": 1.9274433851242065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.178068108856678, + "step": 26002 + }, + { + "epoch": 0.52008, + "grad_norm": 1.8515625, + "grad_norm_var": 0.026362864176432292, + "learning_rate": 0.0001, + "loss": 4.0935, + "loss/crossentropy": 2.3539743423461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2101089358329773, + "step": 26004 + }, + { + "epoch": 0.52012, + "grad_norm": 1.9765625, + "grad_norm_var": 0.027264149983723958, + "learning_rate": 0.0001, + "loss": 3.9747, + "loss/crossentropy": 1.7968198657035828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17364778369665146, + "step": 26006 + }, + { + "epoch": 0.52016, + "grad_norm": 1.9296875, + "grad_norm_var": 0.025731404622395832, + "learning_rate": 0.0001, + "loss": 4.0823, + "loss/crossentropy": 1.7845246195793152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18205135315656662, + "step": 26008 + }, + { + "epoch": 0.5202, + "grad_norm": 1.875, + "grad_norm_var": 0.027034250895182292, + "learning_rate": 0.0001, + "loss": 3.8676, + "loss/crossentropy": 2.309031844139099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21052244305610657, + "step": 26010 + }, + { + "epoch": 0.52024, + "grad_norm": 2.0625, + "grad_norm_var": 0.024491373697916666, + "learning_rate": 0.0001, + "loss": 3.9956, + "loss/crossentropy": 1.9549171328544617, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2348322793841362, + "step": 26012 + }, + { + "epoch": 0.52028, + "grad_norm": 2.03125, + "grad_norm_var": 0.009211222330729166, + "learning_rate": 0.0001, + "loss": 4.3597, + "loss/crossentropy": 2.5377613306045532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23388546705245972, + "step": 26014 + }, + { + "epoch": 0.52032, + "grad_norm": 2.0625, + "grad_norm_var": 0.009912109375, + "learning_rate": 0.0001, + "loss": 4.244, + "loss/crossentropy": 2.1748557090759277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20094042271375656, + "step": 26016 + }, + { + "epoch": 0.52036, + "grad_norm": 1.921875, + "grad_norm_var": 0.007661946614583333, + "learning_rate": 0.0001, + "loss": 3.8643, + "loss/crossentropy": 1.4638828039169312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15480295568704605, + "step": 26018 + }, + { + "epoch": 0.5204, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011408487955729166, + "learning_rate": 0.0001, + "loss": 4.041, + "loss/crossentropy": 2.058153450489044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19334695488214493, + "step": 26020 + }, + { + "epoch": 0.52044, + "grad_norm": 1.96875, + "grad_norm_var": 0.011543782552083333, + "learning_rate": 0.0001, + "loss": 4.1254, + "loss/crossentropy": 2.0739980936050415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19739702343940735, + "step": 26022 + }, + { + "epoch": 0.52048, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0109619140625, + "learning_rate": 0.0001, + "loss": 4.1202, + "loss/crossentropy": 2.0489348769187927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2435540407896042, + "step": 26024 + }, + { + "epoch": 0.52052, + "grad_norm": 2.0625, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 4.1301, + "loss/crossentropy": 1.9589496850967407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1817004606127739, + "step": 26026 + }, + { + "epoch": 0.52056, + "grad_norm": 2.046875, + "grad_norm_var": 0.009032185872395833, + "learning_rate": 0.0001, + "loss": 4.1931, + "loss/crossentropy": 2.2213997840881348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844081044197083, + "step": 26028 + }, + { + "epoch": 0.5206, + "grad_norm": 1.9609375, + "grad_norm_var": 0.010863240559895833, + "learning_rate": 0.0001, + "loss": 3.9974, + "loss/crossentropy": 1.920683741569519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19115915894508362, + "step": 26030 + }, + { + "epoch": 0.52064, + "grad_norm": 1.84375, + "grad_norm_var": 0.012599436442057292, + "learning_rate": 0.0001, + "loss": 4.011, + "loss/crossentropy": 2.142166316509247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18952738493680954, + "step": 26032 + }, + { + "epoch": 0.52068, + "grad_norm": 2.015625, + "grad_norm_var": 0.013067372639973958, + "learning_rate": 0.0001, + "loss": 4.2289, + "loss/crossentropy": 2.1924999952316284, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21429342031478882, + "step": 26034 + }, + { + "epoch": 0.52072, + "grad_norm": 1.890625, + "grad_norm_var": 0.010874176025390625, + "learning_rate": 0.0001, + "loss": 3.7799, + "loss/crossentropy": 2.1277804374694824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19686074554920197, + "step": 26036 + }, + { + "epoch": 0.52076, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010959625244140625, + "learning_rate": 0.0001, + "loss": 3.8865, + "loss/crossentropy": 2.2193630933761597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20152346789836884, + "step": 26038 + }, + { + "epoch": 0.5208, + "grad_norm": 1.921875, + "grad_norm_var": 0.010545857747395833, + "learning_rate": 0.0001, + "loss": 4.1349, + "loss/crossentropy": 1.8031319975852966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18107134103775024, + "step": 26040 + }, + { + "epoch": 0.52084, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009032185872395833, + "learning_rate": 0.0001, + "loss": 4.0376, + "loss/crossentropy": 1.9713850021362305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17920061200857162, + "step": 26042 + }, + { + "epoch": 0.52088, + "grad_norm": 1.796875, + "grad_norm_var": 0.00533447265625, + "learning_rate": 0.0001, + "loss": 4.142, + "loss/crossentropy": 2.5888466835021973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081139236688614, + "step": 26044 + }, + { + "epoch": 0.52092, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004811350504557292, + "learning_rate": 0.0001, + "loss": 4.1217, + "loss/crossentropy": 2.4226022958755493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20813634991645813, + "step": 26046 + }, + { + "epoch": 0.52096, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005248006184895833, + "learning_rate": 0.0001, + "loss": 4.1199, + "loss/crossentropy": 2.179656744003296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19288229197263718, + "step": 26048 + }, + { + "epoch": 0.521, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0041463216145833336, + "learning_rate": 0.0001, + "loss": 4.0571, + "loss/crossentropy": 2.014236092567444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1917470395565033, + "step": 26050 + }, + { + "epoch": 0.52104, + "grad_norm": 1.890625, + "grad_norm_var": 0.0035519917805989583, + "learning_rate": 0.0001, + "loss": 3.7101, + "loss/crossentropy": 2.12498939037323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18156899511814117, + "step": 26052 + }, + { + "epoch": 0.52108, + "grad_norm": 1.875, + "grad_norm_var": 0.003342437744140625, + "learning_rate": 0.0001, + "loss": 3.9443, + "loss/crossentropy": 1.8441026210784912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18120328336954117, + "step": 26054 + }, + { + "epoch": 0.52112, + "grad_norm": 2.109375, + "grad_norm_var": 0.008603668212890625, + "learning_rate": 0.0001, + "loss": 3.9958, + "loss/crossentropy": 2.047904908657074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19617904722690582, + "step": 26056 + }, + { + "epoch": 0.52116, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008697255452473959, + "learning_rate": 0.0001, + "loss": 4.0111, + "loss/crossentropy": 1.7961552739143372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16928328573703766, + "step": 26058 + }, + { + "epoch": 0.5212, + "grad_norm": 1.7890625, + "grad_norm_var": 0.008763631184895834, + "learning_rate": 0.0001, + "loss": 4.0501, + "loss/crossentropy": 2.0813616514205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18402273952960968, + "step": 26060 + }, + { + "epoch": 0.52124, + "grad_norm": 1.90625, + "grad_norm_var": 0.008632151285807292, + "learning_rate": 0.0001, + "loss": 3.8866, + "loss/crossentropy": 2.13175368309021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19714613258838654, + "step": 26062 + }, + { + "epoch": 0.52128, + "grad_norm": 1.859375, + "grad_norm_var": 0.008640289306640625, + "learning_rate": 0.0001, + "loss": 3.9793, + "loss/crossentropy": 2.0052719712257385, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20111151784658432, + "step": 26064 + }, + { + "epoch": 0.52132, + "grad_norm": 2.171875, + "grad_norm_var": 0.013216145833333333, + "learning_rate": 0.0001, + "loss": 4.2311, + "loss/crossentropy": 1.939625859260559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20675888657569885, + "step": 26066 + }, + { + "epoch": 0.52136, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013255818684895834, + "learning_rate": 0.0001, + "loss": 3.7256, + "loss/crossentropy": 2.208404779434204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18266795575618744, + "step": 26068 + }, + { + "epoch": 0.5214, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 3.8233, + "loss/crossentropy": 1.8239786028862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409103155136108, + "step": 26070 + }, + { + "epoch": 0.52144, + "grad_norm": 1.8125, + "grad_norm_var": 0.007743326822916666, + "learning_rate": 0.0001, + "loss": 4.0984, + "loss/crossentropy": 1.951551616191864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17273114621639252, + "step": 26072 + }, + { + "epoch": 0.52148, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007899729410807292, + "learning_rate": 0.0001, + "loss": 3.6726, + "loss/crossentropy": 1.6747522354125977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1755337342619896, + "step": 26074 + }, + { + "epoch": 0.52152, + "grad_norm": 1.9375, + "grad_norm_var": 0.007661946614583333, + "learning_rate": 0.0001, + "loss": 4.0389, + "loss/crossentropy": 1.954121172428131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2060263231396675, + "step": 26076 + }, + { + "epoch": 0.52156, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007757314046223958, + "learning_rate": 0.0001, + "loss": 4.0388, + "loss/crossentropy": 2.025652050971985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19741995632648468, + "step": 26078 + }, + { + "epoch": 0.5216, + "grad_norm": 2.046875, + "grad_norm_var": 0.010249582926432292, + "learning_rate": 0.0001, + "loss": 4.0048, + "loss/crossentropy": 2.150280773639679, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016005516052246, + "step": 26080 + }, + { + "epoch": 0.52164, + "grad_norm": 2.125, + "grad_norm_var": 0.009211985270182292, + "learning_rate": 0.0001, + "loss": 4.0257, + "loss/crossentropy": 1.9364630579948425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17605994641780853, + "step": 26082 + }, + { + "epoch": 0.52168, + "grad_norm": 1.890625, + "grad_norm_var": 0.009226226806640625, + "learning_rate": 0.0001, + "loss": 4.1845, + "loss/crossentropy": 2.195487380027771, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19364381581544876, + "step": 26084 + }, + { + "epoch": 0.52172, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0098785400390625, + "learning_rate": 0.0001, + "loss": 3.7514, + "loss/crossentropy": 1.6146498918533325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726212576031685, + "step": 26086 + }, + { + "epoch": 0.52176, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009968821207682292, + "learning_rate": 0.0001, + "loss": 3.8782, + "loss/crossentropy": 2.2554049491882324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20600321888923645, + "step": 26088 + }, + { + "epoch": 0.5218, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010210927327473958, + "learning_rate": 0.0001, + "loss": 3.8424, + "loss/crossentropy": 2.0821692943573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20989617705345154, + "step": 26090 + }, + { + "epoch": 0.52184, + "grad_norm": 2.015625, + "grad_norm_var": 0.009765625, + "learning_rate": 0.0001, + "loss": 4.1895, + "loss/crossentropy": 2.114021897315979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20276522636413574, + "step": 26092 + }, + { + "epoch": 0.52188, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009897613525390625, + "learning_rate": 0.0001, + "loss": 3.9856, + "loss/crossentropy": 1.7443894147872925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15722404420375824, + "step": 26094 + }, + { + "epoch": 0.52192, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 3.5758, + "loss/crossentropy": 1.603552222251892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14188075065612793, + "step": 26096 + }, + { + "epoch": 0.52196, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0070269266764322914, + "learning_rate": 0.0001, + "loss": 3.9047, + "loss/crossentropy": 1.7003676295280457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1696496084332466, + "step": 26098 + }, + { + "epoch": 0.522, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0069964090983072914, + "learning_rate": 0.0001, + "loss": 3.9792, + "loss/crossentropy": 2.1242096424102783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936848685145378, + "step": 26100 + }, + { + "epoch": 0.52204, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0076416015625, + "learning_rate": 0.0001, + "loss": 3.9315, + "loss/crossentropy": 2.012674570083618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068016678094864, + "step": 26102 + }, + { + "epoch": 0.52208, + "grad_norm": 2.203125, + "grad_norm_var": 0.014304351806640626, + "learning_rate": 0.0001, + "loss": 4.0618, + "loss/crossentropy": 2.0136077404022217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968667060136795, + "step": 26104 + }, + { + "epoch": 0.52212, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0141021728515625, + "learning_rate": 0.0001, + "loss": 3.9797, + "loss/crossentropy": 1.8880528211593628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18080192804336548, + "step": 26106 + }, + { + "epoch": 0.52216, + "grad_norm": 2.015625, + "grad_norm_var": 0.014216105143229166, + "learning_rate": 0.0001, + "loss": 4.0981, + "loss/crossentropy": 2.109636068344116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19920819997787476, + "step": 26108 + }, + { + "epoch": 0.5222, + "grad_norm": 1.8984375, + "grad_norm_var": 0.015236155192057291, + "learning_rate": 0.0001, + "loss": 4.0566, + "loss/crossentropy": 1.8923597931861877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18053779006004333, + "step": 26110 + }, + { + "epoch": 0.52224, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011324055989583333, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.179795265197754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18849624693393707, + "step": 26112 + }, + { + "epoch": 0.52228, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011789703369140625, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 2.0764212608337402, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18905582278966904, + "step": 26114 + }, + { + "epoch": 0.52232, + "grad_norm": 2.0625, + "grad_norm_var": 0.012634023030598959, + "learning_rate": 0.0001, + "loss": 4.0006, + "loss/crossentropy": 2.2309343814849854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2156553491950035, + "step": 26116 + }, + { + "epoch": 0.52236, + "grad_norm": 1.78125, + "grad_norm_var": 0.012261708577473959, + "learning_rate": 0.0001, + "loss": 3.8689, + "loss/crossentropy": 1.8764967918395996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17533813416957855, + "step": 26118 + }, + { + "epoch": 0.5224, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0075266520182291664, + "learning_rate": 0.0001, + "loss": 3.916, + "loss/crossentropy": 2.1870853900909424, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984037682414055, + "step": 26120 + }, + { + "epoch": 0.52244, + "grad_norm": 1.828125, + "grad_norm_var": 0.007838694254557292, + "learning_rate": 0.0001, + "loss": 3.716, + "loss/crossentropy": 2.113897442817688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18819265812635422, + "step": 26122 + }, + { + "epoch": 0.52248, + "grad_norm": 2.0625, + "grad_norm_var": 0.008056386311848959, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 1.9980355501174927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20408682525157928, + "step": 26124 + }, + { + "epoch": 0.52252, + "grad_norm": 2.015625, + "grad_norm_var": 0.007784016927083333, + "learning_rate": 0.0001, + "loss": 4.0452, + "loss/crossentropy": 2.0322210788726807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19906745105981827, + "step": 26126 + }, + { + "epoch": 0.52256, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008304595947265625, + "learning_rate": 0.0001, + "loss": 3.9786, + "loss/crossentropy": 2.0122641921043396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2028798833489418, + "step": 26128 + }, + { + "epoch": 0.5226, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007352447509765625, + "learning_rate": 0.0001, + "loss": 4.0595, + "loss/crossentropy": 2.0169489979743958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20758477598428726, + "step": 26130 + }, + { + "epoch": 0.52264, + "grad_norm": 2.03125, + "grad_norm_var": 0.006754302978515625, + "learning_rate": 0.0001, + "loss": 4.2196, + "loss/crossentropy": 2.45787513256073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2269362136721611, + "step": 26132 + }, + { + "epoch": 0.52268, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0064198811848958336, + "learning_rate": 0.0001, + "loss": 3.9921, + "loss/crossentropy": 2.0325412154197693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20839164406061172, + "step": 26134 + }, + { + "epoch": 0.52272, + "grad_norm": 2.046875, + "grad_norm_var": 0.006815338134765625, + "learning_rate": 0.0001, + "loss": 4.1983, + "loss/crossentropy": 2.1984806060791016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21555981040000916, + "step": 26136 + }, + { + "epoch": 0.52276, + "grad_norm": 2.03125, + "grad_norm_var": 0.005440266927083334, + "learning_rate": 0.0001, + "loss": 3.9955, + "loss/crossentropy": 1.9628766775131226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2089555338025093, + "step": 26138 + }, + { + "epoch": 0.5228, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0069882710774739586, + "learning_rate": 0.0001, + "loss": 3.9137, + "loss/crossentropy": 2.1061203479766846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18837912380695343, + "step": 26140 + }, + { + "epoch": 0.52284, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014485677083333334, + "learning_rate": 0.0001, + "loss": 3.6926, + "loss/crossentropy": 2.120006561279297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19504809379577637, + "step": 26142 + }, + { + "epoch": 0.52288, + "grad_norm": 1.859375, + "grad_norm_var": 0.013749186197916667, + "learning_rate": 0.0001, + "loss": 3.8771, + "loss/crossentropy": 2.0000118613243103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19644330441951752, + "step": 26144 + }, + { + "epoch": 0.52292, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014411417643229167, + "learning_rate": 0.0001, + "loss": 4.2381, + "loss/crossentropy": 2.261639356613159, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21112587302923203, + "step": 26146 + }, + { + "epoch": 0.52296, + "grad_norm": 1.9453125, + "grad_norm_var": 0.013392893473307292, + "learning_rate": 0.0001, + "loss": 4.0808, + "loss/crossentropy": 2.203276216983795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19632557779550552, + "step": 26148 + }, + { + "epoch": 0.523, + "grad_norm": 1.953125, + "grad_norm_var": 0.0129302978515625, + "learning_rate": 0.0001, + "loss": 4.1363, + "loss/crossentropy": 2.1430857181549072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18741771578788757, + "step": 26150 + }, + { + "epoch": 0.52304, + "grad_norm": 1.828125, + "grad_norm_var": 0.012581380208333333, + "learning_rate": 0.0001, + "loss": 3.9574, + "loss/crossentropy": 2.094003438949585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19085614383220673, + "step": 26152 + }, + { + "epoch": 0.52308, + "grad_norm": 1.9140625, + "grad_norm_var": 0.011800130208333334, + "learning_rate": 0.0001, + "loss": 3.9145, + "loss/crossentropy": 1.9885526299476624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17914890497922897, + "step": 26154 + }, + { + "epoch": 0.52312, + "grad_norm": 1.7734375, + "grad_norm_var": 0.012263743082682292, + "learning_rate": 0.0001, + "loss": 3.9942, + "loss/crossentropy": 1.885224461555481, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1779332235455513, + "step": 26156 + }, + { + "epoch": 0.52316, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007411448160807291, + "learning_rate": 0.0001, + "loss": 3.9081, + "loss/crossentropy": 1.8236491084098816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17156238853931427, + "step": 26158 + }, + { + "epoch": 0.5232, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008685048421223958, + "learning_rate": 0.0001, + "loss": 3.8614, + "loss/crossentropy": 1.7451134324073792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16748512536287308, + "step": 26160 + }, + { + "epoch": 0.52324, + "grad_norm": 2.046875, + "grad_norm_var": 0.00872802734375, + "learning_rate": 0.0001, + "loss": 4.0804, + "loss/crossentropy": 2.408850908279419, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20247718691825867, + "step": 26162 + }, + { + "epoch": 0.52328, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0099273681640625, + "learning_rate": 0.0001, + "loss": 4.0908, + "loss/crossentropy": 2.265584349632263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20462538301944733, + "step": 26164 + }, + { + "epoch": 0.52332, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008676910400390625, + "learning_rate": 0.0001, + "loss": 3.7925, + "loss/crossentropy": 1.8766103982925415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740010604262352, + "step": 26166 + }, + { + "epoch": 0.52336, + "grad_norm": 2.046875, + "grad_norm_var": 0.009413401285807291, + "learning_rate": 0.0001, + "loss": 3.9389, + "loss/crossentropy": 1.858881413936615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176437109708786, + "step": 26168 + }, + { + "epoch": 0.5234, + "grad_norm": 1.96875, + "grad_norm_var": 0.008907063802083334, + "learning_rate": 0.0001, + "loss": 3.9833, + "loss/crossentropy": 1.733948528766632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.183179572224617, + "step": 26170 + }, + { + "epoch": 0.52344, + "grad_norm": 2.34375, + "grad_norm_var": 0.017923990885416668, + "learning_rate": 0.0001, + "loss": 4.4352, + "loss/crossentropy": 1.7623996138572693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.3073955178260803, + "step": 26172 + }, + { + "epoch": 0.52348, + "grad_norm": 1.921875, + "grad_norm_var": 0.017470041910807293, + "learning_rate": 0.0001, + "loss": 4.0571, + "loss/crossentropy": 1.9229013919830322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19095296412706375, + "step": 26174 + }, + { + "epoch": 0.52352, + "grad_norm": 1.84375, + "grad_norm_var": 0.014867146809895834, + "learning_rate": 0.0001, + "loss": 4.1139, + "loss/crossentropy": 2.155863106250763, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953117996454239, + "step": 26176 + }, + { + "epoch": 0.52356, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01754735310872396, + "learning_rate": 0.0001, + "loss": 3.8574, + "loss/crossentropy": 1.8662053942680359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17596308141946793, + "step": 26178 + }, + { + "epoch": 0.5236, + "grad_norm": 2.4375, + "grad_norm_var": 0.03288548787434896, + "learning_rate": 0.0001, + "loss": 4.2924, + "loss/crossentropy": 2.0425168871879578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2088511660695076, + "step": 26180 + }, + { + "epoch": 0.52364, + "grad_norm": 1.875, + "grad_norm_var": 0.031689453125, + "learning_rate": 0.0001, + "loss": 4.0537, + "loss/crossentropy": 2.3064836263656616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20652327686548233, + "step": 26182 + }, + { + "epoch": 0.52368, + "grad_norm": 2.171875, + "grad_norm_var": 0.035065714518229166, + "learning_rate": 0.0001, + "loss": 4.2602, + "loss/crossentropy": 2.437976837158203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21545469760894775, + "step": 26184 + }, + { + "epoch": 0.52372, + "grad_norm": 2.03125, + "grad_norm_var": 0.034645334879557295, + "learning_rate": 0.0001, + "loss": 3.8111, + "loss/crossentropy": 1.8655226826667786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19221200793981552, + "step": 26186 + }, + { + "epoch": 0.52376, + "grad_norm": 1.8203125, + "grad_norm_var": 0.027339426676432292, + "learning_rate": 0.0001, + "loss": 3.9068, + "loss/crossentropy": 1.5981062650680542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15811648964881897, + "step": 26188 + }, + { + "epoch": 0.5238, + "grad_norm": 1.890625, + "grad_norm_var": 0.02739232381184896, + "learning_rate": 0.0001, + "loss": 3.6361, + "loss/crossentropy": 2.0123122334480286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117129176855087, + "step": 26190 + }, + { + "epoch": 0.52384, + "grad_norm": 1.921875, + "grad_norm_var": 0.02658869425455729, + "learning_rate": 0.0001, + "loss": 4.2715, + "loss/crossentropy": 1.8640416860580444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17403434216976166, + "step": 26192 + }, + { + "epoch": 0.52388, + "grad_norm": 1.9375, + "grad_norm_var": 0.026683553059895834, + "learning_rate": 0.0001, + "loss": 3.9331, + "loss/crossentropy": 1.9721081256866455, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18515276163816452, + "step": 26194 + }, + { + "epoch": 0.52392, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0124176025390625, + "learning_rate": 0.0001, + "loss": 3.9945, + "loss/crossentropy": 1.9967116713523865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1789851263165474, + "step": 26196 + }, + { + "epoch": 0.52396, + "grad_norm": 2.015625, + "grad_norm_var": 0.012467193603515624, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 1.9825578331947327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994011253118515, + "step": 26198 + }, + { + "epoch": 0.524, + "grad_norm": 1.7734375, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 3.8301, + "loss/crossentropy": 1.9351627230644226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18274720758199692, + "step": 26200 + }, + { + "epoch": 0.52404, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011970011393229167, + "learning_rate": 0.0001, + "loss": 3.7656, + "loss/crossentropy": 1.9799736738204956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18502405285835266, + "step": 26202 + }, + { + "epoch": 0.52408, + "grad_norm": 1.828125, + "grad_norm_var": 0.011896769205729166, + "learning_rate": 0.0001, + "loss": 3.8567, + "loss/crossentropy": 1.6783319115638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14768389612436295, + "step": 26204 + }, + { + "epoch": 0.52412, + "grad_norm": 2.03125, + "grad_norm_var": 0.022709147135416666, + "learning_rate": 0.0001, + "loss": 3.8827, + "loss/crossentropy": 2.167922616004944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2303374707698822, + "step": 26206 + }, + { + "epoch": 0.52416, + "grad_norm": 1.9453125, + "grad_norm_var": 0.022359212239583332, + "learning_rate": 0.0001, + "loss": 3.9447, + "loss/crossentropy": 2.075382351875305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19902942329645157, + "step": 26208 + }, + { + "epoch": 0.5242, + "grad_norm": 1.6796875, + "grad_norm_var": 0.021203358968098957, + "learning_rate": 0.0001, + "loss": 3.7008, + "loss/crossentropy": 1.9230342507362366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18066919595003128, + "step": 26210 + }, + { + "epoch": 0.52424, + "grad_norm": 1.8671875, + "grad_norm_var": 0.021134440104166666, + "learning_rate": 0.0001, + "loss": 3.7475, + "loss/crossentropy": 2.000435531139374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19018910825252533, + "step": 26212 + }, + { + "epoch": 0.52428, + "grad_norm": 2.1875, + "grad_norm_var": 0.025569407145182292, + "learning_rate": 0.0001, + "loss": 4.1926, + "loss/crossentropy": 1.887748658657074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17914170771837234, + "step": 26214 + }, + { + "epoch": 0.52432, + "grad_norm": 1.9296875, + "grad_norm_var": 0.023911285400390624, + "learning_rate": 0.0001, + "loss": 3.8331, + "loss/crossentropy": 1.9090191721916199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18189192563295364, + "step": 26216 + }, + { + "epoch": 0.52436, + "grad_norm": 1.8671875, + "grad_norm_var": 0.023331705729166666, + "learning_rate": 0.0001, + "loss": 3.9697, + "loss/crossentropy": 1.9368041157722473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18426424264907837, + "step": 26218 + }, + { + "epoch": 0.5244, + "grad_norm": 1.875, + "grad_norm_var": 0.022761027018229168, + "learning_rate": 0.0001, + "loss": 3.7137, + "loss/crossentropy": 2.1554033160209656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20169231295585632, + "step": 26220 + }, + { + "epoch": 0.52444, + "grad_norm": 1.84375, + "grad_norm_var": 0.011848704020182291, + "learning_rate": 0.0001, + "loss": 3.78, + "loss/crossentropy": 2.013387084007263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1921246349811554, + "step": 26222 + }, + { + "epoch": 0.52448, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011848704020182291, + "learning_rate": 0.0001, + "loss": 4.045, + "loss/crossentropy": 2.166366934776306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19481656700372696, + "step": 26224 + }, + { + "epoch": 0.52452, + "grad_norm": 2.171875, + "grad_norm_var": 0.012669881184895834, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 2.15239155292511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20820968598127365, + "step": 26226 + }, + { + "epoch": 0.52456, + "grad_norm": 1.8828125, + "grad_norm_var": 0.014229329427083333, + "learning_rate": 0.0001, + "loss": 3.9075, + "loss/crossentropy": 2.3724613189697266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21621037274599075, + "step": 26228 + }, + { + "epoch": 0.5246, + "grad_norm": 1.890625, + "grad_norm_var": 0.008888498942057291, + "learning_rate": 0.0001, + "loss": 3.9089, + "loss/crossentropy": 2.34514319896698, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21700077503919601, + "step": 26230 + }, + { + "epoch": 0.52464, + "grad_norm": 1.7890625, + "grad_norm_var": 0.009085845947265626, + "learning_rate": 0.0001, + "loss": 3.8283, + "loss/crossentropy": 1.7633844017982483, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17453230917453766, + "step": 26232 + }, + { + "epoch": 0.52468, + "grad_norm": 1.984375, + "grad_norm_var": 0.010135650634765625, + "learning_rate": 0.0001, + "loss": 3.6374, + "loss/crossentropy": 1.952212929725647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18606504052877426, + "step": 26234 + }, + { + "epoch": 0.52472, + "grad_norm": 2.109375, + "grad_norm_var": 0.012851715087890625, + "learning_rate": 0.0001, + "loss": 4.4356, + "loss/crossentropy": 2.2077181935310364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19942647963762283, + "step": 26236 + }, + { + "epoch": 0.52476, + "grad_norm": 2.046875, + "grad_norm_var": 0.014029947916666667, + "learning_rate": 0.0001, + "loss": 4.2066, + "loss/crossentropy": 2.110091209411621, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2050892785191536, + "step": 26238 + }, + { + "epoch": 0.5248, + "grad_norm": 1.9921875, + "grad_norm_var": 0.014284006754557292, + "learning_rate": 0.0001, + "loss": 3.9062, + "loss/crossentropy": 2.2912864685058594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043801248073578, + "step": 26240 + }, + { + "epoch": 0.52484, + "grad_norm": 1.84375, + "grad_norm_var": 0.010396067301432292, + "learning_rate": 0.0001, + "loss": 3.9453, + "loss/crossentropy": 2.280913293361664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21790295839309692, + "step": 26242 + }, + { + "epoch": 0.52488, + "grad_norm": 1.875, + "grad_norm_var": 0.008568318684895833, + "learning_rate": 0.0001, + "loss": 3.9073, + "loss/crossentropy": 1.9681018590927124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18275773525238037, + "step": 26244 + }, + { + "epoch": 0.52492, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009073893229166666, + "learning_rate": 0.0001, + "loss": 4.0666, + "loss/crossentropy": 2.1007355451583862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20663000643253326, + "step": 26246 + }, + { + "epoch": 0.52496, + "grad_norm": 1.796875, + "grad_norm_var": 0.009029134114583334, + "learning_rate": 0.0001, + "loss": 4.0375, + "loss/crossentropy": 2.1250157952308655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20055226981639862, + "step": 26248 + }, + { + "epoch": 0.525, + "grad_norm": 1.8359375, + "grad_norm_var": 0.008451334635416667, + "learning_rate": 0.0001, + "loss": 3.8399, + "loss/crossentropy": 1.8575809597969055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18729300796985626, + "step": 26250 + }, + { + "epoch": 0.52504, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006150054931640625, + "learning_rate": 0.0001, + "loss": 4.0213, + "loss/crossentropy": 2.172927498817444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007719844579697, + "step": 26252 + }, + { + "epoch": 0.52508, + "grad_norm": 1.765625, + "grad_norm_var": 0.005271148681640625, + "learning_rate": 0.0001, + "loss": 3.8717, + "loss/crossentropy": 2.200989246368408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2174500823020935, + "step": 26254 + }, + { + "epoch": 0.52512, + "grad_norm": 1.890625, + "grad_norm_var": 0.004835764567057292, + "learning_rate": 0.0001, + "loss": 4.051, + "loss/crossentropy": 2.1007824540138245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20543181151151657, + "step": 26256 + }, + { + "epoch": 0.52516, + "grad_norm": 1.921875, + "grad_norm_var": 0.3579241434733073, + "learning_rate": 0.0001, + "loss": 3.9813, + "loss/crossentropy": 1.8371055722236633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18277855217456818, + "step": 26258 + }, + { + "epoch": 0.5252, + "grad_norm": 1.9765625, + "grad_norm_var": 0.3564849853515625, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.9994609355926514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19775213301181793, + "step": 26260 + }, + { + "epoch": 0.52524, + "grad_norm": 2.015625, + "grad_norm_var": 0.3601519266764323, + "learning_rate": 0.0001, + "loss": 3.8842, + "loss/crossentropy": 1.9619916677474976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894378885626793, + "step": 26262 + }, + { + "epoch": 0.52528, + "grad_norm": 1.9375, + "grad_norm_var": 0.3581438700358073, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 1.7938214540481567, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18398133665323257, + "step": 26264 + }, + { + "epoch": 0.52532, + "grad_norm": 2.109375, + "grad_norm_var": 0.3526995340983073, + "learning_rate": 0.0001, + "loss": 4.1002, + "loss/crossentropy": 2.0497565865516663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19529610127210617, + "step": 26266 + }, + { + "epoch": 0.52536, + "grad_norm": 1.8046875, + "grad_norm_var": 0.3564715067545573, + "learning_rate": 0.0001, + "loss": 3.8681, + "loss/crossentropy": 2.2085641622543335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090958058834076, + "step": 26268 + }, + { + "epoch": 0.5254, + "grad_norm": 1.953125, + "grad_norm_var": 0.3512034098307292, + "learning_rate": 0.0001, + "loss": 4.1566, + "loss/crossentropy": 2.143643379211426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21305573731660843, + "step": 26270 + }, + { + "epoch": 0.52544, + "grad_norm": 2.34375, + "grad_norm_var": 0.35494969685872396, + "learning_rate": 0.0001, + "loss": 4.1428, + "loss/crossentropy": 2.2753764390945435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21875588595867157, + "step": 26272 + }, + { + "epoch": 0.52548, + "grad_norm": 2.0, + "grad_norm_var": 0.09860610961914062, + "learning_rate": 0.0001, + "loss": 4.1114, + "loss/crossentropy": 2.0325489044189453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19430440664291382, + "step": 26274 + }, + { + "epoch": 0.52552, + "grad_norm": 1.984375, + "grad_norm_var": 0.0970293680826823, + "learning_rate": 0.0001, + "loss": 3.9278, + "loss/crossentropy": 2.3086185455322266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21185901761054993, + "step": 26276 + }, + { + "epoch": 0.52556, + "grad_norm": 2.03125, + "grad_norm_var": 0.0931060791015625, + "learning_rate": 0.0001, + "loss": 4.1219, + "loss/crossentropy": 2.1545599699020386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2019871026277542, + "step": 26278 + }, + { + "epoch": 0.5256, + "grad_norm": 2.03125, + "grad_norm_var": 0.08904622395833334, + "learning_rate": 0.0001, + "loss": 4.0154, + "loss/crossentropy": 1.8528248071670532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17894673347473145, + "step": 26280 + }, + { + "epoch": 0.52564, + "grad_norm": 1.796875, + "grad_norm_var": 0.09781901041666667, + "learning_rate": 0.0001, + "loss": 3.6732, + "loss/crossentropy": 1.7371985912322998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17898088693618774, + "step": 26282 + }, + { + "epoch": 0.52568, + "grad_norm": 1.78125, + "grad_norm_var": 0.10242284138997396, + "learning_rate": 0.0001, + "loss": 3.7005, + "loss/crossentropy": 1.8484544157981873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16372547298669815, + "step": 26284 + }, + { + "epoch": 0.52572, + "grad_norm": 1.859375, + "grad_norm_var": 0.10460586547851562, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 2.2279897928237915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19722852110862732, + "step": 26286 + }, + { + "epoch": 0.52576, + "grad_norm": 2.046875, + "grad_norm_var": 0.09019749959309896, + "learning_rate": 0.0001, + "loss": 4.0875, + "loss/crossentropy": 1.9138891100883484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17364701628684998, + "step": 26288 + }, + { + "epoch": 0.5258, + "grad_norm": 1.921875, + "grad_norm_var": 0.010786946614583333, + "learning_rate": 0.0001, + "loss": 3.8941, + "loss/crossentropy": 2.062381386756897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20237115025520325, + "step": 26290 + }, + { + "epoch": 0.52584, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010548655192057292, + "learning_rate": 0.0001, + "loss": 4.0351, + "loss/crossentropy": 2.094722032546997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1814640611410141, + "step": 26292 + }, + { + "epoch": 0.52588, + "grad_norm": 2.015625, + "grad_norm_var": 0.010334269205729166, + "learning_rate": 0.0001, + "loss": 4.2114, + "loss/crossentropy": 1.6870167255401611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19745031744241714, + "step": 26294 + }, + { + "epoch": 0.52592, + "grad_norm": 1.9375, + "grad_norm_var": 0.007062784830729167, + "learning_rate": 0.0001, + "loss": 3.7982, + "loss/crossentropy": 1.688876748085022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17806339263916016, + "step": 26296 + }, + { + "epoch": 0.52596, + "grad_norm": 1.8125, + "grad_norm_var": 0.006898752848307292, + "learning_rate": 0.0001, + "loss": 3.8256, + "loss/crossentropy": 2.0577162504196167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18965084105730057, + "step": 26298 + }, + { + "epoch": 0.526, + "grad_norm": 1.921875, + "grad_norm_var": 0.005057779947916666, + "learning_rate": 0.0001, + "loss": 3.8858, + "loss/crossentropy": 1.9294962882995605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645613431930542, + "step": 26300 + }, + { + "epoch": 0.52604, + "grad_norm": 2.0625, + "grad_norm_var": 0.0063435872395833336, + "learning_rate": 0.0001, + "loss": 3.9, + "loss/crossentropy": 2.1160236597061157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20182175934314728, + "step": 26302 + }, + { + "epoch": 0.52608, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005232747395833333, + "learning_rate": 0.0001, + "loss": 3.9438, + "loss/crossentropy": 2.0521216988563538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19732891023159027, + "step": 26304 + }, + { + "epoch": 0.52612, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004789225260416667, + "learning_rate": 0.0001, + "loss": 4.1944, + "loss/crossentropy": 2.3100136518478394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355679094791412, + "step": 26306 + }, + { + "epoch": 0.52616, + "grad_norm": 2.109375, + "grad_norm_var": 0.0065081278483072914, + "learning_rate": 0.0001, + "loss": 4.1077, + "loss/crossentropy": 2.2635756731033325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21735866367816925, + "step": 26308 + }, + { + "epoch": 0.5262, + "grad_norm": 1.828125, + "grad_norm_var": 0.008512115478515625, + "learning_rate": 0.0001, + "loss": 4.0985, + "loss/crossentropy": 1.8445220589637756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18094825744628906, + "step": 26310 + }, + { + "epoch": 0.52624, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007651519775390625, + "learning_rate": 0.0001, + "loss": 3.9866, + "loss/crossentropy": 2.174062728881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19913697242736816, + "step": 26312 + }, + { + "epoch": 0.52628, + "grad_norm": 1.953125, + "grad_norm_var": 0.008565266927083334, + "learning_rate": 0.0001, + "loss": 3.8556, + "loss/crossentropy": 2.1039488911628723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1815500482916832, + "step": 26314 + }, + { + "epoch": 0.52632, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008491770426432291, + "learning_rate": 0.0001, + "loss": 4.3284, + "loss/crossentropy": 2.134014844894409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855064690113068, + "step": 26316 + }, + { + "epoch": 0.52636, + "grad_norm": 1.875, + "grad_norm_var": 0.008050282796223959, + "learning_rate": 0.0001, + "loss": 3.9977, + "loss/crossentropy": 1.7635084390640259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16970381140708923, + "step": 26318 + }, + { + "epoch": 0.5264, + "grad_norm": 1.921875, + "grad_norm_var": 0.0078122456868489586, + "learning_rate": 0.0001, + "loss": 4.0349, + "loss/crossentropy": 2.4526535272598267, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23066413402557373, + "step": 26320 + }, + { + "epoch": 0.52644, + "grad_norm": 2.046875, + "grad_norm_var": 0.008210245768229167, + "learning_rate": 0.0001, + "loss": 4.1903, + "loss/crossentropy": 2.073263466358185, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19519560784101486, + "step": 26322 + }, + { + "epoch": 0.52648, + "grad_norm": 2.0625, + "grad_norm_var": 0.007401529947916667, + "learning_rate": 0.0001, + "loss": 3.9606, + "loss/crossentropy": 1.9899149537086487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18811341375112534, + "step": 26324 + }, + { + "epoch": 0.52652, + "grad_norm": 2.203125, + "grad_norm_var": 0.013899739583333333, + "learning_rate": 0.0001, + "loss": 3.9753, + "loss/crossentropy": 2.039364755153656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913507461547852, + "step": 26326 + }, + { + "epoch": 0.52656, + "grad_norm": 1.9609375, + "grad_norm_var": 0.014096832275390625, + "learning_rate": 0.0001, + "loss": 3.9944, + "loss/crossentropy": 2.076085150241852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21231256425380707, + "step": 26328 + }, + { + "epoch": 0.5266, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016331990559895832, + "learning_rate": 0.0001, + "loss": 3.8768, + "loss/crossentropy": 2.1222329139709473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18735431134700775, + "step": 26330 + }, + { + "epoch": 0.52664, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01644872029622396, + "learning_rate": 0.0001, + "loss": 4.206, + "loss/crossentropy": 2.490665316581726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19682377576828003, + "step": 26332 + }, + { + "epoch": 0.52668, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01778132120768229, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 2.2802486419677734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20574340224266052, + "step": 26334 + }, + { + "epoch": 0.52672, + "grad_norm": 1.8359375, + "grad_norm_var": 0.019925944010416665, + "learning_rate": 0.0001, + "loss": 3.9517, + "loss/crossentropy": 2.094264805316925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18822843581438065, + "step": 26336 + }, + { + "epoch": 0.52676, + "grad_norm": 2.0, + "grad_norm_var": 0.020151519775390626, + "learning_rate": 0.0001, + "loss": 3.7813, + "loss/crossentropy": 1.9270477294921875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1901463121175766, + "step": 26338 + }, + { + "epoch": 0.5268, + "grad_norm": 1.8671875, + "grad_norm_var": 0.018046061197916668, + "learning_rate": 0.0001, + "loss": 4.1199, + "loss/crossentropy": 2.1571450233459473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919422224164009, + "step": 26340 + }, + { + "epoch": 0.52684, + "grad_norm": 1.875, + "grad_norm_var": 0.008902994791666667, + "learning_rate": 0.0001, + "loss": 3.9607, + "loss/crossentropy": 1.9483349323272705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188851498067379, + "step": 26342 + }, + { + "epoch": 0.52688, + "grad_norm": 1.984375, + "grad_norm_var": 0.009401194254557292, + "learning_rate": 0.0001, + "loss": 3.9986, + "loss/crossentropy": 2.0212563276290894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506627321243286, + "step": 26344 + }, + { + "epoch": 0.52692, + "grad_norm": 1.953125, + "grad_norm_var": 0.004689280192057292, + "learning_rate": 0.0001, + "loss": 3.9598, + "loss/crossentropy": 1.9661552906036377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18422409892082214, + "step": 26346 + }, + { + "epoch": 0.52696, + "grad_norm": 1.953125, + "grad_norm_var": 0.0053212483723958336, + "learning_rate": 0.0001, + "loss": 3.851, + "loss/crossentropy": 2.0267988443374634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18864717334508896, + "step": 26348 + }, + { + "epoch": 0.527, + "grad_norm": 1.7734375, + "grad_norm_var": 0.006468709309895833, + "learning_rate": 0.0001, + "loss": 3.8491, + "loss/crossentropy": 1.874852180480957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17451918125152588, + "step": 26350 + }, + { + "epoch": 0.52704, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0056722005208333336, + "learning_rate": 0.0001, + "loss": 3.8164, + "loss/crossentropy": 1.9450770020484924, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19308660924434662, + "step": 26352 + }, + { + "epoch": 0.52708, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004589589436848959, + "learning_rate": 0.0001, + "loss": 3.9731, + "loss/crossentropy": 1.8964659571647644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18824025243520737, + "step": 26354 + }, + { + "epoch": 0.52712, + "grad_norm": 1.921875, + "grad_norm_var": 0.0045166015625, + "learning_rate": 0.0001, + "loss": 3.9693, + "loss/crossentropy": 1.856788158416748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17411265522241592, + "step": 26356 + }, + { + "epoch": 0.52716, + "grad_norm": 1.984375, + "grad_norm_var": 0.005098470052083333, + "learning_rate": 0.0001, + "loss": 3.963, + "loss/crossentropy": 1.6551550030708313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1649954915046692, + "step": 26358 + }, + { + "epoch": 0.5272, + "grad_norm": 1.9375, + "grad_norm_var": 0.005791982014973958, + "learning_rate": 0.0001, + "loss": 3.7678, + "loss/crossentropy": 1.9796473383903503, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1766190305352211, + "step": 26360 + }, + { + "epoch": 0.52724, + "grad_norm": 1.875, + "grad_norm_var": 0.0056955973307291664, + "learning_rate": 0.0001, + "loss": 3.9551, + "loss/crossentropy": 1.9023584723472595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18641258776187897, + "step": 26362 + }, + { + "epoch": 0.52728, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005069732666015625, + "learning_rate": 0.0001, + "loss": 4.1641, + "loss/crossentropy": 2.0332838892936707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18603450059890747, + "step": 26364 + }, + { + "epoch": 0.52732, + "grad_norm": 1.8515625, + "grad_norm_var": 0.003580474853515625, + "learning_rate": 0.0001, + "loss": 3.8799, + "loss/crossentropy": 2.0670453310012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904071420431137, + "step": 26366 + }, + { + "epoch": 0.52736, + "grad_norm": 1.84375, + "grad_norm_var": 0.004162343343098959, + "learning_rate": 0.0001, + "loss": 3.7713, + "loss/crossentropy": 1.9684784412384033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19187042117118835, + "step": 26368 + }, + { + "epoch": 0.5274, + "grad_norm": 1.859375, + "grad_norm_var": 0.003979237874348959, + "learning_rate": 0.0001, + "loss": 4.0018, + "loss/crossentropy": 1.9178830981254578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805613711476326, + "step": 26370 + }, + { + "epoch": 0.52744, + "grad_norm": 2.046875, + "grad_norm_var": 0.005496978759765625, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 2.094612956047058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2008626013994217, + "step": 26372 + }, + { + "epoch": 0.52748, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005102284749348958, + "learning_rate": 0.0001, + "loss": 3.8223, + "loss/crossentropy": 2.0617056488990784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19100335985422134, + "step": 26374 + }, + { + "epoch": 0.52752, + "grad_norm": 1.8984375, + "grad_norm_var": 0.00411376953125, + "learning_rate": 0.0001, + "loss": 3.8758, + "loss/crossentropy": 2.1675769090652466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890057846903801, + "step": 26376 + }, + { + "epoch": 0.52756, + "grad_norm": 2.0625, + "grad_norm_var": 0.005600738525390625, + "learning_rate": 0.0001, + "loss": 4.0096, + "loss/crossentropy": 2.245623469352722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20736746490001678, + "step": 26378 + }, + { + "epoch": 0.5276, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0055735270182291664, + "learning_rate": 0.0001, + "loss": 4.1111, + "loss/crossentropy": 2.107472836971283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996273696422577, + "step": 26380 + }, + { + "epoch": 0.52764, + "grad_norm": 1.8125, + "grad_norm_var": 0.006083170572916667, + "learning_rate": 0.0001, + "loss": 3.7678, + "loss/crossentropy": 1.6452732682228088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16541734337806702, + "step": 26382 + }, + { + "epoch": 0.52768, + "grad_norm": 2.1875, + "grad_norm_var": 0.020611317952473958, + "learning_rate": 0.0001, + "loss": 4.1566, + "loss/crossentropy": 2.0002782940864563, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.27171435952186584, + "step": 26384 + }, + { + "epoch": 0.52772, + "grad_norm": 2.0, + "grad_norm_var": 0.018888092041015624, + "learning_rate": 0.0001, + "loss": 4.2397, + "loss/crossentropy": 2.0969172716140747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20492591708898544, + "step": 26386 + }, + { + "epoch": 0.52776, + "grad_norm": 1.953125, + "grad_norm_var": 0.019510904947916668, + "learning_rate": 0.0001, + "loss": 3.8078, + "loss/crossentropy": 2.238045036792755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20098427683115005, + "step": 26388 + }, + { + "epoch": 0.5278, + "grad_norm": 1.8671875, + "grad_norm_var": 0.021022288004557292, + "learning_rate": 0.0001, + "loss": 4.0391, + "loss/crossentropy": 1.9958226680755615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19130828231573105, + "step": 26390 + }, + { + "epoch": 0.52784, + "grad_norm": 1.9609375, + "grad_norm_var": 0.020702107747395834, + "learning_rate": 0.0001, + "loss": 4.1161, + "loss/crossentropy": 2.176322102546692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20082970708608627, + "step": 26392 + }, + { + "epoch": 0.52788, + "grad_norm": 1.953125, + "grad_norm_var": 0.0212066650390625, + "learning_rate": 0.0001, + "loss": 4.0363, + "loss/crossentropy": 1.9507973790168762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17588113993406296, + "step": 26394 + }, + { + "epoch": 0.52792, + "grad_norm": 1.8203125, + "grad_norm_var": 0.022013346354166668, + "learning_rate": 0.0001, + "loss": 3.9508, + "loss/crossentropy": 2.0576651096343994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19712203741073608, + "step": 26396 + }, + { + "epoch": 0.52796, + "grad_norm": 1.7890625, + "grad_norm_var": 0.022663370768229166, + "learning_rate": 0.0001, + "loss": 3.9035, + "loss/crossentropy": 2.030472457408905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18470944464206696, + "step": 26398 + }, + { + "epoch": 0.528, + "grad_norm": 2.078125, + "grad_norm_var": 0.011378733317057292, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 2.0267462730407715, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19596907496452332, + "step": 26400 + }, + { + "epoch": 0.52804, + "grad_norm": 1.984375, + "grad_norm_var": 0.013927968343098958, + "learning_rate": 0.0001, + "loss": 3.7917, + "loss/crossentropy": 2.087961494922638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19787786155939102, + "step": 26402 + }, + { + "epoch": 0.52808, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0133941650390625, + "learning_rate": 0.0001, + "loss": 3.7384, + "loss/crossentropy": 1.6107558608055115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16004415601491928, + "step": 26404 + }, + { + "epoch": 0.52812, + "grad_norm": 2.015625, + "grad_norm_var": 0.010249837239583334, + "learning_rate": 0.0001, + "loss": 4.0522, + "loss/crossentropy": 1.8615078926086426, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997840330004692, + "step": 26406 + }, + { + "epoch": 0.52816, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 4.0445, + "loss/crossentropy": 2.165738344192505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002716213464737, + "step": 26408 + }, + { + "epoch": 0.5282, + "grad_norm": 1.828125, + "grad_norm_var": 0.009700520833333334, + "learning_rate": 0.0001, + "loss": 3.9418, + "loss/crossentropy": 2.039876639842987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19271554052829742, + "step": 26410 + }, + { + "epoch": 0.52824, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008763631184895834, + "learning_rate": 0.0001, + "loss": 3.9841, + "loss/crossentropy": 2.114060878753662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19228192418813705, + "step": 26412 + }, + { + "epoch": 0.52828, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006998697916666667, + "learning_rate": 0.0001, + "loss": 4.0182, + "loss/crossentropy": 2.1510127782821655, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19021940976381302, + "step": 26414 + }, + { + "epoch": 0.52832, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004986317952473959, + "learning_rate": 0.0001, + "loss": 3.9933, + "loss/crossentropy": 1.9514253735542297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19038037955760956, + "step": 26416 + }, + { + "epoch": 0.52836, + "grad_norm": 1.828125, + "grad_norm_var": 0.0032867431640625, + "learning_rate": 0.0001, + "loss": 3.9041, + "loss/crossentropy": 1.8573151230812073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17533313482999802, + "step": 26418 + }, + { + "epoch": 0.5284, + "grad_norm": 1.84375, + "grad_norm_var": 0.004084269205729167, + "learning_rate": 0.0001, + "loss": 4.1181, + "loss/crossentropy": 2.2349034547805786, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933293491601944, + "step": 26420 + }, + { + "epoch": 0.52844, + "grad_norm": 1.953125, + "grad_norm_var": 0.0031084696451822917, + "learning_rate": 0.0001, + "loss": 4.127, + "loss/crossentropy": 1.9790935516357422, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1930369809269905, + "step": 26422 + }, + { + "epoch": 0.52848, + "grad_norm": 1.890625, + "grad_norm_var": 0.002399698893229167, + "learning_rate": 0.0001, + "loss": 3.9531, + "loss/crossentropy": 1.9758057594299316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19405999034643173, + "step": 26424 + }, + { + "epoch": 0.52852, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0030637105305989583, + "learning_rate": 0.0001, + "loss": 4.0303, + "loss/crossentropy": 1.9666665196418762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19631757587194443, + "step": 26426 + }, + { + "epoch": 0.52856, + "grad_norm": 1.859375, + "grad_norm_var": 0.0037717183430989584, + "learning_rate": 0.0001, + "loss": 3.8257, + "loss/crossentropy": 1.7752289175987244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16831018775701523, + "step": 26428 + }, + { + "epoch": 0.5286, + "grad_norm": 2.015625, + "grad_norm_var": 0.005293782552083333, + "learning_rate": 0.0001, + "loss": 4.0879, + "loss/crossentropy": 2.1496593952178955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1875729113817215, + "step": 26430 + }, + { + "epoch": 0.52864, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005926259358723958, + "learning_rate": 0.0001, + "loss": 4.0407, + "loss/crossentropy": 1.9487584829330444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18598007410764694, + "step": 26432 + }, + { + "epoch": 0.52868, + "grad_norm": 1.859375, + "grad_norm_var": 0.009886678059895833, + "learning_rate": 0.0001, + "loss": 3.9064, + "loss/crossentropy": 1.8842872977256775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913917362689972, + "step": 26434 + }, + { + "epoch": 0.52872, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 4.1857, + "loss/crossentropy": 2.1598324179649353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19371075928211212, + "step": 26436 + }, + { + "epoch": 0.52876, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010228474934895834, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 1.993024468421936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384337216615677, + "step": 26438 + }, + { + "epoch": 0.5288, + "grad_norm": 1.8515625, + "grad_norm_var": 0.011631011962890625, + "learning_rate": 0.0001, + "loss": 3.9132, + "loss/crossentropy": 2.083270490169525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818510740995407, + "step": 26440 + }, + { + "epoch": 0.52884, + "grad_norm": 2.015625, + "grad_norm_var": 0.011761220296223958, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 2.136199116706848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19123348593711853, + "step": 26442 + }, + { + "epoch": 0.52888, + "grad_norm": 2.046875, + "grad_norm_var": 0.010335286458333334, + "learning_rate": 0.0001, + "loss": 4.2448, + "loss/crossentropy": 2.405175805091858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21493228524923325, + "step": 26444 + }, + { + "epoch": 0.52892, + "grad_norm": 1.84375, + "grad_norm_var": 0.010900624593098958, + "learning_rate": 0.0001, + "loss": 3.9793, + "loss/crossentropy": 2.0925610065460205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20184864103794098, + "step": 26446 + }, + { + "epoch": 0.52896, + "grad_norm": 1.90625, + "grad_norm_var": 0.012116495768229167, + "learning_rate": 0.0001, + "loss": 4.1213, + "loss/crossentropy": 2.1476112604141235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206947922706604, + "step": 26448 + }, + { + "epoch": 0.529, + "grad_norm": 1.84375, + "grad_norm_var": 0.009333292643229166, + "learning_rate": 0.0001, + "loss": 3.9592, + "loss/crossentropy": 1.7512850165367126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15776671469211578, + "step": 26450 + }, + { + "epoch": 0.52904, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009505208333333333, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 1.766721785068512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18328477442264557, + "step": 26452 + }, + { + "epoch": 0.52908, + "grad_norm": 1.890625, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 4.0028, + "loss/crossentropy": 1.9713833928108215, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19086521863937378, + "step": 26454 + }, + { + "epoch": 0.52912, + "grad_norm": 1.765625, + "grad_norm_var": 0.009212239583333334, + "learning_rate": 0.0001, + "loss": 3.7999, + "loss/crossentropy": 1.8375617861747742, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799859181046486, + "step": 26456 + }, + { + "epoch": 0.52916, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009368642171223959, + "learning_rate": 0.0001, + "loss": 4.0928, + "loss/crossentropy": 2.3689894676208496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21843338012695312, + "step": 26458 + }, + { + "epoch": 0.5292, + "grad_norm": 1.859375, + "grad_norm_var": 0.008896636962890624, + "learning_rate": 0.0001, + "loss": 3.8911, + "loss/crossentropy": 2.2842042446136475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20215284824371338, + "step": 26460 + }, + { + "epoch": 0.52924, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008405558268229167, + "learning_rate": 0.0001, + "loss": 3.7891, + "loss/crossentropy": 2.0660494565963745, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18394764512777328, + "step": 26462 + }, + { + "epoch": 0.52928, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009537506103515624, + "learning_rate": 0.0001, + "loss": 4.097, + "loss/crossentropy": 2.101405918598175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051784247159958, + "step": 26464 + }, + { + "epoch": 0.52932, + "grad_norm": 1.7890625, + "grad_norm_var": 0.00943603515625, + "learning_rate": 0.0001, + "loss": 3.8075, + "loss/crossentropy": 1.8293753862380981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18773258477449417, + "step": 26466 + }, + { + "epoch": 0.52936, + "grad_norm": 1.703125, + "grad_norm_var": 0.011115519205729167, + "learning_rate": 0.0001, + "loss": 3.5724, + "loss/crossentropy": 1.9455705881118774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17760945856571198, + "step": 26468 + }, + { + "epoch": 0.5294, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011508941650390625, + "learning_rate": 0.0001, + "loss": 3.9305, + "loss/crossentropy": 1.750687837600708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18026860058307648, + "step": 26470 + }, + { + "epoch": 0.52944, + "grad_norm": 1.921875, + "grad_norm_var": 0.010375722249348959, + "learning_rate": 0.0001, + "loss": 4.0931, + "loss/crossentropy": 2.189963698387146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19402040541172028, + "step": 26472 + }, + { + "epoch": 0.52948, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009549713134765625, + "learning_rate": 0.0001, + "loss": 4.2266, + "loss/crossentropy": 1.8729415535926819, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20855195820331573, + "step": 26474 + }, + { + "epoch": 0.52952, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0097412109375, + "learning_rate": 0.0001, + "loss": 3.5935, + "loss/crossentropy": 1.8071807622909546, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16968178004026413, + "step": 26476 + }, + { + "epoch": 0.52956, + "grad_norm": 1.9375, + "grad_norm_var": 0.010257720947265625, + "learning_rate": 0.0001, + "loss": 3.9832, + "loss/crossentropy": 2.241515278816223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20128827542066574, + "step": 26478 + }, + { + "epoch": 0.5296, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0062978108723958336, + "learning_rate": 0.0001, + "loss": 3.9792, + "loss/crossentropy": 2.2276766300201416, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.201935775578022, + "step": 26480 + }, + { + "epoch": 0.52964, + "grad_norm": 2.015625, + "grad_norm_var": 0.0061604817708333336, + "learning_rate": 0.0001, + "loss": 3.8511, + "loss/crossentropy": 2.2751588821411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21366826444864273, + "step": 26482 + }, + { + "epoch": 0.52968, + "grad_norm": 1.8984375, + "grad_norm_var": 0.00335693359375, + "learning_rate": 0.0001, + "loss": 4.1496, + "loss/crossentropy": 2.2164629697799683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2294466495513916, + "step": 26484 + }, + { + "epoch": 0.52972, + "grad_norm": 1.6328125, + "grad_norm_var": 0.008131663004557291, + "learning_rate": 0.0001, + "loss": 3.7879, + "loss/crossentropy": 1.8609917163848877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1908324807882309, + "step": 26486 + }, + { + "epoch": 0.52976, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0085601806640625, + "learning_rate": 0.0001, + "loss": 4.066, + "loss/crossentropy": 2.050028145313263, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18777167797088623, + "step": 26488 + }, + { + "epoch": 0.5298, + "grad_norm": 2.078125, + "grad_norm_var": 0.011329905192057291, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 2.398528814315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22428229451179504, + "step": 26490 + }, + { + "epoch": 0.52984, + "grad_norm": 2.078125, + "grad_norm_var": 0.012889607747395834, + "learning_rate": 0.0001, + "loss": 3.8208, + "loss/crossentropy": 2.0193370580673218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19870522618293762, + "step": 26492 + }, + { + "epoch": 0.52988, + "grad_norm": 1.84375, + "grad_norm_var": 0.012369791666666666, + "learning_rate": 0.0001, + "loss": 3.9186, + "loss/crossentropy": 2.1982868313789368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19849159568548203, + "step": 26494 + }, + { + "epoch": 0.52992, + "grad_norm": 1.8203125, + "grad_norm_var": 0.013405100504557291, + "learning_rate": 0.0001, + "loss": 3.8017, + "loss/crossentropy": 1.8915838599205017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16818702220916748, + "step": 26496 + }, + { + "epoch": 0.52996, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012596638997395833, + "learning_rate": 0.0001, + "loss": 3.8179, + "loss/crossentropy": 1.9891789555549622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18337423354387283, + "step": 26498 + }, + { + "epoch": 0.53, + "grad_norm": 2.125, + "grad_norm_var": 0.015571848551432291, + "learning_rate": 0.0001, + "loss": 4.1336, + "loss/crossentropy": 2.4187783002853394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20852266252040863, + "step": 26500 + }, + { + "epoch": 0.53004, + "grad_norm": 1.7734375, + "grad_norm_var": 0.01514892578125, + "learning_rate": 0.0001, + "loss": 3.4934, + "loss/crossentropy": 1.742977499961853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17425427585840225, + "step": 26502 + }, + { + "epoch": 0.53008, + "grad_norm": 1.875, + "grad_norm_var": 0.017000325520833335, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 1.8980122208595276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16612428426742554, + "step": 26504 + }, + { + "epoch": 0.53012, + "grad_norm": 1.8828125, + "grad_norm_var": 0.013993072509765624, + "learning_rate": 0.0001, + "loss": 3.9807, + "loss/crossentropy": 1.7788900136947632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15797217935323715, + "step": 26506 + }, + { + "epoch": 0.53016, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011578114827473958, + "learning_rate": 0.0001, + "loss": 4.0657, + "loss/crossentropy": 1.8558028936386108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18111377209424973, + "step": 26508 + }, + { + "epoch": 0.5302, + "grad_norm": 2.03125, + "grad_norm_var": 0.012886555989583333, + "learning_rate": 0.0001, + "loss": 4.1489, + "loss/crossentropy": 2.2158637046813965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20664983242750168, + "step": 26510 + }, + { + "epoch": 0.53024, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012347157796223958, + "learning_rate": 0.0001, + "loss": 4.1, + "loss/crossentropy": 2.159493863582611, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18167608231306076, + "step": 26512 + }, + { + "epoch": 0.53028, + "grad_norm": 1.921875, + "grad_norm_var": 0.012717437744140626, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 2.328041195869446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777345836162567, + "step": 26514 + }, + { + "epoch": 0.53032, + "grad_norm": 1.890625, + "grad_norm_var": 0.0093902587890625, + "learning_rate": 0.0001, + "loss": 3.7341, + "loss/crossentropy": 1.724794864654541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1699744239449501, + "step": 26516 + }, + { + "epoch": 0.53036, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006198883056640625, + "learning_rate": 0.0001, + "loss": 4.0218, + "loss/crossentropy": 1.96088445186615, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1849687099456787, + "step": 26518 + }, + { + "epoch": 0.5304, + "grad_norm": 1.875, + "grad_norm_var": 0.0041656494140625, + "learning_rate": 0.0001, + "loss": 3.9326, + "loss/crossentropy": 2.150395691394806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20122019946575165, + "step": 26520 + }, + { + "epoch": 0.53044, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004296875, + "learning_rate": 0.0001, + "loss": 3.9209, + "loss/crossentropy": 2.016223907470703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18350518494844437, + "step": 26522 + }, + { + "epoch": 0.53048, + "grad_norm": 2.0, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 4.3091, + "loss/crossentropy": 2.334647059440613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20854850858449936, + "step": 26524 + }, + { + "epoch": 0.53052, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0053059895833333336, + "learning_rate": 0.0001, + "loss": 3.8457, + "loss/crossentropy": 1.928149163722992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18799126893281937, + "step": 26526 + }, + { + "epoch": 0.53056, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005224355061848958, + "learning_rate": 0.0001, + "loss": 3.8411, + "loss/crossentropy": 2.297117590904236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20247841626405716, + "step": 26528 + }, + { + "epoch": 0.5306, + "grad_norm": 2.15625, + "grad_norm_var": 0.025925445556640624, + "learning_rate": 0.0001, + "loss": 3.8426, + "loss/crossentropy": 1.6292110085487366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1556340456008911, + "step": 26530 + }, + { + "epoch": 0.53064, + "grad_norm": 1.9765625, + "grad_norm_var": 0.023819732666015624, + "learning_rate": 0.0001, + "loss": 3.9924, + "loss/crossentropy": 2.3576741218566895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22244800627231598, + "step": 26532 + }, + { + "epoch": 0.53068, + "grad_norm": 1.828125, + "grad_norm_var": 0.024812825520833335, + "learning_rate": 0.0001, + "loss": 3.8956, + "loss/crossentropy": 2.412485718727112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838645040988922, + "step": 26534 + }, + { + "epoch": 0.53072, + "grad_norm": 1.84375, + "grad_norm_var": 0.023276519775390626, + "learning_rate": 0.0001, + "loss": 3.9826, + "loss/crossentropy": 2.1326744556427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19059032201766968, + "step": 26536 + }, + { + "epoch": 0.53076, + "grad_norm": 1.8125, + "grad_norm_var": 0.023726399739583334, + "learning_rate": 0.0001, + "loss": 3.8374, + "loss/crossentropy": 1.8022708892822266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17919087409973145, + "step": 26538 + }, + { + "epoch": 0.5308, + "grad_norm": 1.9375, + "grad_norm_var": 0.023563385009765625, + "learning_rate": 0.0001, + "loss": 3.8724, + "loss/crossentropy": 2.1001075506210327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887490525841713, + "step": 26540 + }, + { + "epoch": 0.53084, + "grad_norm": 1.8359375, + "grad_norm_var": 0.023860677083333334, + "learning_rate": 0.0001, + "loss": 3.8678, + "loss/crossentropy": 1.9068372249603271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752641201019287, + "step": 26542 + }, + { + "epoch": 0.53088, + "grad_norm": 1.890625, + "grad_norm_var": 0.023787180582682293, + "learning_rate": 0.0001, + "loss": 4.072, + "loss/crossentropy": 1.8628905415534973, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17757906019687653, + "step": 26544 + }, + { + "epoch": 0.53092, + "grad_norm": 1.921875, + "grad_norm_var": 0.0041196187337239586, + "learning_rate": 0.0001, + "loss": 4.1044, + "loss/crossentropy": 1.9989616870880127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890716478228569, + "step": 26546 + }, + { + "epoch": 0.53096, + "grad_norm": 1.78125, + "grad_norm_var": 0.004624176025390625, + "learning_rate": 0.0001, + "loss": 3.8013, + "loss/crossentropy": 2.063919186592102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18740086257457733, + "step": 26548 + }, + { + "epoch": 0.531, + "grad_norm": 1.953125, + "grad_norm_var": 0.004355621337890625, + "learning_rate": 0.0001, + "loss": 4.0148, + "loss/crossentropy": 1.9364339709281921, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18698105216026306, + "step": 26550 + }, + { + "epoch": 0.53104, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0037859598795572918, + "learning_rate": 0.0001, + "loss": 4.1554, + "loss/crossentropy": 2.063125193119049, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881653591990471, + "step": 26552 + }, + { + "epoch": 0.53108, + "grad_norm": 1.8515625, + "grad_norm_var": 0.003570302327473958, + "learning_rate": 0.0001, + "loss": 3.9373, + "loss/crossentropy": 1.9567288160324097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19221889972686768, + "step": 26554 + }, + { + "epoch": 0.53112, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004367828369140625, + "learning_rate": 0.0001, + "loss": 4.0455, + "loss/crossentropy": 2.177999496459961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061704844236374, + "step": 26556 + }, + { + "epoch": 0.53116, + "grad_norm": 1.7421875, + "grad_norm_var": 0.006052398681640625, + "learning_rate": 0.0001, + "loss": 3.7498, + "loss/crossentropy": 1.956209123134613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16483356803655624, + "step": 26558 + }, + { + "epoch": 0.5312, + "grad_norm": 1.84375, + "grad_norm_var": 0.006082916259765625, + "learning_rate": 0.0001, + "loss": 3.9295, + "loss/crossentropy": 1.778043806552887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15840457379817963, + "step": 26560 + }, + { + "epoch": 0.53124, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0064849853515625, + "learning_rate": 0.0001, + "loss": 4.2495, + "loss/crossentropy": 2.5685311555862427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21522855013608932, + "step": 26562 + }, + { + "epoch": 0.53128, + "grad_norm": 1.875, + "grad_norm_var": 0.006259918212890625, + "learning_rate": 0.0001, + "loss": 4.0887, + "loss/crossentropy": 2.054154694080353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18618051707744598, + "step": 26564 + }, + { + "epoch": 0.53132, + "grad_norm": 2.046875, + "grad_norm_var": 0.011114247639973958, + "learning_rate": 0.0001, + "loss": 4.1652, + "loss/crossentropy": 2.0676876306533813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19200553745031357, + "step": 26566 + }, + { + "epoch": 0.53136, + "grad_norm": 1.921875, + "grad_norm_var": 0.010817209879557291, + "learning_rate": 0.0001, + "loss": 3.9219, + "loss/crossentropy": 2.081417202949524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19634481519460678, + "step": 26568 + }, + { + "epoch": 0.5314, + "grad_norm": 1.953125, + "grad_norm_var": 0.010400136311848959, + "learning_rate": 0.0001, + "loss": 3.911, + "loss/crossentropy": 2.0342991948127747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060657918453217, + "step": 26570 + }, + { + "epoch": 0.53144, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012132771809895833, + "learning_rate": 0.0001, + "loss": 3.8132, + "loss/crossentropy": 2.2454493641853333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17674832791090012, + "step": 26572 + }, + { + "epoch": 0.53148, + "grad_norm": 1.921875, + "grad_norm_var": 0.010927073160807292, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 2.1805293560028076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19897499680519104, + "step": 26574 + }, + { + "epoch": 0.53152, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0107086181640625, + "learning_rate": 0.0001, + "loss": 4.0728, + "loss/crossentropy": 2.074836492538452, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20241302251815796, + "step": 26576 + }, + { + "epoch": 0.53156, + "grad_norm": 1.96875, + "grad_norm_var": 0.010380045572916666, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.341915488243103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814413785934448, + "step": 26578 + }, + { + "epoch": 0.5316, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010994211832682291, + "learning_rate": 0.0001, + "loss": 3.8701, + "loss/crossentropy": 1.690186619758606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16645215451717377, + "step": 26580 + }, + { + "epoch": 0.53164, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005614217122395833, + "learning_rate": 0.0001, + "loss": 3.7241, + "loss/crossentropy": 2.0770626068115234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19572289288043976, + "step": 26582 + }, + { + "epoch": 0.53168, + "grad_norm": 1.859375, + "grad_norm_var": 0.006351470947265625, + "learning_rate": 0.0001, + "loss": 4.0342, + "loss/crossentropy": 2.411499500274658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21515025198459625, + "step": 26584 + }, + { + "epoch": 0.53172, + "grad_norm": 1.84375, + "grad_norm_var": 0.0062558492024739586, + "learning_rate": 0.0001, + "loss": 4.1189, + "loss/crossentropy": 2.0363634824752808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18869787454605103, + "step": 26586 + }, + { + "epoch": 0.53176, + "grad_norm": 1.8359375, + "grad_norm_var": 0.004858144124348958, + "learning_rate": 0.0001, + "loss": 4.049, + "loss/crossentropy": 2.203519582748413, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20460474491119385, + "step": 26588 + }, + { + "epoch": 0.5318, + "grad_norm": 2.0, + "grad_norm_var": 0.004491170247395833, + "learning_rate": 0.0001, + "loss": 3.8633, + "loss/crossentropy": 1.9883779883384705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21096121519804, + "step": 26590 + }, + { + "epoch": 0.53184, + "grad_norm": 1.96875, + "grad_norm_var": 0.005332183837890625, + "learning_rate": 0.0001, + "loss": 4.1148, + "loss/crossentropy": 2.209134042263031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23714564740657806, + "step": 26592 + }, + { + "epoch": 0.53188, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.3128318786621094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20354144275188446, + "step": 26594 + }, + { + "epoch": 0.53192, + "grad_norm": 2.09375, + "grad_norm_var": 0.006624348958333333, + "learning_rate": 0.0001, + "loss": 4.2532, + "loss/crossentropy": 2.2282800674438477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2072424292564392, + "step": 26596 + }, + { + "epoch": 0.53196, + "grad_norm": 1.90625, + "grad_norm_var": 0.0056111653645833336, + "learning_rate": 0.0001, + "loss": 4.0443, + "loss/crossentropy": 2.086778998374939, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18629200011491776, + "step": 26598 + }, + { + "epoch": 0.532, + "grad_norm": 1.8359375, + "grad_norm_var": 0.005773671468098958, + "learning_rate": 0.0001, + "loss": 3.8206, + "loss/crossentropy": 2.12203449010849, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955840140581131, + "step": 26600 + }, + { + "epoch": 0.53204, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007008616129557292, + "learning_rate": 0.0001, + "loss": 3.7792, + "loss/crossentropy": 1.987316608428955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18615661561489105, + "step": 26602 + }, + { + "epoch": 0.53208, + "grad_norm": 1.9375, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 4.1107, + "loss/crossentropy": 2.0519716143608093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968577429652214, + "step": 26604 + }, + { + "epoch": 0.53212, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007020823160807292, + "learning_rate": 0.0001, + "loss": 4.1097, + "loss/crossentropy": 2.2337719202041626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19842206686735153, + "step": 26606 + }, + { + "epoch": 0.53216, + "grad_norm": 1.921875, + "grad_norm_var": 0.0071489969889322914, + "learning_rate": 0.0001, + "loss": 3.7935, + "loss/crossentropy": 2.2014262080192566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2037125900387764, + "step": 26608 + }, + { + "epoch": 0.5322, + "grad_norm": 2.28125, + "grad_norm_var": 0.0175689697265625, + "learning_rate": 0.0001, + "loss": 3.8532, + "loss/crossentropy": 1.7964777946472168, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17876359075307846, + "step": 26610 + }, + { + "epoch": 0.53224, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0155914306640625, + "learning_rate": 0.0001, + "loss": 3.9521, + "loss/crossentropy": 1.9710991978645325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18646740168333054, + "step": 26612 + }, + { + "epoch": 0.53228, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016495513916015624, + "learning_rate": 0.0001, + "loss": 3.8435, + "loss/crossentropy": 2.039173901081085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19768649339675903, + "step": 26614 + }, + { + "epoch": 0.53232, + "grad_norm": 1.984375, + "grad_norm_var": 0.0170318603515625, + "learning_rate": 0.0001, + "loss": 3.9751, + "loss/crossentropy": 2.2037198543548584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2104790136218071, + "step": 26616 + }, + { + "epoch": 0.53236, + "grad_norm": 2.0, + "grad_norm_var": 0.01741943359375, + "learning_rate": 0.0001, + "loss": 3.9441, + "loss/crossentropy": 2.1893996596336365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21086306124925613, + "step": 26618 + }, + { + "epoch": 0.5324, + "grad_norm": 1.84375, + "grad_norm_var": 0.0174560546875, + "learning_rate": 0.0001, + "loss": 3.9598, + "loss/crossentropy": 2.0324689745903015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20213106274604797, + "step": 26620 + }, + { + "epoch": 0.53244, + "grad_norm": 1.984375, + "grad_norm_var": 0.018244425455729168, + "learning_rate": 0.0001, + "loss": 3.8941, + "loss/crossentropy": 2.191510498523712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20318301022052765, + "step": 26622 + }, + { + "epoch": 0.53248, + "grad_norm": 2.046875, + "grad_norm_var": 0.018741861979166666, + "learning_rate": 0.0001, + "loss": 4.131, + "loss/crossentropy": 2.2092620134353638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033141329884529, + "step": 26624 + }, + { + "epoch": 0.53252, + "grad_norm": 1.875, + "grad_norm_var": 0.008796946207682291, + "learning_rate": 0.0001, + "loss": 3.8481, + "loss/crossentropy": 2.078882932662964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1927858516573906, + "step": 26626 + }, + { + "epoch": 0.53256, + "grad_norm": 2.109375, + "grad_norm_var": 0.010599517822265625, + "learning_rate": 0.0001, + "loss": 4.3066, + "loss/crossentropy": 2.1759738326072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21904072910547256, + "step": 26628 + }, + { + "epoch": 0.5326, + "grad_norm": 1.890625, + "grad_norm_var": 0.01015625, + "learning_rate": 0.0001, + "loss": 3.7002, + "loss/crossentropy": 2.0541751980781555, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17661282420158386, + "step": 26630 + }, + { + "epoch": 0.53264, + "grad_norm": 1.9375, + "grad_norm_var": 0.010394032796223958, + "learning_rate": 0.0001, + "loss": 3.7789, + "loss/crossentropy": 1.8126670122146606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16475193202495575, + "step": 26632 + }, + { + "epoch": 0.53268, + "grad_norm": 1.984375, + "grad_norm_var": 0.009065500895182292, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 1.8467394709587097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17031905055046082, + "step": 26634 + }, + { + "epoch": 0.53272, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008516184488932292, + "learning_rate": 0.0001, + "loss": 3.8392, + "loss/crossentropy": 1.7537733912467957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17307621240615845, + "step": 26636 + }, + { + "epoch": 0.53276, + "grad_norm": 2.078125, + "grad_norm_var": 0.009273274739583334, + "learning_rate": 0.0001, + "loss": 4.2491, + "loss/crossentropy": 2.2367827892303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301713585853577, + "step": 26638 + }, + { + "epoch": 0.5328, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009419504801432292, + "learning_rate": 0.0001, + "loss": 3.9927, + "loss/crossentropy": 1.9437520503997803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19594744592905045, + "step": 26640 + }, + { + "epoch": 0.53284, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0102691650390625, + "learning_rate": 0.0001, + "loss": 4.1308, + "loss/crossentropy": 1.9712047576904297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2043192759156227, + "step": 26642 + }, + { + "epoch": 0.53288, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0079498291015625, + "learning_rate": 0.0001, + "loss": 3.916, + "loss/crossentropy": 1.951958179473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2054751068353653, + "step": 26644 + }, + { + "epoch": 0.53292, + "grad_norm": 1.890625, + "grad_norm_var": 0.0067291259765625, + "learning_rate": 0.0001, + "loss": 3.929, + "loss/crossentropy": 2.0811157822608948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19759366661310196, + "step": 26646 + }, + { + "epoch": 0.53296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.06748428344726562, + "learning_rate": 0.0001, + "loss": 4.0009, + "loss/crossentropy": 2.0309654474258423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1910555213689804, + "step": 26648 + }, + { + "epoch": 0.533, + "grad_norm": 1.78125, + "grad_norm_var": 0.07044448852539062, + "learning_rate": 0.0001, + "loss": 3.7709, + "loss/crossentropy": 2.003150999546051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19752421230077744, + "step": 26650 + }, + { + "epoch": 0.53304, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0694091796875, + "learning_rate": 0.0001, + "loss": 3.9324, + "loss/crossentropy": 1.788662612438202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17759086191654205, + "step": 26652 + }, + { + "epoch": 0.53308, + "grad_norm": 1.859375, + "grad_norm_var": 0.07455240885416667, + "learning_rate": 0.0001, + "loss": 3.835, + "loss/crossentropy": 1.7692713737487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16581754386425018, + "step": 26654 + }, + { + "epoch": 0.53312, + "grad_norm": 1.859375, + "grad_norm_var": 0.07297337849934896, + "learning_rate": 0.0001, + "loss": 3.798, + "loss/crossentropy": 1.8395215272903442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18920384347438812, + "step": 26656 + }, + { + "epoch": 0.53316, + "grad_norm": 1.7734375, + "grad_norm_var": 0.07563247680664062, + "learning_rate": 0.0001, + "loss": 4.0133, + "loss/crossentropy": 2.0375067591667175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19706164300441742, + "step": 26658 + }, + { + "epoch": 0.5332, + "grad_norm": 2.0, + "grad_norm_var": 0.076025390625, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 2.101609468460083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20092131942510605, + "step": 26660 + }, + { + "epoch": 0.53324, + "grad_norm": 1.8984375, + "grad_norm_var": 0.07556864420572916, + "learning_rate": 0.0001, + "loss": 3.8518, + "loss/crossentropy": 1.9724875092506409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19354254007339478, + "step": 26662 + }, + { + "epoch": 0.53328, + "grad_norm": 1.828125, + "grad_norm_var": 0.015000152587890624, + "learning_rate": 0.0001, + "loss": 4.199, + "loss/crossentropy": 1.9491792917251587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21028251200914383, + "step": 26664 + }, + { + "epoch": 0.53332, + "grad_norm": 2.03125, + "grad_norm_var": 0.012133534749348958, + "learning_rate": 0.0001, + "loss": 3.8784, + "loss/crossentropy": 2.0189873576164246, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20224104076623917, + "step": 26666 + }, + { + "epoch": 0.53336, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01177978515625, + "learning_rate": 0.0001, + "loss": 3.6177, + "loss/crossentropy": 2.0183663368225098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20128542184829712, + "step": 26668 + }, + { + "epoch": 0.5334, + "grad_norm": 1.8125, + "grad_norm_var": 0.01939697265625, + "learning_rate": 0.0001, + "loss": 4.2613, + "loss/crossentropy": 2.323311448097229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202368974685669, + "step": 26670 + }, + { + "epoch": 0.53344, + "grad_norm": 2.0, + "grad_norm_var": 0.01764094034830729, + "learning_rate": 0.0001, + "loss": 4.2713, + "loss/crossentropy": 2.14883291721344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21048131585121155, + "step": 26672 + }, + { + "epoch": 0.53348, + "grad_norm": 1.96875, + "grad_norm_var": 0.015313466389973959, + "learning_rate": 0.0001, + "loss": 4.2169, + "loss/crossentropy": 2.0525609254837036, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19780127704143524, + "step": 26674 + }, + { + "epoch": 0.53352, + "grad_norm": 1.7734375, + "grad_norm_var": 0.016900380452473957, + "learning_rate": 0.0001, + "loss": 4.0969, + "loss/crossentropy": 2.125749707221985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18475912511348724, + "step": 26676 + }, + { + "epoch": 0.53356, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016044108072916667, + "learning_rate": 0.0001, + "loss": 4.1139, + "loss/crossentropy": 2.3169859647750854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21877627819776535, + "step": 26678 + }, + { + "epoch": 0.5336, + "grad_norm": 1.984375, + "grad_norm_var": 0.014525349934895833, + "learning_rate": 0.0001, + "loss": 3.9182, + "loss/crossentropy": 2.263182818889618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20379970222711563, + "step": 26680 + }, + { + "epoch": 0.53364, + "grad_norm": 1.8359375, + "grad_norm_var": 0.014737701416015625, + "learning_rate": 0.0001, + "loss": 4.1722, + "loss/crossentropy": 2.2744826078414917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20668719708919525, + "step": 26682 + }, + { + "epoch": 0.53368, + "grad_norm": 1.8828125, + "grad_norm_var": 0.014872233072916666, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 2.2624911665916443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010696530342102, + "step": 26684 + }, + { + "epoch": 0.53372, + "grad_norm": 1.875, + "grad_norm_var": 0.005456288655598958, + "learning_rate": 0.0001, + "loss": 3.7412, + "loss/crossentropy": 2.2623918056488037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20084036886692047, + "step": 26686 + }, + { + "epoch": 0.53376, + "grad_norm": 2.1875, + "grad_norm_var": 0.013960774739583333, + "learning_rate": 0.0001, + "loss": 4.2368, + "loss/crossentropy": 2.116790533065796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22413402050733566, + "step": 26688 + }, + { + "epoch": 0.5338, + "grad_norm": 1.90625, + "grad_norm_var": 0.013881174723307292, + "learning_rate": 0.0001, + "loss": 3.8506, + "loss/crossentropy": 1.9280251860618591, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19084317982196808, + "step": 26690 + }, + { + "epoch": 0.53384, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013490549723307292, + "learning_rate": 0.0001, + "loss": 3.727, + "loss/crossentropy": 1.8203374743461609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16682762652635574, + "step": 26692 + }, + { + "epoch": 0.53388, + "grad_norm": 1.9296875, + "grad_norm_var": 0.013337961832682292, + "learning_rate": 0.0001, + "loss": 3.7433, + "loss/crossentropy": 1.9362438321113586, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947326734662056, + "step": 26694 + }, + { + "epoch": 0.53392, + "grad_norm": 1.9375, + "grad_norm_var": 0.015148671468098958, + "learning_rate": 0.0001, + "loss": 4.1108, + "loss/crossentropy": 1.945202112197876, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2301146537065506, + "step": 26696 + }, + { + "epoch": 0.53396, + "grad_norm": 1.78125, + "grad_norm_var": 0.016087849934895832, + "learning_rate": 0.0001, + "loss": 3.8197, + "loss/crossentropy": 1.986995279788971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18129996210336685, + "step": 26698 + }, + { + "epoch": 0.534, + "grad_norm": 1.9375, + "grad_norm_var": 0.01623713175455729, + "learning_rate": 0.0001, + "loss": 3.9488, + "loss/crossentropy": 2.157664656639099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106902152299881, + "step": 26700 + }, + { + "epoch": 0.53404, + "grad_norm": 2.09375, + "grad_norm_var": 0.014839680989583333, + "learning_rate": 0.0001, + "loss": 4.1374, + "loss/crossentropy": 1.9703331589698792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21851468831300735, + "step": 26702 + }, + { + "epoch": 0.53408, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008160146077473958, + "learning_rate": 0.0001, + "loss": 3.9243, + "loss/crossentropy": 2.1339242458343506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987123116850853, + "step": 26704 + }, + { + "epoch": 0.53412, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0085357666015625, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.9810134768486023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18423764407634735, + "step": 26706 + }, + { + "epoch": 0.53416, + "grad_norm": 1.7890625, + "grad_norm_var": 0.008695475260416667, + "learning_rate": 0.0001, + "loss": 4.0445, + "loss/crossentropy": 1.8842402696609497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19414211809635162, + "step": 26708 + }, + { + "epoch": 0.5342, + "grad_norm": 2.0625, + "grad_norm_var": 0.0121490478515625, + "learning_rate": 0.0001, + "loss": 4.0123, + "loss/crossentropy": 2.0668699741363525, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19577328860759735, + "step": 26710 + }, + { + "epoch": 0.53424, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009276326497395833, + "learning_rate": 0.0001, + "loss": 3.913, + "loss/crossentropy": 2.2768534421920776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21711596846580505, + "step": 26712 + }, + { + "epoch": 0.53428, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008872222900390626, + "learning_rate": 0.0001, + "loss": 4.0675, + "loss/crossentropy": 2.1378949880599976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20244380831718445, + "step": 26714 + }, + { + "epoch": 0.53432, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009798177083333333, + "learning_rate": 0.0001, + "loss": 4.0259, + "loss/crossentropy": 2.4403127431869507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2065192013978958, + "step": 26716 + }, + { + "epoch": 0.53436, + "grad_norm": 1.828125, + "grad_norm_var": 0.008324178059895833, + "learning_rate": 0.0001, + "loss": 3.8731, + "loss/crossentropy": 1.8811705708503723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881936937570572, + "step": 26718 + }, + { + "epoch": 0.5344, + "grad_norm": 1.921875, + "grad_norm_var": 0.010682932535807292, + "learning_rate": 0.0001, + "loss": 4.3268, + "loss/crossentropy": 2.331605315208435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20130542665719986, + "step": 26720 + }, + { + "epoch": 0.53444, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011763509114583333, + "learning_rate": 0.0001, + "loss": 3.6066, + "loss/crossentropy": 1.521777868270874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1611887440085411, + "step": 26722 + }, + { + "epoch": 0.53448, + "grad_norm": 2.0625, + "grad_norm_var": 0.011905924479166666, + "learning_rate": 0.0001, + "loss": 4.0351, + "loss/crossentropy": 2.0702260732650757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21350041031837463, + "step": 26724 + }, + { + "epoch": 0.53452, + "grad_norm": 2.03125, + "grad_norm_var": 0.008760579427083333, + "learning_rate": 0.0001, + "loss": 4.2144, + "loss/crossentropy": 2.2656288146972656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1988365724682808, + "step": 26726 + }, + { + "epoch": 0.53456, + "grad_norm": 2.046875, + "grad_norm_var": 0.008821614583333333, + "learning_rate": 0.0001, + "loss": 3.9917, + "loss/crossentropy": 1.8514603972434998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19230767339468002, + "step": 26728 + }, + { + "epoch": 0.5346, + "grad_norm": 1.96875, + "grad_norm_var": 0.0082672119140625, + "learning_rate": 0.0001, + "loss": 3.6906, + "loss/crossentropy": 1.682263970375061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245991110801697, + "step": 26730 + }, + { + "epoch": 0.53464, + "grad_norm": 1.84375, + "grad_norm_var": 0.008778635660807292, + "learning_rate": 0.0001, + "loss": 3.9556, + "loss/crossentropy": 2.0426923036575317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19547611474990845, + "step": 26732 + }, + { + "epoch": 0.53468, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007826487223307291, + "learning_rate": 0.0001, + "loss": 3.8705, + "loss/crossentropy": 2.0580697059631348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17934934794902802, + "step": 26734 + }, + { + "epoch": 0.53472, + "grad_norm": 2.015625, + "grad_norm_var": 0.0060791015625, + "learning_rate": 0.0001, + "loss": 4.0613, + "loss/crossentropy": 2.2542420625686646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21669819951057434, + "step": 26736 + }, + { + "epoch": 0.53476, + "grad_norm": 1.84375, + "grad_norm_var": 0.004740142822265625, + "learning_rate": 0.0001, + "loss": 3.8354, + "loss/crossentropy": 2.0720590949058533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18977323174476624, + "step": 26738 + }, + { + "epoch": 0.5348, + "grad_norm": 1.875, + "grad_norm_var": 0.007134755452473958, + "learning_rate": 0.0001, + "loss": 3.7576, + "loss/crossentropy": 2.1534887552261353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1833745241165161, + "step": 26740 + }, + { + "epoch": 0.53484, + "grad_norm": 2.015625, + "grad_norm_var": 0.008381907145182292, + "learning_rate": 0.0001, + "loss": 3.9765, + "loss/crossentropy": 1.4169192910194397, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14363554865121841, + "step": 26742 + }, + { + "epoch": 0.53488, + "grad_norm": 1.953125, + "grad_norm_var": 0.007340240478515625, + "learning_rate": 0.0001, + "loss": 4.046, + "loss/crossentropy": 2.149993062019348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20160876959562302, + "step": 26744 + }, + { + "epoch": 0.53492, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007433827718098958, + "learning_rate": 0.0001, + "loss": 4.0038, + "loss/crossentropy": 2.032483458518982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18361316621303558, + "step": 26746 + }, + { + "epoch": 0.53496, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007979329427083333, + "learning_rate": 0.0001, + "loss": 4.0003, + "loss/crossentropy": 2.253453016281128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20721124857664108, + "step": 26748 + }, + { + "epoch": 0.535, + "grad_norm": 1.796875, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 3.983, + "loss/crossentropy": 1.8833884000778198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19324089586734772, + "step": 26750 + }, + { + "epoch": 0.53504, + "grad_norm": 2.078125, + "grad_norm_var": 0.011427561442057291, + "learning_rate": 0.0001, + "loss": 4.1681, + "loss/crossentropy": 2.146417558193207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19708352535963058, + "step": 26752 + }, + { + "epoch": 0.53508, + "grad_norm": 2.109375, + "grad_norm_var": 0.013248443603515625, + "learning_rate": 0.0001, + "loss": 4.1508, + "loss/crossentropy": 1.9810682535171509, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1777956709265709, + "step": 26754 + }, + { + "epoch": 0.53512, + "grad_norm": 1.859375, + "grad_norm_var": 0.010426584879557292, + "learning_rate": 0.0001, + "loss": 3.9092, + "loss/crossentropy": 2.0470046401023865, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1866726651787758, + "step": 26756 + }, + { + "epoch": 0.53516, + "grad_norm": 1.8046875, + "grad_norm_var": 0.009287261962890625, + "learning_rate": 0.0001, + "loss": 3.9476, + "loss/crossentropy": 2.0760093927383423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18907316029071808, + "step": 26758 + }, + { + "epoch": 0.5352, + "grad_norm": 2.015625, + "grad_norm_var": 0.011110178629557292, + "learning_rate": 0.0001, + "loss": 3.8118, + "loss/crossentropy": 1.7074882984161377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16364683210849762, + "step": 26760 + }, + { + "epoch": 0.53524, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011614735921223958, + "learning_rate": 0.0001, + "loss": 4.196, + "loss/crossentropy": 1.925625503063202, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18412255495786667, + "step": 26762 + }, + { + "epoch": 0.53528, + "grad_norm": 2.0, + "grad_norm_var": 0.012962849934895833, + "learning_rate": 0.0001, + "loss": 3.9354, + "loss/crossentropy": 2.03251188993454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895642802119255, + "step": 26764 + }, + { + "epoch": 0.53532, + "grad_norm": 2.03125, + "grad_norm_var": 0.013012440999348958, + "learning_rate": 0.0001, + "loss": 3.9456, + "loss/crossentropy": 1.9689412117004395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17626702040433884, + "step": 26766 + }, + { + "epoch": 0.53536, + "grad_norm": 1.921875, + "grad_norm_var": 0.011864980061848959, + "learning_rate": 0.0001, + "loss": 3.9408, + "loss/crossentropy": 1.7644684314727783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18325244635343552, + "step": 26768 + }, + { + "epoch": 0.5354, + "grad_norm": 1.984375, + "grad_norm_var": 0.009544881184895833, + "learning_rate": 0.0001, + "loss": 3.8692, + "loss/crossentropy": 1.9543325304985046, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18265419453382492, + "step": 26770 + }, + { + "epoch": 0.53544, + "grad_norm": 1.9921875, + "grad_norm_var": 0.012703196207682291, + "learning_rate": 0.0001, + "loss": 4.2303, + "loss/crossentropy": 2.107349157333374, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18807468563318253, + "step": 26772 + }, + { + "epoch": 0.53548, + "grad_norm": 1.9140625, + "grad_norm_var": 0.022900390625, + "learning_rate": 0.0001, + "loss": 4.1627, + "loss/crossentropy": 2.0298044085502625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1947350949048996, + "step": 26774 + }, + { + "epoch": 0.53552, + "grad_norm": 1.953125, + "grad_norm_var": 0.020491536458333334, + "learning_rate": 0.0001, + "loss": 4.06, + "loss/crossentropy": 2.1605560183525085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2067297026515007, + "step": 26776 + }, + { + "epoch": 0.53556, + "grad_norm": 2.15625, + "grad_norm_var": 0.021762847900390625, + "learning_rate": 0.0001, + "loss": 4.2464, + "loss/crossentropy": 1.9689620733261108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24544911086559296, + "step": 26778 + }, + { + "epoch": 0.5356, + "grad_norm": 1.953125, + "grad_norm_var": 0.018723297119140624, + "learning_rate": 0.0001, + "loss": 3.93, + "loss/crossentropy": 1.7339588403701782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21119988709688187, + "step": 26780 + }, + { + "epoch": 0.53564, + "grad_norm": 1.9609375, + "grad_norm_var": 0.018260701497395834, + "learning_rate": 0.0001, + "loss": 3.8853, + "loss/crossentropy": 2.3095227479934692, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19727592915296555, + "step": 26782 + }, + { + "epoch": 0.53568, + "grad_norm": 1.8515625, + "grad_norm_var": 0.017661285400390626, + "learning_rate": 0.0001, + "loss": 3.9575, + "loss/crossentropy": 2.1668676137924194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18963289260864258, + "step": 26784 + }, + { + "epoch": 0.53572, + "grad_norm": 2.0, + "grad_norm_var": 0.018683878580729167, + "learning_rate": 0.0001, + "loss": 3.7399, + "loss/crossentropy": 1.7347453236579895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1699044108390808, + "step": 26786 + }, + { + "epoch": 0.53576, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0186431884765625, + "learning_rate": 0.0001, + "loss": 3.9689, + "loss/crossentropy": 2.144263744354248, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21406567096710205, + "step": 26788 + }, + { + "epoch": 0.5358, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008202870686848959, + "learning_rate": 0.0001, + "loss": 4.0654, + "loss/crossentropy": 2.2909047603607178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608308374881744, + "step": 26790 + }, + { + "epoch": 0.53584, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0089752197265625, + "learning_rate": 0.0001, + "loss": 3.748, + "loss/crossentropy": 2.0192511677742004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19014007598161697, + "step": 26792 + }, + { + "epoch": 0.53588, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005879720052083333, + "learning_rate": 0.0001, + "loss": 4.1532, + "loss/crossentropy": 1.848621129989624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16638639569282532, + "step": 26794 + }, + { + "epoch": 0.53592, + "grad_norm": 1.7890625, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 3.8812, + "loss/crossentropy": 1.8177622556686401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17412111163139343, + "step": 26796 + }, + { + "epoch": 0.53596, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009490712483723959, + "learning_rate": 0.0001, + "loss": 3.8414, + "loss/crossentropy": 1.8192716240882874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1820041686296463, + "step": 26798 + }, + { + "epoch": 0.536, + "grad_norm": 1.8671875, + "grad_norm_var": 0.037534332275390624, + "learning_rate": 0.0001, + "loss": 3.9625, + "loss/crossentropy": 2.037019371986389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19533763080835342, + "step": 26800 + }, + { + "epoch": 0.53604, + "grad_norm": 1.828125, + "grad_norm_var": 0.03748779296875, + "learning_rate": 0.0001, + "loss": 4.1231, + "loss/crossentropy": 2.3284034729003906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20415876060724258, + "step": 26802 + }, + { + "epoch": 0.53608, + "grad_norm": 2.140625, + "grad_norm_var": 0.04108861287434896, + "learning_rate": 0.0001, + "loss": 4.5056, + "loss/crossentropy": 1.9130100011825562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19624194502830505, + "step": 26804 + }, + { + "epoch": 0.53612, + "grad_norm": 1.859375, + "grad_norm_var": 0.04149754842122396, + "learning_rate": 0.0001, + "loss": 3.9375, + "loss/crossentropy": 1.9683635234832764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2047107145190239, + "step": 26806 + }, + { + "epoch": 0.53616, + "grad_norm": 1.890625, + "grad_norm_var": 0.0410552978515625, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.3120559453964233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024022340774536, + "step": 26808 + }, + { + "epoch": 0.5362, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0410552978515625, + "learning_rate": 0.0001, + "loss": 3.9571, + "loss/crossentropy": 2.0975415110588074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19205840677022934, + "step": 26810 + }, + { + "epoch": 0.53624, + "grad_norm": 1.9609375, + "grad_norm_var": 0.03737564086914062, + "learning_rate": 0.0001, + "loss": 3.9933, + "loss/crossentropy": 1.995516061782837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18409030139446259, + "step": 26812 + }, + { + "epoch": 0.53628, + "grad_norm": 1.796875, + "grad_norm_var": 0.03712539672851563, + "learning_rate": 0.0001, + "loss": 3.5401, + "loss/crossentropy": 1.9188540577888489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17965808510780334, + "step": 26814 + }, + { + "epoch": 0.53632, + "grad_norm": 1.9375, + "grad_norm_var": 0.03156636555989583, + "learning_rate": 0.0001, + "loss": 4.127, + "loss/crossentropy": 2.1765600442886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20042569190263748, + "step": 26816 + }, + { + "epoch": 0.53636, + "grad_norm": 1.96875, + "grad_norm_var": 0.031243642171223957, + "learning_rate": 0.0001, + "loss": 4.1306, + "loss/crossentropy": 2.2123221158981323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19921905547380447, + "step": 26818 + }, + { + "epoch": 0.5364, + "grad_norm": 1.9296875, + "grad_norm_var": 0.027950032552083334, + "learning_rate": 0.0001, + "loss": 4.091, + "loss/crossentropy": 2.130329966545105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274868607521057, + "step": 26820 + }, + { + "epoch": 0.53644, + "grad_norm": 1.8671875, + "grad_norm_var": 0.02783788045247396, + "learning_rate": 0.0001, + "loss": 3.9039, + "loss/crossentropy": 1.8405003547668457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1754842922091484, + "step": 26822 + }, + { + "epoch": 0.53648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.027611287434895833, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 2.1880215406417847, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983429118990898, + "step": 26824 + }, + { + "epoch": 0.53652, + "grad_norm": 1.9453125, + "grad_norm_var": 0.027486165364583332, + "learning_rate": 0.0001, + "loss": 4.0219, + "loss/crossentropy": 2.1088255643844604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2141232043504715, + "step": 26826 + }, + { + "epoch": 0.53656, + "grad_norm": 1.8125, + "grad_norm_var": 0.02906494140625, + "learning_rate": 0.0001, + "loss": 3.8428, + "loss/crossentropy": 2.2226059436798096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20068518072366714, + "step": 26828 + }, + { + "epoch": 0.5366, + "grad_norm": 1.9453125, + "grad_norm_var": 0.02655614217122396, + "learning_rate": 0.0001, + "loss": 3.9276, + "loss/crossentropy": 2.0935966968536377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20954880118370056, + "step": 26830 + }, + { + "epoch": 0.53664, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0034238179524739582, + "learning_rate": 0.0001, + "loss": 4.1073, + "loss/crossentropy": 1.9899848699569702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19131190329790115, + "step": 26832 + }, + { + "epoch": 0.53668, + "grad_norm": 1.9296875, + "grad_norm_var": 0.002854156494140625, + "learning_rate": 0.0001, + "loss": 4.0719, + "loss/crossentropy": 1.9950811862945557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20007413625717163, + "step": 26834 + }, + { + "epoch": 0.53672, + "grad_norm": 1.953125, + "grad_norm_var": 0.003763580322265625, + "learning_rate": 0.0001, + "loss": 3.6452, + "loss/crossentropy": 2.1717607975006104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19856677949428558, + "step": 26836 + }, + { + "epoch": 0.53676, + "grad_norm": 2.0, + "grad_norm_var": 0.003780110677083333, + "learning_rate": 0.0001, + "loss": 4.1077, + "loss/crossentropy": 2.0720032453536987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17657846212387085, + "step": 26838 + }, + { + "epoch": 0.5368, + "grad_norm": 1.90625, + "grad_norm_var": 0.005541737874348958, + "learning_rate": 0.0001, + "loss": 3.776, + "loss/crossentropy": 2.0533303022384644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19667675346136093, + "step": 26840 + }, + { + "epoch": 0.53684, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005844879150390625, + "learning_rate": 0.0001, + "loss": 3.9664, + "loss/crossentropy": 1.8576315641403198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17336532473564148, + "step": 26842 + }, + { + "epoch": 0.53688, + "grad_norm": 2.0625, + "grad_norm_var": 0.0067179361979166664, + "learning_rate": 0.0001, + "loss": 4.1348, + "loss/crossentropy": 2.250246524810791, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19317224621772766, + "step": 26844 + }, + { + "epoch": 0.53692, + "grad_norm": 1.90625, + "grad_norm_var": 0.006239573160807292, + "learning_rate": 0.0001, + "loss": 4.2467, + "loss/crossentropy": 2.241440773010254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20358380675315857, + "step": 26846 + }, + { + "epoch": 0.53696, + "grad_norm": 2.125, + "grad_norm_var": 0.009332021077473959, + "learning_rate": 0.0001, + "loss": 4.1142, + "loss/crossentropy": 1.8086896538734436, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17357632517814636, + "step": 26848 + }, + { + "epoch": 0.537, + "grad_norm": 2.078125, + "grad_norm_var": 0.010695139567057291, + "learning_rate": 0.0001, + "loss": 4.3621, + "loss/crossentropy": 2.396283984184265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21695053577423096, + "step": 26850 + }, + { + "epoch": 0.53704, + "grad_norm": 2.09375, + "grad_norm_var": 0.010477447509765625, + "learning_rate": 0.0001, + "loss": 4.2781, + "loss/crossentropy": 2.1522679328918457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21720080822706223, + "step": 26852 + }, + { + "epoch": 0.53708, + "grad_norm": 1.90625, + "grad_norm_var": 0.011763254801432291, + "learning_rate": 0.0001, + "loss": 3.8039, + "loss/crossentropy": 1.5082102417945862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15677820146083832, + "step": 26854 + }, + { + "epoch": 0.53712, + "grad_norm": 1.875, + "grad_norm_var": 0.014855702718098959, + "learning_rate": 0.0001, + "loss": 4.1837, + "loss/crossentropy": 2.1900625228881836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19950998574495316, + "step": 26856 + }, + { + "epoch": 0.53716, + "grad_norm": 1.984375, + "grad_norm_var": 0.015409088134765625, + "learning_rate": 0.0001, + "loss": 4.2357, + "loss/crossentropy": 1.978192925453186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19752255082130432, + "step": 26858 + }, + { + "epoch": 0.5372, + "grad_norm": 1.859375, + "grad_norm_var": 0.015962727864583335, + "learning_rate": 0.0001, + "loss": 4.0597, + "loss/crossentropy": 1.9195253252983093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2122282087802887, + "step": 26860 + }, + { + "epoch": 0.53724, + "grad_norm": 1.921875, + "grad_norm_var": 0.015913899739583334, + "learning_rate": 0.0001, + "loss": 4.0172, + "loss/crossentropy": 1.7090142369270325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1740838885307312, + "step": 26862 + }, + { + "epoch": 0.53728, + "grad_norm": 1.9296875, + "grad_norm_var": 0.029788970947265625, + "learning_rate": 0.0001, + "loss": 3.9981, + "loss/crossentropy": 1.9377044439315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18080267310142517, + "step": 26864 + }, + { + "epoch": 0.53732, + "grad_norm": 1.9140625, + "grad_norm_var": 0.029499308268229166, + "learning_rate": 0.0001, + "loss": 3.7442, + "loss/crossentropy": 1.635703206062317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16993890702724457, + "step": 26866 + }, + { + "epoch": 0.53736, + "grad_norm": 1.8359375, + "grad_norm_var": 0.03104426066080729, + "learning_rate": 0.0001, + "loss": 3.6425, + "loss/crossentropy": 2.10710072517395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2069242298603058, + "step": 26868 + }, + { + "epoch": 0.5374, + "grad_norm": 1.859375, + "grad_norm_var": 0.030543772379557292, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.9034352898597717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17317350208759308, + "step": 26870 + }, + { + "epoch": 0.53744, + "grad_norm": 1.8125, + "grad_norm_var": 0.027032216389973957, + "learning_rate": 0.0001, + "loss": 3.7471, + "loss/crossentropy": 1.97059965133667, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18057716637849808, + "step": 26872 + }, + { + "epoch": 0.53748, + "grad_norm": 1.96875, + "grad_norm_var": 0.02578913370768229, + "learning_rate": 0.0001, + "loss": 3.92, + "loss/crossentropy": 2.11410790681839, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19710883498191833, + "step": 26874 + }, + { + "epoch": 0.53752, + "grad_norm": 1.859375, + "grad_norm_var": 0.026652018229166668, + "learning_rate": 0.0001, + "loss": 3.7229, + "loss/crossentropy": 2.1530507802963257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18232395499944687, + "step": 26876 + }, + { + "epoch": 0.53756, + "grad_norm": 1.8203125, + "grad_norm_var": 0.027286783854166666, + "learning_rate": 0.0001, + "loss": 3.9919, + "loss/crossentropy": 1.9864270687103271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18189801275730133, + "step": 26878 + }, + { + "epoch": 0.5376, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004610188802083333, + "learning_rate": 0.0001, + "loss": 3.8101, + "loss/crossentropy": 1.7933497428894043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16698282212018967, + "step": 26880 + }, + { + "epoch": 0.53764, + "grad_norm": 1.984375, + "grad_norm_var": 0.005102284749348958, + "learning_rate": 0.0001, + "loss": 4.096, + "loss/crossentropy": 2.1818475127220154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031894475221634, + "step": 26882 + }, + { + "epoch": 0.53768, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004980214436848958, + "learning_rate": 0.0001, + "loss": 3.8372, + "loss/crossentropy": 1.7670655250549316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976353645324707, + "step": 26884 + }, + { + "epoch": 0.53772, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007252756754557292, + "learning_rate": 0.0001, + "loss": 3.7664, + "loss/crossentropy": 1.9625222086906433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887579709291458, + "step": 26886 + }, + { + "epoch": 0.53776, + "grad_norm": 1.890625, + "grad_norm_var": 0.006082916259765625, + "learning_rate": 0.0001, + "loss": 4.065, + "loss/crossentropy": 1.989456593990326, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18352140486240387, + "step": 26888 + }, + { + "epoch": 0.5378, + "grad_norm": 1.953125, + "grad_norm_var": 0.0054595947265625, + "learning_rate": 0.0001, + "loss": 3.8264, + "loss/crossentropy": 1.9268362522125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18406879901885986, + "step": 26890 + }, + { + "epoch": 0.53784, + "grad_norm": 1.7890625, + "grad_norm_var": 0.005515289306640625, + "learning_rate": 0.0001, + "loss": 4.0215, + "loss/crossentropy": 2.3704354763031006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20472833514213562, + "step": 26892 + }, + { + "epoch": 0.53788, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006831614176432291, + "learning_rate": 0.0001, + "loss": 4.0136, + "loss/crossentropy": 2.2468762397766113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20067602396011353, + "step": 26894 + }, + { + "epoch": 0.53792, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007456207275390625, + "learning_rate": 0.0001, + "loss": 4.0154, + "loss/crossentropy": 1.9968597888946533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19497767835855484, + "step": 26896 + }, + { + "epoch": 0.53796, + "grad_norm": 2.03125, + "grad_norm_var": 0.008861287434895834, + "learning_rate": 0.0001, + "loss": 4.3752, + "loss/crossentropy": 2.3659461736679077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061542198061943, + "step": 26898 + }, + { + "epoch": 0.538, + "grad_norm": 1.8203125, + "grad_norm_var": 0.00928955078125, + "learning_rate": 0.0001, + "loss": 3.7333, + "loss/crossentropy": 2.1831624507904053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195770680904388, + "step": 26900 + }, + { + "epoch": 0.53804, + "grad_norm": 2.03125, + "grad_norm_var": 0.007419586181640625, + "learning_rate": 0.0001, + "loss": 3.9325, + "loss/crossentropy": 2.011473059654236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17340296506881714, + "step": 26902 + }, + { + "epoch": 0.53808, + "grad_norm": 2.109375, + "grad_norm_var": 0.009751129150390624, + "learning_rate": 0.0001, + "loss": 4.2062, + "loss/crossentropy": 2.0382936000823975, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20788908749818802, + "step": 26904 + }, + { + "epoch": 0.53812, + "grad_norm": 2.015625, + "grad_norm_var": 0.011530558268229166, + "learning_rate": 0.0001, + "loss": 3.7917, + "loss/crossentropy": 1.9090477228164673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1654892861843109, + "step": 26906 + }, + { + "epoch": 0.53816, + "grad_norm": 2.15625, + "grad_norm_var": 0.013850657145182292, + "learning_rate": 0.0001, + "loss": 4.1095, + "loss/crossentropy": 2.1782519817352295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20165212452411652, + "step": 26908 + }, + { + "epoch": 0.5382, + "grad_norm": 2.0625, + "grad_norm_var": 0.0138336181640625, + "learning_rate": 0.0001, + "loss": 4.0269, + "loss/crossentropy": 2.3456512689590454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19297856092453003, + "step": 26910 + }, + { + "epoch": 0.53824, + "grad_norm": 2.046875, + "grad_norm_var": 0.013108062744140624, + "learning_rate": 0.0001, + "loss": 4.1148, + "loss/crossentropy": 2.1863032579421997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19647317379713058, + "step": 26912 + }, + { + "epoch": 0.53828, + "grad_norm": 1.8359375, + "grad_norm_var": 0.013187408447265625, + "learning_rate": 0.0001, + "loss": 3.8884, + "loss/crossentropy": 1.824396550655365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18155476450920105, + "step": 26914 + }, + { + "epoch": 0.53832, + "grad_norm": 1.9765625, + "grad_norm_var": 0.013630930582682292, + "learning_rate": 0.0001, + "loss": 3.9332, + "loss/crossentropy": 1.913030207157135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16873834282159805, + "step": 26916 + }, + { + "epoch": 0.53836, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012780507405598959, + "learning_rate": 0.0001, + "loss": 4.0736, + "loss/crossentropy": 1.791024386882782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17709940671920776, + "step": 26918 + }, + { + "epoch": 0.5384, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010857899983723959, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 1.9327716827392578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906151711940765, + "step": 26920 + }, + { + "epoch": 0.53844, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010290273030598958, + "learning_rate": 0.0001, + "loss": 3.9922, + "loss/crossentropy": 2.1072241067886353, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19492276012897491, + "step": 26922 + }, + { + "epoch": 0.53848, + "grad_norm": 1.796875, + "grad_norm_var": 0.009993235270182291, + "learning_rate": 0.0001, + "loss": 4.039, + "loss/crossentropy": 2.024174690246582, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2018662467598915, + "step": 26924 + }, + { + "epoch": 0.53852, + "grad_norm": 1.90625, + "grad_norm_var": 0.00892333984375, + "learning_rate": 0.0001, + "loss": 3.988, + "loss/crossentropy": 1.9384547472000122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1663040593266487, + "step": 26926 + }, + { + "epoch": 0.53856, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007692209879557292, + "learning_rate": 0.0001, + "loss": 3.7386, + "loss/crossentropy": 2.0045499205589294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18697284162044525, + "step": 26928 + }, + { + "epoch": 0.5386, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008038075764973958, + "learning_rate": 0.0001, + "loss": 4.1255, + "loss/crossentropy": 2.3768097162246704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20385072380304337, + "step": 26930 + }, + { + "epoch": 0.53864, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008097330729166666, + "learning_rate": 0.0001, + "loss": 3.6845, + "loss/crossentropy": 1.8940687775611877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758279949426651, + "step": 26932 + }, + { + "epoch": 0.53868, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008503214518229166, + "learning_rate": 0.0001, + "loss": 4.1272, + "loss/crossentropy": 2.258841872215271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18384167551994324, + "step": 26934 + }, + { + "epoch": 0.53872, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010115305582682291, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.3876583576202393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20442500710487366, + "step": 26936 + }, + { + "epoch": 0.53876, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010106404622395834, + "learning_rate": 0.0001, + "loss": 3.975, + "loss/crossentropy": 2.3199750185012817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975177749991417, + "step": 26938 + }, + { + "epoch": 0.5388, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006365712483723958, + "learning_rate": 0.0001, + "loss": 4.1408, + "loss/crossentropy": 2.2277639508247375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962454915046692, + "step": 26940 + }, + { + "epoch": 0.53884, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009844716389973958, + "learning_rate": 0.0001, + "loss": 4.1696, + "loss/crossentropy": 2.2041778564453125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19852803647518158, + "step": 26942 + }, + { + "epoch": 0.53888, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009924062093098958, + "learning_rate": 0.0001, + "loss": 3.844, + "loss/crossentropy": 1.6400386095046997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15487553179264069, + "step": 26944 + }, + { + "epoch": 0.53892, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012059529622395834, + "learning_rate": 0.0001, + "loss": 4.0689, + "loss/crossentropy": 2.4118189811706543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22375066578388214, + "step": 26946 + }, + { + "epoch": 0.53896, + "grad_norm": 1.921875, + "grad_norm_var": 0.014253489176432292, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 2.238221287727356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18950769305229187, + "step": 26948 + }, + { + "epoch": 0.539, + "grad_norm": 1.765625, + "grad_norm_var": 0.0158935546875, + "learning_rate": 0.0001, + "loss": 4.0809, + "loss/crossentropy": 2.2348607182502747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920975297689438, + "step": 26950 + }, + { + "epoch": 0.53904, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015954335530598957, + "learning_rate": 0.0001, + "loss": 4.0146, + "loss/crossentropy": 2.2838883996009827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2128254398703575, + "step": 26952 + }, + { + "epoch": 0.53908, + "grad_norm": 1.953125, + "grad_norm_var": 0.0150787353515625, + "learning_rate": 0.0001, + "loss": 3.7944, + "loss/crossentropy": 2.183197498321533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030157521367073, + "step": 26954 + }, + { + "epoch": 0.53912, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014579010009765626, + "learning_rate": 0.0001, + "loss": 4.1249, + "loss/crossentropy": 2.132546067237854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2098417431116104, + "step": 26956 + }, + { + "epoch": 0.53916, + "grad_norm": 1.953125, + "grad_norm_var": 0.011839803059895833, + "learning_rate": 0.0001, + "loss": 3.9566, + "loss/crossentropy": 1.867222011089325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18416466563940048, + "step": 26958 + }, + { + "epoch": 0.5392, + "grad_norm": 1.875, + "grad_norm_var": 0.013710276285807291, + "learning_rate": 0.0001, + "loss": 4.3045, + "loss/crossentropy": 2.0456870198249817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19690076261758804, + "step": 26960 + }, + { + "epoch": 0.53924, + "grad_norm": 1.953125, + "grad_norm_var": 0.017731730143229166, + "learning_rate": 0.0001, + "loss": 4.207, + "loss/crossentropy": 2.387152314186096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2672813981771469, + "step": 26962 + }, + { + "epoch": 0.53928, + "grad_norm": 1.8125, + "grad_norm_var": 0.014679972330729167, + "learning_rate": 0.0001, + "loss": 3.8385, + "loss/crossentropy": 1.7455761432647705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.158515103161335, + "step": 26964 + }, + { + "epoch": 0.53932, + "grad_norm": 1.8203125, + "grad_norm_var": 0.013643391927083333, + "learning_rate": 0.0001, + "loss": 4.0564, + "loss/crossentropy": 2.0408239364624023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18679992854595184, + "step": 26966 + }, + { + "epoch": 0.53936, + "grad_norm": 2.015625, + "grad_norm_var": 0.012555948893229167, + "learning_rate": 0.0001, + "loss": 4.1616, + "loss/crossentropy": 2.1810312271118164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18764056265354156, + "step": 26968 + }, + { + "epoch": 0.5394, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012015787760416667, + "learning_rate": 0.0001, + "loss": 3.9197, + "loss/crossentropy": 2.152927279472351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18063102662563324, + "step": 26970 + }, + { + "epoch": 0.53944, + "grad_norm": 1.96875, + "grad_norm_var": 0.01220703125, + "learning_rate": 0.0001, + "loss": 3.912, + "loss/crossentropy": 2.006937623023987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17757008224725723, + "step": 26972 + }, + { + "epoch": 0.53948, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01268310546875, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 1.7032560110092163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16759414970874786, + "step": 26974 + }, + { + "epoch": 0.53952, + "grad_norm": 1.921875, + "grad_norm_var": 0.014506022135416666, + "learning_rate": 0.0001, + "loss": 4.0903, + "loss/crossentropy": 2.0685030817985535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18894075602293015, + "step": 26976 + }, + { + "epoch": 0.53956, + "grad_norm": 1.8125, + "grad_norm_var": 0.010015614827473958, + "learning_rate": 0.0001, + "loss": 3.5637, + "loss/crossentropy": 1.7113747000694275, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16482854634523392, + "step": 26978 + }, + { + "epoch": 0.5396, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009208170572916667, + "learning_rate": 0.0001, + "loss": 3.8915, + "loss/crossentropy": 1.9814255237579346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19485389441251755, + "step": 26980 + }, + { + "epoch": 0.53964, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009466298421223958, + "learning_rate": 0.0001, + "loss": 3.7662, + "loss/crossentropy": 1.825324296951294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172153040766716, + "step": 26982 + }, + { + "epoch": 0.53968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008837636311848958, + "learning_rate": 0.0001, + "loss": 4.0153, + "loss/crossentropy": 2.093530297279358, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956939622759819, + "step": 26984 + }, + { + "epoch": 0.53972, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010716756184895834, + "learning_rate": 0.0001, + "loss": 4.3494, + "loss/crossentropy": 2.333137631416321, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21523389220237732, + "step": 26986 + }, + { + "epoch": 0.53976, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010814412434895834, + "learning_rate": 0.0001, + "loss": 3.8208, + "loss/crossentropy": 1.6317270994186401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16613581776618958, + "step": 26988 + }, + { + "epoch": 0.5398, + "grad_norm": 1.96875, + "grad_norm_var": 0.010560862223307292, + "learning_rate": 0.0001, + "loss": 3.7877, + "loss/crossentropy": 2.286810517311096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21348290145397186, + "step": 26990 + }, + { + "epoch": 0.53984, + "grad_norm": 1.6875, + "grad_norm_var": 0.008597819010416667, + "learning_rate": 0.0001, + "loss": 3.8261, + "loss/crossentropy": 2.2280890941619873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20495467633008957, + "step": 26992 + }, + { + "epoch": 0.53988, + "grad_norm": 1.8125, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.7743, + "loss/crossentropy": 1.4591253399848938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1521049663424492, + "step": 26994 + }, + { + "epoch": 0.53992, + "grad_norm": 1.875, + "grad_norm_var": 0.0085205078125, + "learning_rate": 0.0001, + "loss": 4.0559, + "loss/crossentropy": 2.236023187637329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2155672013759613, + "step": 26996 + }, + { + "epoch": 0.53996, + "grad_norm": 2.078125, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.1202, + "loss/crossentropy": 2.0742298364639282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17755182832479477, + "step": 26998 + }, + { + "epoch": 0.54, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009562174479166666, + "learning_rate": 0.0001, + "loss": 3.8773, + "loss/crossentropy": 1.9153380990028381, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838880032300949, + "step": 27000 + }, + { + "epoch": 0.54004, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009349568684895834, + "learning_rate": 0.0001, + "loss": 4.062, + "loss/crossentropy": 2.2510672211647034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2028658166527748, + "step": 27002 + }, + { + "epoch": 0.54008, + "grad_norm": 1.75, + "grad_norm_var": 0.011016591389973959, + "learning_rate": 0.0001, + "loss": 3.5163, + "loss/crossentropy": 1.9110174179077148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18953163921833038, + "step": 27004 + }, + { + "epoch": 0.54012, + "grad_norm": 1.8125, + "grad_norm_var": 0.012604777018229167, + "learning_rate": 0.0001, + "loss": 3.9874, + "loss/crossentropy": 1.8102795481681824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16844668984413147, + "step": 27006 + }, + { + "epoch": 0.54016, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009797159830729167, + "learning_rate": 0.0001, + "loss": 4.1755, + "loss/crossentropy": 1.7649898529052734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18175919353961945, + "step": 27008 + }, + { + "epoch": 0.5402, + "grad_norm": 2.078125, + "grad_norm_var": 0.011104075113932292, + "learning_rate": 0.0001, + "loss": 4.0751, + "loss/crossentropy": 2.199475884437561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19509385526180267, + "step": 27010 + }, + { + "epoch": 0.54024, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010935211181640625, + "learning_rate": 0.0001, + "loss": 4.1241, + "loss/crossentropy": 2.313786506652832, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2032681107521057, + "step": 27012 + }, + { + "epoch": 0.54028, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009992472330729167, + "learning_rate": 0.0001, + "loss": 4.1953, + "loss/crossentropy": 2.040158271789551, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921558886766434, + "step": 27014 + }, + { + "epoch": 0.54032, + "grad_norm": 1.875, + "grad_norm_var": 0.010871378580729167, + "learning_rate": 0.0001, + "loss": 3.7967, + "loss/crossentropy": 2.0283621549606323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19085510820150375, + "step": 27016 + }, + { + "epoch": 0.54036, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015941365559895834, + "learning_rate": 0.0001, + "loss": 3.9703, + "loss/crossentropy": 2.1031923294067383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19315888732671738, + "step": 27018 + }, + { + "epoch": 0.5404, + "grad_norm": 1.8515625, + "grad_norm_var": 0.013142903645833334, + "learning_rate": 0.0001, + "loss": 3.7665, + "loss/crossentropy": 1.8264936804771423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17999880015850067, + "step": 27020 + }, + { + "epoch": 0.54044, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011531321207682292, + "learning_rate": 0.0001, + "loss": 3.7928, + "loss/crossentropy": 2.127643585205078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20216452330350876, + "step": 27022 + }, + { + "epoch": 0.54048, + "grad_norm": 1.9375, + "grad_norm_var": 0.0120849609375, + "learning_rate": 0.0001, + "loss": 3.9311, + "loss/crossentropy": 1.8567208647727966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17688161879777908, + "step": 27024 + }, + { + "epoch": 0.54052, + "grad_norm": 1.890625, + "grad_norm_var": 0.0108062744140625, + "learning_rate": 0.0001, + "loss": 3.8285, + "loss/crossentropy": 2.0056475400924683, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16437891870737076, + "step": 27026 + }, + { + "epoch": 0.54056, + "grad_norm": 1.875, + "grad_norm_var": 0.012023671468098959, + "learning_rate": 0.0001, + "loss": 4.0068, + "loss/crossentropy": 2.2776039838790894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21126240491867065, + "step": 27028 + }, + { + "epoch": 0.5406, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012674967447916666, + "learning_rate": 0.0001, + "loss": 4.2638, + "loss/crossentropy": 2.2637280225753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19725032150745392, + "step": 27030 + }, + { + "epoch": 0.54064, + "grad_norm": 1.859375, + "grad_norm_var": 0.011895497639973959, + "learning_rate": 0.0001, + "loss": 3.837, + "loss/crossentropy": 1.6967254281044006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1950267255306244, + "step": 27032 + }, + { + "epoch": 0.54068, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008591461181640624, + "learning_rate": 0.0001, + "loss": 4.3259, + "loss/crossentropy": 2.2444742918014526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126602053642273, + "step": 27034 + }, + { + "epoch": 0.54072, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008656565348307292, + "learning_rate": 0.0001, + "loss": 3.8887, + "loss/crossentropy": 2.216984808444977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18849068880081177, + "step": 27036 + }, + { + "epoch": 0.54076, + "grad_norm": 1.90625, + "grad_norm_var": 0.008296712239583334, + "learning_rate": 0.0001, + "loss": 3.9656, + "loss/crossentropy": 2.089816451072693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2007404863834381, + "step": 27038 + }, + { + "epoch": 0.5408, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0071370442708333336, + "learning_rate": 0.0001, + "loss": 3.7989, + "loss/crossentropy": 1.839013695716858, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17485745251178741, + "step": 27040 + }, + { + "epoch": 0.54084, + "grad_norm": 2.0, + "grad_norm_var": 0.007393391927083334, + "learning_rate": 0.0001, + "loss": 4.3015, + "loss/crossentropy": 2.3915692567825317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21270561963319778, + "step": 27042 + }, + { + "epoch": 0.54088, + "grad_norm": 2.046875, + "grad_norm_var": 0.0087799072265625, + "learning_rate": 0.0001, + "loss": 3.8979, + "loss/crossentropy": 2.2086989879608154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21337078511714935, + "step": 27044 + }, + { + "epoch": 0.54092, + "grad_norm": 2.0, + "grad_norm_var": 0.008971913655598959, + "learning_rate": 0.0001, + "loss": 4.241, + "loss/crossentropy": 1.8932998776435852, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1783016324043274, + "step": 27046 + }, + { + "epoch": 0.54096, + "grad_norm": 1.875, + "grad_norm_var": 0.008479563395182292, + "learning_rate": 0.0001, + "loss": 3.9551, + "loss/crossentropy": 1.9314876198768616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1857961043715477, + "step": 27048 + }, + { + "epoch": 0.541, + "grad_norm": 1.828125, + "grad_norm_var": 0.0074371337890625, + "learning_rate": 0.0001, + "loss": 3.875, + "loss/crossentropy": 1.616044044494629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16122639179229736, + "step": 27050 + }, + { + "epoch": 0.54104, + "grad_norm": 1.84375, + "grad_norm_var": 0.007572174072265625, + "learning_rate": 0.0001, + "loss": 3.9375, + "loss/crossentropy": 1.7567220330238342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18351546674966812, + "step": 27052 + }, + { + "epoch": 0.54108, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007350413004557291, + "learning_rate": 0.0001, + "loss": 3.9276, + "loss/crossentropy": 1.8721659779548645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19143584370613098, + "step": 27054 + }, + { + "epoch": 0.54112, + "grad_norm": 1.953125, + "grad_norm_var": 0.0079254150390625, + "learning_rate": 0.0001, + "loss": 3.8402, + "loss/crossentropy": 1.8633847832679749, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18027572333812714, + "step": 27056 + }, + { + "epoch": 0.54116, + "grad_norm": 1.859375, + "grad_norm_var": 0.008455149332682292, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 2.0588165521621704, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19255611300468445, + "step": 27058 + }, + { + "epoch": 0.5412, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0056304931640625, + "learning_rate": 0.0001, + "loss": 4.1375, + "loss/crossentropy": 2.316554546356201, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2137911319732666, + "step": 27060 + }, + { + "epoch": 0.54124, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0047910054524739586, + "learning_rate": 0.0001, + "loss": 4.0224, + "loss/crossentropy": 2.1319636702537537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21494153887033463, + "step": 27062 + }, + { + "epoch": 0.54128, + "grad_norm": 1.84375, + "grad_norm_var": 0.005326334635416667, + "learning_rate": 0.0001, + "loss": 3.8054, + "loss/crossentropy": 1.9028087854385376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869627833366394, + "step": 27064 + }, + { + "epoch": 0.54132, + "grad_norm": 1.875, + "grad_norm_var": 0.0044097900390625, + "learning_rate": 0.0001, + "loss": 3.9521, + "loss/crossentropy": 2.0914413928985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090655192732811, + "step": 27066 + }, + { + "epoch": 0.54136, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004367828369140625, + "learning_rate": 0.0001, + "loss": 3.9667, + "loss/crossentropy": 2.1949799060821533, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384562969207764, + "step": 27068 + }, + { + "epoch": 0.5414, + "grad_norm": 1.921875, + "grad_norm_var": 0.0045562744140625, + "learning_rate": 0.0001, + "loss": 4.0931, + "loss/crossentropy": 2.076161026954651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17761556804180145, + "step": 27070 + }, + { + "epoch": 0.54144, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004874674479166666, + "learning_rate": 0.0001, + "loss": 3.9357, + "loss/crossentropy": 2.0287702679634094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18184736371040344, + "step": 27072 + }, + { + "epoch": 0.54148, + "grad_norm": 1.84375, + "grad_norm_var": 0.004914347330729167, + "learning_rate": 0.0001, + "loss": 3.7579, + "loss/crossentropy": 2.2093619108200073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473447442054749, + "step": 27074 + }, + { + "epoch": 0.54152, + "grad_norm": 1.921875, + "grad_norm_var": 0.004646809895833334, + "learning_rate": 0.0001, + "loss": 4.0884, + "loss/crossentropy": 2.076124429702759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19069521129131317, + "step": 27076 + }, + { + "epoch": 0.54156, + "grad_norm": 1.78125, + "grad_norm_var": 0.004303995768229167, + "learning_rate": 0.0001, + "loss": 3.7931, + "loss/crossentropy": 2.014284610748291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18654431402683258, + "step": 27078 + }, + { + "epoch": 0.5416, + "grad_norm": 1.828125, + "grad_norm_var": 0.0044329325358072914, + "learning_rate": 0.0001, + "loss": 4.0599, + "loss/crossentropy": 2.0898516178131104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19269923120737076, + "step": 27080 + }, + { + "epoch": 0.54164, + "grad_norm": 1.9765625, + "grad_norm_var": 0.015819295247395834, + "learning_rate": 0.0001, + "loss": 4.4848, + "loss/crossentropy": 2.1016663908958435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058946192264557, + "step": 27082 + }, + { + "epoch": 0.54168, + "grad_norm": 2.171875, + "grad_norm_var": 0.020896148681640626, + "learning_rate": 0.0001, + "loss": 3.8787, + "loss/crossentropy": 1.6725799441337585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1853618249297142, + "step": 27084 + }, + { + "epoch": 0.54172, + "grad_norm": 1.9375, + "grad_norm_var": 0.020531209309895833, + "learning_rate": 0.0001, + "loss": 4.1032, + "loss/crossentropy": 2.414761781692505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2319842129945755, + "step": 27086 + }, + { + "epoch": 0.54176, + "grad_norm": 1.890625, + "grad_norm_var": 0.018344879150390625, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 1.887232780456543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1807008758187294, + "step": 27088 + }, + { + "epoch": 0.5418, + "grad_norm": 1.828125, + "grad_norm_var": 0.0190093994140625, + "learning_rate": 0.0001, + "loss": 4.3199, + "loss/crossentropy": 2.1659968495368958, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2109128087759018, + "step": 27090 + }, + { + "epoch": 0.54184, + "grad_norm": 2.109375, + "grad_norm_var": 0.02069269816080729, + "learning_rate": 0.0001, + "loss": 4.3623, + "loss/crossentropy": 2.134926438331604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.200565405189991, + "step": 27092 + }, + { + "epoch": 0.54188, + "grad_norm": 2.03125, + "grad_norm_var": 0.016621907552083332, + "learning_rate": 0.0001, + "loss": 4.3347, + "loss/crossentropy": 2.3204580545425415, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21586615592241287, + "step": 27094 + }, + { + "epoch": 0.54192, + "grad_norm": 1.8203125, + "grad_norm_var": 0.016462198893229165, + "learning_rate": 0.0001, + "loss": 3.8415, + "loss/crossentropy": 2.1035609245300293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20287606865167618, + "step": 27096 + }, + { + "epoch": 0.54196, + "grad_norm": 1.96875, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 4.0134, + "loss/crossentropy": 1.9996371865272522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1981046125292778, + "step": 27098 + }, + { + "epoch": 0.542, + "grad_norm": 2.171875, + "grad_norm_var": 0.011220041910807292, + "learning_rate": 0.0001, + "loss": 4.4227, + "loss/crossentropy": 2.2769532203674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19060392677783966, + "step": 27100 + }, + { + "epoch": 0.54204, + "grad_norm": 1.765625, + "grad_norm_var": 0.014684804280598958, + "learning_rate": 0.0001, + "loss": 3.92, + "loss/crossentropy": 1.8792597651481628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17460587620735168, + "step": 27102 + }, + { + "epoch": 0.54208, + "grad_norm": 1.8125, + "grad_norm_var": 0.01631647745768229, + "learning_rate": 0.0001, + "loss": 3.9027, + "loss/crossentropy": 1.7762881517410278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15890707820653915, + "step": 27104 + }, + { + "epoch": 0.54212, + "grad_norm": 2.03125, + "grad_norm_var": 0.016196441650390626, + "learning_rate": 0.0001, + "loss": 3.9892, + "loss/crossentropy": 2.125525116920471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22518931329250336, + "step": 27106 + }, + { + "epoch": 0.54216, + "grad_norm": 1.921875, + "grad_norm_var": 0.012033843994140625, + "learning_rate": 0.0001, + "loss": 3.7171, + "loss/crossentropy": 2.1541160345077515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1856529340147972, + "step": 27108 + }, + { + "epoch": 0.5422, + "grad_norm": 1.90625, + "grad_norm_var": 0.010927073160807292, + "learning_rate": 0.0001, + "loss": 4.0158, + "loss/crossentropy": 1.8441925048828125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19040126353502274, + "step": 27110 + }, + { + "epoch": 0.54224, + "grad_norm": 1.8984375, + "grad_norm_var": 0.011260732014973959, + "learning_rate": 0.0001, + "loss": 3.9886, + "loss/crossentropy": 1.9131816625595093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19395362585783005, + "step": 27112 + }, + { + "epoch": 0.54228, + "grad_norm": 1.8359375, + "grad_norm_var": 0.015254465738932292, + "learning_rate": 0.0001, + "loss": 4.0498, + "loss/crossentropy": 1.9133941531181335, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23238787055015564, + "step": 27114 + }, + { + "epoch": 0.54232, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011253865559895833, + "learning_rate": 0.0001, + "loss": 4.1971, + "loss/crossentropy": 2.2706873416900635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21193598210811615, + "step": 27116 + }, + { + "epoch": 0.54236, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009822336832682292, + "learning_rate": 0.0001, + "loss": 3.9732, + "loss/crossentropy": 1.7896053791046143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722758710384369, + "step": 27118 + }, + { + "epoch": 0.5424, + "grad_norm": 2.015625, + "grad_norm_var": 0.012636057535807292, + "learning_rate": 0.0001, + "loss": 4.3074, + "loss/crossentropy": 2.3640060424804688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21565549075603485, + "step": 27120 + }, + { + "epoch": 0.54244, + "grad_norm": 1.828125, + "grad_norm_var": 0.012857818603515625, + "learning_rate": 0.0001, + "loss": 3.8032, + "loss/crossentropy": 1.6995807886123657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15929770469665527, + "step": 27122 + }, + { + "epoch": 0.54248, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012607574462890625, + "learning_rate": 0.0001, + "loss": 3.9518, + "loss/crossentropy": 1.6559955477714539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741805449128151, + "step": 27124 + }, + { + "epoch": 0.54252, + "grad_norm": 1.921875, + "grad_norm_var": 0.012451171875, + "learning_rate": 0.0001, + "loss": 3.8393, + "loss/crossentropy": 1.9256713390350342, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23186514526605606, + "step": 27126 + }, + { + "epoch": 0.54256, + "grad_norm": 1.953125, + "grad_norm_var": 0.011486562093098958, + "learning_rate": 0.0001, + "loss": 3.8599, + "loss/crossentropy": 2.0060980319976807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20487740635871887, + "step": 27128 + }, + { + "epoch": 0.5426, + "grad_norm": 1.90625, + "grad_norm_var": 0.009545644124348959, + "learning_rate": 0.0001, + "loss": 3.6376, + "loss/crossentropy": 1.8419195413589478, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17364958673715591, + "step": 27130 + }, + { + "epoch": 0.54264, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0100250244140625, + "learning_rate": 0.0001, + "loss": 4.1161, + "loss/crossentropy": 1.9890483617782593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21333064138889313, + "step": 27132 + }, + { + "epoch": 0.54268, + "grad_norm": 2.015625, + "grad_norm_var": 0.009639485677083334, + "learning_rate": 0.0001, + "loss": 4.1288, + "loss/crossentropy": 2.2758660316467285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21215695142745972, + "step": 27134 + }, + { + "epoch": 0.54272, + "grad_norm": 1.9375, + "grad_norm_var": 0.006086985270182292, + "learning_rate": 0.0001, + "loss": 4.1358, + "loss/crossentropy": 2.273390769958496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20087362825870514, + "step": 27136 + }, + { + "epoch": 0.54276, + "grad_norm": 1.796875, + "grad_norm_var": 0.005322265625, + "learning_rate": 0.0001, + "loss": 4.0575, + "loss/crossentropy": 2.446821928024292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20775765180587769, + "step": 27138 + }, + { + "epoch": 0.5428, + "grad_norm": 1.875, + "grad_norm_var": 0.006449127197265625, + "learning_rate": 0.0001, + "loss": 4.0756, + "loss/crossentropy": 2.391844630241394, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2062644362449646, + "step": 27140 + }, + { + "epoch": 0.54284, + "grad_norm": 1.9375, + "grad_norm_var": 0.007216135660807292, + "learning_rate": 0.0001, + "loss": 3.9436, + "loss/crossentropy": 2.112763822078705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17983781546354294, + "step": 27142 + }, + { + "epoch": 0.54288, + "grad_norm": 1.921875, + "grad_norm_var": 0.006624094645182292, + "learning_rate": 0.0001, + "loss": 4.2086, + "loss/crossentropy": 2.184209704399109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20391154289245605, + "step": 27144 + }, + { + "epoch": 0.54292, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005806477864583334, + "learning_rate": 0.0001, + "loss": 3.9503, + "loss/crossentropy": 1.911209523677826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19172395020723343, + "step": 27146 + }, + { + "epoch": 0.54296, + "grad_norm": 1.9609375, + "grad_norm_var": 0.00419921875, + "learning_rate": 0.0001, + "loss": 3.9281, + "loss/crossentropy": 2.132554292678833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887652799487114, + "step": 27148 + }, + { + "epoch": 0.543, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010130818684895833, + "learning_rate": 0.0001, + "loss": 4.1577, + "loss/crossentropy": 2.4312938451766968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20902784168720245, + "step": 27150 + }, + { + "epoch": 0.54304, + "grad_norm": 1.7578125, + "grad_norm_var": 0.011909993489583333, + "learning_rate": 0.0001, + "loss": 4.088, + "loss/crossentropy": 1.857543170452118, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1920638307929039, + "step": 27152 + }, + { + "epoch": 0.54308, + "grad_norm": 1.953125, + "grad_norm_var": 0.01109619140625, + "learning_rate": 0.0001, + "loss": 4.035, + "loss/crossentropy": 1.8170040845870972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1604015827178955, + "step": 27154 + }, + { + "epoch": 0.54312, + "grad_norm": 2.015625, + "grad_norm_var": 0.010815175374348958, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 1.8490120768547058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19839002192020416, + "step": 27156 + }, + { + "epoch": 0.54316, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009954579671223958, + "learning_rate": 0.0001, + "loss": 3.9611, + "loss/crossentropy": 2.1401009559631348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20167674124240875, + "step": 27158 + }, + { + "epoch": 0.5432, + "grad_norm": 2.03125, + "grad_norm_var": 0.0108306884765625, + "learning_rate": 0.0001, + "loss": 4.1829, + "loss/crossentropy": 2.216339647769928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2046881765127182, + "step": 27160 + }, + { + "epoch": 0.54324, + "grad_norm": 1.8359375, + "grad_norm_var": 0.011112213134765625, + "learning_rate": 0.0001, + "loss": 3.9202, + "loss/crossentropy": 2.2559473514556885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20772842317819595, + "step": 27162 + }, + { + "epoch": 0.54328, + "grad_norm": 1.9375, + "grad_norm_var": 0.010811360677083333, + "learning_rate": 0.0001, + "loss": 4.0261, + "loss/crossentropy": 1.9810506105422974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18164421617984772, + "step": 27164 + }, + { + "epoch": 0.54332, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0047271728515625, + "learning_rate": 0.0001, + "loss": 3.9554, + "loss/crossentropy": 2.2585933208465576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920690149068832, + "step": 27166 + }, + { + "epoch": 0.54336, + "grad_norm": 1.75, + "grad_norm_var": 0.005524698893229167, + "learning_rate": 0.0001, + "loss": 3.6078, + "loss/crossentropy": 1.8078173995018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16720515489578247, + "step": 27168 + }, + { + "epoch": 0.5434, + "grad_norm": 1.828125, + "grad_norm_var": 0.005678049723307292, + "learning_rate": 0.0001, + "loss": 3.9103, + "loss/crossentropy": 1.7529219388961792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17757199704647064, + "step": 27170 + }, + { + "epoch": 0.54344, + "grad_norm": 1.875, + "grad_norm_var": 0.004713694254557292, + "learning_rate": 0.0001, + "loss": 3.8833, + "loss/crossentropy": 2.0651434659957886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2342069447040558, + "step": 27172 + }, + { + "epoch": 0.54348, + "grad_norm": 1.984375, + "grad_norm_var": 0.005125935872395833, + "learning_rate": 0.0001, + "loss": 3.7693, + "loss/crossentropy": 1.7311297059059143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17116959393024445, + "step": 27174 + }, + { + "epoch": 0.54352, + "grad_norm": 1.8125, + "grad_norm_var": 0.0042803446451822914, + "learning_rate": 0.0001, + "loss": 3.9136, + "loss/crossentropy": 1.65495365858078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756131500005722, + "step": 27176 + }, + { + "epoch": 0.54356, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0039713541666666664, + "learning_rate": 0.0001, + "loss": 3.8718, + "loss/crossentropy": 2.1134061217308044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19070632755756378, + "step": 27178 + }, + { + "epoch": 0.5436, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0037261962890625, + "learning_rate": 0.0001, + "loss": 3.8315, + "loss/crossentropy": 1.8080366849899292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1905672550201416, + "step": 27180 + }, + { + "epoch": 0.54364, + "grad_norm": 2.0, + "grad_norm_var": 0.004550933837890625, + "learning_rate": 0.0001, + "loss": 4.1646, + "loss/crossentropy": 2.1257325410842896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20127511024475098, + "step": 27182 + }, + { + "epoch": 0.54368, + "grad_norm": 1.9375, + "grad_norm_var": 0.0030263264973958335, + "learning_rate": 0.0001, + "loss": 4.2156, + "loss/crossentropy": 2.3125778436660767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21088162064552307, + "step": 27184 + }, + { + "epoch": 0.54372, + "grad_norm": 2.125, + "grad_norm_var": 0.005777740478515625, + "learning_rate": 0.0001, + "loss": 4.14, + "loss/crossentropy": 2.21799373626709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073218673467636, + "step": 27186 + }, + { + "epoch": 0.54376, + "grad_norm": 1.90625, + "grad_norm_var": 0.005686187744140625, + "learning_rate": 0.0001, + "loss": 3.8577, + "loss/crossentropy": 1.8872935771942139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17742664366960526, + "step": 27188 + }, + { + "epoch": 0.5438, + "grad_norm": 1.9375, + "grad_norm_var": 0.0054433186848958336, + "learning_rate": 0.0001, + "loss": 3.7985, + "loss/crossentropy": 1.6956111788749695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17246189713478088, + "step": 27190 + }, + { + "epoch": 0.54384, + "grad_norm": 2.03125, + "grad_norm_var": 0.0057065327962239586, + "learning_rate": 0.0001, + "loss": 4.0475, + "loss/crossentropy": 2.0606382489204407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049379125237465, + "step": 27192 + }, + { + "epoch": 0.54388, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005092112223307291, + "learning_rate": 0.0001, + "loss": 3.9601, + "loss/crossentropy": 2.1031752824783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19281570613384247, + "step": 27194 + }, + { + "epoch": 0.54392, + "grad_norm": 1.8125, + "grad_norm_var": 0.006306711832682292, + "learning_rate": 0.0001, + "loss": 4.1294, + "loss/crossentropy": 2.113471269607544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17625518888235092, + "step": 27196 + }, + { + "epoch": 0.54396, + "grad_norm": 1.90625, + "grad_norm_var": 0.0061279296875, + "learning_rate": 0.0001, + "loss": 4.1399, + "loss/crossentropy": 2.0233633518218994, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19023866951465607, + "step": 27198 + }, + { + "epoch": 0.544, + "grad_norm": 2.0, + "grad_norm_var": 0.006453196207682292, + "learning_rate": 0.0001, + "loss": 4.0167, + "loss/crossentropy": 2.1072750091552734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18891653418540955, + "step": 27200 + }, + { + "epoch": 0.54404, + "grad_norm": 2.265625, + "grad_norm_var": 0.011250559488932292, + "learning_rate": 0.0001, + "loss": 4.0871, + "loss/crossentropy": 2.1586121320724487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19431254267692566, + "step": 27202 + }, + { + "epoch": 0.54408, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01123046875, + "learning_rate": 0.0001, + "loss": 3.9215, + "loss/crossentropy": 1.817845344543457, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17477234452962875, + "step": 27204 + }, + { + "epoch": 0.54412, + "grad_norm": 1.796875, + "grad_norm_var": 0.0123687744140625, + "learning_rate": 0.0001, + "loss": 3.7602, + "loss/crossentropy": 2.0410149693489075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700051844120026, + "step": 27206 + }, + { + "epoch": 0.54416, + "grad_norm": 1.9375, + "grad_norm_var": 0.011563873291015625, + "learning_rate": 0.0001, + "loss": 4.1133, + "loss/crossentropy": 2.096368670463562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20101771503686905, + "step": 27208 + }, + { + "epoch": 0.5442, + "grad_norm": 1.96875, + "grad_norm_var": 0.013061269124348959, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 2.3180510997772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23763899505138397, + "step": 27210 + }, + { + "epoch": 0.54424, + "grad_norm": 1.96875, + "grad_norm_var": 0.011986287434895833, + "learning_rate": 0.0001, + "loss": 4.0617, + "loss/crossentropy": 2.2700339555740356, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21923137456178665, + "step": 27212 + }, + { + "epoch": 0.54428, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011659495035807292, + "learning_rate": 0.0001, + "loss": 3.8744, + "loss/crossentropy": 2.0254225730895996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1879374161362648, + "step": 27214 + }, + { + "epoch": 0.54432, + "grad_norm": 1.8125, + "grad_norm_var": 0.012401326497395834, + "learning_rate": 0.0001, + "loss": 4.0095, + "loss/crossentropy": 2.0617589950561523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20201390981674194, + "step": 27216 + }, + { + "epoch": 0.54436, + "grad_norm": 1.9375, + "grad_norm_var": 0.005147298177083333, + "learning_rate": 0.0001, + "loss": 4.1374, + "loss/crossentropy": 2.1432749032974243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21799273788928986, + "step": 27218 + }, + { + "epoch": 0.5444, + "grad_norm": 2.015625, + "grad_norm_var": 0.005622355143229166, + "learning_rate": 0.0001, + "loss": 4.2461, + "loss/crossentropy": 1.948439359664917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1876726821064949, + "step": 27220 + }, + { + "epoch": 0.54444, + "grad_norm": 2.015625, + "grad_norm_var": 0.004518381754557292, + "learning_rate": 0.0001, + "loss": 4.0501, + "loss/crossentropy": 2.2085187435150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106829285621643, + "step": 27222 + }, + { + "epoch": 0.54448, + "grad_norm": 2.0, + "grad_norm_var": 0.005475870768229167, + "learning_rate": 0.0001, + "loss": 3.8899, + "loss/crossentropy": 2.0642993450164795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151441007852554, + "step": 27224 + }, + { + "epoch": 0.54452, + "grad_norm": 2.125, + "grad_norm_var": 0.009427897135416667, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.8351057767868042, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17997722327709198, + "step": 27226 + }, + { + "epoch": 0.54456, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009903971354166667, + "learning_rate": 0.0001, + "loss": 3.8896, + "loss/crossentropy": 2.033097803592682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18373706936836243, + "step": 27228 + }, + { + "epoch": 0.5446, + "grad_norm": 1.890625, + "grad_norm_var": 0.009643300374348959, + "learning_rate": 0.0001, + "loss": 4.076, + "loss/crossentropy": 1.9884595274925232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19420110434293747, + "step": 27230 + }, + { + "epoch": 0.54464, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009258778889973958, + "learning_rate": 0.0001, + "loss": 3.7903, + "loss/crossentropy": 1.9873828887939453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19645929336547852, + "step": 27232 + }, + { + "epoch": 0.54468, + "grad_norm": 1.828125, + "grad_norm_var": 0.010298411051432291, + "learning_rate": 0.0001, + "loss": 3.8454, + "loss/crossentropy": 1.9592986702919006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20220571756362915, + "step": 27234 + }, + { + "epoch": 0.54472, + "grad_norm": 1.890625, + "grad_norm_var": 0.009480794270833334, + "learning_rate": 0.0001, + "loss": 3.8467, + "loss/crossentropy": 2.0207183957099915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18422196060419083, + "step": 27236 + }, + { + "epoch": 0.54476, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009740193684895834, + "learning_rate": 0.0001, + "loss": 3.753, + "loss/crossentropy": 1.8570082187652588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17869313061237335, + "step": 27238 + }, + { + "epoch": 0.5448, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008882649739583333, + "learning_rate": 0.0001, + "loss": 3.7697, + "loss/crossentropy": 2.043560802936554, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18643952161073685, + "step": 27240 + }, + { + "epoch": 0.54484, + "grad_norm": 2.046875, + "grad_norm_var": 0.0052874247233072914, + "learning_rate": 0.0001, + "loss": 4.0459, + "loss/crossentropy": 2.2474546432495117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2210846170783043, + "step": 27242 + }, + { + "epoch": 0.54488, + "grad_norm": 1.984375, + "grad_norm_var": 0.005882771809895834, + "learning_rate": 0.0001, + "loss": 4.0702, + "loss/crossentropy": 2.4019227027893066, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284153521060944, + "step": 27244 + }, + { + "epoch": 0.54492, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006367746988932292, + "learning_rate": 0.0001, + "loss": 3.9849, + "loss/crossentropy": 2.4402183294296265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2066793590784073, + "step": 27246 + }, + { + "epoch": 0.54496, + "grad_norm": 1.7734375, + "grad_norm_var": 0.006623331705729167, + "learning_rate": 0.0001, + "loss": 3.7691, + "loss/crossentropy": 1.6989346742630005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1731812208890915, + "step": 27248 + }, + { + "epoch": 0.545, + "grad_norm": 1.7890625, + "grad_norm_var": 0.007126617431640625, + "learning_rate": 0.0001, + "loss": 3.7411, + "loss/crossentropy": 1.658067524433136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17113492637872696, + "step": 27250 + }, + { + "epoch": 0.54504, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007287343343098958, + "learning_rate": 0.0001, + "loss": 4.0801, + "loss/crossentropy": 2.032026529312134, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18906760960817337, + "step": 27252 + }, + { + "epoch": 0.54508, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0065185546875, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 2.0466246008872986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963697373867035, + "step": 27254 + }, + { + "epoch": 0.54512, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006906890869140625, + "learning_rate": 0.0001, + "loss": 4.1727, + "loss/crossentropy": 2.199449300765991, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19188550859689713, + "step": 27256 + }, + { + "epoch": 0.54516, + "grad_norm": 1.875, + "grad_norm_var": 0.007542928059895833, + "learning_rate": 0.0001, + "loss": 4.1578, + "loss/crossentropy": 2.10872745513916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20023853331804276, + "step": 27258 + }, + { + "epoch": 0.5452, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007010904947916666, + "learning_rate": 0.0001, + "loss": 4.1923, + "loss/crossentropy": 2.2712149620056152, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19052613526582718, + "step": 27260 + }, + { + "epoch": 0.54524, + "grad_norm": 1.953125, + "grad_norm_var": 0.008156077067057291, + "learning_rate": 0.0001, + "loss": 3.6014, + "loss/crossentropy": 1.61397385597229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.155337356030941, + "step": 27262 + }, + { + "epoch": 0.54528, + "grad_norm": 2.0, + "grad_norm_var": 0.008194986979166667, + "learning_rate": 0.0001, + "loss": 4.1433, + "loss/crossentropy": 1.8988584876060486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17318273335695267, + "step": 27264 + }, + { + "epoch": 0.54532, + "grad_norm": 1.796875, + "grad_norm_var": 0.007298787434895833, + "learning_rate": 0.0001, + "loss": 4.1592, + "loss/crossentropy": 1.9996931552886963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18785406649112701, + "step": 27266 + }, + { + "epoch": 0.54536, + "grad_norm": 1.96875, + "grad_norm_var": 0.007462565104166667, + "learning_rate": 0.0001, + "loss": 4.2391, + "loss/crossentropy": 2.1527179479599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20097015798091888, + "step": 27268 + }, + { + "epoch": 0.5454, + "grad_norm": 2.03125, + "grad_norm_var": 0.008609771728515625, + "learning_rate": 0.0001, + "loss": 4.051, + "loss/crossentropy": 2.1786444187164307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20732732862234116, + "step": 27270 + }, + { + "epoch": 0.54544, + "grad_norm": 1.875, + "grad_norm_var": 0.007673136393229167, + "learning_rate": 0.0001, + "loss": 3.8147, + "loss/crossentropy": 1.8255316019058228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18340704590082169, + "step": 27272 + }, + { + "epoch": 0.54548, + "grad_norm": 2.0625, + "grad_norm_var": 0.008046213785807292, + "learning_rate": 0.0001, + "loss": 4.0119, + "loss/crossentropy": 2.1311771273612976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206636942923069, + "step": 27274 + }, + { + "epoch": 0.54552, + "grad_norm": 1.7265625, + "grad_norm_var": 0.010113271077473958, + "learning_rate": 0.0001, + "loss": 3.752, + "loss/crossentropy": 1.7570669651031494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15514526516199112, + "step": 27276 + }, + { + "epoch": 0.54556, + "grad_norm": 1.984375, + "grad_norm_var": 0.00963134765625, + "learning_rate": 0.0001, + "loss": 4.3659, + "loss/crossentropy": 2.407896041870117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22130534052848816, + "step": 27278 + }, + { + "epoch": 0.5456, + "grad_norm": 1.921875, + "grad_norm_var": 0.009891764322916666, + "learning_rate": 0.0001, + "loss": 4.1536, + "loss/crossentropy": 2.1037758588790894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23098163306713104, + "step": 27280 + }, + { + "epoch": 0.54564, + "grad_norm": 1.875, + "grad_norm_var": 0.008847808837890625, + "learning_rate": 0.0001, + "loss": 4.1011, + "loss/crossentropy": 2.213571786880493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.210727259516716, + "step": 27282 + }, + { + "epoch": 0.54568, + "grad_norm": 1.828125, + "grad_norm_var": 0.00955810546875, + "learning_rate": 0.0001, + "loss": 3.997, + "loss/crossentropy": 1.9279372096061707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19715701043605804, + "step": 27284 + }, + { + "epoch": 0.54572, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009372711181640625, + "learning_rate": 0.0001, + "loss": 4.0212, + "loss/crossentropy": 2.257995128631592, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20844868570566177, + "step": 27286 + }, + { + "epoch": 0.54576, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009694163004557292, + "learning_rate": 0.0001, + "loss": 3.8984, + "loss/crossentropy": 1.7289800643920898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1757895052433014, + "step": 27288 + }, + { + "epoch": 0.5458, + "grad_norm": 2.0, + "grad_norm_var": 0.009930165608723958, + "learning_rate": 0.0001, + "loss": 3.767, + "loss/crossentropy": 1.9156250953674316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17408546805381775, + "step": 27290 + }, + { + "epoch": 0.54584, + "grad_norm": 1.96875, + "grad_norm_var": 0.007330067952473958, + "learning_rate": 0.0001, + "loss": 4.153, + "loss/crossentropy": 2.2218767404556274, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20564305782318115, + "step": 27292 + }, + { + "epoch": 0.54588, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007767740885416667, + "learning_rate": 0.0001, + "loss": 4.3436, + "loss/crossentropy": 2.2322014570236206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19707325100898743, + "step": 27294 + }, + { + "epoch": 0.54592, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007575480143229166, + "learning_rate": 0.0001, + "loss": 3.9286, + "loss/crossentropy": 1.9559943675994873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2009485438466072, + "step": 27296 + }, + { + "epoch": 0.54596, + "grad_norm": 1.921875, + "grad_norm_var": 0.007496897379557292, + "learning_rate": 0.0001, + "loss": 3.8224, + "loss/crossentropy": 2.322990119457245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21545371413230896, + "step": 27298 + }, + { + "epoch": 0.546, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007698567708333334, + "learning_rate": 0.0001, + "loss": 3.7273, + "loss/crossentropy": 1.7752234935760498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18764031678438187, + "step": 27300 + }, + { + "epoch": 0.54604, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008317057291666667, + "learning_rate": 0.0001, + "loss": 4.1671, + "loss/crossentropy": 2.2797446250915527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20618843287229538, + "step": 27302 + }, + { + "epoch": 0.54608, + "grad_norm": 1.953125, + "grad_norm_var": 0.009468332926432291, + "learning_rate": 0.0001, + "loss": 4.0499, + "loss/crossentropy": 2.1392215490341187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19076371937990189, + "step": 27304 + }, + { + "epoch": 0.54612, + "grad_norm": 1.9296875, + "grad_norm_var": 0.006852213541666667, + "learning_rate": 0.0001, + "loss": 3.9362, + "loss/crossentropy": 2.065139055252075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20819596201181412, + "step": 27306 + }, + { + "epoch": 0.54616, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007281239827473958, + "learning_rate": 0.0001, + "loss": 3.8854, + "loss/crossentropy": 1.90589839220047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18194814771413803, + "step": 27308 + }, + { + "epoch": 0.5462, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0071489969889322914, + "learning_rate": 0.0001, + "loss": 4.0056, + "loss/crossentropy": 2.18922221660614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913178890943527, + "step": 27310 + }, + { + "epoch": 0.54624, + "grad_norm": 1.90625, + "grad_norm_var": 0.006357574462890625, + "learning_rate": 0.0001, + "loss": 3.9098, + "loss/crossentropy": 2.3256269693374634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674752444028854, + "step": 27312 + }, + { + "epoch": 0.54628, + "grad_norm": 2.015625, + "grad_norm_var": 0.006859334309895834, + "learning_rate": 0.0001, + "loss": 3.8007, + "loss/crossentropy": 2.2145326733589172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18575318902730942, + "step": 27314 + }, + { + "epoch": 0.54632, + "grad_norm": 1.953125, + "grad_norm_var": 0.0065419514973958336, + "learning_rate": 0.0001, + "loss": 4.27, + "loss/crossentropy": 2.326914429664612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115549072623253, + "step": 27316 + }, + { + "epoch": 0.54636, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006551106770833333, + "learning_rate": 0.0001, + "loss": 4.1064, + "loss/crossentropy": 2.1162266731262207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951146125793457, + "step": 27318 + }, + { + "epoch": 0.5464, + "grad_norm": 1.8671875, + "grad_norm_var": 0.027858225504557292, + "learning_rate": 0.0001, + "loss": 3.929, + "loss/crossentropy": 1.96768057346344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19022534787654877, + "step": 27320 + }, + { + "epoch": 0.54644, + "grad_norm": 1.859375, + "grad_norm_var": 0.028368123372395835, + "learning_rate": 0.0001, + "loss": 3.7685, + "loss/crossentropy": 1.6591554880142212, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16152790188789368, + "step": 27322 + }, + { + "epoch": 0.54648, + "grad_norm": 1.734375, + "grad_norm_var": 0.030863444010416668, + "learning_rate": 0.0001, + "loss": 3.8354, + "loss/crossentropy": 2.3317655324935913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20369040220975876, + "step": 27324 + }, + { + "epoch": 0.54652, + "grad_norm": 1.84375, + "grad_norm_var": 0.032871246337890625, + "learning_rate": 0.0001, + "loss": 3.808, + "loss/crossentropy": 1.9729547500610352, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18059556931257248, + "step": 27326 + }, + { + "epoch": 0.54656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.03397216796875, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 2.0512842535972595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19131098687648773, + "step": 27328 + }, + { + "epoch": 0.5466, + "grad_norm": 2.125, + "grad_norm_var": 0.0363433837890625, + "learning_rate": 0.0001, + "loss": 4.1688, + "loss/crossentropy": 2.385029435157776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064640149474144, + "step": 27330 + }, + { + "epoch": 0.54664, + "grad_norm": 2.0, + "grad_norm_var": 0.0443603515625, + "learning_rate": 0.0001, + "loss": 4.2089, + "loss/crossentropy": 2.069806933403015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19368857145309448, + "step": 27332 + }, + { + "epoch": 0.54668, + "grad_norm": 1.9296875, + "grad_norm_var": 0.043314615885416664, + "learning_rate": 0.0001, + "loss": 3.9046, + "loss/crossentropy": 1.7913403511047363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18388576805591583, + "step": 27334 + }, + { + "epoch": 0.54672, + "grad_norm": 1.890625, + "grad_norm_var": 0.01974665323893229, + "learning_rate": 0.0001, + "loss": 3.9618, + "loss/crossentropy": 1.8095470070838928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16660822927951813, + "step": 27336 + }, + { + "epoch": 0.54676, + "grad_norm": 1.8828125, + "grad_norm_var": 0.019551595052083332, + "learning_rate": 0.0001, + "loss": 3.9662, + "loss/crossentropy": 1.9614168405532837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1717158928513527, + "step": 27338 + }, + { + "epoch": 0.5468, + "grad_norm": 1.90625, + "grad_norm_var": 0.0172607421875, + "learning_rate": 0.0001, + "loss": 4.1897, + "loss/crossentropy": 1.9303128719329834, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18384890258312225, + "step": 27340 + }, + { + "epoch": 0.54684, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014204915364583333, + "learning_rate": 0.0001, + "loss": 3.8205, + "loss/crossentropy": 1.675184428691864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17097727209329605, + "step": 27342 + }, + { + "epoch": 0.54688, + "grad_norm": 2.078125, + "grad_norm_var": 0.013331858317057292, + "learning_rate": 0.0001, + "loss": 3.817, + "loss/crossentropy": 2.0593711137771606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22880251705646515, + "step": 27344 + }, + { + "epoch": 0.54692, + "grad_norm": 1.875, + "grad_norm_var": 0.011424763997395834, + "learning_rate": 0.0001, + "loss": 3.7013, + "loss/crossentropy": 1.79487544298172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1764550730586052, + "step": 27346 + }, + { + "epoch": 0.54696, + "grad_norm": 1.8046875, + "grad_norm_var": 0.00430908203125, + "learning_rate": 0.0001, + "loss": 3.7599, + "loss/crossentropy": 2.2460497617721558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20630930364131927, + "step": 27348 + }, + { + "epoch": 0.547, + "grad_norm": 1.8671875, + "grad_norm_var": 0.024339803059895835, + "learning_rate": 0.0001, + "loss": 3.9512, + "loss/crossentropy": 2.056776225566864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956087201833725, + "step": 27350 + }, + { + "epoch": 0.54704, + "grad_norm": 1.953125, + "grad_norm_var": 0.0264556884765625, + "learning_rate": 0.0001, + "loss": 4.0642, + "loss/crossentropy": 2.1221381425857544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19081637263298035, + "step": 27352 + }, + { + "epoch": 0.54708, + "grad_norm": 1.9375, + "grad_norm_var": 0.025994618733723957, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.1899205446243286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173882901668549, + "step": 27354 + }, + { + "epoch": 0.54712, + "grad_norm": 2.15625, + "grad_norm_var": 0.029012044270833332, + "learning_rate": 0.0001, + "loss": 4.3297, + "loss/crossentropy": 1.9936851859092712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2355274260044098, + "step": 27356 + }, + { + "epoch": 0.54716, + "grad_norm": 1.9140625, + "grad_norm_var": 0.02979100545247396, + "learning_rate": 0.0001, + "loss": 3.882, + "loss/crossentropy": 1.7802082300186157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17976941913366318, + "step": 27358 + }, + { + "epoch": 0.5472, + "grad_norm": 2.0, + "grad_norm_var": 0.029808553059895833, + "learning_rate": 0.0001, + "loss": 3.9911, + "loss/crossentropy": 2.1092435121536255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18953461945056915, + "step": 27360 + }, + { + "epoch": 0.54724, + "grad_norm": 1.828125, + "grad_norm_var": 0.031676991780598955, + "learning_rate": 0.0001, + "loss": 3.735, + "loss/crossentropy": 1.9264041185379028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1695871204137802, + "step": 27362 + }, + { + "epoch": 0.54728, + "grad_norm": 2.015625, + "grad_norm_var": 0.028816731770833333, + "learning_rate": 0.0001, + "loss": 3.724, + "loss/crossentropy": 2.0451253056526184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19317582994699478, + "step": 27364 + }, + { + "epoch": 0.54732, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011218007405598958, + "learning_rate": 0.0001, + "loss": 4.0508, + "loss/crossentropy": 2.084929406642914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20370616763830185, + "step": 27366 + }, + { + "epoch": 0.54736, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008654530843098958, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 2.1887794733047485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21116343885660172, + "step": 27368 + }, + { + "epoch": 0.5474, + "grad_norm": 1.875, + "grad_norm_var": 0.011229451497395833, + "learning_rate": 0.0001, + "loss": 3.7472, + "loss/crossentropy": 1.9667014479637146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17529867589473724, + "step": 27370 + }, + { + "epoch": 0.54744, + "grad_norm": 2.046875, + "grad_norm_var": 0.005346425374348958, + "learning_rate": 0.0001, + "loss": 4.1537, + "loss/crossentropy": 2.351253390312195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23024218529462814, + "step": 27372 + }, + { + "epoch": 0.54748, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006089019775390625, + "learning_rate": 0.0001, + "loss": 3.9757, + "loss/crossentropy": 1.9090047478675842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17822134494781494, + "step": 27374 + }, + { + "epoch": 0.54752, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005516560872395834, + "learning_rate": 0.0001, + "loss": 3.9657, + "loss/crossentropy": 2.1193515062332153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953936368227005, + "step": 27376 + }, + { + "epoch": 0.54756, + "grad_norm": 1.875, + "grad_norm_var": 0.005230458577473959, + "learning_rate": 0.0001, + "loss": 3.9614, + "loss/crossentropy": 1.8148779273033142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18782274425029755, + "step": 27378 + }, + { + "epoch": 0.5476, + "grad_norm": 2.09375, + "grad_norm_var": 0.0066640218098958336, + "learning_rate": 0.0001, + "loss": 3.9958, + "loss/crossentropy": 1.8987022042274475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1779102012515068, + "step": 27380 + }, + { + "epoch": 0.54764, + "grad_norm": 1.828125, + "grad_norm_var": 0.0071604410807291664, + "learning_rate": 0.0001, + "loss": 3.8742, + "loss/crossentropy": 1.7504222989082336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.173823744058609, + "step": 27382 + }, + { + "epoch": 0.54768, + "grad_norm": 1.8046875, + "grad_norm_var": 0.008239491780598959, + "learning_rate": 0.0001, + "loss": 3.8653, + "loss/crossentropy": 1.9609686732292175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18690945953130722, + "step": 27384 + }, + { + "epoch": 0.54772, + "grad_norm": 1.875, + "grad_norm_var": 0.006639607747395833, + "learning_rate": 0.0001, + "loss": 3.8654, + "loss/crossentropy": 1.613632321357727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15942876040935516, + "step": 27386 + }, + { + "epoch": 0.54776, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006379191080729167, + "learning_rate": 0.0001, + "loss": 3.9697, + "loss/crossentropy": 2.052034914493561, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18169156461954117, + "step": 27388 + }, + { + "epoch": 0.5478, + "grad_norm": 1.9375, + "grad_norm_var": 0.005509185791015625, + "learning_rate": 0.0001, + "loss": 3.8938, + "loss/crossentropy": 1.9778786301612854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21691841632127762, + "step": 27390 + }, + { + "epoch": 0.54784, + "grad_norm": 2.015625, + "grad_norm_var": 0.010416412353515625, + "learning_rate": 0.0001, + "loss": 4.2751, + "loss/crossentropy": 2.272818922996521, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.26315005123615265, + "step": 27392 + }, + { + "epoch": 0.54788, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010335286458333334, + "learning_rate": 0.0001, + "loss": 3.9688, + "loss/crossentropy": 1.8213757872581482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1901681274175644, + "step": 27394 + }, + { + "epoch": 0.54792, + "grad_norm": 1.9375, + "grad_norm_var": 0.0095855712890625, + "learning_rate": 0.0001, + "loss": 4.131, + "loss/crossentropy": 2.0346380472183228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18924283981323242, + "step": 27396 + }, + { + "epoch": 0.54796, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009738922119140625, + "learning_rate": 0.0001, + "loss": 4.1628, + "loss/crossentropy": 1.9133538603782654, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2124219387769699, + "step": 27398 + }, + { + "epoch": 0.548, + "grad_norm": 1.90625, + "grad_norm_var": 0.008540852864583334, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.141755998134613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20012597739696503, + "step": 27400 + }, + { + "epoch": 0.54804, + "grad_norm": 1.984375, + "grad_norm_var": 0.008104451497395833, + "learning_rate": 0.0001, + "loss": 3.9103, + "loss/crossentropy": 2.031430244445801, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892971396446228, + "step": 27402 + }, + { + "epoch": 0.54808, + "grad_norm": 1.953125, + "grad_norm_var": 0.0066220601399739586, + "learning_rate": 0.0001, + "loss": 4.1763, + "loss/crossentropy": 2.3186575174331665, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.231618270277977, + "step": 27404 + }, + { + "epoch": 0.54812, + "grad_norm": 1.953125, + "grad_norm_var": 0.007496897379557292, + "learning_rate": 0.0001, + "loss": 3.7874, + "loss/crossentropy": 1.719622254371643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1646794229745865, + "step": 27406 + }, + { + "epoch": 0.54816, + "grad_norm": 2.0625, + "grad_norm_var": 0.008379872639973958, + "learning_rate": 0.0001, + "loss": 4.5104, + "loss/crossentropy": 2.099879503250122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20306643843650818, + "step": 27408 + }, + { + "epoch": 0.5482, + "grad_norm": 2.03125, + "grad_norm_var": 0.008553059895833333, + "learning_rate": 0.0001, + "loss": 4.1404, + "loss/crossentropy": 2.104840338230133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727242529392242, + "step": 27410 + }, + { + "epoch": 0.54824, + "grad_norm": 1.890625, + "grad_norm_var": 0.009256744384765625, + "learning_rate": 0.0001, + "loss": 4.026, + "loss/crossentropy": 2.3337541818618774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19424156099557877, + "step": 27412 + }, + { + "epoch": 0.54828, + "grad_norm": 1.7578125, + "grad_norm_var": 0.01055908203125, + "learning_rate": 0.0001, + "loss": 3.8915, + "loss/crossentropy": 1.751102864742279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1715778112411499, + "step": 27414 + }, + { + "epoch": 0.54832, + "grad_norm": 1.890625, + "grad_norm_var": 0.0109771728515625, + "learning_rate": 0.0001, + "loss": 4.2044, + "loss/crossentropy": 2.2091389894485474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2181248739361763, + "step": 27416 + }, + { + "epoch": 0.54836, + "grad_norm": 1.984375, + "grad_norm_var": 0.011226145426432292, + "learning_rate": 0.0001, + "loss": 4.0839, + "loss/crossentropy": 2.1870399713516235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20490305125713348, + "step": 27418 + }, + { + "epoch": 0.5484, + "grad_norm": 1.9765625, + "grad_norm_var": 0.011344146728515626, + "learning_rate": 0.0001, + "loss": 4.0857, + "loss/crossentropy": 2.4501689672470093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22984202951192856, + "step": 27420 + }, + { + "epoch": 0.54844, + "grad_norm": 1.84375, + "grad_norm_var": 0.011771392822265626, + "learning_rate": 0.0001, + "loss": 3.9126, + "loss/crossentropy": 1.8921697735786438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1749446839094162, + "step": 27422 + }, + { + "epoch": 0.54848, + "grad_norm": 1.828125, + "grad_norm_var": 0.008036041259765625, + "learning_rate": 0.0001, + "loss": 3.8426, + "loss/crossentropy": 1.6973050236701965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1689457967877388, + "step": 27424 + }, + { + "epoch": 0.54852, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006257883707682292, + "learning_rate": 0.0001, + "loss": 3.9369, + "loss/crossentropy": 1.7454912066459656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17953843623399734, + "step": 27426 + }, + { + "epoch": 0.54856, + "grad_norm": 2.15625, + "grad_norm_var": 0.0097412109375, + "learning_rate": 0.0001, + "loss": 4.2128, + "loss/crossentropy": 2.327135920524597, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2150314748287201, + "step": 27428 + }, + { + "epoch": 0.5486, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008746083577473958, + "learning_rate": 0.0001, + "loss": 4.1777, + "loss/crossentropy": 2.273073196411133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20920293778181076, + "step": 27430 + }, + { + "epoch": 0.54864, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0080718994140625, + "learning_rate": 0.0001, + "loss": 3.9236, + "loss/crossentropy": 1.474639356136322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16699142009019852, + "step": 27432 + }, + { + "epoch": 0.54868, + "grad_norm": 1.984375, + "grad_norm_var": 0.01053466796875, + "learning_rate": 0.0001, + "loss": 4.1162, + "loss/crossentropy": 2.116548180580139, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.225176103413105, + "step": 27434 + }, + { + "epoch": 0.54872, + "grad_norm": 1.953125, + "grad_norm_var": 0.013060506184895833, + "learning_rate": 0.0001, + "loss": 3.7837, + "loss/crossentropy": 2.1261587142944336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171497762203217, + "step": 27436 + }, + { + "epoch": 0.54876, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012074534098307292, + "learning_rate": 0.0001, + "loss": 4.1137, + "loss/crossentropy": 2.0924129486083984, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012445107102394, + "step": 27438 + }, + { + "epoch": 0.5488, + "grad_norm": 1.953125, + "grad_norm_var": 0.011008453369140626, + "learning_rate": 0.0001, + "loss": 3.934, + "loss/crossentropy": 1.7499247789382935, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17569267749786377, + "step": 27440 + }, + { + "epoch": 0.54884, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0103424072265625, + "learning_rate": 0.0001, + "loss": 3.7889, + "loss/crossentropy": 2.0465195178985596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17263885587453842, + "step": 27442 + }, + { + "epoch": 0.54888, + "grad_norm": 1.90625, + "grad_norm_var": 0.0085113525390625, + "learning_rate": 0.0001, + "loss": 3.873, + "loss/crossentropy": 1.8351019620895386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929614618420601, + "step": 27444 + }, + { + "epoch": 0.54892, + "grad_norm": 1.875, + "grad_norm_var": 0.008119455973307292, + "learning_rate": 0.0001, + "loss": 3.7952, + "loss/crossentropy": 1.8877743482589722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18555018305778503, + "step": 27446 + }, + { + "epoch": 0.54896, + "grad_norm": 1.921875, + "grad_norm_var": 0.009757486979166667, + "learning_rate": 0.0001, + "loss": 3.7006, + "loss/crossentropy": 1.7358683943748474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18054651468992233, + "step": 27448 + }, + { + "epoch": 0.549, + "grad_norm": 1.796875, + "grad_norm_var": 0.009178670247395833, + "learning_rate": 0.0001, + "loss": 3.9719, + "loss/crossentropy": 2.0366458892822266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056865617632866, + "step": 27450 + }, + { + "epoch": 0.54904, + "grad_norm": 2.109375, + "grad_norm_var": 0.0085113525390625, + "learning_rate": 0.0001, + "loss": 4.0996, + "loss/crossentropy": 2.078625202178955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962544023990631, + "step": 27452 + }, + { + "epoch": 0.54908, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009590657552083333, + "learning_rate": 0.0001, + "loss": 3.6335, + "loss/crossentropy": 1.6732112765312195, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1725183129310608, + "step": 27454 + }, + { + "epoch": 0.54912, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009419759114583334, + "learning_rate": 0.0001, + "loss": 3.513, + "loss/crossentropy": 1.8087647557258606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810692474246025, + "step": 27456 + }, + { + "epoch": 0.54916, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0110504150390625, + "learning_rate": 0.0001, + "loss": 4.2636, + "loss/crossentropy": 2.087713837623596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20880009979009628, + "step": 27458 + }, + { + "epoch": 0.5492, + "grad_norm": 2.015625, + "grad_norm_var": 0.010367838541666667, + "learning_rate": 0.0001, + "loss": 4.1659, + "loss/crossentropy": 1.8009834289550781, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19154398143291473, + "step": 27460 + }, + { + "epoch": 0.54924, + "grad_norm": 1.90625, + "grad_norm_var": 0.010994211832682291, + "learning_rate": 0.0001, + "loss": 4.1084, + "loss/crossentropy": 2.272579550743103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21466071158647537, + "step": 27462 + }, + { + "epoch": 0.54928, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010518391927083334, + "learning_rate": 0.0001, + "loss": 3.9836, + "loss/crossentropy": 2.001379668712616, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1841510310769081, + "step": 27464 + }, + { + "epoch": 0.54932, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008973948160807292, + "learning_rate": 0.0001, + "loss": 4.2364, + "loss/crossentropy": 2.197210907936096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20746250450611115, + "step": 27466 + }, + { + "epoch": 0.54936, + "grad_norm": 2.03125, + "grad_norm_var": 0.008153279622395834, + "learning_rate": 0.0001, + "loss": 4.0393, + "loss/crossentropy": 1.6839552521705627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475190341472626, + "step": 27468 + }, + { + "epoch": 0.5494, + "grad_norm": 2.640625, + "grad_norm_var": 0.03849995930989583, + "learning_rate": 0.0001, + "loss": 3.8291, + "loss/crossentropy": 1.8280132412910461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640739262104034, + "step": 27470 + }, + { + "epoch": 0.54944, + "grad_norm": 1.96875, + "grad_norm_var": 0.03960367838541667, + "learning_rate": 0.0001, + "loss": 3.8258, + "loss/crossentropy": 2.082128942012787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20284345746040344, + "step": 27472 + }, + { + "epoch": 0.54948, + "grad_norm": 1.7890625, + "grad_norm_var": 0.04104181925455729, + "learning_rate": 0.0001, + "loss": 4.1464, + "loss/crossentropy": 2.346489429473877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1972089186310768, + "step": 27474 + }, + { + "epoch": 0.54952, + "grad_norm": 2.0625, + "grad_norm_var": 0.041751861572265625, + "learning_rate": 0.0001, + "loss": 4.1133, + "loss/crossentropy": 2.431715250015259, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2505635917186737, + "step": 27476 + }, + { + "epoch": 0.54956, + "grad_norm": 2.0, + "grad_norm_var": 0.03922907511393229, + "learning_rate": 0.0001, + "loss": 4.1596, + "loss/crossentropy": 2.012192726135254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20177678763866425, + "step": 27478 + }, + { + "epoch": 0.5496, + "grad_norm": 2.015625, + "grad_norm_var": 0.040726725260416666, + "learning_rate": 0.0001, + "loss": 3.9623, + "loss/crossentropy": 2.1662270426750183, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1866678223013878, + "step": 27480 + }, + { + "epoch": 0.54964, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04033381144205729, + "learning_rate": 0.0001, + "loss": 4.1946, + "loss/crossentropy": 2.331666111946106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2118167132139206, + "step": 27482 + }, + { + "epoch": 0.54968, + "grad_norm": 1.8046875, + "grad_norm_var": 0.042429351806640626, + "learning_rate": 0.0001, + "loss": 4.084, + "loss/crossentropy": 2.371376156806946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20595970004796982, + "step": 27484 + }, + { + "epoch": 0.54972, + "grad_norm": 1.890625, + "grad_norm_var": 0.008780670166015626, + "learning_rate": 0.0001, + "loss": 3.9568, + "loss/crossentropy": 1.8372794389724731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18964159488677979, + "step": 27486 + }, + { + "epoch": 0.54976, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007865142822265626, + "learning_rate": 0.0001, + "loss": 3.9367, + "loss/crossentropy": 1.9252594709396362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19232141971588135, + "step": 27488 + }, + { + "epoch": 0.5498, + "grad_norm": 1.953125, + "grad_norm_var": 0.00655517578125, + "learning_rate": 0.0001, + "loss": 3.9779, + "loss/crossentropy": 1.9566543102264404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20439066737890244, + "step": 27490 + }, + { + "epoch": 0.54984, + "grad_norm": 2.03125, + "grad_norm_var": 0.006776682535807292, + "learning_rate": 0.0001, + "loss": 3.8192, + "loss/crossentropy": 1.8082122802734375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1651742085814476, + "step": 27492 + }, + { + "epoch": 0.54988, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006834920247395833, + "learning_rate": 0.0001, + "loss": 4.0115, + "loss/crossentropy": 1.9120944738388062, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.181972436606884, + "step": 27494 + }, + { + "epoch": 0.54992, + "grad_norm": 1.8359375, + "grad_norm_var": 0.005812327067057292, + "learning_rate": 0.0001, + "loss": 3.9873, + "loss/crossentropy": 2.134087026119232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19440361112356186, + "step": 27496 + }, + { + "epoch": 0.54996, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007039388020833333, + "learning_rate": 0.0001, + "loss": 3.6912, + "loss/crossentropy": 1.8519493341445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16762658208608627, + "step": 27498 + }, + { + "epoch": 0.55, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0065093994140625, + "learning_rate": 0.0001, + "loss": 3.6277, + "loss/crossentropy": 2.10863196849823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21393945068120956, + "step": 27500 + }, + { + "epoch": 0.55004, + "grad_norm": 2.140625, + "grad_norm_var": 0.011252593994140626, + "learning_rate": 0.0001, + "loss": 4.3002, + "loss/crossentropy": 2.226866364479065, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21370790153741837, + "step": 27502 + }, + { + "epoch": 0.55008, + "grad_norm": 1.796875, + "grad_norm_var": 0.011749013264973959, + "learning_rate": 0.0001, + "loss": 3.8337, + "loss/crossentropy": 1.5916427373886108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16826944053173065, + "step": 27504 + }, + { + "epoch": 0.55012, + "grad_norm": 1.9375, + "grad_norm_var": 0.011677805582682292, + "learning_rate": 0.0001, + "loss": 4.2064, + "loss/crossentropy": 2.212289810180664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20235298573970795, + "step": 27506 + }, + { + "epoch": 0.55016, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009698232014973959, + "learning_rate": 0.0001, + "loss": 3.8529, + "loss/crossentropy": 1.9228017926216125, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19821560382843018, + "step": 27508 + }, + { + "epoch": 0.5502, + "grad_norm": 1.953125, + "grad_norm_var": 0.010247548421223959, + "learning_rate": 0.0001, + "loss": 3.9431, + "loss/crossentropy": 1.8887941241264343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20294177532196045, + "step": 27510 + }, + { + "epoch": 0.55024, + "grad_norm": 1.921875, + "grad_norm_var": 0.010456339518229166, + "learning_rate": 0.0001, + "loss": 4.2666, + "loss/crossentropy": 2.2699393033981323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22084251046180725, + "step": 27512 + }, + { + "epoch": 0.55028, + "grad_norm": 2.03125, + "grad_norm_var": 0.008101145426432291, + "learning_rate": 0.0001, + "loss": 4.0852, + "loss/crossentropy": 1.9771258234977722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1795499324798584, + "step": 27514 + }, + { + "epoch": 0.55032, + "grad_norm": 1.828125, + "grad_norm_var": 0.007990519205729166, + "learning_rate": 0.0001, + "loss": 3.7655, + "loss/crossentropy": 2.0192587971687317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19595889747142792, + "step": 27516 + }, + { + "epoch": 0.55036, + "grad_norm": 1.8671875, + "grad_norm_var": 5.339900461832682, + "learning_rate": 0.0001, + "loss": 3.9772, + "loss/crossentropy": 1.9080755710601807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18093246221542358, + "step": 27518 + }, + { + "epoch": 0.5504, + "grad_norm": 2.0, + "grad_norm_var": 5.307067616780599, + "learning_rate": 0.0001, + "loss": 4.2234, + "loss/crossentropy": 2.237337589263916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21287637203931808, + "step": 27520 + }, + { + "epoch": 0.55044, + "grad_norm": 1.8203125, + "grad_norm_var": 5.324763743082682, + "learning_rate": 0.0001, + "loss": 3.8267, + "loss/crossentropy": 1.986120343208313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16889339685440063, + "step": 27522 + }, + { + "epoch": 0.55048, + "grad_norm": 1.9140625, + "grad_norm_var": 5.322076161702474, + "learning_rate": 0.0001, + "loss": 4.0203, + "loss/crossentropy": 1.9601141810417175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20309537649154663, + "step": 27524 + }, + { + "epoch": 0.55052, + "grad_norm": 1.921875, + "grad_norm_var": 5.332348378499349, + "learning_rate": 0.0001, + "loss": 4.0248, + "loss/crossentropy": 1.8901299238204956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18104882538318634, + "step": 27526 + }, + { + "epoch": 0.55056, + "grad_norm": 1.890625, + "grad_norm_var": 5.3486480712890625, + "learning_rate": 0.0001, + "loss": 4.1152, + "loss/crossentropy": 2.204539656639099, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20814384520053864, + "step": 27528 + }, + { + "epoch": 0.5506, + "grad_norm": 1.6796875, + "grad_norm_var": 5.393595377604167, + "learning_rate": 0.0001, + "loss": 3.6623, + "loss/crossentropy": 1.8643839955329895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1773769110441208, + "step": 27530 + }, + { + "epoch": 0.55064, + "grad_norm": 1.8359375, + "grad_norm_var": 5.410757446289063, + "learning_rate": 0.0001, + "loss": 3.5404, + "loss/crossentropy": 1.66867595911026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1539328545331955, + "step": 27532 + }, + { + "epoch": 0.55068, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0146484375, + "learning_rate": 0.0001, + "loss": 3.87, + "loss/crossentropy": 1.8830199241638184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18702629208564758, + "step": 27534 + }, + { + "epoch": 0.55072, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007793935139973959, + "learning_rate": 0.0001, + "loss": 4.0369, + "loss/crossentropy": 1.8840887546539307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17654889076948166, + "step": 27536 + }, + { + "epoch": 0.55076, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008196767171223958, + "learning_rate": 0.0001, + "loss": 4.2076, + "loss/crossentropy": 1.9453116655349731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726330816745758, + "step": 27538 + }, + { + "epoch": 0.5508, + "grad_norm": 1.953125, + "grad_norm_var": 0.008133951822916667, + "learning_rate": 0.0001, + "loss": 3.9739, + "loss/crossentropy": 2.0022284388542175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18195754289627075, + "step": 27540 + }, + { + "epoch": 0.55084, + "grad_norm": 1.953125, + "grad_norm_var": 0.0090240478515625, + "learning_rate": 0.0001, + "loss": 4.2032, + "loss/crossentropy": 2.164666533470154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20948568731546402, + "step": 27542 + }, + { + "epoch": 0.55088, + "grad_norm": 1.9453125, + "grad_norm_var": 0.00947265625, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 1.8899564146995544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793677881360054, + "step": 27544 + }, + { + "epoch": 0.55092, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008648427327473958, + "learning_rate": 0.0001, + "loss": 4.0669, + "loss/crossentropy": 1.914944589138031, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21741337329149246, + "step": 27546 + }, + { + "epoch": 0.55096, + "grad_norm": 2.140625, + "grad_norm_var": 0.0091552734375, + "learning_rate": 0.0001, + "loss": 3.9413, + "loss/crossentropy": 1.8026766180992126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1790105178952217, + "step": 27548 + }, + { + "epoch": 0.551, + "grad_norm": 2.015625, + "grad_norm_var": 0.0085601806640625, + "learning_rate": 0.0001, + "loss": 4.2072, + "loss/crossentropy": 2.1936656832695007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21415145695209503, + "step": 27550 + }, + { + "epoch": 0.55104, + "grad_norm": 1.890625, + "grad_norm_var": 0.0077056884765625, + "learning_rate": 0.0001, + "loss": 4.0292, + "loss/crossentropy": 1.841277301311493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19467134028673172, + "step": 27552 + }, + { + "epoch": 0.55108, + "grad_norm": 1.96875, + "grad_norm_var": 0.0077056884765625, + "learning_rate": 0.0001, + "loss": 4.1539, + "loss/crossentropy": 1.9853840470314026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18696378916502, + "step": 27554 + }, + { + "epoch": 0.55112, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006740061442057291, + "learning_rate": 0.0001, + "loss": 4.0032, + "loss/crossentropy": 2.213033676147461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19526077806949615, + "step": 27556 + }, + { + "epoch": 0.55116, + "grad_norm": 1.9375, + "grad_norm_var": 0.00665283203125, + "learning_rate": 0.0001, + "loss": 3.9773, + "loss/crossentropy": 1.8020763397216797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1804225742816925, + "step": 27558 + }, + { + "epoch": 0.5512, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005838775634765625, + "learning_rate": 0.0001, + "loss": 4.0551, + "loss/crossentropy": 1.899497926235199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19439035654067993, + "step": 27560 + }, + { + "epoch": 0.55124, + "grad_norm": 2.046875, + "grad_norm_var": 0.00552978515625, + "learning_rate": 0.0001, + "loss": 3.9789, + "loss/crossentropy": 1.9123604893684387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18431273847818375, + "step": 27562 + }, + { + "epoch": 0.55128, + "grad_norm": 2.09375, + "grad_norm_var": 0.004510243733723958, + "learning_rate": 0.0001, + "loss": 4.3084, + "loss/crossentropy": 2.3199894428253174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21417556703090668, + "step": 27564 + }, + { + "epoch": 0.55132, + "grad_norm": 1.8125, + "grad_norm_var": 0.0055653889973958336, + "learning_rate": 0.0001, + "loss": 3.8603, + "loss/crossentropy": 2.3096247911453247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19787238538265228, + "step": 27566 + }, + { + "epoch": 0.55136, + "grad_norm": 1.859375, + "grad_norm_var": 0.005861155192057292, + "learning_rate": 0.0001, + "loss": 4.083, + "loss/crossentropy": 2.2241322994232178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21322394907474518, + "step": 27568 + }, + { + "epoch": 0.5514, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005850982666015625, + "learning_rate": 0.0001, + "loss": 3.8552, + "loss/crossentropy": 2.3283156156539917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21560409665107727, + "step": 27570 + }, + { + "epoch": 0.55144, + "grad_norm": 2.0, + "grad_norm_var": 0.005352528889973959, + "learning_rate": 0.0001, + "loss": 4.0709, + "loss/crossentropy": 2.07872211933136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20099805295467377, + "step": 27572 + }, + { + "epoch": 0.55148, + "grad_norm": 2.0, + "grad_norm_var": 0.005273183186848958, + "learning_rate": 0.0001, + "loss": 4.0868, + "loss/crossentropy": 2.1145899295806885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19824448227882385, + "step": 27574 + }, + { + "epoch": 0.55152, + "grad_norm": 1.640625, + "grad_norm_var": 0.012674967447916666, + "learning_rate": 0.0001, + "loss": 3.4628, + "loss/crossentropy": 1.5884234309196472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1501767933368683, + "step": 27576 + }, + { + "epoch": 0.55156, + "grad_norm": 1.875, + "grad_norm_var": 0.0121002197265625, + "learning_rate": 0.0001, + "loss": 3.6905, + "loss/crossentropy": 1.9045534133911133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19776541739702225, + "step": 27578 + }, + { + "epoch": 0.5516, + "grad_norm": 1.796875, + "grad_norm_var": 0.010337066650390626, + "learning_rate": 0.0001, + "loss": 3.862, + "loss/crossentropy": 1.6315965056419373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15820547938346863, + "step": 27580 + }, + { + "epoch": 0.55164, + "grad_norm": 2.109375, + "grad_norm_var": 0.011790974934895834, + "learning_rate": 0.0001, + "loss": 4.298, + "loss/crossentropy": 2.1012359261512756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19510812312364578, + "step": 27582 + }, + { + "epoch": 0.55168, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012261708577473959, + "learning_rate": 0.0001, + "loss": 3.8833, + "loss/crossentropy": 1.8365219831466675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16663482040166855, + "step": 27584 + }, + { + "epoch": 0.55172, + "grad_norm": 1.9453125, + "grad_norm_var": 0.012181599934895834, + "learning_rate": 0.0001, + "loss": 3.6868, + "loss/crossentropy": 1.6274245977401733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1516888439655304, + "step": 27586 + }, + { + "epoch": 0.55176, + "grad_norm": 1.921875, + "grad_norm_var": 0.011529286702473959, + "learning_rate": 0.0001, + "loss": 3.8555, + "loss/crossentropy": 1.6476022601127625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17268501967191696, + "step": 27588 + }, + { + "epoch": 0.5518, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010400390625, + "learning_rate": 0.0001, + "loss": 3.7764, + "loss/crossentropy": 2.135971188545227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20169296115636826, + "step": 27590 + }, + { + "epoch": 0.55184, + "grad_norm": 1.8046875, + "grad_norm_var": 0.006640625, + "learning_rate": 0.0001, + "loss": 3.7435, + "loss/crossentropy": 2.2080432176589966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20463210344314575, + "step": 27592 + }, + { + "epoch": 0.55188, + "grad_norm": 2.015625, + "grad_norm_var": 0.008131663004557291, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 1.953204333782196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18644939363002777, + "step": 27594 + }, + { + "epoch": 0.55192, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007429758707682292, + "learning_rate": 0.0001, + "loss": 4.1851, + "loss/crossentropy": 1.7578163146972656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816823109984398, + "step": 27596 + }, + { + "epoch": 0.55196, + "grad_norm": 1.7578125, + "grad_norm_var": 0.006193033854166667, + "learning_rate": 0.0001, + "loss": 3.7816, + "loss/crossentropy": 1.9401060938835144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864597350358963, + "step": 27598 + }, + { + "epoch": 0.552, + "grad_norm": 2.140625, + "grad_norm_var": 0.009455362955729166, + "learning_rate": 0.0001, + "loss": 3.8935, + "loss/crossentropy": 2.03184574842453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1829465553164482, + "step": 27600 + }, + { + "epoch": 0.55204, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010365549723307292, + "learning_rate": 0.0001, + "loss": 3.6649, + "loss/crossentropy": 2.15334689617157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18497572094202042, + "step": 27602 + }, + { + "epoch": 0.55208, + "grad_norm": 2.078125, + "grad_norm_var": 0.012918853759765625, + "learning_rate": 0.0001, + "loss": 4.0528, + "loss/crossentropy": 1.894173800945282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1757444217801094, + "step": 27604 + }, + { + "epoch": 0.55212, + "grad_norm": 1.9375, + "grad_norm_var": 0.0134765625, + "learning_rate": 0.0001, + "loss": 3.9785, + "loss/crossentropy": 1.8364281058311462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17648981511592865, + "step": 27606 + }, + { + "epoch": 0.55216, + "grad_norm": 1.875, + "grad_norm_var": 0.012882232666015625, + "learning_rate": 0.0001, + "loss": 3.8651, + "loss/crossentropy": 1.5523585081100464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1650981828570366, + "step": 27608 + }, + { + "epoch": 0.5522, + "grad_norm": 2.0, + "grad_norm_var": 0.013887532552083333, + "learning_rate": 0.0001, + "loss": 3.8205, + "loss/crossentropy": 2.1311700344085693, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19313734769821167, + "step": 27610 + }, + { + "epoch": 0.55224, + "grad_norm": 1.84375, + "grad_norm_var": 0.014149729410807292, + "learning_rate": 0.0001, + "loss": 3.8613, + "loss/crossentropy": 2.23338782787323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21325203776359558, + "step": 27612 + }, + { + "epoch": 0.55228, + "grad_norm": 1.75, + "grad_norm_var": 0.013736724853515625, + "learning_rate": 0.0001, + "loss": 3.7978, + "loss/crossentropy": 2.0709031224250793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17793143540620804, + "step": 27614 + }, + { + "epoch": 0.55232, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010505167643229167, + "learning_rate": 0.0001, + "loss": 3.8325, + "loss/crossentropy": 1.758750557899475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1758880391716957, + "step": 27616 + }, + { + "epoch": 0.55236, + "grad_norm": 1.765625, + "grad_norm_var": 0.010353342692057291, + "learning_rate": 0.0001, + "loss": 3.695, + "loss/crossentropy": 1.8869126439094543, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16803206503391266, + "step": 27618 + }, + { + "epoch": 0.5524, + "grad_norm": 2.046875, + "grad_norm_var": 0.010982004801432292, + "learning_rate": 0.0001, + "loss": 4.1633, + "loss/crossentropy": 2.5184292793273926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22967734932899475, + "step": 27620 + }, + { + "epoch": 0.55244, + "grad_norm": 3.859375, + "grad_norm_var": 0.25396906534830727, + "learning_rate": 0.0001, + "loss": 3.9171, + "loss/crossentropy": 2.2767695784568787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21888408064842224, + "step": 27622 + }, + { + "epoch": 0.55248, + "grad_norm": 2.015625, + "grad_norm_var": 0.2516009012858073, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.015432834625244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19384412467479706, + "step": 27624 + }, + { + "epoch": 0.55252, + "grad_norm": 1.921875, + "grad_norm_var": 0.2524879455566406, + "learning_rate": 0.0001, + "loss": 4.1693, + "loss/crossentropy": 1.9935010075569153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147496461868286, + "step": 27626 + }, + { + "epoch": 0.55256, + "grad_norm": 2.0625, + "grad_norm_var": 0.24866714477539062, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 2.0798122882843018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19859570264816284, + "step": 27628 + }, + { + "epoch": 0.5526, + "grad_norm": 1.890625, + "grad_norm_var": 0.24353001912434896, + "learning_rate": 0.0001, + "loss": 3.7006, + "loss/crossentropy": 1.871802031993866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1913897469639778, + "step": 27630 + }, + { + "epoch": 0.55264, + "grad_norm": 2.0625, + "grad_norm_var": 0.2379351298014323, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 2.184646248817444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939481720328331, + "step": 27632 + }, + { + "epoch": 0.55268, + "grad_norm": 1.9140625, + "grad_norm_var": 0.23611831665039062, + "learning_rate": 0.0001, + "loss": 3.8362, + "loss/crossentropy": 2.0267680883407593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20360219478607178, + "step": 27634 + }, + { + "epoch": 0.55272, + "grad_norm": 1.9921875, + "grad_norm_var": 0.2367754618326823, + "learning_rate": 0.0001, + "loss": 4.2263, + "loss/crossentropy": 2.2218815088272095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20947173982858658, + "step": 27636 + }, + { + "epoch": 0.55276, + "grad_norm": 2.15625, + "grad_norm_var": 0.010217030843098959, + "learning_rate": 0.0001, + "loss": 4.2844, + "loss/crossentropy": 1.982936441898346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1881447434425354, + "step": 27638 + }, + { + "epoch": 0.5528, + "grad_norm": 1.90625, + "grad_norm_var": 0.008601888020833334, + "learning_rate": 0.0001, + "loss": 3.8908, + "loss/crossentropy": 1.8088389039039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1668427437543869, + "step": 27640 + }, + { + "epoch": 0.55284, + "grad_norm": 1.890625, + "grad_norm_var": 0.008735911051432291, + "learning_rate": 0.0001, + "loss": 4.214, + "loss/crossentropy": 2.0886768102645874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1919582262635231, + "step": 27642 + }, + { + "epoch": 0.55288, + "grad_norm": 2.90625, + "grad_norm_var": 0.06743748982747395, + "learning_rate": 0.0001, + "loss": 4.3581, + "loss/crossentropy": 2.0147945880889893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20268514752388, + "step": 27644 + }, + { + "epoch": 0.55292, + "grad_norm": 1.90625, + "grad_norm_var": 0.06584447224934896, + "learning_rate": 0.0001, + "loss": 4.0147, + "loss/crossentropy": 1.8306025266647339, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1949317753314972, + "step": 27646 + }, + { + "epoch": 0.55296, + "grad_norm": 1.8515625, + "grad_norm_var": 0.06684137980143229, + "learning_rate": 0.0001, + "loss": 4.0944, + "loss/crossentropy": 2.201099991798401, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19578266888856888, + "step": 27648 + }, + { + "epoch": 0.553, + "grad_norm": 1.875, + "grad_norm_var": 0.06663792928059896, + "learning_rate": 0.0001, + "loss": 3.9902, + "loss/crossentropy": 2.159493327140808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18864253163337708, + "step": 27650 + }, + { + "epoch": 0.55304, + "grad_norm": 1.828125, + "grad_norm_var": 0.06830215454101562, + "learning_rate": 0.0001, + "loss": 3.9129, + "loss/crossentropy": 2.0548607110977173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820471316576004, + "step": 27652 + }, + { + "epoch": 0.55308, + "grad_norm": 1.8984375, + "grad_norm_var": 0.07153701782226562, + "learning_rate": 0.0001, + "loss": 3.4452, + "loss/crossentropy": 1.8324944972991943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19259792566299438, + "step": 27654 + }, + { + "epoch": 0.55312, + "grad_norm": 2.71875, + "grad_norm_var": 0.10803120930989583, + "learning_rate": 0.0001, + "loss": 4.2944, + "loss/crossentropy": 1.8857054114341736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885119453072548, + "step": 27656 + }, + { + "epoch": 0.55316, + "grad_norm": 1.9921875, + "grad_norm_var": 0.10672200520833333, + "learning_rate": 0.0001, + "loss": 4.1616, + "loss/crossentropy": 2.0037535429000854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1955013945698738, + "step": 27658 + }, + { + "epoch": 0.5532, + "grad_norm": 1.8515625, + "grad_norm_var": 0.05010579427083333, + "learning_rate": 0.0001, + "loss": 3.8527, + "loss/crossentropy": 1.945756196975708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17360417544841766, + "step": 27660 + }, + { + "epoch": 0.55324, + "grad_norm": 1.8125, + "grad_norm_var": 0.05109456380208333, + "learning_rate": 0.0001, + "loss": 3.8767, + "loss/crossentropy": 1.8675792217254639, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17810215801000595, + "step": 27662 + }, + { + "epoch": 0.55328, + "grad_norm": 1.8515625, + "grad_norm_var": 0.05110041300455729, + "learning_rate": 0.0001, + "loss": 4.1631, + "loss/crossentropy": 1.9741141200065613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17763611674308777, + "step": 27664 + }, + { + "epoch": 0.55332, + "grad_norm": 2.09375, + "grad_norm_var": 0.05176493326822917, + "learning_rate": 0.0001, + "loss": 3.9846, + "loss/crossentropy": 2.123018801212311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21710921823978424, + "step": 27666 + }, + { + "epoch": 0.55336, + "grad_norm": 1.8828125, + "grad_norm_var": 0.05128758748372396, + "learning_rate": 0.0001, + "loss": 3.7929, + "loss/crossentropy": 2.043752074241638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1959235817193985, + "step": 27668 + }, + { + "epoch": 0.5534, + "grad_norm": 1.984375, + "grad_norm_var": 0.04549560546875, + "learning_rate": 0.0001, + "loss": 4.0634, + "loss/crossentropy": 1.9128360748291016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18556588888168335, + "step": 27670 + }, + { + "epoch": 0.55344, + "grad_norm": 2.046875, + "grad_norm_var": 0.008585611979166666, + "learning_rate": 0.0001, + "loss": 4.2678, + "loss/crossentropy": 2.1376100182533264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1985430046916008, + "step": 27672 + }, + { + "epoch": 0.55348, + "grad_norm": 2.03125, + "grad_norm_var": 0.0088775634765625, + "learning_rate": 0.0001, + "loss": 4.0457, + "loss/crossentropy": 2.106353759765625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20115973055362701, + "step": 27674 + }, + { + "epoch": 0.55352, + "grad_norm": 2.015625, + "grad_norm_var": 0.008316802978515624, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 2.5206472873687744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21297650784254074, + "step": 27676 + }, + { + "epoch": 0.55356, + "grad_norm": 1.875, + "grad_norm_var": 0.008229319254557292, + "learning_rate": 0.0001, + "loss": 3.9164, + "loss/crossentropy": 1.8287059664726257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16045409440994263, + "step": 27678 + }, + { + "epoch": 0.5536, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007610829671223959, + "learning_rate": 0.0001, + "loss": 3.9755, + "loss/crossentropy": 2.103208303451538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20481672883033752, + "step": 27680 + }, + { + "epoch": 0.55364, + "grad_norm": 1.984375, + "grad_norm_var": 0.006689453125, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 1.6746403574943542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16627443581819534, + "step": 27682 + }, + { + "epoch": 0.55368, + "grad_norm": 2.03125, + "grad_norm_var": 0.005954742431640625, + "learning_rate": 0.0001, + "loss": 3.9342, + "loss/crossentropy": 1.9239201545715332, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19345758855342865, + "step": 27684 + }, + { + "epoch": 0.55372, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006009674072265625, + "learning_rate": 0.0001, + "loss": 3.9512, + "loss/crossentropy": 2.052439272403717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18695925921201706, + "step": 27686 + }, + { + "epoch": 0.55376, + "grad_norm": 1.984375, + "grad_norm_var": 0.005476633707682292, + "learning_rate": 0.0001, + "loss": 4.3674, + "loss/crossentropy": 2.4474085569381714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21558378636837006, + "step": 27688 + }, + { + "epoch": 0.5538, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006062825520833333, + "learning_rate": 0.0001, + "loss": 3.6432, + "loss/crossentropy": 1.8521843552589417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16763968020677567, + "step": 27690 + }, + { + "epoch": 0.55384, + "grad_norm": 1.875, + "grad_norm_var": 0.0060455322265625, + "learning_rate": 0.0001, + "loss": 3.9086, + "loss/crossentropy": 1.888016939163208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.185853011906147, + "step": 27692 + }, + { + "epoch": 0.55388, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005830637613932292, + "learning_rate": 0.0001, + "loss": 3.8378, + "loss/crossentropy": 1.7538975477218628, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854776367545128, + "step": 27694 + }, + { + "epoch": 0.55392, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005936431884765625, + "learning_rate": 0.0001, + "loss": 4.1631, + "loss/crossentropy": 2.259602427482605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21480850875377655, + "step": 27696 + }, + { + "epoch": 0.55396, + "grad_norm": 1.875, + "grad_norm_var": 0.006860097249348958, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 2.0374972820281982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20546026527881622, + "step": 27698 + }, + { + "epoch": 0.554, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006392415364583333, + "learning_rate": 0.0001, + "loss": 4.1568, + "loss/crossentropy": 2.15248441696167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19681835174560547, + "step": 27700 + }, + { + "epoch": 0.55404, + "grad_norm": 1.78125, + "grad_norm_var": 0.0070798238118489586, + "learning_rate": 0.0001, + "loss": 3.7793, + "loss/crossentropy": 1.8983227610588074, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18331478536128998, + "step": 27702 + }, + { + "epoch": 0.55408, + "grad_norm": 1.765625, + "grad_norm_var": 0.00849609375, + "learning_rate": 0.0001, + "loss": 3.6701, + "loss/crossentropy": 1.7995752692222595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17722319066524506, + "step": 27704 + }, + { + "epoch": 0.55412, + "grad_norm": 1.921875, + "grad_norm_var": 0.00760498046875, + "learning_rate": 0.0001, + "loss": 4.0987, + "loss/crossentropy": 1.89347642660141, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17802709341049194, + "step": 27706 + }, + { + "epoch": 0.55416, + "grad_norm": 1.890625, + "grad_norm_var": 0.008343251546223958, + "learning_rate": 0.0001, + "loss": 3.8648, + "loss/crossentropy": 2.024496912956238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18508757650852203, + "step": 27708 + }, + { + "epoch": 0.5542, + "grad_norm": 1.7265625, + "grad_norm_var": 0.010731760660807292, + "learning_rate": 0.0001, + "loss": 3.7977, + "loss/crossentropy": 2.0142337679862976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18837066739797592, + "step": 27710 + }, + { + "epoch": 0.55424, + "grad_norm": 1.96875, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 3.9401, + "loss/crossentropy": 1.75287264585495, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1797751560807228, + "step": 27712 + }, + { + "epoch": 0.55428, + "grad_norm": 1.984375, + "grad_norm_var": 0.007861073811848958, + "learning_rate": 0.0001, + "loss": 4.2331, + "loss/crossentropy": 2.204139769077301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17446620762348175, + "step": 27714 + }, + { + "epoch": 0.55432, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009317779541015625, + "learning_rate": 0.0001, + "loss": 4.1056, + "loss/crossentropy": 2.0878941416740417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19087567925453186, + "step": 27716 + }, + { + "epoch": 0.55436, + "grad_norm": 2.0625, + "grad_norm_var": 0.0128570556640625, + "learning_rate": 0.0001, + "loss": 4.2309, + "loss/crossentropy": 2.2289849519729614, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21995704621076584, + "step": 27718 + }, + { + "epoch": 0.5544, + "grad_norm": 1.84375, + "grad_norm_var": 0.013140614827473958, + "learning_rate": 0.0001, + "loss": 4.1908, + "loss/crossentropy": 2.2673051357269287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668636471033096, + "step": 27720 + }, + { + "epoch": 0.55444, + "grad_norm": 2.109375, + "grad_norm_var": 0.0144439697265625, + "learning_rate": 0.0001, + "loss": 3.9146, + "loss/crossentropy": 1.7722193598747253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17487196624279022, + "step": 27722 + }, + { + "epoch": 0.55448, + "grad_norm": 2.03125, + "grad_norm_var": 0.0147613525390625, + "learning_rate": 0.0001, + "loss": 4.2427, + "loss/crossentropy": 1.8388479351997375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20283987373113632, + "step": 27724 + }, + { + "epoch": 0.55452, + "grad_norm": 1.9765625, + "grad_norm_var": 0.010334269205729166, + "learning_rate": 0.0001, + "loss": 4.1243, + "loss/crossentropy": 2.2758948802948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20532135665416718, + "step": 27726 + }, + { + "epoch": 0.55456, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009037017822265625, + "learning_rate": 0.0001, + "loss": 4.0048, + "loss/crossentropy": 2.272206664085388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20994101464748383, + "step": 27728 + }, + { + "epoch": 0.5546, + "grad_norm": 1.84375, + "grad_norm_var": 0.011824289957682291, + "learning_rate": 0.0001, + "loss": 4.1122, + "loss/crossentropy": 2.0169429779052734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21998245269060135, + "step": 27730 + }, + { + "epoch": 0.55464, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011668904622395834, + "learning_rate": 0.0001, + "loss": 4.0199, + "loss/crossentropy": 2.0700154900550842, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20395687222480774, + "step": 27732 + }, + { + "epoch": 0.55468, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010237630208333333, + "learning_rate": 0.0001, + "loss": 4.2295, + "loss/crossentropy": 2.2826544046401978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1890336349606514, + "step": 27734 + }, + { + "epoch": 0.55472, + "grad_norm": 1.765625, + "grad_norm_var": 0.010503896077473958, + "learning_rate": 0.0001, + "loss": 3.8089, + "loss/crossentropy": 2.2162158489227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044035568833351, + "step": 27736 + }, + { + "epoch": 0.55476, + "grad_norm": 2.0, + "grad_norm_var": 0.009894816080729167, + "learning_rate": 0.0001, + "loss": 4.2302, + "loss/crossentropy": 1.8466681838035583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18051490187644958, + "step": 27738 + }, + { + "epoch": 0.5548, + "grad_norm": 1.8046875, + "grad_norm_var": 0.010469309488932292, + "learning_rate": 0.0001, + "loss": 3.8307, + "loss/crossentropy": 2.19679594039917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19721779227256775, + "step": 27740 + }, + { + "epoch": 0.55484, + "grad_norm": 1.953125, + "grad_norm_var": 0.010109202067057291, + "learning_rate": 0.0001, + "loss": 4.1634, + "loss/crossentropy": 2.0091158151626587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19828540831804276, + "step": 27742 + }, + { + "epoch": 0.55488, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010261027018229167, + "learning_rate": 0.0001, + "loss": 4.1095, + "loss/crossentropy": 1.9434176087379456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168793946504593, + "step": 27744 + }, + { + "epoch": 0.55492, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0070302327473958336, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 2.365121006965637, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20621124655008316, + "step": 27746 + }, + { + "epoch": 0.55496, + "grad_norm": 2.203125, + "grad_norm_var": 0.011714426676432292, + "learning_rate": 0.0001, + "loss": 4.1344, + "loss/crossentropy": 2.4154850244522095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21647262573242188, + "step": 27748 + }, + { + "epoch": 0.555, + "grad_norm": 1.9921875, + "grad_norm_var": 0.01280517578125, + "learning_rate": 0.0001, + "loss": 3.8326, + "loss/crossentropy": 1.9805672764778137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818150281906128, + "step": 27750 + }, + { + "epoch": 0.55504, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011188761393229166, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 2.1163841485977173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20045197010040283, + "step": 27752 + }, + { + "epoch": 0.55508, + "grad_norm": 2.015625, + "grad_norm_var": 0.010008748372395833, + "learning_rate": 0.0001, + "loss": 3.8754, + "loss/crossentropy": 2.3555898666381836, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19626620411872864, + "step": 27754 + }, + { + "epoch": 0.55512, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007883453369140625, + "learning_rate": 0.0001, + "loss": 3.8355, + "loss/crossentropy": 1.8587325811386108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18491162359714508, + "step": 27756 + }, + { + "epoch": 0.55516, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009138743082682291, + "learning_rate": 0.0001, + "loss": 3.9636, + "loss/crossentropy": 1.9335539937019348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18707312643527985, + "step": 27758 + }, + { + "epoch": 0.5552, + "grad_norm": 1.796875, + "grad_norm_var": 0.010953776041666667, + "learning_rate": 0.0001, + "loss": 4.0691, + "loss/crossentropy": 1.9938113689422607, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1832147091627121, + "step": 27760 + }, + { + "epoch": 0.55524, + "grad_norm": 2.03125, + "grad_norm_var": 0.010106404622395834, + "learning_rate": 0.0001, + "loss": 4.1229, + "loss/crossentropy": 1.796451210975647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16813044995069504, + "step": 27762 + }, + { + "epoch": 0.55528, + "grad_norm": 2.078125, + "grad_norm_var": 0.0072672526041666664, + "learning_rate": 0.0001, + "loss": 3.9291, + "loss/crossentropy": 2.2282769680023193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20037836581468582, + "step": 27764 + }, + { + "epoch": 0.55532, + "grad_norm": 1.984375, + "grad_norm_var": 0.007088216145833334, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 2.3429712057113647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21040766686201096, + "step": 27766 + }, + { + "epoch": 0.55536, + "grad_norm": 2.03125, + "grad_norm_var": 0.006932576497395833, + "learning_rate": 0.0001, + "loss": 4.0902, + "loss/crossentropy": 2.24351704120636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19055639952421188, + "step": 27768 + }, + { + "epoch": 0.5554, + "grad_norm": 2.0, + "grad_norm_var": 0.007120768229166667, + "learning_rate": 0.0001, + "loss": 4.1444, + "loss/crossentropy": 2.190573811531067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991223245859146, + "step": 27770 + }, + { + "epoch": 0.55544, + "grad_norm": 1.84375, + "grad_norm_var": 0.008925120035807291, + "learning_rate": 0.0001, + "loss": 3.9567, + "loss/crossentropy": 1.9847629070281982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17740985751152039, + "step": 27772 + }, + { + "epoch": 0.55548, + "grad_norm": 1.828125, + "grad_norm_var": 0.008565012613932292, + "learning_rate": 0.0001, + "loss": 3.8379, + "loss/crossentropy": 2.0242174863815308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18693319708108902, + "step": 27774 + }, + { + "epoch": 0.55552, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008141835530598959, + "learning_rate": 0.0001, + "loss": 3.8359, + "loss/crossentropy": 1.7930986881256104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17298821359872818, + "step": 27776 + }, + { + "epoch": 0.55556, + "grad_norm": 1.7578125, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 3.7851, + "loss/crossentropy": 1.967544972896576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1983192041516304, + "step": 27778 + }, + { + "epoch": 0.5556, + "grad_norm": 2.03125, + "grad_norm_var": 0.008280436197916666, + "learning_rate": 0.0001, + "loss": 4.2846, + "loss/crossentropy": 2.353300929069519, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2090948149561882, + "step": 27780 + }, + { + "epoch": 0.55564, + "grad_norm": 2.234375, + "grad_norm_var": 0.013826243082682292, + "learning_rate": 0.0001, + "loss": 4.0386, + "loss/crossentropy": 2.1349263191223145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19604195654392242, + "step": 27782 + }, + { + "epoch": 0.55568, + "grad_norm": 1.84375, + "grad_norm_var": 0.013890584309895834, + "learning_rate": 0.0001, + "loss": 3.9092, + "loss/crossentropy": 2.1047816276550293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20156846940517426, + "step": 27784 + }, + { + "epoch": 0.55572, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013213857014973959, + "learning_rate": 0.0001, + "loss": 3.8338, + "loss/crossentropy": 1.9704868793487549, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18116163462400436, + "step": 27786 + }, + { + "epoch": 0.55576, + "grad_norm": 2.015625, + "grad_norm_var": 0.013628896077473958, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 1.906118392944336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18277255445718765, + "step": 27788 + }, + { + "epoch": 0.5558, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013449859619140626, + "learning_rate": 0.0001, + "loss": 4.2688, + "loss/crossentropy": 2.0571773648262024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843668520450592, + "step": 27790 + }, + { + "epoch": 0.55584, + "grad_norm": 1.8046875, + "grad_norm_var": 0.01390380859375, + "learning_rate": 0.0001, + "loss": 3.8647, + "loss/crossentropy": 1.9324238300323486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16451232880353928, + "step": 27792 + }, + { + "epoch": 0.55588, + "grad_norm": 1.8203125, + "grad_norm_var": 0.012589263916015624, + "learning_rate": 0.0001, + "loss": 4.1593, + "loss/crossentropy": 2.27968692779541, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21742059290409088, + "step": 27794 + }, + { + "epoch": 0.55592, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012804921468098958, + "learning_rate": 0.0001, + "loss": 4.3922, + "loss/crossentropy": 2.20041286945343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22774580121040344, + "step": 27796 + }, + { + "epoch": 0.55596, + "grad_norm": 1.859375, + "grad_norm_var": 0.006239573160807292, + "learning_rate": 0.0001, + "loss": 3.9747, + "loss/crossentropy": 2.258635997772217, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18686214089393616, + "step": 27798 + }, + { + "epoch": 0.556, + "grad_norm": 1.875, + "grad_norm_var": 0.006217193603515625, + "learning_rate": 0.0001, + "loss": 3.8514, + "loss/crossentropy": 1.8728525638580322, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1724954918026924, + "step": 27800 + }, + { + "epoch": 0.55604, + "grad_norm": 2.09375, + "grad_norm_var": 0.009789021809895833, + "learning_rate": 0.0001, + "loss": 4.3107, + "loss/crossentropy": 1.7146453261375427, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18469443172216415, + "step": 27802 + }, + { + "epoch": 0.55608, + "grad_norm": 1.90625, + "grad_norm_var": 0.009015909830729167, + "learning_rate": 0.0001, + "loss": 3.9967, + "loss/crossentropy": 2.194948673248291, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963738277554512, + "step": 27804 + }, + { + "epoch": 0.55612, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0107177734375, + "learning_rate": 0.0001, + "loss": 3.7152, + "loss/crossentropy": 1.7860903143882751, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16485057771205902, + "step": 27806 + }, + { + "epoch": 0.55616, + "grad_norm": 2.0625, + "grad_norm_var": 0.011246490478515624, + "learning_rate": 0.0001, + "loss": 4.0281, + "loss/crossentropy": 2.1505147218704224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2449631690979004, + "step": 27808 + }, + { + "epoch": 0.5562, + "grad_norm": 2.015625, + "grad_norm_var": 0.011139933268229167, + "learning_rate": 0.0001, + "loss": 4.1753, + "loss/crossentropy": 2.2144479751586914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19644592702388763, + "step": 27810 + }, + { + "epoch": 0.55624, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010796864827473959, + "learning_rate": 0.0001, + "loss": 4.1869, + "loss/crossentropy": 2.2427414655685425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20603332668542862, + "step": 27812 + }, + { + "epoch": 0.55628, + "grad_norm": 1.6875, + "grad_norm_var": 0.013533528645833333, + "learning_rate": 0.0001, + "loss": 3.8191, + "loss/crossentropy": 1.9676510691642761, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16527333855628967, + "step": 27814 + }, + { + "epoch": 0.55632, + "grad_norm": 1.84375, + "grad_norm_var": 0.014025624593098958, + "learning_rate": 0.0001, + "loss": 3.823, + "loss/crossentropy": 1.9767807722091675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1865728572010994, + "step": 27816 + }, + { + "epoch": 0.55636, + "grad_norm": 1.9140625, + "grad_norm_var": 0.008782704671223959, + "learning_rate": 0.0001, + "loss": 3.7126, + "loss/crossentropy": 1.9406518936157227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16727876663208008, + "step": 27818 + }, + { + "epoch": 0.5564, + "grad_norm": 2.046875, + "grad_norm_var": 0.010550689697265626, + "learning_rate": 0.0001, + "loss": 3.9657, + "loss/crossentropy": 2.0927204489707947, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055317759513855, + "step": 27820 + }, + { + "epoch": 0.55644, + "grad_norm": 2.015625, + "grad_norm_var": 0.0100006103515625, + "learning_rate": 0.0001, + "loss": 3.825, + "loss/crossentropy": 1.9616308212280273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1885625645518303, + "step": 27822 + }, + { + "epoch": 0.55648, + "grad_norm": 1.828125, + "grad_norm_var": 0.008890533447265625, + "learning_rate": 0.0001, + "loss": 3.7727, + "loss/crossentropy": 1.7788268327713013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1750224232673645, + "step": 27824 + }, + { + "epoch": 0.55652, + "grad_norm": 1.96875, + "grad_norm_var": 0.009391021728515626, + "learning_rate": 0.0001, + "loss": 3.8066, + "loss/crossentropy": 2.006068170070648, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18516544997692108, + "step": 27826 + }, + { + "epoch": 0.55656, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012995402018229166, + "learning_rate": 0.0001, + "loss": 3.8937, + "loss/crossentropy": 2.046180784702301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17838794738054276, + "step": 27828 + }, + { + "epoch": 0.5566, + "grad_norm": 1.984375, + "grad_norm_var": 0.010526275634765625, + "learning_rate": 0.0001, + "loss": 3.8813, + "loss/crossentropy": 1.9033851027488708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18221035599708557, + "step": 27830 + }, + { + "epoch": 0.55664, + "grad_norm": 1.84375, + "grad_norm_var": 0.009415435791015624, + "learning_rate": 0.0001, + "loss": 4.128, + "loss/crossentropy": 2.1586283445358276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20454445481300354, + "step": 27832 + }, + { + "epoch": 0.55668, + "grad_norm": 1.890625, + "grad_norm_var": 0.009645334879557292, + "learning_rate": 0.0001, + "loss": 4.0045, + "loss/crossentropy": 2.383496642112732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19787979125976562, + "step": 27834 + }, + { + "epoch": 0.55672, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009150950113932292, + "learning_rate": 0.0001, + "loss": 3.9134, + "loss/crossentropy": 2.052733063697815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1916261911392212, + "step": 27836 + }, + { + "epoch": 0.55676, + "grad_norm": 2.046875, + "grad_norm_var": 0.009281412760416666, + "learning_rate": 0.0001, + "loss": 4.2263, + "loss/crossentropy": 2.1427736282348633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837182566523552, + "step": 27838 + }, + { + "epoch": 0.5568, + "grad_norm": 1.8046875, + "grad_norm_var": 0.009651438395182291, + "learning_rate": 0.0001, + "loss": 3.8118, + "loss/crossentropy": 2.186544895172119, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20473113656044006, + "step": 27840 + }, + { + "epoch": 0.55684, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007845052083333333, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 2.012458860874176, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20121898502111435, + "step": 27842 + }, + { + "epoch": 0.55688, + "grad_norm": 2.0, + "grad_norm_var": 0.00859375, + "learning_rate": 0.0001, + "loss": 3.8415, + "loss/crossentropy": 1.9217159748077393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1689399629831314, + "step": 27844 + }, + { + "epoch": 0.55692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007258097330729167, + "learning_rate": 0.0001, + "loss": 3.9051, + "loss/crossentropy": 2.1690521240234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21002651005983353, + "step": 27846 + }, + { + "epoch": 0.55696, + "grad_norm": 1.84375, + "grad_norm_var": 0.006992340087890625, + "learning_rate": 0.0001, + "loss": 3.9794, + "loss/crossentropy": 2.3649327754974365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20836569368839264, + "step": 27848 + }, + { + "epoch": 0.557, + "grad_norm": 2.390625, + "grad_norm_var": 0.021735636393229167, + "learning_rate": 0.0001, + "loss": 4.1079, + "loss/crossentropy": 2.118848145008087, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2462398260831833, + "step": 27850 + }, + { + "epoch": 0.55704, + "grad_norm": 1.8671875, + "grad_norm_var": 0.022043609619140626, + "learning_rate": 0.0001, + "loss": 4.1345, + "loss/crossentropy": 1.9461410641670227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836184561252594, + "step": 27852 + }, + { + "epoch": 0.55708, + "grad_norm": 1.8671875, + "grad_norm_var": 0.021817779541015624, + "learning_rate": 0.0001, + "loss": 4.0911, + "loss/crossentropy": 2.225842833518982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19738706201314926, + "step": 27854 + }, + { + "epoch": 0.55712, + "grad_norm": 1.9765625, + "grad_norm_var": 0.020702870686848958, + "learning_rate": 0.0001, + "loss": 4.2437, + "loss/crossentropy": 2.379333972930908, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21520506590604782, + "step": 27856 + }, + { + "epoch": 0.55716, + "grad_norm": 1.984375, + "grad_norm_var": 0.020957183837890626, + "learning_rate": 0.0001, + "loss": 3.94, + "loss/crossentropy": 1.789103090763092, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16888972371816635, + "step": 27858 + }, + { + "epoch": 0.5572, + "grad_norm": 2.078125, + "grad_norm_var": 0.017464192708333333, + "learning_rate": 0.0001, + "loss": 4.1865, + "loss/crossentropy": 2.1485098600387573, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049245834350586, + "step": 27860 + }, + { + "epoch": 0.55724, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017185211181640625, + "learning_rate": 0.0001, + "loss": 4.1494, + "loss/crossentropy": 1.8173152804374695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17991861701011658, + "step": 27862 + }, + { + "epoch": 0.55728, + "grad_norm": 1.9765625, + "grad_norm_var": 0.016206868489583335, + "learning_rate": 0.0001, + "loss": 4.0881, + "loss/crossentropy": 2.1297428011894226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19571492820978165, + "step": 27864 + }, + { + "epoch": 0.55732, + "grad_norm": 2.109375, + "grad_norm_var": 0.007212066650390625, + "learning_rate": 0.0001, + "loss": 3.9824, + "loss/crossentropy": 2.038254678249359, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1918657273054123, + "step": 27866 + }, + { + "epoch": 0.55736, + "grad_norm": 2.015625, + "grad_norm_var": 0.006790924072265625, + "learning_rate": 0.0001, + "loss": 4.2425, + "loss/crossentropy": 2.1835550665855408, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19663546979427338, + "step": 27868 + }, + { + "epoch": 0.5574, + "grad_norm": 1.96875, + "grad_norm_var": 0.006021881103515625, + "learning_rate": 0.0001, + "loss": 4.097, + "loss/crossentropy": 2.0005786418914795, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20361479371786118, + "step": 27870 + }, + { + "epoch": 0.55744, + "grad_norm": 1.8828125, + "grad_norm_var": 0.006640625, + "learning_rate": 0.0001, + "loss": 4.036, + "loss/crossentropy": 1.9897403717041016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18811679631471634, + "step": 27872 + }, + { + "epoch": 0.55748, + "grad_norm": 2.625, + "grad_norm_var": 0.035042063395182295, + "learning_rate": 0.0001, + "loss": 3.8279, + "loss/crossentropy": 1.8575212359428406, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18323712795972824, + "step": 27874 + }, + { + "epoch": 0.55752, + "grad_norm": 2.296875, + "grad_norm_var": 0.040537261962890626, + "learning_rate": 0.0001, + "loss": 3.8482, + "loss/crossentropy": 2.1971237659454346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20962104946374893, + "step": 27876 + }, + { + "epoch": 0.55756, + "grad_norm": 1.9609375, + "grad_norm_var": 0.04003473917643229, + "learning_rate": 0.0001, + "loss": 4.0675, + "loss/crossentropy": 2.1657967567443848, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19974523782730103, + "step": 27878 + }, + { + "epoch": 0.5576, + "grad_norm": 2.0625, + "grad_norm_var": 0.04272028605143229, + "learning_rate": 0.0001, + "loss": 3.9154, + "loss/crossentropy": 1.988546371459961, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2359297126531601, + "step": 27880 + }, + { + "epoch": 0.55764, + "grad_norm": 2.0, + "grad_norm_var": 0.0413330078125, + "learning_rate": 0.0001, + "loss": 4.3125, + "loss/crossentropy": 1.979064404964447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1763293370604515, + "step": 27882 + }, + { + "epoch": 0.55768, + "grad_norm": 2.0, + "grad_norm_var": 0.04175516764322917, + "learning_rate": 0.0001, + "loss": 4.307, + "loss/crossentropy": 2.3738937377929688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21125850826501846, + "step": 27884 + }, + { + "epoch": 0.55772, + "grad_norm": 1.8671875, + "grad_norm_var": 0.04346110026041667, + "learning_rate": 0.0001, + "loss": 4.0331, + "loss/crossentropy": 1.9243283867835999, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22126196324825287, + "step": 27886 + }, + { + "epoch": 0.55776, + "grad_norm": 1.953125, + "grad_norm_var": 0.0432373046875, + "learning_rate": 0.0001, + "loss": 3.9017, + "loss/crossentropy": 2.0033947229385376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17997342348098755, + "step": 27888 + }, + { + "epoch": 0.5578, + "grad_norm": 1.9375, + "grad_norm_var": 0.015396881103515624, + "learning_rate": 0.0001, + "loss": 4.0113, + "loss/crossentropy": 2.1850863695144653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20201555639505386, + "step": 27890 + }, + { + "epoch": 0.55784, + "grad_norm": 1.796875, + "grad_norm_var": 0.008487955729166666, + "learning_rate": 0.0001, + "loss": 3.8585, + "loss/crossentropy": 1.9696857333183289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20272113382816315, + "step": 27892 + }, + { + "epoch": 0.55788, + "grad_norm": 1.921875, + "grad_norm_var": 0.009234364827473958, + "learning_rate": 0.0001, + "loss": 4.2124, + "loss/crossentropy": 2.014931857585907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2670851871371269, + "step": 27894 + }, + { + "epoch": 0.55792, + "grad_norm": 2.8125, + "grad_norm_var": 0.05494359334309896, + "learning_rate": 0.0001, + "loss": 3.91, + "loss/crossentropy": 1.7789946794509888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16623269766569138, + "step": 27896 + }, + { + "epoch": 0.55796, + "grad_norm": 1.7734375, + "grad_norm_var": 0.05752766927083333, + "learning_rate": 0.0001, + "loss": 3.9915, + "loss/crossentropy": 1.9287369847297668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17834855616092682, + "step": 27898 + }, + { + "epoch": 0.558, + "grad_norm": 1.921875, + "grad_norm_var": 0.0579254150390625, + "learning_rate": 0.0001, + "loss": 4.0939, + "loss/crossentropy": 2.2972252368927, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19952362030744553, + "step": 27900 + }, + { + "epoch": 0.55804, + "grad_norm": 2.015625, + "grad_norm_var": 0.05802993774414063, + "learning_rate": 0.0001, + "loss": 4.0678, + "loss/crossentropy": 2.307905077934265, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21856702864170074, + "step": 27902 + }, + { + "epoch": 0.55808, + "grad_norm": 2.203125, + "grad_norm_var": 0.06280008951822917, + "learning_rate": 0.0001, + "loss": 3.9421, + "loss/crossentropy": 2.0781975984573364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20248684287071228, + "step": 27904 + }, + { + "epoch": 0.55812, + "grad_norm": 2.125, + "grad_norm_var": 0.06393229166666667, + "learning_rate": 0.0001, + "loss": 4.0949, + "loss/crossentropy": 2.2437129616737366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20828180015087128, + "step": 27906 + }, + { + "epoch": 0.55816, + "grad_norm": 1.8671875, + "grad_norm_var": 0.06414286295572917, + "learning_rate": 0.0001, + "loss": 3.6518, + "loss/crossentropy": 1.8736275434494019, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1924392208456993, + "step": 27908 + }, + { + "epoch": 0.5582, + "grad_norm": 1.9296875, + "grad_norm_var": 0.06412734985351562, + "learning_rate": 0.0001, + "loss": 3.874, + "loss/crossentropy": 1.7928629517555237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1721597984433174, + "step": 27910 + }, + { + "epoch": 0.55824, + "grad_norm": 1.8125, + "grad_norm_var": 0.0142486572265625, + "learning_rate": 0.0001, + "loss": 3.9806, + "loss/crossentropy": 1.8429313898086548, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18428131937980652, + "step": 27912 + }, + { + "epoch": 0.55828, + "grad_norm": 2.015625, + "grad_norm_var": 0.015417226155598958, + "learning_rate": 0.0001, + "loss": 3.8051, + "loss/crossentropy": 1.7473148107528687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703418493270874, + "step": 27914 + }, + { + "epoch": 0.55832, + "grad_norm": 2.0, + "grad_norm_var": 0.015712229410807292, + "learning_rate": 0.0001, + "loss": 4.1174, + "loss/crossentropy": 1.9749281406402588, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19364860653877258, + "step": 27916 + }, + { + "epoch": 0.55836, + "grad_norm": 1.84375, + "grad_norm_var": 0.014564768473307291, + "learning_rate": 0.0001, + "loss": 3.9925, + "loss/crossentropy": 2.0153204202651978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18824293464422226, + "step": 27918 + }, + { + "epoch": 0.5584, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008925120035807291, + "learning_rate": 0.0001, + "loss": 3.9048, + "loss/crossentropy": 1.984444797039032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17791762948036194, + "step": 27920 + }, + { + "epoch": 0.55844, + "grad_norm": 2.09375, + "grad_norm_var": 0.007846832275390625, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 2.1440568566322327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.277907058596611, + "step": 27922 + }, + { + "epoch": 0.55848, + "grad_norm": 1.9140625, + "grad_norm_var": 0.01706517537434896, + "learning_rate": 0.0001, + "loss": 4.1565, + "loss/crossentropy": 2.261883854866028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21542102098464966, + "step": 27924 + }, + { + "epoch": 0.55852, + "grad_norm": 1.9609375, + "grad_norm_var": 0.017647298177083333, + "learning_rate": 0.0001, + "loss": 4.1513, + "loss/crossentropy": 2.0659791827201843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17906638234853745, + "step": 27926 + }, + { + "epoch": 0.55856, + "grad_norm": 1.921875, + "grad_norm_var": 0.016893513997395835, + "learning_rate": 0.0001, + "loss": 4.1226, + "loss/crossentropy": 2.536762237548828, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20740076154470444, + "step": 27928 + }, + { + "epoch": 0.5586, + "grad_norm": 1.9765625, + "grad_norm_var": 0.014314778645833333, + "learning_rate": 0.0001, + "loss": 3.741, + "loss/crossentropy": 1.8174605965614319, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17254772782325745, + "step": 27930 + }, + { + "epoch": 0.55864, + "grad_norm": 2.015625, + "grad_norm_var": 0.015575154622395834, + "learning_rate": 0.0001, + "loss": 3.9703, + "loss/crossentropy": 2.2474478483200073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22224698960781097, + "step": 27932 + }, + { + "epoch": 0.55868, + "grad_norm": 2.125, + "grad_norm_var": 0.016844685872395834, + "learning_rate": 0.0001, + "loss": 4.0159, + "loss/crossentropy": 1.9788220524787903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19506333768367767, + "step": 27934 + }, + { + "epoch": 0.55872, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016670735677083333, + "learning_rate": 0.0001, + "loss": 4.0164, + "loss/crossentropy": 1.9576459527015686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17929108440876007, + "step": 27936 + }, + { + "epoch": 0.55876, + "grad_norm": 2.09375, + "grad_norm_var": 0.015987141927083334, + "learning_rate": 0.0001, + "loss": 4.2099, + "loss/crossentropy": 2.2682799100875854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22949229925870895, + "step": 27938 + }, + { + "epoch": 0.5588, + "grad_norm": 1.9453125, + "grad_norm_var": 0.009235636393229166, + "learning_rate": 0.0001, + "loss": 3.9841, + "loss/crossentropy": 2.0454965829849243, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19698018580675125, + "step": 27940 + }, + { + "epoch": 0.55884, + "grad_norm": 1.7734375, + "grad_norm_var": 0.010729726155598958, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.3304296731948853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22486522793769836, + "step": 27942 + }, + { + "epoch": 0.55888, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010538736979166666, + "learning_rate": 0.0001, + "loss": 3.787, + "loss/crossentropy": 1.7931689620018005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16503961384296417, + "step": 27944 + }, + { + "epoch": 0.55892, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011124674479166667, + "learning_rate": 0.0001, + "loss": 3.7788, + "loss/crossentropy": 1.8899441957473755, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18204578012228012, + "step": 27946 + }, + { + "epoch": 0.55896, + "grad_norm": 2.0625, + "grad_norm_var": 0.0106109619140625, + "learning_rate": 0.0001, + "loss": 3.9515, + "loss/crossentropy": 1.9887003302574158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19222819805145264, + "step": 27948 + }, + { + "epoch": 0.559, + "grad_norm": 1.875, + "grad_norm_var": 0.0088531494140625, + "learning_rate": 0.0001, + "loss": 3.9696, + "loss/crossentropy": 2.097675323486328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877489537000656, + "step": 27950 + }, + { + "epoch": 0.55904, + "grad_norm": 1.8359375, + "grad_norm_var": 0.009248860677083333, + "learning_rate": 0.0001, + "loss": 3.9556, + "loss/crossentropy": 1.801637887954712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15989546477794647, + "step": 27952 + }, + { + "epoch": 0.55908, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008756256103515625, + "learning_rate": 0.0001, + "loss": 4.141, + "loss/crossentropy": 2.1653464436531067, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17995328456163406, + "step": 27954 + }, + { + "epoch": 0.55912, + "grad_norm": 1.796875, + "grad_norm_var": 0.009506988525390624, + "learning_rate": 0.0001, + "loss": 3.8267, + "loss/crossentropy": 2.370779275894165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22155411541461945, + "step": 27956 + }, + { + "epoch": 0.55916, + "grad_norm": 1.875, + "grad_norm_var": 0.008055623372395833, + "learning_rate": 0.0001, + "loss": 3.9821, + "loss/crossentropy": 1.9569392204284668, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20606116950511932, + "step": 27958 + }, + { + "epoch": 0.5592, + "grad_norm": 2.046875, + "grad_norm_var": 0.009919230143229167, + "learning_rate": 0.0001, + "loss": 3.9857, + "loss/crossentropy": 1.8667678833007812, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18434254080057144, + "step": 27960 + }, + { + "epoch": 0.55924, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009171549479166667, + "learning_rate": 0.0001, + "loss": 4.0802, + "loss/crossentropy": 2.4322283267974854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21796736866235733, + "step": 27962 + }, + { + "epoch": 0.55928, + "grad_norm": 1.921875, + "grad_norm_var": 0.0070220947265625, + "learning_rate": 0.0001, + "loss": 3.8252, + "loss/crossentropy": 1.7290849685668945, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17558299005031586, + "step": 27964 + }, + { + "epoch": 0.55932, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006430816650390625, + "learning_rate": 0.0001, + "loss": 3.8143, + "loss/crossentropy": 1.9967953562736511, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19310998916625977, + "step": 27966 + }, + { + "epoch": 0.55936, + "grad_norm": 1.953125, + "grad_norm_var": 0.006685129801432292, + "learning_rate": 0.0001, + "loss": 3.9863, + "loss/crossentropy": 2.027643382549286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1989550217986107, + "step": 27968 + }, + { + "epoch": 0.5594, + "grad_norm": 1.7890625, + "grad_norm_var": 0.016312408447265624, + "learning_rate": 0.0001, + "loss": 3.8968, + "loss/crossentropy": 2.057853937149048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117058396339417, + "step": 27970 + }, + { + "epoch": 0.55944, + "grad_norm": 1.9140625, + "grad_norm_var": 0.015122222900390624, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 2.113202214241028, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19248170405626297, + "step": 27972 + }, + { + "epoch": 0.55948, + "grad_norm": 2.015625, + "grad_norm_var": 0.014454905192057292, + "learning_rate": 0.0001, + "loss": 4.0971, + "loss/crossentropy": 2.04659241437912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2064732015132904, + "step": 27974 + }, + { + "epoch": 0.55952, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012943522135416666, + "learning_rate": 0.0001, + "loss": 4.0117, + "loss/crossentropy": 2.046691119670868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19065889716148376, + "step": 27976 + }, + { + "epoch": 0.55956, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014945475260416667, + "learning_rate": 0.0001, + "loss": 3.7466, + "loss/crossentropy": 1.896644115447998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1838160902261734, + "step": 27978 + }, + { + "epoch": 0.5596, + "grad_norm": 2.0, + "grad_norm_var": 0.015411122639973959, + "learning_rate": 0.0001, + "loss": 3.9367, + "loss/crossentropy": 1.9928682446479797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17411044239997864, + "step": 27980 + }, + { + "epoch": 0.55964, + "grad_norm": 1.875, + "grad_norm_var": 0.015843709309895832, + "learning_rate": 0.0001, + "loss": 3.9616, + "loss/crossentropy": 1.9449793100357056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18892540037631989, + "step": 27982 + }, + { + "epoch": 0.55968, + "grad_norm": 2.03125, + "grad_norm_var": 0.0159912109375, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 1.949626863002777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1996856927871704, + "step": 27984 + }, + { + "epoch": 0.55972, + "grad_norm": 1.828125, + "grad_norm_var": 0.0056793212890625, + "learning_rate": 0.0001, + "loss": 4.0761, + "loss/crossentropy": 2.116423010826111, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18394631892442703, + "step": 27986 + }, + { + "epoch": 0.55976, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006320953369140625, + "learning_rate": 0.0001, + "loss": 3.9168, + "loss/crossentropy": 2.151313066482544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1895233690738678, + "step": 27988 + }, + { + "epoch": 0.5598, + "grad_norm": 2.03125, + "grad_norm_var": 0.007739003499348958, + "learning_rate": 0.0001, + "loss": 3.8336, + "loss/crossentropy": 2.1060383319854736, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21665210276842117, + "step": 27990 + }, + { + "epoch": 0.55984, + "grad_norm": 2.0625, + "grad_norm_var": 0.009528605143229167, + "learning_rate": 0.0001, + "loss": 3.9715, + "loss/crossentropy": 2.201907217502594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20985589921474457, + "step": 27992 + }, + { + "epoch": 0.55988, + "grad_norm": 2.0, + "grad_norm_var": 0.0083251953125, + "learning_rate": 0.0001, + "loss": 3.9765, + "loss/crossentropy": 2.0219600796699524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2357076108455658, + "step": 27994 + }, + { + "epoch": 0.55992, + "grad_norm": 1.984375, + "grad_norm_var": 0.008166249593098958, + "learning_rate": 0.0001, + "loss": 4.1518, + "loss/crossentropy": 2.006410300731659, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20221157371997833, + "step": 27996 + }, + { + "epoch": 0.55996, + "grad_norm": 2.0625, + "grad_norm_var": 0.009886678059895833, + "learning_rate": 0.0001, + "loss": 3.842, + "loss/crossentropy": 1.7009567618370056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16861556470394135, + "step": 27998 + }, + { + "epoch": 0.56, + "grad_norm": 1.90625, + "grad_norm_var": 0.010587310791015625, + "learning_rate": 0.0001, + "loss": 3.6914, + "loss/crossentropy": 1.7348475456237793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15609163790941238, + "step": 28000 + }, + { + "epoch": 0.56004, + "grad_norm": 2.0625, + "grad_norm_var": 0.011087799072265625, + "learning_rate": 0.0001, + "loss": 4.0131, + "loss/crossentropy": 2.253269076347351, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21236062049865723, + "step": 28002 + }, + { + "epoch": 0.56008, + "grad_norm": 1.890625, + "grad_norm_var": 0.01077880859375, + "learning_rate": 0.0001, + "loss": 3.6868, + "loss/crossentropy": 1.5210962891578674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15905095636844635, + "step": 28004 + }, + { + "epoch": 0.56012, + "grad_norm": 1.828125, + "grad_norm_var": 0.011017862955729167, + "learning_rate": 0.0001, + "loss": 4.201, + "loss/crossentropy": 2.201940894126892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206693597137928, + "step": 28006 + }, + { + "epoch": 0.56016, + "grad_norm": 2.0, + "grad_norm_var": 0.010367584228515626, + "learning_rate": 0.0001, + "loss": 3.979, + "loss/crossentropy": 2.0980228185653687, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19499729573726654, + "step": 28008 + }, + { + "epoch": 0.5602, + "grad_norm": 1.96875, + "grad_norm_var": 0.011413319905598959, + "learning_rate": 0.0001, + "loss": 4.0716, + "loss/crossentropy": 2.0373584032058716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17089952528476715, + "step": 28010 + }, + { + "epoch": 0.56024, + "grad_norm": 2.09375, + "grad_norm_var": 0.013291168212890624, + "learning_rate": 0.0001, + "loss": 4.3161, + "loss/crossentropy": 2.4253649711608887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2154974490404129, + "step": 28012 + }, + { + "epoch": 0.56028, + "grad_norm": 1.984375, + "grad_norm_var": 0.011405181884765626, + "learning_rate": 0.0001, + "loss": 3.9914, + "loss/crossentropy": 2.121450662612915, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19490933418273926, + "step": 28014 + }, + { + "epoch": 0.56032, + "grad_norm": 1.859375, + "grad_norm_var": 0.013916015625, + "learning_rate": 0.0001, + "loss": 3.9158, + "loss/crossentropy": 1.9926584959030151, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19274179637432098, + "step": 28016 + }, + { + "epoch": 0.56036, + "grad_norm": 2.09375, + "grad_norm_var": 0.015559641520182292, + "learning_rate": 0.0001, + "loss": 3.9918, + "loss/crossentropy": 1.6820538640022278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17553947865962982, + "step": 28018 + }, + { + "epoch": 0.5604, + "grad_norm": 1.921875, + "grad_norm_var": 0.013496653238932291, + "learning_rate": 0.0001, + "loss": 4.2007, + "loss/crossentropy": 2.278828501701355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22723327577114105, + "step": 28020 + }, + { + "epoch": 0.56044, + "grad_norm": 1.7421875, + "grad_norm_var": 0.016682942708333332, + "learning_rate": 0.0001, + "loss": 3.4084, + "loss/crossentropy": 1.860534906387329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17910753190517426, + "step": 28022 + }, + { + "epoch": 0.56048, + "grad_norm": 1.734375, + "grad_norm_var": 0.019945271809895835, + "learning_rate": 0.0001, + "loss": 3.6443, + "loss/crossentropy": 2.094952940940857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19640950858592987, + "step": 28024 + }, + { + "epoch": 0.56052, + "grad_norm": 2.0625, + "grad_norm_var": 0.03429133097330729, + "learning_rate": 0.0001, + "loss": 3.9637, + "loss/crossentropy": 2.2802677154541016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2223518192768097, + "step": 28026 + }, + { + "epoch": 0.56056, + "grad_norm": 2.078125, + "grad_norm_var": 0.03408177693684896, + "learning_rate": 0.0001, + "loss": 4.2298, + "loss/crossentropy": 2.382766842842102, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22707599401474, + "step": 28028 + }, + { + "epoch": 0.5606, + "grad_norm": 1.90625, + "grad_norm_var": 0.033512115478515625, + "learning_rate": 0.0001, + "loss": 3.984, + "loss/crossentropy": 1.861901044845581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17228157818317413, + "step": 28030 + }, + { + "epoch": 0.56064, + "grad_norm": 1.96875, + "grad_norm_var": 0.030454254150390624, + "learning_rate": 0.0001, + "loss": 3.8515, + "loss/crossentropy": 1.8863429427146912, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17535988986492157, + "step": 28032 + }, + { + "epoch": 0.56068, + "grad_norm": 2.015625, + "grad_norm_var": 0.02935765584309896, + "learning_rate": 0.0001, + "loss": 4.3787, + "loss/crossentropy": 2.0987571477890015, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073696255683899, + "step": 28034 + }, + { + "epoch": 0.56072, + "grad_norm": 2.359375, + "grad_norm_var": 0.03867975870768229, + "learning_rate": 0.0001, + "loss": 3.9042, + "loss/crossentropy": 1.6989398002624512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1658734381198883, + "step": 28036 + }, + { + "epoch": 0.56076, + "grad_norm": 2.0625, + "grad_norm_var": 0.030586751302083333, + "learning_rate": 0.0001, + "loss": 3.8771, + "loss/crossentropy": 2.007630228996277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17940524220466614, + "step": 28038 + }, + { + "epoch": 0.5608, + "grad_norm": 1.78125, + "grad_norm_var": 0.028888956705729166, + "learning_rate": 0.0001, + "loss": 3.8596, + "loss/crossentropy": 2.2327693700790405, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22857680916786194, + "step": 28040 + }, + { + "epoch": 0.56084, + "grad_norm": 1.875, + "grad_norm_var": 0.016524251302083334, + "learning_rate": 0.0001, + "loss": 3.9398, + "loss/crossentropy": 2.051284074783325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18832950294017792, + "step": 28042 + }, + { + "epoch": 0.56088, + "grad_norm": 1.8984375, + "grad_norm_var": 0.016627756754557292, + "learning_rate": 0.0001, + "loss": 3.9606, + "loss/crossentropy": 1.9091659784317017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17789962142705917, + "step": 28044 + }, + { + "epoch": 0.56092, + "grad_norm": 2.109375, + "grad_norm_var": 0.05832112630208333, + "learning_rate": 0.0001, + "loss": 3.7684, + "loss/crossentropy": 1.8703528046607971, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24716350436210632, + "step": 28046 + }, + { + "epoch": 0.56096, + "grad_norm": 1.8046875, + "grad_norm_var": 0.060031890869140625, + "learning_rate": 0.0001, + "loss": 3.7816, + "loss/crossentropy": 1.9286105036735535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17276924848556519, + "step": 28048 + }, + { + "epoch": 0.561, + "grad_norm": 2.125, + "grad_norm_var": 0.06181208292643229, + "learning_rate": 0.0001, + "loss": 4.2004, + "loss/crossentropy": 2.115488290786743, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20674347877502441, + "step": 28050 + }, + { + "epoch": 0.56104, + "grad_norm": 1.8359375, + "grad_norm_var": 0.058426920572916666, + "learning_rate": 0.0001, + "loss": 3.6918, + "loss/crossentropy": 2.1754192113876343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19405043870210648, + "step": 28052 + }, + { + "epoch": 0.56108, + "grad_norm": 2.28125, + "grad_norm_var": 0.08551025390625, + "learning_rate": 0.0001, + "loss": 4.2559, + "loss/crossentropy": 2.378780484199524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19838006794452667, + "step": 28054 + }, + { + "epoch": 0.56112, + "grad_norm": 1.8984375, + "grad_norm_var": 0.08651504516601563, + "learning_rate": 0.0001, + "loss": 4.1649, + "loss/crossentropy": 1.8579109907150269, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19068627059459686, + "step": 28056 + }, + { + "epoch": 0.56116, + "grad_norm": 1.9453125, + "grad_norm_var": 0.1222564697265625, + "learning_rate": 0.0001, + "loss": 4.17, + "loss/crossentropy": 2.190861463546753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977207213640213, + "step": 28058 + }, + { + "epoch": 0.5612, + "grad_norm": 1.8671875, + "grad_norm_var": 0.15713297526041667, + "learning_rate": 0.0001, + "loss": 3.8599, + "loss/crossentropy": 1.8596174716949463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18201489746570587, + "step": 28060 + }, + { + "epoch": 0.56124, + "grad_norm": 1.9609375, + "grad_norm_var": 0.1685808817545573, + "learning_rate": 0.0001, + "loss": 4.105, + "loss/crossentropy": 1.8212996125221252, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16964350640773773, + "step": 28062 + }, + { + "epoch": 0.56128, + "grad_norm": 2.1875, + "grad_norm_var": 0.15612691243489582, + "learning_rate": 0.0001, + "loss": 4.0088, + "loss/crossentropy": 2.1002501845359802, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20258799940347672, + "step": 28064 + }, + { + "epoch": 0.56132, + "grad_norm": 2.21875, + "grad_norm_var": 0.18886617024739583, + "learning_rate": 0.0001, + "loss": 4.0179, + "loss/crossentropy": 2.1405937671661377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2100813016295433, + "step": 28066 + }, + { + "epoch": 0.56136, + "grad_norm": 2.90625, + "grad_norm_var": 0.22138264973958333, + "learning_rate": 0.0001, + "loss": 3.7586, + "loss/crossentropy": 2.23690402507782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19980955868959427, + "step": 28068 + }, + { + "epoch": 0.5614, + "grad_norm": 3.015625, + "grad_norm_var": 0.30380452473958336, + "learning_rate": 0.0001, + "loss": 3.5867, + "loss/crossentropy": 1.875457525253296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16666759550571442, + "step": 28070 + }, + { + "epoch": 0.56144, + "grad_norm": 2.78125, + "grad_norm_var": 0.26413141886393227, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 2.1469756960868835, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19378764182329178, + "step": 28072 + }, + { + "epoch": 0.56148, + "grad_norm": 2.09375, + "grad_norm_var": 0.25313212076822916, + "learning_rate": 0.0001, + "loss": 4.0959, + "loss/crossentropy": 2.156083106994629, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20279116183519363, + "step": 28074 + }, + { + "epoch": 0.56152, + "grad_norm": 2.0, + "grad_norm_var": 0.27230631510416664, + "learning_rate": 0.0001, + "loss": 3.9681, + "loss/crossentropy": 1.9321695566177368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1937977820634842, + "step": 28076 + }, + { + "epoch": 0.56156, + "grad_norm": 2.078125, + "grad_norm_var": 0.29947077433268227, + "learning_rate": 0.0001, + "loss": 3.6465, + "loss/crossentropy": 2.005180776119232, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18012206256389618, + "step": 28078 + }, + { + "epoch": 0.5616, + "grad_norm": 1.7578125, + "grad_norm_var": 0.35347391764322916, + "learning_rate": 0.0001, + "loss": 3.7987, + "loss/crossentropy": 2.040120303630829, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17059049755334854, + "step": 28080 + }, + { + "epoch": 0.56164, + "grad_norm": 2.078125, + "grad_norm_var": 0.34440511067708335, + "learning_rate": 0.0001, + "loss": 4.3903, + "loss/crossentropy": 2.5171138048171997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21068025380373, + "step": 28082 + }, + { + "epoch": 0.56168, + "grad_norm": 1.890625, + "grad_norm_var": 0.2691070556640625, + "learning_rate": 0.0001, + "loss": 4.252, + "loss/crossentropy": 2.1827582120895386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1995992213487625, + "step": 28084 + }, + { + "epoch": 0.56172, + "grad_norm": 2.046875, + "grad_norm_var": 0.09042154947916667, + "learning_rate": 0.0001, + "loss": 4.1365, + "loss/crossentropy": 2.0082287192344666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23266522586345673, + "step": 28086 + }, + { + "epoch": 0.56176, + "grad_norm": 1.8046875, + "grad_norm_var": 0.024853261311848958, + "learning_rate": 0.0001, + "loss": 3.8143, + "loss/crossentropy": 1.59010910987854, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15921758115291595, + "step": 28088 + }, + { + "epoch": 0.5618, + "grad_norm": 1.921875, + "grad_norm_var": 0.012132771809895833, + "learning_rate": 0.0001, + "loss": 4.0431, + "loss/crossentropy": 2.3045564889907837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2091711461544037, + "step": 28090 + }, + { + "epoch": 0.56184, + "grad_norm": 1.828125, + "grad_norm_var": 0.0129547119140625, + "learning_rate": 0.0001, + "loss": 3.8729, + "loss/crossentropy": 1.9903671741485596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19884900748729706, + "step": 28092 + }, + { + "epoch": 0.56188, + "grad_norm": 1.7578125, + "grad_norm_var": 0.011970011393229167, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 2.3190979957580566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20256339758634567, + "step": 28094 + }, + { + "epoch": 0.56192, + "grad_norm": 1.890625, + "grad_norm_var": 0.009983062744140625, + "learning_rate": 0.0001, + "loss": 4.0022, + "loss/crossentropy": 1.8695592880249023, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17132485657930374, + "step": 28096 + }, + { + "epoch": 0.56196, + "grad_norm": 1.8203125, + "grad_norm_var": 0.008253733317057291, + "learning_rate": 0.0001, + "loss": 3.7549, + "loss/crossentropy": 1.9170012474060059, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18364953994750977, + "step": 28098 + }, + { + "epoch": 0.562, + "grad_norm": 1.6328125, + "grad_norm_var": 0.011749013264973959, + "learning_rate": 0.0001, + "loss": 3.6666, + "loss/crossentropy": 1.7711463570594788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16884273290634155, + "step": 28100 + }, + { + "epoch": 0.56204, + "grad_norm": 2.0, + "grad_norm_var": 0.0090728759765625, + "learning_rate": 0.0001, + "loss": 4.1265, + "loss/crossentropy": 2.216683030128479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19033723324537277, + "step": 28102 + }, + { + "epoch": 0.56208, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008666737874348959, + "learning_rate": 0.0001, + "loss": 3.7862, + "loss/crossentropy": 2.068448841571808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18903378397226334, + "step": 28104 + }, + { + "epoch": 0.56212, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008378092447916667, + "learning_rate": 0.0001, + "loss": 3.9428, + "loss/crossentropy": 1.9194504618644714, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17733828723430634, + "step": 28106 + }, + { + "epoch": 0.56216, + "grad_norm": 1.84375, + "grad_norm_var": 0.00836181640625, + "learning_rate": 0.0001, + "loss": 3.8966, + "loss/crossentropy": 1.8640123009681702, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18660252541303635, + "step": 28108 + }, + { + "epoch": 0.5622, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006628163655598958, + "learning_rate": 0.0001, + "loss": 3.883, + "loss/crossentropy": 2.084853768348694, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897662729024887, + "step": 28110 + }, + { + "epoch": 0.56224, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006681060791015625, + "learning_rate": 0.0001, + "loss": 3.7912, + "loss/crossentropy": 1.9047400951385498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16695564985275269, + "step": 28112 + }, + { + "epoch": 0.56228, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007616170247395833, + "learning_rate": 0.0001, + "loss": 3.9128, + "loss/crossentropy": 1.9586027264595032, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18182067573070526, + "step": 28114 + }, + { + "epoch": 0.56232, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007448069254557292, + "learning_rate": 0.0001, + "loss": 4.0318, + "loss/crossentropy": 2.0719743371009827, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18775883316993713, + "step": 28116 + }, + { + "epoch": 0.56236, + "grad_norm": 2.03125, + "grad_norm_var": 0.014070638020833333, + "learning_rate": 0.0001, + "loss": 4.5828, + "loss/crossentropy": 2.2210047245025635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.32780784368515015, + "step": 28118 + }, + { + "epoch": 0.5624, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0147125244140625, + "learning_rate": 0.0001, + "loss": 3.9965, + "loss/crossentropy": 2.212351083755493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19030366092920303, + "step": 28120 + }, + { + "epoch": 0.56244, + "grad_norm": 2.0, + "grad_norm_var": 0.014939117431640624, + "learning_rate": 0.0001, + "loss": 4.1715, + "loss/crossentropy": 2.2713009119033813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21552074700593948, + "step": 28122 + }, + { + "epoch": 0.56248, + "grad_norm": 1.90625, + "grad_norm_var": 0.014192708333333333, + "learning_rate": 0.0001, + "loss": 3.8242, + "loss/crossentropy": 1.872189700603485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20009242743253708, + "step": 28124 + }, + { + "epoch": 0.56252, + "grad_norm": 2.0625, + "grad_norm_var": 0.015203603108723958, + "learning_rate": 0.0001, + "loss": 3.9798, + "loss/crossentropy": 2.0979323983192444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19020158797502518, + "step": 28126 + }, + { + "epoch": 0.56256, + "grad_norm": 2.0, + "grad_norm_var": 0.010937245686848958, + "learning_rate": 0.0001, + "loss": 4.1726, + "loss/crossentropy": 1.9983355402946472, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18428711593151093, + "step": 28128 + }, + { + "epoch": 0.5626, + "grad_norm": 2.109375, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.0696, + "loss/crossentropy": 1.784355342388153, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23605111241340637, + "step": 28130 + }, + { + "epoch": 0.56264, + "grad_norm": 1.9609375, + "grad_norm_var": 0.01231689453125, + "learning_rate": 0.0001, + "loss": 3.932, + "loss/crossentropy": 1.719801664352417, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16245415434241295, + "step": 28132 + }, + { + "epoch": 0.56268, + "grad_norm": 1.84375, + "grad_norm_var": 0.00958251953125, + "learning_rate": 0.0001, + "loss": 3.8402, + "loss/crossentropy": 1.8063556551933289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17704912275075912, + "step": 28134 + }, + { + "epoch": 0.56272, + "grad_norm": 1.984375, + "grad_norm_var": 0.007832845052083334, + "learning_rate": 0.0001, + "loss": 4.0469, + "loss/crossentropy": 1.9270595908164978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1939307227730751, + "step": 28136 + }, + { + "epoch": 0.56276, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008060709635416666, + "learning_rate": 0.0001, + "loss": 3.9754, + "loss/crossentropy": 1.9221222400665283, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1753731295466423, + "step": 28138 + }, + { + "epoch": 0.5628, + "grad_norm": 1.765625, + "grad_norm_var": 0.00994873046875, + "learning_rate": 0.0001, + "loss": 3.7678, + "loss/crossentropy": 1.718525469303131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1854087859392166, + "step": 28140 + }, + { + "epoch": 0.56284, + "grad_norm": 2.015625, + "grad_norm_var": 0.009200032552083333, + "learning_rate": 0.0001, + "loss": 4.0674, + "loss/crossentropy": 1.9418652057647705, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19125580787658691, + "step": 28142 + }, + { + "epoch": 0.56288, + "grad_norm": 1.875, + "grad_norm_var": 0.008587392171223958, + "learning_rate": 0.0001, + "loss": 4.1413, + "loss/crossentropy": 2.100765645503998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19208013266324997, + "step": 28144 + }, + { + "epoch": 0.56292, + "grad_norm": 1.8203125, + "grad_norm_var": 0.010849761962890624, + "learning_rate": 0.0001, + "loss": 3.8, + "loss/crossentropy": 1.9104883074760437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662292778491974, + "step": 28146 + }, + { + "epoch": 0.56296, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010092926025390626, + "learning_rate": 0.0001, + "loss": 3.8448, + "loss/crossentropy": 2.059969484806061, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18414444476366043, + "step": 28148 + }, + { + "epoch": 0.563, + "grad_norm": 1.7890625, + "grad_norm_var": 0.010392252604166667, + "learning_rate": 0.0001, + "loss": 3.8721, + "loss/crossentropy": 1.9335008263587952, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18294576555490494, + "step": 28150 + }, + { + "epoch": 0.56304, + "grad_norm": 2.703125, + "grad_norm_var": 0.05110575358072917, + "learning_rate": 0.0001, + "loss": 4.3341, + "loss/crossentropy": 2.1855704188346863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19496884942054749, + "step": 28152 + }, + { + "epoch": 0.56308, + "grad_norm": 2.140625, + "grad_norm_var": 0.058176422119140626, + "learning_rate": 0.0001, + "loss": 4.2066, + "loss/crossentropy": 2.1697583198547363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19855469465255737, + "step": 28154 + }, + { + "epoch": 0.56312, + "grad_norm": 1.765625, + "grad_norm_var": 0.058176422119140626, + "learning_rate": 0.0001, + "loss": 3.833, + "loss/crossentropy": 1.926409661769867, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18937255442142487, + "step": 28156 + }, + { + "epoch": 0.56316, + "grad_norm": 1.9765625, + "grad_norm_var": 0.059456125895182295, + "learning_rate": 0.0001, + "loss": 4.1422, + "loss/crossentropy": 1.900004506111145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.190530426800251, + "step": 28158 + }, + { + "epoch": 0.5632, + "grad_norm": 1.96875, + "grad_norm_var": 0.06049982706705729, + "learning_rate": 0.0001, + "loss": 4.0405, + "loss/crossentropy": 1.8822137117385864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1802232339978218, + "step": 28160 + }, + { + "epoch": 0.56324, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0831207275390625, + "learning_rate": 0.0001, + "loss": 3.9198, + "loss/crossentropy": 2.088913321495056, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068777084350586, + "step": 28162 + }, + { + "epoch": 0.56328, + "grad_norm": 1.9921875, + "grad_norm_var": 0.08334935506184896, + "learning_rate": 0.0001, + "loss": 3.967, + "loss/crossentropy": 2.0997453927993774, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21025776118040085, + "step": 28164 + }, + { + "epoch": 0.56332, + "grad_norm": 1.875, + "grad_norm_var": 0.08046061197916667, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 2.19319224357605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1984698325395584, + "step": 28166 + }, + { + "epoch": 0.56336, + "grad_norm": 1.9453125, + "grad_norm_var": 0.04764989217122396, + "learning_rate": 0.0001, + "loss": 4.0751, + "loss/crossentropy": 2.022126793861389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894361823797226, + "step": 28168 + }, + { + "epoch": 0.5634, + "grad_norm": 1.8984375, + "grad_norm_var": 0.04308242797851562, + "learning_rate": 0.0001, + "loss": 3.9752, + "loss/crossentropy": 2.0495948791503906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18251951038837433, + "step": 28170 + }, + { + "epoch": 0.56344, + "grad_norm": 1.8515625, + "grad_norm_var": 0.04134928385416667, + "learning_rate": 0.0001, + "loss": 4.1206, + "loss/crossentropy": 1.9946674704551697, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18497124314308167, + "step": 28172 + }, + { + "epoch": 0.56348, + "grad_norm": 1.9375, + "grad_norm_var": 0.03991063435872396, + "learning_rate": 0.0001, + "loss": 3.6823, + "loss/crossentropy": 1.970799744129181, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478563010692596, + "step": 28174 + }, + { + "epoch": 0.56352, + "grad_norm": 1.9453125, + "grad_norm_var": 0.039606730143229164, + "learning_rate": 0.0001, + "loss": 3.6153, + "loss/crossentropy": 1.9632895588874817, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1864907443523407, + "step": 28176 + }, + { + "epoch": 0.56356, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005150349934895834, + "learning_rate": 0.0001, + "loss": 3.8629, + "loss/crossentropy": 1.9328609108924866, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433043897151947, + "step": 28178 + }, + { + "epoch": 0.5636, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004727935791015625, + "learning_rate": 0.0001, + "loss": 3.9304, + "loss/crossentropy": 2.1020684242248535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18058212101459503, + "step": 28180 + }, + { + "epoch": 0.56364, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0033322652180989582, + "learning_rate": 0.0001, + "loss": 3.8489, + "loss/crossentropy": 2.008388578891754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189447820186615, + "step": 28182 + }, + { + "epoch": 0.56368, + "grad_norm": 1.78125, + "grad_norm_var": 0.004073079427083333, + "learning_rate": 0.0001, + "loss": 3.554, + "loss/crossentropy": 1.790539801120758, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16334787011146545, + "step": 28184 + }, + { + "epoch": 0.56372, + "grad_norm": 2.015625, + "grad_norm_var": 0.004439290364583333, + "learning_rate": 0.0001, + "loss": 3.7392, + "loss/crossentropy": 1.9603918194770813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18403412401676178, + "step": 28186 + }, + { + "epoch": 0.56376, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0047760009765625, + "learning_rate": 0.0001, + "loss": 3.7194, + "loss/crossentropy": 2.04629784822464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18315870314836502, + "step": 28188 + }, + { + "epoch": 0.5638, + "grad_norm": 1.765625, + "grad_norm_var": 0.007950592041015624, + "learning_rate": 0.0001, + "loss": 3.8053, + "loss/crossentropy": 2.2571674585342407, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19582515209913254, + "step": 28190 + }, + { + "epoch": 0.56384, + "grad_norm": 2.046875, + "grad_norm_var": 0.008668772379557292, + "learning_rate": 0.0001, + "loss": 4.0982, + "loss/crossentropy": 1.923226773738861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19587361812591553, + "step": 28192 + }, + { + "epoch": 0.56388, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007527669270833333, + "learning_rate": 0.0001, + "loss": 4.0072, + "loss/crossentropy": 2.167626738548279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20034413039684296, + "step": 28194 + }, + { + "epoch": 0.56392, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007462310791015625, + "learning_rate": 0.0001, + "loss": 3.9982, + "loss/crossentropy": 2.1942092180252075, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21775193512439728, + "step": 28196 + }, + { + "epoch": 0.56396, + "grad_norm": 2.0, + "grad_norm_var": 0.007950592041015624, + "learning_rate": 0.0001, + "loss": 4.1772, + "loss/crossentropy": 1.789182722568512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19279392063617706, + "step": 28198 + }, + { + "epoch": 0.564, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0064656575520833336, + "learning_rate": 0.0001, + "loss": 4.1771, + "loss/crossentropy": 1.8911787271499634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18866229057312012, + "step": 28200 + }, + { + "epoch": 0.56404, + "grad_norm": 1.984375, + "grad_norm_var": 0.006091054280598958, + "learning_rate": 0.0001, + "loss": 3.9862, + "loss/crossentropy": 2.0467012524604797, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21637209504842758, + "step": 28202 + }, + { + "epoch": 0.56408, + "grad_norm": 2.15625, + "grad_norm_var": 0.007541656494140625, + "learning_rate": 0.0001, + "loss": 3.9004, + "loss/crossentropy": 2.183765947818756, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19495883584022522, + "step": 28204 + }, + { + "epoch": 0.56412, + "grad_norm": 1.9609375, + "grad_norm_var": 0.004489898681640625, + "learning_rate": 0.0001, + "loss": 4.0853, + "loss/crossentropy": 2.237957239151001, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20132584869861603, + "step": 28206 + }, + { + "epoch": 0.56416, + "grad_norm": 1.859375, + "grad_norm_var": 0.005549112955729167, + "learning_rate": 0.0001, + "loss": 3.7728, + "loss/crossentropy": 1.9930571913719177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18800070136785507, + "step": 28208 + }, + { + "epoch": 0.5642, + "grad_norm": 1.984375, + "grad_norm_var": 0.005915323893229167, + "learning_rate": 0.0001, + "loss": 3.8866, + "loss/crossentropy": 1.9499656558036804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19010086357593536, + "step": 28210 + }, + { + "epoch": 0.56424, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005708567301432292, + "learning_rate": 0.0001, + "loss": 4.1938, + "loss/crossentropy": 2.115884840488434, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19502697885036469, + "step": 28212 + }, + { + "epoch": 0.56428, + "grad_norm": 1.8984375, + "grad_norm_var": 0.005940755208333333, + "learning_rate": 0.0001, + "loss": 4.1859, + "loss/crossentropy": 2.1863423585891724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2202358990907669, + "step": 28214 + }, + { + "epoch": 0.56432, + "grad_norm": 1.78125, + "grad_norm_var": 0.007661946614583333, + "learning_rate": 0.0001, + "loss": 3.6467, + "loss/crossentropy": 1.860621988773346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18784625828266144, + "step": 28216 + }, + { + "epoch": 0.56436, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007338205973307292, + "learning_rate": 0.0001, + "loss": 4.055, + "loss/crossentropy": 1.7891228199005127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822507157921791, + "step": 28218 + }, + { + "epoch": 0.5644, + "grad_norm": 1.7734375, + "grad_norm_var": 0.0050771077473958336, + "learning_rate": 0.0001, + "loss": 3.9942, + "loss/crossentropy": 2.050603687763214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20126069337129593, + "step": 28220 + }, + { + "epoch": 0.56444, + "grad_norm": 1.8125, + "grad_norm_var": 0.005350748697916667, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 2.3879984617233276, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843067973852158, + "step": 28222 + }, + { + "epoch": 0.56448, + "grad_norm": 1.953125, + "grad_norm_var": 0.005158487955729167, + "learning_rate": 0.0001, + "loss": 4.0579, + "loss/crossentropy": 2.0878185033798218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18253600597381592, + "step": 28224 + }, + { + "epoch": 0.56452, + "grad_norm": 1.96875, + "grad_norm_var": 0.0050432840983072914, + "learning_rate": 0.0001, + "loss": 3.9793, + "loss/crossentropy": 2.1168408393859863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21165993809700012, + "step": 28226 + }, + { + "epoch": 0.56456, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0061948140462239586, + "learning_rate": 0.0001, + "loss": 3.8824, + "loss/crossentropy": 1.8579602241516113, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18875548988580704, + "step": 28228 + }, + { + "epoch": 0.5646, + "grad_norm": 1.7109375, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 3.7804, + "loss/crossentropy": 2.0095511078834534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1720530390739441, + "step": 28230 + }, + { + "epoch": 0.56464, + "grad_norm": 1.96875, + "grad_norm_var": 0.0074765523274739586, + "learning_rate": 0.0001, + "loss": 3.841, + "loss/crossentropy": 1.8599479794502258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17767678201198578, + "step": 28232 + }, + { + "epoch": 0.56468, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007746378580729167, + "learning_rate": 0.0001, + "loss": 4.2437, + "loss/crossentropy": 2.11702424287796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1897270455956459, + "step": 28234 + }, + { + "epoch": 0.56472, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006180826822916667, + "learning_rate": 0.0001, + "loss": 3.9554, + "loss/crossentropy": 1.8956347703933716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1801730990409851, + "step": 28236 + }, + { + "epoch": 0.56476, + "grad_norm": 1.953125, + "grad_norm_var": 0.006766764322916666, + "learning_rate": 0.0001, + "loss": 3.9988, + "loss/crossentropy": 2.044249713420868, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17439774423837662, + "step": 28238 + }, + { + "epoch": 0.5648, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006534576416015625, + "learning_rate": 0.0001, + "loss": 3.9429, + "loss/crossentropy": 1.8406980633735657, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18690447509288788, + "step": 28240 + }, + { + "epoch": 0.56484, + "grad_norm": 1.84375, + "grad_norm_var": 0.03523661295572917, + "learning_rate": 0.0001, + "loss": 3.6977, + "loss/crossentropy": 1.8951767683029175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18628580123186111, + "step": 28242 + }, + { + "epoch": 0.56488, + "grad_norm": 1.8046875, + "grad_norm_var": 0.03559951782226563, + "learning_rate": 0.0001, + "loss": 3.638, + "loss/crossentropy": 1.9350382685661316, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19427261501550674, + "step": 28244 + }, + { + "epoch": 0.56492, + "grad_norm": 1.9296875, + "grad_norm_var": 0.032698567708333334, + "learning_rate": 0.0001, + "loss": 3.997, + "loss/crossentropy": 2.370202422142029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2236226201057434, + "step": 28246 + }, + { + "epoch": 0.56496, + "grad_norm": 1.84375, + "grad_norm_var": 0.032163238525390624, + "learning_rate": 0.0001, + "loss": 4.0368, + "loss/crossentropy": 2.1876412630081177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005016803741455, + "step": 28248 + }, + { + "epoch": 0.565, + "grad_norm": 2.046875, + "grad_norm_var": 0.0342193603515625, + "learning_rate": 0.0001, + "loss": 3.9678, + "loss/crossentropy": 2.1329512000083923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20921162515878677, + "step": 28250 + }, + { + "epoch": 0.56504, + "grad_norm": 2.203125, + "grad_norm_var": 0.040036773681640624, + "learning_rate": 0.0001, + "loss": 3.8688, + "loss/crossentropy": 1.8053449392318726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17606808245182037, + "step": 28252 + }, + { + "epoch": 0.56508, + "grad_norm": 1.765625, + "grad_norm_var": 0.04079360961914062, + "learning_rate": 0.0001, + "loss": 3.9478, + "loss/crossentropy": 2.0177281498908997, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19240254908800125, + "step": 28254 + }, + { + "epoch": 0.56512, + "grad_norm": 1.953125, + "grad_norm_var": 0.04112930297851562, + "learning_rate": 0.0001, + "loss": 3.9164, + "loss/crossentropy": 1.9807183146476746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726047471165657, + "step": 28256 + }, + { + "epoch": 0.56516, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012345377604166667, + "learning_rate": 0.0001, + "loss": 4.0006, + "loss/crossentropy": 2.0992757081985474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19170843064785004, + "step": 28258 + }, + { + "epoch": 0.5652, + "grad_norm": 1.828125, + "grad_norm_var": 0.01220703125, + "learning_rate": 0.0001, + "loss": 3.6902, + "loss/crossentropy": 1.865010380744934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18145494908094406, + "step": 28260 + }, + { + "epoch": 0.56524, + "grad_norm": 1.9609375, + "grad_norm_var": 0.012788899739583333, + "learning_rate": 0.0001, + "loss": 3.7781, + "loss/crossentropy": 2.060899078845978, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17389538139104843, + "step": 28262 + }, + { + "epoch": 0.56528, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013911946614583334, + "learning_rate": 0.0001, + "loss": 3.8494, + "loss/crossentropy": 1.7959403991699219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17851080000400543, + "step": 28264 + }, + { + "epoch": 0.56532, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012410227457682292, + "learning_rate": 0.0001, + "loss": 3.8456, + "loss/crossentropy": 2.169707775115967, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19611390680074692, + "step": 28266 + }, + { + "epoch": 0.56536, + "grad_norm": 2.1875, + "grad_norm_var": 0.015697224934895834, + "learning_rate": 0.0001, + "loss": 4.1967, + "loss/crossentropy": 2.322705864906311, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20728319883346558, + "step": 28268 + }, + { + "epoch": 0.5654, + "grad_norm": 2.09375, + "grad_norm_var": 0.013423665364583334, + "learning_rate": 0.0001, + "loss": 3.8118, + "loss/crossentropy": 1.9398122429847717, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211900979280472, + "step": 28270 + }, + { + "epoch": 0.56544, + "grad_norm": 1.890625, + "grad_norm_var": 0.013435618082682291, + "learning_rate": 0.0001, + "loss": 3.9892, + "loss/crossentropy": 1.7359251976013184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16433238983154297, + "step": 28272 + }, + { + "epoch": 0.56548, + "grad_norm": 1.8125, + "grad_norm_var": 0.014973958333333334, + "learning_rate": 0.0001, + "loss": 4.1366, + "loss/crossentropy": 2.3107075691223145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2127886563539505, + "step": 28274 + }, + { + "epoch": 0.56552, + "grad_norm": 1.734375, + "grad_norm_var": 0.01906305948893229, + "learning_rate": 0.0001, + "loss": 3.5307, + "loss/crossentropy": 1.5368210673332214, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15861748158931732, + "step": 28276 + }, + { + "epoch": 0.56556, + "grad_norm": 1.90625, + "grad_norm_var": 0.01902033487955729, + "learning_rate": 0.0001, + "loss": 4.2099, + "loss/crossentropy": 2.145693838596344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1863054633140564, + "step": 28278 + }, + { + "epoch": 0.5656, + "grad_norm": 1.828125, + "grad_norm_var": 0.018464152018229166, + "learning_rate": 0.0001, + "loss": 3.9928, + "loss/crossentropy": 2.3174896240234375, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882578581571579, + "step": 28280 + }, + { + "epoch": 0.56564, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01803766886393229, + "learning_rate": 0.0001, + "loss": 4.2401, + "loss/crossentropy": 2.3332719802856445, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2089502438902855, + "step": 28282 + }, + { + "epoch": 0.56568, + "grad_norm": 1.84375, + "grad_norm_var": 0.008066558837890625, + "learning_rate": 0.0001, + "loss": 3.9758, + "loss/crossentropy": 2.3516300916671753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21649855375289917, + "step": 28284 + }, + { + "epoch": 0.56572, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0067138671875, + "learning_rate": 0.0001, + "loss": 3.9705, + "loss/crossentropy": 1.9031076431274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18855175375938416, + "step": 28286 + }, + { + "epoch": 0.56576, + "grad_norm": 1.96875, + "grad_norm_var": 0.007100423177083333, + "learning_rate": 0.0001, + "loss": 4.1005, + "loss/crossentropy": 1.5520136952400208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1816611886024475, + "step": 28288 + }, + { + "epoch": 0.5658, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006894683837890625, + "learning_rate": 0.0001, + "loss": 3.7001, + "loss/crossentropy": 1.9759097695350647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19612447917461395, + "step": 28290 + }, + { + "epoch": 0.56584, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005163319905598958, + "learning_rate": 0.0001, + "loss": 4.0389, + "loss/crossentropy": 2.1442651748657227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18353651463985443, + "step": 28292 + }, + { + "epoch": 0.56588, + "grad_norm": 2.03125, + "grad_norm_var": 0.005914052327473958, + "learning_rate": 0.0001, + "loss": 4.0021, + "loss/crossentropy": 1.8098361492156982, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18010258674621582, + "step": 28294 + }, + { + "epoch": 0.56592, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0052886962890625, + "learning_rate": 0.0001, + "loss": 3.9866, + "loss/crossentropy": 2.2412387132644653, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21006706357002258, + "step": 28296 + }, + { + "epoch": 0.56596, + "grad_norm": 1.921875, + "grad_norm_var": 0.006461334228515625, + "learning_rate": 0.0001, + "loss": 4.07, + "loss/crossentropy": 1.927619457244873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1915675550699234, + "step": 28298 + }, + { + "epoch": 0.566, + "grad_norm": 1.7734375, + "grad_norm_var": 0.007502237955729167, + "learning_rate": 0.0001, + "loss": 3.9184, + "loss/crossentropy": 2.317517042160034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19702450931072235, + "step": 28300 + }, + { + "epoch": 0.56604, + "grad_norm": 1.984375, + "grad_norm_var": 0.006514231363932292, + "learning_rate": 0.0001, + "loss": 4.0558, + "loss/crossentropy": 2.02554988861084, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20614077150821686, + "step": 28302 + }, + { + "epoch": 0.56608, + "grad_norm": 1.875, + "grad_norm_var": 0.006359608968098959, + "learning_rate": 0.0001, + "loss": 3.9733, + "loss/crossentropy": 2.2094991207122803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18375249207019806, + "step": 28304 + }, + { + "epoch": 0.56612, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0048601786295572914, + "learning_rate": 0.0001, + "loss": 3.956, + "loss/crossentropy": 1.938852310180664, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19518058001995087, + "step": 28306 + }, + { + "epoch": 0.56616, + "grad_norm": 2.09375, + "grad_norm_var": 0.0065081278483072914, + "learning_rate": 0.0001, + "loss": 4.0979, + "loss/crossentropy": 2.0719003677368164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19132573902606964, + "step": 28308 + }, + { + "epoch": 0.5662, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0059588114420572914, + "learning_rate": 0.0001, + "loss": 3.838, + "loss/crossentropy": 2.01191109418869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19400015473365784, + "step": 28310 + }, + { + "epoch": 0.56624, + "grad_norm": 1.875, + "grad_norm_var": 0.005782063802083333, + "learning_rate": 0.0001, + "loss": 3.9532, + "loss/crossentropy": 2.302769422531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21603811532258987, + "step": 28312 + }, + { + "epoch": 0.56628, + "grad_norm": 2.03125, + "grad_norm_var": 0.006254069010416667, + "learning_rate": 0.0001, + "loss": 3.9965, + "loss/crossentropy": 1.8251231908798218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1660621464252472, + "step": 28314 + }, + { + "epoch": 0.56632, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0079254150390625, + "learning_rate": 0.0001, + "loss": 4.2141, + "loss/crossentropy": 2.0656734704971313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19783338904380798, + "step": 28316 + }, + { + "epoch": 0.56636, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008485666910807292, + "learning_rate": 0.0001, + "loss": 4.1956, + "loss/crossentropy": 1.949280321598053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19571927189826965, + "step": 28318 + }, + { + "epoch": 0.5664, + "grad_norm": 1.96875, + "grad_norm_var": 0.008571116129557292, + "learning_rate": 0.0001, + "loss": 4.0139, + "loss/crossentropy": 1.801173746585846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17420585453510284, + "step": 28320 + }, + { + "epoch": 0.56644, + "grad_norm": 1.90625, + "grad_norm_var": 0.008519490559895834, + "learning_rate": 0.0001, + "loss": 3.8094, + "loss/crossentropy": 2.000474214553833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19203417748212814, + "step": 28322 + }, + { + "epoch": 0.56648, + "grad_norm": 1.84375, + "grad_norm_var": 0.006681315104166667, + "learning_rate": 0.0001, + "loss": 4.0042, + "loss/crossentropy": 1.9475297331809998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19463783502578735, + "step": 28324 + }, + { + "epoch": 0.56652, + "grad_norm": 1.9921875, + "grad_norm_var": 0.005965169270833333, + "learning_rate": 0.0001, + "loss": 3.9425, + "loss/crossentropy": 2.029974043369293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1769544631242752, + "step": 28326 + }, + { + "epoch": 0.56656, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 3.995, + "loss/crossentropy": 1.9521240592002869, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.189279705286026, + "step": 28328 + }, + { + "epoch": 0.5666, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006136067708333333, + "learning_rate": 0.0001, + "loss": 4.1978, + "loss/crossentropy": 2.128006398677826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22940929979085922, + "step": 28330 + }, + { + "epoch": 0.56664, + "grad_norm": 1.78125, + "grad_norm_var": 0.005234527587890625, + "learning_rate": 0.0001, + "loss": 3.9098, + "loss/crossentropy": 2.3887888193130493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2106098011136055, + "step": 28332 + }, + { + "epoch": 0.56668, + "grad_norm": 1.78125, + "grad_norm_var": 0.006639607747395833, + "learning_rate": 0.0001, + "loss": 3.9625, + "loss/crossentropy": 1.9730157256126404, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2026069611310959, + "step": 28334 + }, + { + "epoch": 0.56672, + "grad_norm": 2.015625, + "grad_norm_var": 0.006856028238932292, + "learning_rate": 0.0001, + "loss": 4.0934, + "loss/crossentropy": 2.384614109992981, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21988043189048767, + "step": 28336 + }, + { + "epoch": 0.56676, + "grad_norm": 2.046875, + "grad_norm_var": 0.008388010660807292, + "learning_rate": 0.0001, + "loss": 3.9947, + "loss/crossentropy": 1.843708097934723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18971703946590424, + "step": 28338 + }, + { + "epoch": 0.5668, + "grad_norm": 2.0625, + "grad_norm_var": 0.009897613525390625, + "learning_rate": 0.0001, + "loss": 3.953, + "loss/crossentropy": 2.1084065437316895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19351261109113693, + "step": 28340 + }, + { + "epoch": 0.56684, + "grad_norm": 1.8125, + "grad_norm_var": 0.011321767171223959, + "learning_rate": 0.0001, + "loss": 3.742, + "loss/crossentropy": 1.7598699927330017, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18556873500347137, + "step": 28342 + }, + { + "epoch": 0.56688, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 3.8072, + "loss/crossentropy": 1.952016532421112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828347072005272, + "step": 28344 + }, + { + "epoch": 0.56692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010286458333333333, + "learning_rate": 0.0001, + "loss": 3.939, + "loss/crossentropy": 1.9486799240112305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19223074615001678, + "step": 28346 + }, + { + "epoch": 0.56696, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009197743733723958, + "learning_rate": 0.0001, + "loss": 3.9421, + "loss/crossentropy": 1.9944785237312317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19656967371702194, + "step": 28348 + }, + { + "epoch": 0.567, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007911936442057291, + "learning_rate": 0.0001, + "loss": 3.5903, + "loss/crossentropy": 2.055353820323944, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19104135036468506, + "step": 28350 + }, + { + "epoch": 0.56704, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007096354166666667, + "learning_rate": 0.0001, + "loss": 3.8961, + "loss/crossentropy": 1.9901525974273682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20019636303186417, + "step": 28352 + }, + { + "epoch": 0.56708, + "grad_norm": 1.9140625, + "grad_norm_var": 0.005418904622395833, + "learning_rate": 0.0001, + "loss": 4.0514, + "loss/crossentropy": 2.246248483657837, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20701756328344345, + "step": 28354 + }, + { + "epoch": 0.56712, + "grad_norm": 2.234375, + "grad_norm_var": 0.011136881510416667, + "learning_rate": 0.0001, + "loss": 3.8456, + "loss/crossentropy": 1.9263428449630737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19890911877155304, + "step": 28356 + }, + { + "epoch": 0.56716, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009338124593098959, + "learning_rate": 0.0001, + "loss": 4.0555, + "loss/crossentropy": 2.2119653820991516, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20472905784845352, + "step": 28358 + }, + { + "epoch": 0.5672, + "grad_norm": 2.09375, + "grad_norm_var": 0.010406239827473959, + "learning_rate": 0.0001, + "loss": 4.2023, + "loss/crossentropy": 2.1126968264579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18353600800037384, + "step": 28360 + }, + { + "epoch": 0.56724, + "grad_norm": 2.03125, + "grad_norm_var": 0.010929361979166666, + "learning_rate": 0.0001, + "loss": 3.8412, + "loss/crossentropy": 1.7391886115074158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1703544184565544, + "step": 28362 + }, + { + "epoch": 0.56728, + "grad_norm": 1.9140625, + "grad_norm_var": 0.010689036051432291, + "learning_rate": 0.0001, + "loss": 3.8372, + "loss/crossentropy": 1.8684263229370117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18913953006267548, + "step": 28364 + }, + { + "epoch": 0.56732, + "grad_norm": 1.828125, + "grad_norm_var": 0.010699208577473958, + "learning_rate": 0.0001, + "loss": 3.7079, + "loss/crossentropy": 1.971863567829132, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.188376322388649, + "step": 28366 + }, + { + "epoch": 0.56736, + "grad_norm": 2.015625, + "grad_norm_var": 0.010827382405598959, + "learning_rate": 0.0001, + "loss": 4.1637, + "loss/crossentropy": 1.8890693187713623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18029199540615082, + "step": 28368 + }, + { + "epoch": 0.5674, + "grad_norm": 2.109375, + "grad_norm_var": 0.011861165364583334, + "learning_rate": 0.0001, + "loss": 4.2677, + "loss/crossentropy": 1.9506273865699768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19081934541463852, + "step": 28370 + }, + { + "epoch": 0.56744, + "grad_norm": 2.03125, + "grad_norm_var": 0.006463368733723958, + "learning_rate": 0.0001, + "loss": 4.0055, + "loss/crossentropy": 1.937731921672821, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21456565707921982, + "step": 28372 + }, + { + "epoch": 0.56748, + "grad_norm": 1.859375, + "grad_norm_var": 0.006891886393229167, + "learning_rate": 0.0001, + "loss": 4.0904, + "loss/crossentropy": 2.0715887546539307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21278052031993866, + "step": 28374 + }, + { + "epoch": 0.56752, + "grad_norm": 1.7890625, + "grad_norm_var": 0.007413736979166667, + "learning_rate": 0.0001, + "loss": 3.9901, + "loss/crossentropy": 2.2458627223968506, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20296980440616608, + "step": 28376 + }, + { + "epoch": 0.56756, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006886545817057292, + "learning_rate": 0.0001, + "loss": 4.178, + "loss/crossentropy": 2.3213919401168823, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19989027082920074, + "step": 28378 + }, + { + "epoch": 0.5676, + "grad_norm": 1.921875, + "grad_norm_var": 0.007500966389973958, + "learning_rate": 0.0001, + "loss": 4.2319, + "loss/crossentropy": 2.543839931488037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22476375102996826, + "step": 28380 + }, + { + "epoch": 0.56764, + "grad_norm": 1.984375, + "grad_norm_var": 0.006084950764973959, + "learning_rate": 0.0001, + "loss": 3.9426, + "loss/crossentropy": 2.1060808897018433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993144229054451, + "step": 28382 + }, + { + "epoch": 0.56768, + "grad_norm": 1.9375, + "grad_norm_var": 0.0074859619140625, + "learning_rate": 0.0001, + "loss": 3.8902, + "loss/crossentropy": 1.9778028726577759, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18509671837091446, + "step": 28384 + }, + { + "epoch": 0.56772, + "grad_norm": 2.203125, + "grad_norm_var": 0.010445149739583333, + "learning_rate": 0.0001, + "loss": 3.7404, + "loss/crossentropy": 1.7300578951835632, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17200996726751328, + "step": 28386 + }, + { + "epoch": 0.56776, + "grad_norm": 1.7734375, + "grad_norm_var": 0.011352284749348959, + "learning_rate": 0.0001, + "loss": 3.8029, + "loss/crossentropy": 1.87715744972229, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1691356599330902, + "step": 28388 + }, + { + "epoch": 0.5678, + "grad_norm": 1.7578125, + "grad_norm_var": 0.01263427734375, + "learning_rate": 0.0001, + "loss": 3.7688, + "loss/crossentropy": 1.9529326558113098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18868190050125122, + "step": 28390 + }, + { + "epoch": 0.56784, + "grad_norm": 1.84375, + "grad_norm_var": 0.012654622395833334, + "learning_rate": 0.0001, + "loss": 3.9265, + "loss/crossentropy": 1.9922877550125122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18494701385498047, + "step": 28392 + }, + { + "epoch": 0.56788, + "grad_norm": 2.046875, + "grad_norm_var": 0.014012654622395834, + "learning_rate": 0.0001, + "loss": 4.0745, + "loss/crossentropy": 2.1247864961624146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20046664774417877, + "step": 28394 + }, + { + "epoch": 0.56792, + "grad_norm": 1.796875, + "grad_norm_var": 0.019334920247395835, + "learning_rate": 0.0001, + "loss": 3.9056, + "loss/crossentropy": 2.106461524963379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1846928596496582, + "step": 28396 + }, + { + "epoch": 0.56796, + "grad_norm": 1.8203125, + "grad_norm_var": 0.02042210896809896, + "learning_rate": 0.0001, + "loss": 4.0709, + "loss/crossentropy": 2.5050116777420044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23650984466075897, + "step": 28398 + }, + { + "epoch": 0.568, + "grad_norm": 1.9765625, + "grad_norm_var": 0.01956151326497396, + "learning_rate": 0.0001, + "loss": 3.9026, + "loss/crossentropy": 1.8201875686645508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18172843009233475, + "step": 28400 + }, + { + "epoch": 0.56804, + "grad_norm": 1.78125, + "grad_norm_var": 0.014680735270182292, + "learning_rate": 0.0001, + "loss": 3.7716, + "loss/crossentropy": 1.748970091342926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1714978739619255, + "step": 28402 + }, + { + "epoch": 0.56808, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013703409830729167, + "learning_rate": 0.0001, + "loss": 3.9609, + "loss/crossentropy": 2.027329981327057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16597037762403488, + "step": 28404 + }, + { + "epoch": 0.56812, + "grad_norm": 1.859375, + "grad_norm_var": 0.012400054931640625, + "learning_rate": 0.0001, + "loss": 3.8902, + "loss/crossentropy": 1.8651673197746277, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17772310227155685, + "step": 28406 + }, + { + "epoch": 0.56816, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011869049072265625, + "learning_rate": 0.0001, + "loss": 4.0313, + "loss/crossentropy": 2.1574735045433044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2000775933265686, + "step": 28408 + }, + { + "epoch": 0.5682, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010861968994140625, + "learning_rate": 0.0001, + "loss": 4.1171, + "loss/crossentropy": 2.1159602403640747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19108104705810547, + "step": 28410 + }, + { + "epoch": 0.56824, + "grad_norm": 2.171875, + "grad_norm_var": 0.008829498291015625, + "learning_rate": 0.0001, + "loss": 4.229, + "loss/crossentropy": 2.203667163848877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056882083415985, + "step": 28412 + }, + { + "epoch": 0.56828, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008381907145182292, + "learning_rate": 0.0001, + "loss": 4.0476, + "loss/crossentropy": 2.0054327845573425, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914311721920967, + "step": 28414 + }, + { + "epoch": 0.56832, + "grad_norm": 1.8984375, + "grad_norm_var": 0.008489735921223958, + "learning_rate": 0.0001, + "loss": 4.1249, + "loss/crossentropy": 2.199601411819458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020196169614792, + "step": 28416 + }, + { + "epoch": 0.56836, + "grad_norm": 2.015625, + "grad_norm_var": 0.007682291666666666, + "learning_rate": 0.0001, + "loss": 4.1068, + "loss/crossentropy": 1.84443199634552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17476340383291245, + "step": 28418 + }, + { + "epoch": 0.5684, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007624308268229167, + "learning_rate": 0.0001, + "loss": 3.9032, + "loss/crossentropy": 2.2852976322174072, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21395886689424515, + "step": 28420 + }, + { + "epoch": 0.56844, + "grad_norm": 2.046875, + "grad_norm_var": 0.009956614176432291, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 2.2443678975105286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19366547465324402, + "step": 28422 + }, + { + "epoch": 0.56848, + "grad_norm": 1.953125, + "grad_norm_var": 0.012784830729166667, + "learning_rate": 0.0001, + "loss": 4.2711, + "loss/crossentropy": 2.055034577846527, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914302259683609, + "step": 28424 + }, + { + "epoch": 0.56852, + "grad_norm": 1.875, + "grad_norm_var": 0.012245432535807291, + "learning_rate": 0.0001, + "loss": 4.0782, + "loss/crossentropy": 2.33309006690979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2132418155670166, + "step": 28426 + }, + { + "epoch": 0.56856, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009246571858723959, + "learning_rate": 0.0001, + "loss": 4.2069, + "loss/crossentropy": 2.4401670694351196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20467859506607056, + "step": 28428 + }, + { + "epoch": 0.5686, + "grad_norm": 1.9375, + "grad_norm_var": 0.015778605143229166, + "learning_rate": 0.0001, + "loss": 3.9526, + "loss/crossentropy": 1.8660191297531128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23643852770328522, + "step": 28430 + }, + { + "epoch": 0.56864, + "grad_norm": 2.109375, + "grad_norm_var": 0.016080729166666665, + "learning_rate": 0.0001, + "loss": 4.1835, + "loss/crossentropy": 2.026413381099701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.25309374928474426, + "step": 28432 + }, + { + "epoch": 0.56868, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01834691365559896, + "learning_rate": 0.0001, + "loss": 4.0417, + "loss/crossentropy": 2.1234898567199707, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19204584509134293, + "step": 28434 + }, + { + "epoch": 0.56872, + "grad_norm": 2.046875, + "grad_norm_var": 0.017292277018229166, + "learning_rate": 0.0001, + "loss": 4.1999, + "loss/crossentropy": 2.3347290754318237, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21934974193572998, + "step": 28436 + }, + { + "epoch": 0.56876, + "grad_norm": 1.828125, + "grad_norm_var": 0.016076405843098957, + "learning_rate": 0.0001, + "loss": 3.8796, + "loss/crossentropy": 2.0287744402885437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1834561824798584, + "step": 28438 + }, + { + "epoch": 0.5688, + "grad_norm": 1.8671875, + "grad_norm_var": 0.014654286702473958, + "learning_rate": 0.0001, + "loss": 4.0872, + "loss/crossentropy": 1.782648503780365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1810266673564911, + "step": 28440 + }, + { + "epoch": 0.56884, + "grad_norm": 1.8125, + "grad_norm_var": 0.016139475504557292, + "learning_rate": 0.0001, + "loss": 4.0876, + "loss/crossentropy": 2.2251389622688293, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2041720598936081, + "step": 28442 + }, + { + "epoch": 0.56888, + "grad_norm": 1.7734375, + "grad_norm_var": 0.018846638997395835, + "learning_rate": 0.0001, + "loss": 3.7876, + "loss/crossentropy": 1.9368031024932861, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1828494518995285, + "step": 28444 + }, + { + "epoch": 0.56892, + "grad_norm": 1.890625, + "grad_norm_var": 0.011763254801432291, + "learning_rate": 0.0001, + "loss": 3.9775, + "loss/crossentropy": 2.0573925971984863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192618265748024, + "step": 28446 + }, + { + "epoch": 0.56896, + "grad_norm": 1.90625, + "grad_norm_var": 0.0088775634765625, + "learning_rate": 0.0001, + "loss": 3.8321, + "loss/crossentropy": 1.9373971223831177, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20026732236146927, + "step": 28448 + }, + { + "epoch": 0.569, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007391103108723958, + "learning_rate": 0.0001, + "loss": 3.8771, + "loss/crossentropy": 1.877286970615387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17163537442684174, + "step": 28450 + }, + { + "epoch": 0.56904, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0054931640625, + "learning_rate": 0.0001, + "loss": 3.8502, + "loss/crossentropy": 2.1928617358207703, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20396625250577927, + "step": 28452 + }, + { + "epoch": 0.56908, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008373006184895834, + "learning_rate": 0.0001, + "loss": 3.9737, + "loss/crossentropy": 2.087548613548279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2330121174454689, + "step": 28454 + }, + { + "epoch": 0.56912, + "grad_norm": 1.921875, + "grad_norm_var": 0.008275349934895834, + "learning_rate": 0.0001, + "loss": 4.24, + "loss/crossentropy": 2.0506752729415894, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19728151708841324, + "step": 28456 + }, + { + "epoch": 0.56916, + "grad_norm": 1.9453125, + "grad_norm_var": 0.006174468994140625, + "learning_rate": 0.0001, + "loss": 4.1038, + "loss/crossentropy": 2.3582634925842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21210015565156937, + "step": 28458 + }, + { + "epoch": 0.5692, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0051422119140625, + "learning_rate": 0.0001, + "loss": 3.8762, + "loss/crossentropy": 2.3920403718948364, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20361161977052689, + "step": 28460 + }, + { + "epoch": 0.56924, + "grad_norm": 1.953125, + "grad_norm_var": 0.0048248291015625, + "learning_rate": 0.0001, + "loss": 3.9837, + "loss/crossentropy": 2.077300548553467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18526732921600342, + "step": 28462 + }, + { + "epoch": 0.56928, + "grad_norm": 1.828125, + "grad_norm_var": 0.0059967041015625, + "learning_rate": 0.0001, + "loss": 3.7625, + "loss/crossentropy": 2.0746008157730103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.176588736474514, + "step": 28464 + }, + { + "epoch": 0.56932, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005008697509765625, + "learning_rate": 0.0001, + "loss": 4.0275, + "loss/crossentropy": 2.651300311088562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22577688097953796, + "step": 28466 + }, + { + "epoch": 0.56936, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006477864583333334, + "learning_rate": 0.0001, + "loss": 3.9464, + "loss/crossentropy": 2.169211745262146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20319853723049164, + "step": 28468 + }, + { + "epoch": 0.5694, + "grad_norm": 1.7109375, + "grad_norm_var": 0.005673980712890625, + "learning_rate": 0.0001, + "loss": 3.7715, + "loss/crossentropy": 2.16380512714386, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1928708478808403, + "step": 28470 + }, + { + "epoch": 0.56944, + "grad_norm": 1.890625, + "grad_norm_var": 0.0054094950358072914, + "learning_rate": 0.0001, + "loss": 3.8378, + "loss/crossentropy": 1.780498206615448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1736089140176773, + "step": 28472 + }, + { + "epoch": 0.56948, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005597941080729167, + "learning_rate": 0.0001, + "loss": 4.1348, + "loss/crossentropy": 1.9082182049751282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17953625321388245, + "step": 28474 + }, + { + "epoch": 0.56952, + "grad_norm": 1.984375, + "grad_norm_var": 0.005499013264973958, + "learning_rate": 0.0001, + "loss": 3.8997, + "loss/crossentropy": 1.6533525586128235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16805779933929443, + "step": 28476 + }, + { + "epoch": 0.56956, + "grad_norm": 2.03125, + "grad_norm_var": 0.006882476806640625, + "learning_rate": 0.0001, + "loss": 4.0518, + "loss/crossentropy": 2.0916685461997986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1956561803817749, + "step": 28478 + }, + { + "epoch": 0.5696, + "grad_norm": 2.03125, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 4.1282, + "loss/crossentropy": 1.9846032857894897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17997410148382187, + "step": 28480 + }, + { + "epoch": 0.56964, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008396148681640625, + "learning_rate": 0.0001, + "loss": 4.0715, + "loss/crossentropy": 1.9905300736427307, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17957545816898346, + "step": 28482 + }, + { + "epoch": 0.56968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007356516520182292, + "learning_rate": 0.0001, + "loss": 3.7545, + "loss/crossentropy": 1.9274362921714783, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20025887340307236, + "step": 28484 + }, + { + "epoch": 0.56972, + "grad_norm": 1.953125, + "grad_norm_var": 0.003981272379557292, + "learning_rate": 0.0001, + "loss": 3.8606, + "loss/crossentropy": 1.7590890526771545, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17347681522369385, + "step": 28486 + }, + { + "epoch": 0.56976, + "grad_norm": 1.9453125, + "grad_norm_var": 0.003979237874348959, + "learning_rate": 0.0001, + "loss": 3.9737, + "loss/crossentropy": 1.8959112763404846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18316560983657837, + "step": 28488 + }, + { + "epoch": 0.5698, + "grad_norm": 2.046875, + "grad_norm_var": 0.004292805989583333, + "learning_rate": 0.0001, + "loss": 3.9393, + "loss/crossentropy": 2.108347535133362, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20547642558813095, + "step": 28490 + }, + { + "epoch": 0.56984, + "grad_norm": 1.90625, + "grad_norm_var": 0.003482818603515625, + "learning_rate": 0.0001, + "loss": 3.8703, + "loss/crossentropy": 1.9661864638328552, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1860649138689041, + "step": 28492 + }, + { + "epoch": 0.56988, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0029111226399739583, + "learning_rate": 0.0001, + "loss": 3.8467, + "loss/crossentropy": 1.7892447710037231, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16265498101711273, + "step": 28494 + }, + { + "epoch": 0.56992, + "grad_norm": 1.84375, + "grad_norm_var": 0.0030507405598958334, + "learning_rate": 0.0001, + "loss": 4.1363, + "loss/crossentropy": 2.2188740968704224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20954828709363937, + "step": 28496 + }, + { + "epoch": 0.56996, + "grad_norm": 2.03125, + "grad_norm_var": 0.005028279622395834, + "learning_rate": 0.0001, + "loss": 4.1339, + "loss/crossentropy": 2.1535292863845825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19683580845594406, + "step": 28498 + }, + { + "epoch": 0.57, + "grad_norm": 1.90625, + "grad_norm_var": 0.0048258463541666664, + "learning_rate": 0.0001, + "loss": 3.8687, + "loss/crossentropy": 2.170483350753784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20434408634901047, + "step": 28500 + }, + { + "epoch": 0.57004, + "grad_norm": 1.9375, + "grad_norm_var": 0.004735310872395833, + "learning_rate": 0.0001, + "loss": 3.9846, + "loss/crossentropy": 1.9062520265579224, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18132874369621277, + "step": 28502 + }, + { + "epoch": 0.57008, + "grad_norm": 2.0, + "grad_norm_var": 0.005014801025390625, + "learning_rate": 0.0001, + "loss": 3.9361, + "loss/crossentropy": 1.9402993321418762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1883842498064041, + "step": 28504 + }, + { + "epoch": 0.57012, + "grad_norm": 2.25, + "grad_norm_var": 0.010516103108723958, + "learning_rate": 0.0001, + "loss": 4.2954, + "loss/crossentropy": 2.1244900226593018, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19686578959226608, + "step": 28506 + }, + { + "epoch": 0.57016, + "grad_norm": 1.96875, + "grad_norm_var": 0.010432688395182292, + "learning_rate": 0.0001, + "loss": 4.213, + "loss/crossentropy": 2.255519151687622, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19818376004695892, + "step": 28508 + }, + { + "epoch": 0.5702, + "grad_norm": 1.90625, + "grad_norm_var": 0.017838541666666666, + "learning_rate": 0.0001, + "loss": 4.3414, + "loss/crossentropy": 1.69990873336792, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16047322005033493, + "step": 28510 + }, + { + "epoch": 0.57024, + "grad_norm": 1.859375, + "grad_norm_var": 0.01759618123372396, + "learning_rate": 0.0001, + "loss": 4.0451, + "loss/crossentropy": 2.0938282012939453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19069606065750122, + "step": 28512 + }, + { + "epoch": 0.57028, + "grad_norm": 1.84375, + "grad_norm_var": 0.015746053059895834, + "learning_rate": 0.0001, + "loss": 3.9285, + "loss/crossentropy": 2.119936943054199, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20535118132829666, + "step": 28514 + }, + { + "epoch": 0.57032, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015364583333333333, + "learning_rate": 0.0001, + "loss": 4.1106, + "loss/crossentropy": 2.124122440814972, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19785825163125992, + "step": 28516 + }, + { + "epoch": 0.57036, + "grad_norm": 1.765625, + "grad_norm_var": 0.018342081705729166, + "learning_rate": 0.0001, + "loss": 4.025, + "loss/crossentropy": 2.275521993637085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197978176176548, + "step": 28518 + }, + { + "epoch": 0.5704, + "grad_norm": 1.75, + "grad_norm_var": 0.021089680989583335, + "learning_rate": 0.0001, + "loss": 3.8305, + "loss/crossentropy": 1.9788197875022888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17468783259391785, + "step": 28520 + }, + { + "epoch": 0.57044, + "grad_norm": 1.984375, + "grad_norm_var": 0.015799713134765626, + "learning_rate": 0.0001, + "loss": 3.9497, + "loss/crossentropy": 1.9069678783416748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17532987147569656, + "step": 28522 + }, + { + "epoch": 0.57048, + "grad_norm": 1.9375, + "grad_norm_var": 0.015705362955729166, + "learning_rate": 0.0001, + "loss": 3.9084, + "loss/crossentropy": 1.7438762784004211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15763580799102783, + "step": 28524 + }, + { + "epoch": 0.57052, + "grad_norm": 2.0625, + "grad_norm_var": 0.008280436197916666, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 2.0234435200691223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17508037388324738, + "step": 28526 + }, + { + "epoch": 0.57056, + "grad_norm": 2.0, + "grad_norm_var": 0.008571116129557292, + "learning_rate": 0.0001, + "loss": 4.1993, + "loss/crossentropy": 2.2856411933898926, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183924585580826, + "step": 28528 + }, + { + "epoch": 0.5706, + "grad_norm": 2.0625, + "grad_norm_var": 0.010358683268229167, + "learning_rate": 0.0001, + "loss": 3.9569, + "loss/crossentropy": 2.201618194580078, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2216714471578598, + "step": 28530 + }, + { + "epoch": 0.57064, + "grad_norm": 1.84375, + "grad_norm_var": 0.011348215738932292, + "learning_rate": 0.0001, + "loss": 4.017, + "loss/crossentropy": 1.9202881455421448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18147829920053482, + "step": 28532 + }, + { + "epoch": 0.57068, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009733072916666667, + "learning_rate": 0.0001, + "loss": 3.7967, + "loss/crossentropy": 1.5407178401947021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14478188008069992, + "step": 28534 + }, + { + "epoch": 0.57072, + "grad_norm": 2.09375, + "grad_norm_var": 0.010920206705729166, + "learning_rate": 0.0001, + "loss": 3.8766, + "loss/crossentropy": 1.836295247077942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18802417069673538, + "step": 28536 + }, + { + "epoch": 0.57076, + "grad_norm": 1.9609375, + "grad_norm_var": 0.013553619384765625, + "learning_rate": 0.0001, + "loss": 4.1791, + "loss/crossentropy": 2.043636739253998, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19008664041757584, + "step": 28538 + }, + { + "epoch": 0.5708, + "grad_norm": 2.078125, + "grad_norm_var": 0.014745076497395834, + "learning_rate": 0.0001, + "loss": 4.1531, + "loss/crossentropy": 1.8939302563667297, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18854600936174393, + "step": 28540 + }, + { + "epoch": 0.57084, + "grad_norm": 1.953125, + "grad_norm_var": 0.012658437093098959, + "learning_rate": 0.0001, + "loss": 4.0664, + "loss/crossentropy": 2.1033613085746765, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17389168590307236, + "step": 28542 + }, + { + "epoch": 0.57088, + "grad_norm": 2.0625, + "grad_norm_var": 0.013230133056640624, + "learning_rate": 0.0001, + "loss": 4.1976, + "loss/crossentropy": 2.245096266269684, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21308264881372452, + "step": 28544 + }, + { + "epoch": 0.57092, + "grad_norm": 1.796875, + "grad_norm_var": 0.012910715738932292, + "learning_rate": 0.0001, + "loss": 3.8638, + "loss/crossentropy": 1.6646681427955627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16192668676376343, + "step": 28546 + }, + { + "epoch": 0.57096, + "grad_norm": 1.78125, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 3.8962, + "loss/crossentropy": 2.29884135723114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17613109946250916, + "step": 28548 + }, + { + "epoch": 0.571, + "grad_norm": 1.890625, + "grad_norm_var": 0.01628392537434896, + "learning_rate": 0.0001, + "loss": 4.0125, + "loss/crossentropy": 2.10237193107605, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935042291879654, + "step": 28550 + }, + { + "epoch": 0.57104, + "grad_norm": 1.9921875, + "grad_norm_var": 0.015048980712890625, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 2.1097005009651184, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907399743795395, + "step": 28552 + }, + { + "epoch": 0.57108, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012824503580729167, + "learning_rate": 0.0001, + "loss": 4.155, + "loss/crossentropy": 2.1058011054992676, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19711777567863464, + "step": 28554 + }, + { + "epoch": 0.57112, + "grad_norm": 1.9375, + "grad_norm_var": 0.011659495035807292, + "learning_rate": 0.0001, + "loss": 3.9603, + "loss/crossentropy": 2.0466136932373047, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18761292099952698, + "step": 28556 + }, + { + "epoch": 0.57116, + "grad_norm": 1.96875, + "grad_norm_var": 0.011759440104166666, + "learning_rate": 0.0001, + "loss": 3.6357, + "loss/crossentropy": 1.6006923913955688, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17703057825565338, + "step": 28558 + }, + { + "epoch": 0.5712, + "grad_norm": 2.015625, + "grad_norm_var": 0.011183420817057291, + "learning_rate": 0.0001, + "loss": 3.8007, + "loss/crossentropy": 1.884728193283081, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18712394684553146, + "step": 28560 + }, + { + "epoch": 0.57124, + "grad_norm": 1.953125, + "grad_norm_var": 0.009718577067057291, + "learning_rate": 0.0001, + "loss": 3.8565, + "loss/crossentropy": 1.8759589791297913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19130942225456238, + "step": 28562 + }, + { + "epoch": 0.57128, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009785715738932292, + "learning_rate": 0.0001, + "loss": 4.1392, + "loss/crossentropy": 1.9973544478416443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18966401368379593, + "step": 28564 + }, + { + "epoch": 0.57132, + "grad_norm": 1.75, + "grad_norm_var": 0.007368723551432292, + "learning_rate": 0.0001, + "loss": 3.9515, + "loss/crossentropy": 2.1246695518493652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1938970983028412, + "step": 28566 + }, + { + "epoch": 0.57136, + "grad_norm": 1.9375, + "grad_norm_var": 0.0065093994140625, + "learning_rate": 0.0001, + "loss": 3.7797, + "loss/crossentropy": 1.5528571605682373, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15729191899299622, + "step": 28568 + }, + { + "epoch": 0.5714, + "grad_norm": 1.984375, + "grad_norm_var": 0.0068023681640625, + "learning_rate": 0.0001, + "loss": 3.995, + "loss/crossentropy": 2.0078552961349487, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19063813984394073, + "step": 28570 + }, + { + "epoch": 0.57144, + "grad_norm": 1.890625, + "grad_norm_var": 0.0072021484375, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 1.8582743406295776, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16606975346803665, + "step": 28572 + }, + { + "epoch": 0.57148, + "grad_norm": 1.8125, + "grad_norm_var": 0.008784739176432292, + "learning_rate": 0.0001, + "loss": 3.7379, + "loss/crossentropy": 2.1142138242721558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289004802703857, + "step": 28574 + }, + { + "epoch": 0.57152, + "grad_norm": 1.859375, + "grad_norm_var": 0.008147939046223959, + "learning_rate": 0.0001, + "loss": 3.9771, + "loss/crossentropy": 2.0852617621421814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18875133991241455, + "step": 28576 + }, + { + "epoch": 0.57156, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007541656494140625, + "learning_rate": 0.0001, + "loss": 4.0243, + "loss/crossentropy": 2.1352654695510864, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19411734491586685, + "step": 28578 + }, + { + "epoch": 0.5716, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0069048563639322914, + "learning_rate": 0.0001, + "loss": 3.5985, + "loss/crossentropy": 1.7001954317092896, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17129848897457123, + "step": 28580 + }, + { + "epoch": 0.57164, + "grad_norm": 1.8515625, + "grad_norm_var": 0.004894765218098959, + "learning_rate": 0.0001, + "loss": 3.7744, + "loss/crossentropy": 1.8788430094718933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17039231956005096, + "step": 28582 + }, + { + "epoch": 0.57168, + "grad_norm": 1.9765625, + "grad_norm_var": 0.005307769775390625, + "learning_rate": 0.0001, + "loss": 4.0533, + "loss/crossentropy": 1.9935740232467651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18447662144899368, + "step": 28584 + }, + { + "epoch": 0.57172, + "grad_norm": 1.890625, + "grad_norm_var": 0.0067779541015625, + "learning_rate": 0.0001, + "loss": 3.652, + "loss/crossentropy": 1.6966716051101685, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17215370386838913, + "step": 28586 + }, + { + "epoch": 0.57176, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006841786702473958, + "learning_rate": 0.0001, + "loss": 3.6792, + "loss/crossentropy": 1.8064610958099365, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18994714319705963, + "step": 28588 + }, + { + "epoch": 0.5718, + "grad_norm": 1.7890625, + "grad_norm_var": 0.006956990559895833, + "learning_rate": 0.0001, + "loss": 3.8408, + "loss/crossentropy": 1.8688351511955261, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17651986330747604, + "step": 28590 + }, + { + "epoch": 0.57184, + "grad_norm": 2.09375, + "grad_norm_var": 0.023482259114583334, + "learning_rate": 0.0001, + "loss": 3.9822, + "loss/crossentropy": 1.9788597226142883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21657729148864746, + "step": 28592 + }, + { + "epoch": 0.57188, + "grad_norm": 2.125, + "grad_norm_var": 0.02662938435872396, + "learning_rate": 0.0001, + "loss": 4.2794, + "loss/crossentropy": 2.2449204325675964, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20486489683389664, + "step": 28594 + }, + { + "epoch": 0.57192, + "grad_norm": 1.875, + "grad_norm_var": 0.02296727498372396, + "learning_rate": 0.0001, + "loss": 3.9333, + "loss/crossentropy": 1.9045695662498474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19815212488174438, + "step": 28596 + }, + { + "epoch": 0.57196, + "grad_norm": 1.8125, + "grad_norm_var": 0.025742340087890624, + "learning_rate": 0.0001, + "loss": 3.7332, + "loss/crossentropy": 1.6905238628387451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1561586558818817, + "step": 28598 + }, + { + "epoch": 0.572, + "grad_norm": 2.1875, + "grad_norm_var": 0.03126220703125, + "learning_rate": 0.0001, + "loss": 3.7146, + "loss/crossentropy": 1.788037657737732, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17239058762788773, + "step": 28600 + }, + { + "epoch": 0.57204, + "grad_norm": 2.046875, + "grad_norm_var": 0.02777684529622396, + "learning_rate": 0.0001, + "loss": 4.0529, + "loss/crossentropy": 1.9274045825004578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20560631155967712, + "step": 28602 + }, + { + "epoch": 0.57208, + "grad_norm": 1.7734375, + "grad_norm_var": 0.028820546468098958, + "learning_rate": 0.0001, + "loss": 3.8056, + "loss/crossentropy": 1.909210741519928, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17594803869724274, + "step": 28604 + }, + { + "epoch": 0.57212, + "grad_norm": 2.125, + "grad_norm_var": 0.028362782796223958, + "learning_rate": 0.0001, + "loss": 3.9912, + "loss/crossentropy": 2.0221698880195618, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1869770884513855, + "step": 28606 + }, + { + "epoch": 0.57216, + "grad_norm": 1.9375, + "grad_norm_var": 0.01813329060872396, + "learning_rate": 0.0001, + "loss": 4.184, + "loss/crossentropy": 2.235592007637024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17893031239509583, + "step": 28608 + }, + { + "epoch": 0.5722, + "grad_norm": 1.828125, + "grad_norm_var": 0.028303019205729165, + "learning_rate": 0.0001, + "loss": 4.0714, + "loss/crossentropy": 2.098126530647278, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2068413719534874, + "step": 28610 + }, + { + "epoch": 0.57224, + "grad_norm": 1.75, + "grad_norm_var": 0.03432184855143229, + "learning_rate": 0.0001, + "loss": 3.7392, + "loss/crossentropy": 1.97664475440979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18118216842412949, + "step": 28612 + }, + { + "epoch": 0.57228, + "grad_norm": 2.078125, + "grad_norm_var": 0.03205540974934896, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 1.9696056246757507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18983419984579086, + "step": 28614 + }, + { + "epoch": 0.57232, + "grad_norm": 1.96875, + "grad_norm_var": 0.0686767578125, + "learning_rate": 0.0001, + "loss": 4.0106, + "loss/crossentropy": 2.1281611919403076, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22855115681886673, + "step": 28616 + }, + { + "epoch": 0.57236, + "grad_norm": 1.796875, + "grad_norm_var": 0.07779032389322917, + "learning_rate": 0.0001, + "loss": 3.987, + "loss/crossentropy": 1.7333523631095886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1650746390223503, + "step": 28618 + }, + { + "epoch": 0.5724, + "grad_norm": 2.71875, + "grad_norm_var": 0.10811258951822916, + "learning_rate": 0.0001, + "loss": 4.2372, + "loss/crossentropy": 2.021788716316223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19636931270360947, + "step": 28620 + }, + { + "epoch": 0.57244, + "grad_norm": 1.9921875, + "grad_norm_var": 0.1073394775390625, + "learning_rate": 0.0001, + "loss": 3.8144, + "loss/crossentropy": 1.7764569520950317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557444959878922, + "step": 28622 + }, + { + "epoch": 0.57248, + "grad_norm": 1.8515625, + "grad_norm_var": 0.10955301920572917, + "learning_rate": 0.0001, + "loss": 3.8214, + "loss/crossentropy": 1.8160730004310608, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1741533875465393, + "step": 28624 + }, + { + "epoch": 0.57252, + "grad_norm": 1.84375, + "grad_norm_var": 0.1038225809733073, + "learning_rate": 0.0001, + "loss": 3.7493, + "loss/crossentropy": 1.7295884490013123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17627893388271332, + "step": 28626 + }, + { + "epoch": 0.57256, + "grad_norm": 1.9140625, + "grad_norm_var": 0.08949381510416667, + "learning_rate": 0.0001, + "loss": 3.9968, + "loss/crossentropy": 1.9967339038848877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1878109648823738, + "step": 28628 + }, + { + "epoch": 0.5726, + "grad_norm": 1.8984375, + "grad_norm_var": 0.09168294270833334, + "learning_rate": 0.0001, + "loss": 3.9292, + "loss/crossentropy": 2.4338849782943726, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2211342379450798, + "step": 28630 + }, + { + "epoch": 0.57264, + "grad_norm": 1.921875, + "grad_norm_var": 0.057566070556640626, + "learning_rate": 0.0001, + "loss": 3.928, + "loss/crossentropy": 2.0511388778686523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2051515057682991, + "step": 28632 + }, + { + "epoch": 0.57268, + "grad_norm": 1.84375, + "grad_norm_var": 0.05156021118164063, + "learning_rate": 0.0001, + "loss": 3.5661, + "loss/crossentropy": 1.7425475716590881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18100523948669434, + "step": 28634 + }, + { + "epoch": 0.57272, + "grad_norm": 2.03125, + "grad_norm_var": 0.004343414306640625, + "learning_rate": 0.0001, + "loss": 4.2029, + "loss/crossentropy": 2.0718079805374146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951262131333351, + "step": 28636 + }, + { + "epoch": 0.57276, + "grad_norm": 2.015625, + "grad_norm_var": 0.004514312744140625, + "learning_rate": 0.0001, + "loss": 4.0589, + "loss/crossentropy": 1.9455790519714355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024966925382614, + "step": 28638 + }, + { + "epoch": 0.5728, + "grad_norm": 1.8515625, + "grad_norm_var": 0.006075032552083333, + "learning_rate": 0.0001, + "loss": 3.8238, + "loss/crossentropy": 1.9923955798149109, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17989075928926468, + "step": 28640 + }, + { + "epoch": 0.57284, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005708567301432292, + "learning_rate": 0.0001, + "loss": 4.0637, + "loss/crossentropy": 2.178214907646179, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1973334327340126, + "step": 28642 + }, + { + "epoch": 0.57288, + "grad_norm": 1.9375, + "grad_norm_var": 0.004349517822265625, + "learning_rate": 0.0001, + "loss": 3.9935, + "loss/crossentropy": 1.881864607334137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17736243456602097, + "step": 28644 + }, + { + "epoch": 0.57292, + "grad_norm": 1.859375, + "grad_norm_var": 0.004988352457682292, + "learning_rate": 0.0001, + "loss": 4.2611, + "loss/crossentropy": 2.140734553337097, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2056664153933525, + "step": 28646 + }, + { + "epoch": 0.57296, + "grad_norm": 1.921875, + "grad_norm_var": 0.0049479166666666664, + "learning_rate": 0.0001, + "loss": 4.1077, + "loss/crossentropy": 2.2030951976776123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997833326458931, + "step": 28648 + }, + { + "epoch": 0.573, + "grad_norm": 1.984375, + "grad_norm_var": 0.0045888264973958336, + "learning_rate": 0.0001, + "loss": 4.1237, + "loss/crossentropy": 1.8152233958244324, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17497697472572327, + "step": 28650 + }, + { + "epoch": 0.57304, + "grad_norm": 1.9609375, + "grad_norm_var": 0.008056386311848959, + "learning_rate": 0.0001, + "loss": 4.4633, + "loss/crossentropy": 2.2969506978988647, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22079680860042572, + "step": 28652 + }, + { + "epoch": 0.57308, + "grad_norm": 1.875, + "grad_norm_var": 0.010550689697265626, + "learning_rate": 0.0001, + "loss": 4.0007, + "loss/crossentropy": 2.058579981327057, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1992625817656517, + "step": 28654 + }, + { + "epoch": 0.57312, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0073964436848958336, + "learning_rate": 0.0001, + "loss": 3.9863, + "loss/crossentropy": 1.9630435109138489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19192881137132645, + "step": 28656 + }, + { + "epoch": 0.57316, + "grad_norm": 1.953125, + "grad_norm_var": 0.008408355712890624, + "learning_rate": 0.0001, + "loss": 3.9874, + "loss/crossentropy": 2.089448630809784, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.185981385409832, + "step": 28658 + }, + { + "epoch": 0.5732, + "grad_norm": 1.890625, + "grad_norm_var": 0.0088287353515625, + "learning_rate": 0.0001, + "loss": 3.8705, + "loss/crossentropy": 2.1264119148254395, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18827120959758759, + "step": 28660 + }, + { + "epoch": 0.57324, + "grad_norm": 1.7734375, + "grad_norm_var": 0.0121978759765625, + "learning_rate": 0.0001, + "loss": 3.8047, + "loss/crossentropy": 2.121288537979126, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772882044315338, + "step": 28662 + }, + { + "epoch": 0.57328, + "grad_norm": 2.484375, + "grad_norm_var": 0.03190689086914063, + "learning_rate": 0.0001, + "loss": 4.1513, + "loss/crossentropy": 2.0339037775993347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1962498426437378, + "step": 28664 + }, + { + "epoch": 0.57332, + "grad_norm": 1.828125, + "grad_norm_var": 0.03292210896809896, + "learning_rate": 0.0001, + "loss": 4.0585, + "loss/crossentropy": 2.1422963738441467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18338894098997116, + "step": 28666 + }, + { + "epoch": 0.57336, + "grad_norm": 2.09375, + "grad_norm_var": 0.030590565999348958, + "learning_rate": 0.0001, + "loss": 4.2466, + "loss/crossentropy": 2.029167354106903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20171435922384262, + "step": 28668 + }, + { + "epoch": 0.5734, + "grad_norm": 2.203125, + "grad_norm_var": 0.032225545247395834, + "learning_rate": 0.0001, + "loss": 3.918, + "loss/crossentropy": 1.7628564238548279, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19051892310380936, + "step": 28670 + }, + { + "epoch": 0.57344, + "grad_norm": 1.734375, + "grad_norm_var": 0.03588053385416667, + "learning_rate": 0.0001, + "loss": 3.7368, + "loss/crossentropy": 2.0292606949806213, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17968103289604187, + "step": 28672 + }, + { + "epoch": 0.57348, + "grad_norm": 2.125, + "grad_norm_var": 0.044130198160807294, + "learning_rate": 0.0001, + "loss": 4.2414, + "loss/crossentropy": 1.9167731404304504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22261708974838257, + "step": 28674 + }, + { + "epoch": 0.57352, + "grad_norm": 2.03125, + "grad_norm_var": 0.04302164713541667, + "learning_rate": 0.0001, + "loss": 3.9668, + "loss/crossentropy": 1.9671185612678528, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19196172803640366, + "step": 28676 + }, + { + "epoch": 0.57356, + "grad_norm": 1.8671875, + "grad_norm_var": 0.037666575113932295, + "learning_rate": 0.0001, + "loss": 3.8129, + "loss/crossentropy": 2.0191508531570435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18491507321596146, + "step": 28678 + }, + { + "epoch": 0.5736, + "grad_norm": 1.859375, + "grad_norm_var": 0.023209635416666666, + "learning_rate": 0.0001, + "loss": 4.0662, + "loss/crossentropy": 1.9060558676719666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18243618309497833, + "step": 28680 + }, + { + "epoch": 0.57364, + "grad_norm": 2.03125, + "grad_norm_var": 0.0218658447265625, + "learning_rate": 0.0001, + "loss": 4.0068, + "loss/crossentropy": 1.867979109287262, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23452743887901306, + "step": 28682 + }, + { + "epoch": 0.57368, + "grad_norm": 1.953125, + "grad_norm_var": 0.021354166666666667, + "learning_rate": 0.0001, + "loss": 4.079, + "loss/crossentropy": 1.6199312806129456, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16022612154483795, + "step": 28684 + }, + { + "epoch": 0.57372, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0176422119140625, + "learning_rate": 0.0001, + "loss": 4.191, + "loss/crossentropy": 2.5148130655288696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2360077053308487, + "step": 28686 + }, + { + "epoch": 0.57376, + "grad_norm": 2.015625, + "grad_norm_var": 0.015126291910807292, + "learning_rate": 0.0001, + "loss": 4.032, + "loss/crossentropy": 1.9117471575737, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2010490596294403, + "step": 28688 + }, + { + "epoch": 0.5738, + "grad_norm": 1.796875, + "grad_norm_var": 0.006821441650390625, + "learning_rate": 0.0001, + "loss": 4.0699, + "loss/crossentropy": 2.079534113407135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18718824535608292, + "step": 28690 + }, + { + "epoch": 0.57384, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00633544921875, + "learning_rate": 0.0001, + "loss": 3.97, + "loss/crossentropy": 2.187251031398773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929212063550949, + "step": 28692 + }, + { + "epoch": 0.57388, + "grad_norm": 1.90625, + "grad_norm_var": 0.006023915608723959, + "learning_rate": 0.0001, + "loss": 4.0509, + "loss/crossentropy": 1.9140017628669739, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1726428121328354, + "step": 28694 + }, + { + "epoch": 0.57392, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0047515869140625, + "learning_rate": 0.0001, + "loss": 3.9645, + "loss/crossentropy": 1.530074417591095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15649892389774323, + "step": 28696 + }, + { + "epoch": 0.57396, + "grad_norm": 2.171875, + "grad_norm_var": 0.006998443603515625, + "learning_rate": 0.0001, + "loss": 4.1736, + "loss/crossentropy": 2.2588948011398315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20315733551979065, + "step": 28698 + }, + { + "epoch": 0.574, + "grad_norm": 2.140625, + "grad_norm_var": 0.0094390869140625, + "learning_rate": 0.0001, + "loss": 4.0446, + "loss/crossentropy": 2.1702714562416077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24278688430786133, + "step": 28700 + }, + { + "epoch": 0.57404, + "grad_norm": 2.046875, + "grad_norm_var": 0.009789021809895833, + "learning_rate": 0.0001, + "loss": 3.9904, + "loss/crossentropy": 2.1350772380828857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21515338122844696, + "step": 28702 + }, + { + "epoch": 0.57408, + "grad_norm": 1.953125, + "grad_norm_var": 0.0086822509765625, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 1.9809751510620117, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18557599931955338, + "step": 28704 + }, + { + "epoch": 0.57412, + "grad_norm": 1.9921875, + "grad_norm_var": 0.006658681233723958, + "learning_rate": 0.0001, + "loss": 3.9628, + "loss/crossentropy": 1.7658061981201172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16570701450109482, + "step": 28706 + }, + { + "epoch": 0.57416, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 3.8666, + "loss/crossentropy": 1.787250578403473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18032344430685043, + "step": 28708 + }, + { + "epoch": 0.5742, + "grad_norm": 1.84375, + "grad_norm_var": 0.011082967122395834, + "learning_rate": 0.0001, + "loss": 4.1878, + "loss/crossentropy": 2.361757516860962, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135041281580925, + "step": 28710 + }, + { + "epoch": 0.57424, + "grad_norm": 1.984375, + "grad_norm_var": 0.010921223958333334, + "learning_rate": 0.0001, + "loss": 4.1266, + "loss/crossentropy": 2.0687750577926636, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20218269526958466, + "step": 28712 + }, + { + "epoch": 0.57428, + "grad_norm": 2.0625, + "grad_norm_var": 0.010587310791015625, + "learning_rate": 0.0001, + "loss": 3.8437, + "loss/crossentropy": 1.816366970539093, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1572190448641777, + "step": 28714 + }, + { + "epoch": 0.57432, + "grad_norm": 1.7265625, + "grad_norm_var": 0.010138956705729167, + "learning_rate": 0.0001, + "loss": 3.8273, + "loss/crossentropy": 2.2949939966201782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18894164264202118, + "step": 28716 + }, + { + "epoch": 0.57436, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008680979410807291, + "learning_rate": 0.0001, + "loss": 3.9153, + "loss/crossentropy": 2.249353289604187, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21117348968982697, + "step": 28718 + }, + { + "epoch": 0.5744, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008902740478515626, + "learning_rate": 0.0001, + "loss": 4.2787, + "loss/crossentropy": 2.3134714365005493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21170897036790848, + "step": 28720 + }, + { + "epoch": 0.57444, + "grad_norm": 2.046875, + "grad_norm_var": 0.009748331705729167, + "learning_rate": 0.0001, + "loss": 3.8791, + "loss/crossentropy": 2.241877317428589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200472578406334, + "step": 28722 + }, + { + "epoch": 0.57448, + "grad_norm": 1.96875, + "grad_norm_var": 0.007844034830729167, + "learning_rate": 0.0001, + "loss": 4.2115, + "loss/crossentropy": 2.39408540725708, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025068998336792, + "step": 28724 + }, + { + "epoch": 0.57452, + "grad_norm": 1.828125, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 3.9963, + "loss/crossentropy": 1.9735210537910461, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17944949120283127, + "step": 28726 + }, + { + "epoch": 0.57456, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009040323893229167, + "learning_rate": 0.0001, + "loss": 3.9401, + "loss/crossentropy": 2.028856635093689, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19235961139202118, + "step": 28728 + }, + { + "epoch": 0.5746, + "grad_norm": 1.9609375, + "grad_norm_var": 0.006992340087890625, + "learning_rate": 0.0001, + "loss": 4.3698, + "loss/crossentropy": 2.18750536441803, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18273723125457764, + "step": 28730 + }, + { + "epoch": 0.57464, + "grad_norm": 1.9765625, + "grad_norm_var": 0.004378255208333333, + "learning_rate": 0.0001, + "loss": 4.0532, + "loss/crossentropy": 2.127722382545471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2066717892885208, + "step": 28732 + }, + { + "epoch": 0.57468, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004255930582682292, + "learning_rate": 0.0001, + "loss": 3.8228, + "loss/crossentropy": 2.1561193466186523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151438027620316, + "step": 28734 + }, + { + "epoch": 0.57472, + "grad_norm": 1.890625, + "grad_norm_var": 0.004463704427083334, + "learning_rate": 0.0001, + "loss": 3.9893, + "loss/crossentropy": 2.3356776237487793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20846813172101974, + "step": 28736 + }, + { + "epoch": 0.57476, + "grad_norm": 1.828125, + "grad_norm_var": 0.005182902018229167, + "learning_rate": 0.0001, + "loss": 4.14, + "loss/crossentropy": 2.024761378765106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19134287536144257, + "step": 28738 + }, + { + "epoch": 0.5748, + "grad_norm": 1.890625, + "grad_norm_var": 0.006060536702473958, + "learning_rate": 0.0001, + "loss": 3.8726, + "loss/crossentropy": 2.242386519908905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20453084260225296, + "step": 28740 + }, + { + "epoch": 0.57484, + "grad_norm": 1.90625, + "grad_norm_var": 0.005472819010416667, + "learning_rate": 0.0001, + "loss": 3.9335, + "loss/crossentropy": 1.7191591262817383, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17990755289793015, + "step": 28742 + }, + { + "epoch": 0.57488, + "grad_norm": 1.9453125, + "grad_norm_var": 0.004571278889973958, + "learning_rate": 0.0001, + "loss": 3.7335, + "loss/crossentropy": 2.1018252968788147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20789441466331482, + "step": 28744 + }, + { + "epoch": 0.57492, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0042803446451822914, + "learning_rate": 0.0001, + "loss": 3.8906, + "loss/crossentropy": 1.9056601524353027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18688670545816422, + "step": 28746 + }, + { + "epoch": 0.57496, + "grad_norm": 1.8984375, + "grad_norm_var": 0.00360107421875, + "learning_rate": 0.0001, + "loss": 3.8521, + "loss/crossentropy": 2.0531609058380127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1931629702448845, + "step": 28748 + }, + { + "epoch": 0.575, + "grad_norm": 1.9140625, + "grad_norm_var": 0.003684234619140625, + "learning_rate": 0.0001, + "loss": 3.9218, + "loss/crossentropy": 1.89992356300354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1808946281671524, + "step": 28750 + }, + { + "epoch": 0.57504, + "grad_norm": 1.9375, + "grad_norm_var": 0.003739166259765625, + "learning_rate": 0.0001, + "loss": 3.8245, + "loss/crossentropy": 2.2009544372558594, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20267173647880554, + "step": 28752 + }, + { + "epoch": 0.57508, + "grad_norm": 1.9375, + "grad_norm_var": 0.0043108622233072914, + "learning_rate": 0.0001, + "loss": 4.2622, + "loss/crossentropy": 2.009132504463196, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20399124920368195, + "step": 28754 + }, + { + "epoch": 0.57512, + "grad_norm": 1.8046875, + "grad_norm_var": 0.004365793863932292, + "learning_rate": 0.0001, + "loss": 3.7545, + "loss/crossentropy": 1.9875890612602234, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20523500442504883, + "step": 28756 + }, + { + "epoch": 0.57516, + "grad_norm": 5.40625, + "grad_norm_var": 0.7703570048014323, + "learning_rate": 0.0001, + "loss": 4.0484, + "loss/crossentropy": 1.8446847200393677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16559410095214844, + "step": 28758 + }, + { + "epoch": 0.5752, + "grad_norm": 2.140625, + "grad_norm_var": 0.7632118225097656, + "learning_rate": 0.0001, + "loss": 4.1006, + "loss/crossentropy": 2.147071599960327, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19807270914316177, + "step": 28760 + }, + { + "epoch": 0.57524, + "grad_norm": 1.96875, + "grad_norm_var": 0.760791015625, + "learning_rate": 0.0001, + "loss": 4.1881, + "loss/crossentropy": 2.046617865562439, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18792100995779037, + "step": 28762 + }, + { + "epoch": 0.57528, + "grad_norm": 2.0, + "grad_norm_var": 0.7528297424316406, + "learning_rate": 0.0001, + "loss": 4.0231, + "loss/crossentropy": 1.6951394081115723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1635662019252777, + "step": 28764 + }, + { + "epoch": 0.57532, + "grad_norm": 2.046875, + "grad_norm_var": 0.7440752665201823, + "learning_rate": 0.0001, + "loss": 3.8999, + "loss/crossentropy": 1.7923210263252258, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16349300742149353, + "step": 28766 + }, + { + "epoch": 0.57536, + "grad_norm": 1.953125, + "grad_norm_var": 0.7344011942545573, + "learning_rate": 0.0001, + "loss": 4.2475, + "loss/crossentropy": 2.0918190479278564, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20321150124073029, + "step": 28768 + }, + { + "epoch": 0.5754, + "grad_norm": 1.9375, + "grad_norm_var": 0.7387451171875, + "learning_rate": 0.0001, + "loss": 4.1483, + "loss/crossentropy": 1.691568374633789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18507587909698486, + "step": 28770 + }, + { + "epoch": 0.57544, + "grad_norm": 1.921875, + "grad_norm_var": 0.7320757548014323, + "learning_rate": 0.0001, + "loss": 4.0857, + "loss/crossentropy": 2.019860029220581, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19651245325803757, + "step": 28772 + }, + { + "epoch": 0.57548, + "grad_norm": 2.015625, + "grad_norm_var": 0.01715672810872396, + "learning_rate": 0.0001, + "loss": 4.0712, + "loss/crossentropy": 1.9527531862258911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19377586245536804, + "step": 28774 + }, + { + "epoch": 0.57552, + "grad_norm": 2.0625, + "grad_norm_var": 0.019486236572265624, + "learning_rate": 0.0001, + "loss": 4.0374, + "loss/crossentropy": 2.105517864227295, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1907084435224533, + "step": 28776 + }, + { + "epoch": 0.57556, + "grad_norm": 1.7890625, + "grad_norm_var": 0.014277903238932292, + "learning_rate": 0.0001, + "loss": 3.88, + "loss/crossentropy": 1.804184377193451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17274542152881622, + "step": 28778 + }, + { + "epoch": 0.5756, + "grad_norm": 1.96875, + "grad_norm_var": 0.014102935791015625, + "learning_rate": 0.0001, + "loss": 4.0615, + "loss/crossentropy": 2.3283534049987793, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21980226039886475, + "step": 28780 + }, + { + "epoch": 0.57564, + "grad_norm": 1.9375, + "grad_norm_var": 0.010727691650390624, + "learning_rate": 0.0001, + "loss": 3.8824, + "loss/crossentropy": 2.054227828979492, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1968243345618248, + "step": 28782 + }, + { + "epoch": 0.57568, + "grad_norm": 1.953125, + "grad_norm_var": 0.007614898681640625, + "learning_rate": 0.0001, + "loss": 3.9686, + "loss/crossentropy": 2.2226120233535767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20916803926229477, + "step": 28784 + }, + { + "epoch": 0.57572, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007905832926432292, + "learning_rate": 0.0001, + "loss": 4.007, + "loss/crossentropy": 1.9371869564056396, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21197544038295746, + "step": 28786 + }, + { + "epoch": 0.57576, + "grad_norm": 2.21875, + "grad_norm_var": 0.017122395833333335, + "learning_rate": 0.0001, + "loss": 3.6572, + "loss/crossentropy": 1.6595491766929626, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1565944254398346, + "step": 28788 + }, + { + "epoch": 0.5758, + "grad_norm": 1.90625, + "grad_norm_var": 0.015276845296223958, + "learning_rate": 0.0001, + "loss": 3.9956, + "loss/crossentropy": 1.7383070588111877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1768334060907364, + "step": 28790 + }, + { + "epoch": 0.57584, + "grad_norm": 1.859375, + "grad_norm_var": 0.015508778889973958, + "learning_rate": 0.0001, + "loss": 4.0105, + "loss/crossentropy": 1.9087260365486145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958516761660576, + "step": 28792 + }, + { + "epoch": 0.57588, + "grad_norm": 2.03125, + "grad_norm_var": 0.014460245768229166, + "learning_rate": 0.0001, + "loss": 3.9552, + "loss/crossentropy": 2.1638959646224976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20037583261728287, + "step": 28794 + }, + { + "epoch": 0.57592, + "grad_norm": 2.09375, + "grad_norm_var": 0.015132649739583334, + "learning_rate": 0.0001, + "loss": 4.1782, + "loss/crossentropy": 2.067569851875305, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20235508680343628, + "step": 28796 + }, + { + "epoch": 0.57596, + "grad_norm": 1.9296875, + "grad_norm_var": 0.014891560872395833, + "learning_rate": 0.0001, + "loss": 4.0691, + "loss/crossentropy": 2.377517819404602, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21966589242219925, + "step": 28798 + }, + { + "epoch": 0.576, + "grad_norm": 1.8046875, + "grad_norm_var": 0.016556549072265624, + "learning_rate": 0.0001, + "loss": 3.8553, + "loss/crossentropy": 1.8835238814353943, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17692726850509644, + "step": 28800 + }, + { + "epoch": 0.57604, + "grad_norm": 1.84375, + "grad_norm_var": 0.017093912760416666, + "learning_rate": 0.0001, + "loss": 4.0865, + "loss/crossentropy": 2.177576720714569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855052188038826, + "step": 28802 + }, + { + "epoch": 0.57608, + "grad_norm": 2.03125, + "grad_norm_var": 0.007881673177083333, + "learning_rate": 0.0001, + "loss": 4.09, + "loss/crossentropy": 2.2061915397644043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2123064547777176, + "step": 28804 + }, + { + "epoch": 0.57612, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010692342122395834, + "learning_rate": 0.0001, + "loss": 3.9471, + "loss/crossentropy": 1.9637818932533264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20487510412931442, + "step": 28806 + }, + { + "epoch": 0.57616, + "grad_norm": 2.140625, + "grad_norm_var": 0.014399973551432292, + "learning_rate": 0.0001, + "loss": 4.0394, + "loss/crossentropy": 2.1481754183769226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974523589015007, + "step": 28808 + }, + { + "epoch": 0.5762, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0139312744140625, + "learning_rate": 0.0001, + "loss": 3.9125, + "loss/crossentropy": 1.907672941684723, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18901971727609634, + "step": 28810 + }, + { + "epoch": 0.57624, + "grad_norm": 2.0, + "grad_norm_var": 0.0115875244140625, + "learning_rate": 0.0001, + "loss": 4.0425, + "loss/crossentropy": 2.2241225838661194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1882694587111473, + "step": 28812 + }, + { + "epoch": 0.57628, + "grad_norm": 1.828125, + "grad_norm_var": 0.01248779296875, + "learning_rate": 0.0001, + "loss": 4.1874, + "loss/crossentropy": 2.150957465171814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2233213186264038, + "step": 28814 + }, + { + "epoch": 0.57632, + "grad_norm": 1.953125, + "grad_norm_var": 0.011494954427083334, + "learning_rate": 0.0001, + "loss": 4.0973, + "loss/crossentropy": 1.9427680373191833, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19366063922643661, + "step": 28816 + }, + { + "epoch": 0.57636, + "grad_norm": 1.9140625, + "grad_norm_var": 0.012027740478515625, + "learning_rate": 0.0001, + "loss": 3.7749, + "loss/crossentropy": 1.7898414731025696, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18277223408222198, + "step": 28818 + }, + { + "epoch": 0.5764, + "grad_norm": 2.21875, + "grad_norm_var": 0.03846435546875, + "learning_rate": 0.0001, + "loss": 4.0501, + "loss/crossentropy": 2.0117841362953186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17678485065698624, + "step": 28820 + }, + { + "epoch": 0.57644, + "grad_norm": 1.796875, + "grad_norm_var": 0.03804423014322917, + "learning_rate": 0.0001, + "loss": 3.749, + "loss/crossentropy": 2.0270848274230957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17247303575277328, + "step": 28822 + }, + { + "epoch": 0.57648, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0336822509765625, + "learning_rate": 0.0001, + "loss": 3.9248, + "loss/crossentropy": 1.917984962463379, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722751259803772, + "step": 28824 + }, + { + "epoch": 0.57652, + "grad_norm": 2.03125, + "grad_norm_var": 0.03392512003580729, + "learning_rate": 0.0001, + "loss": 4.1579, + "loss/crossentropy": 1.8189431428909302, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18306910246610641, + "step": 28826 + }, + { + "epoch": 0.57656, + "grad_norm": 1.8125, + "grad_norm_var": 0.03524144490559896, + "learning_rate": 0.0001, + "loss": 4.0638, + "loss/crossentropy": 1.8207548260688782, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1782800853252411, + "step": 28828 + }, + { + "epoch": 0.5766, + "grad_norm": 1.703125, + "grad_norm_var": 0.038266754150390624, + "learning_rate": 0.0001, + "loss": 4.0089, + "loss/crossentropy": 2.1244252920150757, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18752482533454895, + "step": 28830 + }, + { + "epoch": 0.57664, + "grad_norm": 1.890625, + "grad_norm_var": 0.038426717122395836, + "learning_rate": 0.0001, + "loss": 4.1127, + "loss/crossentropy": 2.435193181037903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23042550683021545, + "step": 28832 + }, + { + "epoch": 0.57668, + "grad_norm": 1.828125, + "grad_norm_var": 0.03916803995768229, + "learning_rate": 0.0001, + "loss": 3.6812, + "loss/crossentropy": 2.1272794008255005, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19795158505439758, + "step": 28834 + }, + { + "epoch": 0.57672, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009814453125, + "learning_rate": 0.0001, + "loss": 4.08, + "loss/crossentropy": 1.7166744470596313, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17582352459430695, + "step": 28836 + }, + { + "epoch": 0.57676, + "grad_norm": 1.7421875, + "grad_norm_var": 0.011248524983723958, + "learning_rate": 0.0001, + "loss": 3.85, + "loss/crossentropy": 2.066642463207245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19613336026668549, + "step": 28838 + }, + { + "epoch": 0.5768, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0114166259765625, + "learning_rate": 0.0001, + "loss": 3.8214, + "loss/crossentropy": 1.9082713723182678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.173607736825943, + "step": 28840 + }, + { + "epoch": 0.57684, + "grad_norm": 2.046875, + "grad_norm_var": 0.0119781494140625, + "learning_rate": 0.0001, + "loss": 3.8438, + "loss/crossentropy": 2.0809885263442993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1808517575263977, + "step": 28842 + }, + { + "epoch": 0.57688, + "grad_norm": 1.921875, + "grad_norm_var": 0.011246490478515624, + "learning_rate": 0.0001, + "loss": 3.7981, + "loss/crossentropy": 1.8057249784469604, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17345206439495087, + "step": 28844 + }, + { + "epoch": 0.57692, + "grad_norm": 1.953125, + "grad_norm_var": 0.012627919514973959, + "learning_rate": 0.0001, + "loss": 4.2121, + "loss/crossentropy": 2.3364038467407227, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2242283821105957, + "step": 28846 + }, + { + "epoch": 0.57696, + "grad_norm": 2.078125, + "grad_norm_var": 0.014387003580729167, + "learning_rate": 0.0001, + "loss": 4.2341, + "loss/crossentropy": 2.256643772125244, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197833329439163, + "step": 28848 + }, + { + "epoch": 0.577, + "grad_norm": 1.9140625, + "grad_norm_var": 0.013270823160807292, + "learning_rate": 0.0001, + "loss": 4.0897, + "loss/crossentropy": 2.2531803846359253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19735650718212128, + "step": 28850 + }, + { + "epoch": 0.57704, + "grad_norm": 1.953125, + "grad_norm_var": 0.011897786458333334, + "learning_rate": 0.0001, + "loss": 4.1416, + "loss/crossentropy": 2.1737340688705444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2048204094171524, + "step": 28852 + }, + { + "epoch": 0.57708, + "grad_norm": 2.03125, + "grad_norm_var": 0.0110595703125, + "learning_rate": 0.0001, + "loss": 4.2294, + "loss/crossentropy": 2.2098671197891235, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21505644917488098, + "step": 28854 + }, + { + "epoch": 0.57712, + "grad_norm": 1.875, + "grad_norm_var": 0.011149088541666666, + "learning_rate": 0.0001, + "loss": 3.7808, + "loss/crossentropy": 2.0007169246673584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17798644304275513, + "step": 28856 + }, + { + "epoch": 0.57716, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0106597900390625, + "learning_rate": 0.0001, + "loss": 3.6825, + "loss/crossentropy": 1.9002285599708557, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18565426766872406, + "step": 28858 + }, + { + "epoch": 0.5772, + "grad_norm": 1.9375, + "grad_norm_var": 0.008833567301432291, + "learning_rate": 0.0001, + "loss": 3.8333, + "loss/crossentropy": 1.9257296323776245, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18187644332647324, + "step": 28860 + }, + { + "epoch": 0.57724, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007868448893229166, + "learning_rate": 0.0001, + "loss": 4.204, + "loss/crossentropy": 2.0777525305747986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168712735176086, + "step": 28862 + }, + { + "epoch": 0.57728, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0074127197265625, + "learning_rate": 0.0001, + "loss": 3.9328, + "loss/crossentropy": 2.1630115509033203, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19102369993925095, + "step": 28864 + }, + { + "epoch": 0.57732, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0077301025390625, + "learning_rate": 0.0001, + "loss": 3.7329, + "loss/crossentropy": 1.8166091442108154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1680990755558014, + "step": 28866 + }, + { + "epoch": 0.57736, + "grad_norm": 1.96875, + "grad_norm_var": 0.009419759114583334, + "learning_rate": 0.0001, + "loss": 4.1253, + "loss/crossentropy": 2.073417544364929, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20478951185941696, + "step": 28868 + }, + { + "epoch": 0.5774, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007184855143229167, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 1.9039225578308105, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17434976249933243, + "step": 28870 + }, + { + "epoch": 0.57744, + "grad_norm": 2.03125, + "grad_norm_var": 0.0077301025390625, + "learning_rate": 0.0001, + "loss": 4.0646, + "loss/crossentropy": 2.3139874935150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20631124079227448, + "step": 28872 + }, + { + "epoch": 0.57748, + "grad_norm": 1.8828125, + "grad_norm_var": 0.007279459635416667, + "learning_rate": 0.0001, + "loss": 3.8964, + "loss/crossentropy": 2.1889963150024414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22081536054611206, + "step": 28874 + }, + { + "epoch": 0.57752, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007283274332682292, + "learning_rate": 0.0001, + "loss": 3.9964, + "loss/crossentropy": 2.1492413878440857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1974084973335266, + "step": 28876 + }, + { + "epoch": 0.57756, + "grad_norm": 1.8125, + "grad_norm_var": 0.007201131184895833, + "learning_rate": 0.0001, + "loss": 3.751, + "loss/crossentropy": 1.8974568843841553, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1662147268652916, + "step": 28878 + }, + { + "epoch": 0.5776, + "grad_norm": 1.7578125, + "grad_norm_var": 0.008093007405598958, + "learning_rate": 0.0001, + "loss": 3.8792, + "loss/crossentropy": 2.0488428473472595, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929314285516739, + "step": 28880 + }, + { + "epoch": 0.57764, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007664998372395833, + "learning_rate": 0.0001, + "loss": 3.9943, + "loss/crossentropy": 1.9983501434326172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19050447642803192, + "step": 28882 + }, + { + "epoch": 0.57768, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0066640218098958336, + "learning_rate": 0.0001, + "loss": 3.8174, + "loss/crossentropy": 1.8651137948036194, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18415149301290512, + "step": 28884 + }, + { + "epoch": 0.57772, + "grad_norm": 1.828125, + "grad_norm_var": 0.007055409749348958, + "learning_rate": 0.0001, + "loss": 3.8707, + "loss/crossentropy": 2.0147623419761658, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1963946893811226, + "step": 28886 + }, + { + "epoch": 0.57776, + "grad_norm": 2.046875, + "grad_norm_var": 0.23955052693684895, + "learning_rate": 0.0001, + "loss": 4.1033, + "loss/crossentropy": 2.057682752609253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22113247215747833, + "step": 28888 + }, + { + "epoch": 0.5778, + "grad_norm": 1.875, + "grad_norm_var": 0.239501953125, + "learning_rate": 0.0001, + "loss": 4.2273, + "loss/crossentropy": 2.0742294788360596, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2198052555322647, + "step": 28890 + }, + { + "epoch": 0.57784, + "grad_norm": 2.0, + "grad_norm_var": 0.23716201782226562, + "learning_rate": 0.0001, + "loss": 4.0652, + "loss/crossentropy": 2.1344255208969116, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19126100838184357, + "step": 28892 + }, + { + "epoch": 0.57788, + "grad_norm": 1.84375, + "grad_norm_var": 0.23585611979166668, + "learning_rate": 0.0001, + "loss": 3.8575, + "loss/crossentropy": 1.8000599145889282, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17296022176742554, + "step": 28894 + }, + { + "epoch": 0.57792, + "grad_norm": 1.90625, + "grad_norm_var": 0.2333941141764323, + "learning_rate": 0.0001, + "loss": 4.0821, + "loss/crossentropy": 2.2380261421203613, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18993133306503296, + "step": 28896 + }, + { + "epoch": 0.57796, + "grad_norm": 1.8984375, + "grad_norm_var": 0.23112360636393228, + "learning_rate": 0.0001, + "loss": 4.1168, + "loss/crossentropy": 2.066701650619507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1944030448794365, + "step": 28898 + }, + { + "epoch": 0.578, + "grad_norm": 1.9609375, + "grad_norm_var": 0.22492268880208333, + "learning_rate": 0.0001, + "loss": 4.2338, + "loss/crossentropy": 2.2941386699676514, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2139548882842064, + "step": 28900 + }, + { + "epoch": 0.57804, + "grad_norm": 1.9609375, + "grad_norm_var": 0.22176920572916667, + "learning_rate": 0.0001, + "loss": 3.8767, + "loss/crossentropy": 2.206367015838623, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887761801481247, + "step": 28902 + }, + { + "epoch": 0.57808, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004473622639973958, + "learning_rate": 0.0001, + "loss": 4.1728, + "loss/crossentropy": 2.176335871219635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016155868768692, + "step": 28904 + }, + { + "epoch": 0.57812, + "grad_norm": 2.359375, + "grad_norm_var": 0.015022532145182291, + "learning_rate": 0.0001, + "loss": 4.2275, + "loss/crossentropy": 2.2069336771965027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20764954388141632, + "step": 28906 + }, + { + "epoch": 0.57816, + "grad_norm": 1.828125, + "grad_norm_var": 0.0157379150390625, + "learning_rate": 0.0001, + "loss": 3.9015, + "loss/crossentropy": 2.2566596269607544, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19805442541837692, + "step": 28908 + }, + { + "epoch": 0.5782, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0152252197265625, + "learning_rate": 0.0001, + "loss": 3.8767, + "loss/crossentropy": 1.8642338514328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19232972711324692, + "step": 28910 + }, + { + "epoch": 0.57824, + "grad_norm": 1.9453125, + "grad_norm_var": 0.015119425455729167, + "learning_rate": 0.0001, + "loss": 4.1759, + "loss/crossentropy": 1.9054778218269348, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1713176593184471, + "step": 28912 + }, + { + "epoch": 0.57828, + "grad_norm": 1.734375, + "grad_norm_var": 0.0199859619140625, + "learning_rate": 0.0001, + "loss": 3.6261, + "loss/crossentropy": 1.7423803210258484, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16640456765890121, + "step": 28914 + }, + { + "epoch": 0.57832, + "grad_norm": 2.03125, + "grad_norm_var": 0.02054443359375, + "learning_rate": 0.0001, + "loss": 3.9985, + "loss/crossentropy": 2.229828178882599, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20995531231164932, + "step": 28916 + }, + { + "epoch": 0.57836, + "grad_norm": 2.203125, + "grad_norm_var": 0.0248443603515625, + "learning_rate": 0.0001, + "loss": 3.9723, + "loss/crossentropy": 1.483105719089508, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15799641609191895, + "step": 28918 + }, + { + "epoch": 0.5784, + "grad_norm": 1.9140625, + "grad_norm_var": 0.024395497639973958, + "learning_rate": 0.0001, + "loss": 4.2127, + "loss/crossentropy": 2.2207179069519043, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1969587355852127, + "step": 28920 + }, + { + "epoch": 0.57844, + "grad_norm": 1.9375, + "grad_norm_var": 0.012259674072265626, + "learning_rate": 0.0001, + "loss": 3.9546, + "loss/crossentropy": 2.3045194149017334, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19103257358074188, + "step": 28922 + }, + { + "epoch": 0.57848, + "grad_norm": 2.078125, + "grad_norm_var": 0.014289347330729167, + "learning_rate": 0.0001, + "loss": 4.1317, + "loss/crossentropy": 2.0717945098876953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20137913525104523, + "step": 28924 + }, + { + "epoch": 0.57852, + "grad_norm": 2.046875, + "grad_norm_var": 0.014926910400390625, + "learning_rate": 0.0001, + "loss": 4.0922, + "loss/crossentropy": 2.2904374599456787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2105882465839386, + "step": 28926 + }, + { + "epoch": 0.57856, + "grad_norm": 1.7890625, + "grad_norm_var": 0.015818023681640626, + "learning_rate": 0.0001, + "loss": 3.9496, + "loss/crossentropy": 2.3007075786590576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22365766763687134, + "step": 28928 + }, + { + "epoch": 0.5786, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012276204427083333, + "learning_rate": 0.0001, + "loss": 4.082, + "loss/crossentropy": 2.0107430815696716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18182440847158432, + "step": 28930 + }, + { + "epoch": 0.57864, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013106028238932291, + "learning_rate": 0.0001, + "loss": 3.9859, + "loss/crossentropy": 2.1066025495529175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19574615359306335, + "step": 28932 + }, + { + "epoch": 0.57868, + "grad_norm": 1.890625, + "grad_norm_var": 0.007995351155598959, + "learning_rate": 0.0001, + "loss": 3.9903, + "loss/crossentropy": 2.1549283266067505, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20238593965768814, + "step": 28934 + }, + { + "epoch": 0.57872, + "grad_norm": 1.796875, + "grad_norm_var": 0.010734049479166667, + "learning_rate": 0.0001, + "loss": 3.683, + "loss/crossentropy": 2.0073814392089844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.185798779129982, + "step": 28936 + }, + { + "epoch": 0.57876, + "grad_norm": 2.0, + "grad_norm_var": 0.0111724853515625, + "learning_rate": 0.0001, + "loss": 3.8882, + "loss/crossentropy": 1.7613274455070496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848607361316681, + "step": 28938 + }, + { + "epoch": 0.5788, + "grad_norm": 1.9140625, + "grad_norm_var": 0.08815078735351563, + "learning_rate": 0.0001, + "loss": 4.0142, + "loss/crossentropy": 2.322823405265808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20245341211557388, + "step": 28940 + }, + { + "epoch": 0.57884, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0876129150390625, + "learning_rate": 0.0001, + "loss": 3.8313, + "loss/crossentropy": 1.7385436296463013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17173148691654205, + "step": 28942 + }, + { + "epoch": 0.57888, + "grad_norm": 1.7421875, + "grad_norm_var": 0.08886286417643229, + "learning_rate": 0.0001, + "loss": 3.9017, + "loss/crossentropy": 1.9626798033714294, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17217369377613068, + "step": 28944 + }, + { + "epoch": 0.57892, + "grad_norm": 1.6640625, + "grad_norm_var": 0.09463297526041667, + "learning_rate": 0.0001, + "loss": 3.7427, + "loss/crossentropy": 2.4835134744644165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2073291689157486, + "step": 28946 + }, + { + "epoch": 0.57896, + "grad_norm": 1.9609375, + "grad_norm_var": 0.09482192993164062, + "learning_rate": 0.0001, + "loss": 3.9099, + "loss/crossentropy": 2.0418132543563843, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23464705049991608, + "step": 28948 + }, + { + "epoch": 0.579, + "grad_norm": 1.9609375, + "grad_norm_var": 0.09468968709309895, + "learning_rate": 0.0001, + "loss": 4.2291, + "loss/crossentropy": 2.2382947206497192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21655261516571045, + "step": 28950 + }, + { + "epoch": 0.57904, + "grad_norm": 1.890625, + "grad_norm_var": 0.09024632771809896, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 2.0737303495407104, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19475168734788895, + "step": 28952 + }, + { + "epoch": 0.57908, + "grad_norm": 1.7890625, + "grad_norm_var": 0.09391988118489583, + "learning_rate": 0.0001, + "loss": 3.9371, + "loss/crossentropy": 2.088689923286438, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19861633330583572, + "step": 28954 + }, + { + "epoch": 0.57912, + "grad_norm": 2.0, + "grad_norm_var": 0.011472320556640625, + "learning_rate": 0.0001, + "loss": 3.6761, + "loss/crossentropy": 1.8260209560394287, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16850142180919647, + "step": 28956 + }, + { + "epoch": 0.57916, + "grad_norm": 1.921875, + "grad_norm_var": 0.010263824462890625, + "learning_rate": 0.0001, + "loss": 3.8753, + "loss/crossentropy": 1.834153652191162, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16283375024795532, + "step": 28958 + }, + { + "epoch": 0.5792, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009569295247395833, + "learning_rate": 0.0001, + "loss": 3.7873, + "loss/crossentropy": 1.9701993465423584, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18593734502792358, + "step": 28960 + }, + { + "epoch": 0.57924, + "grad_norm": 1.8125, + "grad_norm_var": 0.006965128580729166, + "learning_rate": 0.0001, + "loss": 3.7081, + "loss/crossentropy": 2.026169538497925, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18718570470809937, + "step": 28962 + }, + { + "epoch": 0.57928, + "grad_norm": 1.859375, + "grad_norm_var": 0.006400553385416666, + "learning_rate": 0.0001, + "loss": 4.1803, + "loss/crossentropy": 1.7088202238082886, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1926863193511963, + "step": 28964 + }, + { + "epoch": 0.57932, + "grad_norm": 1.9375, + "grad_norm_var": 0.006498209635416667, + "learning_rate": 0.0001, + "loss": 4.1078, + "loss/crossentropy": 2.008699059486389, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17876552045345306, + "step": 28966 + }, + { + "epoch": 0.57936, + "grad_norm": 1.984375, + "grad_norm_var": 0.007279459635416667, + "learning_rate": 0.0001, + "loss": 3.8845, + "loss/crossentropy": 1.7667925953865051, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16738490760326385, + "step": 28968 + }, + { + "epoch": 0.5794, + "grad_norm": 1.78125, + "grad_norm_var": 0.010239410400390624, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 1.7718802094459534, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15908128768205643, + "step": 28970 + }, + { + "epoch": 0.57944, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007437896728515625, + "learning_rate": 0.0001, + "loss": 4.0281, + "loss/crossentropy": 2.086379885673523, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19513976573944092, + "step": 28972 + }, + { + "epoch": 0.57948, + "grad_norm": 1.921875, + "grad_norm_var": 0.0073486328125, + "learning_rate": 0.0001, + "loss": 4.0432, + "loss/crossentropy": 2.157579243183136, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2022448629140854, + "step": 28974 + }, + { + "epoch": 0.57952, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007689412434895833, + "learning_rate": 0.0001, + "loss": 4.0684, + "loss/crossentropy": 1.9351604580879211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1752564013004303, + "step": 28976 + }, + { + "epoch": 0.57956, + "grad_norm": 1.765625, + "grad_norm_var": 0.0084625244140625, + "learning_rate": 0.0001, + "loss": 3.9584, + "loss/crossentropy": 1.8642477989196777, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1806860715150833, + "step": 28978 + }, + { + "epoch": 0.5796, + "grad_norm": 1.84375, + "grad_norm_var": 0.008420562744140625, + "learning_rate": 0.0001, + "loss": 3.9329, + "loss/crossentropy": 2.0846880078315735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18332630395889282, + "step": 28980 + }, + { + "epoch": 0.57964, + "grad_norm": 1.8125, + "grad_norm_var": 0.008595530192057292, + "learning_rate": 0.0001, + "loss": 3.993, + "loss/crossentropy": 2.019991397857666, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845393031835556, + "step": 28982 + }, + { + "epoch": 0.57968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008250935872395834, + "learning_rate": 0.0001, + "loss": 4.0237, + "loss/crossentropy": 2.148271918296814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20690763741731644, + "step": 28984 + }, + { + "epoch": 0.57972, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0036740620930989583, + "learning_rate": 0.0001, + "loss": 4.1667, + "loss/crossentropy": 2.243967056274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21643538773059845, + "step": 28986 + }, + { + "epoch": 0.57976, + "grad_norm": 1.953125, + "grad_norm_var": 0.004388173421223958, + "learning_rate": 0.0001, + "loss": 4.1014, + "loss/crossentropy": 2.094264507293701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19375698268413544, + "step": 28988 + }, + { + "epoch": 0.5798, + "grad_norm": 1.7578125, + "grad_norm_var": 0.005425771077473958, + "learning_rate": 0.0001, + "loss": 3.6987, + "loss/crossentropy": 2.0419931411743164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1727323830127716, + "step": 28990 + }, + { + "epoch": 0.57984, + "grad_norm": 1.9375, + "grad_norm_var": 0.0053708394368489586, + "learning_rate": 0.0001, + "loss": 3.887, + "loss/crossentropy": 2.1932610273361206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193511247634888, + "step": 28992 + }, + { + "epoch": 0.57988, + "grad_norm": 2.015625, + "grad_norm_var": 0.006646474202473958, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 1.9849725365638733, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20237861573696136, + "step": 28994 + }, + { + "epoch": 0.57992, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006420644124348959, + "learning_rate": 0.0001, + "loss": 3.8113, + "loss/crossentropy": 1.801219642162323, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1687634363770485, + "step": 28996 + }, + { + "epoch": 0.57996, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005686187744140625, + "learning_rate": 0.0001, + "loss": 4.0288, + "loss/crossentropy": 2.018694579601288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17765481770038605, + "step": 28998 + }, + { + "epoch": 0.58, + "grad_norm": 1.953125, + "grad_norm_var": 0.005598958333333333, + "learning_rate": 0.0001, + "loss": 4.1113, + "loss/crossentropy": 1.8309300541877747, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19343653321266174, + "step": 29000 + }, + { + "epoch": 0.58004, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005100250244140625, + "learning_rate": 0.0001, + "loss": 4.2134, + "loss/crossentropy": 2.264810800552368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20135793089866638, + "step": 29002 + }, + { + "epoch": 0.58008, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004903157552083333, + "learning_rate": 0.0001, + "loss": 3.9869, + "loss/crossentropy": 2.0125122666358948, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1977495700120926, + "step": 29004 + }, + { + "epoch": 0.58012, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0034739176432291665, + "learning_rate": 0.0001, + "loss": 3.7476, + "loss/crossentropy": 1.7581869959831238, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17235949635505676, + "step": 29006 + }, + { + "epoch": 0.58016, + "grad_norm": 1.984375, + "grad_norm_var": 0.0043609619140625, + "learning_rate": 0.0001, + "loss": 3.7818, + "loss/crossentropy": 2.0644874572753906, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20059636235237122, + "step": 29008 + }, + { + "epoch": 0.5802, + "grad_norm": 1.8828125, + "grad_norm_var": 0.0029296875, + "learning_rate": 0.0001, + "loss": 4.0232, + "loss/crossentropy": 2.2566142082214355, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18922609090805054, + "step": 29010 + }, + { + "epoch": 0.58024, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0065996805826822914, + "learning_rate": 0.0001, + "loss": 4.2414, + "loss/crossentropy": 2.3137046098709106, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21420790255069733, + "step": 29012 + }, + { + "epoch": 0.58028, + "grad_norm": 1.9296875, + "grad_norm_var": 0.0064280192057291664, + "learning_rate": 0.0001, + "loss": 3.9192, + "loss/crossentropy": 2.1429378986358643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20075450092554092, + "step": 29014 + }, + { + "epoch": 0.58032, + "grad_norm": 1.8671875, + "grad_norm_var": 0.007567342122395833, + "learning_rate": 0.0001, + "loss": 4.0905, + "loss/crossentropy": 2.5844578742980957, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21986322849988937, + "step": 29016 + }, + { + "epoch": 0.58036, + "grad_norm": 1.875, + "grad_norm_var": 0.0093902587890625, + "learning_rate": 0.0001, + "loss": 3.7977, + "loss/crossentropy": 1.847869634628296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16430244594812393, + "step": 29018 + }, + { + "epoch": 0.5804, + "grad_norm": 1.828125, + "grad_norm_var": 0.010544586181640624, + "learning_rate": 0.0001, + "loss": 3.8369, + "loss/crossentropy": 1.895844042301178, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18856079131364822, + "step": 29020 + }, + { + "epoch": 0.58044, + "grad_norm": 1.8984375, + "grad_norm_var": 0.009113566080729166, + "learning_rate": 0.0001, + "loss": 3.9936, + "loss/crossentropy": 2.1094332933425903, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2025027722120285, + "step": 29022 + }, + { + "epoch": 0.58048, + "grad_norm": 2.140625, + "grad_norm_var": 0.012172190348307292, + "learning_rate": 0.0001, + "loss": 4.1463, + "loss/crossentropy": 1.820436716079712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872788816690445, + "step": 29024 + }, + { + "epoch": 0.58052, + "grad_norm": 2.0, + "grad_norm_var": 0.011641184488932291, + "learning_rate": 0.0001, + "loss": 4.1155, + "loss/crossentropy": 2.371734142303467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20477186143398285, + "step": 29026 + }, + { + "epoch": 0.58056, + "grad_norm": 1.9921875, + "grad_norm_var": 0.010273996988932292, + "learning_rate": 0.0001, + "loss": 4.3274, + "loss/crossentropy": 2.4618901014328003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20902744680643082, + "step": 29028 + }, + { + "epoch": 0.5806, + "grad_norm": 1.9296875, + "grad_norm_var": 0.010044097900390625, + "learning_rate": 0.0001, + "loss": 4.0957, + "loss/crossentropy": 1.9462957382202148, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18320611119270325, + "step": 29030 + }, + { + "epoch": 0.58064, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010589345296223959, + "learning_rate": 0.0001, + "loss": 3.8647, + "loss/crossentropy": 2.07541561126709, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19628040492534637, + "step": 29032 + }, + { + "epoch": 0.58068, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009269205729166667, + "learning_rate": 0.0001, + "loss": 3.9063, + "loss/crossentropy": 2.144878387451172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21301092207431793, + "step": 29034 + }, + { + "epoch": 0.58072, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009490712483723959, + "learning_rate": 0.0001, + "loss": 3.8692, + "loss/crossentropy": 1.9113212823867798, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18449342995882034, + "step": 29036 + }, + { + "epoch": 0.58076, + "grad_norm": 1.953125, + "grad_norm_var": 0.0095611572265625, + "learning_rate": 0.0001, + "loss": 4.1034, + "loss/crossentropy": 2.211828351020813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19539960473775864, + "step": 29038 + }, + { + "epoch": 0.5808, + "grad_norm": 1.8515625, + "grad_norm_var": 0.005179595947265625, + "learning_rate": 0.0001, + "loss": 4.0377, + "loss/crossentropy": 2.1476653814315796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979052647948265, + "step": 29040 + }, + { + "epoch": 0.58084, + "grad_norm": 1.90625, + "grad_norm_var": 0.004239654541015625, + "learning_rate": 0.0001, + "loss": 3.8152, + "loss/crossentropy": 1.5791842937469482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17846909165382385, + "step": 29042 + }, + { + "epoch": 0.58088, + "grad_norm": 1.9609375, + "grad_norm_var": 0.002249908447265625, + "learning_rate": 0.0001, + "loss": 3.7192, + "loss/crossentropy": 1.77503103017807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1774892508983612, + "step": 29044 + }, + { + "epoch": 0.58092, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0028523763020833334, + "learning_rate": 0.0001, + "loss": 4.0933, + "loss/crossentropy": 1.9995554089546204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18692871183156967, + "step": 29046 + }, + { + "epoch": 0.58096, + "grad_norm": 1.859375, + "grad_norm_var": 0.002750396728515625, + "learning_rate": 0.0001, + "loss": 3.9813, + "loss/crossentropy": 2.057563364505768, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19816753268241882, + "step": 29048 + }, + { + "epoch": 0.581, + "grad_norm": 1.7265625, + "grad_norm_var": 0.010599517822265625, + "learning_rate": 0.0001, + "loss": 3.7361, + "loss/crossentropy": 2.1810996532440186, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18617865443229675, + "step": 29050 + }, + { + "epoch": 0.58104, + "grad_norm": 1.875, + "grad_norm_var": 0.010038248697916667, + "learning_rate": 0.0001, + "loss": 4.0803, + "loss/crossentropy": 2.212439775466919, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19946376979351044, + "step": 29052 + }, + { + "epoch": 0.58108, + "grad_norm": 1.8046875, + "grad_norm_var": 0.0109130859375, + "learning_rate": 0.0001, + "loss": 3.7017, + "loss/crossentropy": 1.4078394174575806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.14310228824615479, + "step": 29054 + }, + { + "epoch": 0.58112, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0114013671875, + "learning_rate": 0.0001, + "loss": 3.9539, + "loss/crossentropy": 2.0167892575263977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.206185445189476, + "step": 29056 + }, + { + "epoch": 0.58116, + "grad_norm": 1.9375, + "grad_norm_var": 0.013312784830729167, + "learning_rate": 0.0001, + "loss": 4.0975, + "loss/crossentropy": 2.0703752040863037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20013286918401718, + "step": 29058 + }, + { + "epoch": 0.5812, + "grad_norm": 1.7578125, + "grad_norm_var": 0.015047200520833333, + "learning_rate": 0.0001, + "loss": 3.7723, + "loss/crossentropy": 1.528024673461914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16279471665620804, + "step": 29060 + }, + { + "epoch": 0.58124, + "grad_norm": 2.015625, + "grad_norm_var": 0.016190338134765624, + "learning_rate": 0.0001, + "loss": 4.1933, + "loss/crossentropy": 2.235237717628479, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22028259187936783, + "step": 29062 + }, + { + "epoch": 0.58128, + "grad_norm": 1.84375, + "grad_norm_var": 0.017293294270833332, + "learning_rate": 0.0001, + "loss": 3.7427, + "loss/crossentropy": 1.8778278231620789, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16768044978380203, + "step": 29064 + }, + { + "epoch": 0.58132, + "grad_norm": 1.7265625, + "grad_norm_var": 0.011102040608723959, + "learning_rate": 0.0001, + "loss": 3.5901, + "loss/crossentropy": 1.8109752535820007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16292864084243774, + "step": 29066 + }, + { + "epoch": 0.58136, + "grad_norm": 2.328125, + "grad_norm_var": 0.023502604166666666, + "learning_rate": 0.0001, + "loss": 3.8298, + "loss/crossentropy": 1.9164445996284485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17200321704149246, + "step": 29068 + }, + { + "epoch": 0.5814, + "grad_norm": 2.03125, + "grad_norm_var": 0.024103800455729168, + "learning_rate": 0.0001, + "loss": 4.1952, + "loss/crossentropy": 2.1318963766098022, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20111309736967087, + "step": 29070 + }, + { + "epoch": 0.58144, + "grad_norm": 2.078125, + "grad_norm_var": 0.023606109619140624, + "learning_rate": 0.0001, + "loss": 4.1548, + "loss/crossentropy": 1.8939481377601624, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2058345079421997, + "step": 29072 + }, + { + "epoch": 0.58148, + "grad_norm": 1.96875, + "grad_norm_var": 0.0228424072265625, + "learning_rate": 0.0001, + "loss": 4.1685, + "loss/crossentropy": 2.0977996587753296, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19700003415346146, + "step": 29074 + }, + { + "epoch": 0.58152, + "grad_norm": 1.96875, + "grad_norm_var": 0.0199615478515625, + "learning_rate": 0.0001, + "loss": 4.0528, + "loss/crossentropy": 2.163521647453308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20974216610193253, + "step": 29076 + }, + { + "epoch": 0.58156, + "grad_norm": 1.9375, + "grad_norm_var": 0.019358062744140626, + "learning_rate": 0.0001, + "loss": 4.0584, + "loss/crossentropy": 2.2900352478027344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20909854024648666, + "step": 29078 + }, + { + "epoch": 0.5816, + "grad_norm": 1.8671875, + "grad_norm_var": 0.018733723958333334, + "learning_rate": 0.0001, + "loss": 4.0277, + "loss/crossentropy": 2.0664632320404053, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1772863119840622, + "step": 29080 + }, + { + "epoch": 0.58164, + "grad_norm": 1.765625, + "grad_norm_var": 0.018436431884765625, + "learning_rate": 0.0001, + "loss": 3.7484, + "loss/crossentropy": 1.9867297410964966, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18980251252651215, + "step": 29082 + }, + { + "epoch": 0.58168, + "grad_norm": 1.9765625, + "grad_norm_var": 0.00830078125, + "learning_rate": 0.0001, + "loss": 4.1918, + "loss/crossentropy": 2.318376064300537, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22295164316892624, + "step": 29084 + }, + { + "epoch": 0.58172, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006156412760416666, + "learning_rate": 0.0001, + "loss": 3.8932, + "loss/crossentropy": 2.0880175828933716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18823902308940887, + "step": 29086 + }, + { + "epoch": 0.58176, + "grad_norm": 2.109375, + "grad_norm_var": 0.007039133707682292, + "learning_rate": 0.0001, + "loss": 4.1964, + "loss/crossentropy": 2.5453847646713257, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24233754724264145, + "step": 29088 + }, + { + "epoch": 0.5818, + "grad_norm": 1.734375, + "grad_norm_var": 0.00999755859375, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 2.0354779958724976, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2055264338850975, + "step": 29090 + }, + { + "epoch": 0.58184, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009938303629557292, + "learning_rate": 0.0001, + "loss": 4.1731, + "loss/crossentropy": 2.3989592790603638, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18171776831150055, + "step": 29092 + }, + { + "epoch": 0.58188, + "grad_norm": 2.125, + "grad_norm_var": 0.013114166259765626, + "learning_rate": 0.0001, + "loss": 4.0719, + "loss/crossentropy": 2.0136520862579346, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18895280361175537, + "step": 29094 + }, + { + "epoch": 0.58192, + "grad_norm": 1.8046875, + "grad_norm_var": 0.013071441650390625, + "learning_rate": 0.0001, + "loss": 3.9095, + "loss/crossentropy": 2.057899534702301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17919254302978516, + "step": 29096 + }, + { + "epoch": 0.58196, + "grad_norm": 1.7890625, + "grad_norm_var": 0.013002268473307292, + "learning_rate": 0.0001, + "loss": 3.8561, + "loss/crossentropy": 1.9806170463562012, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2131671980023384, + "step": 29098 + }, + { + "epoch": 0.582, + "grad_norm": 1.890625, + "grad_norm_var": 0.013734690348307292, + "learning_rate": 0.0001, + "loss": 3.747, + "loss/crossentropy": 2.0952848196029663, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2057284563779831, + "step": 29100 + }, + { + "epoch": 0.58204, + "grad_norm": 2.015625, + "grad_norm_var": 0.013346354166666666, + "learning_rate": 0.0001, + "loss": 4.2041, + "loss/crossentropy": 2.2287687063217163, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18772760778665543, + "step": 29102 + }, + { + "epoch": 0.58208, + "grad_norm": 1.984375, + "grad_norm_var": 0.012221018473307291, + "learning_rate": 0.0001, + "loss": 3.9164, + "loss/crossentropy": 2.2227126359939575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2002912387251854, + "step": 29104 + }, + { + "epoch": 0.58212, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010042317708333333, + "learning_rate": 0.0001, + "loss": 3.724, + "loss/crossentropy": 1.9384364485740662, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18695514649152756, + "step": 29106 + }, + { + "epoch": 0.58216, + "grad_norm": 1.859375, + "grad_norm_var": 0.010481516520182291, + "learning_rate": 0.0001, + "loss": 3.8985, + "loss/crossentropy": 1.7653113007545471, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19013341516256332, + "step": 29108 + }, + { + "epoch": 0.5822, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008128865559895834, + "learning_rate": 0.0001, + "loss": 3.8348, + "loss/crossentropy": 1.7308897972106934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1701432168483734, + "step": 29110 + }, + { + "epoch": 0.58224, + "grad_norm": 2.234375, + "grad_norm_var": 0.017435709635416668, + "learning_rate": 0.0001, + "loss": 4.3871, + "loss/crossentropy": 2.286720633506775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2024293765425682, + "step": 29112 + }, + { + "epoch": 0.58228, + "grad_norm": 1.7890625, + "grad_norm_var": 0.0161529541015625, + "learning_rate": 0.0001, + "loss": 3.744, + "loss/crossentropy": 1.8618659377098083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18028806149959564, + "step": 29114 + }, + { + "epoch": 0.58232, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0154449462890625, + "learning_rate": 0.0001, + "loss": 4.0077, + "loss/crossentropy": 2.128199815750122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2121022716164589, + "step": 29116 + }, + { + "epoch": 0.58236, + "grad_norm": 1.84375, + "grad_norm_var": 0.0156890869140625, + "learning_rate": 0.0001, + "loss": 3.731, + "loss/crossentropy": 2.0003921389579773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1782400906085968, + "step": 29118 + }, + { + "epoch": 0.5824, + "grad_norm": 1.78125, + "grad_norm_var": 0.01651611328125, + "learning_rate": 0.0001, + "loss": 3.7362, + "loss/crossentropy": 1.653384268283844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1676875352859497, + "step": 29120 + }, + { + "epoch": 0.58244, + "grad_norm": 1.921875, + "grad_norm_var": 0.016261545817057292, + "learning_rate": 0.0001, + "loss": 4.1866, + "loss/crossentropy": 2.3602925539016724, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22520510107278824, + "step": 29122 + }, + { + "epoch": 0.58248, + "grad_norm": 1.984375, + "grad_norm_var": 0.017814127604166667, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 1.839131772518158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19381030648946762, + "step": 29124 + }, + { + "epoch": 0.58252, + "grad_norm": 2.09375, + "grad_norm_var": 0.017878977457682292, + "learning_rate": 0.0001, + "loss": 4.178, + "loss/crossentropy": 2.312969148159027, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22048135846853256, + "step": 29126 + }, + { + "epoch": 0.58256, + "grad_norm": 2.046875, + "grad_norm_var": 0.010846964518229167, + "learning_rate": 0.0001, + "loss": 4.0541, + "loss/crossentropy": 2.153268814086914, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2186889946460724, + "step": 29128 + }, + { + "epoch": 0.5826, + "grad_norm": 2.03125, + "grad_norm_var": 0.011527252197265626, + "learning_rate": 0.0001, + "loss": 3.9336, + "loss/crossentropy": 2.1008894443511963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1893206238746643, + "step": 29130 + }, + { + "epoch": 0.58264, + "grad_norm": 1.9453125, + "grad_norm_var": 0.011331939697265625, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.0414587259292603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952778548002243, + "step": 29132 + }, + { + "epoch": 0.58268, + "grad_norm": 2.09375, + "grad_norm_var": 0.013085683186848959, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.440865635871887, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21561746299266815, + "step": 29134 + }, + { + "epoch": 0.58272, + "grad_norm": 1.9296875, + "grad_norm_var": 0.012325032552083334, + "learning_rate": 0.0001, + "loss": 4.1542, + "loss/crossentropy": 2.309555768966675, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22722266614437103, + "step": 29136 + }, + { + "epoch": 0.58276, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012910715738932292, + "learning_rate": 0.0001, + "loss": 4.0456, + "loss/crossentropy": 2.1777420043945312, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1994665190577507, + "step": 29138 + }, + { + "epoch": 0.5828, + "grad_norm": 2.515625, + "grad_norm_var": 0.03129781087239583, + "learning_rate": 0.0001, + "loss": 3.9863, + "loss/crossentropy": 2.174704909324646, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19741538912057877, + "step": 29140 + }, + { + "epoch": 0.58284, + "grad_norm": 1.8046875, + "grad_norm_var": 0.03214111328125, + "learning_rate": 0.0001, + "loss": 3.889, + "loss/crossentropy": 2.0467293858528137, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19651174545288086, + "step": 29142 + }, + { + "epoch": 0.58288, + "grad_norm": 1.8671875, + "grad_norm_var": 0.030418904622395833, + "learning_rate": 0.0001, + "loss": 4.0606, + "loss/crossentropy": 2.050451397895813, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18523690849542618, + "step": 29144 + }, + { + "epoch": 0.58292, + "grad_norm": 1.9609375, + "grad_norm_var": 0.027787272135416666, + "learning_rate": 0.0001, + "loss": 4.1576, + "loss/crossentropy": 2.2801162004470825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20639225095510483, + "step": 29146 + }, + { + "epoch": 0.58296, + "grad_norm": 1.9140625, + "grad_norm_var": 0.03065973917643229, + "learning_rate": 0.0001, + "loss": 3.7002, + "loss/crossentropy": 1.7782122492790222, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15972331911325455, + "step": 29148 + }, + { + "epoch": 0.583, + "grad_norm": 2.015625, + "grad_norm_var": 0.029813639322916665, + "learning_rate": 0.0001, + "loss": 3.9282, + "loss/crossentropy": 2.1129753589630127, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19876645505428314, + "step": 29150 + }, + { + "epoch": 0.58304, + "grad_norm": 2.078125, + "grad_norm_var": 0.029259999593098957, + "learning_rate": 0.0001, + "loss": 4.0128, + "loss/crossentropy": 1.7885400652885437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17748355120420456, + "step": 29152 + }, + { + "epoch": 0.58308, + "grad_norm": 1.8671875, + "grad_norm_var": 0.029117584228515625, + "learning_rate": 0.0001, + "loss": 3.9049, + "loss/crossentropy": 1.9744374752044678, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20229245722293854, + "step": 29154 + }, + { + "epoch": 0.58312, + "grad_norm": 1.7890625, + "grad_norm_var": 0.007785797119140625, + "learning_rate": 0.0001, + "loss": 3.8136, + "loss/crossentropy": 1.7132557034492493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17987029254436493, + "step": 29156 + }, + { + "epoch": 0.58316, + "grad_norm": 1.875, + "grad_norm_var": 0.007389068603515625, + "learning_rate": 0.0001, + "loss": 3.9586, + "loss/crossentropy": 2.1243802309036255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19921009242534637, + "step": 29158 + }, + { + "epoch": 0.5832, + "grad_norm": 1.828125, + "grad_norm_var": 0.0078277587890625, + "learning_rate": 0.0001, + "loss": 3.8449, + "loss/crossentropy": 1.9450802206993103, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19191914796829224, + "step": 29160 + }, + { + "epoch": 0.58324, + "grad_norm": 1.875, + "grad_norm_var": 0.007372792561848958, + "learning_rate": 0.0001, + "loss": 3.9835, + "loss/crossentropy": 1.936660647392273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18147709965705872, + "step": 29162 + }, + { + "epoch": 0.58328, + "grad_norm": 2.125, + "grad_norm_var": 0.009403228759765625, + "learning_rate": 0.0001, + "loss": 4.1735, + "loss/crossentropy": 2.4342548847198486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22083742171525955, + "step": 29164 + }, + { + "epoch": 0.58332, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0081298828125, + "learning_rate": 0.0001, + "loss": 3.7471, + "loss/crossentropy": 1.8538039922714233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17534375935792923, + "step": 29166 + }, + { + "epoch": 0.58336, + "grad_norm": 1.8984375, + "grad_norm_var": 0.006959788004557292, + "learning_rate": 0.0001, + "loss": 4.0749, + "loss/crossentropy": 2.2351138591766357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20727767050266266, + "step": 29168 + }, + { + "epoch": 0.5834, + "grad_norm": 1.75, + "grad_norm_var": 0.008055623372395833, + "learning_rate": 0.0001, + "loss": 3.7464, + "loss/crossentropy": 1.655074954032898, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17760378867387772, + "step": 29170 + }, + { + "epoch": 0.58344, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007902018229166667, + "learning_rate": 0.0001, + "loss": 3.9026, + "loss/crossentropy": 1.7590230703353882, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17171693593263626, + "step": 29172 + }, + { + "epoch": 0.58348, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0090484619140625, + "learning_rate": 0.0001, + "loss": 3.8981, + "loss/crossentropy": 1.7671796083450317, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16575640812516212, + "step": 29174 + }, + { + "epoch": 0.58352, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012686920166015626, + "learning_rate": 0.0001, + "loss": 4.0845, + "loss/crossentropy": 1.8754821419715881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17734245210886002, + "step": 29176 + }, + { + "epoch": 0.58356, + "grad_norm": 1.859375, + "grad_norm_var": 0.013014475504557291, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 2.152361035346985, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.24318483471870422, + "step": 29178 + }, + { + "epoch": 0.5836, + "grad_norm": 1.96875, + "grad_norm_var": 0.010158030192057292, + "learning_rate": 0.0001, + "loss": 4.0301, + "loss/crossentropy": 2.091741681098938, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20466475933790207, + "step": 29180 + }, + { + "epoch": 0.58364, + "grad_norm": 1.8046875, + "grad_norm_var": 0.011757151285807291, + "learning_rate": 0.0001, + "loss": 4.003, + "loss/crossentropy": 2.493666410446167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20663270354270935, + "step": 29182 + }, + { + "epoch": 0.58368, + "grad_norm": 1.9765625, + "grad_norm_var": 0.012247467041015625, + "learning_rate": 0.0001, + "loss": 3.9938, + "loss/crossentropy": 2.311075210571289, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2115970104932785, + "step": 29184 + }, + { + "epoch": 0.58372, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010789998372395833, + "learning_rate": 0.0001, + "loss": 4.0891, + "loss/crossentropy": 2.2983113527297974, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2209111899137497, + "step": 29186 + }, + { + "epoch": 0.58376, + "grad_norm": 1.9765625, + "grad_norm_var": 0.009992472330729167, + "learning_rate": 0.0001, + "loss": 4.1642, + "loss/crossentropy": 2.035028040409088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19093219190835953, + "step": 29188 + }, + { + "epoch": 0.5838, + "grad_norm": 1.8515625, + "grad_norm_var": 0.008333333333333333, + "learning_rate": 0.0001, + "loss": 3.9368, + "loss/crossentropy": 2.1713255047798157, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836071014404297, + "step": 29190 + }, + { + "epoch": 0.58384, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005033111572265625, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 1.898379623889923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19957400858402252, + "step": 29192 + }, + { + "epoch": 0.58388, + "grad_norm": 1.90625, + "grad_norm_var": 0.004624176025390625, + "learning_rate": 0.0001, + "loss": 4.1757, + "loss/crossentropy": 1.930584728717804, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872590333223343, + "step": 29194 + }, + { + "epoch": 0.58392, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0045654296875, + "learning_rate": 0.0001, + "loss": 3.9956, + "loss/crossentropy": 2.2888232469558716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20355024188756943, + "step": 29196 + }, + { + "epoch": 0.58396, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005961100260416667, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.242384433746338, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2161145657300949, + "step": 29198 + }, + { + "epoch": 0.584, + "grad_norm": 1.8125, + "grad_norm_var": 0.0058024088541666664, + "learning_rate": 0.0001, + "loss": 3.9119, + "loss/crossentropy": 1.7642216086387634, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17429838329553604, + "step": 29200 + }, + { + "epoch": 0.58404, + "grad_norm": 1.9609375, + "grad_norm_var": 0.005785878499348958, + "learning_rate": 0.0001, + "loss": 4.0763, + "loss/crossentropy": 2.2808165550231934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21707262098789215, + "step": 29202 + }, + { + "epoch": 0.58408, + "grad_norm": 1.9375, + "grad_norm_var": 0.008158365885416666, + "learning_rate": 0.0001, + "loss": 4.2149, + "loss/crossentropy": 2.2542625665664673, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21408841013908386, + "step": 29204 + }, + { + "epoch": 0.58412, + "grad_norm": 1.9140625, + "grad_norm_var": 0.0079010009765625, + "learning_rate": 0.0001, + "loss": 3.8883, + "loss/crossentropy": 2.084090828895569, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18555748462677002, + "step": 29206 + }, + { + "epoch": 0.58416, + "grad_norm": 2.265625, + "grad_norm_var": 0.01376953125, + "learning_rate": 0.0001, + "loss": 4.1465, + "loss/crossentropy": 2.3946259021759033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20357777923345566, + "step": 29208 + }, + { + "epoch": 0.5842, + "grad_norm": 1.9375, + "grad_norm_var": 0.014697011311848958, + "learning_rate": 0.0001, + "loss": 4.1172, + "loss/crossentropy": 2.2105261087417603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2044535130262375, + "step": 29210 + }, + { + "epoch": 0.58424, + "grad_norm": 1.9140625, + "grad_norm_var": 0.018904368082682293, + "learning_rate": 0.0001, + "loss": 3.639, + "loss/crossentropy": 1.5793652534484863, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1763077676296234, + "step": 29212 + }, + { + "epoch": 0.58428, + "grad_norm": 2.03125, + "grad_norm_var": 0.016340128580729165, + "learning_rate": 0.0001, + "loss": 4.2916, + "loss/crossentropy": 2.352652430534363, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22064512968063354, + "step": 29214 + }, + { + "epoch": 0.58432, + "grad_norm": 1.7734375, + "grad_norm_var": 0.017374674479166668, + "learning_rate": 0.0001, + "loss": 3.6412, + "loss/crossentropy": 1.9069225192070007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17932289838790894, + "step": 29216 + }, + { + "epoch": 0.58436, + "grad_norm": 1.890625, + "grad_norm_var": 0.017899576822916666, + "learning_rate": 0.0001, + "loss": 3.8841, + "loss/crossentropy": 1.7844374179840088, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17104727029800415, + "step": 29218 + }, + { + "epoch": 0.5844, + "grad_norm": 1.859375, + "grad_norm_var": 0.01678441365559896, + "learning_rate": 0.0001, + "loss": 3.8258, + "loss/crossentropy": 1.917112410068512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17463716119527817, + "step": 29220 + }, + { + "epoch": 0.58444, + "grad_norm": 1.9140625, + "grad_norm_var": 0.017103830973307293, + "learning_rate": 0.0001, + "loss": 4.0424, + "loss/crossentropy": 1.9697306156158447, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1797391176223755, + "step": 29222 + }, + { + "epoch": 0.58448, + "grad_norm": 1.8671875, + "grad_norm_var": 0.012092081705729167, + "learning_rate": 0.0001, + "loss": 4.0616, + "loss/crossentropy": 2.1226717829704285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1818629801273346, + "step": 29224 + }, + { + "epoch": 0.58452, + "grad_norm": 1.90625, + "grad_norm_var": 0.010567220052083333, + "learning_rate": 0.0001, + "loss": 3.9699, + "loss/crossentropy": 2.406674861907959, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20577572286128998, + "step": 29226 + }, + { + "epoch": 0.58456, + "grad_norm": 1.9375, + "grad_norm_var": 0.008235677083333334, + "learning_rate": 0.0001, + "loss": 3.915, + "loss/crossentropy": 2.14900803565979, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19237977266311646, + "step": 29228 + }, + { + "epoch": 0.5846, + "grad_norm": 1.875, + "grad_norm_var": 0.006803385416666667, + "learning_rate": 0.0001, + "loss": 3.9717, + "loss/crossentropy": 1.7613251209259033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15341567248106003, + "step": 29230 + }, + { + "epoch": 0.58464, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0053403218587239586, + "learning_rate": 0.0001, + "loss": 3.9471, + "loss/crossentropy": 2.0507588386535645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1979541778564453, + "step": 29232 + }, + { + "epoch": 0.58468, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005985260009765625, + "learning_rate": 0.0001, + "loss": 3.9946, + "loss/crossentropy": 1.9034594297409058, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17682257294654846, + "step": 29234 + }, + { + "epoch": 0.58472, + "grad_norm": 1.90625, + "grad_norm_var": 0.0058258056640625, + "learning_rate": 0.0001, + "loss": 3.9959, + "loss/crossentropy": 2.317684054374695, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21268346905708313, + "step": 29236 + }, + { + "epoch": 0.58476, + "grad_norm": 1.9921875, + "grad_norm_var": 0.0064656575520833336, + "learning_rate": 0.0001, + "loss": 4.2448, + "loss/crossentropy": 2.15400493144989, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20123393833637238, + "step": 29238 + }, + { + "epoch": 0.5848, + "grad_norm": 1.8125, + "grad_norm_var": 0.0051513671875, + "learning_rate": 0.0001, + "loss": 4.0108, + "loss/crossentropy": 1.8610196709632874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16698840260505676, + "step": 29240 + }, + { + "epoch": 0.58484, + "grad_norm": 2.140625, + "grad_norm_var": 0.008213043212890625, + "learning_rate": 0.0001, + "loss": 3.9196, + "loss/crossentropy": 1.9535472989082336, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17425117641687393, + "step": 29242 + }, + { + "epoch": 0.58488, + "grad_norm": 1.875, + "grad_norm_var": 0.008185831705729167, + "learning_rate": 0.0001, + "loss": 3.9185, + "loss/crossentropy": 1.966432809829712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19216462224721909, + "step": 29244 + }, + { + "epoch": 0.58492, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009163157145182291, + "learning_rate": 0.0001, + "loss": 3.7805, + "loss/crossentropy": 2.0415098071098328, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1909058690071106, + "step": 29246 + }, + { + "epoch": 0.58496, + "grad_norm": 1.875, + "grad_norm_var": 0.010025787353515624, + "learning_rate": 0.0001, + "loss": 4.0979, + "loss/crossentropy": 2.1297216415405273, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19929852336645126, + "step": 29248 + }, + { + "epoch": 0.585, + "grad_norm": 1.8828125, + "grad_norm_var": 0.009417470296223958, + "learning_rate": 0.0001, + "loss": 4.0817, + "loss/crossentropy": 2.1042102575302124, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20568375289440155, + "step": 29250 + }, + { + "epoch": 0.58504, + "grad_norm": 1.9375, + "grad_norm_var": 0.009403483072916666, + "learning_rate": 0.0001, + "loss": 3.8923, + "loss/crossentropy": 2.1890978813171387, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029515951871872, + "step": 29252 + }, + { + "epoch": 0.58508, + "grad_norm": 2.0625, + "grad_norm_var": 0.009261067708333333, + "learning_rate": 0.0001, + "loss": 3.989, + "loss/crossentropy": 2.061577618122101, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19042493402957916, + "step": 29254 + }, + { + "epoch": 0.58512, + "grad_norm": 2.109375, + "grad_norm_var": 0.011366526285807291, + "learning_rate": 0.0001, + "loss": 4.0084, + "loss/crossentropy": 2.0101218819618225, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19894610345363617, + "step": 29256 + }, + { + "epoch": 0.58516, + "grad_norm": 2.03125, + "grad_norm_var": 0.008332316080729167, + "learning_rate": 0.0001, + "loss": 4.0886, + "loss/crossentropy": 1.7634521126747131, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18951942026615143, + "step": 29258 + }, + { + "epoch": 0.5852, + "grad_norm": 1.921875, + "grad_norm_var": 0.06482518513997396, + "learning_rate": 0.0001, + "loss": 3.9788, + "loss/crossentropy": 2.1544201374053955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19801289588212967, + "step": 29260 + }, + { + "epoch": 0.58524, + "grad_norm": 2.03125, + "grad_norm_var": 0.062154134114583336, + "learning_rate": 0.0001, + "loss": 3.9742, + "loss/crossentropy": 2.2371232509613037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19622696936130524, + "step": 29262 + }, + { + "epoch": 0.58528, + "grad_norm": 1.9140625, + "grad_norm_var": 0.06226781209309896, + "learning_rate": 0.0001, + "loss": 3.8049, + "loss/crossentropy": 1.9571356773376465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19151850044727325, + "step": 29264 + }, + { + "epoch": 0.58532, + "grad_norm": 1.8984375, + "grad_norm_var": 0.06256103515625, + "learning_rate": 0.0001, + "loss": 4.1047, + "loss/crossentropy": 2.149628520011902, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19668041914701462, + "step": 29266 + }, + { + "epoch": 0.58536, + "grad_norm": 1.921875, + "grad_norm_var": 0.06358006795247396, + "learning_rate": 0.0001, + "loss": 4.186, + "loss/crossentropy": 2.01633083820343, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19449809193611145, + "step": 29268 + }, + { + "epoch": 0.5854, + "grad_norm": 1.96875, + "grad_norm_var": 0.06281636555989584, + "learning_rate": 0.0001, + "loss": 4.0058, + "loss/crossentropy": 2.144730567932129, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20602761209011078, + "step": 29270 + }, + { + "epoch": 0.58544, + "grad_norm": 1.828125, + "grad_norm_var": 0.06256917317708334, + "learning_rate": 0.0001, + "loss": 3.9861, + "loss/crossentropy": 1.7255420684814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16974858194589615, + "step": 29272 + }, + { + "epoch": 0.58548, + "grad_norm": 1.7890625, + "grad_norm_var": 0.06447931925455729, + "learning_rate": 0.0001, + "loss": 4.1475, + "loss/crossentropy": 2.3286606073379517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2005838081240654, + "step": 29274 + }, + { + "epoch": 0.58552, + "grad_norm": 1.953125, + "grad_norm_var": 0.0079498291015625, + "learning_rate": 0.0001, + "loss": 4.1118, + "loss/crossentropy": 2.085566818714142, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19351552426815033, + "step": 29276 + }, + { + "epoch": 0.58556, + "grad_norm": 1.7421875, + "grad_norm_var": 0.010188547770182292, + "learning_rate": 0.0001, + "loss": 3.8693, + "loss/crossentropy": 1.9797600507736206, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2020813450217247, + "step": 29278 + }, + { + "epoch": 0.5856, + "grad_norm": 2.0, + "grad_norm_var": 0.011328125, + "learning_rate": 0.0001, + "loss": 4.0625, + "loss/crossentropy": 2.2091132402420044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1922968104481697, + "step": 29280 + }, + { + "epoch": 0.58564, + "grad_norm": 1.9609375, + "grad_norm_var": 0.011665852864583333, + "learning_rate": 0.0001, + "loss": 4.0093, + "loss/crossentropy": 2.220807909965515, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21127308905124664, + "step": 29282 + }, + { + "epoch": 0.58568, + "grad_norm": 2.109375, + "grad_norm_var": 0.013206990559895833, + "learning_rate": 0.0001, + "loss": 3.9675, + "loss/crossentropy": 1.9829080700874329, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18882058560848236, + "step": 29284 + }, + { + "epoch": 0.58572, + "grad_norm": 1.96875, + "grad_norm_var": 0.011864217122395833, + "learning_rate": 0.0001, + "loss": 4.1373, + "loss/crossentropy": 1.9328150153160095, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1966244801878929, + "step": 29286 + }, + { + "epoch": 0.58576, + "grad_norm": 1.984375, + "grad_norm_var": 0.0115631103515625, + "learning_rate": 0.0001, + "loss": 4.0272, + "loss/crossentropy": 2.109495759010315, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19433388113975525, + "step": 29288 + }, + { + "epoch": 0.5858, + "grad_norm": 1.890625, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 4.1732, + "loss/crossentropy": 2.3193604946136475, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2086661458015442, + "step": 29290 + }, + { + "epoch": 0.58584, + "grad_norm": 1.8671875, + "grad_norm_var": 0.010422515869140624, + "learning_rate": 0.0001, + "loss": 3.9131, + "loss/crossentropy": 2.015924036502838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1846477910876274, + "step": 29292 + }, + { + "epoch": 0.58588, + "grad_norm": 2.015625, + "grad_norm_var": 0.007323201497395833, + "learning_rate": 0.0001, + "loss": 4.1275, + "loss/crossentropy": 2.2601329684257507, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19521460682153702, + "step": 29294 + }, + { + "epoch": 0.58592, + "grad_norm": 1.96875, + "grad_norm_var": 0.005890909830729167, + "learning_rate": 0.0001, + "loss": 3.7702, + "loss/crossentropy": 1.8056439757347107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19355197250843048, + "step": 29296 + }, + { + "epoch": 0.58596, + "grad_norm": 2.046875, + "grad_norm_var": 0.005564117431640625, + "learning_rate": 0.0001, + "loss": 4.2989, + "loss/crossentropy": 2.071826934814453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19889701902866364, + "step": 29298 + }, + { + "epoch": 0.586, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0033111572265625, + "learning_rate": 0.0001, + "loss": 4.0176, + "loss/crossentropy": 2.2320820093154907, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21529612690210342, + "step": 29300 + }, + { + "epoch": 0.58604, + "grad_norm": 2.015625, + "grad_norm_var": 0.0034912109375, + "learning_rate": 0.0001, + "loss": 4.2805, + "loss/crossentropy": 2.1989121437072754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20347003638744354, + "step": 29302 + }, + { + "epoch": 0.58608, + "grad_norm": 1.859375, + "grad_norm_var": 0.0037679036458333334, + "learning_rate": 0.0001, + "loss": 3.8216, + "loss/crossentropy": 2.128119111061096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1872687190771103, + "step": 29304 + }, + { + "epoch": 0.58612, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010396067301432292, + "learning_rate": 0.0001, + "loss": 4.0913, + "loss/crossentropy": 1.9635827541351318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1851392686367035, + "step": 29306 + }, + { + "epoch": 0.58616, + "grad_norm": 2.1875, + "grad_norm_var": 0.0121002197265625, + "learning_rate": 0.0001, + "loss": 4.2218, + "loss/crossentropy": 2.2543106079101562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21720967441797256, + "step": 29308 + }, + { + "epoch": 0.5862, + "grad_norm": 2.0, + "grad_norm_var": 0.013387044270833334, + "learning_rate": 0.0001, + "loss": 3.8456, + "loss/crossentropy": 2.3444888591766357, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2140304371714592, + "step": 29310 + }, + { + "epoch": 0.58624, + "grad_norm": 1.984375, + "grad_norm_var": 0.013182576497395833, + "learning_rate": 0.0001, + "loss": 4.1098, + "loss/crossentropy": 2.0403956174850464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437155663967133, + "step": 29312 + }, + { + "epoch": 0.58628, + "grad_norm": 1.7890625, + "grad_norm_var": 0.01605199178059896, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 1.9070860743522644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18735133856534958, + "step": 29314 + }, + { + "epoch": 0.58632, + "grad_norm": 1.875, + "grad_norm_var": 0.018308258056640624, + "learning_rate": 0.0001, + "loss": 3.917, + "loss/crossentropy": 1.9588303565979004, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16786205023527145, + "step": 29316 + }, + { + "epoch": 0.58636, + "grad_norm": 1.7578125, + "grad_norm_var": 0.0215240478515625, + "learning_rate": 0.0001, + "loss": 3.8377, + "loss/crossentropy": 1.8938056230545044, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18695908039808273, + "step": 29318 + }, + { + "epoch": 0.5864, + "grad_norm": 1.78125, + "grad_norm_var": 0.022867584228515626, + "learning_rate": 0.0001, + "loss": 3.7435, + "loss/crossentropy": 2.254639148712158, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19976592808961868, + "step": 29320 + }, + { + "epoch": 0.58644, + "grad_norm": 1.8828125, + "grad_norm_var": 0.01688232421875, + "learning_rate": 0.0001, + "loss": 3.7988, + "loss/crossentropy": 2.2075421810150146, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18852433562278748, + "step": 29322 + }, + { + "epoch": 0.58648, + "grad_norm": 2.03125, + "grad_norm_var": 0.010860188802083334, + "learning_rate": 0.0001, + "loss": 3.9432, + "loss/crossentropy": 1.6794416904449463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18429157137870789, + "step": 29324 + }, + { + "epoch": 0.58652, + "grad_norm": 1.8515625, + "grad_norm_var": 0.010111236572265625, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 2.3620080947875977, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21314998716115952, + "step": 29326 + }, + { + "epoch": 0.58656, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0092681884765625, + "learning_rate": 0.0001, + "loss": 4.1378, + "loss/crossentropy": 2.3381006717681885, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20351716130971909, + "step": 29328 + }, + { + "epoch": 0.5866, + "grad_norm": 1.96875, + "grad_norm_var": 0.009765370686848959, + "learning_rate": 0.0001, + "loss": 3.9563, + "loss/crossentropy": 2.0491183400154114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19619758427143097, + "step": 29330 + }, + { + "epoch": 0.58664, + "grad_norm": 1.890625, + "grad_norm_var": 0.009855143229166667, + "learning_rate": 0.0001, + "loss": 3.8267, + "loss/crossentropy": 1.7091345191001892, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15915849804878235, + "step": 29332 + }, + { + "epoch": 0.58668, + "grad_norm": 1.9375, + "grad_norm_var": 0.005589803059895833, + "learning_rate": 0.0001, + "loss": 4.0168, + "loss/crossentropy": 2.1986491680145264, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1997922584414482, + "step": 29334 + }, + { + "epoch": 0.58672, + "grad_norm": 2.015625, + "grad_norm_var": 0.006086985270182292, + "learning_rate": 0.0001, + "loss": 3.8094, + "loss/crossentropy": 2.051343023777008, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19083235412836075, + "step": 29336 + }, + { + "epoch": 0.58676, + "grad_norm": 2.0625, + "grad_norm_var": 0.008845774332682292, + "learning_rate": 0.0001, + "loss": 3.7667, + "loss/crossentropy": 2.0580617785453796, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19386407732963562, + "step": 29338 + }, + { + "epoch": 0.5868, + "grad_norm": 1.796875, + "grad_norm_var": 0.007983144124348958, + "learning_rate": 0.0001, + "loss": 3.8437, + "loss/crossentropy": 1.988048791885376, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16248874366283417, + "step": 29340 + }, + { + "epoch": 0.58684, + "grad_norm": 2.359375, + "grad_norm_var": 0.022215779622395834, + "learning_rate": 0.0001, + "loss": 3.7604, + "loss/crossentropy": 1.9649037718772888, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17722021788358688, + "step": 29342 + }, + { + "epoch": 0.58688, + "grad_norm": 1.9140625, + "grad_norm_var": 0.021801503499348958, + "learning_rate": 0.0001, + "loss": 4.1983, + "loss/crossentropy": 2.014201283454895, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19323624670505524, + "step": 29344 + }, + { + "epoch": 0.58692, + "grad_norm": 1.8828125, + "grad_norm_var": 0.021170806884765626, + "learning_rate": 0.0001, + "loss": 3.8541, + "loss/crossentropy": 1.9666126370429993, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17525266110897064, + "step": 29346 + }, + { + "epoch": 0.58696, + "grad_norm": 1.8359375, + "grad_norm_var": 0.021923573811848958, + "learning_rate": 0.0001, + "loss": 3.7258, + "loss/crossentropy": 2.10392427444458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1954725682735443, + "step": 29348 + }, + { + "epoch": 0.587, + "grad_norm": 2.140625, + "grad_norm_var": 0.0308013916015625, + "learning_rate": 0.0001, + "loss": 4.0668, + "loss/crossentropy": 1.6613619327545166, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21818602830171585, + "step": 29350 + }, + { + "epoch": 0.58704, + "grad_norm": 1.875, + "grad_norm_var": 0.030460611979166666, + "learning_rate": 0.0001, + "loss": 4.0683, + "loss/crossentropy": 2.4305994510650635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2147374525666237, + "step": 29352 + }, + { + "epoch": 0.58708, + "grad_norm": 2.0625, + "grad_norm_var": 0.026944986979166665, + "learning_rate": 0.0001, + "loss": 4.2551, + "loss/crossentropy": 2.1946258544921875, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22057125717401505, + "step": 29354 + }, + { + "epoch": 0.58712, + "grad_norm": 1.9765625, + "grad_norm_var": 0.025852203369140625, + "learning_rate": 0.0001, + "loss": 4.1854, + "loss/crossentropy": 2.3252480030059814, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20392987877130508, + "step": 29356 + }, + { + "epoch": 0.58716, + "grad_norm": 1.8359375, + "grad_norm_var": 0.015999348958333333, + "learning_rate": 0.0001, + "loss": 3.9164, + "loss/crossentropy": 1.9041990041732788, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1759115606546402, + "step": 29358 + }, + { + "epoch": 0.5872, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0171783447265625, + "learning_rate": 0.0001, + "loss": 3.9968, + "loss/crossentropy": 1.9955511689186096, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1914498507976532, + "step": 29360 + }, + { + "epoch": 0.58724, + "grad_norm": 2.0, + "grad_norm_var": 0.0184814453125, + "learning_rate": 0.0001, + "loss": 3.7173, + "loss/crossentropy": 1.9557825922966003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19295810908079147, + "step": 29362 + }, + { + "epoch": 0.58728, + "grad_norm": 1.8046875, + "grad_norm_var": 0.01850153605143229, + "learning_rate": 0.0001, + "loss": 3.6251, + "loss/crossentropy": 1.4499501585960388, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.143144428730011, + "step": 29364 + }, + { + "epoch": 0.58732, + "grad_norm": 1.765625, + "grad_norm_var": 0.010536448160807291, + "learning_rate": 0.0001, + "loss": 3.8048, + "loss/crossentropy": 2.1275508999824524, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19290220737457275, + "step": 29366 + }, + { + "epoch": 0.58736, + "grad_norm": 1.8046875, + "grad_norm_var": 0.011177317301432291, + "learning_rate": 0.0001, + "loss": 3.7983, + "loss/crossentropy": 2.320215940475464, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19288549572229385, + "step": 29368 + }, + { + "epoch": 0.5874, + "grad_norm": 1.953125, + "grad_norm_var": 0.009248606363932292, + "learning_rate": 0.0001, + "loss": 4.0161, + "loss/crossentropy": 1.9968576431274414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18840061873197556, + "step": 29370 + }, + { + "epoch": 0.58744, + "grad_norm": 1.953125, + "grad_norm_var": 0.005671946207682291, + "learning_rate": 0.0001, + "loss": 4.2287, + "loss/crossentropy": 2.2592599391937256, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21203969419002533, + "step": 29372 + }, + { + "epoch": 0.58748, + "grad_norm": 2.0, + "grad_norm_var": 0.007051595052083333, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 1.9274957180023193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17894938588142395, + "step": 29374 + }, + { + "epoch": 0.58752, + "grad_norm": 1.890625, + "grad_norm_var": 0.006982421875, + "learning_rate": 0.0001, + "loss": 3.7913, + "loss/crossentropy": 2.252523422241211, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21168971061706543, + "step": 29376 + }, + { + "epoch": 0.58756, + "grad_norm": 1.90625, + "grad_norm_var": 0.010188802083333334, + "learning_rate": 0.0001, + "loss": 3.95, + "loss/crossentropy": 2.110221028327942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2013455107808113, + "step": 29378 + }, + { + "epoch": 0.5876, + "grad_norm": 1.8671875, + "grad_norm_var": 0.008750152587890626, + "learning_rate": 0.0001, + "loss": 3.8604, + "loss/crossentropy": 1.9283993244171143, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17572121322155, + "step": 29380 + }, + { + "epoch": 0.58764, + "grad_norm": 1.8125, + "grad_norm_var": 0.007950592041015624, + "learning_rate": 0.0001, + "loss": 3.988, + "loss/crossentropy": 2.23159658908844, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21318458020687103, + "step": 29382 + }, + { + "epoch": 0.58768, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0097076416015625, + "learning_rate": 0.0001, + "loss": 3.9589, + "loss/crossentropy": 2.2170876264572144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22383015602827072, + "step": 29384 + }, + { + "epoch": 0.58772, + "grad_norm": 1.8671875, + "grad_norm_var": 0.011173248291015625, + "learning_rate": 0.0001, + "loss": 4.0071, + "loss/crossentropy": 2.2335199117660522, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19595736265182495, + "step": 29386 + }, + { + "epoch": 0.58776, + "grad_norm": 2.015625, + "grad_norm_var": 0.0118072509765625, + "learning_rate": 0.0001, + "loss": 3.9657, + "loss/crossentropy": 2.165065050125122, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20269524306058884, + "step": 29388 + }, + { + "epoch": 0.5878, + "grad_norm": 1.8125, + "grad_norm_var": 0.012400054931640625, + "learning_rate": 0.0001, + "loss": 3.5373, + "loss/crossentropy": 1.8111371397972107, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17589127272367477, + "step": 29390 + }, + { + "epoch": 0.58784, + "grad_norm": 1.78125, + "grad_norm_var": 0.015110015869140625, + "learning_rate": 0.0001, + "loss": 3.8698, + "loss/crossentropy": 2.083921492099762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19300700724124908, + "step": 29392 + }, + { + "epoch": 0.58788, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010929361979166666, + "learning_rate": 0.0001, + "loss": 4.168, + "loss/crossentropy": 1.916080355644226, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18377907574176788, + "step": 29394 + }, + { + "epoch": 0.58792, + "grad_norm": 1.8828125, + "grad_norm_var": 0.011842600504557292, + "learning_rate": 0.0001, + "loss": 4.2627, + "loss/crossentropy": 2.2706268429756165, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2135249674320221, + "step": 29396 + }, + { + "epoch": 0.58796, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0125396728515625, + "learning_rate": 0.0001, + "loss": 3.649, + "loss/crossentropy": 1.99881511926651, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20278233289718628, + "step": 29398 + }, + { + "epoch": 0.588, + "grad_norm": 1.8203125, + "grad_norm_var": 0.009087880452473959, + "learning_rate": 0.0001, + "loss": 3.9936, + "loss/crossentropy": 1.9932519793510437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18421290814876556, + "step": 29400 + }, + { + "epoch": 0.58804, + "grad_norm": 1.8125, + "grad_norm_var": 0.008893839518229167, + "learning_rate": 0.0001, + "loss": 4.0858, + "loss/crossentropy": 2.287453770637512, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20635423064231873, + "step": 29402 + }, + { + "epoch": 0.58808, + "grad_norm": 1.9921875, + "grad_norm_var": 0.008544921875, + "learning_rate": 0.0001, + "loss": 4.3109, + "loss/crossentropy": 2.2095491886138916, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1934846043586731, + "step": 29404 + }, + { + "epoch": 0.58812, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0108795166015625, + "learning_rate": 0.0001, + "loss": 4.1356, + "loss/crossentropy": 2.0484838485717773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19288712739944458, + "step": 29406 + }, + { + "epoch": 0.58816, + "grad_norm": 2.0, + "grad_norm_var": 0.009110514322916667, + "learning_rate": 0.0001, + "loss": 3.9269, + "loss/crossentropy": 2.431947708129883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21592531353235245, + "step": 29408 + }, + { + "epoch": 0.5882, + "grad_norm": 1.8671875, + "grad_norm_var": 0.009049479166666667, + "learning_rate": 0.0001, + "loss": 4.0748, + "loss/crossentropy": 2.0007177591323853, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1799403429031372, + "step": 29410 + }, + { + "epoch": 0.58824, + "grad_norm": 2.03125, + "grad_norm_var": 0.009224446614583333, + "learning_rate": 0.0001, + "loss": 4.1588, + "loss/crossentropy": 2.168426990509033, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2031354159116745, + "step": 29412 + }, + { + "epoch": 0.58828, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007917277018229167, + "learning_rate": 0.0001, + "loss": 4.0162, + "loss/crossentropy": 2.1504631638526917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19324814528226852, + "step": 29414 + }, + { + "epoch": 0.58832, + "grad_norm": 1.9375, + "grad_norm_var": 0.011515299479166666, + "learning_rate": 0.0001, + "loss": 3.9259, + "loss/crossentropy": 2.013205111026764, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18360282480716705, + "step": 29416 + }, + { + "epoch": 0.58836, + "grad_norm": 1.8828125, + "grad_norm_var": 0.010776519775390625, + "learning_rate": 0.0001, + "loss": 4.0384, + "loss/crossentropy": 2.155342757701874, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2049562856554985, + "step": 29418 + }, + { + "epoch": 0.5884, + "grad_norm": 1.921875, + "grad_norm_var": 0.012041982014973958, + "learning_rate": 0.0001, + "loss": 4.0741, + "loss/crossentropy": 2.117911696434021, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18994345515966415, + "step": 29420 + }, + { + "epoch": 0.58844, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009422810872395833, + "learning_rate": 0.0001, + "loss": 4.0517, + "loss/crossentropy": 1.980055332183838, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1862584576010704, + "step": 29422 + }, + { + "epoch": 0.58848, + "grad_norm": 2.078125, + "grad_norm_var": 0.009364573160807292, + "learning_rate": 0.0001, + "loss": 4.1284, + "loss/crossentropy": 2.2244694232940674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21067889779806137, + "step": 29424 + }, + { + "epoch": 0.58852, + "grad_norm": 1.9453125, + "grad_norm_var": 0.008876291910807292, + "learning_rate": 0.0001, + "loss": 4.1993, + "loss/crossentropy": 2.213523805141449, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20092912763357162, + "step": 29426 + }, + { + "epoch": 0.58856, + "grad_norm": 2.015625, + "grad_norm_var": 0.0081695556640625, + "learning_rate": 0.0001, + "loss": 4.1802, + "loss/crossentropy": 1.9292373657226562, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17156004905700684, + "step": 29428 + }, + { + "epoch": 0.5886, + "grad_norm": 1.796875, + "grad_norm_var": 0.011537424723307292, + "learning_rate": 0.0001, + "loss": 4.0741, + "loss/crossentropy": 2.4728565216064453, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22577238827943802, + "step": 29430 + }, + { + "epoch": 0.58864, + "grad_norm": 1.78125, + "grad_norm_var": 0.011639149983723958, + "learning_rate": 0.0001, + "loss": 3.9782, + "loss/crossentropy": 1.757906973361969, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19193939119577408, + "step": 29432 + }, + { + "epoch": 0.58868, + "grad_norm": 2.03125, + "grad_norm_var": 0.011351521809895833, + "learning_rate": 0.0001, + "loss": 4.019, + "loss/crossentropy": 2.120426893234253, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19062675535678864, + "step": 29434 + }, + { + "epoch": 0.58872, + "grad_norm": 2.140625, + "grad_norm_var": 0.01217041015625, + "learning_rate": 0.0001, + "loss": 3.8409, + "loss/crossentropy": 1.7993061542510986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1687781661748886, + "step": 29436 + }, + { + "epoch": 0.58876, + "grad_norm": 1.90625, + "grad_norm_var": 0.01608250935872396, + "learning_rate": 0.0001, + "loss": 3.775, + "loss/crossentropy": 1.995628297328949, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18463349342346191, + "step": 29438 + }, + { + "epoch": 0.5888, + "grad_norm": 1.90625, + "grad_norm_var": 0.015329742431640625, + "learning_rate": 0.0001, + "loss": 3.8311, + "loss/crossentropy": 2.411523938179016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2045421302318573, + "step": 29440 + }, + { + "epoch": 0.58884, + "grad_norm": 1.8046875, + "grad_norm_var": 0.017288970947265624, + "learning_rate": 0.0001, + "loss": 3.7145, + "loss/crossentropy": 1.9225006103515625, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17880843579769135, + "step": 29442 + }, + { + "epoch": 0.58888, + "grad_norm": 1.8203125, + "grad_norm_var": 0.017594401041666666, + "learning_rate": 0.0001, + "loss": 3.9736, + "loss/crossentropy": 2.1082658171653748, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19336317479610443, + "step": 29444 + }, + { + "epoch": 0.58892, + "grad_norm": 1.921875, + "grad_norm_var": 0.013362630208333334, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.350805640220642, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19736752659082413, + "step": 29446 + }, + { + "epoch": 0.58896, + "grad_norm": 1.875, + "grad_norm_var": 0.009712727864583333, + "learning_rate": 0.0001, + "loss": 3.9396, + "loss/crossentropy": 2.1030595302581787, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1941584125161171, + "step": 29448 + }, + { + "epoch": 0.589, + "grad_norm": 1.765625, + "grad_norm_var": 0.009700266520182292, + "learning_rate": 0.0001, + "loss": 3.6338, + "loss/crossentropy": 2.173567295074463, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19058758020401, + "step": 29450 + }, + { + "epoch": 0.58904, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0044514973958333336, + "learning_rate": 0.0001, + "loss": 4.1794, + "loss/crossentropy": 2.113981068134308, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20183458924293518, + "step": 29452 + }, + { + "epoch": 0.58908, + "grad_norm": 1.6796875, + "grad_norm_var": 0.0050537109375, + "learning_rate": 0.0001, + "loss": 3.8333, + "loss/crossentropy": 1.6564378142356873, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16441450640559196, + "step": 29454 + }, + { + "epoch": 0.58912, + "grad_norm": 1.671875, + "grad_norm_var": 0.007103474934895834, + "learning_rate": 0.0001, + "loss": 3.591, + "loss/crossentropy": 1.7756662368774414, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1805124208331108, + "step": 29456 + }, + { + "epoch": 0.58916, + "grad_norm": 1.828125, + "grad_norm_var": 0.006990305582682292, + "learning_rate": 0.0001, + "loss": 3.8145, + "loss/crossentropy": 1.825185477733612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1628718078136444, + "step": 29458 + }, + { + "epoch": 0.5892, + "grad_norm": 1.875, + "grad_norm_var": 0.006001790364583333, + "learning_rate": 0.0001, + "loss": 3.9426, + "loss/crossentropy": 2.0170122385025024, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19748982787132263, + "step": 29460 + }, + { + "epoch": 0.58924, + "grad_norm": 1.828125, + "grad_norm_var": 0.006783040364583334, + "learning_rate": 0.0001, + "loss": 3.8533, + "loss/crossentropy": 1.8711951971054077, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20168818533420563, + "step": 29462 + }, + { + "epoch": 0.58928, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006648508707682291, + "learning_rate": 0.0001, + "loss": 3.7884, + "loss/crossentropy": 1.668130338191986, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1706988662481308, + "step": 29464 + }, + { + "epoch": 0.58932, + "grad_norm": 1.9140625, + "grad_norm_var": 0.006552886962890625, + "learning_rate": 0.0001, + "loss": 3.8763, + "loss/crossentropy": 1.8372064232826233, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17468038201332092, + "step": 29466 + }, + { + "epoch": 0.58936, + "grad_norm": 1.8203125, + "grad_norm_var": 0.006811269124348958, + "learning_rate": 0.0001, + "loss": 4.235, + "loss/crossentropy": 2.234304904937744, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1845388039946556, + "step": 29468 + }, + { + "epoch": 0.5894, + "grad_norm": 1.7421875, + "grad_norm_var": 0.00662841796875, + "learning_rate": 0.0001, + "loss": 3.5877, + "loss/crossentropy": 1.9101244807243347, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17081473767757416, + "step": 29470 + }, + { + "epoch": 0.58944, + "grad_norm": 1.8046875, + "grad_norm_var": 0.004353586832682292, + "learning_rate": 0.0001, + "loss": 4.0295, + "loss/crossentropy": 2.2079320549964905, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1952483355998993, + "step": 29472 + }, + { + "epoch": 0.58948, + "grad_norm": 1.953125, + "grad_norm_var": 0.00458984375, + "learning_rate": 0.0001, + "loss": 3.9308, + "loss/crossentropy": 1.8550963997840881, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18943903595209122, + "step": 29474 + }, + { + "epoch": 0.58952, + "grad_norm": 1.9140625, + "grad_norm_var": 0.004865519205729167, + "learning_rate": 0.0001, + "loss": 3.7654, + "loss/crossentropy": 1.7661077976226807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19584594666957855, + "step": 29476 + }, + { + "epoch": 0.58956, + "grad_norm": 1.9296875, + "grad_norm_var": 0.005029042561848958, + "learning_rate": 0.0001, + "loss": 4.2088, + "loss/crossentropy": 2.338898777961731, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21608547866344452, + "step": 29478 + }, + { + "epoch": 0.5896, + "grad_norm": 1.8203125, + "grad_norm_var": 0.005242665608723958, + "learning_rate": 0.0001, + "loss": 4.1212, + "loss/crossentropy": 2.3376599550247192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20884696394205093, + "step": 29480 + }, + { + "epoch": 0.58964, + "grad_norm": 1.8125, + "grad_norm_var": 0.005964152018229167, + "learning_rate": 0.0001, + "loss": 4.16, + "loss/crossentropy": 2.241575002670288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2200220599770546, + "step": 29482 + }, + { + "epoch": 0.58968, + "grad_norm": 2.328125, + "grad_norm_var": 0.017805735270182293, + "learning_rate": 0.0001, + "loss": 4.0612, + "loss/crossentropy": 2.010516047477722, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22505514323711395, + "step": 29484 + }, + { + "epoch": 0.58972, + "grad_norm": 2.0625, + "grad_norm_var": 0.017545318603515624, + "learning_rate": 0.0001, + "loss": 3.9692, + "loss/crossentropy": 2.0848931670188904, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21239446848630905, + "step": 29486 + }, + { + "epoch": 0.58976, + "grad_norm": 1.921875, + "grad_norm_var": 0.016315714518229166, + "learning_rate": 0.0001, + "loss": 4.1173, + "loss/crossentropy": 2.0518574118614197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19289197772741318, + "step": 29488 + }, + { + "epoch": 0.5898, + "grad_norm": 2.0625, + "grad_norm_var": 0.016460927327473958, + "learning_rate": 0.0001, + "loss": 4.2193, + "loss/crossentropy": 2.1830244064331055, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21049733459949493, + "step": 29490 + }, + { + "epoch": 0.58984, + "grad_norm": 2.0, + "grad_norm_var": 0.018833160400390625, + "learning_rate": 0.0001, + "loss": 4.416, + "loss/crossentropy": 2.1932766437530518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886359080672264, + "step": 29492 + }, + { + "epoch": 0.58988, + "grad_norm": 1.9921875, + "grad_norm_var": 0.018595123291015626, + "learning_rate": 0.0001, + "loss": 4.1936, + "loss/crossentropy": 2.0677687525749207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19729864597320557, + "step": 29494 + }, + { + "epoch": 0.58992, + "grad_norm": 1.984375, + "grad_norm_var": 0.0149169921875, + "learning_rate": 0.0001, + "loss": 4.1461, + "loss/crossentropy": 2.181239366531372, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2087482586503029, + "step": 29496 + }, + { + "epoch": 0.58996, + "grad_norm": 1.890625, + "grad_norm_var": 0.013132476806640625, + "learning_rate": 0.0001, + "loss": 4.0357, + "loss/crossentropy": 2.2734841108322144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22230970114469528, + "step": 29498 + }, + { + "epoch": 0.59, + "grad_norm": 1.9921875, + "grad_norm_var": 0.007454427083333334, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.1078773736953735, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18602187931537628, + "step": 29500 + }, + { + "epoch": 0.59004, + "grad_norm": 2.046875, + "grad_norm_var": 0.009635416666666667, + "learning_rate": 0.0001, + "loss": 3.7355, + "loss/crossentropy": 1.856580138206482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.186849907040596, + "step": 29502 + }, + { + "epoch": 0.59008, + "grad_norm": 1.8125, + "grad_norm_var": 0.011842600504557292, + "learning_rate": 0.0001, + "loss": 3.6756, + "loss/crossentropy": 1.90110582113266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18654846400022507, + "step": 29504 + }, + { + "epoch": 0.59012, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0134918212890625, + "learning_rate": 0.0001, + "loss": 4.0382, + "loss/crossentropy": 1.874162197113037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18833528459072113, + "step": 29506 + }, + { + "epoch": 0.59016, + "grad_norm": 1.890625, + "grad_norm_var": 0.009694163004557292, + "learning_rate": 0.0001, + "loss": 4.0473, + "loss/crossentropy": 2.0299471020698547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19723627716302872, + "step": 29508 + }, + { + "epoch": 0.5902, + "grad_norm": 1.859375, + "grad_norm_var": 0.01085205078125, + "learning_rate": 0.0001, + "loss": 3.9377, + "loss/crossentropy": 1.8558722138404846, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18302107602357864, + "step": 29510 + }, + { + "epoch": 0.59024, + "grad_norm": 1.9453125, + "grad_norm_var": 0.010359446207682291, + "learning_rate": 0.0001, + "loss": 4.0567, + "loss/crossentropy": 1.8753638863563538, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2061474472284317, + "step": 29512 + }, + { + "epoch": 0.59028, + "grad_norm": 1.8125, + "grad_norm_var": 0.011954498291015626, + "learning_rate": 0.0001, + "loss": 3.8181, + "loss/crossentropy": 1.9646863341331482, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18984466791152954, + "step": 29514 + }, + { + "epoch": 0.59032, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0124755859375, + "learning_rate": 0.0001, + "loss": 3.7632, + "loss/crossentropy": 1.9403189420700073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17063701152801514, + "step": 29516 + }, + { + "epoch": 0.59036, + "grad_norm": 1.8359375, + "grad_norm_var": 0.010774739583333333, + "learning_rate": 0.0001, + "loss": 3.7807, + "loss/crossentropy": 1.9278589487075806, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1793733760714531, + "step": 29518 + }, + { + "epoch": 0.5904, + "grad_norm": 2.28125, + "grad_norm_var": 0.01877009073893229, + "learning_rate": 0.0001, + "loss": 4.109, + "loss/crossentropy": 2.2447856664657593, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21776193380355835, + "step": 29520 + }, + { + "epoch": 0.59044, + "grad_norm": 1.84375, + "grad_norm_var": 0.015572102864583333, + "learning_rate": 0.0001, + "loss": 4.0259, + "loss/crossentropy": 2.3484312891960144, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2081959918141365, + "step": 29522 + }, + { + "epoch": 0.59048, + "grad_norm": 2.015625, + "grad_norm_var": 0.016434478759765624, + "learning_rate": 0.0001, + "loss": 3.9571, + "loss/crossentropy": 2.121490001678467, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2033299133181572, + "step": 29524 + }, + { + "epoch": 0.59052, + "grad_norm": 2.140625, + "grad_norm_var": 0.017758941650390624, + "learning_rate": 0.0001, + "loss": 3.9742, + "loss/crossentropy": 2.3825392723083496, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22553616762161255, + "step": 29526 + }, + { + "epoch": 0.59056, + "grad_norm": 1.9296875, + "grad_norm_var": 0.01761652628580729, + "learning_rate": 0.0001, + "loss": 3.9717, + "loss/crossentropy": 2.027899742126465, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18623250722885132, + "step": 29528 + }, + { + "epoch": 0.5906, + "grad_norm": 2.03125, + "grad_norm_var": 0.017732747395833335, + "learning_rate": 0.0001, + "loss": 4.3458, + "loss/crossentropy": 2.231070041656494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22132613509893417, + "step": 29530 + }, + { + "epoch": 0.59064, + "grad_norm": 2.015625, + "grad_norm_var": 0.01613133748372396, + "learning_rate": 0.0001, + "loss": 4.1003, + "loss/crossentropy": 2.16735976934433, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1951517015695572, + "step": 29532 + }, + { + "epoch": 0.59068, + "grad_norm": 2.234375, + "grad_norm_var": 0.019188435872395833, + "learning_rate": 0.0001, + "loss": 3.8917, + "loss/crossentropy": 1.8973720073699951, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18189363181591034, + "step": 29534 + }, + { + "epoch": 0.59072, + "grad_norm": 1.9375, + "grad_norm_var": 0.01687800089518229, + "learning_rate": 0.0001, + "loss": 3.7249, + "loss/crossentropy": 2.0840883255004883, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19777310639619827, + "step": 29536 + }, + { + "epoch": 0.59076, + "grad_norm": 2.03125, + "grad_norm_var": 0.016291300455729168, + "learning_rate": 0.0001, + "loss": 3.8481, + "loss/crossentropy": 1.7272367477416992, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1722969114780426, + "step": 29538 + }, + { + "epoch": 0.5908, + "grad_norm": 1.984375, + "grad_norm_var": 0.015024566650390625, + "learning_rate": 0.0001, + "loss": 3.9883, + "loss/crossentropy": 2.2268466353416443, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19153434038162231, + "step": 29540 + }, + { + "epoch": 0.59084, + "grad_norm": 1.921875, + "grad_norm_var": 0.012739817301432291, + "learning_rate": 0.0001, + "loss": 4.0964, + "loss/crossentropy": 2.4328715801239014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21078508347272873, + "step": 29542 + }, + { + "epoch": 0.59088, + "grad_norm": 1.9921875, + "grad_norm_var": 0.013325754801432292, + "learning_rate": 0.0001, + "loss": 4.0033, + "loss/crossentropy": 1.9592428803443909, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19666817784309387, + "step": 29544 + }, + { + "epoch": 0.59092, + "grad_norm": 1.9296875, + "grad_norm_var": 0.011620076497395833, + "learning_rate": 0.0001, + "loss": 4.203, + "loss/crossentropy": 1.8915135860443115, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18332552164793015, + "step": 29546 + }, + { + "epoch": 0.59096, + "grad_norm": 1.890625, + "grad_norm_var": 0.011834462483723959, + "learning_rate": 0.0001, + "loss": 3.884, + "loss/crossentropy": 1.9134865403175354, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2012096270918846, + "step": 29548 + }, + { + "epoch": 0.591, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0061948140462239586, + "learning_rate": 0.0001, + "loss": 4.2232, + "loss/crossentropy": 2.280379056930542, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19985026121139526, + "step": 29550 + }, + { + "epoch": 0.59104, + "grad_norm": 1.921875, + "grad_norm_var": 0.00355224609375, + "learning_rate": 0.0001, + "loss": 4.144, + "loss/crossentropy": 2.000941574573517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18684983253479004, + "step": 29552 + }, + { + "epoch": 0.59108, + "grad_norm": 1.7734375, + "grad_norm_var": 0.005020904541015625, + "learning_rate": 0.0001, + "loss": 3.7859, + "loss/crossentropy": 1.9749475121498108, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18268440663814545, + "step": 29554 + }, + { + "epoch": 0.59112, + "grad_norm": 1.9375, + "grad_norm_var": 0.0055653889973958336, + "learning_rate": 0.0001, + "loss": 4.0483, + "loss/crossentropy": 2.169970750808716, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20792870223522186, + "step": 29556 + }, + { + "epoch": 0.59116, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005712890625, + "learning_rate": 0.0001, + "loss": 4.0875, + "loss/crossentropy": 2.059566855430603, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1943632885813713, + "step": 29558 + }, + { + "epoch": 0.5912, + "grad_norm": 2.53125, + "grad_norm_var": 0.0299713134765625, + "learning_rate": 0.0001, + "loss": 4.0545, + "loss/crossentropy": 1.80315762758255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15433073043823242, + "step": 29560 + }, + { + "epoch": 0.59124, + "grad_norm": 2.109375, + "grad_norm_var": 0.0330078125, + "learning_rate": 0.0001, + "loss": 3.9351, + "loss/crossentropy": 2.1045247316360474, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19684398919343948, + "step": 29562 + }, + { + "epoch": 0.59128, + "grad_norm": 1.8671875, + "grad_norm_var": 0.03242162068684896, + "learning_rate": 0.0001, + "loss": 4.1244, + "loss/crossentropy": 2.2456302642822266, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1998264193534851, + "step": 29564 + }, + { + "epoch": 0.59132, + "grad_norm": 1.96875, + "grad_norm_var": 0.032389322916666664, + "learning_rate": 0.0001, + "loss": 3.8945, + "loss/crossentropy": 2.105454444885254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18675578385591507, + "step": 29566 + }, + { + "epoch": 0.59136, + "grad_norm": 2.0, + "grad_norm_var": 0.03316624959309896, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 1.8126618266105652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19125473499298096, + "step": 29568 + }, + { + "epoch": 0.5914, + "grad_norm": 1.8671875, + "grad_norm_var": 0.032246907552083336, + "learning_rate": 0.0001, + "loss": 3.873, + "loss/crossentropy": 1.8100510239601135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18192324042320251, + "step": 29570 + }, + { + "epoch": 0.59144, + "grad_norm": 1.890625, + "grad_norm_var": 0.031160481770833335, + "learning_rate": 0.0001, + "loss": 3.6655, + "loss/crossentropy": 1.7124195098876953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16115359216928482, + "step": 29572 + }, + { + "epoch": 0.59148, + "grad_norm": 1.9375, + "grad_norm_var": 0.03163960774739583, + "learning_rate": 0.0001, + "loss": 3.9359, + "loss/crossentropy": 2.2159979939460754, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20147743076086044, + "step": 29574 + }, + { + "epoch": 0.59152, + "grad_norm": 1.9609375, + "grad_norm_var": 0.009722646077473958, + "learning_rate": 0.0001, + "loss": 4.2278, + "loss/crossentropy": 2.195053517818451, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20202681422233582, + "step": 29576 + }, + { + "epoch": 0.59156, + "grad_norm": 1.8515625, + "grad_norm_var": 0.007806142171223958, + "learning_rate": 0.0001, + "loss": 4.1261, + "loss/crossentropy": 1.992495834827423, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18599994480609894, + "step": 29578 + }, + { + "epoch": 0.5916, + "grad_norm": 2.03125, + "grad_norm_var": 0.008147939046223959, + "learning_rate": 0.0001, + "loss": 4.0063, + "loss/crossentropy": 2.1876922845840454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.197817362844944, + "step": 29580 + }, + { + "epoch": 0.59164, + "grad_norm": 2.03125, + "grad_norm_var": 0.008455403645833333, + "learning_rate": 0.0001, + "loss": 4.1484, + "loss/crossentropy": 2.1962021589279175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2175462618470192, + "step": 29582 + }, + { + "epoch": 0.59168, + "grad_norm": 1.90625, + "grad_norm_var": 0.011242421468098958, + "learning_rate": 0.0001, + "loss": 3.6774, + "loss/crossentropy": 1.9215996265411377, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18740177899599075, + "step": 29584 + }, + { + "epoch": 0.59172, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007787068684895833, + "learning_rate": 0.0001, + "loss": 3.862, + "loss/crossentropy": 1.6820173263549805, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1574452817440033, + "step": 29586 + }, + { + "epoch": 0.59176, + "grad_norm": 1.9296875, + "grad_norm_var": 0.007877349853515625, + "learning_rate": 0.0001, + "loss": 4.0942, + "loss/crossentropy": 2.1028271913528442, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.187875434756279, + "step": 29588 + }, + { + "epoch": 0.5918, + "grad_norm": 1.859375, + "grad_norm_var": 0.007865142822265626, + "learning_rate": 0.0001, + "loss": 3.7429, + "loss/crossentropy": 1.9593722224235535, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871783211827278, + "step": 29590 + }, + { + "epoch": 0.59184, + "grad_norm": 1.953125, + "grad_norm_var": 0.007901763916015625, + "learning_rate": 0.0001, + "loss": 4.1708, + "loss/crossentropy": 2.1829755902290344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22405223548412323, + "step": 29592 + }, + { + "epoch": 0.59188, + "grad_norm": 2.03125, + "grad_norm_var": 0.007330067952473958, + "learning_rate": 0.0001, + "loss": 3.9746, + "loss/crossentropy": 1.9692201018333435, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18905997276306152, + "step": 29594 + }, + { + "epoch": 0.59192, + "grad_norm": 1.9765625, + "grad_norm_var": 0.008931223551432292, + "learning_rate": 0.0001, + "loss": 4.1957, + "loss/crossentropy": 2.341002106666565, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21306651830673218, + "step": 29596 + }, + { + "epoch": 0.59196, + "grad_norm": 2.03125, + "grad_norm_var": 0.010453287760416667, + "learning_rate": 0.0001, + "loss": 4.1617, + "loss/crossentropy": 1.9179102778434753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1874692291021347, + "step": 29598 + }, + { + "epoch": 0.592, + "grad_norm": 2.328125, + "grad_norm_var": 0.015421295166015625, + "learning_rate": 0.0001, + "loss": 4.036, + "loss/crossentropy": 1.8973752856254578, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19447319954633713, + "step": 29600 + }, + { + "epoch": 0.59204, + "grad_norm": 1.8359375, + "grad_norm_var": 0.016501617431640626, + "learning_rate": 0.0001, + "loss": 3.8856, + "loss/crossentropy": 1.8788208365440369, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16276411712169647, + "step": 29602 + }, + { + "epoch": 0.59208, + "grad_norm": 1.9453125, + "grad_norm_var": 0.01578547159830729, + "learning_rate": 0.0001, + "loss": 3.9585, + "loss/crossentropy": 1.5552323460578918, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1751468926668167, + "step": 29604 + }, + { + "epoch": 0.59212, + "grad_norm": 1.921875, + "grad_norm_var": 0.014273834228515626, + "learning_rate": 0.0001, + "loss": 4.0705, + "loss/crossentropy": 1.906869649887085, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18109184503555298, + "step": 29606 + }, + { + "epoch": 0.59216, + "grad_norm": 1.8984375, + "grad_norm_var": 0.014764149983723959, + "learning_rate": 0.0001, + "loss": 4.1151, + "loss/crossentropy": 2.23270046710968, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.23064442723989487, + "step": 29608 + }, + { + "epoch": 0.5922, + "grad_norm": 2.0625, + "grad_norm_var": 0.01395263671875, + "learning_rate": 0.0001, + "loss": 4.0758, + "loss/crossentropy": 2.187607169151306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1993623673915863, + "step": 29610 + }, + { + "epoch": 0.59224, + "grad_norm": 2.046875, + "grad_norm_var": 0.013425445556640625, + "learning_rate": 0.0001, + "loss": 4.1025, + "loss/crossentropy": 2.1429349184036255, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19814343750476837, + "step": 29612 + }, + { + "epoch": 0.59228, + "grad_norm": 1.859375, + "grad_norm_var": 0.014899698893229167, + "learning_rate": 0.0001, + "loss": 3.6302, + "loss/crossentropy": 1.9003735780715942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17693869769573212, + "step": 29614 + }, + { + "epoch": 0.59232, + "grad_norm": 2.265625, + "grad_norm_var": 0.0130523681640625, + "learning_rate": 0.0001, + "loss": 3.991, + "loss/crossentropy": 2.2013859152793884, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19231925159692764, + "step": 29616 + }, + { + "epoch": 0.59236, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01234130859375, + "learning_rate": 0.0001, + "loss": 3.9533, + "loss/crossentropy": 1.3149051070213318, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1432316154241562, + "step": 29618 + }, + { + "epoch": 0.5924, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013728841145833334, + "learning_rate": 0.0001, + "loss": 4.0781, + "loss/crossentropy": 2.099441707134247, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19838576018810272, + "step": 29620 + }, + { + "epoch": 0.59244, + "grad_norm": 1.765625, + "grad_norm_var": 0.015970865885416668, + "learning_rate": 0.0001, + "loss": 3.9236, + "loss/crossentropy": 1.9029900431632996, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2029390037059784, + "step": 29622 + }, + { + "epoch": 0.59248, + "grad_norm": 1.8203125, + "grad_norm_var": 0.01672337849934896, + "learning_rate": 0.0001, + "loss": 4.0291, + "loss/crossentropy": 2.159297227859497, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1991179883480072, + "step": 29624 + }, + { + "epoch": 0.59252, + "grad_norm": 1.921875, + "grad_norm_var": 0.014964803059895834, + "learning_rate": 0.0001, + "loss": 4.1908, + "loss/crossentropy": 2.0404341220855713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1894593983888626, + "step": 29626 + }, + { + "epoch": 0.59256, + "grad_norm": 1.921875, + "grad_norm_var": 0.012286122639973958, + "learning_rate": 0.0001, + "loss": 4.0989, + "loss/crossentropy": 2.2824409008026123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19680628180503845, + "step": 29628 + }, + { + "epoch": 0.5926, + "grad_norm": 1.921875, + "grad_norm_var": 0.011905924479166666, + "learning_rate": 0.0001, + "loss": 4.131, + "loss/crossentropy": 2.2231918573379517, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1871175393462181, + "step": 29630 + }, + { + "epoch": 0.59264, + "grad_norm": 1.796875, + "grad_norm_var": 0.0030263264973958335, + "learning_rate": 0.0001, + "loss": 3.9817, + "loss/crossentropy": 1.688015341758728, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1651618853211403, + "step": 29632 + }, + { + "epoch": 0.59268, + "grad_norm": 1.90625, + "grad_norm_var": 0.003173828125, + "learning_rate": 0.0001, + "loss": 3.915, + "loss/crossentropy": 1.8363903760910034, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1848619431257248, + "step": 29634 + }, + { + "epoch": 0.59272, + "grad_norm": 1.90625, + "grad_norm_var": 0.0029937744140625, + "learning_rate": 0.0001, + "loss": 3.7981, + "loss/crossentropy": 2.2170976400375366, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.199843168258667, + "step": 29636 + }, + { + "epoch": 0.59276, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0033770243326822916, + "learning_rate": 0.0001, + "loss": 4.0177, + "loss/crossentropy": 1.707352876663208, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17033107578754425, + "step": 29638 + }, + { + "epoch": 0.5928, + "grad_norm": 1.96875, + "grad_norm_var": 0.003295644124348958, + "learning_rate": 0.0001, + "loss": 4.1408, + "loss/crossentropy": 2.418645143508911, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20665095001459122, + "step": 29640 + }, + { + "epoch": 0.59284, + "grad_norm": 2.046875, + "grad_norm_var": 0.00450439453125, + "learning_rate": 0.0001, + "loss": 4.0686, + "loss/crossentropy": 2.0012764930725098, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17807253450155258, + "step": 29642 + }, + { + "epoch": 0.59288, + "grad_norm": 1.859375, + "grad_norm_var": 0.0054013570149739586, + "learning_rate": 0.0001, + "loss": 4.0358, + "loss/crossentropy": 2.4282820224761963, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22419389337301254, + "step": 29644 + }, + { + "epoch": 0.59292, + "grad_norm": 1.90625, + "grad_norm_var": 0.005669911702473958, + "learning_rate": 0.0001, + "loss": 3.9593, + "loss/crossentropy": 1.9802210927009583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18829302489757538, + "step": 29646 + }, + { + "epoch": 0.59296, + "grad_norm": 1.8671875, + "grad_norm_var": 0.004959869384765625, + "learning_rate": 0.0001, + "loss": 4.1733, + "loss/crossentropy": 2.1611366271972656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20142409205436707, + "step": 29648 + }, + { + "epoch": 0.593, + "grad_norm": 1.84375, + "grad_norm_var": 0.006009674072265625, + "learning_rate": 0.0001, + "loss": 3.7293, + "loss/crossentropy": 2.0067127346992493, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1892910748720169, + "step": 29650 + }, + { + "epoch": 0.59304, + "grad_norm": 1.875, + "grad_norm_var": 0.006591796875, + "learning_rate": 0.0001, + "loss": 3.9099, + "loss/crossentropy": 1.8918231129646301, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17595022916793823, + "step": 29652 + }, + { + "epoch": 0.59308, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00543212890625, + "learning_rate": 0.0001, + "loss": 4.0838, + "loss/crossentropy": 2.310000419616699, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20750866830348969, + "step": 29654 + }, + { + "epoch": 0.59312, + "grad_norm": 1.953125, + "grad_norm_var": 0.005448404947916667, + "learning_rate": 0.0001, + "loss": 3.7038, + "loss/crossentropy": 2.0080875158309937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18810434639453888, + "step": 29656 + }, + { + "epoch": 0.59316, + "grad_norm": 1.8359375, + "grad_norm_var": 0.0039670308430989586, + "learning_rate": 0.0001, + "loss": 3.8419, + "loss/crossentropy": 1.9152463674545288, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17956481873989105, + "step": 29658 + }, + { + "epoch": 0.5932, + "grad_norm": 1.828125, + "grad_norm_var": 0.002982330322265625, + "learning_rate": 0.0001, + "loss": 3.9465, + "loss/crossentropy": 2.019734025001526, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1654059886932373, + "step": 29660 + }, + { + "epoch": 0.59324, + "grad_norm": 1.9609375, + "grad_norm_var": 0.003242746988932292, + "learning_rate": 0.0001, + "loss": 3.9672, + "loss/crossentropy": 1.9415631294250488, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18466275930404663, + "step": 29662 + }, + { + "epoch": 0.59328, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0037737528483072916, + "learning_rate": 0.0001, + "loss": 4.0275, + "loss/crossentropy": 2.246294617652893, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20313822478055954, + "step": 29664 + }, + { + "epoch": 0.59332, + "grad_norm": 2.0625, + "grad_norm_var": 0.005242665608723958, + "learning_rate": 0.0001, + "loss": 4.1661, + "loss/crossentropy": 1.8673067688941956, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1858188509941101, + "step": 29666 + }, + { + "epoch": 0.59336, + "grad_norm": 1.8671875, + "grad_norm_var": 0.00509033203125, + "learning_rate": 0.0001, + "loss": 4.1556, + "loss/crossentropy": 1.9696037769317627, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1936260536313057, + "step": 29668 + }, + { + "epoch": 0.5934, + "grad_norm": 2.015625, + "grad_norm_var": 0.10892512003580729, + "learning_rate": 0.0001, + "loss": 4.1505, + "loss/crossentropy": 2.2883976697921753, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22077778726816177, + "step": 29670 + }, + { + "epoch": 0.59344, + "grad_norm": 1.8515625, + "grad_norm_var": 0.1094378153483073, + "learning_rate": 0.0001, + "loss": 3.9617, + "loss/crossentropy": 2.11427104473114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1877901256084442, + "step": 29672 + }, + { + "epoch": 0.59348, + "grad_norm": 1.8359375, + "grad_norm_var": 0.1084307352701823, + "learning_rate": 0.0001, + "loss": 4.02, + "loss/crossentropy": 2.256463050842285, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1980907917022705, + "step": 29674 + }, + { + "epoch": 0.59352, + "grad_norm": 2.125, + "grad_norm_var": 0.10796890258789063, + "learning_rate": 0.0001, + "loss": 3.926, + "loss/crossentropy": 2.0554774403572083, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19413713365793228, + "step": 29676 + }, + { + "epoch": 0.59356, + "grad_norm": 2.015625, + "grad_norm_var": 0.10651626586914062, + "learning_rate": 0.0001, + "loss": 3.9095, + "loss/crossentropy": 2.0619908571243286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20501630008220673, + "step": 29678 + }, + { + "epoch": 0.5936, + "grad_norm": 1.8671875, + "grad_norm_var": 0.10798314412434896, + "learning_rate": 0.0001, + "loss": 3.8964, + "loss/crossentropy": 2.0001166462898254, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1859305128455162, + "step": 29680 + }, + { + "epoch": 0.59364, + "grad_norm": 1.84375, + "grad_norm_var": 0.11613667805989583, + "learning_rate": 0.0001, + "loss": 3.7957, + "loss/crossentropy": 2.0025678277015686, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18820206075906754, + "step": 29682 + }, + { + "epoch": 0.59368, + "grad_norm": 1.84375, + "grad_norm_var": 0.11686604817708333, + "learning_rate": 0.0001, + "loss": 4.0391, + "loss/crossentropy": 2.0470725297927856, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1986207813024521, + "step": 29684 + }, + { + "epoch": 0.59372, + "grad_norm": 2.015625, + "grad_norm_var": 0.010619862874348959, + "learning_rate": 0.0001, + "loss": 3.7557, + "loss/crossentropy": 1.9196028113365173, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19124136865139008, + "step": 29686 + }, + { + "epoch": 0.59376, + "grad_norm": 1.96875, + "grad_norm_var": 0.013315582275390625, + "learning_rate": 0.0001, + "loss": 3.6694, + "loss/crossentropy": 1.624564528465271, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15498200803995132, + "step": 29688 + }, + { + "epoch": 0.5938, + "grad_norm": 1.8828125, + "grad_norm_var": 0.012060546875, + "learning_rate": 0.0001, + "loss": 4.0355, + "loss/crossentropy": 2.147800922393799, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19418001919984818, + "step": 29690 + }, + { + "epoch": 0.59384, + "grad_norm": 1.953125, + "grad_norm_var": 0.008719635009765626, + "learning_rate": 0.0001, + "loss": 4.0047, + "loss/crossentropy": 2.1515848636627197, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17781265825033188, + "step": 29692 + }, + { + "epoch": 0.59388, + "grad_norm": 1.9765625, + "grad_norm_var": 0.0083740234375, + "learning_rate": 0.0001, + "loss": 3.9553, + "loss/crossentropy": 2.034223735332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19139179587364197, + "step": 29694 + }, + { + "epoch": 0.59392, + "grad_norm": 2.0, + "grad_norm_var": 0.009226226806640625, + "learning_rate": 0.0001, + "loss": 3.9402, + "loss/crossentropy": 1.8839016556739807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1837429627776146, + "step": 29696 + }, + { + "epoch": 0.59396, + "grad_norm": 1.96875, + "grad_norm_var": 0.006878407796223959, + "learning_rate": 0.0001, + "loss": 4.2001, + "loss/crossentropy": 2.367957592010498, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21486172825098038, + "step": 29698 + }, + { + "epoch": 0.594, + "grad_norm": 1.7890625, + "grad_norm_var": 0.008062489827473958, + "learning_rate": 0.0001, + "loss": 3.8494, + "loss/crossentropy": 2.02916818857193, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17551806569099426, + "step": 29700 + }, + { + "epoch": 0.59404, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009384918212890624, + "learning_rate": 0.0001, + "loss": 4.1648, + "loss/crossentropy": 1.8610569834709167, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17259076982736588, + "step": 29702 + }, + { + "epoch": 0.59408, + "grad_norm": 1.8203125, + "grad_norm_var": 0.007420857747395833, + "learning_rate": 0.0001, + "loss": 4.0775, + "loss/crossentropy": 1.8655198812484741, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17619968950748444, + "step": 29704 + }, + { + "epoch": 0.59412, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007775624593098958, + "learning_rate": 0.0001, + "loss": 4.0338, + "loss/crossentropy": 1.7603416442871094, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.15166223049163818, + "step": 29706 + }, + { + "epoch": 0.59416, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007726796468098958, + "learning_rate": 0.0001, + "loss": 3.8432, + "loss/crossentropy": 1.9816042184829712, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975991204380989, + "step": 29708 + }, + { + "epoch": 0.5942, + "grad_norm": 1.9453125, + "grad_norm_var": 0.0070574442545572914, + "learning_rate": 0.0001, + "loss": 4.1009, + "loss/crossentropy": 2.2220958471298218, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20013637095689774, + "step": 29710 + }, + { + "epoch": 0.59424, + "grad_norm": 2.203125, + "grad_norm_var": 0.012719472249348959, + "learning_rate": 0.0001, + "loss": 4.0315, + "loss/crossentropy": 1.947887122631073, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19075323641300201, + "step": 29712 + }, + { + "epoch": 0.59428, + "grad_norm": 1.796875, + "grad_norm_var": 0.014286295572916666, + "learning_rate": 0.0001, + "loss": 4.0852, + "loss/crossentropy": 2.3358702659606934, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21242079883813858, + "step": 29714 + }, + { + "epoch": 0.59432, + "grad_norm": 1.828125, + "grad_norm_var": 0.01927464803059896, + "learning_rate": 0.0001, + "loss": 4.1379, + "loss/crossentropy": 1.746841311454773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1843656674027443, + "step": 29716 + }, + { + "epoch": 0.59436, + "grad_norm": 1.875, + "grad_norm_var": 0.0187255859375, + "learning_rate": 0.0001, + "loss": 4.1713, + "loss/crossentropy": 2.283482313156128, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2116861194372177, + "step": 29718 + }, + { + "epoch": 0.5944, + "grad_norm": 1.90625, + "grad_norm_var": 0.016927083333333332, + "learning_rate": 0.0001, + "loss": 4.0167, + "loss/crossentropy": 1.927943468093872, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18120041489601135, + "step": 29720 + }, + { + "epoch": 0.59444, + "grad_norm": 1.8046875, + "grad_norm_var": 0.021219889322916668, + "learning_rate": 0.0001, + "loss": 3.6467, + "loss/crossentropy": 1.8562633395195007, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1756514087319374, + "step": 29722 + }, + { + "epoch": 0.59448, + "grad_norm": 1.8359375, + "grad_norm_var": 0.021862538655598958, + "learning_rate": 0.0001, + "loss": 3.7351, + "loss/crossentropy": 2.0243565440177917, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1839814931154251, + "step": 29724 + }, + { + "epoch": 0.59452, + "grad_norm": 1.8671875, + "grad_norm_var": 0.023227691650390625, + "learning_rate": 0.0001, + "loss": 4.0603, + "loss/crossentropy": 2.136290669441223, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19461742788553238, + "step": 29726 + }, + { + "epoch": 0.59456, + "grad_norm": 1.9296875, + "grad_norm_var": 0.017308553059895832, + "learning_rate": 0.0001, + "loss": 4.1118, + "loss/crossentropy": 2.1178980469703674, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19608290493488312, + "step": 29728 + }, + { + "epoch": 0.5946, + "grad_norm": 2.015625, + "grad_norm_var": 0.016209920247395832, + "learning_rate": 0.0001, + "loss": 4.1603, + "loss/crossentropy": 2.1932308673858643, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19451629370450974, + "step": 29730 + }, + { + "epoch": 0.59464, + "grad_norm": 2.21875, + "grad_norm_var": 0.014890289306640625, + "learning_rate": 0.0001, + "loss": 4.1111, + "loss/crossentropy": 2.140673279762268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19651110470294952, + "step": 29732 + }, + { + "epoch": 0.59468, + "grad_norm": 1.953125, + "grad_norm_var": 0.013423411051432292, + "learning_rate": 0.0001, + "loss": 4.0438, + "loss/crossentropy": 2.174296498298645, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2079356089234352, + "step": 29734 + }, + { + "epoch": 0.59472, + "grad_norm": 1.953125, + "grad_norm_var": 0.015730539957682293, + "learning_rate": 0.0001, + "loss": 4.0483, + "loss/crossentropy": 1.7323570847511292, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18022552132606506, + "step": 29736 + }, + { + "epoch": 0.59476, + "grad_norm": 1.90625, + "grad_norm_var": 0.011229451497395833, + "learning_rate": 0.0001, + "loss": 3.9672, + "loss/crossentropy": 2.0798590779304504, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19130367040634155, + "step": 29738 + }, + { + "epoch": 0.5948, + "grad_norm": 1.9921875, + "grad_norm_var": 0.009837849934895834, + "learning_rate": 0.0001, + "loss": 3.9728, + "loss/crossentropy": 2.1414352655410767, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19515492022037506, + "step": 29740 + }, + { + "epoch": 0.59484, + "grad_norm": 1.875, + "grad_norm_var": 0.01082763671875, + "learning_rate": 0.0001, + "loss": 3.8085, + "loss/crossentropy": 2.0827420949935913, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1836787909269333, + "step": 29742 + }, + { + "epoch": 0.59488, + "grad_norm": 2.015625, + "grad_norm_var": 0.010960896809895834, + "learning_rate": 0.0001, + "loss": 4.1457, + "loss/crossentropy": 2.2884727716445923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20683631300926208, + "step": 29744 + }, + { + "epoch": 0.59492, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010838826497395834, + "learning_rate": 0.0001, + "loss": 3.9837, + "loss/crossentropy": 1.6755734086036682, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17120809853076935, + "step": 29746 + }, + { + "epoch": 0.59496, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0085205078125, + "learning_rate": 0.0001, + "loss": 4.1012, + "loss/crossentropy": 1.8576670289039612, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19007987529039383, + "step": 29748 + }, + { + "epoch": 0.595, + "grad_norm": 1.9140625, + "grad_norm_var": 0.009227498372395834, + "learning_rate": 0.0001, + "loss": 4.0576, + "loss/crossentropy": 1.9877051711082458, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18861526250839233, + "step": 29750 + }, + { + "epoch": 0.59504, + "grad_norm": 1.90625, + "grad_norm_var": 0.0072591145833333336, + "learning_rate": 0.0001, + "loss": 3.9335, + "loss/crossentropy": 2.4191300868988037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2063479870557785, + "step": 29752 + }, + { + "epoch": 0.59508, + "grad_norm": 1.9765625, + "grad_norm_var": 0.007279459635416667, + "learning_rate": 0.0001, + "loss": 4.1469, + "loss/crossentropy": 2.145687997341156, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19573777168989182, + "step": 29754 + }, + { + "epoch": 0.59512, + "grad_norm": 1.6640625, + "grad_norm_var": 0.010823313395182292, + "learning_rate": 0.0001, + "loss": 3.7671, + "loss/crossentropy": 1.956217110157013, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17622292786836624, + "step": 29756 + }, + { + "epoch": 0.59516, + "grad_norm": 1.8125, + "grad_norm_var": 0.013716634114583333, + "learning_rate": 0.0001, + "loss": 3.976, + "loss/crossentropy": 2.0639008283615112, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18797758221626282, + "step": 29758 + }, + { + "epoch": 0.5952, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013073476155598958, + "learning_rate": 0.0001, + "loss": 3.9458, + "loss/crossentropy": 2.0404372811317444, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18267738819122314, + "step": 29760 + }, + { + "epoch": 0.59524, + "grad_norm": 2.09375, + "grad_norm_var": 0.015543619791666666, + "learning_rate": 0.0001, + "loss": 3.9625, + "loss/crossentropy": 2.0870869159698486, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18522879481315613, + "step": 29762 + }, + { + "epoch": 0.59528, + "grad_norm": 2.015625, + "grad_norm_var": 0.013602701822916667, + "learning_rate": 0.0001, + "loss": 4.0244, + "loss/crossentropy": 2.0854042172431946, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.172662615776062, + "step": 29764 + }, + { + "epoch": 0.59532, + "grad_norm": 1.8828125, + "grad_norm_var": 0.014050038655598958, + "learning_rate": 0.0001, + "loss": 3.9533, + "loss/crossentropy": 2.012739658355713, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20193946361541748, + "step": 29766 + }, + { + "epoch": 0.59536, + "grad_norm": 1.8984375, + "grad_norm_var": 0.013834635416666666, + "learning_rate": 0.0001, + "loss": 3.7903, + "loss/crossentropy": 1.8651488423347473, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19326791167259216, + "step": 29768 + }, + { + "epoch": 0.5954, + "grad_norm": 2.046875, + "grad_norm_var": 0.015258534749348959, + "learning_rate": 0.0001, + "loss": 3.9921, + "loss/crossentropy": 1.8395410776138306, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17881877720355988, + "step": 29770 + }, + { + "epoch": 0.59544, + "grad_norm": 1.8984375, + "grad_norm_var": 0.010518137613932292, + "learning_rate": 0.0001, + "loss": 3.7816, + "loss/crossentropy": 1.7089250087738037, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19955646991729736, + "step": 29772 + }, + { + "epoch": 0.59548, + "grad_norm": 1.890625, + "grad_norm_var": 0.007122548421223959, + "learning_rate": 0.0001, + "loss": 4.0978, + "loss/crossentropy": 2.233728766441345, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20136447995901108, + "step": 29774 + }, + { + "epoch": 0.59552, + "grad_norm": 1.8125, + "grad_norm_var": 0.008324178059895833, + "learning_rate": 0.0001, + "loss": 4.0012, + "loss/crossentropy": 2.104355573654175, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19211959093809128, + "step": 29776 + }, + { + "epoch": 0.59556, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0061075846354166664, + "learning_rate": 0.0001, + "loss": 3.931, + "loss/crossentropy": 1.9304287433624268, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1887596696615219, + "step": 29778 + }, + { + "epoch": 0.5956, + "grad_norm": 1.890625, + "grad_norm_var": 0.004878489176432291, + "learning_rate": 0.0001, + "loss": 3.9997, + "loss/crossentropy": 1.9446918368339539, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17917344719171524, + "step": 29780 + }, + { + "epoch": 0.59564, + "grad_norm": 1.8828125, + "grad_norm_var": 0.004196929931640625, + "learning_rate": 0.0001, + "loss": 4.0013, + "loss/crossentropy": 2.2256661653518677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21204601228237152, + "step": 29782 + }, + { + "epoch": 0.59568, + "grad_norm": 2.109375, + "grad_norm_var": 0.0061431884765625, + "learning_rate": 0.0001, + "loss": 4.0934, + "loss/crossentropy": 2.1072696447372437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2179524451494217, + "step": 29784 + }, + { + "epoch": 0.59572, + "grad_norm": 1.8984375, + "grad_norm_var": 0.0048411051432291664, + "learning_rate": 0.0001, + "loss": 4.0207, + "loss/crossentropy": 2.2444673776626587, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19245705753564835, + "step": 29786 + }, + { + "epoch": 0.59576, + "grad_norm": 2.09375, + "grad_norm_var": 0.006833648681640625, + "learning_rate": 0.0001, + "loss": 4.2724, + "loss/crossentropy": 2.5239516496658325, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22635143250226974, + "step": 29788 + }, + { + "epoch": 0.5958, + "grad_norm": 1.921875, + "grad_norm_var": 0.006883748372395833, + "learning_rate": 0.0001, + "loss": 3.9709, + "loss/crossentropy": 1.7270027995109558, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18143896013498306, + "step": 29790 + }, + { + "epoch": 0.59584, + "grad_norm": 1.859375, + "grad_norm_var": 0.006170399983723958, + "learning_rate": 0.0001, + "loss": 3.9258, + "loss/crossentropy": 1.9974133372306824, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19476915150880814, + "step": 29792 + }, + { + "epoch": 0.59588, + "grad_norm": 2.015625, + "grad_norm_var": 0.006601715087890625, + "learning_rate": 0.0001, + "loss": 3.999, + "loss/crossentropy": 1.9523651003837585, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17940453439950943, + "step": 29794 + }, + { + "epoch": 0.59592, + "grad_norm": 1.84375, + "grad_norm_var": 0.0069580078125, + "learning_rate": 0.0001, + "loss": 3.7075, + "loss/crossentropy": 1.8218488097190857, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17136383056640625, + "step": 29796 + }, + { + "epoch": 0.59596, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007098134358723958, + "learning_rate": 0.0001, + "loss": 4.0404, + "loss/crossentropy": 2.1148566603660583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21685151010751724, + "step": 29798 + }, + { + "epoch": 0.596, + "grad_norm": 1.8671875, + "grad_norm_var": 0.004833984375, + "learning_rate": 0.0001, + "loss": 4.0628, + "loss/crossentropy": 2.4410229921340942, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404956489801407, + "step": 29800 + }, + { + "epoch": 0.59604, + "grad_norm": 1.953125, + "grad_norm_var": 0.005378214518229166, + "learning_rate": 0.0001, + "loss": 3.8664, + "loss/crossentropy": 1.9751408100128174, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18364927917718887, + "step": 29802 + }, + { + "epoch": 0.59608, + "grad_norm": 1.8671875, + "grad_norm_var": 0.0031402587890625, + "learning_rate": 0.0001, + "loss": 3.861, + "loss/crossentropy": 2.2535789012908936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2096666395664215, + "step": 29804 + }, + { + "epoch": 0.59612, + "grad_norm": 1.7734375, + "grad_norm_var": 0.004142252604166666, + "learning_rate": 0.0001, + "loss": 3.817, + "loss/crossentropy": 2.2020750641822815, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21073434501886368, + "step": 29806 + }, + { + "epoch": 0.59616, + "grad_norm": 1.8515625, + "grad_norm_var": 0.0040934244791666664, + "learning_rate": 0.0001, + "loss": 4.0591, + "loss/crossentropy": 2.0761327147483826, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1929018497467041, + "step": 29808 + }, + { + "epoch": 0.5962, + "grad_norm": 2.015625, + "grad_norm_var": 0.004064687093098958, + "learning_rate": 0.0001, + "loss": 4.1519, + "loss/crossentropy": 2.014641046524048, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1975310891866684, + "step": 29810 + }, + { + "epoch": 0.59624, + "grad_norm": 1.921875, + "grad_norm_var": 0.005182902018229167, + "learning_rate": 0.0001, + "loss": 3.8542, + "loss/crossentropy": 1.9420717358589172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1935308501124382, + "step": 29812 + }, + { + "epoch": 0.59628, + "grad_norm": 1.8671875, + "grad_norm_var": 0.005399576822916667, + "learning_rate": 0.0001, + "loss": 3.829, + "loss/crossentropy": 1.7394835352897644, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16872025281190872, + "step": 29814 + }, + { + "epoch": 0.59632, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008980305989583333, + "learning_rate": 0.0001, + "loss": 4.012, + "loss/crossentropy": 2.0532928705215454, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20437174290418625, + "step": 29816 + }, + { + "epoch": 0.59636, + "grad_norm": 1.984375, + "grad_norm_var": 0.008292388916015626, + "learning_rate": 0.0001, + "loss": 3.8792, + "loss/crossentropy": 1.7072120904922485, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16668207198381424, + "step": 29818 + }, + { + "epoch": 0.5964, + "grad_norm": 1.953125, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.2173, + "loss/crossentropy": 2.362375855445862, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20404937118291855, + "step": 29820 + }, + { + "epoch": 0.59644, + "grad_norm": 1.8359375, + "grad_norm_var": 0.006894683837890625, + "learning_rate": 0.0001, + "loss": 3.9963, + "loss/crossentropy": 2.334542155265808, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21750033646821976, + "step": 29822 + }, + { + "epoch": 0.59648, + "grad_norm": 1.9765625, + "grad_norm_var": 0.006463368733723958, + "learning_rate": 0.0001, + "loss": 4.0611, + "loss/crossentropy": 2.1138614416122437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2092488631606102, + "step": 29824 + }, + { + "epoch": 0.59652, + "grad_norm": 1.921875, + "grad_norm_var": 0.006184895833333333, + "learning_rate": 0.0001, + "loss": 3.8785, + "loss/crossentropy": 1.7644943594932556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18817196786403656, + "step": 29826 + }, + { + "epoch": 0.59656, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005407460530598958, + "learning_rate": 0.0001, + "loss": 4.1779, + "loss/crossentropy": 2.1820908784866333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20322736352682114, + "step": 29828 + }, + { + "epoch": 0.5966, + "grad_norm": 1.953125, + "grad_norm_var": 0.0041656494140625, + "learning_rate": 0.0001, + "loss": 3.8968, + "loss/crossentropy": 2.146401524543762, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19272416085004807, + "step": 29830 + }, + { + "epoch": 0.59664, + "grad_norm": 1.984375, + "grad_norm_var": 0.0023943583170572915, + "learning_rate": 0.0001, + "loss": 3.9237, + "loss/crossentropy": 1.8045800924301147, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1904245838522911, + "step": 29832 + }, + { + "epoch": 0.59668, + "grad_norm": 1.765625, + "grad_norm_var": 0.00426025390625, + "learning_rate": 0.0001, + "loss": 3.9487, + "loss/crossentropy": 2.107872247695923, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18807457387447357, + "step": 29834 + }, + { + "epoch": 0.59672, + "grad_norm": 1.7734375, + "grad_norm_var": 0.006048329671223958, + "learning_rate": 0.0001, + "loss": 3.9153, + "loss/crossentropy": 2.2815098762512207, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2112163081765175, + "step": 29836 + }, + { + "epoch": 0.59676, + "grad_norm": 2.0, + "grad_norm_var": 0.006145985921223959, + "learning_rate": 0.0001, + "loss": 4.1667, + "loss/crossentropy": 2.242028594017029, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2006349116563797, + "step": 29838 + }, + { + "epoch": 0.5968, + "grad_norm": 1.953125, + "grad_norm_var": 0.006029256184895833, + "learning_rate": 0.0001, + "loss": 3.9325, + "loss/crossentropy": 2.2297927141189575, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2074139192700386, + "step": 29840 + }, + { + "epoch": 0.59684, + "grad_norm": 2.046875, + "grad_norm_var": 0.007071940104166666, + "learning_rate": 0.0001, + "loss": 4.1403, + "loss/crossentropy": 2.2018691301345825, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21267356723546982, + "step": 29842 + }, + { + "epoch": 0.59688, + "grad_norm": 1.828125, + "grad_norm_var": 0.007819620768229167, + "learning_rate": 0.0001, + "loss": 3.8575, + "loss/crossentropy": 2.0878920555114746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18239793926477432, + "step": 29844 + }, + { + "epoch": 0.59692, + "grad_norm": 1.9609375, + "grad_norm_var": 0.007917277018229167, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 2.0679028034210205, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20400486141443253, + "step": 29846 + }, + { + "epoch": 0.59696, + "grad_norm": 1.6328125, + "grad_norm_var": 0.010227203369140625, + "learning_rate": 0.0001, + "loss": 3.781, + "loss/crossentropy": 2.0496456623077393, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2035546898841858, + "step": 29848 + }, + { + "epoch": 0.597, + "grad_norm": 1.90625, + "grad_norm_var": 0.009325917561848958, + "learning_rate": 0.0001, + "loss": 4.0158, + "loss/crossentropy": 2.0204665660858154, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1830250397324562, + "step": 29850 + }, + { + "epoch": 0.59704, + "grad_norm": 1.796875, + "grad_norm_var": 0.009325154622395833, + "learning_rate": 0.0001, + "loss": 3.6884, + "loss/crossentropy": 2.0521149039268494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18589533865451813, + "step": 29852 + }, + { + "epoch": 0.59708, + "grad_norm": 1.84375, + "grad_norm_var": 0.009506988525390624, + "learning_rate": 0.0001, + "loss": 3.6348, + "loss/crossentropy": 1.8169047832489014, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17271429300308228, + "step": 29854 + }, + { + "epoch": 0.59712, + "grad_norm": 1.84375, + "grad_norm_var": 0.008514149983723959, + "learning_rate": 0.0001, + "loss": 4.0186, + "loss/crossentropy": 2.029230833053589, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18239567428827286, + "step": 29856 + }, + { + "epoch": 0.59716, + "grad_norm": 1.71875, + "grad_norm_var": 0.0059893290201822914, + "learning_rate": 0.0001, + "loss": 3.97, + "loss/crossentropy": 2.0296765565872192, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1822112277150154, + "step": 29858 + }, + { + "epoch": 0.5972, + "grad_norm": 1.859375, + "grad_norm_var": 0.00657958984375, + "learning_rate": 0.0001, + "loss": 3.9411, + "loss/crossentropy": 1.9933258891105652, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1953386887907982, + "step": 29860 + }, + { + "epoch": 0.59724, + "grad_norm": 2.171875, + "grad_norm_var": 0.014229329427083333, + "learning_rate": 0.0001, + "loss": 3.9819, + "loss/crossentropy": 1.9453927278518677, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19596701860427856, + "step": 29862 + }, + { + "epoch": 0.59728, + "grad_norm": 1.890625, + "grad_norm_var": 0.011185455322265624, + "learning_rate": 0.0001, + "loss": 3.8056, + "loss/crossentropy": 1.714016318321228, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17374874651432037, + "step": 29864 + }, + { + "epoch": 0.59732, + "grad_norm": 1.8671875, + "grad_norm_var": 0.01087646484375, + "learning_rate": 0.0001, + "loss": 3.6872, + "loss/crossentropy": 2.0186676383018494, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19331404566764832, + "step": 29866 + }, + { + "epoch": 0.59736, + "grad_norm": 1.71875, + "grad_norm_var": 0.011922200520833334, + "learning_rate": 0.0001, + "loss": 3.8642, + "loss/crossentropy": 1.9068130850791931, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18482722342014313, + "step": 29868 + }, + { + "epoch": 0.5974, + "grad_norm": 1.90625, + "grad_norm_var": 0.014092763264973959, + "learning_rate": 0.0001, + "loss": 4.0994, + "loss/crossentropy": 2.192137658596039, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19123639166355133, + "step": 29870 + }, + { + "epoch": 0.59744, + "grad_norm": 1.7734375, + "grad_norm_var": 0.014788564046223958, + "learning_rate": 0.0001, + "loss": 3.5734, + "loss/crossentropy": 1.859050452709198, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19843916594982147, + "step": 29872 + }, + { + "epoch": 0.59748, + "grad_norm": 2.28125, + "grad_norm_var": 0.021469879150390624, + "learning_rate": 0.0001, + "loss": 4.1619, + "loss/crossentropy": 1.5558834075927734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17953873425722122, + "step": 29874 + }, + { + "epoch": 0.59752, + "grad_norm": 1.8828125, + "grad_norm_var": 0.021274566650390625, + "learning_rate": 0.0001, + "loss": 4.2012, + "loss/crossentropy": 2.1445621252059937, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20609040558338165, + "step": 29876 + }, + { + "epoch": 0.59756, + "grad_norm": 1.8984375, + "grad_norm_var": 0.01730931599934896, + "learning_rate": 0.0001, + "loss": 4.0528, + "loss/crossentropy": 1.8768232464790344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16527344286441803, + "step": 29878 + }, + { + "epoch": 0.5976, + "grad_norm": 1.796875, + "grad_norm_var": 0.018024698893229166, + "learning_rate": 0.0001, + "loss": 3.8604, + "loss/crossentropy": 1.830302894115448, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17598530650138855, + "step": 29880 + }, + { + "epoch": 0.59764, + "grad_norm": 1.875, + "grad_norm_var": 0.018115234375, + "learning_rate": 0.0001, + "loss": 3.9474, + "loss/crossentropy": 2.0607933402061462, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18962115049362183, + "step": 29882 + }, + { + "epoch": 0.59768, + "grad_norm": 1.984375, + "grad_norm_var": 0.015657552083333335, + "learning_rate": 0.0001, + "loss": 3.8516, + "loss/crossentropy": 2.2717671394348145, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21351712197065353, + "step": 29884 + }, + { + "epoch": 0.59772, + "grad_norm": 1.765625, + "grad_norm_var": 0.014383951822916666, + "learning_rate": 0.0001, + "loss": 3.8209, + "loss/crossentropy": 2.0096693634986877, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1812792867422104, + "step": 29886 + }, + { + "epoch": 0.59776, + "grad_norm": 1.90625, + "grad_norm_var": 0.0130035400390625, + "learning_rate": 0.0001, + "loss": 3.8955, + "loss/crossentropy": 1.9364767670631409, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19597072899341583, + "step": 29888 + }, + { + "epoch": 0.5978, + "grad_norm": 1.9296875, + "grad_norm_var": 0.003348541259765625, + "learning_rate": 0.0001, + "loss": 4.1403, + "loss/crossentropy": 2.1214417219161987, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1933349445462227, + "step": 29890 + }, + { + "epoch": 0.59784, + "grad_norm": 1.8984375, + "grad_norm_var": 0.007013956705729167, + "learning_rate": 0.0001, + "loss": 4.2015, + "loss/crossentropy": 1.9339659214019775, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1958373337984085, + "step": 29892 + }, + { + "epoch": 0.59788, + "grad_norm": 2.0625, + "grad_norm_var": 0.009913889567057292, + "learning_rate": 0.0001, + "loss": 4.3786, + "loss/crossentropy": 1.940473735332489, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18802516907453537, + "step": 29894 + }, + { + "epoch": 0.59792, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009230295817057291, + "learning_rate": 0.0001, + "loss": 3.8581, + "loss/crossentropy": 2.048341751098633, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18541985005140305, + "step": 29896 + }, + { + "epoch": 0.59796, + "grad_norm": 1.9296875, + "grad_norm_var": 0.008780924479166667, + "learning_rate": 0.0001, + "loss": 4.1022, + "loss/crossentropy": 1.9213348031044006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17794149369001389, + "step": 29898 + }, + { + "epoch": 0.598, + "grad_norm": 1.8828125, + "grad_norm_var": 0.008807118733723958, + "learning_rate": 0.0001, + "loss": 4.0137, + "loss/crossentropy": 2.160265564918518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19693263620138168, + "step": 29900 + }, + { + "epoch": 0.59804, + "grad_norm": 1.90625, + "grad_norm_var": 0.007718658447265625, + "learning_rate": 0.0001, + "loss": 4.1329, + "loss/crossentropy": 2.3574010133743286, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20197532325983047, + "step": 29902 + }, + { + "epoch": 0.59808, + "grad_norm": 1.90625, + "grad_norm_var": 0.007505035400390625, + "learning_rate": 0.0001, + "loss": 4.0925, + "loss/crossentropy": 2.2992867827415466, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21149636805057526, + "step": 29904 + }, + { + "epoch": 0.59812, + "grad_norm": 2.015625, + "grad_norm_var": 0.007328033447265625, + "learning_rate": 0.0001, + "loss": 4.0631, + "loss/crossentropy": 2.1905906200408936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21494387090206146, + "step": 29906 + }, + { + "epoch": 0.59816, + "grad_norm": 1.9140625, + "grad_norm_var": 0.007808176676432291, + "learning_rate": 0.0001, + "loss": 3.6926, + "loss/crossentropy": 2.0027430057525635, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19078665226697922, + "step": 29908 + }, + { + "epoch": 0.5982, + "grad_norm": 2.046875, + "grad_norm_var": 0.007346343994140625, + "learning_rate": 0.0001, + "loss": 4.0699, + "loss/crossentropy": 2.5675058364868164, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21340946853160858, + "step": 29910 + }, + { + "epoch": 0.59824, + "grad_norm": 1.8359375, + "grad_norm_var": 0.007746378580729167, + "learning_rate": 0.0001, + "loss": 4.0047, + "loss/crossentropy": 1.6819973587989807, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17643968760967255, + "step": 29912 + }, + { + "epoch": 0.59828, + "grad_norm": 1.953125, + "grad_norm_var": 0.008499908447265624, + "learning_rate": 0.0001, + "loss": 4.0427, + "loss/crossentropy": 1.9997480511665344, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1763923391699791, + "step": 29914 + }, + { + "epoch": 0.59832, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009580230712890625, + "learning_rate": 0.0001, + "loss": 3.9184, + "loss/crossentropy": 2.02975070476532, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19244793057441711, + "step": 29916 + }, + { + "epoch": 0.59836, + "grad_norm": 1.984375, + "grad_norm_var": 0.009405263264973958, + "learning_rate": 0.0001, + "loss": 3.903, + "loss/crossentropy": 1.9599812030792236, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20076382905244827, + "step": 29918 + }, + { + "epoch": 0.5984, + "grad_norm": 2.0625, + "grad_norm_var": 0.010119374593098958, + "learning_rate": 0.0001, + "loss": 4.0101, + "loss/crossentropy": 2.106861114501953, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18403494358062744, + "step": 29920 + }, + { + "epoch": 0.59844, + "grad_norm": 1.8125, + "grad_norm_var": 0.009830474853515625, + "learning_rate": 0.0001, + "loss": 3.9163, + "loss/crossentropy": 2.0991535782814026, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18648552894592285, + "step": 29922 + }, + { + "epoch": 0.59848, + "grad_norm": 1.859375, + "grad_norm_var": 0.0095123291015625, + "learning_rate": 0.0001, + "loss": 4.0917, + "loss/crossentropy": 2.205238461494446, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.192887581884861, + "step": 29924 + }, + { + "epoch": 0.59852, + "grad_norm": 1.8671875, + "grad_norm_var": 0.006807200113932292, + "learning_rate": 0.0001, + "loss": 4.1419, + "loss/crossentropy": 1.8452889919281006, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17111057043075562, + "step": 29926 + }, + { + "epoch": 0.59856, + "grad_norm": 1.890625, + "grad_norm_var": 0.0066640218098958336, + "learning_rate": 0.0001, + "loss": 3.8361, + "loss/crossentropy": 2.1874170303344727, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20274078845977783, + "step": 29928 + }, + { + "epoch": 0.5986, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009671783447265625, + "learning_rate": 0.0001, + "loss": 4.0676, + "loss/crossentropy": 2.0940569639205933, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19653251022100449, + "step": 29930 + }, + { + "epoch": 0.59864, + "grad_norm": 1.8515625, + "grad_norm_var": 0.009440104166666666, + "learning_rate": 0.0001, + "loss": 4.0597, + "loss/crossentropy": 2.1147854328155518, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19174247980117798, + "step": 29932 + }, + { + "epoch": 0.59868, + "grad_norm": 1.8125, + "grad_norm_var": 0.009279123942057292, + "learning_rate": 0.0001, + "loss": 3.8874, + "loss/crossentropy": 1.9983789324760437, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19001173973083496, + "step": 29934 + }, + { + "epoch": 0.59872, + "grad_norm": 1.9453125, + "grad_norm_var": 0.007264963785807292, + "learning_rate": 0.0001, + "loss": 3.8884, + "loss/crossentropy": 2.0145105123519897, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1948821321129799, + "step": 29936 + }, + { + "epoch": 0.59876, + "grad_norm": 1.7578125, + "grad_norm_var": 0.0087890625, + "learning_rate": 0.0001, + "loss": 3.9149, + "loss/crossentropy": 2.0878870487213135, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19328270852565765, + "step": 29938 + }, + { + "epoch": 0.5988, + "grad_norm": 2.0, + "grad_norm_var": 0.008177693684895833, + "learning_rate": 0.0001, + "loss": 3.8649, + "loss/crossentropy": 2.0433109998703003, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1775122806429863, + "step": 29940 + }, + { + "epoch": 0.59884, + "grad_norm": 1.8671875, + "grad_norm_var": 0.016060384114583333, + "learning_rate": 0.0001, + "loss": 4.339, + "loss/crossentropy": 2.45761239528656, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20690447837114334, + "step": 29942 + }, + { + "epoch": 0.59888, + "grad_norm": 2.171875, + "grad_norm_var": 0.019052886962890626, + "learning_rate": 0.0001, + "loss": 4.2211, + "loss/crossentropy": 2.149196147918701, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.22804293036460876, + "step": 29944 + }, + { + "epoch": 0.59892, + "grad_norm": 1.8203125, + "grad_norm_var": 0.0178131103515625, + "learning_rate": 0.0001, + "loss": 3.668, + "loss/crossentropy": 2.208665132522583, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1886969953775406, + "step": 29946 + }, + { + "epoch": 0.59896, + "grad_norm": 1.859375, + "grad_norm_var": 0.017479451497395833, + "learning_rate": 0.0001, + "loss": 3.9187, + "loss/crossentropy": 2.1634607315063477, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2016885057091713, + "step": 29948 + }, + { + "epoch": 0.599, + "grad_norm": 1.7890625, + "grad_norm_var": 0.018651326497395832, + "learning_rate": 0.0001, + "loss": 4.0225, + "loss/crossentropy": 2.289479374885559, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.209372416138649, + "step": 29950 + }, + { + "epoch": 0.59904, + "grad_norm": 1.8359375, + "grad_norm_var": 0.019535064697265625, + "learning_rate": 0.0001, + "loss": 3.9492, + "loss/crossentropy": 2.1859498023986816, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2030099779367447, + "step": 29952 + }, + { + "epoch": 0.59908, + "grad_norm": 1.8515625, + "grad_norm_var": 0.017496744791666668, + "learning_rate": 0.0001, + "loss": 3.8509, + "loss/crossentropy": 2.1713619232177734, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18735715746879578, + "step": 29954 + }, + { + "epoch": 0.59912, + "grad_norm": 1.9609375, + "grad_norm_var": 0.0174468994140625, + "learning_rate": 0.0001, + "loss": 4.1315, + "loss/crossentropy": 2.0735312700271606, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20961667597293854, + "step": 29956 + }, + { + "epoch": 0.59916, + "grad_norm": 1.828125, + "grad_norm_var": 0.009969075520833334, + "learning_rate": 0.0001, + "loss": 4.0316, + "loss/crossentropy": 1.6638267636299133, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16829392313957214, + "step": 29958 + }, + { + "epoch": 0.5992, + "grad_norm": 1.8828125, + "grad_norm_var": 0.005635325113932292, + "learning_rate": 0.0001, + "loss": 4.0428, + "loss/crossentropy": 1.924504578113556, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1855188086628914, + "step": 29960 + }, + { + "epoch": 0.59924, + "grad_norm": 2.0625, + "grad_norm_var": 0.0067942301432291664, + "learning_rate": 0.0001, + "loss": 4.0467, + "loss/crossentropy": 2.2729889154434204, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21448159217834473, + "step": 29962 + }, + { + "epoch": 0.59928, + "grad_norm": 1.796875, + "grad_norm_var": 0.007609049479166667, + "learning_rate": 0.0001, + "loss": 3.8208, + "loss/crossentropy": 1.9595959186553955, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21665186434984207, + "step": 29964 + }, + { + "epoch": 0.59932, + "grad_norm": 2.0625, + "grad_norm_var": 0.007002512613932292, + "learning_rate": 0.0001, + "loss": 3.9934, + "loss/crossentropy": 2.208333909511566, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20341281592845917, + "step": 29966 + }, + { + "epoch": 0.59936, + "grad_norm": 1.9296875, + "grad_norm_var": 0.00604248046875, + "learning_rate": 0.0001, + "loss": 3.9484, + "loss/crossentropy": 2.0344181060791016, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19173699617385864, + "step": 29968 + }, + { + "epoch": 0.5994, + "grad_norm": 1.7421875, + "grad_norm_var": 0.008063761393229167, + "learning_rate": 0.0001, + "loss": 4.0233, + "loss/crossentropy": 1.9575786590576172, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16551142930984497, + "step": 29970 + }, + { + "epoch": 0.59944, + "grad_norm": 1.828125, + "grad_norm_var": 0.008174387613932292, + "learning_rate": 0.0001, + "loss": 3.7948, + "loss/crossentropy": 1.7442238330841064, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.16575023531913757, + "step": 29972 + }, + { + "epoch": 0.59948, + "grad_norm": 1.8359375, + "grad_norm_var": 0.00819091796875, + "learning_rate": 0.0001, + "loss": 3.9174, + "loss/crossentropy": 2.3819799423217773, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21309742331504822, + "step": 29974 + }, + { + "epoch": 0.59952, + "grad_norm": 1.9296875, + "grad_norm_var": 0.009165191650390625, + "learning_rate": 0.0001, + "loss": 3.8633, + "loss/crossentropy": 2.097830832004547, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17393425852060318, + "step": 29976 + }, + { + "epoch": 0.59956, + "grad_norm": 1.9375, + "grad_norm_var": 0.007382965087890625, + "learning_rate": 0.0001, + "loss": 3.7568, + "loss/crossentropy": 1.771178424358368, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18638163059949875, + "step": 29978 + }, + { + "epoch": 0.5996, + "grad_norm": 1.96875, + "grad_norm_var": 0.007289377848307291, + "learning_rate": 0.0001, + "loss": 4.2791, + "loss/crossentropy": 2.419429302215576, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20359767973423004, + "step": 29980 + }, + { + "epoch": 0.59964, + "grad_norm": 1.890625, + "grad_norm_var": 0.005881500244140625, + "learning_rate": 0.0001, + "loss": 3.7571, + "loss/crossentropy": 2.0326388478279114, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20969726145267487, + "step": 29982 + }, + { + "epoch": 0.59968, + "grad_norm": 1.9453125, + "grad_norm_var": 0.005983225504557292, + "learning_rate": 0.0001, + "loss": 4.0842, + "loss/crossentropy": 1.9649672508239746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18691973388195038, + "step": 29984 + }, + { + "epoch": 0.59972, + "grad_norm": 2.03125, + "grad_norm_var": 0.00643310546875, + "learning_rate": 0.0001, + "loss": 3.8939, + "loss/crossentropy": 1.9001884460449219, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.18768402934074402, + "step": 29986 + }, + { + "epoch": 0.59976, + "grad_norm": 1.859375, + "grad_norm_var": 0.006208292643229167, + "learning_rate": 0.0001, + "loss": 3.8919, + "loss/crossentropy": 2.0225003361701965, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.2066223919391632, + "step": 29988 + }, + { + "epoch": 0.5998, + "grad_norm": 1.796875, + "grad_norm_var": 0.006451161702473959, + "learning_rate": 0.0001, + "loss": 3.9285, + "loss/crossentropy": 2.175786852836609, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.20807426422834396, + "step": 29990 + }, + { + "epoch": 0.59984, + "grad_norm": 1.71875, + "grad_norm_var": 0.007228342692057291, + "learning_rate": 0.0001, + "loss": 3.7598, + "loss/crossentropy": 1.842478632926941, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.17608553171157837, + "step": 29992 + }, + { + "epoch": 0.59988, + "grad_norm": 1.796875, + "grad_norm_var": 0.007746378580729167, + "learning_rate": 0.0001, + "loss": 3.8669, + "loss/crossentropy": 2.1033515334129333, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.19302091002464294, + "step": 29994 + }, + { + "epoch": 0.59992, + "grad_norm": 1.921875, + "grad_norm_var": 0.006990559895833333, + "learning_rate": 0.0001, + "loss": 4.0286, + "loss/crossentropy": 2.3058249950408936, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1858331635594368, + "step": 29996 + }, + { + "epoch": 0.59996, + "grad_norm": 1.890625, + "grad_norm_var": 0.006200917561848958, + "learning_rate": 0.0001, + "loss": 4.1295, + "loss/crossentropy": 2.3059096336364746, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.21149063110351562, + "step": 29998 + }, + { + "epoch": 0.6, + "grad_norm": 1.90625, + "grad_norm_var": 0.006029256184895833, + "learning_rate": 0.0001, + "loss": 3.9016, + "loss/crossentropy": 2.0354926586151123, + "loss/hidden": 0.0, + "loss/jsd": 0.0, + "loss/logits": 0.1987614408135414, + "step": 30000 + } + ], + "logging_steps": 2, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.12442965983232e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}