{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5,
  "eval_steps": 250,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001,
      "grad_norm": 0.000537872314453125,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 0.0002,
      "loss/crossentropy": 0.8766392022371292,
      "loss/hidden": 0.0,
      "loss/logits": 0.00021765431665698998,
      "step": 1
    },
    {
      "epoch": 0.002,
      "grad_norm": 0.2265625,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 0.005,
      "loss/crossentropy": 1.9883175492286682,
      "loss/hidden": 0.0039215087890625,
      "loss/logits": 0.001088879187591374,
      "step": 2
    },
    {
      "epoch": 0.003,
      "grad_norm": 0.25390625,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.0052,
      "loss/crossentropy": 1.8020615577697754,
      "loss/hidden": 0.004180908203125,
      "loss/logits": 0.0010398300073575228,
      "step": 3
    },
    {
      "epoch": 0.004,
      "grad_norm": 0.255859375,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.0049,
      "loss/crossentropy": 1.0764193534851074,
      "loss/hidden": 0.00399017333984375,
      "loss/logits": 0.0008995172393042594,
      "step": 4
    },
    {
      "epoch": 0.005,
      "grad_norm": 0.224609375,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0049,
      "loss/crossentropy": 1.7853868007659912,
      "loss/hidden": 0.0038604736328125,
      "loss/logits": 0.0010730837238952518,
      "step": 5
    },
    {
      "epoch": 0.006,
      "grad_norm": 0.2333984375,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.0051,
      "loss/crossentropy": 2.4102118015289307,
      "loss/hidden": 0.00388336181640625,
      "loss/logits": 0.0011915687937289476,
      "step": 6
    },
    {
      "epoch": 0.007,
      "grad_norm": 0.35546875,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 0.0056,
      "loss/crossentropy": 1.9921993017196655,
      "loss/hidden": 0.0044403076171875,
      "loss/logits": 0.0011139529524371028,
      "step": 7
    },
    {
      "epoch": 0.008,
      "grad_norm": 0.2353515625,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.0049,
      "loss/crossentropy": 2.269957184791565,
      "loss/hidden": 0.00376129150390625,
      "loss/logits": 0.0011444001575000584,
      "step": 8
    },
    {
      "epoch": 0.009,
      "grad_norm": 0.22265625,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.0051,
      "loss/crossentropy": 2.1889681220054626,
      "loss/hidden": 0.0038909912109375,
      "loss/logits": 0.0011716101435013115,
      "step": 9
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.291015625,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0052,
      "loss/crossentropy": 1.76205712556839,
      "loss/hidden": 0.0041351318359375,
      "loss/logits": 0.001058999594533816,
      "step": 10
    },
    {
      "epoch": 0.011,
      "grad_norm": 0.2177734375,
      "learning_rate": 2.2e-06,
      "loss": 0.0049,
      "loss/crossentropy": 2.438264012336731,
      "loss/hidden": 0.003753662109375,
      "loss/logits": 0.0011843050015158951,
      "step": 11
    },
    {
      "epoch": 0.012,
      "grad_norm": 0.41015625,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.0071,
      "loss/crossentropy": 1.8871825337409973,
      "loss/hidden": 0.0059051513671875,
      "loss/logits": 0.0011930759064853191,
      "step": 12
    },
    {
      "epoch": 0.013,
      "grad_norm": 0.53125,
      "learning_rate": 2.6e-06,
      "loss": 0.0084,
      "loss/crossentropy": 1.7400972247123718,
      "loss/hidden": 0.0071258544921875,
      "loss/logits": 0.001270102453418076,
      "step": 13
    },
    {
      "epoch": 0.014,
      "grad_norm": 0.365234375,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.0075,
      "loss/crossentropy": 2.0053656101226807,
      "loss/hidden": 0.006256103515625,
      "loss/logits": 0.0012446122709661722,
      "step": 14
    },
    {
      "epoch": 0.015,
      "grad_norm": 0.455078125,
      "learning_rate": 3e-06,
      "loss": 0.0072,
      "loss/crossentropy": 1.984630048274994,
      "loss/hidden": 0.0059356689453125,
      "loss/logits": 0.0012947238283231854,
      "step": 15
    },
    {
      "epoch": 0.016,
      "grad_norm": 0.447265625,
      "grad_norm_var": 0.016307008621940136,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.0072,
      "loss/crossentropy": 2.4732788801193237,
      "loss/hidden": 0.005767822265625,
      "loss/logits": 0.00144299550447613,
      "step": 16
    },
    {
      "epoch": 0.017,
      "grad_norm": 0.89453125,
      "grad_norm_var": 0.031113270918528238,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.0076,
      "loss/crossentropy": 1.7775737643241882,
      "loss/hidden": 0.006317138671875,
      "loss/logits": 0.001260987774003297,
      "step": 17
    },
    {
      "epoch": 0.018,
      "grad_norm": 0.45703125,
      "grad_norm_var": 0.030601243178049724,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0067,
      "loss/crossentropy": 1.1123631671071053,
      "loss/hidden": 0.0057373046875,
      "loss/logits": 0.0009507400100119412,
      "step": 18
    },
    {
      "epoch": 0.019,
      "grad_norm": 0.298828125,
      "grad_norm_var": 0.030057998498280843,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.0068,
      "loss/crossentropy": 1.8855515718460083,
      "loss/hidden": 0.0055694580078125,
      "loss/logits": 0.0012491169618442655,
      "step": 19
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.3984375,
      "grad_norm_var": 0.02918777068456014,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.007,
      "loss/crossentropy": 1.773246705532074,
      "loss/hidden": 0.005828857421875,
      "loss/logits": 0.0011664124322123826,
      "step": 20
    },
    {
      "epoch": 0.021,
      "grad_norm": 0.302734375,
      "grad_norm_var": 0.02797787586847941,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 0.0069,
      "loss/crossentropy": 2.1012651920318604,
      "loss/hidden": 0.0056610107421875,
      "loss/logits": 0.0012796117807738483,
      "step": 21
    },
    {
      "epoch": 0.022,
      "grad_norm": 0.486328125,
      "grad_norm_var": 0.026955906550089517,
      "learning_rate": 4.4e-06,
      "loss": 0.0101,
      "loss/crossentropy": 1.9430513381958008,
      "loss/hidden": 0.008514404296875,
      "loss/logits": 0.0016175230266526341,
      "step": 22
    },
    {
      "epoch": 0.023,
      "grad_norm": 0.609375,
      "grad_norm_var": 0.029542907079060873,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.0118,
      "loss/crossentropy": 1.5989271998405457,
      "loss/hidden": 0.01025390625,
      "loss/logits": 0.0015109491650946438,
      "step": 23
    },
    {
      "epoch": 0.024,
      "grad_norm": 0.80078125,
      "grad_norm_var": 0.03606090148289998,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.0102,
      "loss/crossentropy": 1.141058474779129,
      "loss/hidden": 0.009033203125,
      "loss/logits": 0.0011210083321202546,
      "step": 24
    },
    {
      "epoch": 0.025,
      "grad_norm": 0.361328125,
      "grad_norm_var": 0.03307259480158488,
      "learning_rate": 5e-06,
      "loss": 0.0094,
      "loss/crossentropy": 2.0950170755386353,
      "loss/hidden": 0.0077972412109375,
      "loss/logits": 0.001559894997626543,
      "step": 25
    },
    {
      "epoch": 0.026,
      "grad_norm": 0.83984375,
      "grad_norm_var": 0.0396828293800354,
      "learning_rate": 5.2e-06,
      "loss": 0.0112,
      "loss/crossentropy": 0.9552253857254982,
      "loss/hidden": 0.010284423828125,
      "loss/logits": 0.0008805262332316488,
      "step": 26
    },
    {
      "epoch": 0.027,
      "grad_norm": 0.546875,
      "grad_norm_var": 0.034408044815063474,
      "learning_rate": 5.400000000000001e-06,
      "loss": 0.0091,
      "loss/crossentropy": 1.3719437271356583,
      "loss/hidden": 0.007965087890625,
      "loss/logits": 0.001155910431407392,
      "step": 27
    },
    {
      "epoch": 0.028,
      "grad_norm": 0.73046875,
      "grad_norm_var": 0.036436065038045244,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.0107,
      "loss/crossentropy": 1.6477643251419067,
      "loss/hidden": 0.009185791015625,
      "loss/logits": 0.0015593590214848518,
      "step": 28
    },
    {
      "epoch": 0.029,
      "grad_norm": 0.41796875,
      "grad_norm_var": 0.03726207415262858,
      "learning_rate": 5.8e-06,
      "loss": 0.0096,
      "loss/crossentropy": 1.7987680435180664,
      "loss/hidden": 0.008087158203125,
      "loss/logits": 0.0015162223717197776,
      "step": 29
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.33203125,
      "grad_norm_var": 0.03804162343343099,
      "learning_rate": 6e-06,
      "loss": 0.0094,
      "loss/crossentropy": 1.74210923910141,
      "loss/hidden": 0.008026123046875,
      "loss/logits": 0.0013514517340809107,
      "step": 30
    },
    {
      "epoch": 0.031,
      "grad_norm": 0.4296875,
      "grad_norm_var": 0.038314167658487955,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.0095,
      "loss/crossentropy": 1.45715793967247,
      "loss/hidden": 0.0081329345703125,
      "loss/logits": 0.0013754194369539618,
      "step": 31
    },
    {
      "epoch": 0.032,
      "grad_norm": 0.54296875,
      "grad_norm_var": 0.03793176015218099,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.0137,
      "loss/crossentropy": 1.635874330997467,
      "loss/hidden": 0.01190185546875,
      "loss/logits": 0.0017871989402920008,
      "step": 32
    },
    {
      "epoch": 0.033,
      "grad_norm": 0.76171875,
      "grad_norm_var": 0.03254489898681641,
      "learning_rate": 6.600000000000001e-06,
      "loss": 0.0143,
      "loss/crossentropy": 1.0347481966018677,
      "loss/hidden": 0.01300048828125,
      "loss/logits": 0.0012789819156751037,
      "step": 33
    },
    {
      "epoch": 0.034,
      "grad_norm": 0.515625,
      "grad_norm_var": 0.032269287109375,
      "learning_rate": 6.800000000000001e-06,
      "loss": 0.0132,
      "loss/crossentropy": 2.0032879114151,
      "loss/hidden": 0.011383056640625,
      "loss/logits": 0.0018645224627107382,
      "step": 34
    },
    {
      "epoch": 0.035,
      "grad_norm": 1.0703125,
      "grad_norm_var": 0.04636419614156087,
      "learning_rate": 7e-06,
      "loss": 0.0143,
      "loss/crossentropy": 1.8410796523094177,
      "loss/hidden": 0.01226806640625,
      "loss/logits": 0.001986370305530727,
      "step": 35
    },
    {
      "epoch": 0.036,
      "grad_norm": 0.4296875,
      "grad_norm_var": 0.045703490575154625,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.0136,
      "loss/crossentropy": 1.9098870158195496,
      "loss/hidden": 0.01171875,
      "loss/logits": 0.0018596722511574626,
      "step": 36
    },
    {
      "epoch": 0.037,
      "grad_norm": 68.0,
      "grad_norm_var": 284.03319854736327,
      "learning_rate": 7.4e-06,
      "loss": 0.0558,
      "loss/crossentropy": 1.5951663255691528,
      "loss/hidden": 0.051666259765625,
      "loss/logits": 0.004160793498158455,
      "step": 37
    },
    {
      "epoch": 0.038,
      "grad_norm": 0.380859375,
      "grad_norm_var": 284.0946207046509,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.0133,
      "loss/crossentropy": 2.25837504863739,
      "loss/hidden": 0.01129150390625,
      "loss/logits": 0.0020168160554021597,
      "step": 38
    },
    {
      "epoch": 0.039,
      "grad_norm": 0.455078125,
      "grad_norm_var": 284.1822828769684,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.0126,
      "loss/crossentropy": 2.126526176929474,
      "loss/hidden": 0.0107421875,
      "loss/logits": 0.0018400833941996098,
      "step": 39
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.63671875,
      "grad_norm_var": 284.27119545936586,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.0142,
      "loss/crossentropy": 1.4863142371177673,
      "loss/hidden": 0.012481689453125,
      "loss/logits": 0.0017027563299052417,
      "step": 40
    },
    {
      "epoch": 0.041,
      "grad_norm": 0.283203125,
      "grad_norm_var": 284.3175859928131,
      "learning_rate": 8.2e-06,
      "loss": 0.0112,
      "loss/crossentropy": 2.0888695120811462,
      "loss/hidden": 0.009521484375,
      "loss/logits": 0.0017255974235013127,
      "step": 41
    },
    {
      "epoch": 0.042,
      "grad_norm": 0.431640625,
      "grad_norm_var": 284.5420877456665,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.0173,
      "loss/crossentropy": 1.611488163471222,
      "loss/hidden": 0.015380859375,
      "loss/logits": 0.0019445380312390625,
      "step": 42
    },
    {
      "epoch": 0.043,
      "grad_norm": 0.419921875,
      "grad_norm_var": 284.6142045180003,
      "learning_rate": 8.6e-06,
      "loss": 0.0166,
      "loss/crossentropy": 1.8987411260604858,
      "loss/hidden": 0.0146484375,
      "loss/logits": 0.0019467678503133357,
      "step": 43
    },
    {
      "epoch": 0.044,
      "grad_norm": 0.58203125,
      "grad_norm_var": 284.6949343204498,
      "learning_rate": 8.8e-06,
      "loss": 0.0183,
      "loss/crossentropy": 1.4084473848342896,
      "loss/hidden": 0.01605224609375,
      "loss/logits": 0.002271471545100212,
      "step": 44
    },
    {
      "epoch": 0.045,
      "grad_norm": 0.380859375,
      "grad_norm_var": 284.71635888417563,
      "learning_rate": 9e-06,
      "loss": 0.0159,
      "loss/crossentropy": 1.6970309615135193,
      "loss/hidden": 0.01397705078125,
      "loss/logits": 0.0019325784523971379,
      "step": 45
    },
    {
      "epoch": 0.046,
      "grad_norm": 0.455078125,
      "grad_norm_var": 284.64517935117084,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.0165,
      "loss/crossentropy": 2.1346731781959534,
      "loss/hidden": 0.014312744140625,
      "loss/logits": 0.002142712823115289,
      "step": 46
    },
    {
      "epoch": 0.047,
      "grad_norm": 2.21875,
      "grad_norm_var": 283.818000014623,
      "learning_rate": 9.4e-06,
      "loss": 0.0175,
      "loss/crossentropy": 1.6114214062690735,
      "loss/hidden": 0.0155029296875,
      "loss/logits": 0.0020421514636836946,
      "step": 47
    },
    {
      "epoch": 0.048,
      "grad_norm": 0.44921875,
      "grad_norm_var": 283.87235945065817,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.0157,
      "loss/crossentropy": 2.056842625141144,
      "loss/hidden": 0.013671875,
      "loss/logits": 0.0020451846066862345,
      "step": 48
    },
    {
      "epoch": 0.049,
      "grad_norm": 0.439453125,
      "grad_norm_var": 284.05417149861654,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.016,
      "loss/crossentropy": 1.5892411470413208,
      "loss/hidden": 0.013946533203125,
      "loss/logits": 0.00205704930704087,
      "step": 49
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.3359375,
      "grad_norm_var": 284.15935770670575,
      "learning_rate": 1e-05,
      "loss": 0.0153,
      "loss/crossentropy": 2.3872954845428467,
      "loss/hidden": 0.01312255859375,
      "loss/logits": 0.0021313573233783245,
      "step": 50
    },
    {
      "epoch": 0.051,
      "grad_norm": 0.451171875,
      "grad_norm_var": 284.49208029111225,
      "learning_rate": 1.02e-05,
      "loss": 0.0168,
      "loss/crossentropy": 2.0149841904640198,
      "loss/hidden": 0.01470947265625,
      "loss/logits": 0.0020815907046198845,
      "step": 51
    },
    {
      "epoch": 0.052,
      "grad_norm": 0.51953125,
      "grad_norm_var": 284.44056928952534,
      "learning_rate": 1.04e-05,
      "loss": 0.021,
      "loss/crossentropy": 1.9311216473579407,
      "loss/hidden": 0.0185546875,
      "loss/logits": 0.0024686548858880997,
      "step": 52
    },
    {
      "epoch": 0.053,
      "grad_norm": 0.546875,
      "grad_norm_var": 0.20315702756245932,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 0.0204,
      "loss/crossentropy": 1.9871841073036194,
      "loss/hidden": 0.01806640625,
      "loss/logits": 0.00237347767688334,
      "step": 53
    },
    {
      "epoch": 0.054,
      "grad_norm": 0.51171875,
      "grad_norm_var": 0.2010729471842448,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 0.0195,
      "loss/crossentropy": 1.4909774661064148,
      "loss/hidden": 0.017578125,
      "loss/logits": 0.0018839699332602322,
      "step": 54
    },
    {
      "epoch": 0.055,
      "grad_norm": 0.376953125,
      "grad_norm_var": 0.20264968872070313,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 0.0188,
      "loss/crossentropy": 1.731587290763855,
      "loss/hidden": 0.01666259765625,
      "loss/logits": 0.0021363290725275874,
      "step": 55
    },
    {
      "epoch": 0.056,
      "grad_norm": 0.482421875,
      "grad_norm_var": 0.20266098976135255,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.0198,
      "loss/crossentropy": 1.8391692638397217,
      "loss/hidden": 0.01751708984375,
      "loss/logits": 0.0022706754971295595,
      "step": 56
    },
    {
      "epoch": 0.057,
      "grad_norm": 0.82421875,
      "grad_norm_var": 0.20132694244384766,
      "learning_rate": 1.14e-05,
      "loss": 0.0181,
      "loss/crossentropy": 1.326266534626484,
      "loss/hidden": 0.01654052734375,
      "loss/logits": 0.0015604346699547023,
      "step": 57
    },
    {
      "epoch": 0.058,
      "grad_norm": 0.41015625,
      "grad_norm_var": 0.2018068790435791,
      "learning_rate": 1.16e-05,
      "loss": 0.0185,
      "loss/crossentropy": 2.5511186122894287,
      "loss/hidden": 0.01611328125,
      "loss/logits": 0.0024241225328296423,
      "step": 58
    },
    {
      "epoch": 0.059,
      "grad_norm": 1.609375,
      "grad_norm_var": 0.26361236572265623,
      "learning_rate": 1.18e-05,
      "loss": 0.0183,
      "loss/crossentropy": 1.0930684125050902,
      "loss/hidden": 0.01702880859375,
      "loss/logits": 0.0013018156460020691,
      "step": 59
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.486328125,
      "grad_norm_var": 0.2652066389719645,
      "learning_rate": 1.2e-05,
      "loss": 0.02,
      "loss/crossentropy": 2.0819135308265686,
      "loss/hidden": 0.0174560546875,
      "loss/logits": 0.0025293552316725254,
      "step": 60
    },
    {
      "epoch": 0.061,
      "grad_norm": 1.09375,
      "grad_norm_var": 0.2708051045735677,
      "learning_rate": 1.22e-05,
      "loss": 0.0183,
      "loss/crossentropy": 0.9290539920330048,
      "loss/hidden": 0.016754150390625,
      "loss/logits": 0.0015562092885375023,
      "step": 61
    },
    {
      "epoch": 0.062,
      "grad_norm": 0.453125,
      "grad_norm_var": 0.2708693027496338,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.0227,
      "loss/crossentropy": 2.1691651344299316,
      "loss/hidden": 0.01995849609375,
      "loss/logits": 0.002767750178463757,
      "step": 62
    },
    {
      "epoch": 0.063,
      "grad_norm": 0.4765625,
      "grad_norm_var": 0.10790785153706868,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 0.0233,
      "loss/crossentropy": 2.1545491218566895,
      "loss/hidden": 0.0205078125,
      "loss/logits": 0.002785824006423354,
      "step": 63
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.47265625,
      "grad_norm_var": 0.10749700864156088,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.0223,
      "loss/crossentropy": 1.9527725577354431,
      "loss/hidden": 0.01971435546875,
      "loss/logits": 0.0025634407065808773,
      "step": 64
    },
    {
      "epoch": 0.065,
      "grad_norm": 0.55078125,
      "grad_norm_var": 0.10599034627278646,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.0256,
      "loss/crossentropy": 1.8496606945991516,
      "loss/hidden": 0.02288818359375,
      "loss/logits": 0.0027499888092279434,
      "step": 65
    },
    {
      "epoch": 0.066,
      "grad_norm": 0.55859375,
      "grad_norm_var": 0.1012465794881185,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 0.0221,
      "loss/crossentropy": 1.9440131187438965,
      "loss/hidden": 0.01971435546875,
      "loss/logits": 0.002431391447316855,
      "step": 66
    },
    {
      "epoch": 0.067,
      "grad_norm": 0.498046875,
      "grad_norm_var": 0.10036614735921225,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 0.0241,
      "loss/crossentropy": 1.7777947187423706,
      "loss/hidden": 0.02142333984375,
      "loss/logits": 0.0026856372132897377,
      "step": 67
    },
    {
      "epoch": 0.068,
      "grad_norm": 0.66015625,
      "grad_norm_var": 0.09977563222249348,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.0241,
      "loss/crossentropy": 1.6634170711040497,
      "loss/hidden": 0.02178955078125,
      "loss/logits": 0.002268874435685575,
      "step": 68
    },
    {
      "epoch": 0.069,
      "grad_norm": 0.359375,
      "grad_norm_var": 0.1039443333943685,
      "learning_rate": 1.38e-05,
      "loss": 0.0217,
      "loss/crossentropy": 1.9945446252822876,
      "loss/hidden": 0.019287109375,
      "loss/logits": 0.0024602848570793867,
      "step": 69
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.546875,
      "grad_norm_var": 0.10354207356770834,
      "learning_rate": 1.4e-05,
      "loss": 0.0212,
      "loss/crossentropy": 2.234881281852722,
      "loss/hidden": 0.0185546875,
      "loss/logits": 0.0026649613864719868,
      "step": 70
    },
    {
      "epoch": 0.071,
      "grad_norm": 0.5390625,
      "grad_norm_var": 0.1000130812327067,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 0.0235,
      "loss/crossentropy": 2.3283374309539795,
      "loss/hidden": 0.0206298828125,
      "loss/logits": 0.0028440920868888497,
      "step": 71
    },
    {
      "epoch": 0.072,
      "grad_norm": 0.96484375,
      "grad_norm_var": 0.10530134836832682,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.0273,
      "loss/crossentropy": 2.446515917778015,
      "loss/hidden": 0.0244140625,
      "loss/logits": 0.002847215859219432,
      "step": 72
    },
    {
      "epoch": 0.073,
      "grad_norm": 0.66015625,
      "grad_norm_var": 0.10331465403238932,
      "learning_rate": 1.46e-05,
      "loss": 0.0313,
      "loss/crossentropy": 1.8365015387535095,
      "loss/hidden": 0.0277099609375,
      "loss/logits": 0.003543111262843013,
      "step": 73
    },
    {
      "epoch": 0.074,
      "grad_norm": 0.58203125,
      "grad_norm_var": 0.0997507095336914,
      "learning_rate": 1.48e-05,
      "loss": 0.0275,
      "loss/crossentropy": 1.8750606179237366,
      "loss/hidden": 0.0244140625,
      "loss/logits": 0.0030850095208734274,
      "step": 74
    },
    {
      "epoch": 0.075,
      "grad_norm": 0.6171875,
      "grad_norm_var": 0.03528436024983724,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.0285,
      "loss/crossentropy": 1.6197695136070251,
      "loss/hidden": 0.02557373046875,
      "loss/logits": 0.002948817447759211,
      "step": 75
    },
    {
      "epoch": 0.076,
      "grad_norm": 0.5546875,
      "grad_norm_var": 0.034586191177368164,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.0253,
      "loss/crossentropy": 2.139370322227478,
      "loss/hidden": 0.0225830078125,
      "loss/logits": 0.002709153341129422,
      "step": 76
    },
    {
      "epoch": 0.077,
      "grad_norm": 0.78125,
      "grad_norm_var": 0.020085255304972332,
      "learning_rate": 1.54e-05,
      "loss": 0.0308,
      "loss/crossentropy": 1.5335928797721863,
      "loss/hidden": 0.02777099609375,
      "loss/logits": 0.00305762467905879,
      "step": 77
    },
    {
      "epoch": 0.078,
      "grad_norm": 0.5078125,
      "grad_norm_var": 0.019349145889282226,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.0273,
      "loss/crossentropy": 2.623558282852173,
      "loss/hidden": 0.024169921875,
      "loss/logits": 0.0031643210677430034,
      "step": 78
    },
    {
      "epoch": 0.079,
      "grad_norm": 0.470703125,
      "grad_norm_var": 0.019434547424316405,
      "learning_rate": 1.58e-05,
      "loss": 0.0275,
      "loss/crossentropy": 2.3246337175369263,
      "loss/hidden": 0.0242919921875,
      "loss/logits": 0.0031679703388363123,
      "step": 79
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.431640625,
      "grad_norm_var": 0.0201418399810791,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.0254,
      "loss/crossentropy": 1.801970660686493,
      "loss/hidden": 0.0228271484375,
      "loss/logits": 0.0025987064000219107,
      "step": 80
    },
    {
      "epoch": 0.081,
      "grad_norm": 0.44921875,
      "grad_norm_var": 0.021184905370076498,
      "learning_rate": 1.62e-05,
      "loss": 0.0265,
      "loss/crossentropy": 1.9489317536354065,
      "loss/hidden": 0.02374267578125,
      "loss/logits": 0.0027701087528839707,
      "step": 81
    },
    {
      "epoch": 0.082,
      "grad_norm": 0.67578125,
      "grad_norm_var": 0.02180479367574056,
      "learning_rate": 1.64e-05,
      "loss": 0.034,
      "loss/crossentropy": 1.7697851061820984,
      "loss/hidden": 0.03070068359375,
      "loss/logits": 0.003283574478700757,
      "step": 82
    },
    {
      "epoch": 0.083,
      "grad_norm": 0.57421875,
      "grad_norm_var": 0.021323140462239584,
      "learning_rate": 1.66e-05,
      "loss": 0.0309,
      "loss/crossentropy": 1.5783970654010773,
      "loss/hidden": 0.028076171875,
      "loss/logits": 0.002809713245369494,
      "step": 83
    },
    {
      "epoch": 0.084,
      "grad_norm": 0.53125,
      "grad_norm_var": 0.02108605702718099,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.0332,
      "loss/crossentropy": 1.460361659526825,
      "loss/hidden": 0.0303955078125,
      "loss/logits": 0.0027706819819286466,
      "step": 84
    },
    {
      "epoch": 0.085,
      "grad_norm": 0.6015625,
      "grad_norm_var": 0.017696062723795574,
      "learning_rate": 1.7e-05,
      "loss": 0.0324,
      "loss/crossentropy": 2.1110434532165527,
      "loss/hidden": 0.02911376953125,
      "loss/logits": 0.0033112409291788936,
      "step": 85
    },
    {
      "epoch": 0.086,
      "grad_norm": 0.451171875,
      "grad_norm_var": 0.018857304255167642,
      "learning_rate": 1.72e-05,
      "loss": 0.0291,
      "loss/crossentropy": 1.7163687944412231,
      "loss/hidden": 0.02630615234375,
      "loss/logits": 0.0027680074563249946,
      "step": 86
    },
    {
      "epoch": 0.087,
      "grad_norm": 0.5703125,
      "grad_norm_var": 0.018718449274698894,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 0.0339,
      "loss/crossentropy": 1.8893783688545227,
      "loss/hidden": 0.03021240234375,
      "loss/logits": 0.0037144168745726347,
      "step": 87
    },
    {
      "epoch": 0.088,
      "grad_norm": 1.75,
      "grad_norm_var": 0.0965951124827067,
      "learning_rate": 1.76e-05,
      "loss": 0.0293,
      "loss/crossentropy": 1.0857177823781967,
      "loss/hidden": 0.02716064453125,
      "loss/logits": 0.002114512084517628,
      "step": 88
    },
    {
      "epoch": 0.089,
      "grad_norm": 0.4609375,
      "grad_norm_var": 0.09848872820536296,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 0.0278,
      "loss/crossentropy": 2.1670188307762146,
      "loss/hidden": 0.0250244140625,
      "loss/logits": 0.0027708488050848246,
      "step": 89
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.984375,
      "grad_norm_var": 0.4452332655588786,
      "learning_rate": 1.8e-05,
      "loss": 0.034,
      "loss/crossentropy": 0.8697951380163431,
      "loss/hidden": 0.0322265625,
      "loss/logits": 0.0017659573932178319,
      "step": 90
    },
    {
      "epoch": 0.091,
      "grad_norm": 0.58984375,
      "grad_norm_var": 0.44585811297098793,
      "learning_rate": 1.8200000000000002e-05,
      "loss": 0.0315,
      "loss/crossentropy": 2.0653520226478577,
      "loss/hidden": 0.02813720703125,
      "loss/logits": 0.003313788794912398,
      "step": 91
    },
    {
      "epoch": 0.092,
      "grad_norm": 0.66015625,
      "grad_norm_var": 0.44346858660380045,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.0352,
      "loss/crossentropy": 2.1175276041030884,
      "loss/hidden": 0.0318603515625,
      "loss/logits": 0.003378898836672306,
      "step": 92
    },
    {
      "epoch": 0.093,
      "grad_norm": 0.478515625,
      "grad_norm_var": 0.44917195638020835,
      "learning_rate": 1.86e-05,
      "loss": 0.0328,
      "loss/crossentropy": 2.192784309387207,
      "loss/hidden": 0.029296875,
      "loss/logits": 0.003497788915410638,
      "step": 93
    },
    {
      "epoch": 0.094,
      "grad_norm": 0.50390625,
      "grad_norm_var": 0.4493051528930664,
      "learning_rate": 1.88e-05,
      "loss": 0.0342,
      "loss/crossentropy": 1.8000940680503845,
      "loss/hidden": 0.0308837890625,
      "loss/logits": 0.003295119386166334,
      "step": 94
    },
    {
      "epoch": 0.095,
      "grad_norm": 0.86328125,
      "grad_norm_var": 0.44371743202209474,
      "learning_rate": 1.9e-05,
      "loss": 0.0376,
      "loss/crossentropy": 1.9514374732971191,
      "loss/hidden": 0.0340576171875,
      "loss/logits": 0.0035327656660228968,
      "step": 95
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.55859375,
      "grad_norm_var": 0.4387262980143229,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.0334,
      "loss/crossentropy": 1.7834157943725586,
      "loss/hidden": 0.03021240234375,
      "loss/logits": 0.003167669870890677,
      "step": 96
    },
    {
      "epoch": 0.097,
      "grad_norm": 0.71484375,
      "grad_norm_var": 0.4309270222981771,
      "learning_rate": 1.94e-05,
      "loss": 0.0327,
      "loss/crossentropy": 1.6889591813087463,
      "loss/hidden": 0.02972412109375,
      "loss/logits": 0.0029616469983011484,
      "step": 97
    },
    {
      "epoch": 0.098,
      "grad_norm": 0.56640625,
      "grad_norm_var": 0.4336400349934896,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.0354,
      "loss/crossentropy": 1.7813147902488708,
      "loss/hidden": 0.031982421875,
      "loss/logits": 0.003417789936065674,
      "step": 98
    },
    {
      "epoch": 0.099,
      "grad_norm": 0.9140625,
      "grad_norm_var": 0.43045953114827473,
      "learning_rate": 1.98e-05,
      "loss": 0.0376,
      "loss/crossentropy": 1.3951178789138794,
      "loss/hidden": 0.0345458984375,
      "loss/logits": 0.0030310061993077397,
      "step": 99
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.56640625,
      "grad_norm_var": 0.4291600545247396,
      "learning_rate": 2e-05,
      "loss": 0.0364,
      "loss/crossentropy": 2.255498170852661,
      "loss/hidden": 0.03277587890625,
      "loss/logits": 0.0036420804681256413,
      "step": 100
    },
    {
      "epoch": 0.101,
      "grad_norm": 0.58984375,
      "grad_norm_var": 0.429521115620931,
      "learning_rate": 2e-05,
      "loss": 0.033,
      "loss/crossentropy": 2.4104394912719727,
      "loss/hidden": 0.02960205078125,
      "loss/logits": 0.0033488960471004248,
      "step": 101
    },
    {
      "epoch": 0.102,
      "grad_norm": 4.8125,
      "grad_norm_var": 1.4001366774241129,
      "learning_rate": 2e-05,
      "loss": 0.0477,
      "loss/crossentropy": 1.0830636993050575,
      "loss/hidden": 0.0452880859375,
      "loss/logits": 0.0023841604124754667,
      "step": 102
    },
    {
      "epoch": 0.103,
      "grad_norm": 4.1875,
      "grad_norm_var": 1.9629084110260009,
      "learning_rate": 2e-05,
      "loss": 0.0475,
      "loss/crossentropy": 0.7437883876264095,
      "loss/hidden": 0.0455322265625,
      "loss/logits": 0.0019981139339506626,
      "step": 103
    },
    {
      "epoch": 0.104,
      "grad_norm": 0.77734375,
      "grad_norm_var": 1.9669294834136963,
      "learning_rate": 2e-05,
      "loss": 0.0387,
      "loss/crossentropy": 2.1284059882164,
      "loss/hidden": 0.0345458984375,
      "loss/logits": 0.00411223981063813,
      "step": 104
    },
    {
      "epoch": 0.105,
      "grad_norm": 1.6796875,
      "grad_norm_var": 1.92922043800354,
      "learning_rate": 2e-05,
      "loss": 0.0459,
      "loss/crossentropy": 2.1119471192359924,
      "loss/hidden": 0.0411376953125,
      "loss/logits": 0.0047579677775502205,
      "step": 105
    },
    {
      "epoch": 0.106,
      "grad_norm": 0.90234375,
      "grad_norm_var": 1.7437895298004151,
      "learning_rate": 2e-05,
      "loss": 0.044,
      "loss/crossentropy": 2.391239643096924,
      "loss/hidden": 0.0390625,
      "loss/logits": 0.004930721828714013,
      "step": 106
    },
    {
      "epoch": 0.107,
      "grad_norm": 1.6875,
      "grad_norm_var": 1.7282822767893473,
      "learning_rate": 2e-05,
      "loss": 0.0451,
      "loss/crossentropy": 1.7602136731147766,
      "loss/hidden": 0.040283203125,
      "loss/logits": 0.004797366913408041,
      "step": 107
    },
    {
      "epoch": 0.108,
      "grad_norm": 0.8828125,
      "grad_norm_var": 1.7130108992258708,
      "learning_rate": 2e-05,
      "loss": 0.0428,
      "loss/crossentropy": 2.0745638012886047,
      "loss/hidden": 0.0386962890625,
      "loss/logits": 0.004113797098398209,
      "step": 108
    },
    {
      "epoch": 0.109,
      "grad_norm": 0.82421875,
      "grad_norm_var": 1.6829447428385416,
      "learning_rate": 2e-05,
      "loss": 0.0422,
      "loss/crossentropy": 1.685157299041748,
      "loss/hidden": 0.03857421875,
      "loss/logits": 0.0036494951928034425,
      "step": 109
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.5703125,
      "grad_norm_var": 1.6387715021769205,
      "learning_rate": 2e-05,
      "loss": 0.0376,
      "loss/crossentropy": 2.625019073486328,
      "loss/hidden": 0.03369140625,
      "loss/logits": 0.0039150441298261285,
      "step": 110
    },
    {
      "epoch": 0.111,
      "grad_norm": 1.5234375,
      "grad_norm_var": 1.6204302469889322,
      "learning_rate": 2e-05,
      "loss": 0.0422,
      "loss/crossentropy": 0.676440417766571,
      "loss/hidden": 0.0401611328125,
      "loss/logits": 0.0020512532209977508,
      "step": 111
    },
    {
      "epoch": 0.112,
      "grad_norm": 0.65234375,
      "grad_norm_var": 1.6101824442545574,
      "learning_rate": 2e-05,
      "loss": 0.0479,
      "loss/crossentropy": 1.8928841352462769,
      "loss/hidden": 0.0435791015625,
      "loss/logits": 0.00434900657273829,
      "step": 112
    },
    {
      "epoch": 0.113,
      "grad_norm": 1.09375,
      "grad_norm_var": 1.5831150690714517,
      "learning_rate": 2e-05,
      "loss": 0.0498,
      "loss/crossentropy": 1.2006176710128784,
      "loss/hidden": 0.04638671875,
      "loss/logits": 0.0034257903462275863,
      "step": 113
    },
    {
      "epoch": 0.114,
      "grad_norm": 0.84375,
      "grad_norm_var": 1.5551775614420573,
      "learning_rate": 2e-05,
      "loss": 0.0437,
      "loss/crossentropy": 2.164067029953003,
      "loss/hidden": 0.03955078125,
      "loss/logits": 0.004164737183600664,
      "step": 114
    },
    {
      "epoch": 0.115,
      "grad_norm": 0.875,
      "grad_norm_var": 1.5581644694010417,
      "learning_rate": 2e-05,
      "loss": 0.0469,
      "loss/crossentropy": 1.963140070438385,
      "loss/hidden": 0.0419921875,
      "loss/logits": 0.004867425188422203,
      "step": 115
    },
    {
      "epoch": 0.116,
      "grad_norm": 0.83984375,
      "grad_norm_var": 1.530010732014974,
      "learning_rate": 2e-05,
      "loss": 0.0469,
| "loss/crossentropy": 1.936423420906067, |
| "loss/hidden": 0.04248046875, |
| "loss/logits": 0.004457900300621986, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 1.0, |
| "grad_norm_var": 1.4916320164998373, |
| "learning_rate": 2e-05, |
| "loss": 0.044, |
| "loss/crossentropy": 1.9027796387672424, |
| "loss/hidden": 0.0396728515625, |
| "loss/logits": 0.004306067014113069, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 0.921875, |
| "grad_norm_var": 0.724272092183431, |
| "learning_rate": 2e-05, |
| "loss": 0.048, |
| "loss/crossentropy": 1.4962169528007507, |
| "loss/hidden": 0.043212890625, |
| "loss/logits": 0.004831232130527496, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.12087090810139973, |
| "learning_rate": 2e-05, |
| "loss": 0.0458, |
| "loss/crossentropy": 1.8558754324913025, |
| "loss/hidden": 0.04150390625, |
| "loss/logits": 0.004260358400642872, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.7421875, |
| "grad_norm_var": 0.12239583333333333, |
| "learning_rate": 2e-05, |
| "loss": 0.0467, |
| "loss/crossentropy": 2.163163900375366, |
| "loss/hidden": 0.042236328125, |
| "loss/logits": 0.0044949238654226065, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 0.66796875, |
| "grad_norm_var": 0.10601139068603516, |
| "learning_rate": 2e-05, |
| "loss": 0.0429, |
| "loss/crossentropy": 1.875292718410492, |
| "loss/hidden": 0.0389404296875, |
| "loss/logits": 0.003972187405452132, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 0.97265625, |
| "grad_norm_var": 0.1052103042602539, |
| "learning_rate": 2e-05, |
| "loss": 0.0504, |
| "loss/crossentropy": 1.581692636013031, |
| "loss/hidden": 0.0462646484375, |
| "loss/logits": 0.0040856958366930485, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 0.77734375, |
| "grad_norm_var": 0.07660497029622396, |
| "learning_rate": 2e-05, |
| "loss": 0.0467, |
| "loss/crossentropy": 2.185007333755493, |
| "loss/hidden": 0.0419921875, |
| "loss/logits": 0.0047312104143202305, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 0.70703125, |
| "grad_norm_var": 0.08053887685139974, |
| "learning_rate": 2e-05, |
| "loss": 0.0527, |
| "loss/crossentropy": 1.7746418118476868, |
| "loss/hidden": 0.0482177734375, |
| "loss/logits": 0.004488097038120031, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.82421875, |
| "grad_norm_var": 0.08053887685139974, |
| "learning_rate": 2e-05, |
| "loss": 0.0483, |
| "loss/crossentropy": 1.8139249682426453, |
| "loss/hidden": 0.044189453125, |
| "loss/logits": 0.00407675513997674, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 0.80078125, |
| "grad_norm_var": 0.05464986165364583, |
| "learning_rate": 2e-05, |
| "loss": 0.0536, |
| "loss/crossentropy": 1.8078742623329163, |
| "loss/hidden": 0.0489501953125, |
| "loss/logits": 0.004657944664359093, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 0.030997467041015626, |
| "learning_rate": 2e-05, |
| "loss": 0.0496, |
| "loss/crossentropy": 2.0267322659492493, |
| "loss/hidden": 0.0447998046875, |
| "loss/logits": 0.0047590641770511866, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.85546875, |
| "grad_norm_var": 0.027347564697265625, |
| "learning_rate": 2e-05, |
| "loss": 0.0587, |
| "loss/crossentropy": 1.6603793501853943, |
| "loss/hidden": 0.052978515625, |
| "loss/logits": 0.005712392507120967, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 5.375, |
| "grad_norm_var": 1.286358388264974, |
| "learning_rate": 2e-05, |
| "loss": 0.0577, |
| "loss/crossentropy": 0.8844976872205734, |
| "loss/hidden": 0.0550537109375, |
| "loss/logits": 0.0026012896560132504, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 0.94140625, |
| "grad_norm_var": 1.2828027725219726, |
| "learning_rate": 2e-05, |
| "loss": 0.0532, |
| "loss/crossentropy": 2.151723265647888, |
| "loss/hidden": 0.04833984375, |
| "loss/logits": 0.0048982377629727125, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 0.92578125, |
| "grad_norm_var": 1.280975341796875, |
| "learning_rate": 2e-05, |
| "loss": 0.048, |
| "loss/crossentropy": 2.190707802772522, |
| "loss/hidden": 0.0435791015625, |
| "loss/logits": 0.004458446754142642, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 0.73828125, |
| "grad_norm_var": 1.2861162821451824, |
| "learning_rate": 2e-05, |
| "loss": 0.0562, |
| "loss/crossentropy": 2.0854132175445557, |
| "loss/hidden": 0.0511474609375, |
| "loss/logits": 0.005020990269258618, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 0.6796875, |
| "grad_norm_var": 1.299598185221354, |
| "learning_rate": 2e-05, |
| "loss": 0.0509, |
| "loss/crossentropy": 2.0993438959121704, |
| "loss/hidden": 0.046142578125, |
| "loss/logits": 0.004787095822393894, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 0.96875, |
| "grad_norm_var": 1.2983378092447917, |
| "learning_rate": 2e-05, |
| "loss": 0.0491, |
| "loss/crossentropy": 2.2328933477401733, |
| "loss/hidden": 0.0445556640625, |
| "loss/logits": 0.004536583088338375, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 1.0625, |
| "grad_norm_var": 1.2969581604003906, |
| "learning_rate": 2e-05, |
| "loss": 0.0638, |
| "loss/crossentropy": 1.9981300234794617, |
| "loss/hidden": 0.0579833984375, |
| "loss/logits": 0.00582107319496572, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 0.6796875, |
| "grad_norm_var": 1.3004615783691407, |
| "learning_rate": 2e-05, |
| "loss": 0.0542, |
| "loss/crossentropy": 2.1993343830108643, |
| "loss/hidden": 0.049072265625, |
| "loss/logits": 0.005134769715368748, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 3.5, |
| "grad_norm_var": 1.627500343322754, |
| "learning_rate": 2e-05, |
| "loss": 0.0595, |
| "loss/crossentropy": 1.469780683517456, |
| "loss/hidden": 0.0552978515625, |
| "loss/logits": 0.0042177007999271154, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 0.87109375, |
| "grad_norm_var": 1.632664426167806, |
| "learning_rate": 2e-05, |
| "loss": 0.0554, |
| "loss/crossentropy": 1.8814529180526733, |
| "loss/hidden": 0.0506591796875, |
| "loss/logits": 0.004711252404376864, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 0.9140625, |
| "grad_norm_var": 1.62430419921875, |
| "learning_rate": 2e-05, |
| "loss": 0.0542, |
| "loss/crossentropy": 1.9769226908683777, |
| "loss/hidden": 0.049560546875, |
| "loss/logits": 0.004602615023031831, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 1.5987385431925456, |
| "learning_rate": 2e-05, |
| "loss": 0.0562, |
| "loss/crossentropy": 1.3646953105926514, |
| "loss/hidden": 0.0516357421875, |
| "loss/logits": 0.0045162534806877375, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 0.91796875, |
| "grad_norm_var": 1.592772356669108, |
| "learning_rate": 2e-05, |
| "loss": 0.0586, |
| "loss/crossentropy": 1.5901939272880554, |
| "loss/hidden": 0.0538330078125, |
| "loss/logits": 0.004788138438016176, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 1.109375, |
| "grad_norm_var": 1.5760719299316406, |
| "learning_rate": 2e-05, |
| "loss": 0.0686, |
| "loss/crossentropy": 1.8436982035636902, |
| "loss/hidden": 0.062744140625, |
| "loss/logits": 0.005897135473787785, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 1.0, |
| "grad_norm_var": 1.5800819396972656, |
| "learning_rate": 2e-05, |
| "loss": 0.0677, |
| "loss/crossentropy": 1.7922558188438416, |
| "loss/hidden": 0.06103515625, |
| "loss/logits": 0.006622593384236097, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 1.5693745295206705, |
| "learning_rate": 2e-05, |
| "loss": 0.0626, |
| "loss/crossentropy": 1.8654756546020508, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.005447414005175233, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.8046875, |
| "grad_norm_var": 0.43840071360270183, |
| "learning_rate": 2e-05, |
| "loss": 0.0653, |
| "loss/crossentropy": 2.023370146751404, |
| "loss/hidden": 0.0596923828125, |
| "loss/logits": 0.005567178362980485, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 1.7265625, |
| "grad_norm_var": 0.4612627665201823, |
| "learning_rate": 2e-05, |
| "loss": 0.0718, |
| "loss/crossentropy": 1.2652358412742615, |
| "loss/hidden": 0.066162109375, |
| "loss/logits": 0.00563872791826725, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 0.8359375, |
| "grad_norm_var": 0.4643350601196289, |
| "learning_rate": 2e-05, |
| "loss": 0.0579, |
| "loss/crossentropy": 2.181838572025299, |
| "loss/hidden": 0.0528564453125, |
| "loss/logits": 0.0050070807337760925, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.4685035705566406, |
| "learning_rate": 2e-05, |
| "loss": 0.0653, |
| "loss/crossentropy": 1.6760476231575012, |
| "loss/hidden": 0.059814453125, |
| "loss/logits": 0.005448109935969114, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 0.875, |
| "grad_norm_var": 0.45754903157552085, |
| "learning_rate": 2e-05, |
| "loss": 0.0608, |
| "loss/crossentropy": 1.9610846042633057, |
| "loss/hidden": 0.05517578125, |
| "loss/logits": 0.0055898819118738174, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 0.45391006469726564, |
| "learning_rate": 2e-05, |
| "loss": 0.0607, |
| "loss/crossentropy": 2.0354663729667664, |
| "loss/hidden": 0.054931640625, |
| "loss/logits": 0.005750466603785753, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 0.91015625, |
| "grad_norm_var": 0.4586435317993164, |
| "learning_rate": 2e-05, |
| "loss": 0.061, |
| "loss/crossentropy": 1.5509551763534546, |
| "loss/hidden": 0.05615234375, |
| "loss/logits": 0.00486933346837759, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 0.45860640207926434, |
| "learning_rate": 2e-05, |
| "loss": 0.0823, |
| "loss/crossentropy": 1.3190861344337463, |
| "loss/hidden": 0.076171875, |
| "loss/logits": 0.006146557629108429, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.12676741282145182, |
| "learning_rate": 2e-05, |
| "loss": 0.0689, |
| "loss/crossentropy": 2.0075970888137817, |
| "loss/hidden": 0.0626220703125, |
| "loss/logits": 0.0062951259315013885, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 0.84375, |
| "grad_norm_var": 0.12790629069010417, |
| "learning_rate": 2e-05, |
| "loss": 0.0645, |
| "loss/crossentropy": 2.5025904178619385, |
| "loss/hidden": 0.0584716796875, |
| "loss/logits": 0.005998906912282109, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 1.75, |
| "grad_norm_var": 0.14317194620768228, |
| "learning_rate": 2e-05, |
| "loss": 0.0673, |
| "loss/crossentropy": 1.7674061059951782, |
| "loss/hidden": 0.0618896484375, |
| "loss/logits": 0.005377188790589571, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 0.14455540974934897, |
| "learning_rate": 2e-05, |
| "loss": 0.0696, |
| "loss/crossentropy": 1.4891575574874878, |
| "loss/hidden": 0.0640869140625, |
| "loss/logits": 0.005491052754223347, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 1.0078125, |
| "grad_norm_var": 0.1416147232055664, |
| "learning_rate": 2e-05, |
| "loss": 0.0656, |
| "loss/crossentropy": 1.4295508861541748, |
| "loss/hidden": 0.060546875, |
| "loss/logits": 0.005026416387408972, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 8.5, |
| "grad_norm_var": 3.4551263809204102, |
| "learning_rate": 2e-05, |
| "loss": 0.1047, |
| "loss/crossentropy": 1.6207728683948517, |
| "loss/hidden": 0.09716796875, |
| "loss/logits": 0.007503823610022664, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 1.3125, |
| "grad_norm_var": 3.4331842422485352, |
| "learning_rate": 2e-05, |
| "loss": 0.0663, |
| "loss/crossentropy": 1.838720440864563, |
| "loss/hidden": 0.06103515625, |
| "loss/logits": 0.0052408319897949696, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 3.403587277730306, |
| "learning_rate": 2e-05, |
| "loss": 0.0729, |
| "loss/crossentropy": 1.9572261571884155, |
| "loss/hidden": 0.06640625, |
| "loss/logits": 0.00649917172268033, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 7.71875, |
| "grad_norm_var": 5.5313720067342125, |
| "learning_rate": 2e-05, |
| "loss": 0.0873, |
| "loss/crossentropy": 0.06751747522503138, |
| "loss/hidden": 0.086181640625, |
| "loss/logits": 0.001096382096875459, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 5.535835202534994, |
| "learning_rate": 2e-05, |
| "loss": 0.0753, |
| "loss/crossentropy": 1.9767259359359741, |
| "loss/hidden": 0.06884765625, |
| "loss/logits": 0.006433435715734959, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 1.2734375, |
| "grad_norm_var": 5.470252927144369, |
| "learning_rate": 2e-05, |
| "loss": 0.0742, |
| "loss/crossentropy": 1.6337787508964539, |
| "loss/hidden": 0.068359375, |
| "loss/logits": 0.0058679585345089436, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 1.171875, |
| "grad_norm_var": 5.519557634989421, |
| "learning_rate": 2e-05, |
| "loss": 0.0791, |
| "loss/crossentropy": 1.5085630416870117, |
| "loss/hidden": 0.0732421875, |
| "loss/logits": 0.00587455416098237, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 5.454612668355306, |
| "learning_rate": 2e-05, |
| "loss": 0.0733, |
| "loss/crossentropy": 2.1295101046562195, |
| "loss/hidden": 0.0665283203125, |
| "loss/logits": 0.006821601651608944, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 0.828125, |
| "grad_norm_var": 5.523303159077963, |
| "learning_rate": 2e-05, |
| "loss": 0.0681, |
| "loss/crossentropy": 2.1514192819595337, |
| "loss/hidden": 0.061767578125, |
| "loss/logits": 0.0063285790383815765, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 0.9140625, |
| "grad_norm_var": 5.522652180989583, |
| "learning_rate": 2e-05, |
| "loss": 0.0799, |
| "loss/crossentropy": 1.907168447971344, |
| "loss/hidden": 0.072509765625, |
| "loss/logits": 0.0073654367588460445, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 0.70703125, |
| "grad_norm_var": 5.650849850972493, |
| "learning_rate": 2e-05, |
| "loss": 0.0665, |
| "loss/crossentropy": 2.490573525428772, |
| "loss/hidden": 0.0604248046875, |
| "loss/logits": 0.006123463856056333, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 0.921875, |
| "grad_norm_var": 5.727275530497233, |
| "learning_rate": 2e-05, |
| "loss": 0.0686, |
| "loss/crossentropy": 2.1971182823181152, |
| "loss/hidden": 0.0625, |
| "loss/logits": 0.006081034895032644, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.84375, |
| "grad_norm_var": 5.727275530497233, |
| "learning_rate": 2e-05, |
| "loss": 0.0723, |
| "loss/crossentropy": 1.9449633955955505, |
| "loss/hidden": 0.06591796875, |
| "loss/logits": 0.00633727153763175, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 0.80078125, |
| "grad_norm_var": 5.8211313883463545, |
| "learning_rate": 2e-05, |
| "loss": 0.0721, |
| "loss/crossentropy": 1.8933625221252441, |
| "loss/hidden": 0.066162109375, |
| "loss/logits": 0.005927694728597999, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 0.734375, |
| "grad_norm_var": 5.8664194742838545, |
| "learning_rate": 2e-05, |
| "loss": 0.0756, |
| "loss/crossentropy": 2.2961581349372864, |
| "loss/hidden": 0.069091796875, |
| "loss/logits": 0.00650426116771996, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 1.0859375, |
| "grad_norm_var": 5.856801350911458, |
| "learning_rate": 2e-05, |
| "loss": 0.0876, |
| "loss/crossentropy": 1.5580723285675049, |
| "loss/hidden": 0.080322265625, |
| "loss/logits": 0.00728521216660738, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 0.87109375, |
| "grad_norm_var": 2.8547820409138995, |
| "learning_rate": 2e-05, |
| "loss": 0.0785, |
| "loss/crossentropy": 2.4996918439865112, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.0076872315257787704, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 2.863120460510254, |
| "learning_rate": 2e-05, |
| "loss": 0.0842, |
| "loss/crossentropy": 2.341306686401367, |
| "loss/hidden": 0.075927734375, |
| "loss/logits": 0.008260179311037064, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 1.2734375, |
| "grad_norm_var": 2.859659767150879, |
| "learning_rate": 2e-05, |
| "loss": 0.0839, |
| "loss/crossentropy": 2.0976521968841553, |
| "loss/hidden": 0.075927734375, |
| "loss/logits": 0.007956868037581444, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 1.6640625, |
| "grad_norm_var": 0.09129581451416016, |
| "learning_rate": 2e-05, |
| "loss": 0.0854, |
| "loss/crossentropy": 1.5655289888381958, |
| "loss/hidden": 0.078857421875, |
| "loss/logits": 0.006505638128146529, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 0.96484375, |
| "grad_norm_var": 0.06740493774414062, |
| "learning_rate": 2e-05, |
| "loss": 0.0832, |
| "loss/crossentropy": 1.947506844997406, |
| "loss/hidden": 0.076171875, |
| "loss/logits": 0.0070168147794902325, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 0.8503774007161459, |
| "learning_rate": 2e-05, |
| "loss": 0.0965, |
| "loss/crossentropy": 1.557403326034546, |
| "loss/hidden": 0.087158203125, |
| "loss/logits": 0.009354921989142895, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 8.3125, |
| "grad_norm_var": 3.9767252604166665, |
| "learning_rate": 2e-05, |
| "loss": 0.1122, |
| "loss/crossentropy": 0.45333431661129, |
| "loss/hidden": 0.109375, |
| "loss/logits": 0.0027967533096671104, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 3.969405110677083, |
| "learning_rate": 2e-05, |
| "loss": 0.0829, |
| "loss/crossentropy": 2.005882978439331, |
| "loss/hidden": 0.075439453125, |
| "loss/logits": 0.007453362224623561, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 3.926006825764974, |
| "learning_rate": 2e-05, |
| "loss": 0.0849, |
| "loss/crossentropy": 2.199571132659912, |
| "loss/hidden": 0.077880859375, |
| "loss/logits": 0.0069826748222112656, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 3.8817014058430988, |
| "learning_rate": 2e-05, |
| "loss": 0.0921, |
| "loss/crossentropy": 1.6926537156105042, |
| "loss/hidden": 0.085205078125, |
| "loss/logits": 0.006879956694319844, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 1.203125, |
| "grad_norm_var": 3.826835568745931, |
| "learning_rate": 2e-05, |
| "loss": 0.0964, |
| "loss/crossentropy": 1.509221613407135, |
| "loss/hidden": 0.087890625, |
| "loss/logits": 0.00847849901765585, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.703125, |
| "grad_norm_var": 3.8554396947224934, |
| "learning_rate": 2e-05, |
| "loss": 0.0788, |
| "loss/crossentropy": 2.4337867498397827, |
| "loss/hidden": 0.072021484375, |
| "loss/logits": 0.0067423065192997456, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 3.815881284077962, |
| "learning_rate": 2e-05, |
| "loss": 0.0966, |
| "loss/crossentropy": 1.7458332180976868, |
| "loss/hidden": 0.08837890625, |
| "loss/logits": 0.008262162329629064, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 6.59375, |
| "grad_norm_var": 5.133159383138021, |
| "learning_rate": 2e-05, |
| "loss": 0.0928, |
| "loss/crossentropy": 2.116236627101898, |
| "loss/hidden": 0.0830078125, |
| "loss/logits": 0.00975541677325964, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 4.998583730061849, |
| "learning_rate": 2e-05, |
| "loss": 0.0831, |
| "loss/crossentropy": 2.324514389038086, |
| "loss/hidden": 0.075439453125, |
| "loss/logits": 0.007644579978659749, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 0.796875, |
| "grad_norm_var": 5.048313395182292, |
| "learning_rate": 2e-05, |
| "loss": 0.0867, |
| "loss/crossentropy": 1.9479625821113586, |
| "loss/hidden": 0.0791015625, |
| "loss/logits": 0.0075566458981484175, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 15.875, |
| "grad_norm_var": 16.414309628804524, |
| "learning_rate": 2e-05, |
| "loss": 0.1592, |
| "loss/crossentropy": 1.5863521695137024, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.009787225630134344, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 16.208450762430825, |
| "learning_rate": 2e-05, |
| "loss": 0.0784, |
| "loss/crossentropy": 0.8779918029904366, |
| "loss/hidden": 0.073974609375, |
| "loss/logits": 0.004391094436869025, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 1.375, |
| "grad_norm_var": 16.1827361424764, |
| "learning_rate": 2e-05, |
| "loss": 0.0931, |
| "loss/crossentropy": 2.1567060947418213, |
| "loss/hidden": 0.085693359375, |
| "loss/logits": 0.007449513301253319, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 0.875, |
| "grad_norm_var": 16.386012204488118, |
| "learning_rate": 2e-05, |
| "loss": 0.0898, |
| "loss/crossentropy": 1.8178179860115051, |
| "loss/hidden": 0.08251953125, |
| "loss/logits": 0.007294924231246114, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 16.114434560139973, |
| "learning_rate": 2e-05, |
| "loss": 0.1014, |
| "loss/crossentropy": 1.8806178569793701, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.0076924534514546394, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 16.098729451497395, |
| "learning_rate": 2e-05, |
| "loss": 0.1048, |
| "loss/crossentropy": 1.6054936051368713, |
| "loss/hidden": 0.096435546875, |
| "loss/logits": 0.008354771416634321, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 14.200210571289062, |
| "learning_rate": 2e-05, |
| "loss": 0.0851, |
| "loss/crossentropy": 1.1937458366155624, |
| "loss/hidden": 0.079833984375, |
| "loss/logits": 0.005313969450071454, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 14.113833618164062, |
| "learning_rate": 2e-05, |
| "loss": 0.1056, |
| "loss/crossentropy": 1.9973903894424438, |
| "loss/hidden": 0.09619140625, |
| "loss/logits": 0.00938287889584899, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 14.07872314453125, |
| "learning_rate": 2e-05, |
| "loss": 0.087, |
| "loss/crossentropy": 2.0422087907791138, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.007449948927387595, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 0.875, |
| "grad_norm_var": 14.218849436442058, |
| "learning_rate": 2e-05, |
| "loss": 0.0908, |
| "loss/crossentropy": 2.040232002735138, |
| "loss/hidden": 0.08349609375, |
| "loss/logits": 0.007334771566092968, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 3.6875, |
| "grad_norm_var": 14.104658762613932, |
| "learning_rate": 2e-05, |
| "loss": 0.0996, |
| "loss/crossentropy": 1.7977141737937927, |
| "loss/hidden": 0.09130859375, |
| "loss/logits": 0.008285259362310171, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 13.984908040364584, |
| "learning_rate": 2e-05, |
| "loss": 0.0923, |
| "loss/crossentropy": 1.960830569267273, |
| "loss/hidden": 0.0849609375, |
| "loss/logits": 0.007373227505013347, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 1.2109375, |
| "grad_norm_var": 13.99013646443685, |
| "learning_rate": 2e-05, |
| "loss": 0.1063, |
| "loss/crossentropy": 1.5903997421264648, |
| "loss/hidden": 0.098876953125, |
| "loss/logits": 0.007376475026831031, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 13.0423215230306, |
| "learning_rate": 2e-05, |
| "loss": 0.0958, |
| "loss/crossentropy": 1.1866007596254349, |
| "loss/hidden": 0.0908203125, |
| "loss/logits": 0.0049855056568048894, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 13.01123046875, |
| "learning_rate": 2e-05, |
| "loss": 0.1001, |
| "loss/crossentropy": 2.016387164592743, |
| "loss/hidden": 0.092529296875, |
| "loss/logits": 0.0076178074814379215, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 0.98828125, |
| "grad_norm_var": 12.966665585835775, |
| "learning_rate": 2e-05, |
| "loss": 0.1017, |
| "loss/crossentropy": 1.9937080144882202, |
| "loss/hidden": 0.09326171875, |
| "loss/logits": 0.008388462010771036, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.5201679865519205, |
| "learning_rate": 2e-05, |
| "loss": 0.1012, |
| "loss/crossentropy": 1.8353246450424194, |
| "loss/hidden": 0.09326171875, |
| "loss/logits": 0.00795629364438355, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 0.5143070856730143, |
| "learning_rate": 2e-05, |
| "loss": 0.0918, |
| "loss/crossentropy": 1.0499791204929352, |
| "loss/hidden": 0.08740234375, |
| "loss/logits": 0.004438678151927888, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 1.0625, |
| "grad_norm_var": 0.5353540420532227, |
| "learning_rate": 2e-05, |
| "loss": 0.107, |
| "loss/crossentropy": 1.8614663481712341, |
| "loss/hidden": 0.09814453125, |
| "loss/logits": 0.008855776861310005, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 0.5093535741170248, |
| "learning_rate": 2e-05, |
| "loss": 0.1072, |
| "loss/crossentropy": 2.363565683364868, |
| "loss/hidden": 0.096923828125, |
| "loss/logits": 0.010271006729453802, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.5069289525349935, |
| "learning_rate": 2e-05, |
| "loss": 0.1086, |
| "loss/crossentropy": 1.955030083656311, |
| "loss/hidden": 0.099365234375, |
| "loss/logits": 0.0092296302318573, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 0.5273447036743164, |
| "learning_rate": 2e-05, |
| "loss": 0.1062, |
| "loss/crossentropy": 1.774095892906189, |
| "loss/hidden": 0.0986328125, |
| "loss/logits": 0.007574398070573807, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 0.5396000544230143, |
| "learning_rate": 2e-05, |
| "loss": 0.1117, |
| "loss/crossentropy": 1.8405153155326843, |
| "loss/hidden": 0.10302734375, |
| "loss/logits": 0.008719130419194698, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 0.5067829767862956, |
| "learning_rate": 2e-05, |
| "loss": 0.1045, |
| "loss/crossentropy": 2.0069875717163086, |
| "loss/hidden": 0.095947265625, |
| "loss/logits": 0.008583055343478918, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.5219018936157227, |
| "learning_rate": 2e-05, |
| "loss": 0.1103, |
| "loss/crossentropy": 1.670526921749115, |
| "loss/hidden": 0.102294921875, |
| "loss/logits": 0.008038338739424944, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 0.48292789459228513, |
| "learning_rate": 2e-05, |
| "loss": 0.1121, |
| "loss/crossentropy": 1.795514464378357, |
| "loss/hidden": 0.103759765625, |
| "loss/logits": 0.008318986743688583, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 0.2139871597290039, |
| "learning_rate": 2e-05, |
| "loss": 0.1066, |
| "loss/crossentropy": 2.180332064628601, |
| "loss/hidden": 0.09716796875, |
| "loss/logits": 0.009391986764967442, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 0.21252689361572266, |
| "learning_rate": 2e-05, |
| "loss": 0.1234, |
| "loss/crossentropy": 1.8504464030265808, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.011583337560296059, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 0.22248172760009766, |
| "learning_rate": 2e-05, |
| "loss": 0.1098, |
| "loss/crossentropy": 1.6542016863822937, |
| "loss/hidden": 0.101806640625, |
| "loss/logits": 0.007953221211209893, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 0.21898136138916016, |
| "learning_rate": 2e-05, |
| "loss": 0.1185, |
| "loss/crossentropy": 1.8401342630386353, |
| "loss/hidden": 0.107421875, |
| "loss/logits": 0.011056106071919203, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.18931725819905598, |
| "learning_rate": 2e-05, |
| "loss": 0.1082, |
| "loss/crossentropy": 1.8265935778617859, |
| "loss/hidden": 0.09912109375, |
| "loss/logits": 0.009068313986063004, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 52.25, |
| "grad_norm_var": 161.16229426066081, |
| "learning_rate": 2e-05, |
| "loss": 0.1937, |
| "loss/crossentropy": 1.5437742471694946, |
| "loss/hidden": 0.170654296875, |
| "loss/logits": 0.023064299020916224, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 160.93560969034831, |
| "learning_rate": 2e-05, |
| "loss": 0.1246, |
| "loss/crossentropy": 1.227450430393219, |
| "loss/hidden": 0.11572265625, |
| "loss/logits": 0.008849140722304583, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 161.10956192016602, |
| "learning_rate": 2e-05, |
| "loss": 0.1196, |
| "loss/crossentropy": 1.9892451167106628, |
| "loss/hidden": 0.1103515625, |
| "loss/logits": 0.009212612174451351, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 1.0625, |
| "grad_norm_var": 161.10956192016602, |
| "learning_rate": 2e-05, |
| "loss": 0.1208, |
| "loss/crossentropy": 1.9727575778961182, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.009519532322883606, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 1.9140625, |
| "grad_norm_var": 161.26942443847656, |
| "learning_rate": 2e-05, |
| "loss": 0.1112, |
| "loss/crossentropy": 2.20854651927948, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.008704130537807941, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 161.43824768066406, |
| "learning_rate": 2e-05, |
| "loss": 0.1249, |
| "loss/crossentropy": 1.8244708180427551, |
| "loss/hidden": 0.115478515625, |
| "loss/logits": 0.009438233450055122, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 161.12805989583333, |
| "learning_rate": 2e-05, |
| "loss": 0.1264, |
| "loss/crossentropy": 1.6184683442115784, |
| "loss/hidden": 0.117431640625, |
| "loss/logits": 0.008998575620353222, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 161.0760617574056, |
| "learning_rate": 2e-05, |
| "loss": 0.1427, |
| "loss/crossentropy": 1.9090940952301025, |
| "loss/hidden": 0.1298828125, |
| "loss/logits": 0.01286676386371255, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 161.03238525390626, |
| "learning_rate": 2e-05, |
| "loss": 0.1191, |
| "loss/crossentropy": 1.7622392773628235, |
| "loss/hidden": 0.109619140625, |
| "loss/logits": 0.009484861977398396, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 160.93959045410156, |
| "learning_rate": 2e-05, |
| "loss": 0.1185, |
| "loss/crossentropy": 1.7633178234100342, |
| "loss/hidden": 0.109130859375, |
| "loss/logits": 0.009330280125141144, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 0.98828125, |
| "grad_norm_var": 161.32540073394776, |
| "learning_rate": 2e-05, |
| "loss": 0.1188, |
| "loss/crossentropy": 2.186140298843384, |
| "loss/hidden": 0.108154296875, |
| "loss/logits": 0.010631876531988382, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 3.28125, |
| "grad_norm_var": 160.60855553944904, |
| "learning_rate": 2e-05, |
| "loss": 0.1224, |
| "loss/crossentropy": 0.8389374911785126, |
| "loss/hidden": 0.1171875, |
| "loss/logits": 0.005214276316110045, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 1.0703125, |
| "grad_norm_var": 160.98382867177327, |
| "learning_rate": 2e-05, |
| "loss": 0.116, |
| "loss/crossentropy": 2.1515474915504456, |
| "loss/hidden": 0.107421875, |
| "loss/logits": 0.00860951654613018, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 4.5, |
| "grad_norm_var": 160.03680464426677, |
| "learning_rate": 2e-05, |
| "loss": 0.1312, |
| "loss/crossentropy": 1.6820667684078217, |
| "loss/hidden": 0.123046875, |
| "loss/logits": 0.008124232292175293, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 159.50010522206625, |
| "learning_rate": 2e-05, |
| "loss": 0.1056, |
| "loss/crossentropy": 0.9079534839838743, |
| "loss/hidden": 0.10107421875, |
| "loss/logits": 0.004542189242783934, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 0.984375, |
| "grad_norm_var": 159.64182631174722, |
| "learning_rate": 2e-05, |
| "loss": 0.1192, |
| "loss/crossentropy": 2.261181592941284, |
| "loss/hidden": 0.109619140625, |
| "loss/logits": 0.009581252932548523, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 0.9921875, |
| "grad_norm_var": 0.9261479059855143, |
| "learning_rate": 2e-05, |
| "loss": 0.1281, |
| "loss/crossentropy": 1.9553669095039368, |
| "loss/hidden": 0.116943359375, |
| "loss/logits": 0.011152476072311401, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 0.9103616714477539, |
| "learning_rate": 2e-05, |
| "loss": 0.1466, |
| "loss/crossentropy": 1.6360890865325928, |
| "loss/hidden": 0.13525390625, |
| "loss/logits": 0.011308418586850166, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 0.9085992813110352, |
| "learning_rate": 2e-05, |
| "loss": 0.133, |
| "loss/crossentropy": 1.0788212679326534, |
| "loss/hidden": 0.125732421875, |
| "loss/logits": 0.007256039883941412, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 0.8688089370727539, |
| "learning_rate": 2e-05, |
| "loss": 0.1296, |
| "loss/crossentropy": 1.6809419393539429, |
| "loss/hidden": 0.119873046875, |
| "loss/logits": 0.009761545807123184, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 0.8769525527954102, |
| "learning_rate": 2e-05, |
| "loss": 0.1298, |
| "loss/crossentropy": 2.1073160767555237, |
| "loss/hidden": 0.1201171875, |
| "loss/logits": 0.009713000617921352, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 3.3125, |
| "grad_norm_var": 1.0105956395467122, |
| "learning_rate": 2e-05, |
| "loss": 0.1851, |
| "loss/crossentropy": 1.7140259146690369, |
| "loss/hidden": 0.168212890625, |
| "loss/logits": 0.01692299358546734, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 1.0337132136027019, |
| "learning_rate": 2e-05, |
| "loss": 0.141, |
| "loss/crossentropy": 1.70401269197464, |
| "loss/hidden": 0.13037109375, |
| "loss/logits": 0.010653213132172823, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 1.0173481623331706, |
| "learning_rate": 2e-05, |
| "loss": 0.1561, |
| "loss/crossentropy": 1.9086145758628845, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.01448416942730546, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 1.0048868179321289, |
| "learning_rate": 2e-05, |
| "loss": 0.1751, |
| "loss/crossentropy": 1.5015806555747986, |
| "loss/hidden": 0.16064453125, |
| "loss/logits": 0.014442750252783298, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 0.9864847183227539, |
| "learning_rate": 2e-05, |
| "loss": 0.1323, |
| "loss/crossentropy": 1.9546470642089844, |
| "loss/hidden": 0.12255859375, |
| "loss/logits": 0.009766705334186554, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 1.203125, |
| "grad_norm_var": 0.9611083984375, |
| "learning_rate": 2e-05, |
| "loss": 0.1539, |
| "loss/crossentropy": 1.7062721848487854, |
| "loss/hidden": 0.1416015625, |
| "loss/logits": 0.01230617519468069, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 1.1776611328125, |
| "learning_rate": 2e-05, |
| "loss": 0.1515, |
| "loss/crossentropy": 1.740279734134674, |
| "loss/hidden": 0.14013671875, |
| "loss/logits": 0.011402689386159182, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 1.1123573303222656, |
| "learning_rate": 2e-05, |
| "loss": 0.1504, |
| "loss/crossentropy": 1.640882670879364, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.01071554934605956, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 0.7542132059733073, |
| "learning_rate": 2e-05, |
| "loss": 0.1364, |
| "loss/crossentropy": 1.4670004844665527, |
| "loss/hidden": 0.126708984375, |
| "loss/logits": 0.0096431621350348, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.251, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 0.7847574869791667, |
| "learning_rate": 2e-05, |
| "loss": 0.14, |
| "loss/crossentropy": 2.2024736404418945, |
| "loss/hidden": 0.127197265625, |
| "loss/logits": 0.012759591452777386, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.252, |
| "grad_norm": 3.53125, |
| "grad_norm_var": 0.8651763916015625, |
| "learning_rate": 2e-05, |
| "loss": 0.1539, |
| "loss/crossentropy": 2.0269722938537598, |
| "loss/hidden": 0.14208984375, |
| "loss/logits": 0.011817097198218107, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.253, |
| "grad_norm": 9.375, |
| "grad_norm_var": 4.018281809488932, |
| "learning_rate": 2e-05, |
| "loss": 0.1661, |
| "loss/crossentropy": 0.34899202920496464, |
| "loss/hidden": 0.163818359375, |
| "loss/logits": 0.0022718849941156805, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.254, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 3.9798868815104167, |
| "learning_rate": 2e-05, |
| "loss": 0.1441, |
| "loss/crossentropy": 2.2475985288619995, |
| "loss/hidden": 0.1318359375, |
| "loss/logits": 0.012224531266838312, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 4.037050120035807, |
| "learning_rate": 2e-05, |
| "loss": 0.1497, |
| "loss/crossentropy": 2.8270416259765625, |
| "loss/hidden": 0.13623046875, |
| "loss/logits": 0.013480226043611765, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 4.07616958618164, |
| "learning_rate": 2e-05, |
| "loss": 0.1668, |
| "loss/crossentropy": 1.3126854300498962, |
| "loss/hidden": 0.15576171875, |
| "loss/logits": 0.01107651786878705, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.257, |
| "grad_norm": 1.9140625, |
| "grad_norm_var": 4.02563247680664, |
| "learning_rate": 2e-05, |
| "loss": 0.1502, |
| "loss/crossentropy": 1.4198355078697205, |
| "loss/hidden": 0.1396484375, |
| "loss/logits": 0.01056258101016283, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.258, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 4.081167602539063, |
| "learning_rate": 2e-05, |
| "loss": 0.1421, |
| "loss/crossentropy": 1.657827377319336, |
| "loss/hidden": 0.13232421875, |
| "loss/logits": 0.009755304548889399, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.259, |
| "grad_norm": 1.75, |
| "grad_norm_var": 4.025512440999349, |
| "learning_rate": 2e-05, |
| "loss": 0.1352, |
| "loss/crossentropy": 2.3775731325149536, |
| "loss/hidden": 0.12548828125, |
| "loss/logits": 0.0096644451841712, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 4.089703114827474, |
| "learning_rate": 2e-05, |
| "loss": 0.1442, |
| "loss/crossentropy": 2.2461366653442383, |
| "loss/hidden": 0.13232421875, |
| "loss/logits": 0.011895926669239998, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.261, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 4.065040842692057, |
| "learning_rate": 2e-05, |
| "loss": 0.1474, |
| "loss/crossentropy": 1.560776025056839, |
| "loss/hidden": 0.1337890625, |
| "loss/logits": 0.013578795362263918, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.262, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 4.082124582926432, |
| "learning_rate": 2e-05, |
| "loss": 0.1556, |
| "loss/crossentropy": 1.9976117014884949, |
| "loss/hidden": 0.14404296875, |
| "loss/logits": 0.011512083932757378, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.263, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 4.018440755208333, |
| "learning_rate": 2e-05, |
| "loss": 0.1759, |
| "loss/crossentropy": 1.705672264099121, |
| "loss/hidden": 0.16162109375, |
| "loss/logits": 0.014301342889666557, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 3.8464345296223956, |
| "learning_rate": 2e-05, |
| "loss": 0.1864, |
| "loss/crossentropy": 1.7075408101081848, |
| "loss/hidden": 0.171875, |
| "loss/logits": 0.01456779520958662, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 3.86392822265625, |
| "learning_rate": 2e-05, |
| "loss": 0.1677, |
| "loss/crossentropy": 2.094871759414673, |
| "loss/hidden": 0.15380859375, |
| "loss/logits": 0.013906504027545452, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.266, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 3.8542154947916667, |
| "learning_rate": 2e-05, |
| "loss": 0.1591, |
| "loss/crossentropy": 2.166890859603882, |
| "loss/hidden": 0.146484375, |
| "loss/logits": 0.012606294360011816, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.267, |
| "grad_norm": 3.859375, |
| "grad_norm_var": 3.885705312093099, |
| "learning_rate": 2e-05, |
| "loss": 0.1763, |
| "loss/crossentropy": 1.674479365348816, |
| "loss/hidden": 0.162109375, |
| "loss/logits": 0.01416744152083993, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.268, |
| "grad_norm": 2.625, |
| "grad_norm_var": 3.8142555236816404, |
| "learning_rate": 2e-05, |
| "loss": 0.2022, |
| "loss/crossentropy": 1.0146620571613312, |
| "loss/hidden": 0.1904296875, |
| "loss/logits": 0.01172702293843031, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.269, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 0.4503334045410156, |
| "learning_rate": 2e-05, |
| "loss": 0.1457, |
| "loss/crossentropy": 1.8024365305900574, |
| "loss/hidden": 0.13427734375, |
| "loss/logits": 0.011465264018625021, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 0.46684951782226564, |
| "learning_rate": 2e-05, |
| "loss": 0.161, |
| "loss/crossentropy": 1.7421787977218628, |
| "loss/hidden": 0.14892578125, |
| "loss/logits": 0.012049074750393629, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.271, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 0.4663726806640625, |
| "learning_rate": 2e-05, |
| "loss": 0.1519, |
| "loss/crossentropy": 1.1601504981517792, |
| "loss/hidden": 0.14404296875, |
| "loss/logits": 0.007814974524080753, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 0.4529693603515625, |
| "learning_rate": 2e-05, |
| "loss": 0.1693, |
| "loss/crossentropy": 1.9806629419326782, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.01302909990772605, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.273, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 0.4919352213541667, |
| "learning_rate": 2e-05, |
| "loss": 0.1724, |
| "loss/crossentropy": 2.005366265773773, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.014153223484754562, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.274, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 0.4723894755045573, |
| "learning_rate": 2e-05, |
| "loss": 0.1808, |
| "loss/crossentropy": 1.7814961075782776, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.014784782659262419, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 0.4697011311848958, |
| "learning_rate": 2e-05, |
| "loss": 0.1963, |
| "loss/crossentropy": 1.5670437216758728, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.016570267733186483, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.276, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 0.464800771077474, |
| "learning_rate": 2e-05, |
| "loss": 0.1604, |
| "loss/crossentropy": 2.009281039237976, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.010985464788973331, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.277, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 0.45259501139322916, |
| "learning_rate": 2e-05, |
| "loss": 0.168, |
| "loss/crossentropy": 1.7085555791854858, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.011709913145750761, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.278, |
| "grad_norm": 1.3828125, |
| "grad_norm_var": 0.46154683430989585, |
| "learning_rate": 2e-05, |
| "loss": 0.1456, |
| "loss/crossentropy": 2.789747476577759, |
| "loss/hidden": 0.1337890625, |
| "loss/logits": 0.011802888009697199, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.279, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.45711441040039064, |
| "learning_rate": 2e-05, |
| "loss": 0.1881, |
| "loss/crossentropy": 1.5918955504894257, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.014291070867329836, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 2.8125, |
| "grad_norm_var": 0.5068682352701823, |
| "learning_rate": 2e-05, |
| "loss": 0.1458, |
| "loss/crossentropy": 0.8236657343804836, |
| "loss/hidden": 0.139404296875, |
| "loss/logits": 0.00643135339487344, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.281, |
| "grad_norm": 4.125, |
| "grad_norm_var": 0.7956764221191406, |
| "learning_rate": 2e-05, |
| "loss": 0.1714, |
| "loss/crossentropy": 2.1279306411743164, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.015115040354430676, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.282, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 0.8177813212076823, |
| "learning_rate": 2e-05, |
| "loss": 0.1669, |
| "loss/crossentropy": 2.2272568941116333, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.0135371801443398, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.283, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 0.6023089090983073, |
| "learning_rate": 2e-05, |
| "loss": 0.1781, |
| "loss/crossentropy": 2.2013776302337646, |
| "loss/hidden": 0.16259765625, |
| "loss/logits": 0.015500886365771294, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.284, |
| "grad_norm": 1.96875, |
| "grad_norm_var": 0.5695391337076823, |
| "learning_rate": 2e-05, |
| "loss": 0.1822, |
| "loss/crossentropy": 1.6315099596977234, |
| "loss/hidden": 0.1689453125, |
| "loss/logits": 0.013229990843683481, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 0.550426991780599, |
| "learning_rate": 2e-05, |
| "loss": 0.1877, |
| "loss/crossentropy": 1.329133152961731, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.012850106693804264, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.286, |
| "grad_norm": 2.78125, |
| "grad_norm_var": 0.5659576416015625, |
| "learning_rate": 2e-05, |
| "loss": 0.1725, |
| "loss/crossentropy": 2.0431485772132874, |
| "loss/hidden": 0.15966796875, |
| "loss/logits": 0.01284833624958992, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.287, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 0.5648915608723958, |
| "learning_rate": 2e-05, |
| "loss": 0.2173, |
| "loss/crossentropy": 1.6292879581451416, |
| "loss/hidden": 0.19970703125, |
| "loss/logits": 0.017579292878508568, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 0.5841379801432292, |
| "learning_rate": 2e-05, |
| "loss": 0.1632, |
| "loss/crossentropy": 2.0630630254745483, |
| "loss/hidden": 0.14990234375, |
| "loss/logits": 0.013251845724880695, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.289, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 0.5364664713541667, |
| "learning_rate": 2e-05, |
| "loss": 0.2067, |
| "loss/crossentropy": 2.168562591075897, |
| "loss/hidden": 0.18798828125, |
| "loss/logits": 0.01867722487077117, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 0.5779449462890625, |
| "learning_rate": 2e-05, |
| "loss": 0.166, |
| "loss/crossentropy": 1.8953060507774353, |
| "loss/hidden": 0.15380859375, |
| "loss/logits": 0.01215141685679555, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.291, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 0.5848297119140625, |
| "learning_rate": 2e-05, |
| "loss": 0.187, |
| "loss/crossentropy": 1.6148796081542969, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.013202093075960875, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.292, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 0.5749013264973958, |
| "learning_rate": 2e-05, |
| "loss": 0.197, |
| "loss/crossentropy": 1.7814635038375854, |
| "loss/hidden": 0.1826171875, |
| "loss/logits": 0.014429094269871712, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.293, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 0.5503028869628906, |
| "learning_rate": 2e-05, |
| "loss": 0.1814, |
| "loss/crossentropy": 2.1830875873565674, |
| "loss/hidden": 0.16748046875, |
| "loss/logits": 0.013968405313789845, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.294, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 0.5268898010253906, |
| "learning_rate": 2e-05, |
| "loss": 0.2098, |
| "loss/crossentropy": 1.681401550769806, |
| "loss/hidden": 0.19482421875, |
| "loss/logits": 0.01494319923222065, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.5633453369140625, |
| "learning_rate": 2e-05, |
| "loss": 0.1884, |
| "loss/crossentropy": 1.953886091709137, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.014602533541619778, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 0.5292144775390625, |
| "learning_rate": 2e-05, |
| "loss": 0.1987, |
| "loss/crossentropy": 1.6944631338119507, |
| "loss/hidden": 0.18603515625, |
| "loss/logits": 0.012617598287761211, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.297, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 0.20425999959309896, |
| "learning_rate": 2e-05, |
| "loss": 0.2261, |
| "loss/crossentropy": 2.214042544364929, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.020975200459361076, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.298, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 0.2164703369140625, |
| "learning_rate": 2e-05, |
| "loss": 0.1842, |
| "loss/crossentropy": 2.1237878799438477, |
| "loss/hidden": 0.16943359375, |
| "loss/logits": 0.014801782555878162, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.299, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 0.18964818318684895, |
| "learning_rate": 2e-05, |
| "loss": 0.1814, |
| "loss/crossentropy": 1.492847979068756, |
| "loss/hidden": 0.16845703125, |
| "loss/logits": 0.012967187445610762, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 0.1879595438639323, |
| "learning_rate": 2e-05, |
| "loss": 0.1776, |
| "loss/crossentropy": 2.2924291491508484, |
| "loss/hidden": 0.16357421875, |
| "loss/logits": 0.014043833129107952, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.301, |
| "grad_norm": 4.1875, |
| "grad_norm_var": 0.5374061584472656, |
| "learning_rate": 2e-05, |
| "loss": 0.2062, |
| "loss/crossentropy": 1.607342541217804, |
| "loss/hidden": 0.18994140625, |
| "loss/logits": 0.016273885034024715, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.302, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 0.4823486328125, |
| "learning_rate": 2e-05, |
| "loss": 0.2143, |
| "loss/crossentropy": 1.8559609055519104, |
| "loss/hidden": 0.197265625, |
| "loss/logits": 0.017047187313437462, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.303, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 0.4923052469889323, |
| "learning_rate": 2e-05, |
| "loss": 0.1814, |
| "loss/crossentropy": 2.4204115867614746, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.013407074846327305, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 0.49497782389322914, |
| "learning_rate": 2e-05, |
| "loss": 0.2058, |
| "loss/crossentropy": 1.7306669354438782, |
| "loss/hidden": 0.189453125, |
| "loss/logits": 0.016323519870638847, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 0.4960856119791667, |
| "learning_rate": 2e-05, |
| "loss": 0.1877, |
| "loss/crossentropy": 2.212082266807556, |
| "loss/hidden": 0.171875, |
| "loss/logits": 0.015811644960194826, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.306, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.4901466369628906, |
| "learning_rate": 2e-05, |
| "loss": 0.1902, |
| "loss/crossentropy": 1.9250993132591248, |
| "loss/hidden": 0.17626953125, |
| "loss/logits": 0.013882125727832317, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.307, |
| "grad_norm": 5.75, |
| "grad_norm_var": 1.4711181640625, |
| "learning_rate": 2e-05, |
| "loss": 0.1934, |
| "loss/crossentropy": 0.4879331737756729, |
| "loss/hidden": 0.18701171875, |
| "loss/logits": 0.006413323106244206, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.308, |
| "grad_norm": 3.046875, |
| "grad_norm_var": 1.520232899983724, |
| "learning_rate": 2e-05, |
| "loss": 0.1973, |
| "loss/crossentropy": 1.4504847526550293, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.009785078698769212, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.309, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 1.5522092183430989, |
| "learning_rate": 2e-05, |
| "loss": 0.2057, |
| "loss/crossentropy": 2.149027943611145, |
| "loss/hidden": 0.189453125, |
| "loss/logits": 0.01620970480144024, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 1.550005849202474, |
| "learning_rate": 2e-05, |
| "loss": 0.2027, |
| "loss/crossentropy": 2.1503273248672485, |
| "loss/hidden": 0.185546875, |
| "loss/logits": 0.01712088193744421, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.311, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 1.5372304280598958, |
| "learning_rate": 2e-05, |
| "loss": 0.1888, |
| "loss/crossentropy": 2.1748342514038086, |
| "loss/hidden": 0.17333984375, |
| "loss/logits": 0.01546872965991497, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 1.5502703348795572, |
| "learning_rate": 2e-05, |
| "loss": 0.2158, |
| "loss/crossentropy": 1.3706732988357544, |
| "loss/hidden": 0.20166015625, |
| "loss/logits": 0.014161557890474796, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.313, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 1.5523111979166666, |
| "learning_rate": 2e-05, |
| "loss": 0.2021, |
| "loss/crossentropy": 1.8907567262649536, |
| "loss/hidden": 0.18701171875, |
| "loss/logits": 0.015071831177920103, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.314, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 1.5344378153483074, |
| "learning_rate": 2e-05, |
| "loss": 0.201, |
| "loss/crossentropy": 1.7888588905334473, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.013532605487853289, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 1.5256507873535157, |
| "learning_rate": 2e-05, |
| "loss": 0.2166, |
| "loss/crossentropy": 1.5358025133609772, |
| "loss/hidden": 0.2021484375, |
| "loss/logits": 0.014410331379622221, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.316, |
| "grad_norm": 56.0, |
| "grad_norm_var": 182.73569310506184, |
| "learning_rate": 2e-05, |
| "loss": 0.2529, |
| "loss/crossentropy": 2.1001065373420715, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.01847642147913575, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.317, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 183.77112401326497, |
| "learning_rate": 2e-05, |
| "loss": 0.1958, |
| "loss/crossentropy": 2.3731868267059326, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.01615766156464815, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.318, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 183.79867248535157, |
| "learning_rate": 2e-05, |
| "loss": 0.2212, |
| "loss/crossentropy": 1.8716753125190735, |
| "loss/hidden": 0.2041015625, |
| "loss/logits": 0.017116894014179707, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.319, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 183.41590983072916, |
| "learning_rate": 2e-05, |
| "loss": 0.1938, |
| "loss/crossentropy": 1.2205194532871246, |
| "loss/hidden": 0.18115234375, |
| "loss/logits": 0.012608660385012627, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.203125, |
| "grad_norm_var": 183.88273010253906, |
| "learning_rate": 2e-05, |
| "loss": 0.1822, |
| "loss/crossentropy": 2.3611029386520386, |
| "loss/hidden": 0.1689453125, |
| "loss/logits": 0.013240456581115723, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.321, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 184.05854390462238, |
| "learning_rate": 2e-05, |
| "loss": 0.193, |
| "loss/crossentropy": 1.8402240872383118, |
| "loss/hidden": 0.18017578125, |
| "loss/logits": 0.012811433058232069, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.322, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 184.02547912597657, |
| "learning_rate": 2e-05, |
| "loss": 0.2238, |
| "loss/crossentropy": 1.9131136536598206, |
| "loss/hidden": 0.20751953125, |
| "loss/logits": 0.016317113302648067, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.323, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 184.69184951782228, |
| "learning_rate": 2e-05, |
| "loss": 0.2509, |
| "loss/crossentropy": 1.4010455012321472, |
| "loss/hidden": 0.23193359375, |
| "loss/logits": 0.018928353674709797, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.324, |
| "grad_norm": 2.234375, |
| "grad_norm_var": 184.9522621154785, |
| "learning_rate": 2e-05, |
| "loss": 0.1929, |
| "loss/crossentropy": 1.9659223556518555, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.013216304127126932, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 1.75, |
| "grad_norm_var": 184.79406102498373, |
| "learning_rate": 2e-05, |
| "loss": 0.1877, |
| "loss/crossentropy": 1.5221052765846252, |
| "loss/hidden": 0.17626953125, |
| "loss/logits": 0.011447824770584702, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.326, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 184.88554662068685, |
| "learning_rate": 2e-05, |
| "loss": 0.2212, |
| "loss/crossentropy": 2.06081086397171, |
| "loss/hidden": 0.20361328125, |
| "loss/logits": 0.017567144706845284, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.327, |
| "grad_norm": 3.578125, |
| "grad_norm_var": 184.14719823201497, |
| "learning_rate": 2e-05, |
| "loss": 0.1707, |
| "loss/crossentropy": 0.8908511102199554, |
| "loss/hidden": 0.1640625, |
| "loss/logits": 0.006589735276065767, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 183.83722737630208, |
| "learning_rate": 2e-05, |
| "loss": 0.2041, |
| "loss/crossentropy": 1.4793621897697449, |
| "loss/hidden": 0.19384765625, |
| "loss/logits": 0.010210367618128657, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.329, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 184.19855931599935, |
| "learning_rate": 2e-05, |
| "loss": 0.2174, |
| "loss/crossentropy": 1.5629376769065857, |
| "loss/hidden": 0.20166015625, |
| "loss/logits": 0.015733799897134304, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 184.11591389973958, |
| "learning_rate": 2e-05, |
| "loss": 0.2297, |
| "loss/crossentropy": 2.016783118247986, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.017778108827769756, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.331, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 184.22320963541668, |
| "learning_rate": 2e-05, |
| "loss": 0.2183, |
| "loss/crossentropy": 2.3946865797042847, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.01807898748666048, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.332, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.35546773274739585, |
| "learning_rate": 2e-05, |
| "loss": 0.2244, |
| "loss/crossentropy": 1.6463975310325623, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.014466887805610895, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.333, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.34256083170572915, |
| "learning_rate": 2e-05, |
| "loss": 0.2653, |
| "loss/crossentropy": 1.727737307548523, |
| "loss/hidden": 0.24462890625, |
| "loss/logits": 0.020694734528660774, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.334, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 0.36001688639322915, |
| "learning_rate": 2e-05, |
| "loss": 0.2636, |
| "loss/crossentropy": 1.8381291031837463, |
| "loss/hidden": 0.244140625, |
| "loss/logits": 0.019478057511150837, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 5.5, |
| "grad_norm_var": 1.2181292215983073, |
| "learning_rate": 2e-05, |
| "loss": 0.2789, |
| "loss/crossentropy": 1.395434319972992, |
| "loss/hidden": 0.25732421875, |
| "loss/logits": 0.02152822446078062, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 1.1768707275390624, |
| "learning_rate": 2e-05, |
| "loss": 0.2301, |
| "loss/crossentropy": 1.7802979946136475, |
| "loss/hidden": 0.212890625, |
| "loss/logits": 0.01717265695333481, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.337, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 1.1850748697916667, |
| "learning_rate": 2e-05, |
| "loss": 0.2195, |
| "loss/crossentropy": 1.864999234676361, |
| "loss/hidden": 0.20361328125, |
| "loss/logits": 0.015909720212221146, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.338, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 1.1734934488932292, |
| "learning_rate": 2e-05, |
| "loss": 0.2322, |
| "loss/crossentropy": 1.9171935319900513, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.01834118738770485, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.339, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 1.1808430989583334, |
| "learning_rate": 2e-05, |
| "loss": 0.2546, |
| "loss/crossentropy": 2.232408821582794, |
| "loss/hidden": 0.23388671875, |
| "loss/logits": 0.02068551816046238, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 1.531086222330729, |
| "learning_rate": 2e-05, |
| "loss": 0.2209, |
| "loss/crossentropy": 0.885938722640276, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.0069831793662160635, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.341, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 1.5281471252441405, |
| "learning_rate": 2e-05, |
| "loss": 0.274, |
| "loss/crossentropy": 2.053671360015869, |
| "loss/hidden": 0.25, |
| "loss/logits": 0.024039674550294876, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.342, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 1.535064442952474, |
| "learning_rate": 2e-05, |
| "loss": 0.248, |
| "loss/crossentropy": 2.1628893613815308, |
| "loss/hidden": 0.22900390625, |
| "loss/logits": 0.01902489084750414, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.343, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 1.4053301493326822, |
| "learning_rate": 2e-05, |
| "loss": 0.2355, |
| "loss/crossentropy": 1.9784727692604065, |
| "loss/hidden": 0.216796875, |
| "loss/logits": 0.018667724914848804, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 1.4048492431640625, |
| "learning_rate": 2e-05, |
| "loss": 0.2215, |
| "loss/crossentropy": 2.1430813670158386, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.016372697427868843, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 3.34375, |
| "grad_norm_var": 1.489422353108724, |
| "learning_rate": 2e-05, |
| "loss": 0.2828, |
| "loss/crossentropy": 1.4574592113494873, |
| "loss/hidden": 0.259765625, |
| "loss/logits": 0.02300189435482025, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.346, |
| "grad_norm": 4.59375, |
| "grad_norm_var": 1.8130035400390625, |
| "learning_rate": 2e-05, |
| "loss": 0.2555, |
| "loss/crossentropy": 2.1325125694274902, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.021130304783582687, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.347, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 1.8031412760416667, |
| "learning_rate": 2e-05, |
| "loss": 0.233, |
| "loss/crossentropy": 2.6941460371017456, |
| "loss/hidden": 0.21435546875, |
| "loss/logits": 0.01859632506966591, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.348, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 1.755077870686849, |
| "learning_rate": 2e-05, |
| "loss": 0.2562, |
| "loss/crossentropy": 1.8957814574241638, |
| "loss/hidden": 0.236328125, |
| "loss/logits": 0.019866405054926872, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.349, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 1.7364418029785156, |
| "learning_rate": 2e-05, |
| "loss": 0.2507, |
| "loss/crossentropy": 2.5658878087997437, |
| "loss/hidden": 0.2294921875, |
| "loss/logits": 0.02118699811398983, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 1.7523719787597656, |
| "learning_rate": 2e-05, |
| "loss": 0.233, |
| "loss/crossentropy": 1.9111933708190918, |
| "loss/hidden": 0.21533203125, |
| "loss/logits": 0.01770856324583292, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.351, |
| "grad_norm": 2.625, |
| "grad_norm_var": 1.0678749084472656, |
| "learning_rate": 2e-05, |
| "loss": 0.2712, |
| "loss/crossentropy": 1.5525288581848145, |
| "loss/hidden": 0.25244140625, |
| "loss/logits": 0.01877846010029316, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 1.07325439453125, |
| "learning_rate": 2e-05, |
| "loss": 0.2398, |
| "loss/crossentropy": 1.47780179977417, |
| "loss/hidden": 0.224609375, |
| "loss/logits": 0.015163760632276535, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.353, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 1.0523902893066406, |
| "learning_rate": 2e-05, |
| "loss": 0.2579, |
| "loss/crossentropy": 1.6976242065429688, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.01768268644809723, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.354, |
| "grad_norm": 1.375, |
| "grad_norm_var": 1.065623982747396, |
| "learning_rate": 2e-05, |
| "loss": 0.2594, |
| "loss/crossentropy": 1.5402989983558655, |
| "loss/hidden": 0.24169921875, |
| "loss/logits": 0.017742513678967953, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 1.0593360900878905, |
| "learning_rate": 2e-05, |
| "loss": 0.2983, |
| "loss/crossentropy": 1.7891557812690735, |
| "loss/hidden": 0.2744140625, |
| "loss/logits": 0.023881751112639904, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.356, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 0.7421427408854167, |
| "learning_rate": 2e-05, |
| "loss": 0.2353, |
| "loss/crossentropy": 2.255465269088745, |
| "loss/hidden": 0.2177734375, |
| "loss/logits": 0.01755282748490572, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.357, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.763287099202474, |
| "learning_rate": 2e-05, |
| "loss": 0.2388, |
| "loss/crossentropy": 2.2716734409332275, |
| "loss/hidden": 0.22021484375, |
| "loss/logits": 0.018602201715111732, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.358, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 0.7449666341145833, |
| "learning_rate": 2e-05, |
| "loss": 0.2737, |
| "loss/crossentropy": 1.8382077813148499, |
| "loss/hidden": 0.2548828125, |
| "loss/logits": 0.018825003411620855, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.359, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 0.7532976786295573, |
| "learning_rate": 2e-05, |
| "loss": 0.2391, |
| "loss/crossentropy": 1.6230210661888123, |
| "loss/hidden": 0.224609375, |
| "loss/logits": 0.014487342443317175, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 0.7803385416666667, |
| "learning_rate": 2e-05, |
| "loss": 0.2519, |
| "loss/crossentropy": 1.6961406469345093, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.017499960027635098, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.361, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 0.6720965067545573, |
| "learning_rate": 2e-05, |
| "loss": 0.2623, |
| "loss/crossentropy": 2.1821005940437317, |
| "loss/hidden": 0.24072265625, |
| "loss/logits": 0.021556712687015533, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.362, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 0.17363688151041667, |
| "learning_rate": 2e-05, |
| "loss": 0.2759, |
| "loss/crossentropy": 1.7173206806182861, |
| "loss/hidden": 0.255859375, |
| "loss/logits": 0.020033356733620167, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.363, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 0.16897684733072918, |
| "learning_rate": 2e-05, |
| "loss": 0.2552, |
| "loss/crossentropy": 1.8281689882278442, |
| "loss/hidden": 0.23681640625, |
| "loss/logits": 0.018404729664325714, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.364, |
| "grad_norm": 1.3125, |
| "grad_norm_var": 0.1809282938639323, |
| "learning_rate": 2e-05, |
| "loss": 0.2546, |
| "loss/crossentropy": 2.181256651878357, |
| "loss/hidden": 0.23486328125, |
| "loss/logits": 0.01975287776440382, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 3.796875, |
| "grad_norm_var": 0.4434466044108073, |
| "learning_rate": 2e-05, |
| "loss": 0.2803, |
| "loss/crossentropy": 1.4486916065216064, |
| "loss/hidden": 0.2607421875, |
| "loss/logits": 0.01950985286384821, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.366, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.4680987040201823, |
| "learning_rate": 2e-05, |
| "loss": 0.2504, |
| "loss/crossentropy": 2.026048183441162, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.017978372983634472, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.367, |
| "grad_norm": 4.3125, |
| "grad_norm_var": 0.8263628641764323, |
| "learning_rate": 2e-05, |
| "loss": 0.2579, |
| "loss/crossentropy": 1.4382375180721283, |
| "loss/hidden": 0.2412109375, |
| "loss/logits": 0.016655512619763613, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 2.25, |
| "grad_norm_var": 0.827416737874349, |
| "learning_rate": 2e-05, |
| "loss": 0.3072, |
| "loss/crossentropy": 1.57509446144104, |
| "loss/hidden": 0.2880859375, |
| "loss/logits": 0.019162926822900772, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.369, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 0.808251698811849, |
| "learning_rate": 2e-05, |
| "loss": 0.2383, |
| "loss/crossentropy": 2.0060970187187195, |
| "loss/hidden": 0.22021484375, |
| "loss/logits": 0.018060280941426754, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 0.798180898030599, |
| "learning_rate": 2e-05, |
| "loss": 0.2523, |
| "loss/crossentropy": 1.2137621641159058, |
| "loss/hidden": 0.24072265625, |
| "loss/logits": 0.011561613995581865, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.371, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.7833717346191407, |
| "learning_rate": 2e-05, |
| "loss": 0.2561, |
| "loss/crossentropy": 1.764179289340973, |
| "loss/hidden": 0.240234375, |
| "loss/logits": 0.015869705006480217, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.372, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 0.784716796875, |
| "learning_rate": 2e-05, |
| "loss": 0.2642, |
| "loss/crossentropy": 2.1394487619400024, |
| "loss/hidden": 0.2451171875, |
| "loss/logits": 0.01907090563327074, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.373, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 0.7621681213378906, |
| "learning_rate": 2e-05, |
| "loss": 0.2496, |
| "loss/crossentropy": 2.151320219039917, |
| "loss/hidden": 0.23095703125, |
| "loss/logits": 0.018605505116283894, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.374, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 0.77073974609375, |
| "learning_rate": 2e-05, |
| "loss": 0.2426, |
| "loss/crossentropy": 2.291616916656494, |
| "loss/hidden": 0.2255859375, |
| "loss/logits": 0.01696862932294607, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.8027577718098958, |
| "learning_rate": 2e-05, |
| "loss": 0.2482, |
| "loss/crossentropy": 2.1597548127174377, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.019656311720609665, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 1.1930867513020833, |
| "learning_rate": 2e-05, |
| "loss": 0.2546, |
| "loss/crossentropy": 0.7966546472162008, |
| "loss/hidden": 0.24609375, |
| "loss/logits": 0.008532016014214605, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.377, |
| "grad_norm": 1.25, |
| "grad_norm_var": 1.2246070861816407, |
| "learning_rate": 2e-05, |
| "loss": 0.2394, |
| "loss/crossentropy": 1.730500340461731, |
| "loss/hidden": 0.22314453125, |
| "loss/logits": 0.016217158176004887, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.378, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 1.210729726155599, |
| "learning_rate": 2e-05, |
| "loss": 0.2672, |
| "loss/crossentropy": 2.0575554966926575, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.02009457629173994, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.379, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 1.4280181884765626, |
| "learning_rate": 2e-05, |
| "loss": 0.3649, |
| "loss/crossentropy": 2.409613251686096, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.034814249724149704, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 1.3563140869140624, |
| "learning_rate": 2e-05, |
| "loss": 0.2651, |
| "loss/crossentropy": 1.4721761345863342, |
| "loss/hidden": 0.2490234375, |
| "loss/logits": 0.016095119062811136, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.381, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 1.2842750549316406, |
| "learning_rate": 2e-05, |
| "loss": 0.2538, |
| "loss/crossentropy": 2.51900315284729, |
| "loss/hidden": 0.2314453125, |
| "loss/logits": 0.022326381877064705, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.382, |
| "grad_norm": 2.234375, |
| "grad_norm_var": 1.2151995340983073, |
| "learning_rate": 2e-05, |
| "loss": 0.2743, |
| "loss/crossentropy": 2.030519187450409, |
| "loss/hidden": 0.2548828125, |
| "loss/logits": 0.01944338995963335, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.383, |
| "grad_norm": 3.859375, |
| "grad_norm_var": 1.1054583231608073, |
| "learning_rate": 2e-05, |
| "loss": 0.3105, |
| "loss/crossentropy": 0.7516276463866234, |
| "loss/hidden": 0.2978515625, |
| "loss/logits": 0.012636175146326423, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 1.1055620829264323, |
| "learning_rate": 2e-05, |
| "loss": 0.2867, |
| "loss/crossentropy": 1.9317356944084167, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.023075740784406662, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 1.0917884826660156, |
| "learning_rate": 2e-05, |
| "loss": 0.3261, |
| "loss/crossentropy": 2.1155296564102173, |
| "loss/hidden": 0.2998046875, |
| "loss/logits": 0.02629261091351509, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.386, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 1.1014312744140624, |
| "learning_rate": 2e-05, |
| "loss": 0.287, |
| "loss/crossentropy": 2.1998232007026672, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.021336179226636887, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.387, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 1.0915992736816407, |
| "learning_rate": 2e-05, |
| "loss": 0.2608, |
| "loss/crossentropy": 1.9437836408615112, |
| "loss/hidden": 0.2412109375, |
| "loss/logits": 0.019607914611697197, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.388, |
| "grad_norm": 2.125, |
| "grad_norm_var": 1.0605812072753906, |
| "learning_rate": 2e-05, |
| "loss": 0.2871, |
| "loss/crossentropy": 1.7142232656478882, |
| "loss/hidden": 0.2666015625, |
| "loss/logits": 0.020461218431591988, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.389, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 1.0809977213541666, |
| "learning_rate": 2e-05, |
| "loss": 0.2863, |
| "loss/crossentropy": 2.236941933631897, |
| "loss/hidden": 0.2626953125, |
| "loss/logits": 0.023648610338568687, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 3.125, |
| "grad_norm_var": 1.0853248596191407, |
| "learning_rate": 2e-05, |
| "loss": 0.2733, |
| "loss/crossentropy": 1.2834028005599976, |
| "loss/hidden": 0.2607421875, |
| "loss/logits": 0.01257804874330759, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.391, |
| "grad_norm": 1.5, |
| "grad_norm_var": 1.0390787760416667, |
| "learning_rate": 2e-05, |
| "loss": 0.3026, |
| "loss/crossentropy": 1.5867803692817688, |
| "loss/hidden": 0.2822265625, |
| "loss/logits": 0.020396556705236435, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.7292439778645833, |
| "learning_rate": 2e-05, |
| "loss": 0.297, |
| "loss/crossentropy": 1.4337636232376099, |
| "loss/hidden": 0.2783203125, |
| "loss/logits": 0.01866168435662985, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.393, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.6845052083333333, |
| "learning_rate": 2e-05, |
| "loss": 0.2642, |
| "loss/crossentropy": 2.1386572122573853, |
| "loss/hidden": 0.24462890625, |
| "loss/logits": 0.019583708606660366, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.394, |
| "grad_norm": 2.9375, |
| "grad_norm_var": 0.710375722249349, |
| "learning_rate": 2e-05, |
| "loss": 0.3313, |
| "loss/crossentropy": 1.936402440071106, |
| "loss/hidden": 0.3046875, |
| "loss/logits": 0.026638174429535866, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 0.4642567952473958, |
| "learning_rate": 2e-05, |
| "loss": 0.2699, |
| "loss/crossentropy": 2.2741682529449463, |
| "loss/hidden": 0.248046875, |
| "loss/logits": 0.021812792867422104, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.396, |
| "grad_norm": 4.84375, |
| "grad_norm_var": 0.9248687744140625, |
| "learning_rate": 2e-05, |
| "loss": 0.3035, |
| "loss/crossentropy": 1.1322659850120544, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.01252604997716844, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.397, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 0.8462562561035156, |
| "learning_rate": 2e-05, |
| "loss": 0.3108, |
| "loss/crossentropy": 1.358659565448761, |
| "loss/hidden": 0.2900390625, |
| "loss/logits": 0.02074052207171917, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.398, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 0.862939198811849, |
| "learning_rate": 2e-05, |
| "loss": 0.3, |
| "loss/crossentropy": 1.9806614518165588, |
| "loss/hidden": 0.2783203125, |
| "loss/logits": 0.02170161809772253, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.399, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 0.706591796875, |
| "learning_rate": 2e-05, |
| "loss": 0.2984, |
| "loss/crossentropy": 2.3857691287994385, |
| "loss/hidden": 0.2744140625, |
| "loss/logits": 0.023968273773789406, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.9140625, |
| "grad_norm_var": 0.7121620178222656, |
| "learning_rate": 2e-05, |
| "loss": 0.2732, |
| "loss/crossentropy": 2.006265163421631, |
| "loss/hidden": 0.2509765625, |
| "loss/logits": 0.02220850996673107, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.401, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 0.7215810139973958, |
| "learning_rate": 2e-05, |
| "loss": 0.2935, |
| "loss/crossentropy": 1.7221473455429077, |
| "loss/hidden": 0.275390625, |
| "loss/logits": 0.018067960627377033, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.402, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 0.7123146057128906, |
| "learning_rate": 2e-05, |
| "loss": 0.2923, |
| "loss/crossentropy": 2.0756383538246155, |
| "loss/hidden": 0.275390625, |
| "loss/logits": 0.016928995959460735, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.403, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.7353993733723958, |
| "learning_rate": 2e-05, |
| "loss": 0.2972, |
| "loss/crossentropy": 1.6683465242385864, |
| "loss/hidden": 0.2783203125, |
| "loss/logits": 0.018839839845895767, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.404, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 0.7447987874348958, |
| "learning_rate": 2e-05, |
| "loss": 0.2966, |
| "loss/crossentropy": 1.737410545349121, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.02023144531995058, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.7762163798014323, |
| "learning_rate": 2e-05, |
| "loss": 0.2855, |
| "loss/crossentropy": 2.2183534502983093, |
| "loss/hidden": 0.26513671875, |
| "loss/logits": 0.02036190778017044, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.406, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.7329465230305989, |
| "learning_rate": 2e-05, |
| "loss": 0.3193, |
| "loss/crossentropy": 1.8786720633506775, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.024385149590671062, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.407, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.7329465230305989, |
| "learning_rate": 2e-05, |
| "loss": 0.3099, |
| "loss/crossentropy": 1.8731706738471985, |
| "loss/hidden": 0.2861328125, |
| "loss/logits": 0.023721362464129925, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.408, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 0.714214833577474, |
| "learning_rate": 2e-05, |
| "loss": 0.2993, |
| "loss/crossentropy": 2.0363497734069824, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.02292494662106037, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.409, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 0.7343544006347656, |
| "learning_rate": 2e-05, |
| "loss": 0.2919, |
| "loss/crossentropy": 1.7596482038497925, |
| "loss/hidden": 0.2705078125, |
| "loss/logits": 0.021396052092313766, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 0.7240577697753906, |
| "learning_rate": 2e-05, |
| "loss": 0.3154, |
| "loss/crossentropy": 1.080414205789566, |
| "loss/hidden": 0.29736328125, |
| "loss/logits": 0.018078335095196962, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.411, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 0.73785400390625, |
| "learning_rate": 2e-05, |
| "loss": 0.2928, |
| "loss/crossentropy": 2.527972936630249, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.02323300577700138, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.412, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 0.18848851521809895, |
| "learning_rate": 2e-05, |
| "loss": 0.2989, |
| "loss/crossentropy": 1.5808929204940796, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.01763766910880804, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.413, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 0.1557037353515625, |
| "learning_rate": 2e-05, |
| "loss": 0.3052, |
| "loss/crossentropy": 2.073564648628235, |
| "loss/hidden": 0.2841796875, |
| "loss/logits": 0.021017897874116898, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.414, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.15574951171875, |
| "learning_rate": 2e-05, |
| "loss": 0.3341, |
| "loss/crossentropy": 1.5968445539474487, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.023572119884192944, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.15465469360351564, |
| "learning_rate": 2e-05, |
| "loss": 0.3319, |
| "loss/crossentropy": 2.13019335269928, |
| "loss/hidden": 0.3037109375, |
| "loss/logits": 0.028160166926681995, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 0.15405044555664063, |
| "learning_rate": 2e-05, |
| "loss": 0.2928, |
| "loss/crossentropy": 1.3558663129806519, |
| "loss/hidden": 0.2744140625, |
| "loss/logits": 0.018423012923449278, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.417, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 0.1642242431640625, |
| "learning_rate": 2e-05, |
| "loss": 0.3349, |
| "loss/crossentropy": 1.556907832622528, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.0243788855150342, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.418, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 0.1344879150390625, |
| "learning_rate": 2e-05, |
| "loss": 0.293, |
| "loss/crossentropy": 2.18166720867157, |
| "loss/hidden": 0.2705078125, |
| "loss/logits": 0.022501694969832897, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.419, |
| "grad_norm": 5.0, |
| "grad_norm_var": 0.7930084228515625, |
| "learning_rate": 2e-05, |
| "loss": 0.306, |
| "loss/crossentropy": 1.875123679637909, |
| "loss/hidden": 0.2841796875, |
| "loss/logits": 0.021816120482981205, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 2.0, |
| "grad_norm_var": 0.7917633056640625, |
| "learning_rate": 2e-05, |
| "loss": 0.3207, |
| "loss/crossentropy": 2.1878353357315063, |
| "loss/hidden": 0.29296875, |
| "loss/logits": 0.027718784287571907, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.421, |
| "grad_norm": 2.5, |
| "grad_norm_var": 0.7763160705566406, |
| "learning_rate": 2e-05, |
| "loss": 0.3106, |
| "loss/crossentropy": 2.46438992023468, |
| "loss/hidden": 0.2841796875, |
| "loss/logits": 0.026430321857333183, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.422, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 0.7701576232910157, |
| "learning_rate": 2e-05, |
| "loss": 0.2847, |
| "loss/crossentropy": 1.991809368133545, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.019083392806351185, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.423, |
| "grad_norm": 2.421875, |
| "grad_norm_var": 0.7565935770670573, |
| "learning_rate": 2e-05, |
| "loss": 0.415, |
| "loss/crossentropy": 1.6859049797058105, |
| "loss/hidden": 0.3818359375, |
| "loss/logits": 0.03313039615750313, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.424, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.7589800516764323, |
| "learning_rate": 2e-05, |
| "loss": 0.3098, |
| "loss/crossentropy": 1.8961586952209473, |
| "loss/hidden": 0.2900390625, |
| "loss/logits": 0.019725864753127098, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.7438547770182292, |
| "learning_rate": 2e-05, |
| "loss": 0.3427, |
| "loss/crossentropy": 2.085192084312439, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.026326753199100494, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.426, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.705224609375, |
| "learning_rate": 2e-05, |
| "loss": 0.3321, |
| "loss/crossentropy": 1.912731111049652, |
| "loss/hidden": 0.3076171875, |
| "loss/logits": 0.02450721152126789, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.427, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 0.6918108622233073, |
| "learning_rate": 2e-05, |
| "loss": 0.3396, |
| "loss/crossentropy": 2.1176230907440186, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.029072879813611507, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.428, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 0.6852617899576823, |
| "learning_rate": 2e-05, |
| "loss": 0.318, |
| "loss/crossentropy": 2.351975202560425, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.026953624561429024, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.429, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.6734690348307292, |
| "learning_rate": 2e-05, |
| "loss": 0.4069, |
| "loss/crossentropy": 1.6036078929901123, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.03581710997968912, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 0.667138671875, |
| "learning_rate": 2e-05, |
| "loss": 0.3472, |
| "loss/crossentropy": 1.881849765777588, |
| "loss/hidden": 0.3232421875, |
| "loss/logits": 0.023961665108799934, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.431, |
| "grad_norm": 3.625, |
| "grad_norm_var": 0.77403564453125, |
| "learning_rate": 2e-05, |
| "loss": 0.3121, |
| "loss/crossentropy": 2.3671000599861145, |
| "loss/hidden": 0.2900390625, |
| "loss/logits": 0.022101588547229767, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 0.7627866109212239, |
| "learning_rate": 2e-05, |
| "loss": 0.3151, |
| "loss/crossentropy": 1.1575224101543427, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.016257786191999912, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.433, |
| "grad_norm": 5.40625, |
| "grad_norm_var": 1.3478289286295573, |
| "learning_rate": 2e-05, |
| "loss": 0.3283, |
| "loss/crossentropy": 1.3821857124567032, |
| "loss/hidden": 0.3115234375, |
| "loss/logits": 0.016785149462521076, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.434, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 1.3182634989420572, |
| "learning_rate": 2e-05, |
| "loss": 0.3499, |
| "loss/crossentropy": 1.4704007506370544, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.02373607736080885, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 1.7890625, |
| "grad_norm_var": 0.9163736979166667, |
| "learning_rate": 2e-05, |
| "loss": 0.3453, |
| "loss/crossentropy": 1.7521992325782776, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.023045840673148632, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.436, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 0.9093251546223958, |
| "learning_rate": 2e-05, |
| "loss": 0.3079, |
| "loss/crossentropy": 1.4147529304027557, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.01587154157459736, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.437, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 0.9289784749348958, |
| "learning_rate": 2e-05, |
| "loss": 0.3572, |
| "loss/crossentropy": 2.1589527130126953, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.027110325172543526, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.438, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 0.9336751302083334, |
| "learning_rate": 2e-05, |
| "loss": 0.3112, |
| "loss/crossentropy": 2.0695826411247253, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.022175450809299946, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.439, |
| "grad_norm": 8.6875, |
| "grad_norm_var": 3.472150675455729, |
| "learning_rate": 2e-05, |
| "loss": 0.3174, |
| "loss/crossentropy": 2.715834140777588, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.02542768605053425, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 3.4516398111979165, |
| "learning_rate": 2e-05, |
| "loss": 0.3531, |
| "loss/crossentropy": 2.089130699634552, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.026951050385832787, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.441, |
| "grad_norm": 5.6875, |
| "grad_norm_var": 3.8860979715983075, |
| "learning_rate": 2e-05, |
| "loss": 0.352, |
| "loss/crossentropy": 1.6687681376934052, |
| "loss/hidden": 0.3330078125, |
| "loss/logits": 0.018973306752741337, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.442, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 3.941239420572917, |
| "learning_rate": 2e-05, |
| "loss": 0.354, |
| "loss/crossentropy": 1.4019538760185242, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.021962410770356655, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.443, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 3.8729509989420574, |
| "learning_rate": 2e-05, |
| "loss": 0.3591, |
| "loss/crossentropy": 2.068819046020508, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.03100405167788267, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.444, |
| "grad_norm": 2.625, |
| "grad_norm_var": 3.7484527587890626, |
| "learning_rate": 2e-05, |
| "loss": 0.3207, |
| "loss/crossentropy": 1.2215966582298279, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.014033652492798865, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 3.7149943033854167, |
| "learning_rate": 2e-05, |
| "loss": 0.2843, |
| "loss/crossentropy": 0.8393277078866959, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.010860613780096173, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.446, |
| "grad_norm": 3.6875, |
| "grad_norm_var": 3.7072184244791666, |
| "learning_rate": 2e-05, |
| "loss": 0.3369, |
| "loss/crossentropy": 0.8106656819581985, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.01267361780628562, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.447, |
| "grad_norm": 4.28125, |
| "grad_norm_var": 3.774466959635417, |
| "learning_rate": 2e-05, |
| "loss": 0.3246, |
| "loss/crossentropy": 1.0552468746900558, |
| "loss/hidden": 0.3095703125, |
| "loss/logits": 0.015042064245790243, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 3.749592081705729, |
| "learning_rate": 2e-05, |
| "loss": 0.3734, |
| "loss/crossentropy": 2.4344149827957153, |
| "loss/hidden": 0.3427734375, |
| "loss/logits": 0.030597456730902195, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.449, |
| "grad_norm": 3.984375, |
| "grad_norm_var": 3.4621622721354166, |
| "learning_rate": 2e-05, |
| "loss": 0.3036, |
| "loss/crossentropy": 1.054320715367794, |
| "loss/hidden": 0.28857421875, |
| "loss/logits": 0.014980267733335495, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 3.5083513895670575, |
| "learning_rate": 2e-05, |
| "loss": 0.3366, |
| "loss/crossentropy": 2.0155181288719177, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.02600990142673254, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.451, |
| "grad_norm": 2.0, |
| "grad_norm_var": 3.4738199869791666, |
| "learning_rate": 2e-05, |
| "loss": 0.3511, |
| "loss/crossentropy": 1.755088448524475, |
| "loss/hidden": 0.3271484375, |
| "loss/logits": 0.023935355246067047, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.452, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 3.4946329752604166, |
| "learning_rate": 2e-05, |
| "loss": 0.3499, |
| "loss/crossentropy": 1.7622599005699158, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.023745747283101082, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.453, |
| "grad_norm": 1.7890625, |
| "grad_norm_var": 3.4932431538899738, |
| "learning_rate": 2e-05, |
| "loss": 0.3215, |
| "loss/crossentropy": 2.3116530179977417, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.022703303024172783, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.454, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 3.464989980061849, |
| "learning_rate": 2e-05, |
| "loss": 0.3673, |
| "loss/crossentropy": 1.5556917786598206, |
| "loss/hidden": 0.3408203125, |
| "loss/logits": 0.026494111865758896, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 2.0, |
| "grad_norm_var": 1.3033078511555989, |
| "learning_rate": 2e-05, |
| "loss": 0.3715, |
| "loss/crossentropy": 1.7844219207763672, |
| "loss/hidden": 0.345703125, |
| "loss/logits": 0.02580021321773529, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.456, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 1.2765439351399739, |
| "learning_rate": 2e-05, |
| "loss": 0.448, |
| "loss/crossentropy": 1.2347650527954102, |
| "loss/hidden": 0.4150390625, |
| "loss/logits": 0.0329879354685545, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.457, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.7350563049316406, |
| "learning_rate": 2e-05, |
| "loss": 0.3455, |
| "loss/crossentropy": 1.9715585112571716, |
| "loss/hidden": 0.318359375, |
| "loss/logits": 0.02718514297157526, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.458, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 0.7471616109212239, |
| "learning_rate": 2e-05, |
| "loss": 0.3339, |
| "loss/crossentropy": 2.389525294303894, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.025292156264185905, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.459, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 0.8066884358723958, |
| "learning_rate": 2e-05, |
| "loss": 0.3166, |
| "loss/crossentropy": 1.7892733812332153, |
| "loss/hidden": 0.29296875, |
| "loss/logits": 0.023592060431838036, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 0.8243560791015625, |
| "learning_rate": 2e-05, |
| "loss": 0.3353, |
| "loss/crossentropy": 1.9092342853546143, |
| "loss/hidden": 0.3115234375, |
| "loss/logits": 0.02376522123813629, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.461, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.87099609375, |
| "learning_rate": 2e-05, |
| "loss": 0.349, |
| "loss/crossentropy": 1.9013403058052063, |
| "loss/hidden": 0.3251953125, |
| "loss/logits": 0.02381738182157278, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.462, |
| "grad_norm": 2.9375, |
| "grad_norm_var": 0.76396484375, |
| "learning_rate": 2e-05, |
| "loss": 0.3492, |
| "loss/crossentropy": 0.9097070023417473, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.01913693710230291, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.463, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 0.4963287353515625, |
| "learning_rate": 2e-05, |
| "loss": 0.4669, |
| "loss/crossentropy": 1.9413211345672607, |
| "loss/hidden": 0.427734375, |
| "loss/logits": 0.03912976011633873, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 0.47173233032226564, |
| "learning_rate": 2e-05, |
| "loss": 0.3569, |
| "loss/crossentropy": 2.3746496438980103, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.030762989073991776, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 0.21467259724934895, |
| "learning_rate": 2e-05, |
| "loss": 0.3875, |
| "loss/crossentropy": 1.920172929763794, |
| "loss/hidden": 0.359375, |
| "loss/logits": 0.028154666535556316, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.466, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 0.23995768229166667, |
| "learning_rate": 2e-05, |
| "loss": 0.4173, |
| "loss/crossentropy": 2.1804317831993103, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.03448019549250603, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.467, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 0.25349833170572916, |
| "learning_rate": 2e-05, |
| "loss": 0.3635, |
| "loss/crossentropy": 2.1129865646362305, |
| "loss/hidden": 0.3369140625, |
| "loss/logits": 0.026613284833729267, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.468, |
| "grad_norm": 3.4375, |
| "grad_norm_var": 0.37997639973958336, |
| "learning_rate": 2e-05, |
| "loss": 0.3892, |
| "loss/crossentropy": 1.6438812613487244, |
| "loss/hidden": 0.3623046875, |
| "loss/logits": 0.026910429820418358, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.469, |
| "grad_norm": 13.125, |
| "grad_norm_var": 7.936161041259766, |
| "learning_rate": 2e-05, |
| "loss": 0.4187, |
| "loss/crossentropy": 1.8062403798103333, |
| "loss/hidden": 0.3857421875, |
| "loss/logits": 0.03291827440261841, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 7.8641212463378904, |
| "learning_rate": 2e-05, |
| "loss": 0.4157, |
| "loss/crossentropy": 1.2208881378173828, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.02117818035185337, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.471, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 7.8700111389160154, |
| "learning_rate": 2e-05, |
| "loss": 0.3306, |
| "loss/crossentropy": 2.474324107170105, |
| "loss/hidden": 0.3037109375, |
| "loss/logits": 0.026909410022199154, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.472, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 7.860741933186849, |
| "learning_rate": 2e-05, |
| "loss": 0.4071, |
| "loss/crossentropy": 1.8907885551452637, |
| "loss/hidden": 0.3740234375, |
| "loss/logits": 0.03311134688556194, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.473, |
| "grad_norm": 5.40625, |
| "grad_norm_var": 8.053236643473307, |
| "learning_rate": 2e-05, |
| "loss": 0.482, |
| "loss/crossentropy": 1.851112186908722, |
| "loss/hidden": 0.4287109375, |
| "loss/logits": 0.0532735763117671, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.474, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 8.008226521809895, |
| "learning_rate": 2e-05, |
| "loss": 0.4011, |
| "loss/crossentropy": 2.0893144607543945, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.03000558167695999, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 7.936071523030599, |
| "learning_rate": 2e-05, |
| "loss": 0.4086, |
| "loss/crossentropy": 1.692557156085968, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.029658248648047447, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.476, |
| "grad_norm": 1.734375, |
| "grad_norm_var": 7.9510963439941404, |
| "learning_rate": 2e-05, |
| "loss": 0.3369, |
| "loss/crossentropy": 2.7231298685073853, |
| "loss/hidden": 0.3095703125, |
| "loss/logits": 0.027365448884665966, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.477, |
| "grad_norm": 122.5, |
| "grad_norm_var": 895.1761065165202, |
| "learning_rate": 2e-05, |
| "loss": 1.8739, |
| "loss/crossentropy": 1.9931391477584839, |
| "loss/hidden": 1.73828125, |
| "loss/logits": 0.13565433584153652, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.478, |
| "grad_norm": 18.75, |
| "grad_norm_var": 894.2567990620931, |
| "learning_rate": 2e-05, |
| "loss": 0.4467, |
| "loss/crossentropy": 1.0818050801753998, |
| "loss/hidden": 0.423828125, |
| "loss/logits": 0.022886332124471664, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.479, |
| "grad_norm": 1.9609375, |
| "grad_norm_var": 895.3381581624349, |
| "learning_rate": 2e-05, |
| "loss": 0.3744, |
| "loss/crossentropy": 2.382234215736389, |
| "loss/hidden": 0.3447265625, |
| "loss/logits": 0.029717115685343742, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 895.6162839253743, |
| "learning_rate": 2e-05, |
| "loss": 0.3323, |
| "loss/crossentropy": 2.0683305859565735, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.023680799640715122, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.481, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 894.6604733784993, |
| "learning_rate": 2e-05, |
| "loss": 0.3756, |
| "loss/crossentropy": 2.154377818107605, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.02795298583805561, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.482, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 893.9573666890462, |
| "learning_rate": 2e-05, |
| "loss": 0.4124, |
| "loss/crossentropy": 1.9701088666915894, |
| "loss/hidden": 0.3779296875, |
| "loss/logits": 0.03451960347592831, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.483, |
| "grad_norm": 2.5, |
| "grad_norm_var": 893.8991452534993, |
| "learning_rate": 2e-05, |
| "loss": 0.4523, |
| "loss/crossentropy": 0.9486123919487, |
| "loss/hidden": 0.4306640625, |
| "loss/logits": 0.02167674619704485, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.484, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 894.9027565002441, |
| "learning_rate": 2e-05, |
| "loss": 0.3955, |
| "loss/crossentropy": 1.7118502855300903, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.030311796814203262, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 900.7159604390462, |
| "learning_rate": 2e-05, |
| "loss": 0.3914, |
| "loss/crossentropy": 1.7511045932769775, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.02810109406709671, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.486, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 902.0463498433431, |
| "learning_rate": 2e-05, |
| "loss": 0.3893, |
| "loss/crossentropy": 1.9742628931999207, |
| "loss/hidden": 0.3603515625, |
| "loss/logits": 0.028935128822922707, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.487, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 901.28504002889, |
| "learning_rate": 2e-05, |
| "loss": 0.338, |
| "loss/crossentropy": 1.5944682955741882, |
| "loss/hidden": 0.31640625, |
| "loss/logits": 0.02155130822211504, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.488, |
| "grad_norm": 2.0, |
| "grad_norm_var": 902.1965695699056, |
| "learning_rate": 2e-05, |
| "loss": 0.3749, |
| "loss/crossentropy": 2.109809994697571, |
| "loss/hidden": 0.3486328125, |
| "loss/logits": 0.026237317360937595, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.489, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 904.5185605367025, |
| "learning_rate": 2e-05, |
| "loss": 0.3601, |
| "loss/crossentropy": 2.371906280517578, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.0280781090259552, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 2.25, |
| "grad_norm_var": 904.0067481994629, |
| "learning_rate": 2e-05, |
| "loss": 0.3881, |
| "loss/crossentropy": 2.3074965476989746, |
| "loss/hidden": 0.3583984375, |
| "loss/logits": 0.029700559563934803, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.491, |
| "grad_norm": 1.609375, |
| "grad_norm_var": 904.2906532287598, |
| "learning_rate": 2e-05, |
| "loss": 0.3533, |
| "loss/crossentropy": 2.0604811906814575, |
| "loss/hidden": 0.3271484375, |
| "loss/logits": 0.026149596087634563, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.492, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 903.7375221252441, |
| "learning_rate": 2e-05, |
| "loss": 0.3982, |
| "loss/crossentropy": 2.0394086837768555, |
| "loss/hidden": 0.3671875, |
| "loss/logits": 0.030979415401816368, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.493, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 17.239774322509767, |
| "learning_rate": 2e-05, |
| "loss": 0.3721, |
| "loss/crossentropy": 1.992867350578308, |
| "loss/hidden": 0.3447265625, |
| "loss/logits": 0.02732760366052389, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.494, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.24021377563476562, |
| "learning_rate": 2e-05, |
| "loss": 0.3607, |
| "loss/crossentropy": 2.0647668838500977, |
| "loss/hidden": 0.3349609375, |
| "loss/logits": 0.02573198452591896, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 0.3059153238932292, |
| "learning_rate": 2e-05, |
| "loss": 0.4332, |
| "loss/crossentropy": 2.0061678886413574, |
| "loss/hidden": 0.4033203125, |
| "loss/logits": 0.029847824946045876, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 0.30953776041666664, |
| "learning_rate": 2e-05, |
| "loss": 0.3677, |
| "loss/crossentropy": 2.029963493347168, |
| "loss/hidden": 0.3408203125, |
| "loss/logits": 0.026841914281249046, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.497, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 0.3045074462890625, |
| "learning_rate": 2e-05, |
| "loss": 0.3773, |
| "loss/crossentropy": 1.836094081401825, |
| "loss/hidden": 0.3505859375, |
| "loss/logits": 0.026703315787017345, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.498, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 0.24739761352539064, |
| "learning_rate": 2e-05, |
| "loss": 0.3934, |
| "loss/crossentropy": 2.284022331237793, |
| "loss/hidden": 0.36328125, |
| "loss/logits": 0.030102317221462727, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.499, |
| "grad_norm": 1.609375, |
| "grad_norm_var": 0.25783462524414064, |
| "learning_rate": 2e-05, |
| "loss": 0.422, |
| "loss/crossentropy": 1.7640503644943237, |
| "loss/hidden": 0.388671875, |
| "loss/logits": 0.03330034948885441, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 0.2490618387858073, |
| "learning_rate": 2e-05, |
| "loss": 0.4409, |
| "loss/crossentropy": 1.4432637095451355, |
| "loss/hidden": 0.4130859375, |
| "loss/logits": 0.027862844988703728, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|