{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 250, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 0.000537872314453125, "learning_rate": 2.0000000000000002e-07, "loss": 0.0002, "loss/crossentropy": 0.8766392022371292, "loss/hidden": 0.0, "loss/logits": 0.00021765431665698998, "step": 1 }, { "epoch": 0.002, "grad_norm": 0.2265625, "learning_rate": 4.0000000000000003e-07, "loss": 0.005, "loss/crossentropy": 1.9883175492286682, "loss/hidden": 0.0039215087890625, "loss/logits": 0.001088879187591374, "step": 2 }, { "epoch": 0.003, "grad_norm": 0.25390625, "learning_rate": 6.000000000000001e-07, "loss": 0.0052, "loss/crossentropy": 1.8020615577697754, "loss/hidden": 0.004180908203125, "loss/logits": 0.0010398300073575228, "step": 3 }, { "epoch": 0.004, "grad_norm": 0.255859375, "learning_rate": 8.000000000000001e-07, "loss": 0.0049, "loss/crossentropy": 1.0764193534851074, "loss/hidden": 0.00399017333984375, "loss/logits": 0.0008995172393042594, "step": 4 }, { "epoch": 0.005, "grad_norm": 0.224609375, "learning_rate": 1.0000000000000002e-06, "loss": 0.0049, "loss/crossentropy": 1.7853868007659912, "loss/hidden": 0.0038604736328125, "loss/logits": 0.0010730837238952518, "step": 5 }, { "epoch": 0.006, "grad_norm": 0.2333984375, "learning_rate": 1.2000000000000002e-06, "loss": 0.0051, "loss/crossentropy": 2.4102118015289307, "loss/hidden": 0.00388336181640625, "loss/logits": 0.0011915687937289476, "step": 6 }, { "epoch": 0.007, "grad_norm": 0.35546875, "learning_rate": 1.4000000000000001e-06, "loss": 0.0056, "loss/crossentropy": 1.9921993017196655, "loss/hidden": 0.0044403076171875, "loss/logits": 0.0011139529524371028, "step": 7 }, { "epoch": 0.008, "grad_norm": 0.2353515625, "learning_rate": 1.6000000000000001e-06, "loss": 0.0049, "loss/crossentropy": 2.269957184791565, "loss/hidden": 0.00376129150390625, "loss/logits": 0.0011444001575000584, "step": 8 }, { "epoch": 0.009, "grad_norm": 0.22265625, "learning_rate": 1.8000000000000001e-06, "loss": 0.0051, "loss/crossentropy": 2.1889681220054626, "loss/hidden": 0.0038909912109375, "loss/logits": 0.0011716101435013115, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.291015625, "learning_rate": 2.0000000000000003e-06, "loss": 0.0052, "loss/crossentropy": 1.76205712556839, "loss/hidden": 0.0041351318359375, "loss/logits": 0.001058999594533816, "step": 10 }, { "epoch": 0.011, "grad_norm": 0.2177734375, "learning_rate": 2.2e-06, "loss": 0.0049, "loss/crossentropy": 2.438264012336731, "loss/hidden": 0.003753662109375, "loss/logits": 0.0011843050015158951, "step": 11 }, { "epoch": 0.012, "grad_norm": 0.41015625, "learning_rate": 2.4000000000000003e-06, "loss": 0.0071, "loss/crossentropy": 1.8871825337409973, "loss/hidden": 0.0059051513671875, "loss/logits": 0.0011930759064853191, "step": 12 }, { "epoch": 0.013, "grad_norm": 0.53125, "learning_rate": 2.6e-06, "loss": 0.0084, "loss/crossentropy": 1.7400972247123718, "loss/hidden": 0.0071258544921875, "loss/logits": 0.001270102453418076, "step": 13 }, { "epoch": 0.014, "grad_norm": 0.365234375, "learning_rate": 2.8000000000000003e-06, "loss": 0.0075, "loss/crossentropy": 2.0053656101226807, "loss/hidden": 0.006256103515625, "loss/logits": 0.0012446122709661722, "step": 14 }, { "epoch": 0.015, "grad_norm": 0.455078125, "learning_rate": 3e-06, "loss": 0.0072, "loss/crossentropy": 1.984630048274994, "loss/hidden": 0.0059356689453125, "loss/logits": 0.0012947238283231854, "step": 15 }, { "epoch": 0.016, "grad_norm": 0.447265625, "grad_norm_var": 0.016307008621940136, "learning_rate": 3.2000000000000003e-06, "loss": 0.0072, "loss/crossentropy": 2.4732788801193237, "loss/hidden": 0.005767822265625, "loss/logits": 0.00144299550447613, "step": 16 }, { "epoch": 0.017, "grad_norm": 0.89453125, "grad_norm_var": 0.031113270918528238, "learning_rate": 3.4000000000000005e-06, "loss": 0.0076, "loss/crossentropy": 1.7775737643241882, "loss/hidden": 0.006317138671875, "loss/logits": 0.001260987774003297, "step": 17 }, { "epoch": 0.018, "grad_norm": 0.45703125, "grad_norm_var": 0.030601243178049724, "learning_rate": 3.6000000000000003e-06, "loss": 0.0067, "loss/crossentropy": 1.1123631671071053, "loss/hidden": 0.0057373046875, "loss/logits": 0.0009507400100119412, "step": 18 }, { "epoch": 0.019, "grad_norm": 0.298828125, "grad_norm_var": 0.030057998498280843, "learning_rate": 3.8000000000000005e-06, "loss": 0.0068, "loss/crossentropy": 1.8855515718460083, "loss/hidden": 0.0055694580078125, "loss/logits": 0.0012491169618442655, "step": 19 }, { "epoch": 0.02, "grad_norm": 0.3984375, "grad_norm_var": 0.02918777068456014, "learning_rate": 4.000000000000001e-06, "loss": 0.007, "loss/crossentropy": 1.773246705532074, "loss/hidden": 0.005828857421875, "loss/logits": 0.0011664124322123826, "step": 20 }, { "epoch": 0.021, "grad_norm": 0.302734375, "grad_norm_var": 0.02797787586847941, "learning_rate": 4.2000000000000004e-06, "loss": 0.0069, "loss/crossentropy": 2.1012651920318604, "loss/hidden": 0.0056610107421875, "loss/logits": 0.0012796117807738483, "step": 21 }, { "epoch": 0.022, "grad_norm": 0.486328125, "grad_norm_var": 0.026955906550089517, "learning_rate": 4.4e-06, "loss": 0.0101, "loss/crossentropy": 1.9430513381958008, "loss/hidden": 0.008514404296875, "loss/logits": 0.0016175230266526341, "step": 22 }, { "epoch": 0.023, "grad_norm": 0.609375, "grad_norm_var": 0.029542907079060873, "learning_rate": 4.600000000000001e-06, "loss": 0.0118, "loss/crossentropy": 1.5989271998405457, "loss/hidden": 0.01025390625, "loss/logits": 0.0015109491650946438, "step": 23 }, { "epoch": 0.024, "grad_norm": 0.80078125, "grad_norm_var": 0.03606090148289998, "learning_rate": 4.800000000000001e-06, "loss": 0.0102, "loss/crossentropy": 1.141058474779129, "loss/hidden": 0.009033203125, "loss/logits": 0.0011210083321202546, "step": 24 }, { "epoch": 0.025, "grad_norm": 0.361328125, "grad_norm_var": 0.03307259480158488, "learning_rate": 5e-06, "loss": 0.0094, "loss/crossentropy": 2.0950170755386353, "loss/hidden": 0.0077972412109375, "loss/logits": 0.001559894997626543, "step": 25 }, { "epoch": 0.026, "grad_norm": 0.83984375, "grad_norm_var": 0.0396828293800354, "learning_rate": 5.2e-06, "loss": 0.0112, "loss/crossentropy": 0.9552253857254982, "loss/hidden": 0.010284423828125, "loss/logits": 0.0008805262332316488, "step": 26 }, { "epoch": 0.027, "grad_norm": 0.546875, "grad_norm_var": 0.034408044815063474, "learning_rate": 5.400000000000001e-06, "loss": 0.0091, "loss/crossentropy": 1.3719437271356583, "loss/hidden": 0.007965087890625, "loss/logits": 0.001155910431407392, "step": 27 }, { "epoch": 0.028, "grad_norm": 0.73046875, "grad_norm_var": 0.036436065038045244, "learning_rate": 5.600000000000001e-06, "loss": 0.0107, "loss/crossentropy": 1.6477643251419067, "loss/hidden": 0.009185791015625, "loss/logits": 0.0015593590214848518, "step": 28 }, { "epoch": 0.029, "grad_norm": 0.41796875, "grad_norm_var": 0.03726207415262858, "learning_rate": 5.8e-06, "loss": 0.0096, "loss/crossentropy": 1.7987680435180664, "loss/hidden": 0.008087158203125, "loss/logits": 0.0015162223717197776, "step": 29 }, { "epoch": 0.03, "grad_norm": 0.33203125, "grad_norm_var": 0.03804162343343099, "learning_rate": 6e-06, "loss": 0.0094, "loss/crossentropy": 1.74210923910141, "loss/hidden": 0.008026123046875, "loss/logits": 0.0013514517340809107, "step": 30 }, { "epoch": 0.031, "grad_norm": 0.4296875, "grad_norm_var": 0.038314167658487955, "learning_rate": 6.200000000000001e-06, "loss": 0.0095, "loss/crossentropy": 1.45715793967247, "loss/hidden": 0.0081329345703125, "loss/logits": 0.0013754194369539618, "step": 31 }, { "epoch": 0.032, "grad_norm": 0.54296875, "grad_norm_var": 0.03793176015218099, "learning_rate": 6.4000000000000006e-06, "loss": 0.0137, "loss/crossentropy": 1.635874330997467, "loss/hidden": 0.01190185546875, "loss/logits": 0.0017871989402920008, "step": 32 }, { "epoch": 0.033, "grad_norm": 0.76171875, "grad_norm_var": 0.03254489898681641, "learning_rate": 6.600000000000001e-06, "loss": 0.0143, "loss/crossentropy": 1.0347481966018677, "loss/hidden": 0.01300048828125, "loss/logits": 0.0012789819156751037, "step": 33 }, { "epoch": 0.034, "grad_norm": 0.515625, "grad_norm_var": 0.032269287109375, "learning_rate": 6.800000000000001e-06, "loss": 0.0132, "loss/crossentropy": 2.0032879114151, "loss/hidden": 0.011383056640625, "loss/logits": 0.0018645224627107382, "step": 34 }, { "epoch": 0.035, "grad_norm": 1.0703125, "grad_norm_var": 0.04636419614156087, "learning_rate": 7e-06, "loss": 0.0143, "loss/crossentropy": 1.8410796523094177, "loss/hidden": 0.01226806640625, "loss/logits": 0.001986370305530727, "step": 35 }, { "epoch": 0.036, "grad_norm": 0.4296875, "grad_norm_var": 0.045703490575154625, "learning_rate": 7.2000000000000005e-06, "loss": 0.0136, "loss/crossentropy": 1.9098870158195496, "loss/hidden": 0.01171875, "loss/logits": 0.0018596722511574626, "step": 36 }, { "epoch": 0.037, "grad_norm": 68.0, "grad_norm_var": 284.03319854736327, "learning_rate": 7.4e-06, "loss": 0.0558, "loss/crossentropy": 1.5951663255691528, "loss/hidden": 0.051666259765625, "loss/logits": 0.004160793498158455, "step": 37 }, { "epoch": 0.038, "grad_norm": 0.380859375, "grad_norm_var": 284.0946207046509, "learning_rate": 7.600000000000001e-06, "loss": 0.0133, "loss/crossentropy": 2.25837504863739, "loss/hidden": 0.01129150390625, "loss/logits": 0.0020168160554021597, "step": 38 }, { "epoch": 0.039, "grad_norm": 0.455078125, "grad_norm_var": 284.1822828769684, "learning_rate": 7.800000000000002e-06, "loss": 0.0126, "loss/crossentropy": 2.126526176929474, "loss/hidden": 0.0107421875, "loss/logits": 0.0018400833941996098, "step": 39 }, { "epoch": 0.04, "grad_norm": 0.63671875, "grad_norm_var": 284.27119545936586, "learning_rate": 8.000000000000001e-06, "loss": 0.0142, "loss/crossentropy": 1.4863142371177673, "loss/hidden": 0.012481689453125, "loss/logits": 0.0017027563299052417, "step": 40 }, { "epoch": 0.041, "grad_norm": 0.283203125, "grad_norm_var": 284.3175859928131, "learning_rate": 8.2e-06, "loss": 0.0112, "loss/crossentropy": 2.0888695120811462, "loss/hidden": 0.009521484375, "loss/logits": 0.0017255974235013127, "step": 41 }, { "epoch": 0.042, "grad_norm": 0.431640625, "grad_norm_var": 284.5420877456665, "learning_rate": 8.400000000000001e-06, "loss": 0.0173, "loss/crossentropy": 1.611488163471222, "loss/hidden": 0.015380859375, "loss/logits": 0.0019445380312390625, "step": 42 }, { "epoch": 0.043, "grad_norm": 0.419921875, "grad_norm_var": 284.6142045180003, "learning_rate": 8.6e-06, "loss": 0.0166, "loss/crossentropy": 1.8987411260604858, "loss/hidden": 0.0146484375, "loss/logits": 0.0019467678503133357, "step": 43 }, { "epoch": 0.044, "grad_norm": 0.58203125, "grad_norm_var": 284.6949343204498, "learning_rate": 8.8e-06, "loss": 0.0183, "loss/crossentropy": 1.4084473848342896, "loss/hidden": 0.01605224609375, "loss/logits": 0.002271471545100212, "step": 44 }, { "epoch": 0.045, "grad_norm": 0.380859375, "grad_norm_var": 284.71635888417563, "learning_rate": 9e-06, "loss": 0.0159, "loss/crossentropy": 1.6970309615135193, "loss/hidden": 0.01397705078125, "loss/logits": 0.0019325784523971379, "step": 45 }, { "epoch": 0.046, "grad_norm": 0.455078125, "grad_norm_var": 284.64517935117084, "learning_rate": 9.200000000000002e-06, "loss": 0.0165, "loss/crossentropy": 2.1346731781959534, "loss/hidden": 0.014312744140625, "loss/logits": 0.002142712823115289, "step": 46 }, { "epoch": 0.047, "grad_norm": 2.21875, "grad_norm_var": 283.818000014623, "learning_rate": 9.4e-06, "loss": 0.0175, "loss/crossentropy": 1.6114214062690735, "loss/hidden": 0.0155029296875, "loss/logits": 0.0020421514636836946, "step": 47 }, { "epoch": 0.048, "grad_norm": 0.44921875, "grad_norm_var": 283.87235945065817, "learning_rate": 9.600000000000001e-06, "loss": 0.0157, "loss/crossentropy": 2.056842625141144, "loss/hidden": 0.013671875, "loss/logits": 0.0020451846066862345, "step": 48 }, { "epoch": 0.049, "grad_norm": 0.439453125, "grad_norm_var": 284.05417149861654, "learning_rate": 9.800000000000001e-06, "loss": 0.016, "loss/crossentropy": 1.5892411470413208, "loss/hidden": 0.013946533203125, "loss/logits": 0.00205704930704087, "step": 49 }, { "epoch": 0.05, "grad_norm": 0.3359375, "grad_norm_var": 284.15935770670575, "learning_rate": 1e-05, "loss": 0.0153, "loss/crossentropy": 2.3872954845428467, "loss/hidden": 0.01312255859375, "loss/logits": 0.0021313573233783245, "step": 50 }, { "epoch": 0.051, "grad_norm": 0.451171875, "grad_norm_var": 284.49208029111225, "learning_rate": 1.02e-05, "loss": 0.0168, "loss/crossentropy": 2.0149841904640198, "loss/hidden": 0.01470947265625, "loss/logits": 0.0020815907046198845, "step": 51 }, { "epoch": 0.052, "grad_norm": 0.51953125, "grad_norm_var": 284.44056928952534, "learning_rate": 1.04e-05, "loss": 0.021, "loss/crossentropy": 1.9311216473579407, "loss/hidden": 0.0185546875, "loss/logits": 0.0024686548858880997, "step": 52 }, { "epoch": 0.053, "grad_norm": 0.546875, "grad_norm_var": 0.20315702756245932, "learning_rate": 1.0600000000000002e-05, "loss": 0.0204, "loss/crossentropy": 1.9871841073036194, "loss/hidden": 0.01806640625, "loss/logits": 0.00237347767688334, "step": 53 }, { "epoch": 0.054, "grad_norm": 0.51171875, "grad_norm_var": 0.2010729471842448, "learning_rate": 1.0800000000000002e-05, "loss": 0.0195, "loss/crossentropy": 1.4909774661064148, "loss/hidden": 0.017578125, "loss/logits": 0.0018839699332602322, "step": 54 }, { "epoch": 0.055, "grad_norm": 0.376953125, "grad_norm_var": 0.20264968872070313, "learning_rate": 1.1000000000000001e-05, "loss": 0.0188, "loss/crossentropy": 1.731587290763855, "loss/hidden": 0.01666259765625, "loss/logits": 0.0021363290725275874, "step": 55 }, { "epoch": 0.056, "grad_norm": 0.482421875, "grad_norm_var": 0.20266098976135255, "learning_rate": 1.1200000000000001e-05, "loss": 0.0198, "loss/crossentropy": 1.8391692638397217, "loss/hidden": 0.01751708984375, "loss/logits": 0.0022706754971295595, "step": 56 }, { "epoch": 0.057, "grad_norm": 0.82421875, "grad_norm_var": 0.20132694244384766, "learning_rate": 1.14e-05, "loss": 0.0181, "loss/crossentropy": 1.326266534626484, "loss/hidden": 0.01654052734375, "loss/logits": 0.0015604346699547023, "step": 57 }, { "epoch": 0.058, "grad_norm": 0.41015625, "grad_norm_var": 0.2018068790435791, "learning_rate": 1.16e-05, "loss": 0.0185, "loss/crossentropy": 2.5511186122894287, "loss/hidden": 0.01611328125, "loss/logits": 0.0024241225328296423, "step": 58 }, { "epoch": 0.059, "grad_norm": 1.609375, "grad_norm_var": 0.26361236572265623, "learning_rate": 1.18e-05, "loss": 0.0183, "loss/crossentropy": 1.0930684125050902, "loss/hidden": 0.01702880859375, "loss/logits": 0.0013018156460020691, "step": 59 }, { "epoch": 0.06, "grad_norm": 0.486328125, "grad_norm_var": 0.2652066389719645, "learning_rate": 1.2e-05, "loss": 0.02, "loss/crossentropy": 2.0819135308265686, "loss/hidden": 0.0174560546875, "loss/logits": 0.0025293552316725254, "step": 60 }, { "epoch": 0.061, "grad_norm": 1.09375, "grad_norm_var": 0.2708051045735677, "learning_rate": 1.22e-05, "loss": 0.0183, "loss/crossentropy": 0.9290539920330048, "loss/hidden": 0.016754150390625, "loss/logits": 0.0015562092885375023, "step": 61 }, { "epoch": 0.062, "grad_norm": 0.453125, "grad_norm_var": 0.2708693027496338, "learning_rate": 1.2400000000000002e-05, "loss": 0.0227, "loss/crossentropy": 2.1691651344299316, "loss/hidden": 0.01995849609375, "loss/logits": 0.002767750178463757, "step": 62 }, { "epoch": 0.063, "grad_norm": 0.4765625, "grad_norm_var": 0.10790785153706868, "learning_rate": 1.2600000000000001e-05, "loss": 0.0233, "loss/crossentropy": 2.1545491218566895, "loss/hidden": 0.0205078125, "loss/logits": 0.002785824006423354, "step": 63 }, { "epoch": 0.064, "grad_norm": 0.47265625, "grad_norm_var": 0.10749700864156088, "learning_rate": 1.2800000000000001e-05, "loss": 0.0223, "loss/crossentropy": 1.9527725577354431, "loss/hidden": 0.01971435546875, "loss/logits": 0.0025634407065808773, "step": 64 }, { "epoch": 0.065, "grad_norm": 0.55078125, "grad_norm_var": 0.10599034627278646, "learning_rate": 1.3000000000000001e-05, "loss": 0.0256, "loss/crossentropy": 1.8496606945991516, "loss/hidden": 0.02288818359375, "loss/logits": 0.0027499888092279434, "step": 65 }, { "epoch": 0.066, "grad_norm": 0.55859375, "grad_norm_var": 0.1012465794881185, "learning_rate": 1.3200000000000002e-05, "loss": 0.0221, "loss/crossentropy": 1.9440131187438965, "loss/hidden": 0.01971435546875, "loss/logits": 0.002431391447316855, "step": 66 }, { "epoch": 0.067, "grad_norm": 0.498046875, "grad_norm_var": 0.10036614735921225, "learning_rate": 1.3400000000000002e-05, "loss": 0.0241, "loss/crossentropy": 1.7777947187423706, "loss/hidden": 0.02142333984375, "loss/logits": 0.0026856372132897377, "step": 67 }, { "epoch": 0.068, "grad_norm": 0.66015625, "grad_norm_var": 0.09977563222249348, "learning_rate": 1.3600000000000002e-05, "loss": 0.0241, "loss/crossentropy": 1.6634170711040497, "loss/hidden": 0.02178955078125, "loss/logits": 0.002268874435685575, "step": 68 }, { "epoch": 0.069, "grad_norm": 0.359375, "grad_norm_var": 0.1039443333943685, "learning_rate": 1.38e-05, "loss": 0.0217, "loss/crossentropy": 1.9945446252822876, "loss/hidden": 0.019287109375, "loss/logits": 0.0024602848570793867, "step": 69 }, { "epoch": 0.07, "grad_norm": 0.546875, "grad_norm_var": 0.10354207356770834, "learning_rate": 1.4e-05, "loss": 0.0212, "loss/crossentropy": 2.234881281852722, "loss/hidden": 0.0185546875, "loss/logits": 0.0026649613864719868, "step": 70 }, { "epoch": 0.071, "grad_norm": 0.5390625, "grad_norm_var": 0.1000130812327067, "learning_rate": 1.4200000000000001e-05, "loss": 0.0235, "loss/crossentropy": 2.3283374309539795, "loss/hidden": 0.0206298828125, "loss/logits": 0.0028440920868888497, "step": 71 }, { "epoch": 0.072, "grad_norm": 0.96484375, "grad_norm_var": 0.10530134836832682, "learning_rate": 1.4400000000000001e-05, "loss": 0.0273, "loss/crossentropy": 2.446515917778015, "loss/hidden": 0.0244140625, "loss/logits": 0.002847215859219432, "step": 72 }, { "epoch": 0.073, "grad_norm": 0.66015625, "grad_norm_var": 0.10331465403238932, "learning_rate": 1.46e-05, "loss": 0.0313, "loss/crossentropy": 1.8365015387535095, "loss/hidden": 0.0277099609375, "loss/logits": 0.003543111262843013, "step": 73 }, { "epoch": 0.074, "grad_norm": 0.58203125, "grad_norm_var": 0.0997507095336914, "learning_rate": 1.48e-05, "loss": 0.0275, "loss/crossentropy": 1.8750606179237366, "loss/hidden": 0.0244140625, "loss/logits": 0.0030850095208734274, "step": 74 }, { "epoch": 0.075, "grad_norm": 0.6171875, "grad_norm_var": 0.03528436024983724, "learning_rate": 1.5000000000000002e-05, "loss": 0.0285, "loss/crossentropy": 1.6197695136070251, "loss/hidden": 0.02557373046875, "loss/logits": 0.002948817447759211, "step": 75 }, { "epoch": 0.076, "grad_norm": 0.5546875, "grad_norm_var": 0.034586191177368164, "learning_rate": 1.5200000000000002e-05, "loss": 0.0253, "loss/crossentropy": 2.139370322227478, "loss/hidden": 0.0225830078125, "loss/logits": 0.002709153341129422, "step": 76 }, { "epoch": 0.077, "grad_norm": 0.78125, "grad_norm_var": 0.020085255304972332, "learning_rate": 1.54e-05, "loss": 0.0308, "loss/crossentropy": 1.5335928797721863, "loss/hidden": 0.02777099609375, "loss/logits": 0.00305762467905879, "step": 77 }, { "epoch": 0.078, "grad_norm": 0.5078125, "grad_norm_var": 0.019349145889282226, "learning_rate": 1.5600000000000003e-05, "loss": 0.0273, "loss/crossentropy": 2.623558282852173, "loss/hidden": 0.024169921875, "loss/logits": 0.0031643210677430034, "step": 78 }, { "epoch": 0.079, "grad_norm": 0.470703125, "grad_norm_var": 0.019434547424316405, "learning_rate": 1.58e-05, "loss": 0.0275, "loss/crossentropy": 2.3246337175369263, "loss/hidden": 0.0242919921875, "loss/logits": 0.0031679703388363123, "step": 79 }, { "epoch": 0.08, "grad_norm": 0.431640625, "grad_norm_var": 0.0201418399810791, "learning_rate": 1.6000000000000003e-05, "loss": 0.0254, "loss/crossentropy": 1.801970660686493, "loss/hidden": 0.0228271484375, "loss/logits": 0.0025987064000219107, "step": 80 }, { "epoch": 0.081, "grad_norm": 0.44921875, "grad_norm_var": 0.021184905370076498, "learning_rate": 1.62e-05, "loss": 0.0265, "loss/crossentropy": 1.9489317536354065, "loss/hidden": 0.02374267578125, "loss/logits": 0.0027701087528839707, "step": 81 }, { "epoch": 0.082, "grad_norm": 0.67578125, "grad_norm_var": 0.02180479367574056, "learning_rate": 1.64e-05, "loss": 0.034, "loss/crossentropy": 1.7697851061820984, "loss/hidden": 0.03070068359375, "loss/logits": 0.003283574478700757, "step": 82 }, { "epoch": 0.083, "grad_norm": 0.57421875, "grad_norm_var": 0.021323140462239584, "learning_rate": 1.66e-05, "loss": 0.0309, "loss/crossentropy": 1.5783970654010773, "loss/hidden": 0.028076171875, "loss/logits": 0.002809713245369494, "step": 83 }, { "epoch": 0.084, "grad_norm": 0.53125, "grad_norm_var": 0.02108605702718099, "learning_rate": 1.6800000000000002e-05, "loss": 0.0332, "loss/crossentropy": 1.460361659526825, "loss/hidden": 0.0303955078125, "loss/logits": 0.0027706819819286466, "step": 84 }, { "epoch": 0.085, "grad_norm": 0.6015625, "grad_norm_var": 0.017696062723795574, "learning_rate": 1.7e-05, "loss": 0.0324, "loss/crossentropy": 2.1110434532165527, "loss/hidden": 0.02911376953125, "loss/logits": 0.0033112409291788936, "step": 85 }, { "epoch": 0.086, "grad_norm": 0.451171875, "grad_norm_var": 0.018857304255167642, "learning_rate": 1.72e-05, "loss": 0.0291, "loss/crossentropy": 1.7163687944412231, "loss/hidden": 0.02630615234375, "loss/logits": 0.0027680074563249946, "step": 86 }, { "epoch": 0.087, "grad_norm": 0.5703125, "grad_norm_var": 0.018718449274698894, "learning_rate": 1.7400000000000003e-05, "loss": 0.0339, "loss/crossentropy": 1.8893783688545227, "loss/hidden": 0.03021240234375, "loss/logits": 0.0037144168745726347, "step": 87 }, { "epoch": 0.088, "grad_norm": 1.75, "grad_norm_var": 0.0965951124827067, "learning_rate": 1.76e-05, "loss": 0.0293, "loss/crossentropy": 1.0857177823781967, "loss/hidden": 0.02716064453125, "loss/logits": 0.002114512084517628, "step": 88 }, { "epoch": 0.089, "grad_norm": 0.4609375, "grad_norm_var": 0.09848872820536296, "learning_rate": 1.7800000000000002e-05, "loss": 0.0278, "loss/crossentropy": 2.1670188307762146, "loss/hidden": 0.0250244140625, "loss/logits": 0.0027708488050848246, "step": 89 }, { "epoch": 0.09, "grad_norm": 2.984375, "grad_norm_var": 0.4452332655588786, "learning_rate": 1.8e-05, "loss": 0.034, "loss/crossentropy": 0.8697951380163431, "loss/hidden": 0.0322265625, "loss/logits": 0.0017659573932178319, "step": 90 }, { "epoch": 0.091, "grad_norm": 0.58984375, "grad_norm_var": 0.44585811297098793, "learning_rate": 1.8200000000000002e-05, "loss": 0.0315, "loss/crossentropy": 2.0653520226478577, "loss/hidden": 0.02813720703125, "loss/logits": 0.003313788794912398, "step": 91 }, { "epoch": 0.092, "grad_norm": 0.66015625, "grad_norm_var": 0.44346858660380045, "learning_rate": 1.8400000000000003e-05, "loss": 0.0352, "loss/crossentropy": 2.1175276041030884, "loss/hidden": 0.0318603515625, "loss/logits": 0.003378898836672306, "step": 92 }, { "epoch": 0.093, "grad_norm": 0.478515625, "grad_norm_var": 0.44917195638020835, "learning_rate": 1.86e-05, "loss": 0.0328, "loss/crossentropy": 2.192784309387207, "loss/hidden": 0.029296875, "loss/logits": 0.003497788915410638, "step": 93 }, { "epoch": 0.094, "grad_norm": 0.50390625, "grad_norm_var": 0.4493051528930664, "learning_rate": 1.88e-05, "loss": 0.0342, "loss/crossentropy": 1.8000940680503845, "loss/hidden": 0.0308837890625, "loss/logits": 0.003295119386166334, "step": 94 }, { "epoch": 0.095, "grad_norm": 0.86328125, "grad_norm_var": 0.44371743202209474, "learning_rate": 1.9e-05, "loss": 0.0376, "loss/crossentropy": 1.9514374732971191, "loss/hidden": 0.0340576171875, "loss/logits": 0.0035327656660228968, "step": 95 }, { "epoch": 0.096, "grad_norm": 0.55859375, "grad_norm_var": 0.4387262980143229, "learning_rate": 1.9200000000000003e-05, "loss": 0.0334, "loss/crossentropy": 1.7834157943725586, "loss/hidden": 0.03021240234375, "loss/logits": 0.003167669870890677, "step": 96 }, { "epoch": 0.097, "grad_norm": 0.71484375, "grad_norm_var": 0.4309270222981771, "learning_rate": 1.94e-05, "loss": 0.0327, "loss/crossentropy": 1.6889591813087463, "loss/hidden": 0.02972412109375, "loss/logits": 0.0029616469983011484, "step": 97 }, { "epoch": 0.098, "grad_norm": 0.56640625, "grad_norm_var": 0.4336400349934896, "learning_rate": 1.9600000000000002e-05, "loss": 0.0354, "loss/crossentropy": 1.7813147902488708, "loss/hidden": 0.031982421875, "loss/logits": 0.003417789936065674, "step": 98 }, { "epoch": 0.099, "grad_norm": 0.9140625, "grad_norm_var": 0.43045953114827473, "learning_rate": 1.98e-05, "loss": 0.0376, "loss/crossentropy": 1.3951178789138794, "loss/hidden": 0.0345458984375, "loss/logits": 0.0030310061993077397, "step": 99 }, { "epoch": 0.1, "grad_norm": 0.56640625, "grad_norm_var": 0.4291600545247396, "learning_rate": 2e-05, "loss": 0.0364, "loss/crossentropy": 2.255498170852661, "loss/hidden": 0.03277587890625, "loss/logits": 0.0036420804681256413, "step": 100 }, { "epoch": 0.101, "grad_norm": 0.58984375, "grad_norm_var": 0.429521115620931, "learning_rate": 2e-05, "loss": 0.033, "loss/crossentropy": 2.4104394912719727, "loss/hidden": 0.02960205078125, "loss/logits": 0.0033488960471004248, "step": 101 }, { "epoch": 0.102, "grad_norm": 4.8125, "grad_norm_var": 1.4001366774241129, "learning_rate": 2e-05, "loss": 0.0477, "loss/crossentropy": 1.0830636993050575, "loss/hidden": 0.0452880859375, "loss/logits": 0.0023841604124754667, "step": 102 }, { "epoch": 0.103, "grad_norm": 4.1875, "grad_norm_var": 1.9629084110260009, "learning_rate": 2e-05, "loss": 0.0475, "loss/crossentropy": 0.7437883876264095, "loss/hidden": 0.0455322265625, "loss/logits": 0.0019981139339506626, "step": 103 }, { "epoch": 0.104, "grad_norm": 0.77734375, "grad_norm_var": 1.9669294834136963, "learning_rate": 2e-05, "loss": 0.0387, "loss/crossentropy": 2.1284059882164, "loss/hidden": 0.0345458984375, "loss/logits": 0.00411223981063813, "step": 104 }, { "epoch": 0.105, "grad_norm": 1.6796875, "grad_norm_var": 1.92922043800354, "learning_rate": 2e-05, "loss": 0.0459, "loss/crossentropy": 2.1119471192359924, "loss/hidden": 0.0411376953125, "loss/logits": 0.0047579677775502205, "step": 105 }, { "epoch": 0.106, "grad_norm": 0.90234375, "grad_norm_var": 1.7437895298004151, "learning_rate": 2e-05, "loss": 0.044, "loss/crossentropy": 2.391239643096924, "loss/hidden": 0.0390625, "loss/logits": 0.004930721828714013, "step": 106 }, { "epoch": 0.107, "grad_norm": 1.6875, "grad_norm_var": 1.7282822767893473, "learning_rate": 2e-05, "loss": 0.0451, "loss/crossentropy": 1.7602136731147766, "loss/hidden": 0.040283203125, "loss/logits": 0.004797366913408041, "step": 107 }, { "epoch": 0.108, "grad_norm": 0.8828125, "grad_norm_var": 1.7130108992258708, "learning_rate": 2e-05, "loss": 0.0428, "loss/crossentropy": 2.0745638012886047, "loss/hidden": 0.0386962890625, "loss/logits": 0.004113797098398209, "step": 108 }, { "epoch": 0.109, "grad_norm": 0.82421875, "grad_norm_var": 1.6829447428385416, "learning_rate": 2e-05, "loss": 0.0422, "loss/crossentropy": 1.685157299041748, "loss/hidden": 0.03857421875, "loss/logits": 0.0036494951928034425, "step": 109 }, { "epoch": 0.11, "grad_norm": 1.5703125, "grad_norm_var": 1.6387715021769205, "learning_rate": 2e-05, "loss": 0.0376, "loss/crossentropy": 2.625019073486328, "loss/hidden": 0.03369140625, "loss/logits": 0.0039150441298261285, "step": 110 }, { "epoch": 0.111, "grad_norm": 1.5234375, "grad_norm_var": 1.6204302469889322, "learning_rate": 2e-05, "loss": 0.0422, "loss/crossentropy": 0.676440417766571, "loss/hidden": 0.0401611328125, "loss/logits": 0.0020512532209977508, "step": 111 }, { "epoch": 0.112, "grad_norm": 0.65234375, "grad_norm_var": 1.6101824442545574, "learning_rate": 2e-05, "loss": 0.0479, "loss/crossentropy": 1.8928841352462769, "loss/hidden": 0.0435791015625, "loss/logits": 0.00434900657273829, "step": 112 }, { "epoch": 0.113, "grad_norm": 1.09375, "grad_norm_var": 1.5831150690714517, "learning_rate": 2e-05, "loss": 0.0498, "loss/crossentropy": 1.2006176710128784, "loss/hidden": 0.04638671875, "loss/logits": 0.0034257903462275863, "step": 113 }, { "epoch": 0.114, "grad_norm": 0.84375, "grad_norm_var": 1.5551775614420573, "learning_rate": 2e-05, "loss": 0.0437, "loss/crossentropy": 2.164067029953003, "loss/hidden": 0.03955078125, "loss/logits": 0.004164737183600664, "step": 114 }, { "epoch": 0.115, "grad_norm": 0.875, "grad_norm_var": 1.5581644694010417, "learning_rate": 2e-05, "loss": 0.0469, "loss/crossentropy": 1.963140070438385, "loss/hidden": 0.0419921875, "loss/logits": 0.004867425188422203, "step": 115 }, { "epoch": 0.116, "grad_norm": 0.83984375, "grad_norm_var": 1.530010732014974, "learning_rate": 2e-05, "loss": 0.0469, "loss/crossentropy": 1.936423420906067, "loss/hidden": 0.04248046875, "loss/logits": 0.004457900300621986, "step": 116 }, { "epoch": 0.117, "grad_norm": 1.0, "grad_norm_var": 1.4916320164998373, "learning_rate": 2e-05, "loss": 0.044, "loss/crossentropy": 1.9027796387672424, "loss/hidden": 0.0396728515625, "loss/logits": 0.004306067014113069, "step": 117 }, { "epoch": 0.118, "grad_norm": 0.921875, "grad_norm_var": 0.724272092183431, "learning_rate": 2e-05, "loss": 0.048, "loss/crossentropy": 1.4962169528007507, "loss/hidden": 0.043212890625, "loss/logits": 0.004831232130527496, "step": 118 }, { "epoch": 0.119, "grad_norm": 1.3046875, "grad_norm_var": 0.12087090810139973, "learning_rate": 2e-05, "loss": 0.0458, "loss/crossentropy": 1.8558754324913025, "loss/hidden": 0.04150390625, "loss/logits": 0.004260358400642872, "step": 119 }, { "epoch": 0.12, "grad_norm": 0.7421875, "grad_norm_var": 0.12239583333333333, "learning_rate": 2e-05, "loss": 0.0467, "loss/crossentropy": 2.163163900375366, "loss/hidden": 0.042236328125, "loss/logits": 0.0044949238654226065, "step": 120 }, { "epoch": 0.121, "grad_norm": 0.66796875, "grad_norm_var": 0.10601139068603516, "learning_rate": 2e-05, "loss": 0.0429, "loss/crossentropy": 1.875292718410492, "loss/hidden": 0.0389404296875, "loss/logits": 0.003972187405452132, "step": 121 }, { "epoch": 0.122, "grad_norm": 0.97265625, "grad_norm_var": 0.1052103042602539, "learning_rate": 2e-05, "loss": 0.0504, "loss/crossentropy": 1.581692636013031, "loss/hidden": 0.0462646484375, "loss/logits": 0.0040856958366930485, "step": 122 }, { "epoch": 0.123, "grad_norm": 0.77734375, "grad_norm_var": 0.07660497029622396, "learning_rate": 2e-05, "loss": 0.0467, "loss/crossentropy": 2.185007333755493, "loss/hidden": 0.0419921875, "loss/logits": 0.0047312104143202305, "step": 123 }, { "epoch": 0.124, "grad_norm": 0.70703125, "grad_norm_var": 0.08053887685139974, "learning_rate": 2e-05, "loss": 0.0527, "loss/crossentropy": 1.7746418118476868, "loss/hidden": 0.0482177734375, "loss/logits": 0.004488097038120031, "step": 124 }, { "epoch": 0.125, "grad_norm": 0.82421875, "grad_norm_var": 0.08053887685139974, "learning_rate": 2e-05, "loss": 0.0483, "loss/crossentropy": 1.8139249682426453, "loss/hidden": 0.044189453125, "loss/logits": 0.00407675513997674, "step": 125 }, { "epoch": 0.126, "grad_norm": 0.80078125, "grad_norm_var": 0.05464986165364583, "learning_rate": 2e-05, "loss": 0.0536, "loss/crossentropy": 1.8078742623329163, "loss/hidden": 0.0489501953125, "loss/logits": 0.004657944664359093, "step": 126 }, { "epoch": 0.127, "grad_norm": 1.09375, "grad_norm_var": 0.030997467041015626, "learning_rate": 2e-05, "loss": 0.0496, "loss/crossentropy": 2.0267322659492493, "loss/hidden": 0.0447998046875, "loss/logits": 0.0047590641770511866, "step": 127 }, { "epoch": 0.128, "grad_norm": 0.85546875, "grad_norm_var": 0.027347564697265625, "learning_rate": 2e-05, "loss": 0.0587, "loss/crossentropy": 1.6603793501853943, "loss/hidden": 0.052978515625, "loss/logits": 0.005712392507120967, "step": 128 }, { "epoch": 0.129, "grad_norm": 5.375, "grad_norm_var": 1.286358388264974, "learning_rate": 2e-05, "loss": 0.0577, "loss/crossentropy": 0.8844976872205734, "loss/hidden": 0.0550537109375, "loss/logits": 0.0026012896560132504, "step": 129 }, { "epoch": 0.13, "grad_norm": 0.94140625, "grad_norm_var": 1.2828027725219726, "learning_rate": 2e-05, "loss": 0.0532, "loss/crossentropy": 2.151723265647888, "loss/hidden": 0.04833984375, "loss/logits": 0.0048982377629727125, "step": 130 }, { "epoch": 0.131, "grad_norm": 0.92578125, "grad_norm_var": 1.280975341796875, "learning_rate": 2e-05, "loss": 0.048, "loss/crossentropy": 2.190707802772522, "loss/hidden": 0.0435791015625, "loss/logits": 0.004458446754142642, "step": 131 }, { "epoch": 0.132, "grad_norm": 0.73828125, "grad_norm_var": 1.2861162821451824, "learning_rate": 2e-05, "loss": 0.0562, "loss/crossentropy": 2.0854132175445557, "loss/hidden": 0.0511474609375, "loss/logits": 0.005020990269258618, "step": 132 }, { "epoch": 0.133, "grad_norm": 0.6796875, "grad_norm_var": 1.299598185221354, "learning_rate": 2e-05, "loss": 0.0509, "loss/crossentropy": 2.0993438959121704, "loss/hidden": 0.046142578125, "loss/logits": 0.004787095822393894, "step": 133 }, { "epoch": 0.134, "grad_norm": 0.96875, "grad_norm_var": 1.2983378092447917, "learning_rate": 2e-05, "loss": 0.0491, "loss/crossentropy": 2.2328933477401733, "loss/hidden": 0.0445556640625, "loss/logits": 0.004536583088338375, "step": 134 }, { "epoch": 0.135, "grad_norm": 1.0625, "grad_norm_var": 1.2969581604003906, "learning_rate": 2e-05, "loss": 0.0638, "loss/crossentropy": 1.9981300234794617, "loss/hidden": 0.0579833984375, "loss/logits": 0.00582107319496572, "step": 135 }, { "epoch": 0.136, "grad_norm": 0.6796875, "grad_norm_var": 1.3004615783691407, "learning_rate": 2e-05, "loss": 0.0542, "loss/crossentropy": 2.1993343830108643, "loss/hidden": 0.049072265625, "loss/logits": 0.005134769715368748, "step": 136 }, { "epoch": 0.137, "grad_norm": 3.5, "grad_norm_var": 1.627500343322754, "learning_rate": 2e-05, "loss": 0.0595, "loss/crossentropy": 1.469780683517456, "loss/hidden": 0.0552978515625, "loss/logits": 0.0042177007999271154, "step": 137 }, { "epoch": 0.138, "grad_norm": 0.87109375, "grad_norm_var": 1.632664426167806, "learning_rate": 2e-05, "loss": 0.0554, "loss/crossentropy": 1.8814529180526733, "loss/hidden": 0.0506591796875, "loss/logits": 0.004711252404376864, "step": 138 }, { "epoch": 0.139, "grad_norm": 0.9140625, "grad_norm_var": 1.62430419921875, "learning_rate": 2e-05, "loss": 0.0542, "loss/crossentropy": 1.9769226908683777, "loss/hidden": 0.049560546875, "loss/logits": 0.004602615023031831, "step": 139 }, { "epoch": 0.14, "grad_norm": 1.296875, "grad_norm_var": 1.5987385431925456, "learning_rate": 2e-05, "loss": 0.0562, "loss/crossentropy": 1.3646953105926514, "loss/hidden": 0.0516357421875, "loss/logits": 0.0045162534806877375, "step": 140 }, { "epoch": 0.141, "grad_norm": 0.91796875, "grad_norm_var": 1.592772356669108, "learning_rate": 2e-05, "loss": 0.0586, "loss/crossentropy": 1.5901939272880554, "loss/hidden": 0.0538330078125, "loss/logits": 0.004788138438016176, "step": 141 }, { "epoch": 0.142, "grad_norm": 1.109375, "grad_norm_var": 1.5760719299316406, "learning_rate": 2e-05, "loss": 0.0686, "loss/crossentropy": 1.8436982035636902, "loss/hidden": 0.062744140625, "loss/logits": 0.005897135473787785, "step": 142 }, { "epoch": 0.143, "grad_norm": 1.0, "grad_norm_var": 1.5800819396972656, "learning_rate": 2e-05, "loss": 0.0677, "loss/crossentropy": 1.7922558188438416, "loss/hidden": 0.06103515625, "loss/logits": 0.006622593384236097, "step": 143 }, { "epoch": 0.144, "grad_norm": 1.046875, "grad_norm_var": 1.5693745295206705, "learning_rate": 2e-05, "loss": 0.0626, "loss/crossentropy": 1.8654756546020508, "loss/hidden": 0.05712890625, "loss/logits": 0.005447414005175233, "step": 144 }, { "epoch": 0.145, "grad_norm": 0.8046875, "grad_norm_var": 0.43840071360270183, "learning_rate": 2e-05, "loss": 0.0653, "loss/crossentropy": 2.023370146751404, "loss/hidden": 0.0596923828125, "loss/logits": 0.005567178362980485, "step": 145 }, { "epoch": 0.146, "grad_norm": 1.7265625, "grad_norm_var": 0.4612627665201823, "learning_rate": 2e-05, "loss": 0.0718, "loss/crossentropy": 1.2652358412742615, "loss/hidden": 0.066162109375, "loss/logits": 0.00563872791826725, "step": 146 }, { "epoch": 0.147, "grad_norm": 0.8359375, "grad_norm_var": 0.4643350601196289, "learning_rate": 2e-05, "loss": 0.0579, "loss/crossentropy": 2.181838572025299, "loss/hidden": 0.0528564453125, "loss/logits": 0.0050070807337760925, "step": 147 }, { "epoch": 0.148, "grad_norm": 1.65625, "grad_norm_var": 0.4685035705566406, "learning_rate": 2e-05, "loss": 0.0653, "loss/crossentropy": 1.6760476231575012, "loss/hidden": 0.059814453125, "loss/logits": 0.005448109935969114, "step": 148 }, { "epoch": 0.149, "grad_norm": 0.875, "grad_norm_var": 0.45754903157552085, "learning_rate": 2e-05, "loss": 0.0608, "loss/crossentropy": 1.9610846042633057, "loss/hidden": 0.05517578125, "loss/logits": 0.0055898819118738174, "step": 149 }, { "epoch": 0.15, "grad_norm": 1.2890625, "grad_norm_var": 0.45391006469726564, "learning_rate": 2e-05, "loss": 0.0607, "loss/crossentropy": 2.0354663729667664, "loss/hidden": 0.054931640625, "loss/logits": 0.005750466603785753, "step": 150 }, { "epoch": 0.151, "grad_norm": 0.91015625, "grad_norm_var": 0.4586435317993164, "learning_rate": 2e-05, "loss": 0.061, "loss/crossentropy": 1.5509551763534546, "loss/hidden": 0.05615234375, "loss/logits": 0.00486933346837759, "step": 151 }, { "epoch": 0.152, "grad_norm": 1.8203125, "grad_norm_var": 0.45860640207926434, "learning_rate": 2e-05, "loss": 0.0823, "loss/crossentropy": 1.3190861344337463, "loss/hidden": 0.076171875, "loss/logits": 0.006146557629108429, "step": 152 }, { "epoch": 0.153, "grad_norm": 1.65625, "grad_norm_var": 0.12676741282145182, "learning_rate": 2e-05, "loss": 0.0689, "loss/crossentropy": 2.0075970888137817, "loss/hidden": 0.0626220703125, "loss/logits": 0.0062951259315013885, "step": 153 }, { "epoch": 0.154, "grad_norm": 0.84375, "grad_norm_var": 0.12790629069010417, "learning_rate": 2e-05, "loss": 0.0645, "loss/crossentropy": 2.5025904178619385, "loss/hidden": 0.0584716796875, "loss/logits": 0.005998906912282109, "step": 154 }, { "epoch": 0.155, "grad_norm": 1.75, "grad_norm_var": 0.14317194620768228, "learning_rate": 2e-05, "loss": 0.0673, "loss/crossentropy": 1.7674061059951782, "loss/hidden": 0.0618896484375, "loss/logits": 0.005377188790589571, "step": 155 }, { "epoch": 0.156, "grad_norm": 1.046875, "grad_norm_var": 0.14455540974934897, "learning_rate": 2e-05, "loss": 0.0696, "loss/crossentropy": 1.4891575574874878, "loss/hidden": 0.0640869140625, "loss/logits": 0.005491052754223347, "step": 156 }, { "epoch": 0.157, "grad_norm": 1.0078125, "grad_norm_var": 0.1416147232055664, "learning_rate": 2e-05, "loss": 0.0656, "loss/crossentropy": 1.4295508861541748, "loss/hidden": 0.060546875, "loss/logits": 0.005026416387408972, "step": 157 }, { "epoch": 0.158, "grad_norm": 8.5, "grad_norm_var": 3.4551263809204102, "learning_rate": 2e-05, "loss": 0.1047, "loss/crossentropy": 1.6207728683948517, "loss/hidden": 0.09716796875, "loss/logits": 0.007503823610022664, "step": 158 }, { "epoch": 0.159, "grad_norm": 1.3125, "grad_norm_var": 3.4331842422485352, "learning_rate": 2e-05, "loss": 0.0663, "loss/crossentropy": 1.838720440864563, "loss/hidden": 0.06103515625, "loss/logits": 0.0052408319897949696, "step": 159 }, { "epoch": 0.16, "grad_norm": 1.765625, "grad_norm_var": 3.403587277730306, "learning_rate": 2e-05, "loss": 0.0729, "loss/crossentropy": 1.9572261571884155, "loss/hidden": 0.06640625, "loss/logits": 0.00649917172268033, "step": 160 }, { "epoch": 0.161, "grad_norm": 7.71875, "grad_norm_var": 5.5313720067342125, "learning_rate": 2e-05, "loss": 0.0873, "loss/crossentropy": 0.06751747522503138, "loss/hidden": 0.086181640625, "loss/logits": 0.001096382096875459, "step": 161 }, { "epoch": 0.162, "grad_norm": 1.65625, "grad_norm_var": 5.535835202534994, "learning_rate": 2e-05, "loss": 0.0753, "loss/crossentropy": 1.9767259359359741, "loss/hidden": 0.06884765625, "loss/logits": 0.006433435715734959, "step": 162 }, { "epoch": 0.163, "grad_norm": 1.2734375, "grad_norm_var": 5.470252927144369, "learning_rate": 2e-05, "loss": 0.0742, "loss/crossentropy": 1.6337787508964539, "loss/hidden": 0.068359375, "loss/logits": 0.0058679585345089436, "step": 163 }, { "epoch": 0.164, "grad_norm": 1.171875, "grad_norm_var": 5.519557634989421, "learning_rate": 2e-05, "loss": 0.0791, "loss/crossentropy": 1.5085630416870117, "loss/hidden": 0.0732421875, "loss/logits": 0.00587455416098237, "step": 164 }, { "epoch": 0.165, "grad_norm": 1.328125, "grad_norm_var": 5.454612668355306, "learning_rate": 2e-05, "loss": 0.0733, "loss/crossentropy": 2.1295101046562195, "loss/hidden": 0.0665283203125, "loss/logits": 0.006821601651608944, "step": 165 }, { "epoch": 0.166, "grad_norm": 0.828125, "grad_norm_var": 5.523303159077963, "learning_rate": 2e-05, "loss": 0.0681, "loss/crossentropy": 2.1514192819595337, "loss/hidden": 0.061767578125, "loss/logits": 0.0063285790383815765, "step": 166 }, { "epoch": 0.167, "grad_norm": 0.9140625, "grad_norm_var": 5.522652180989583, "learning_rate": 2e-05, "loss": 0.0799, "loss/crossentropy": 1.907168447971344, "loss/hidden": 0.072509765625, "loss/logits": 0.0073654367588460445, "step": 167 }, { "epoch": 0.168, "grad_norm": 0.70703125, "grad_norm_var": 5.650849850972493, "learning_rate": 2e-05, "loss": 0.0665, "loss/crossentropy": 2.490573525428772, "loss/hidden": 0.0604248046875, "loss/logits": 0.006123463856056333, "step": 168 }, { "epoch": 0.169, "grad_norm": 0.921875, "grad_norm_var": 5.727275530497233, "learning_rate": 2e-05, "loss": 0.0686, "loss/crossentropy": 2.1971182823181152, "loss/hidden": 0.0625, "loss/logits": 0.006081034895032644, "step": 169 }, { "epoch": 0.17, "grad_norm": 0.84375, "grad_norm_var": 5.727275530497233, "learning_rate": 2e-05, "loss": 0.0723, "loss/crossentropy": 1.9449633955955505, "loss/hidden": 0.06591796875, "loss/logits": 0.00633727153763175, "step": 170 }, { "epoch": 0.171, "grad_norm": 0.80078125, "grad_norm_var": 5.8211313883463545, "learning_rate": 2e-05, "loss": 0.0721, "loss/crossentropy": 1.8933625221252441, "loss/hidden": 0.066162109375, "loss/logits": 0.005927694728597999, "step": 171 }, { "epoch": 0.172, "grad_norm": 0.734375, "grad_norm_var": 5.8664194742838545, "learning_rate": 2e-05, "loss": 0.0756, "loss/crossentropy": 2.2961581349372864, "loss/hidden": 0.069091796875, "loss/logits": 0.00650426116771996, "step": 172 }, { "epoch": 0.173, "grad_norm": 1.0859375, "grad_norm_var": 5.856801350911458, "learning_rate": 2e-05, "loss": 0.0876, "loss/crossentropy": 1.5580723285675049, "loss/hidden": 0.080322265625, "loss/logits": 0.00728521216660738, "step": 173 }, { "epoch": 0.174, "grad_norm": 0.87109375, "grad_norm_var": 2.8547820409138995, "learning_rate": 2e-05, "loss": 0.0785, "loss/crossentropy": 2.4996918439865112, "loss/hidden": 0.07080078125, "loss/logits": 0.0076872315257787704, "step": 174 }, { "epoch": 0.175, "grad_norm": 1.09375, "grad_norm_var": 2.863120460510254, "learning_rate": 2e-05, "loss": 0.0842, "loss/crossentropy": 2.341306686401367, "loss/hidden": 0.075927734375, "loss/logits": 0.008260179311037064, "step": 175 }, { "epoch": 0.176, "grad_norm": 1.2734375, "grad_norm_var": 2.859659767150879, "learning_rate": 2e-05, "loss": 0.0839, "loss/crossentropy": 2.0976521968841553, "loss/hidden": 0.075927734375, "loss/logits": 0.007956868037581444, "step": 176 }, { "epoch": 0.177, "grad_norm": 1.6640625, "grad_norm_var": 0.09129581451416016, "learning_rate": 2e-05, "loss": 0.0854, "loss/crossentropy": 1.5655289888381958, "loss/hidden": 0.078857421875, "loss/logits": 0.006505638128146529, "step": 177 }, { "epoch": 0.178, "grad_norm": 0.96484375, "grad_norm_var": 0.06740493774414062, "learning_rate": 2e-05, "loss": 0.0832, "loss/crossentropy": 1.947506844997406, "loss/hidden": 0.076171875, "loss/logits": 0.0070168147794902325, "step": 178 }, { "epoch": 0.179, "grad_norm": 4.5625, "grad_norm_var": 0.8503774007161459, "learning_rate": 2e-05, "loss": 0.0965, "loss/crossentropy": 1.557403326034546, "loss/hidden": 0.087158203125, "loss/logits": 0.009354921989142895, "step": 179 }, { "epoch": 0.18, "grad_norm": 8.3125, "grad_norm_var": 3.9767252604166665, "learning_rate": 2e-05, "loss": 0.1122, "loss/crossentropy": 0.45333431661129, "loss/hidden": 0.109375, "loss/logits": 0.0027967533096671104, "step": 180 }, { "epoch": 0.181, "grad_norm": 1.546875, "grad_norm_var": 3.969405110677083, "learning_rate": 2e-05, "loss": 0.0829, "loss/crossentropy": 2.005882978439331, "loss/hidden": 0.075439453125, "loss/logits": 0.007453362224623561, "step": 181 }, { "epoch": 0.182, "grad_norm": 1.3515625, "grad_norm_var": 3.926006825764974, "learning_rate": 2e-05, "loss": 0.0849, "loss/crossentropy": 2.199571132659912, "loss/hidden": 0.077880859375, "loss/logits": 0.0069826748222112656, "step": 182 }, { "epoch": 0.183, "grad_norm": 1.5703125, "grad_norm_var": 3.8817014058430988, "learning_rate": 2e-05, "loss": 0.0921, "loss/crossentropy": 1.6926537156105042, "loss/hidden": 0.085205078125, "loss/logits": 0.006879956694319844, "step": 183 }, { "epoch": 0.184, "grad_norm": 1.203125, "grad_norm_var": 3.826835568745931, "learning_rate": 2e-05, "loss": 0.0964, "loss/crossentropy": 1.509221613407135, "loss/hidden": 0.087890625, "loss/logits": 0.00847849901765585, "step": 184 }, { "epoch": 0.185, "grad_norm": 0.703125, "grad_norm_var": 3.8554396947224934, "learning_rate": 2e-05, "loss": 0.0788, "loss/crossentropy": 2.4337867498397827, "loss/hidden": 0.072021484375, "loss/logits": 0.0067423065192997456, "step": 185 }, { "epoch": 0.186, "grad_norm": 1.234375, "grad_norm_var": 3.815881284077962, "learning_rate": 2e-05, "loss": 0.0966, "loss/crossentropy": 1.7458332180976868, "loss/hidden": 0.08837890625, "loss/logits": 0.008262162329629064, "step": 186 }, { "epoch": 0.187, "grad_norm": 6.59375, "grad_norm_var": 5.133159383138021, "learning_rate": 2e-05, "loss": 0.0928, "loss/crossentropy": 2.116236627101898, "loss/hidden": 0.0830078125, "loss/logits": 0.00975541677325964, "step": 187 }, { "epoch": 0.188, "grad_norm": 1.8203125, "grad_norm_var": 4.998583730061849, "learning_rate": 2e-05, "loss": 0.0831, "loss/crossentropy": 2.324514389038086, "loss/hidden": 0.075439453125, "loss/logits": 0.007644579978659749, "step": 188 }, { "epoch": 0.189, "grad_norm": 0.796875, "grad_norm_var": 5.048313395182292, "learning_rate": 2e-05, "loss": 0.0867, "loss/crossentropy": 1.9479625821113586, "loss/hidden": 0.0791015625, "loss/logits": 0.0075566458981484175, "step": 189 }, { "epoch": 0.19, "grad_norm": 15.875, "grad_norm_var": 16.414309628804524, "learning_rate": 2e-05, "loss": 0.1592, "loss/crossentropy": 1.5863521695137024, "loss/hidden": 0.1494140625, "loss/logits": 0.009787225630134344, "step": 190 }, { "epoch": 0.191, "grad_norm": 2.046875, "grad_norm_var": 16.208450762430825, "learning_rate": 2e-05, "loss": 0.0784, "loss/crossentropy": 0.8779918029904366, "loss/hidden": 0.073974609375, "loss/logits": 0.004391094436869025, "step": 191 }, { "epoch": 0.192, "grad_norm": 1.375, "grad_norm_var": 16.1827361424764, "learning_rate": 2e-05, "loss": 0.0931, "loss/crossentropy": 2.1567060947418213, "loss/hidden": 0.085693359375, "loss/logits": 0.007449513301253319, "step": 192 }, { "epoch": 0.193, "grad_norm": 0.875, "grad_norm_var": 16.386012204488118, "learning_rate": 2e-05, "loss": 0.0898, "loss/crossentropy": 1.8178179860115051, "loss/hidden": 0.08251953125, "loss/logits": 0.007294924231246114, "step": 193 }, { "epoch": 0.194, "grad_norm": 2.21875, "grad_norm_var": 16.114434560139973, "learning_rate": 2e-05, "loss": 0.1014, "loss/crossentropy": 1.8806178569793701, "loss/hidden": 0.09375, "loss/logits": 0.0076924534514546394, "step": 194 }, { "epoch": 0.195, "grad_norm": 1.8671875, "grad_norm_var": 16.098729451497395, "learning_rate": 2e-05, "loss": 0.1048, "loss/crossentropy": 1.6054936051368713, "loss/hidden": 0.096435546875, "loss/logits": 0.008354771416634321, "step": 195 }, { "epoch": 0.196, "grad_norm": 1.90625, "grad_norm_var": 14.200210571289062, "learning_rate": 2e-05, "loss": 0.0851, "loss/crossentropy": 1.1937458366155624, "loss/hidden": 0.079833984375, "loss/logits": 0.005313969450071454, "step": 196 }, { "epoch": 0.197, "grad_norm": 2.453125, "grad_norm_var": 14.113833618164062, "learning_rate": 2e-05, "loss": 0.1056, "loss/crossentropy": 1.9973903894424438, "loss/hidden": 0.09619140625, "loss/logits": 0.00938287889584899, "step": 197 }, { "epoch": 0.198, "grad_norm": 1.5546875, "grad_norm_var": 14.07872314453125, "learning_rate": 2e-05, "loss": 0.087, "loss/crossentropy": 2.0422087907791138, "loss/hidden": 0.07958984375, "loss/logits": 0.007449948927387595, "step": 198 }, { "epoch": 0.199, "grad_norm": 0.875, "grad_norm_var": 14.218849436442058, "learning_rate": 2e-05, "loss": 0.0908, "loss/crossentropy": 2.040232002735138, "loss/hidden": 0.08349609375, "loss/logits": 0.007334771566092968, "step": 199 }, { "epoch": 0.2, "grad_norm": 3.6875, "grad_norm_var": 14.104658762613932, "learning_rate": 2e-05, "loss": 0.0996, "loss/crossentropy": 1.7977141737937927, "loss/hidden": 0.09130859375, "loss/logits": 0.008285259362310171, "step": 200 }, { "epoch": 0.201, "grad_norm": 1.1640625, "grad_norm_var": 13.984908040364584, "learning_rate": 2e-05, "loss": 0.0923, "loss/crossentropy": 1.960830569267273, "loss/hidden": 0.0849609375, "loss/logits": 0.007373227505013347, "step": 201 }, { "epoch": 0.202, "grad_norm": 1.2109375, "grad_norm_var": 13.99013646443685, "learning_rate": 2e-05, "loss": 0.1063, "loss/crossentropy": 1.5903997421264648, "loss/hidden": 0.098876953125, "loss/logits": 0.007376475026831031, "step": 202 }, { "epoch": 0.203, "grad_norm": 2.015625, "grad_norm_var": 13.0423215230306, "learning_rate": 2e-05, "loss": 0.0958, "loss/crossentropy": 1.1866007596254349, "loss/hidden": 0.0908203125, "loss/logits": 0.0049855056568048894, "step": 203 }, { "epoch": 0.204, "grad_norm": 2.203125, "grad_norm_var": 13.01123046875, "learning_rate": 2e-05, "loss": 0.1001, "loss/crossentropy": 2.016387164592743, "loss/hidden": 0.092529296875, "loss/logits": 0.0076178074814379215, "step": 204 }, { "epoch": 0.205, "grad_norm": 0.98828125, "grad_norm_var": 12.966665585835775, "learning_rate": 2e-05, "loss": 0.1017, "loss/crossentropy": 1.9937080144882202, "loss/hidden": 0.09326171875, "loss/logits": 0.008388462010771036, "step": 205 }, { "epoch": 0.206, "grad_norm": 1.65625, "grad_norm_var": 0.5201679865519205, "learning_rate": 2e-05, "loss": 0.1012, "loss/crossentropy": 1.8353246450424194, "loss/hidden": 0.09326171875, "loss/logits": 0.00795629364438355, "step": 206 }, { "epoch": 0.207, "grad_norm": 1.6875, "grad_norm_var": 0.5143070856730143, "learning_rate": 2e-05, "loss": 0.0918, "loss/crossentropy": 1.0499791204929352, "loss/hidden": 0.08740234375, "loss/logits": 0.004438678151927888, "step": 207 }, { "epoch": 0.208, "grad_norm": 1.0625, "grad_norm_var": 0.5353540420532227, "learning_rate": 2e-05, "loss": 0.107, "loss/crossentropy": 1.8614663481712341, "loss/hidden": 0.09814453125, "loss/logits": 0.008855776861310005, "step": 208 }, { "epoch": 0.209, "grad_norm": 2.390625, "grad_norm_var": 0.5093535741170248, "learning_rate": 2e-05, "loss": 0.1072, "loss/crossentropy": 2.363565683364868, "loss/hidden": 0.096923828125, "loss/logits": 0.010271006729453802, "step": 209 }, { "epoch": 0.21, "grad_norm": 2.171875, "grad_norm_var": 0.5069289525349935, "learning_rate": 2e-05, "loss": 0.1086, "loss/crossentropy": 1.955030083656311, "loss/hidden": 0.099365234375, "loss/logits": 0.0092296302318573, "step": 210 }, { "epoch": 0.211, "grad_norm": 1.2265625, "grad_norm_var": 0.5273447036743164, "learning_rate": 2e-05, "loss": 0.1062, "loss/crossentropy": 1.774095892906189, "loss/hidden": 0.0986328125, "loss/logits": 0.007574398070573807, "step": 211 }, { "epoch": 0.212, "grad_norm": 1.2890625, "grad_norm_var": 0.5396000544230143, "learning_rate": 2e-05, "loss": 0.1117, "loss/crossentropy": 1.8405153155326843, "loss/hidden": 0.10302734375, "loss/logits": 0.008719130419194698, "step": 212 }, { "epoch": 0.213, "grad_norm": 1.40625, "grad_norm_var": 0.5067829767862956, "learning_rate": 2e-05, "loss": 0.1045, "loss/crossentropy": 2.0069875717163086, "loss/hidden": 0.095947265625, "loss/logits": 0.008583055343478918, "step": 213 }, { "epoch": 0.214, "grad_norm": 1.1640625, "grad_norm_var": 0.5219018936157227, "learning_rate": 2e-05, "loss": 0.1103, "loss/crossentropy": 1.670526921749115, "loss/hidden": 0.102294921875, "loss/logits": 0.008038338739424944, "step": 214 }, { "epoch": 0.215, "grad_norm": 1.8828125, "grad_norm_var": 0.48292789459228513, "learning_rate": 2e-05, "loss": 0.1121, "loss/crossentropy": 1.795514464378357, "loss/hidden": 0.103759765625, "loss/logits": 0.008318986743688583, "step": 215 }, { "epoch": 0.216, "grad_norm": 1.1328125, "grad_norm_var": 0.2139871597290039, "learning_rate": 2e-05, "loss": 0.1066, "loss/crossentropy": 2.180332064628601, "loss/hidden": 0.09716796875, "loss/logits": 0.009391986764967442, "step": 216 }, { "epoch": 0.217, "grad_norm": 1.9375, "grad_norm_var": 0.21252689361572266, "learning_rate": 2e-05, "loss": 0.1234, "loss/crossentropy": 1.8504464030265808, "loss/hidden": 0.11181640625, "loss/logits": 0.011583337560296059, "step": 217 }, { "epoch": 0.218, "grad_norm": 1.046875, "grad_norm_var": 0.22248172760009766, "learning_rate": 2e-05, "loss": 0.1098, "loss/crossentropy": 1.6542016863822937, "loss/hidden": 0.101806640625, "loss/logits": 0.007953221211209893, "step": 218 }, { "epoch": 0.219, "grad_norm": 1.1484375, "grad_norm_var": 0.21898136138916016, "learning_rate": 2e-05, "loss": 0.1185, "loss/crossentropy": 1.8401342630386353, "loss/hidden": 0.107421875, "loss/logits": 0.011056106071919203, "step": 219 }, { "epoch": 0.22, "grad_norm": 1.2578125, "grad_norm_var": 0.18931725819905598, "learning_rate": 2e-05, "loss": 0.1082, "loss/crossentropy": 1.8265935778617859, "loss/hidden": 0.09912109375, "loss/logits": 0.009068313986063004, "step": 220 }, { "epoch": 0.221, "grad_norm": 52.25, "grad_norm_var": 161.16229426066081, "learning_rate": 2e-05, "loss": 0.1937, "loss/crossentropy": 1.5437742471694946, "loss/hidden": 0.170654296875, "loss/logits": 0.023064299020916224, "step": 221 }, { "epoch": 0.222, "grad_norm": 2.28125, "grad_norm_var": 160.93560969034831, "learning_rate": 2e-05, "loss": 0.1246, "loss/crossentropy": 1.227450430393219, "loss/hidden": 0.11572265625, "loss/logits": 0.008849140722304583, "step": 222 }, { "epoch": 0.223, "grad_norm": 1.28125, "grad_norm_var": 161.10956192016602, "learning_rate": 2e-05, "loss": 0.1196, "loss/crossentropy": 1.9892451167106628, "loss/hidden": 0.1103515625, "loss/logits": 0.009212612174451351, "step": 223 }, { "epoch": 0.224, "grad_norm": 1.0625, "grad_norm_var": 161.10956192016602, "learning_rate": 2e-05, "loss": 0.1208, "loss/crossentropy": 1.9727575778961182, "loss/hidden": 0.111328125, "loss/logits": 0.009519532322883606, "step": 224 }, { "epoch": 0.225, "grad_norm": 1.9140625, "grad_norm_var": 161.26942443847656, "learning_rate": 2e-05, "loss": 0.1112, "loss/crossentropy": 2.20854651927948, "loss/hidden": 0.1025390625, "loss/logits": 0.008704130537807941, "step": 225 }, { "epoch": 0.226, "grad_norm": 1.703125, "grad_norm_var": 161.43824768066406, "learning_rate": 2e-05, "loss": 0.1249, "loss/crossentropy": 1.8244708180427551, "loss/hidden": 0.115478515625, "loss/logits": 0.009438233450055122, "step": 226 }, { "epoch": 0.227, "grad_norm": 1.9921875, "grad_norm_var": 161.12805989583333, "learning_rate": 2e-05, "loss": 0.1264, "loss/crossentropy": 1.6184683442115784, "loss/hidden": 0.117431640625, "loss/logits": 0.008998575620353222, "step": 227 }, { "epoch": 0.228, "grad_norm": 1.40625, "grad_norm_var": 161.0760617574056, "learning_rate": 2e-05, "loss": 0.1427, "loss/crossentropy": 1.9090940952301025, "loss/hidden": 0.1298828125, "loss/logits": 0.01286676386371255, "step": 228 }, { "epoch": 0.229, "grad_norm": 1.5078125, "grad_norm_var": 161.03238525390626, "learning_rate": 2e-05, "loss": 0.1191, "loss/crossentropy": 1.7622392773628235, "loss/hidden": 0.109619140625, "loss/logits": 0.009484861977398396, "step": 229 }, { "epoch": 0.23, "grad_norm": 1.3671875, "grad_norm_var": 160.93959045410156, "learning_rate": 2e-05, "loss": 0.1185, "loss/crossentropy": 1.7633178234100342, "loss/hidden": 0.109130859375, "loss/logits": 0.009330280125141144, "step": 230 }, { "epoch": 0.231, "grad_norm": 0.98828125, "grad_norm_var": 161.32540073394776, "learning_rate": 2e-05, "loss": 0.1188, "loss/crossentropy": 2.186140298843384, "loss/hidden": 0.108154296875, "loss/logits": 0.010631876531988382, "step": 231 }, { "epoch": 0.232, "grad_norm": 3.28125, "grad_norm_var": 160.60855553944904, "learning_rate": 2e-05, "loss": 0.1224, "loss/crossentropy": 0.8389374911785126, "loss/hidden": 0.1171875, "loss/logits": 0.005214276316110045, "step": 232 }, { "epoch": 0.233, "grad_norm": 1.0703125, "grad_norm_var": 160.98382867177327, "learning_rate": 2e-05, "loss": 0.116, "loss/crossentropy": 2.1515474915504456, "loss/hidden": 0.107421875, "loss/logits": 0.00860951654613018, "step": 233 }, { "epoch": 0.234, "grad_norm": 4.5, "grad_norm_var": 160.03680464426677, "learning_rate": 2e-05, "loss": 0.1312, "loss/crossentropy": 1.6820667684078217, "loss/hidden": 0.123046875, "loss/logits": 0.008124232292175293, "step": 234 }, { "epoch": 0.235, "grad_norm": 2.40625, "grad_norm_var": 159.50010522206625, "learning_rate": 2e-05, "loss": 0.1056, "loss/crossentropy": 0.9079534839838743, "loss/hidden": 0.10107421875, "loss/logits": 0.004542189242783934, "step": 235 }, { "epoch": 0.236, "grad_norm": 0.984375, "grad_norm_var": 159.64182631174722, "learning_rate": 2e-05, "loss": 0.1192, "loss/crossentropy": 2.261181592941284, "loss/hidden": 0.109619140625, "loss/logits": 0.009581252932548523, "step": 236 }, { "epoch": 0.237, "grad_norm": 0.9921875, "grad_norm_var": 0.9261479059855143, "learning_rate": 2e-05, "loss": 0.1281, "loss/crossentropy": 1.9553669095039368, "loss/hidden": 0.116943359375, "loss/logits": 0.011152476072311401, "step": 237 }, { "epoch": 0.238, "grad_norm": 1.640625, "grad_norm_var": 0.9103616714477539, "learning_rate": 2e-05, "loss": 0.1466, "loss/crossentropy": 1.6360890865325928, "loss/hidden": 0.13525390625, "loss/logits": 0.011308418586850166, "step": 238 }, { "epoch": 0.239, "grad_norm": 2.265625, "grad_norm_var": 0.9085992813110352, "learning_rate": 2e-05, "loss": 0.133, "loss/crossentropy": 1.0788212679326534, "loss/hidden": 0.125732421875, "loss/logits": 0.007256039883941412, "step": 239 }, { "epoch": 0.24, "grad_norm": 1.7578125, "grad_norm_var": 0.8688089370727539, "learning_rate": 2e-05, "loss": 0.1296, "loss/crossentropy": 1.6809419393539429, "loss/hidden": 0.119873046875, "loss/logits": 0.009761545807123184, "step": 240 }, { "epoch": 0.241, "grad_norm": 1.4921875, "grad_norm_var": 0.8769525527954102, "learning_rate": 2e-05, "loss": 0.1298, "loss/crossentropy": 2.1073160767555237, "loss/hidden": 0.1201171875, "loss/logits": 0.009713000617921352, "step": 241 }, { "epoch": 0.242, "grad_norm": 3.3125, "grad_norm_var": 1.0105956395467122, "learning_rate": 2e-05, "loss": 0.1851, "loss/crossentropy": 1.7140259146690369, "loss/hidden": 0.168212890625, "loss/logits": 0.01692299358546734, "step": 242 }, { "epoch": 0.243, "grad_norm": 1.3203125, "grad_norm_var": 1.0337132136027019, "learning_rate": 2e-05, "loss": 0.141, "loss/crossentropy": 1.70401269197464, "loss/hidden": 0.13037109375, "loss/logits": 0.010653213132172823, "step": 243 }, { "epoch": 0.244, "grad_norm": 2.015625, "grad_norm_var": 1.0173481623331706, "learning_rate": 2e-05, "loss": 0.1561, "loss/crossentropy": 1.9086145758628845, "loss/hidden": 0.1416015625, "loss/logits": 0.01448416942730546, "step": 244 }, { "epoch": 0.245, "grad_norm": 1.890625, "grad_norm_var": 1.0048868179321289, "learning_rate": 2e-05, "loss": 0.1751, "loss/crossentropy": 1.5015806555747986, "loss/hidden": 0.16064453125, "loss/logits": 0.014442750252783298, "step": 245 }, { "epoch": 0.246, "grad_norm": 1.6796875, "grad_norm_var": 0.9864847183227539, "learning_rate": 2e-05, "loss": 0.1323, "loss/crossentropy": 1.9546470642089844, "loss/hidden": 0.12255859375, "loss/logits": 0.009766705334186554, "step": 246 }, { "epoch": 0.247, "grad_norm": 1.203125, "grad_norm_var": 0.9611083984375, "learning_rate": 2e-05, "loss": 0.1539, "loss/crossentropy": 1.7062721848487854, "loss/hidden": 0.1416015625, "loss/logits": 0.01230617519468069, "step": 247 }, { "epoch": 0.248, "grad_norm": 4.21875, "grad_norm_var": 1.1776611328125, "learning_rate": 2e-05, "loss": 0.1515, "loss/crossentropy": 1.740279734134674, "loss/hidden": 0.14013671875, "loss/logits": 0.011402689386159182, "step": 248 }, { "epoch": 0.249, "grad_norm": 2.3125, "grad_norm_var": 1.1123573303222656, "learning_rate": 2e-05, "loss": 0.1504, "loss/crossentropy": 1.640882670879364, "loss/hidden": 0.1396484375, "loss/logits": 0.01071554934605956, "step": 249 }, { "epoch": 0.25, "grad_norm": 2.796875, "grad_norm_var": 0.7542132059733073, "learning_rate": 2e-05, "loss": 0.1364, "loss/crossentropy": 1.4670004844665527, "loss/hidden": 0.126708984375, "loss/logits": 0.0096431621350348, "step": 250 }, { "epoch": 0.251, "grad_norm": 1.1796875, "grad_norm_var": 0.7847574869791667, "learning_rate": 2e-05, "loss": 0.14, "loss/crossentropy": 2.2024736404418945, "loss/hidden": 0.127197265625, "loss/logits": 0.012759591452777386, "step": 251 }, { "epoch": 0.252, "grad_norm": 3.53125, "grad_norm_var": 0.8651763916015625, "learning_rate": 2e-05, "loss": 0.1539, "loss/crossentropy": 2.0269722938537598, "loss/hidden": 0.14208984375, "loss/logits": 0.011817097198218107, "step": 252 }, { "epoch": 0.253, "grad_norm": 9.375, "grad_norm_var": 4.018281809488932, "learning_rate": 2e-05, "loss": 0.1661, "loss/crossentropy": 0.34899202920496464, "loss/hidden": 0.163818359375, "loss/logits": 0.0022718849941156805, "step": 253 }, { "epoch": 0.254, "grad_norm": 1.9921875, "grad_norm_var": 3.9798868815104167, "learning_rate": 2e-05, "loss": 0.1441, "loss/crossentropy": 2.2475985288619995, "loss/hidden": 0.1318359375, "loss/logits": 0.012224531266838312, "step": 254 }, { "epoch": 0.255, "grad_norm": 1.6328125, "grad_norm_var": 4.037050120035807, "learning_rate": 2e-05, "loss": 0.1497, "loss/crossentropy": 2.8270416259765625, "loss/hidden": 0.13623046875, "loss/logits": 0.013480226043611765, "step": 255 }, { "epoch": 0.256, "grad_norm": 1.4609375, "grad_norm_var": 4.07616958618164, "learning_rate": 2e-05, "loss": 0.1668, "loss/crossentropy": 1.3126854300498962, "loss/hidden": 0.15576171875, "loss/logits": 0.01107651786878705, "step": 256 }, { "epoch": 0.257, "grad_norm": 1.9140625, "grad_norm_var": 4.02563247680664, "learning_rate": 2e-05, "loss": 0.1502, "loss/crossentropy": 1.4198355078697205, "loss/hidden": 0.1396484375, "loss/logits": 0.01056258101016283, "step": 257 }, { "epoch": 0.258, "grad_norm": 1.3671875, "grad_norm_var": 4.081167602539063, "learning_rate": 2e-05, "loss": 0.1421, "loss/crossentropy": 1.657827377319336, "loss/hidden": 0.13232421875, "loss/logits": 0.009755304548889399, "step": 258 }, { "epoch": 0.259, "grad_norm": 1.75, "grad_norm_var": 4.025512440999349, "learning_rate": 2e-05, "loss": 0.1352, "loss/crossentropy": 2.3775731325149536, "loss/hidden": 0.12548828125, "loss/logits": 0.0096644451841712, "step": 259 }, { "epoch": 0.26, "grad_norm": 1.40625, "grad_norm_var": 4.089703114827474, "learning_rate": 2e-05, "loss": 0.1442, "loss/crossentropy": 2.2461366653442383, "loss/hidden": 0.13232421875, "loss/logits": 0.011895926669239998, "step": 260 }, { "epoch": 0.261, "grad_norm": 2.578125, "grad_norm_var": 4.065040842692057, "learning_rate": 2e-05, "loss": 0.1474, "loss/crossentropy": 1.560776025056839, "loss/hidden": 0.1337890625, "loss/logits": 0.013578795362263918, "step": 261 }, { "epoch": 0.262, "grad_norm": 1.5390625, "grad_norm_var": 4.082124582926432, "learning_rate": 2e-05, "loss": 0.1556, "loss/crossentropy": 1.9976117014884949, "loss/hidden": 0.14404296875, "loss/logits": 0.011512083932757378, "step": 262 }, { "epoch": 0.263, "grad_norm": 1.6328125, "grad_norm_var": 4.018440755208333, "learning_rate": 2e-05, "loss": 0.1759, "loss/crossentropy": 1.705672264099121, "loss/hidden": 0.16162109375, "loss/logits": 0.014301342889666557, "step": 263 }, { "epoch": 0.264, "grad_norm": 1.765625, "grad_norm_var": 3.8464345296223956, "learning_rate": 2e-05, "loss": 0.1864, "loss/crossentropy": 1.7075408101081848, "loss/hidden": 0.171875, "loss/logits": 0.01456779520958662, "step": 264 }, { "epoch": 0.265, "grad_norm": 1.859375, "grad_norm_var": 3.86392822265625, "learning_rate": 2e-05, "loss": 0.1677, "loss/crossentropy": 2.094871759414673, "loss/hidden": 0.15380859375, "loss/logits": 0.013906504027545452, "step": 265 }, { "epoch": 0.266, "grad_norm": 2.578125, "grad_norm_var": 3.8542154947916667, "learning_rate": 2e-05, "loss": 0.1591, "loss/crossentropy": 2.166890859603882, "loss/hidden": 0.146484375, "loss/logits": 0.012606294360011816, "step": 266 }, { "epoch": 0.267, "grad_norm": 3.859375, "grad_norm_var": 3.885705312093099, "learning_rate": 2e-05, "loss": 0.1763, "loss/crossentropy": 1.674479365348816, "loss/hidden": 0.162109375, "loss/logits": 0.01416744152083993, "step": 267 }, { "epoch": 0.268, "grad_norm": 2.625, "grad_norm_var": 3.8142555236816404, "learning_rate": 2e-05, "loss": 0.2022, "loss/crossentropy": 1.0146620571613312, "loss/hidden": 0.1904296875, "loss/logits": 0.01172702293843031, "step": 268 }, { "epoch": 0.269, "grad_norm": 1.21875, "grad_norm_var": 0.4503334045410156, "learning_rate": 2e-05, "loss": 0.1457, "loss/crossentropy": 1.8024365305900574, "loss/hidden": 0.13427734375, "loss/logits": 0.011465264018625021, "step": 269 }, { "epoch": 0.27, "grad_norm": 1.4296875, "grad_norm_var": 0.46684951782226564, "learning_rate": 2e-05, "loss": 0.161, "loss/crossentropy": 1.7421787977218628, "loss/hidden": 0.14892578125, "loss/logits": 0.012049074750393629, "step": 270 }, { "epoch": 0.271, "grad_norm": 2.21875, "grad_norm_var": 0.4663726806640625, "learning_rate": 2e-05, "loss": 0.1519, "loss/crossentropy": 1.1601504981517792, "loss/hidden": 0.14404296875, "loss/logits": 0.007814974524080753, "step": 271 }, { "epoch": 0.272, "grad_norm": 1.7421875, "grad_norm_var": 0.4529693603515625, "learning_rate": 2e-05, "loss": 0.1693, "loss/crossentropy": 1.9806629419326782, "loss/hidden": 0.15625, "loss/logits": 0.01302909990772605, "step": 272 }, { "epoch": 0.273, "grad_norm": 1.1796875, "grad_norm_var": 0.4919352213541667, "learning_rate": 2e-05, "loss": 0.1724, "loss/crossentropy": 2.005366265773773, "loss/hidden": 0.158203125, "loss/logits": 0.014153223484754562, "step": 273 }, { "epoch": 0.274, "grad_norm": 1.765625, "grad_norm_var": 0.4723894755045573, "learning_rate": 2e-05, "loss": 0.1808, "loss/crossentropy": 1.7814961075782776, "loss/hidden": 0.166015625, "loss/logits": 0.014784782659262419, "step": 274 }, { "epoch": 0.275, "grad_norm": 1.9921875, "grad_norm_var": 0.4697011311848958, "learning_rate": 2e-05, "loss": 0.1963, "loss/crossentropy": 1.5670437216758728, "loss/hidden": 0.1796875, "loss/logits": 0.016570267733186483, "step": 275 }, { "epoch": 0.276, "grad_norm": 1.4765625, "grad_norm_var": 0.464800771077474, "learning_rate": 2e-05, "loss": 0.1604, "loss/crossentropy": 2.009281039237976, "loss/hidden": 0.1494140625, "loss/logits": 0.010985464788973331, "step": 276 }, { "epoch": 0.277, "grad_norm": 1.4453125, "grad_norm_var": 0.45259501139322916, "learning_rate": 2e-05, "loss": 0.168, "loss/crossentropy": 1.7085555791854858, "loss/hidden": 0.15625, "loss/logits": 0.011709913145750761, "step": 277 }, { "epoch": 0.278, "grad_norm": 1.3828125, "grad_norm_var": 0.46154683430989585, "learning_rate": 2e-05, "loss": 0.1456, "loss/crossentropy": 2.789747476577759, "loss/hidden": 0.1337890625, "loss/logits": 0.011802888009697199, "step": 278 }, { "epoch": 0.279, "grad_norm": 1.859375, "grad_norm_var": 0.45711441040039064, "learning_rate": 2e-05, "loss": 0.1881, "loss/crossentropy": 1.5918955504894257, "loss/hidden": 0.173828125, "loss/logits": 0.014291070867329836, "step": 279 }, { "epoch": 0.28, "grad_norm": 2.8125, "grad_norm_var": 0.5068682352701823, "learning_rate": 2e-05, "loss": 0.1458, "loss/crossentropy": 0.8236657343804836, "loss/hidden": 0.139404296875, "loss/logits": 0.00643135339487344, "step": 280 }, { "epoch": 0.281, "grad_norm": 4.125, "grad_norm_var": 0.7956764221191406, "learning_rate": 2e-05, "loss": 0.1714, "loss/crossentropy": 2.1279306411743164, "loss/hidden": 0.15625, "loss/logits": 0.015115040354430676, "step": 281 }, { "epoch": 0.282, "grad_norm": 1.296875, "grad_norm_var": 0.8177813212076823, "learning_rate": 2e-05, "loss": 0.1669, "loss/crossentropy": 2.2272568941116333, "loss/hidden": 0.1533203125, "loss/logits": 0.0135371801443398, "step": 282 }, { "epoch": 0.283, "grad_norm": 2.515625, "grad_norm_var": 0.6023089090983073, "learning_rate": 2e-05, "loss": 0.1781, "loss/crossentropy": 2.2013776302337646, "loss/hidden": 0.16259765625, "loss/logits": 0.015500886365771294, "step": 283 }, { "epoch": 0.284, "grad_norm": 1.96875, "grad_norm_var": 0.5695391337076823, "learning_rate": 2e-05, "loss": 0.1822, "loss/crossentropy": 1.6315099596977234, "loss/hidden": 0.1689453125, "loss/logits": 0.013229990843683481, "step": 284 }, { "epoch": 0.285, "grad_norm": 2.421875, "grad_norm_var": 0.550426991780599, "learning_rate": 2e-05, "loss": 0.1877, "loss/crossentropy": 1.329133152961731, "loss/hidden": 0.1748046875, "loss/logits": 0.012850106693804264, "step": 285 }, { "epoch": 0.286, "grad_norm": 2.78125, "grad_norm_var": 0.5659576416015625, "learning_rate": 2e-05, "loss": 0.1725, "loss/crossentropy": 2.0431485772132874, "loss/hidden": 0.15966796875, "loss/logits": 0.01284833624958992, "step": 286 }, { "epoch": 0.287, "grad_norm": 2.15625, "grad_norm_var": 0.5648915608723958, "learning_rate": 2e-05, "loss": 0.2173, "loss/crossentropy": 1.6292879581451416, "loss/hidden": 0.19970703125, "loss/logits": 0.017579292878508568, "step": 287 }, { "epoch": 0.288, "grad_norm": 1.4296875, "grad_norm_var": 0.5841379801432292, "learning_rate": 2e-05, "loss": 0.1632, "loss/crossentropy": 2.0630630254745483, "loss/hidden": 0.14990234375, "loss/logits": 0.013251845724880695, "step": 288 }, { "epoch": 0.289, "grad_norm": 1.8203125, "grad_norm_var": 0.5364664713541667, "learning_rate": 2e-05, "loss": 0.2067, "loss/crossentropy": 2.168562591075897, "loss/hidden": 0.18798828125, "loss/logits": 0.01867722487077117, "step": 289 }, { "epoch": 0.29, "grad_norm": 1.21875, "grad_norm_var": 0.5779449462890625, "learning_rate": 2e-05, "loss": 0.166, "loss/crossentropy": 1.8953060507774353, "loss/hidden": 0.15380859375, "loss/logits": 0.01215141685679555, "step": 290 }, { "epoch": 0.291, "grad_norm": 1.7109375, "grad_norm_var": 0.5848297119140625, "learning_rate": 2e-05, "loss": 0.187, "loss/crossentropy": 1.6148796081542969, "loss/hidden": 0.173828125, "loss/logits": 0.013202093075960875, "step": 291 }, { "epoch": 0.292, "grad_norm": 1.6328125, "grad_norm_var": 0.5749013264973958, "learning_rate": 2e-05, "loss": 0.197, "loss/crossentropy": 1.7814635038375854, "loss/hidden": 0.1826171875, "loss/logits": 0.014429094269871712, "step": 292 }, { "epoch": 0.293, "grad_norm": 2.015625, "grad_norm_var": 0.5503028869628906, "learning_rate": 2e-05, "loss": 0.1814, "loss/crossentropy": 2.1830875873565674, "loss/hidden": 0.16748046875, "loss/logits": 0.013968405313789845, "step": 293 }, { "epoch": 0.294, "grad_norm": 1.7109375, "grad_norm_var": 0.5268898010253906, "learning_rate": 2e-05, "loss": 0.2098, "loss/crossentropy": 1.681401550769806, "loss/hidden": 0.19482421875, "loss/logits": 0.01494319923222065, "step": 294 }, { "epoch": 0.295, "grad_norm": 1.3046875, "grad_norm_var": 0.5633453369140625, "learning_rate": 2e-05, "loss": 0.1884, "loss/crossentropy": 1.953886091709137, "loss/hidden": 0.173828125, "loss/logits": 0.014602533541619778, "step": 295 }, { "epoch": 0.296, "grad_norm": 1.6875, "grad_norm_var": 0.5292144775390625, "learning_rate": 2e-05, "loss": 0.1987, "loss/crossentropy": 1.6944631338119507, "loss/hidden": 0.18603515625, "loss/logits": 0.012617598287761211, "step": 296 }, { "epoch": 0.297, "grad_norm": 1.8359375, "grad_norm_var": 0.20425999959309896, "learning_rate": 2e-05, "loss": 0.2261, "loss/crossentropy": 2.214042544364929, "loss/hidden": 0.205078125, "loss/logits": 0.020975200459361076, "step": 297 }, { "epoch": 0.298, "grad_norm": 1.1484375, "grad_norm_var": 0.2164703369140625, "learning_rate": 2e-05, "loss": 0.1842, "loss/crossentropy": 2.1237878799438477, "loss/hidden": 0.16943359375, "loss/logits": 0.014801782555878162, "step": 298 }, { "epoch": 0.299, "grad_norm": 1.4765625, "grad_norm_var": 0.18964818318684895, "learning_rate": 2e-05, "loss": 0.1814, "loss/crossentropy": 1.492847979068756, "loss/hidden": 0.16845703125, "loss/logits": 0.012967187445610762, "step": 299 }, { "epoch": 0.3, "grad_norm": 1.890625, "grad_norm_var": 0.1879595438639323, "learning_rate": 2e-05, "loss": 0.1776, "loss/crossentropy": 2.2924291491508484, "loss/hidden": 0.16357421875, "loss/logits": 0.014043833129107952, "step": 300 }, { "epoch": 0.301, "grad_norm": 4.1875, "grad_norm_var": 0.5374061584472656, "learning_rate": 2e-05, "loss": 0.2062, "loss/crossentropy": 1.607342541217804, "loss/hidden": 0.18994140625, "loss/logits": 0.016273885034024715, "step": 301 }, { "epoch": 0.302, "grad_norm": 1.5859375, "grad_norm_var": 0.4823486328125, "learning_rate": 2e-05, "loss": 0.2143, "loss/crossentropy": 1.8559609055519104, "loss/hidden": 0.197265625, "loss/logits": 0.017047187313437462, "step": 302 }, { "epoch": 0.303, "grad_norm": 1.2265625, "grad_norm_var": 0.4923052469889323, "learning_rate": 2e-05, "loss": 0.1814, "loss/crossentropy": 2.4204115867614746, "loss/hidden": 0.16796875, "loss/logits": 0.013407074846327305, "step": 303 }, { "epoch": 0.304, "grad_norm": 2.15625, "grad_norm_var": 0.49497782389322914, "learning_rate": 2e-05, "loss": 0.2058, "loss/crossentropy": 1.7306669354438782, "loss/hidden": 0.189453125, "loss/logits": 0.016323519870638847, "step": 304 }, { "epoch": 0.305, "grad_norm": 1.6484375, "grad_norm_var": 0.4960856119791667, "learning_rate": 2e-05, "loss": 0.1877, "loss/crossentropy": 2.212082266807556, "loss/hidden": 0.171875, "loss/logits": 0.015811644960194826, "step": 305 }, { "epoch": 0.306, "grad_norm": 1.3046875, "grad_norm_var": 0.4901466369628906, "learning_rate": 2e-05, "loss": 0.1902, "loss/crossentropy": 1.9250993132591248, "loss/hidden": 0.17626953125, "loss/logits": 0.013882125727832317, "step": 306 }, { "epoch": 0.307, "grad_norm": 5.75, "grad_norm_var": 1.4711181640625, "learning_rate": 2e-05, "loss": 0.1934, "loss/crossentropy": 0.4879331737756729, "loss/hidden": 0.18701171875, "loss/logits": 0.006413323106244206, "step": 307 }, { "epoch": 0.308, "grad_norm": 3.046875, "grad_norm_var": 1.520232899983724, "learning_rate": 2e-05, "loss": 0.1973, "loss/crossentropy": 1.4504847526550293, "loss/hidden": 0.1875, "loss/logits": 0.009785078698769212, "step": 308 }, { "epoch": 0.309, "grad_norm": 1.40625, "grad_norm_var": 1.5522092183430989, "learning_rate": 2e-05, "loss": 0.2057, "loss/crossentropy": 2.149027943611145, "loss/hidden": 0.189453125, "loss/logits": 0.01620970480144024, "step": 309 }, { "epoch": 0.31, "grad_norm": 1.7578125, "grad_norm_var": 1.550005849202474, "learning_rate": 2e-05, "loss": 0.2027, "loss/crossentropy": 2.1503273248672485, "loss/hidden": 0.185546875, "loss/logits": 0.01712088193744421, "step": 310 }, { "epoch": 0.311, "grad_norm": 1.4375, "grad_norm_var": 1.5372304280598958, "learning_rate": 2e-05, "loss": 0.1888, "loss/crossentropy": 2.1748342514038086, "loss/hidden": 0.17333984375, "loss/logits": 0.01546872965991497, "step": 311 }, { "epoch": 0.312, "grad_norm": 1.4921875, "grad_norm_var": 1.5502703348795572, "learning_rate": 2e-05, "loss": 0.2158, "loss/crossentropy": 1.3706732988357544, "loss/hidden": 0.20166015625, "loss/logits": 0.014161557890474796, "step": 312 }, { "epoch": 0.313, "grad_norm": 2.421875, "grad_norm_var": 1.5523111979166666, "learning_rate": 2e-05, "loss": 0.2021, "loss/crossentropy": 1.8907567262649536, "loss/hidden": 0.18701171875, "loss/logits": 0.015071831177920103, "step": 313 }, { "epoch": 0.314, "grad_norm": 1.296875, "grad_norm_var": 1.5344378153483074, "learning_rate": 2e-05, "loss": 0.201, "loss/crossentropy": 1.7888588905334473, "loss/hidden": 0.1875, "loss/logits": 0.013532605487853289, "step": 314 }, { "epoch": 0.315, "grad_norm": 1.5859375, "grad_norm_var": 1.5256507873535157, "learning_rate": 2e-05, "loss": 0.2166, "loss/crossentropy": 1.5358025133609772, "loss/hidden": 0.2021484375, "loss/logits": 0.014410331379622221, "step": 315 }, { "epoch": 0.316, "grad_norm": 56.0, "grad_norm_var": 182.73569310506184, "learning_rate": 2e-05, "loss": 0.2529, "loss/crossentropy": 2.1001065373420715, "loss/hidden": 0.234375, "loss/logits": 0.01847642147913575, "step": 316 }, { "epoch": 0.317, "grad_norm": 1.296875, "grad_norm_var": 183.77112401326497, "learning_rate": 2e-05, "loss": 0.1958, "loss/crossentropy": 2.3731868267059326, "loss/hidden": 0.1796875, "loss/logits": 0.01615766156464815, "step": 317 }, { "epoch": 0.318, "grad_norm": 1.53125, "grad_norm_var": 183.79867248535157, "learning_rate": 2e-05, "loss": 0.2212, "loss/crossentropy": 1.8716753125190735, "loss/hidden": 0.2041015625, "loss/logits": 0.017116894014179707, "step": 318 }, { "epoch": 0.319, "grad_norm": 1.9921875, "grad_norm_var": 183.41590983072916, "learning_rate": 2e-05, "loss": 0.1938, "loss/crossentropy": 1.2205194532871246, "loss/hidden": 0.18115234375, "loss/logits": 0.012608660385012627, "step": 319 }, { "epoch": 0.32, "grad_norm": 1.203125, "grad_norm_var": 183.88273010253906, "learning_rate": 2e-05, "loss": 0.1822, "loss/crossentropy": 2.3611029386520386, "loss/hidden": 0.1689453125, "loss/logits": 0.013240456581115723, "step": 320 }, { "epoch": 0.321, "grad_norm": 1.3046875, "grad_norm_var": 184.05854390462238, "learning_rate": 2e-05, "loss": 0.193, "loss/crossentropy": 1.8402240872383118, "loss/hidden": 0.18017578125, "loss/logits": 0.012811433058232069, "step": 321 }, { "epoch": 0.322, "grad_norm": 1.3671875, "grad_norm_var": 184.02547912597657, "learning_rate": 2e-05, "loss": 0.2238, "loss/crossentropy": 1.9131136536598206, "loss/hidden": 0.20751953125, "loss/logits": 0.016317113302648067, "step": 322 }, { "epoch": 0.323, "grad_norm": 1.9765625, "grad_norm_var": 184.69184951782228, "learning_rate": 2e-05, "loss": 0.2509, "loss/crossentropy": 1.4010455012321472, "loss/hidden": 0.23193359375, "loss/logits": 0.018928353674709797, "step": 323 }, { "epoch": 0.324, "grad_norm": 2.234375, "grad_norm_var": 184.9522621154785, "learning_rate": 2e-05, "loss": 0.1929, "loss/crossentropy": 1.9659223556518555, "loss/hidden": 0.1796875, "loss/logits": 0.013216304127126932, "step": 324 }, { "epoch": 0.325, "grad_norm": 1.75, "grad_norm_var": 184.79406102498373, "learning_rate": 2e-05, "loss": 0.1877, "loss/crossentropy": 1.5221052765846252, "loss/hidden": 0.17626953125, "loss/logits": 0.011447824770584702, "step": 325 }, { "epoch": 0.326, "grad_norm": 1.5546875, "grad_norm_var": 184.88554662068685, "learning_rate": 2e-05, "loss": 0.2212, "loss/crossentropy": 2.06081086397171, "loss/hidden": 0.20361328125, "loss/logits": 0.017567144706845284, "step": 326 }, { "epoch": 0.327, "grad_norm": 3.578125, "grad_norm_var": 184.14719823201497, "learning_rate": 2e-05, "loss": 0.1707, "loss/crossentropy": 0.8908511102199554, "loss/hidden": 0.1640625, "loss/logits": 0.006589735276065767, "step": 327 }, { "epoch": 0.328, "grad_norm": 2.1875, "grad_norm_var": 183.83722737630208, "learning_rate": 2e-05, "loss": 0.2041, "loss/crossentropy": 1.4793621897697449, "loss/hidden": 0.19384765625, "loss/logits": 0.010210367618128657, "step": 328 }, { "epoch": 0.329, "grad_norm": 1.5703125, "grad_norm_var": 184.19855931599935, "learning_rate": 2e-05, "loss": 0.2174, "loss/crossentropy": 1.5629376769065857, "loss/hidden": 0.20166015625, "loss/logits": 0.015733799897134304, "step": 329 }, { "epoch": 0.33, "grad_norm": 1.4609375, "grad_norm_var": 184.11591389973958, "learning_rate": 2e-05, "loss": 0.2297, "loss/crossentropy": 2.016783118247986, "loss/hidden": 0.2119140625, "loss/logits": 0.017778108827769756, "step": 330 }, { "epoch": 0.331, "grad_norm": 1.3671875, "grad_norm_var": 184.22320963541668, "learning_rate": 2e-05, "loss": 0.2183, "loss/crossentropy": 2.3946865797042847, "loss/hidden": 0.2001953125, "loss/logits": 0.01807898748666048, "step": 331 }, { "epoch": 0.332, "grad_norm": 1.234375, "grad_norm_var": 0.35546773274739585, "learning_rate": 2e-05, "loss": 0.2244, "loss/crossentropy": 1.6463975310325623, "loss/hidden": 0.2099609375, "loss/logits": 0.014466887805610895, "step": 332 }, { "epoch": 0.333, "grad_norm": 1.703125, "grad_norm_var": 0.34256083170572915, "learning_rate": 2e-05, "loss": 0.2653, "loss/crossentropy": 1.727737307548523, "loss/hidden": 0.24462890625, "loss/logits": 0.020694734528660774, "step": 333 }, { "epoch": 0.334, "grad_norm": 2.34375, "grad_norm_var": 0.36001688639322915, "learning_rate": 2e-05, "loss": 0.2636, "loss/crossentropy": 1.8381291031837463, "loss/hidden": 0.244140625, "loss/logits": 0.019478057511150837, "step": 334 }, { "epoch": 0.335, "grad_norm": 5.5, "grad_norm_var": 1.2181292215983073, "learning_rate": 2e-05, "loss": 0.2789, "loss/crossentropy": 1.395434319972992, "loss/hidden": 0.25732421875, "loss/logits": 0.02152822446078062, "step": 335 }, { "epoch": 0.336, "grad_norm": 1.7578125, "grad_norm_var": 1.1768707275390624, "learning_rate": 2e-05, "loss": 0.2301, "loss/crossentropy": 1.7802979946136475, "loss/hidden": 0.212890625, "loss/logits": 0.01717265695333481, "step": 336 }, { "epoch": 0.337, "grad_norm": 1.2265625, "grad_norm_var": 1.1850748697916667, "learning_rate": 2e-05, "loss": 0.2195, "loss/crossentropy": 1.864999234676361, "loss/hidden": 0.20361328125, "loss/logits": 0.015909720212221146, "step": 337 }, { "epoch": 0.338, "grad_norm": 1.5078125, "grad_norm_var": 1.1734934488932292, "learning_rate": 2e-05, "loss": 0.2322, "loss/crossentropy": 1.9171935319900513, "loss/hidden": 0.2138671875, "loss/logits": 0.01834118738770485, "step": 338 }, { "epoch": 0.339, "grad_norm": 1.7109375, "grad_norm_var": 1.1808430989583334, "learning_rate": 2e-05, "loss": 0.2546, "loss/crossentropy": 2.232408821582794, "loss/hidden": 0.23388671875, "loss/logits": 0.02068551816046238, "step": 339 }, { "epoch": 0.34, "grad_norm": 4.40625, "grad_norm_var": 1.531086222330729, "learning_rate": 2e-05, "loss": 0.2209, "loss/crossentropy": 0.885938722640276, "loss/hidden": 0.2138671875, "loss/logits": 0.0069831793662160635, "step": 340 }, { "epoch": 0.341, "grad_norm": 1.8046875, "grad_norm_var": 1.5281471252441405, "learning_rate": 2e-05, "loss": 0.274, "loss/crossentropy": 2.053671360015869, "loss/hidden": 0.25, "loss/logits": 0.024039674550294876, "step": 341 }, { "epoch": 0.342, "grad_norm": 1.4765625, "grad_norm_var": 1.535064442952474, "learning_rate": 2e-05, "loss": 0.248, "loss/crossentropy": 2.1628893613815308, "loss/hidden": 0.22900390625, "loss/logits": 0.01902489084750414, "step": 342 }, { "epoch": 0.343, "grad_norm": 1.6875, "grad_norm_var": 1.4053301493326822, "learning_rate": 2e-05, "loss": 0.2355, "loss/crossentropy": 1.9784727692604065, "loss/hidden": 0.216796875, "loss/logits": 0.018667724914848804, "step": 343 }, { "epoch": 0.344, "grad_norm": 1.9453125, "grad_norm_var": 1.4048492431640625, "learning_rate": 2e-05, "loss": 0.2215, "loss/crossentropy": 2.1430813670158386, "loss/hidden": 0.205078125, "loss/logits": 0.016372697427868843, "step": 344 }, { "epoch": 0.345, "grad_norm": 3.34375, "grad_norm_var": 1.489422353108724, "learning_rate": 2e-05, "loss": 0.2828, "loss/crossentropy": 1.4574592113494873, "loss/hidden": 0.259765625, "loss/logits": 0.02300189435482025, "step": 345 }, { "epoch": 0.346, "grad_norm": 4.59375, "grad_norm_var": 1.8130035400390625, "learning_rate": 2e-05, "loss": 0.2555, "loss/crossentropy": 2.1325125694274902, "loss/hidden": 0.234375, "loss/logits": 0.021130304783582687, "step": 346 }, { "epoch": 0.347, "grad_norm": 1.4453125, "grad_norm_var": 1.8031412760416667, "learning_rate": 2e-05, "loss": 0.233, "loss/crossentropy": 2.6941460371017456, "loss/hidden": 0.21435546875, "loss/logits": 0.01859632506966591, "step": 347 }, { "epoch": 0.348, "grad_norm": 1.6171875, "grad_norm_var": 1.755077870686849, "learning_rate": 2e-05, "loss": 0.2562, "loss/crossentropy": 1.8957814574241638, "loss/hidden": 0.236328125, "loss/logits": 0.019866405054926872, "step": 348 }, { "epoch": 0.349, "grad_norm": 1.953125, "grad_norm_var": 1.7364418029785156, "learning_rate": 2e-05, "loss": 0.2507, "loss/crossentropy": 2.5658878087997437, "loss/hidden": 0.2294921875, "loss/logits": 0.02118699811398983, "step": 349 }, { "epoch": 0.35, "grad_norm": 1.890625, "grad_norm_var": 1.7523719787597656, "learning_rate": 2e-05, "loss": 0.233, "loss/crossentropy": 1.9111933708190918, "loss/hidden": 0.21533203125, "loss/logits": 0.01770856324583292, "step": 350 }, { "epoch": 0.351, "grad_norm": 2.625, "grad_norm_var": 1.0678749084472656, "learning_rate": 2e-05, "loss": 0.2712, "loss/crossentropy": 1.5525288581848145, "loss/hidden": 0.25244140625, "loss/logits": 0.01877846010029316, "step": 351 }, { "epoch": 0.352, "grad_norm": 1.671875, "grad_norm_var": 1.07325439453125, "learning_rate": 2e-05, "loss": 0.2398, "loss/crossentropy": 1.47780179977417, "loss/hidden": 0.224609375, "loss/logits": 0.015163760632276535, "step": 352 }, { "epoch": 0.353, "grad_norm": 1.40625, "grad_norm_var": 1.0523902893066406, "learning_rate": 2e-05, "loss": 0.2579, "loss/crossentropy": 1.6976242065429688, "loss/hidden": 0.240234375, "loss/logits": 0.01768268644809723, "step": 353 }, { "epoch": 0.354, "grad_norm": 1.375, "grad_norm_var": 1.065623982747396, "learning_rate": 2e-05, "loss": 0.2594, "loss/crossentropy": 1.5402989983558655, "loss/hidden": 0.24169921875, "loss/logits": 0.017742513678967953, "step": 354 }, { "epoch": 0.355, "grad_norm": 2.609375, "grad_norm_var": 1.0593360900878905, "learning_rate": 2e-05, "loss": 0.2983, "loss/crossentropy": 1.7891557812690735, "loss/hidden": 0.2744140625, "loss/logits": 0.023881751112639904, "step": 355 }, { "epoch": 0.356, "grad_norm": 1.5859375, "grad_norm_var": 0.7421427408854167, "learning_rate": 2e-05, "loss": 0.2353, "loss/crossentropy": 2.255465269088745, "loss/hidden": 0.2177734375, "loss/logits": 0.01755282748490572, "step": 356 }, { "epoch": 0.357, "grad_norm": 1.4375, "grad_norm_var": 0.763287099202474, "learning_rate": 2e-05, "loss": 0.2388, "loss/crossentropy": 2.2716734409332275, "loss/hidden": 0.22021484375, "loss/logits": 0.018602201715111732, "step": 357 }, { "epoch": 0.358, "grad_norm": 2.34375, "grad_norm_var": 0.7449666341145833, "learning_rate": 2e-05, "loss": 0.2737, "loss/crossentropy": 1.8382077813148499, "loss/hidden": 0.2548828125, "loss/logits": 0.018825003411620855, "step": 358 }, { "epoch": 0.359, "grad_norm": 1.5546875, "grad_norm_var": 0.7532976786295573, "learning_rate": 2e-05, "loss": 0.2391, "loss/crossentropy": 1.6230210661888123, "loss/hidden": 0.224609375, "loss/logits": 0.014487342443317175, "step": 359 }, { "epoch": 0.36, "grad_norm": 1.421875, "grad_norm_var": 0.7803385416666667, "learning_rate": 2e-05, "loss": 0.2519, "loss/crossentropy": 1.6961406469345093, "loss/hidden": 0.234375, "loss/logits": 0.017499960027635098, "step": 360 }, { "epoch": 0.361, "grad_norm": 1.5703125, "grad_norm_var": 0.6720965067545573, "learning_rate": 2e-05, "loss": 0.2623, "loss/crossentropy": 2.1821005940437317, "loss/hidden": 0.24072265625, "loss/logits": 0.021556712687015533, "step": 361 }, { "epoch": 0.362, "grad_norm": 1.6484375, "grad_norm_var": 0.17363688151041667, "learning_rate": 2e-05, "loss": 0.2759, "loss/crossentropy": 1.7173206806182861, "loss/hidden": 0.255859375, "loss/logits": 0.020033356733620167, "step": 362 }, { "epoch": 0.363, "grad_norm": 1.5859375, "grad_norm_var": 0.16897684733072918, "learning_rate": 2e-05, "loss": 0.2552, "loss/crossentropy": 1.8281689882278442, "loss/hidden": 0.23681640625, "loss/logits": 0.018404729664325714, "step": 363 }, { "epoch": 0.364, "grad_norm": 1.3125, "grad_norm_var": 0.1809282938639323, "learning_rate": 2e-05, "loss": 0.2546, "loss/crossentropy": 2.181256651878357, "loss/hidden": 0.23486328125, "loss/logits": 0.01975287776440382, "step": 364 }, { "epoch": 0.365, "grad_norm": 3.796875, "grad_norm_var": 0.4434466044108073, "learning_rate": 2e-05, "loss": 0.2803, "loss/crossentropy": 1.4486916065216064, "loss/hidden": 0.2607421875, "loss/logits": 0.01950985286384821, "step": 365 }, { "epoch": 0.366, "grad_norm": 1.234375, "grad_norm_var": 0.4680987040201823, "learning_rate": 2e-05, "loss": 0.2504, "loss/crossentropy": 2.026048183441162, "loss/hidden": 0.232421875, "loss/logits": 0.017978372983634472, "step": 366 }, { "epoch": 0.367, "grad_norm": 4.3125, "grad_norm_var": 0.8263628641764323, "learning_rate": 2e-05, "loss": 0.2579, "loss/crossentropy": 1.4382375180721283, "loss/hidden": 0.2412109375, "loss/logits": 0.016655512619763613, "step": 367 }, { "epoch": 0.368, "grad_norm": 2.25, "grad_norm_var": 0.827416737874349, "learning_rate": 2e-05, "loss": 0.3072, "loss/crossentropy": 1.57509446144104, "loss/hidden": 0.2880859375, "loss/logits": 0.019162926822900772, "step": 368 }, { "epoch": 0.369, "grad_norm": 1.78125, "grad_norm_var": 0.808251698811849, "learning_rate": 2e-05, "loss": 0.2383, "loss/crossentropy": 2.0060970187187195, "loss/hidden": 0.22021484375, "loss/logits": 0.018060280941426754, "step": 369 }, { "epoch": 0.37, "grad_norm": 2.546875, "grad_norm_var": 0.798180898030599, "learning_rate": 2e-05, "loss": 0.2523, "loss/crossentropy": 1.2137621641159058, "loss/hidden": 0.24072265625, "loss/logits": 0.011561613995581865, "step": 370 }, { "epoch": 0.371, "grad_norm": 1.703125, "grad_norm_var": 0.7833717346191407, "learning_rate": 2e-05, "loss": 0.2561, "loss/crossentropy": 1.764179289340973, "loss/hidden": 0.240234375, "loss/logits": 0.015869705006480217, "step": 371 }, { "epoch": 0.372, "grad_norm": 1.5625, "grad_norm_var": 0.784716796875, "learning_rate": 2e-05, "loss": 0.2642, "loss/crossentropy": 2.1394487619400024, "loss/hidden": 0.2451171875, "loss/logits": 0.01907090563327074, "step": 372 }, { "epoch": 0.373, "grad_norm": 1.9765625, "grad_norm_var": 0.7621681213378906, "learning_rate": 2e-05, "loss": 0.2496, "loss/crossentropy": 2.151320219039917, "loss/hidden": 0.23095703125, "loss/logits": 0.018605505116283894, "step": 373 }, { "epoch": 0.374, "grad_norm": 1.5234375, "grad_norm_var": 0.77073974609375, "learning_rate": 2e-05, "loss": 0.2426, "loss/crossentropy": 2.291616916656494, "loss/hidden": 0.2255859375, "loss/logits": 0.01696862932294607, "step": 374 }, { "epoch": 0.375, "grad_norm": 1.1640625, "grad_norm_var": 0.8027577718098958, "learning_rate": 2e-05, "loss": 0.2482, "loss/crossentropy": 2.1597548127174377, "loss/hidden": 0.228515625, "loss/logits": 0.019656311720609665, "step": 375 }, { "epoch": 0.376, "grad_norm": 4.5625, "grad_norm_var": 1.1930867513020833, "learning_rate": 2e-05, "loss": 0.2546, "loss/crossentropy": 0.7966546472162008, "loss/hidden": 0.24609375, "loss/logits": 0.008532016014214605, "step": 376 }, { "epoch": 0.377, "grad_norm": 1.25, "grad_norm_var": 1.2246070861816407, "learning_rate": 2e-05, "loss": 0.2394, "loss/crossentropy": 1.730500340461731, "loss/hidden": 0.22314453125, "loss/logits": 0.016217158176004887, "step": 377 }, { "epoch": 0.378, "grad_norm": 1.9453125, "grad_norm_var": 1.210729726155599, "learning_rate": 2e-05, "loss": 0.2672, "loss/crossentropy": 2.0575554966926575, "loss/hidden": 0.2470703125, "loss/logits": 0.02009457629173994, "step": 378 }, { "epoch": 0.379, "grad_norm": 4.15625, "grad_norm_var": 1.4280181884765626, "learning_rate": 2e-05, "loss": 0.3649, "loss/crossentropy": 2.409613251686096, "loss/hidden": 0.330078125, "loss/logits": 0.034814249724149704, "step": 379 }, { "epoch": 0.38, "grad_norm": 2.34375, "grad_norm_var": 1.3563140869140624, "learning_rate": 2e-05, "loss": 0.2651, "loss/crossentropy": 1.4721761345863342, "loss/hidden": 0.2490234375, "loss/logits": 0.016095119062811136, "step": 380 }, { "epoch": 0.381, "grad_norm": 1.2265625, "grad_norm_var": 1.2842750549316406, "learning_rate": 2e-05, "loss": 0.2538, "loss/crossentropy": 2.51900315284729, "loss/hidden": 0.2314453125, "loss/logits": 0.022326381877064705, "step": 381 }, { "epoch": 0.382, "grad_norm": 2.234375, "grad_norm_var": 1.2151995340983073, "learning_rate": 2e-05, "loss": 0.2743, "loss/crossentropy": 2.030519187450409, "loss/hidden": 0.2548828125, "loss/logits": 0.01944338995963335, "step": 382 }, { "epoch": 0.383, "grad_norm": 3.859375, "grad_norm_var": 1.1054583231608073, "learning_rate": 2e-05, "loss": 0.3105, "loss/crossentropy": 0.7516276463866234, "loss/hidden": 0.2978515625, "loss/logits": 0.012636175146326423, "step": 383 }, { "epoch": 0.384, "grad_norm": 2.296875, "grad_norm_var": 1.1055620829264323, "learning_rate": 2e-05, "loss": 0.2867, "loss/crossentropy": 1.9317356944084167, "loss/hidden": 0.263671875, "loss/logits": 0.023075740784406662, "step": 384 }, { "epoch": 0.385, "grad_norm": 2.09375, "grad_norm_var": 1.0917884826660156, "learning_rate": 2e-05, "loss": 0.3261, "loss/crossentropy": 2.1155296564102173, "loss/hidden": 0.2998046875, "loss/logits": 0.02629261091351509, "step": 385 }, { "epoch": 0.386, "grad_norm": 1.7734375, "grad_norm_var": 1.1014312744140624, "learning_rate": 2e-05, "loss": 0.287, "loss/crossentropy": 2.1998232007026672, "loss/hidden": 0.265625, "loss/logits": 0.021336179226636887, "step": 386 }, { "epoch": 0.387, "grad_norm": 1.8671875, "grad_norm_var": 1.0915992736816407, "learning_rate": 2e-05, "loss": 0.2608, "loss/crossentropy": 1.9437836408615112, "loss/hidden": 0.2412109375, "loss/logits": 0.019607914611697197, "step": 387 }, { "epoch": 0.388, "grad_norm": 2.125, "grad_norm_var": 1.0605812072753906, "learning_rate": 2e-05, "loss": 0.2871, "loss/crossentropy": 1.7142232656478882, "loss/hidden": 0.2666015625, "loss/logits": 0.020461218431591988, "step": 388 }, { "epoch": 0.389, "grad_norm": 1.640625, "grad_norm_var": 1.0809977213541666, "learning_rate": 2e-05, "loss": 0.2863, "loss/crossentropy": 2.236941933631897, "loss/hidden": 0.2626953125, "loss/logits": 0.023648610338568687, "step": 389 }, { "epoch": 0.39, "grad_norm": 3.125, "grad_norm_var": 1.0853248596191407, "learning_rate": 2e-05, "loss": 0.2733, "loss/crossentropy": 1.2834028005599976, "loss/hidden": 0.2607421875, "loss/logits": 0.01257804874330759, "step": 390 }, { "epoch": 0.391, "grad_norm": 1.5, "grad_norm_var": 1.0390787760416667, "learning_rate": 2e-05, "loss": 0.3026, "loss/crossentropy": 1.5867803692817688, "loss/hidden": 0.2822265625, "loss/logits": 0.020396556705236435, "step": 391 }, { "epoch": 0.392, "grad_norm": 1.53125, "grad_norm_var": 0.7292439778645833, "learning_rate": 2e-05, "loss": 0.297, "loss/crossentropy": 1.4337636232376099, "loss/hidden": 0.2783203125, "loss/logits": 0.01866168435662985, "step": 392 }, { "epoch": 0.393, "grad_norm": 1.71875, "grad_norm_var": 0.6845052083333333, "learning_rate": 2e-05, "loss": 0.2642, "loss/crossentropy": 2.1386572122573853, "loss/hidden": 0.24462890625, "loss/logits": 0.019583708606660366, "step": 393 }, { "epoch": 0.394, "grad_norm": 2.9375, "grad_norm_var": 0.710375722249349, "learning_rate": 2e-05, "loss": 0.3313, "loss/crossentropy": 1.936402440071106, "loss/hidden": 0.3046875, "loss/logits": 0.026638174429535866, "step": 394 }, { "epoch": 0.395, "grad_norm": 1.8671875, "grad_norm_var": 0.4642567952473958, "learning_rate": 2e-05, "loss": 0.2699, "loss/crossentropy": 2.2741682529449463, "loss/hidden": 0.248046875, "loss/logits": 0.021812792867422104, "step": 395 }, { "epoch": 0.396, "grad_norm": 4.84375, "grad_norm_var": 0.9248687744140625, "learning_rate": 2e-05, "loss": 0.3035, "loss/crossentropy": 1.1322659850120544, "loss/hidden": 0.291015625, "loss/logits": 0.01252604997716844, "step": 396 }, { "epoch": 0.397, "grad_norm": 2.53125, "grad_norm_var": 0.8462562561035156, "learning_rate": 2e-05, "loss": 0.3108, "loss/crossentropy": 1.358659565448761, "loss/hidden": 0.2900390625, "loss/logits": 0.02074052207171917, "step": 397 }, { "epoch": 0.398, "grad_norm": 1.84375, "grad_norm_var": 0.862939198811849, "learning_rate": 2e-05, "loss": 0.3, "loss/crossentropy": 1.9806614518165588, "loss/hidden": 0.2783203125, "loss/logits": 0.02170161809772253, "step": 398 }, { "epoch": 0.399, "grad_norm": 1.9296875, "grad_norm_var": 0.706591796875, "learning_rate": 2e-05, "loss": 0.2984, "loss/crossentropy": 2.3857691287994385, "loss/hidden": 0.2744140625, "loss/logits": 0.023968273773789406, "step": 399 }, { "epoch": 0.4, "grad_norm": 1.9140625, "grad_norm_var": 0.7121620178222656, "learning_rate": 2e-05, "loss": 0.2732, "loss/crossentropy": 2.006265163421631, "loss/hidden": 0.2509765625, "loss/logits": 0.02220850996673107, "step": 400 }, { "epoch": 0.401, "grad_norm": 1.8046875, "grad_norm_var": 0.7215810139973958, "learning_rate": 2e-05, "loss": 0.2935, "loss/crossentropy": 1.7221473455429077, "loss/hidden": 0.275390625, "loss/logits": 0.018067960627377033, "step": 401 }, { "epoch": 0.402, "grad_norm": 2.421875, "grad_norm_var": 0.7123146057128906, "learning_rate": 2e-05, "loss": 0.2923, "loss/crossentropy": 2.0756383538246155, "loss/hidden": 0.275390625, "loss/logits": 0.016928995959460735, "step": 402 }, { "epoch": 0.403, "grad_norm": 1.53125, "grad_norm_var": 0.7353993733723958, "learning_rate": 2e-05, "loss": 0.2972, "loss/crossentropy": 1.6683465242385864, "loss/hidden": 0.2783203125, "loss/logits": 0.018839839845895767, "step": 403 }, { "epoch": 0.404, "grad_norm": 1.8125, "grad_norm_var": 0.7447987874348958, "learning_rate": 2e-05, "loss": 0.2966, "loss/crossentropy": 1.737410545349121, "loss/hidden": 0.2763671875, "loss/logits": 0.02023144531995058, "step": 404 }, { "epoch": 0.405, "grad_norm": 1.3046875, "grad_norm_var": 0.7762163798014323, "learning_rate": 2e-05, "loss": 0.2855, "loss/crossentropy": 2.2183534502983093, "loss/hidden": 0.26513671875, "loss/logits": 0.02036190778017044, "step": 405 }, { "epoch": 0.406, "grad_norm": 1.5, "grad_norm_var": 0.7329465230305989, "learning_rate": 2e-05, "loss": 0.3193, "loss/crossentropy": 1.8786720633506775, "loss/hidden": 0.294921875, "loss/logits": 0.024385149590671062, "step": 406 }, { "epoch": 0.407, "grad_norm": 1.5, "grad_norm_var": 0.7329465230305989, "learning_rate": 2e-05, "loss": 0.3099, "loss/crossentropy": 1.8731706738471985, "loss/hidden": 0.2861328125, "loss/logits": 0.023721362464129925, "step": 407 }, { "epoch": 0.408, "grad_norm": 1.953125, "grad_norm_var": 0.714214833577474, "learning_rate": 2e-05, "loss": 0.2993, "loss/crossentropy": 2.0363497734069824, "loss/hidden": 0.2763671875, "loss/logits": 0.02292494662106037, "step": 408 }, { "epoch": 0.409, "grad_norm": 1.421875, "grad_norm_var": 0.7343544006347656, "learning_rate": 2e-05, "loss": 0.2919, "loss/crossentropy": 1.7596482038497925, "loss/hidden": 0.2705078125, "loss/logits": 0.021396052092313766, "step": 409 }, { "epoch": 0.41, "grad_norm": 2.84375, "grad_norm_var": 0.7240577697753906, "learning_rate": 2e-05, "loss": 0.3154, "loss/crossentropy": 1.080414205789566, "loss/hidden": 0.29736328125, "loss/logits": 0.018078335095196962, "step": 410 }, { "epoch": 0.411, "grad_norm": 1.5625, "grad_norm_var": 0.73785400390625, "learning_rate": 2e-05, "loss": 0.2928, "loss/crossentropy": 2.527972936630249, "loss/hidden": 0.26953125, "loss/logits": 0.02323300577700138, "step": 411 }, { "epoch": 0.412, "grad_norm": 1.5078125, "grad_norm_var": 0.18848851521809895, "learning_rate": 2e-05, "loss": 0.2989, "loss/crossentropy": 1.5808929204940796, "loss/hidden": 0.28125, "loss/logits": 0.01763766910880804, "step": 412 }, { "epoch": 0.413, "grad_norm": 1.6328125, "grad_norm_var": 0.1557037353515625, "learning_rate": 2e-05, "loss": 0.3052, "loss/crossentropy": 2.073564648628235, "loss/hidden": 0.2841796875, "loss/logits": 0.021017897874116898, "step": 413 }, { "epoch": 0.414, "grad_norm": 1.703125, "grad_norm_var": 0.15574951171875, "learning_rate": 2e-05, "loss": 0.3341, "loss/crossentropy": 1.5968445539474487, "loss/hidden": 0.310546875, "loss/logits": 0.023572119884192944, "step": 414 }, { "epoch": 0.415, "grad_norm": 1.65625, "grad_norm_var": 0.15465469360351564, "learning_rate": 2e-05, "loss": 0.3319, "loss/crossentropy": 2.13019335269928, "loss/hidden": 0.3037109375, "loss/logits": 0.028160166926681995, "step": 415 }, { "epoch": 0.416, "grad_norm": 1.8828125, "grad_norm_var": 0.15405044555664063, "learning_rate": 2e-05, "loss": 0.2928, "loss/crossentropy": 1.3558663129806519, "loss/hidden": 0.2744140625, "loss/logits": 0.018423012923449278, "step": 416 }, { "epoch": 0.417, "grad_norm": 2.15625, "grad_norm_var": 0.1642242431640625, "learning_rate": 2e-05, "loss": 0.3349, "loss/crossentropy": 1.556907832622528, "loss/hidden": 0.310546875, "loss/logits": 0.0243788855150342, "step": 417 }, { "epoch": 0.418, "grad_norm": 1.765625, "grad_norm_var": 0.1344879150390625, "learning_rate": 2e-05, "loss": 0.293, "loss/crossentropy": 2.18166720867157, "loss/hidden": 0.2705078125, "loss/logits": 0.022501694969832897, "step": 418 }, { "epoch": 0.419, "grad_norm": 5.0, "grad_norm_var": 0.7930084228515625, "learning_rate": 2e-05, "loss": 0.306, "loss/crossentropy": 1.875123679637909, "loss/hidden": 0.2841796875, "loss/logits": 0.021816120482981205, "step": 419 }, { "epoch": 0.42, "grad_norm": 2.0, "grad_norm_var": 0.7917633056640625, "learning_rate": 2e-05, "loss": 0.3207, "loss/crossentropy": 2.1878353357315063, "loss/hidden": 0.29296875, "loss/logits": 0.027718784287571907, "step": 420 }, { "epoch": 0.421, "grad_norm": 2.5, "grad_norm_var": 0.7763160705566406, "learning_rate": 2e-05, "loss": 0.3106, "loss/crossentropy": 2.46438992023468, "loss/hidden": 0.2841796875, "loss/logits": 0.026430321857333183, "step": 421 }, { "epoch": 0.422, "grad_norm": 1.59375, "grad_norm_var": 0.7701576232910157, "learning_rate": 2e-05, "loss": 0.2847, "loss/crossentropy": 1.991809368133545, "loss/hidden": 0.265625, "loss/logits": 0.019083392806351185, "step": 422 }, { "epoch": 0.423, "grad_norm": 2.421875, "grad_norm_var": 0.7565935770670573, "learning_rate": 2e-05, "loss": 0.415, "loss/crossentropy": 1.6859049797058105, "loss/hidden": 0.3818359375, "loss/logits": 0.03313039615750313, "step": 423 }, { "epoch": 0.424, "grad_norm": 1.859375, "grad_norm_var": 0.7589800516764323, "learning_rate": 2e-05, "loss": 0.3098, "loss/crossentropy": 1.8961586952209473, "loss/hidden": 0.2900390625, "loss/logits": 0.019725864753127098, "step": 424 }, { "epoch": 0.425, "grad_norm": 1.6171875, "grad_norm_var": 0.7438547770182292, "learning_rate": 2e-05, "loss": 0.3427, "loss/crossentropy": 2.085192084312439, "loss/hidden": 0.31640625, "loss/logits": 0.026326753199100494, "step": 425 }, { "epoch": 0.426, "grad_norm": 2.078125, "grad_norm_var": 0.705224609375, "learning_rate": 2e-05, "loss": 0.3321, "loss/crossentropy": 1.912731111049652, "loss/hidden": 0.3076171875, "loss/logits": 0.02450721152126789, "step": 426 }, { "epoch": 0.427, "grad_norm": 1.8359375, "grad_norm_var": 0.6918108622233073, "learning_rate": 2e-05, "loss": 0.3396, "loss/crossentropy": 2.1176230907440186, "loss/hidden": 0.310546875, "loss/logits": 0.029072879813611507, "step": 427 }, { "epoch": 0.428, "grad_norm": 1.6015625, "grad_norm_var": 0.6852617899576823, "learning_rate": 2e-05, "loss": 0.318, "loss/crossentropy": 2.351975202560425, "loss/hidden": 0.291015625, "loss/logits": 0.026953624561429024, "step": 428 }, { "epoch": 0.429, "grad_norm": 2.3125, "grad_norm_var": 0.6734690348307292, "learning_rate": 2e-05, "loss": 0.4069, "loss/crossentropy": 1.6036078929901123, "loss/hidden": 0.37109375, "loss/logits": 0.03581710997968912, "step": 429 }, { "epoch": 0.43, "grad_norm": 2.46875, "grad_norm_var": 0.667138671875, "learning_rate": 2e-05, "loss": 0.3472, "loss/crossentropy": 1.881849765777588, "loss/hidden": 0.3232421875, "loss/logits": 0.023961665108799934, "step": 430 }, { "epoch": 0.431, "grad_norm": 3.625, "grad_norm_var": 0.77403564453125, "learning_rate": 2e-05, "loss": 0.3121, "loss/crossentropy": 2.3671000599861145, "loss/hidden": 0.2900390625, "loss/logits": 0.022101588547229767, "step": 431 }, { "epoch": 0.432, "grad_norm": 2.4375, "grad_norm_var": 0.7627866109212239, "learning_rate": 2e-05, "loss": 0.3151, "loss/crossentropy": 1.1575224101543427, "loss/hidden": 0.298828125, "loss/logits": 0.016257786191999912, "step": 432 }, { "epoch": 0.433, "grad_norm": 5.40625, "grad_norm_var": 1.3478289286295573, "learning_rate": 2e-05, "loss": 0.3283, "loss/crossentropy": 1.3821857124567032, "loss/hidden": 0.3115234375, "loss/logits": 0.016785149462521076, "step": 433 }, { "epoch": 0.434, "grad_norm": 2.140625, "grad_norm_var": 1.3182634989420572, "learning_rate": 2e-05, "loss": 0.3499, "loss/crossentropy": 1.4704007506370544, "loss/hidden": 0.326171875, "loss/logits": 0.02373607736080885, "step": 434 }, { "epoch": 0.435, "grad_norm": 1.7890625, "grad_norm_var": 0.9163736979166667, "learning_rate": 2e-05, "loss": 0.3453, "loss/crossentropy": 1.7521992325782776, "loss/hidden": 0.322265625, "loss/logits": 0.023045840673148632, "step": 435 }, { "epoch": 0.436, "grad_norm": 2.203125, "grad_norm_var": 0.9093251546223958, "learning_rate": 2e-05, "loss": 0.3079, "loss/crossentropy": 1.4147529304027557, "loss/hidden": 0.2919921875, "loss/logits": 0.01587154157459736, "step": 436 }, { "epoch": 0.437, "grad_norm": 1.78125, "grad_norm_var": 0.9289784749348958, "learning_rate": 2e-05, "loss": 0.3572, "loss/crossentropy": 2.1589527130126953, "loss/hidden": 0.330078125, "loss/logits": 0.027110325172543526, "step": 437 }, { "epoch": 0.438, "grad_norm": 1.546875, "grad_norm_var": 0.9336751302083334, "learning_rate": 2e-05, "loss": 0.3112, "loss/crossentropy": 2.0695826411247253, "loss/hidden": 0.2890625, "loss/logits": 0.022175450809299946, "step": 438 }, { "epoch": 0.439, "grad_norm": 8.6875, "grad_norm_var": 3.472150675455729, "learning_rate": 2e-05, "loss": 0.3174, "loss/crossentropy": 2.715834140777588, "loss/hidden": 0.2919921875, "loss/logits": 0.02542768605053425, "step": 439 }, { "epoch": 0.44, "grad_norm": 2.0625, "grad_norm_var": 3.4516398111979165, "learning_rate": 2e-05, "loss": 0.3531, "loss/crossentropy": 2.089130699634552, "loss/hidden": 0.326171875, "loss/logits": 0.026951050385832787, "step": 440 }, { "epoch": 0.441, "grad_norm": 5.6875, "grad_norm_var": 3.8860979715983075, "learning_rate": 2e-05, "loss": 0.352, "loss/crossentropy": 1.6687681376934052, "loss/hidden": 0.3330078125, "loss/logits": 0.018973306752741337, "step": 441 }, { "epoch": 0.442, "grad_norm": 1.6953125, "grad_norm_var": 3.941239420572917, "learning_rate": 2e-05, "loss": 0.354, "loss/crossentropy": 1.4019538760185242, "loss/hidden": 0.33203125, "loss/logits": 0.021962410770356655, "step": 442 }, { "epoch": 0.443, "grad_norm": 2.453125, "grad_norm_var": 3.8729509989420574, "learning_rate": 2e-05, "loss": 0.3591, "loss/crossentropy": 2.068819046020508, "loss/hidden": 0.328125, "loss/logits": 0.03100405167788267, "step": 443 }, { "epoch": 0.444, "grad_norm": 2.625, "grad_norm_var": 3.7484527587890626, "learning_rate": 2e-05, "loss": 0.3207, "loss/crossentropy": 1.2215966582298279, "loss/hidden": 0.306640625, "loss/logits": 0.014033652492798865, "step": 444 }, { "epoch": 0.445, "grad_norm": 2.796875, "grad_norm_var": 3.7149943033854167, "learning_rate": 2e-05, "loss": 0.2843, "loss/crossentropy": 0.8393277078866959, "loss/hidden": 0.2734375, "loss/logits": 0.010860613780096173, "step": 445 }, { "epoch": 0.446, "grad_norm": 3.6875, "grad_norm_var": 3.7072184244791666, "learning_rate": 2e-05, "loss": 0.3369, "loss/crossentropy": 0.8106656819581985, "loss/hidden": 0.32421875, "loss/logits": 0.01267361780628562, "step": 446 }, { "epoch": 0.447, "grad_norm": 4.28125, "grad_norm_var": 3.774466959635417, "learning_rate": 2e-05, "loss": 0.3246, "loss/crossentropy": 1.0552468746900558, "loss/hidden": 0.3095703125, "loss/logits": 0.015042064245790243, "step": 447 }, { "epoch": 0.448, "grad_norm": 2.734375, "grad_norm_var": 3.749592081705729, "learning_rate": 2e-05, "loss": 0.3734, "loss/crossentropy": 2.4344149827957153, "loss/hidden": 0.3427734375, "loss/logits": 0.030597456730902195, "step": 448 }, { "epoch": 0.449, "grad_norm": 3.984375, "grad_norm_var": 3.4621622721354166, "learning_rate": 2e-05, "loss": 0.3036, "loss/crossentropy": 1.054320715367794, "loss/hidden": 0.28857421875, "loss/logits": 0.014980267733335495, "step": 449 }, { "epoch": 0.45, "grad_norm": 1.8359375, "grad_norm_var": 3.5083513895670575, "learning_rate": 2e-05, "loss": 0.3366, "loss/crossentropy": 2.0155181288719177, "loss/hidden": 0.310546875, "loss/logits": 0.02600990142673254, "step": 450 }, { "epoch": 0.451, "grad_norm": 2.0, "grad_norm_var": 3.4738199869791666, "learning_rate": 2e-05, "loss": 0.3511, "loss/crossentropy": 1.755088448524475, "loss/hidden": 0.3271484375, "loss/logits": 0.023935355246067047, "step": 451 }, { "epoch": 0.452, "grad_norm": 2.046875, "grad_norm_var": 3.4946329752604166, "learning_rate": 2e-05, "loss": 0.3499, "loss/crossentropy": 1.7622599005699158, "loss/hidden": 0.326171875, "loss/logits": 0.023745747283101082, "step": 452 }, { "epoch": 0.453, "grad_norm": 1.7890625, "grad_norm_var": 3.4932431538899738, "learning_rate": 2e-05, "loss": 0.3215, "loss/crossentropy": 2.3116530179977417, "loss/hidden": 0.298828125, "loss/logits": 0.022703303024172783, "step": 453 }, { "epoch": 0.454, "grad_norm": 1.6875, "grad_norm_var": 3.464989980061849, "learning_rate": 2e-05, "loss": 0.3673, "loss/crossentropy": 1.5556917786598206, "loss/hidden": 0.3408203125, "loss/logits": 0.026494111865758896, "step": 454 }, { "epoch": 0.455, "grad_norm": 2.0, "grad_norm_var": 1.3033078511555989, "learning_rate": 2e-05, "loss": 0.3715, "loss/crossentropy": 1.7844219207763672, "loss/hidden": 0.345703125, "loss/logits": 0.02580021321773529, "step": 455 }, { "epoch": 0.456, "grad_norm": 2.53125, "grad_norm_var": 1.2765439351399739, "learning_rate": 2e-05, "loss": 0.448, "loss/crossentropy": 1.2347650527954102, "loss/hidden": 0.4150390625, "loss/logits": 0.0329879354685545, "step": 456 }, { "epoch": 0.457, "grad_norm": 1.4375, "grad_norm_var": 0.7350563049316406, "learning_rate": 2e-05, "loss": 0.3455, "loss/crossentropy": 1.9715585112571716, "loss/hidden": 0.318359375, "loss/logits": 0.02718514297157526, "step": 457 }, { "epoch": 0.458, "grad_norm": 1.5859375, "grad_norm_var": 0.7471616109212239, "learning_rate": 2e-05, "loss": 0.3339, "loss/crossentropy": 2.389525294303894, "loss/hidden": 0.30859375, "loss/logits": 0.025292156264185905, "step": 458 }, { "epoch": 0.459, "grad_norm": 1.4921875, "grad_norm_var": 0.8066884358723958, "learning_rate": 2e-05, "loss": 0.3166, "loss/crossentropy": 1.7892733812332153, "loss/hidden": 0.29296875, "loss/logits": 0.023592060431838036, "step": 459 }, { "epoch": 0.46, "grad_norm": 1.8125, "grad_norm_var": 0.8243560791015625, "learning_rate": 2e-05, "loss": 0.3353, "loss/crossentropy": 1.9092342853546143, "loss/hidden": 0.3115234375, "loss/logits": 0.02376522123813629, "step": 460 }, { "epoch": 0.461, "grad_norm": 1.34375, "grad_norm_var": 0.87099609375, "learning_rate": 2e-05, "loss": 0.349, "loss/crossentropy": 1.9013403058052063, "loss/hidden": 0.3251953125, "loss/logits": 0.02381738182157278, "step": 461 }, { "epoch": 0.462, "grad_norm": 2.9375, "grad_norm_var": 0.76396484375, "learning_rate": 2e-05, "loss": 0.3492, "loss/crossentropy": 0.9097070023417473, "loss/hidden": 0.330078125, "loss/logits": 0.01913693710230291, "step": 462 }, { "epoch": 0.463, "grad_norm": 2.828125, "grad_norm_var": 0.4963287353515625, "learning_rate": 2e-05, "loss": 0.4669, "loss/crossentropy": 1.9413211345672607, "loss/hidden": 0.427734375, "loss/logits": 0.03912976011633873, "step": 463 }, { "epoch": 0.464, "grad_norm": 1.9296875, "grad_norm_var": 0.47173233032226564, "learning_rate": 2e-05, "loss": 0.3569, "loss/crossentropy": 2.3746496438980103, "loss/hidden": 0.326171875, "loss/logits": 0.030762989073991776, "step": 464 }, { "epoch": 0.465, "grad_norm": 1.796875, "grad_norm_var": 0.21467259724934895, "learning_rate": 2e-05, "loss": 0.3875, "loss/crossentropy": 1.920172929763794, "loss/hidden": 0.359375, "loss/logits": 0.028154666535556316, "step": 465 }, { "epoch": 0.466, "grad_norm": 2.59375, "grad_norm_var": 0.23995768229166667, "learning_rate": 2e-05, "loss": 0.4173, "loss/crossentropy": 2.1804317831993103, "loss/hidden": 0.3828125, "loss/logits": 0.03448019549250603, "step": 466 }, { "epoch": 0.467, "grad_norm": 2.453125, "grad_norm_var": 0.25349833170572916, "learning_rate": 2e-05, "loss": 0.3635, "loss/crossentropy": 2.1129865646362305, "loss/hidden": 0.3369140625, "loss/logits": 0.026613284833729267, "step": 467 }, { "epoch": 0.468, "grad_norm": 3.4375, "grad_norm_var": 0.37997639973958336, "learning_rate": 2e-05, "loss": 0.3892, "loss/crossentropy": 1.6438812613487244, "loss/hidden": 0.3623046875, "loss/logits": 0.026910429820418358, "step": 468 }, { "epoch": 0.469, "grad_norm": 13.125, "grad_norm_var": 7.936161041259766, "learning_rate": 2e-05, "loss": 0.4187, "loss/crossentropy": 1.8062403798103333, "loss/hidden": 0.3857421875, "loss/logits": 0.03291827440261841, "step": 469 }, { "epoch": 0.47, "grad_norm": 3.421875, "grad_norm_var": 7.8641212463378904, "learning_rate": 2e-05, "loss": 0.4157, "loss/crossentropy": 1.2208881378173828, "loss/hidden": 0.39453125, "loss/logits": 0.02117818035185337, "step": 470 }, { "epoch": 0.471, "grad_norm": 1.953125, "grad_norm_var": 7.8700111389160154, "learning_rate": 2e-05, "loss": 0.3306, "loss/crossentropy": 2.474324107170105, "loss/hidden": 0.3037109375, "loss/logits": 0.026909410022199154, "step": 471 }, { "epoch": 0.472, "grad_norm": 2.796875, "grad_norm_var": 7.860741933186849, "learning_rate": 2e-05, "loss": 0.4071, "loss/crossentropy": 1.8907885551452637, "loss/hidden": 0.3740234375, "loss/logits": 0.03311134688556194, "step": 472 }, { "epoch": 0.473, "grad_norm": 5.40625, "grad_norm_var": 8.053236643473307, "learning_rate": 2e-05, "loss": 0.482, "loss/crossentropy": 1.851112186908722, "loss/hidden": 0.4287109375, "loss/logits": 0.0532735763117671, "step": 473 }, { "epoch": 0.474, "grad_norm": 1.8125, "grad_norm_var": 8.008226521809895, "learning_rate": 2e-05, "loss": 0.4011, "loss/crossentropy": 2.0893144607543945, "loss/hidden": 0.37109375, "loss/logits": 0.03000558167695999, "step": 474 }, { "epoch": 0.475, "grad_norm": 1.84375, "grad_norm_var": 7.936071523030599, "learning_rate": 2e-05, "loss": 0.4086, "loss/crossentropy": 1.692557156085968, "loss/hidden": 0.37890625, "loss/logits": 0.029658248648047447, "step": 475 }, { "epoch": 0.476, "grad_norm": 1.734375, "grad_norm_var": 7.9510963439941404, "learning_rate": 2e-05, "loss": 0.3369, "loss/crossentropy": 2.7231298685073853, "loss/hidden": 0.3095703125, "loss/logits": 0.027365448884665966, "step": 476 }, { "epoch": 0.477, "grad_norm": 122.5, "grad_norm_var": 895.1761065165202, "learning_rate": 2e-05, "loss": 1.8739, "loss/crossentropy": 1.9931391477584839, "loss/hidden": 1.73828125, "loss/logits": 0.13565433584153652, "step": 477 }, { "epoch": 0.478, "grad_norm": 18.75, "grad_norm_var": 894.2567990620931, "learning_rate": 2e-05, "loss": 0.4467, "loss/crossentropy": 1.0818050801753998, "loss/hidden": 0.423828125, "loss/logits": 0.022886332124471664, "step": 478 }, { "epoch": 0.479, "grad_norm": 1.9609375, "grad_norm_var": 895.3381581624349, "learning_rate": 2e-05, "loss": 0.3744, "loss/crossentropy": 2.382234215736389, "loss/hidden": 0.3447265625, "loss/logits": 0.029717115685343742, "step": 479 }, { "epoch": 0.48, "grad_norm": 1.71875, "grad_norm_var": 895.6162839253743, "learning_rate": 2e-05, "loss": 0.3323, "loss/crossentropy": 2.0683305859565735, "loss/hidden": 0.30859375, "loss/logits": 0.023680799640715122, "step": 480 }, { "epoch": 0.481, "grad_norm": 2.546875, "grad_norm_var": 894.6604733784993, "learning_rate": 2e-05, "loss": 0.3756, "loss/crossentropy": 2.154377818107605, "loss/hidden": 0.34765625, "loss/logits": 0.02795298583805561, "step": 481 }, { "epoch": 0.482, "grad_norm": 3.1875, "grad_norm_var": 893.9573666890462, "learning_rate": 2e-05, "loss": 0.4124, "loss/crossentropy": 1.9701088666915894, "loss/hidden": 0.3779296875, "loss/logits": 0.03451960347592831, "step": 482 }, { "epoch": 0.483, "grad_norm": 2.5, "grad_norm_var": 893.8991452534993, "learning_rate": 2e-05, "loss": 0.4523, "loss/crossentropy": 0.9486123919487, "loss/hidden": 0.4306640625, "loss/logits": 0.02167674619704485, "step": 483 }, { "epoch": 0.484, "grad_norm": 2.578125, "grad_norm_var": 894.9027565002441, "learning_rate": 2e-05, "loss": 0.3955, "loss/crossentropy": 1.7118502855300903, "loss/hidden": 0.365234375, "loss/logits": 0.030311796814203262, "step": 484 }, { "epoch": 0.485, "grad_norm": 1.890625, "grad_norm_var": 900.7159604390462, "learning_rate": 2e-05, "loss": 0.3914, "loss/crossentropy": 1.7511045932769775, "loss/hidden": 0.36328125, "loss/logits": 0.02810109406709671, "step": 485 }, { "epoch": 0.486, "grad_norm": 2.203125, "grad_norm_var": 902.0463498433431, "learning_rate": 2e-05, "loss": 0.3893, "loss/crossentropy": 1.9742628931999207, "loss/hidden": 0.3603515625, "loss/logits": 0.028935128822922707, "step": 486 }, { "epoch": 0.487, "grad_norm": 2.609375, "grad_norm_var": 901.28504002889, "learning_rate": 2e-05, "loss": 0.338, "loss/crossentropy": 1.5944682955741882, "loss/hidden": 0.31640625, "loss/logits": 0.02155130822211504, "step": 487 }, { "epoch": 0.488, "grad_norm": 2.0, "grad_norm_var": 902.1965695699056, "learning_rate": 2e-05, "loss": 0.3749, "loss/crossentropy": 2.109809994697571, "loss/hidden": 0.3486328125, "loss/logits": 0.026237317360937595, "step": 488 }, { "epoch": 0.489, "grad_norm": 2.828125, "grad_norm_var": 904.5185605367025, "learning_rate": 2e-05, "loss": 0.3601, "loss/crossentropy": 2.371906280517578, "loss/hidden": 0.33203125, "loss/logits": 0.0280781090259552, "step": 489 }, { "epoch": 0.49, "grad_norm": 2.25, "grad_norm_var": 904.0067481994629, "learning_rate": 2e-05, "loss": 0.3881, "loss/crossentropy": 2.3074965476989746, "loss/hidden": 0.3583984375, "loss/logits": 0.029700559563934803, "step": 490 }, { "epoch": 0.491, "grad_norm": 1.609375, "grad_norm_var": 904.2906532287598, "learning_rate": 2e-05, "loss": 0.3533, "loss/crossentropy": 2.0604811906814575, "loss/hidden": 0.3271484375, "loss/logits": 0.026149596087634563, "step": 491 }, { "epoch": 0.492, "grad_norm": 2.203125, "grad_norm_var": 903.7375221252441, "learning_rate": 2e-05, "loss": 0.3982, "loss/crossentropy": 2.0394086837768555, "loss/hidden": 0.3671875, "loss/logits": 0.030979415401816368, "step": 492 }, { "epoch": 0.493, "grad_norm": 1.53125, "grad_norm_var": 17.239774322509767, "learning_rate": 2e-05, "loss": 0.3721, "loss/crossentropy": 1.992867350578308, "loss/hidden": 0.3447265625, "loss/logits": 0.02732760366052389, "step": 493 }, { "epoch": 0.494, "grad_norm": 1.5, "grad_norm_var": 0.24021377563476562, "learning_rate": 2e-05, "loss": 0.3607, "loss/crossentropy": 2.0647668838500977, "loss/hidden": 0.3349609375, "loss/logits": 0.02573198452591896, "step": 494 }, { "epoch": 0.495, "grad_norm": 3.265625, "grad_norm_var": 0.3059153238932292, "learning_rate": 2e-05, "loss": 0.4332, "loss/crossentropy": 2.0061678886413574, "loss/hidden": 0.4033203125, "loss/logits": 0.029847824946045876, "step": 495 }, { "epoch": 0.496, "grad_norm": 1.671875, "grad_norm_var": 0.30953776041666664, "learning_rate": 2e-05, "loss": 0.3677, "loss/crossentropy": 2.029963493347168, "loss/hidden": 0.3408203125, "loss/logits": 0.026841914281249046, "step": 496 }, { "epoch": 0.497, "grad_norm": 2.1875, "grad_norm_var": 0.3045074462890625, "learning_rate": 2e-05, "loss": 0.3773, "loss/crossentropy": 1.836094081401825, "loss/hidden": 0.3505859375, "loss/logits": 0.026703315787017345, "step": 497 }, { "epoch": 0.498, "grad_norm": 1.8984375, "grad_norm_var": 0.24739761352539064, "learning_rate": 2e-05, "loss": 0.3934, "loss/crossentropy": 2.284022331237793, "loss/hidden": 0.36328125, "loss/logits": 0.030102317221462727, "step": 498 }, { "epoch": 0.499, "grad_norm": 1.609375, "grad_norm_var": 0.25783462524414064, "learning_rate": 2e-05, "loss": 0.422, "loss/crossentropy": 1.7640503644943237, "loss/hidden": 0.388671875, "loss/logits": 0.03330034948885441, "step": 499 }, { "epoch": 0.5, "grad_norm": 2.40625, "grad_norm_var": 0.2490618387858073, "learning_rate": 2e-05, "loss": 0.4409, "loss/crossentropy": 1.4432637095451355, "loss/hidden": 0.4130859375, "loss/logits": 0.027862844988703728, "step": 500 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.2202930782208e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }