{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5,
"eval_steps": 250,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001,
"grad_norm": 0.000537872314453125,
"learning_rate": 2.0000000000000002e-07,
"loss": 0.0002,
"loss/crossentropy": 0.8766392022371292,
"loss/hidden": 0.0,
"loss/logits": 0.00021765431665698998,
"step": 1
},
{
"epoch": 0.002,
"grad_norm": 0.2265625,
"learning_rate": 4.0000000000000003e-07,
"loss": 0.005,
"loss/crossentropy": 1.9883175492286682,
"loss/hidden": 0.0039215087890625,
"loss/logits": 0.001088879187591374,
"step": 2
},
{
"epoch": 0.003,
"grad_norm": 0.25390625,
"learning_rate": 6.000000000000001e-07,
"loss": 0.0052,
"loss/crossentropy": 1.8020615577697754,
"loss/hidden": 0.004180908203125,
"loss/logits": 0.0010398300073575228,
"step": 3
},
{
"epoch": 0.004,
"grad_norm": 0.255859375,
"learning_rate": 8.000000000000001e-07,
"loss": 0.0049,
"loss/crossentropy": 1.0764193534851074,
"loss/hidden": 0.00399017333984375,
"loss/logits": 0.0008995172393042594,
"step": 4
},
{
"epoch": 0.005,
"grad_norm": 0.224609375,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0049,
"loss/crossentropy": 1.7853868007659912,
"loss/hidden": 0.0038604736328125,
"loss/logits": 0.0010730837238952518,
"step": 5
},
{
"epoch": 0.006,
"grad_norm": 0.2333984375,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.0051,
"loss/crossentropy": 2.4102118015289307,
"loss/hidden": 0.00388336181640625,
"loss/logits": 0.0011915687937289476,
"step": 6
},
{
"epoch": 0.007,
"grad_norm": 0.35546875,
"learning_rate": 1.4000000000000001e-06,
"loss": 0.0056,
"loss/crossentropy": 1.9921993017196655,
"loss/hidden": 0.0044403076171875,
"loss/logits": 0.0011139529524371028,
"step": 7
},
{
"epoch": 0.008,
"grad_norm": 0.2353515625,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.0049,
"loss/crossentropy": 2.269957184791565,
"loss/hidden": 0.00376129150390625,
"loss/logits": 0.0011444001575000584,
"step": 8
},
{
"epoch": 0.009,
"grad_norm": 0.22265625,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.0051,
"loss/crossentropy": 2.1889681220054626,
"loss/hidden": 0.0038909912109375,
"loss/logits": 0.0011716101435013115,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 0.291015625,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0052,
"loss/crossentropy": 1.76205712556839,
"loss/hidden": 0.0041351318359375,
"loss/logits": 0.001058999594533816,
"step": 10
},
{
"epoch": 0.011,
"grad_norm": 0.2177734375,
"learning_rate": 2.2e-06,
"loss": 0.0049,
"loss/crossentropy": 2.438264012336731,
"loss/hidden": 0.003753662109375,
"loss/logits": 0.0011843050015158951,
"step": 11
},
{
"epoch": 0.012,
"grad_norm": 0.41015625,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.0071,
"loss/crossentropy": 1.8871825337409973,
"loss/hidden": 0.0059051513671875,
"loss/logits": 0.0011930759064853191,
"step": 12
},
{
"epoch": 0.013,
"grad_norm": 0.53125,
"learning_rate": 2.6e-06,
"loss": 0.0084,
"loss/crossentropy": 1.7400972247123718,
"loss/hidden": 0.0071258544921875,
"loss/logits": 0.001270102453418076,
"step": 13
},
{
"epoch": 0.014,
"grad_norm": 0.365234375,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.0075,
"loss/crossentropy": 2.0053656101226807,
"loss/hidden": 0.006256103515625,
"loss/logits": 0.0012446122709661722,
"step": 14
},
{
"epoch": 0.015,
"grad_norm": 0.455078125,
"learning_rate": 3e-06,
"loss": 0.0072,
"loss/crossentropy": 1.984630048274994,
"loss/hidden": 0.0059356689453125,
"loss/logits": 0.0012947238283231854,
"step": 15
},
{
"epoch": 0.016,
"grad_norm": 0.447265625,
"grad_norm_var": 0.016307008621940136,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.0072,
"loss/crossentropy": 2.4732788801193237,
"loss/hidden": 0.005767822265625,
"loss/logits": 0.00144299550447613,
"step": 16
},
{
"epoch": 0.017,
"grad_norm": 0.89453125,
"grad_norm_var": 0.031113270918528238,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.0076,
"loss/crossentropy": 1.7775737643241882,
"loss/hidden": 0.006317138671875,
"loss/logits": 0.001260987774003297,
"step": 17
},
{
"epoch": 0.018,
"grad_norm": 0.45703125,
"grad_norm_var": 0.030601243178049724,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.0067,
"loss/crossentropy": 1.1123631671071053,
"loss/hidden": 0.0057373046875,
"loss/logits": 0.0009507400100119412,
"step": 18
},
{
"epoch": 0.019,
"grad_norm": 0.298828125,
"grad_norm_var": 0.030057998498280843,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.0068,
"loss/crossentropy": 1.8855515718460083,
"loss/hidden": 0.0055694580078125,
"loss/logits": 0.0012491169618442655,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 0.3984375,
"grad_norm_var": 0.02918777068456014,
"learning_rate": 4.000000000000001e-06,
"loss": 0.007,
"loss/crossentropy": 1.773246705532074,
"loss/hidden": 0.005828857421875,
"loss/logits": 0.0011664124322123826,
"step": 20
},
{
"epoch": 0.021,
"grad_norm": 0.302734375,
"grad_norm_var": 0.02797787586847941,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.0069,
"loss/crossentropy": 2.1012651920318604,
"loss/hidden": 0.0056610107421875,
"loss/logits": 0.0012796117807738483,
"step": 21
},
{
"epoch": 0.022,
"grad_norm": 0.486328125,
"grad_norm_var": 0.026955906550089517,
"learning_rate": 4.4e-06,
"loss": 0.0101,
"loss/crossentropy": 1.9430513381958008,
"loss/hidden": 0.008514404296875,
"loss/logits": 0.0016175230266526341,
"step": 22
},
{
"epoch": 0.023,
"grad_norm": 0.609375,
"grad_norm_var": 0.029542907079060873,
"learning_rate": 4.600000000000001e-06,
"loss": 0.0118,
"loss/crossentropy": 1.5989271998405457,
"loss/hidden": 0.01025390625,
"loss/logits": 0.0015109491650946438,
"step": 23
},
{
"epoch": 0.024,
"grad_norm": 0.80078125,
"grad_norm_var": 0.03606090148289998,
"learning_rate": 4.800000000000001e-06,
"loss": 0.0102,
"loss/crossentropy": 1.141058474779129,
"loss/hidden": 0.009033203125,
"loss/logits": 0.0011210083321202546,
"step": 24
},
{
"epoch": 0.025,
"grad_norm": 0.361328125,
"grad_norm_var": 0.03307259480158488,
"learning_rate": 5e-06,
"loss": 0.0094,
"loss/crossentropy": 2.0950170755386353,
"loss/hidden": 0.0077972412109375,
"loss/logits": 0.001559894997626543,
"step": 25
},
{
"epoch": 0.026,
"grad_norm": 0.83984375,
"grad_norm_var": 0.0396828293800354,
"learning_rate": 5.2e-06,
"loss": 0.0112,
"loss/crossentropy": 0.9552253857254982,
"loss/hidden": 0.010284423828125,
"loss/logits": 0.0008805262332316488,
"step": 26
},
{
"epoch": 0.027,
"grad_norm": 0.546875,
"grad_norm_var": 0.034408044815063474,
"learning_rate": 5.400000000000001e-06,
"loss": 0.0091,
"loss/crossentropy": 1.3719437271356583,
"loss/hidden": 0.007965087890625,
"loss/logits": 0.001155910431407392,
"step": 27
},
{
"epoch": 0.028,
"grad_norm": 0.73046875,
"grad_norm_var": 0.036436065038045244,
"learning_rate": 5.600000000000001e-06,
"loss": 0.0107,
"loss/crossentropy": 1.6477643251419067,
"loss/hidden": 0.009185791015625,
"loss/logits": 0.0015593590214848518,
"step": 28
},
{
"epoch": 0.029,
"grad_norm": 0.41796875,
"grad_norm_var": 0.03726207415262858,
"learning_rate": 5.8e-06,
"loss": 0.0096,
"loss/crossentropy": 1.7987680435180664,
"loss/hidden": 0.008087158203125,
"loss/logits": 0.0015162223717197776,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 0.33203125,
"grad_norm_var": 0.03804162343343099,
"learning_rate": 6e-06,
"loss": 0.0094,
"loss/crossentropy": 1.74210923910141,
"loss/hidden": 0.008026123046875,
"loss/logits": 0.0013514517340809107,
"step": 30
},
{
"epoch": 0.031,
"grad_norm": 0.4296875,
"grad_norm_var": 0.038314167658487955,
"learning_rate": 6.200000000000001e-06,
"loss": 0.0095,
"loss/crossentropy": 1.45715793967247,
"loss/hidden": 0.0081329345703125,
"loss/logits": 0.0013754194369539618,
"step": 31
},
{
"epoch": 0.032,
"grad_norm": 0.54296875,
"grad_norm_var": 0.03793176015218099,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.0137,
"loss/crossentropy": 1.635874330997467,
"loss/hidden": 0.01190185546875,
"loss/logits": 0.0017871989402920008,
"step": 32
},
{
"epoch": 0.033,
"grad_norm": 0.76171875,
"grad_norm_var": 0.03254489898681641,
"learning_rate": 6.600000000000001e-06,
"loss": 0.0143,
"loss/crossentropy": 1.0347481966018677,
"loss/hidden": 0.01300048828125,
"loss/logits": 0.0012789819156751037,
"step": 33
},
{
"epoch": 0.034,
"grad_norm": 0.515625,
"grad_norm_var": 0.032269287109375,
"learning_rate": 6.800000000000001e-06,
"loss": 0.0132,
"loss/crossentropy": 2.0032879114151,
"loss/hidden": 0.011383056640625,
"loss/logits": 0.0018645224627107382,
"step": 34
},
{
"epoch": 0.035,
"grad_norm": 1.0703125,
"grad_norm_var": 0.04636419614156087,
"learning_rate": 7e-06,
"loss": 0.0143,
"loss/crossentropy": 1.8410796523094177,
"loss/hidden": 0.01226806640625,
"loss/logits": 0.001986370305530727,
"step": 35
},
{
"epoch": 0.036,
"grad_norm": 0.4296875,
"grad_norm_var": 0.045703490575154625,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.0136,
"loss/crossentropy": 1.9098870158195496,
"loss/hidden": 0.01171875,
"loss/logits": 0.0018596722511574626,
"step": 36
},
{
"epoch": 0.037,
"grad_norm": 68.0,
"grad_norm_var": 284.03319854736327,
"learning_rate": 7.4e-06,
"loss": 0.0558,
"loss/crossentropy": 1.5951663255691528,
"loss/hidden": 0.051666259765625,
"loss/logits": 0.004160793498158455,
"step": 37
},
{
"epoch": 0.038,
"grad_norm": 0.380859375,
"grad_norm_var": 284.0946207046509,
"learning_rate": 7.600000000000001e-06,
"loss": 0.0133,
"loss/crossentropy": 2.25837504863739,
"loss/hidden": 0.01129150390625,
"loss/logits": 0.0020168160554021597,
"step": 38
},
{
"epoch": 0.039,
"grad_norm": 0.455078125,
"grad_norm_var": 284.1822828769684,
"learning_rate": 7.800000000000002e-06,
"loss": 0.0126,
"loss/crossentropy": 2.126526176929474,
"loss/hidden": 0.0107421875,
"loss/logits": 0.0018400833941996098,
"step": 39
},
{
"epoch": 0.04,
"grad_norm": 0.63671875,
"grad_norm_var": 284.27119545936586,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0142,
"loss/crossentropy": 1.4863142371177673,
"loss/hidden": 0.012481689453125,
"loss/logits": 0.0017027563299052417,
"step": 40
},
{
"epoch": 0.041,
"grad_norm": 0.283203125,
"grad_norm_var": 284.3175859928131,
"learning_rate": 8.2e-06,
"loss": 0.0112,
"loss/crossentropy": 2.0888695120811462,
"loss/hidden": 0.009521484375,
"loss/logits": 0.0017255974235013127,
"step": 41
},
{
"epoch": 0.042,
"grad_norm": 0.431640625,
"grad_norm_var": 284.5420877456665,
"learning_rate": 8.400000000000001e-06,
"loss": 0.0173,
"loss/crossentropy": 1.611488163471222,
"loss/hidden": 0.015380859375,
"loss/logits": 0.0019445380312390625,
"step": 42
},
{
"epoch": 0.043,
"grad_norm": 0.419921875,
"grad_norm_var": 284.6142045180003,
"learning_rate": 8.6e-06,
"loss": 0.0166,
"loss/crossentropy": 1.8987411260604858,
"loss/hidden": 0.0146484375,
"loss/logits": 0.0019467678503133357,
"step": 43
},
{
"epoch": 0.044,
"grad_norm": 0.58203125,
"grad_norm_var": 284.6949343204498,
"learning_rate": 8.8e-06,
"loss": 0.0183,
"loss/crossentropy": 1.4084473848342896,
"loss/hidden": 0.01605224609375,
"loss/logits": 0.002271471545100212,
"step": 44
},
{
"epoch": 0.045,
"grad_norm": 0.380859375,
"grad_norm_var": 284.71635888417563,
"learning_rate": 9e-06,
"loss": 0.0159,
"loss/crossentropy": 1.6970309615135193,
"loss/hidden": 0.01397705078125,
"loss/logits": 0.0019325784523971379,
"step": 45
},
{
"epoch": 0.046,
"grad_norm": 0.455078125,
"grad_norm_var": 284.64517935117084,
"learning_rate": 9.200000000000002e-06,
"loss": 0.0165,
"loss/crossentropy": 2.1346731781959534,
"loss/hidden": 0.014312744140625,
"loss/logits": 0.002142712823115289,
"step": 46
},
{
"epoch": 0.047,
"grad_norm": 2.21875,
"grad_norm_var": 283.818000014623,
"learning_rate": 9.4e-06,
"loss": 0.0175,
"loss/crossentropy": 1.6114214062690735,
"loss/hidden": 0.0155029296875,
"loss/logits": 0.0020421514636836946,
"step": 47
},
{
"epoch": 0.048,
"grad_norm": 0.44921875,
"grad_norm_var": 283.87235945065817,
"learning_rate": 9.600000000000001e-06,
"loss": 0.0157,
"loss/crossentropy": 2.056842625141144,
"loss/hidden": 0.013671875,
"loss/logits": 0.0020451846066862345,
"step": 48
},
{
"epoch": 0.049,
"grad_norm": 0.439453125,
"grad_norm_var": 284.05417149861654,
"learning_rate": 9.800000000000001e-06,
"loss": 0.016,
"loss/crossentropy": 1.5892411470413208,
"loss/hidden": 0.013946533203125,
"loss/logits": 0.00205704930704087,
"step": 49
},
{
"epoch": 0.05,
"grad_norm": 0.3359375,
"grad_norm_var": 284.15935770670575,
"learning_rate": 1e-05,
"loss": 0.0153,
"loss/crossentropy": 2.3872954845428467,
"loss/hidden": 0.01312255859375,
"loss/logits": 0.0021313573233783245,
"step": 50
},
{
"epoch": 0.051,
"grad_norm": 0.451171875,
"grad_norm_var": 284.49208029111225,
"learning_rate": 1.02e-05,
"loss": 0.0168,
"loss/crossentropy": 2.0149841904640198,
"loss/hidden": 0.01470947265625,
"loss/logits": 0.0020815907046198845,
"step": 51
},
{
"epoch": 0.052,
"grad_norm": 0.51953125,
"grad_norm_var": 284.44056928952534,
"learning_rate": 1.04e-05,
"loss": 0.021,
"loss/crossentropy": 1.9311216473579407,
"loss/hidden": 0.0185546875,
"loss/logits": 0.0024686548858880997,
"step": 52
},
{
"epoch": 0.053,
"grad_norm": 0.546875,
"grad_norm_var": 0.20315702756245932,
"learning_rate": 1.0600000000000002e-05,
"loss": 0.0204,
"loss/crossentropy": 1.9871841073036194,
"loss/hidden": 0.01806640625,
"loss/logits": 0.00237347767688334,
"step": 53
},
{
"epoch": 0.054,
"grad_norm": 0.51171875,
"grad_norm_var": 0.2010729471842448,
"learning_rate": 1.0800000000000002e-05,
"loss": 0.0195,
"loss/crossentropy": 1.4909774661064148,
"loss/hidden": 0.017578125,
"loss/logits": 0.0018839699332602322,
"step": 54
},
{
"epoch": 0.055,
"grad_norm": 0.376953125,
"grad_norm_var": 0.20264968872070313,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.0188,
"loss/crossentropy": 1.731587290763855,
"loss/hidden": 0.01666259765625,
"loss/logits": 0.0021363290725275874,
"step": 55
},
{
"epoch": 0.056,
"grad_norm": 0.482421875,
"grad_norm_var": 0.20266098976135255,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.0198,
"loss/crossentropy": 1.8391692638397217,
"loss/hidden": 0.01751708984375,
"loss/logits": 0.0022706754971295595,
"step": 56
},
{
"epoch": 0.057,
"grad_norm": 0.82421875,
"grad_norm_var": 0.20132694244384766,
"learning_rate": 1.14e-05,
"loss": 0.0181,
"loss/crossentropy": 1.326266534626484,
"loss/hidden": 0.01654052734375,
"loss/logits": 0.0015604346699547023,
"step": 57
},
{
"epoch": 0.058,
"grad_norm": 0.41015625,
"grad_norm_var": 0.2018068790435791,
"learning_rate": 1.16e-05,
"loss": 0.0185,
"loss/crossentropy": 2.5511186122894287,
"loss/hidden": 0.01611328125,
"loss/logits": 0.0024241225328296423,
"step": 58
},
{
"epoch": 0.059,
"grad_norm": 1.609375,
"grad_norm_var": 0.26361236572265623,
"learning_rate": 1.18e-05,
"loss": 0.0183,
"loss/crossentropy": 1.0930684125050902,
"loss/hidden": 0.01702880859375,
"loss/logits": 0.0013018156460020691,
"step": 59
},
{
"epoch": 0.06,
"grad_norm": 0.486328125,
"grad_norm_var": 0.2652066389719645,
"learning_rate": 1.2e-05,
"loss": 0.02,
"loss/crossentropy": 2.0819135308265686,
"loss/hidden": 0.0174560546875,
"loss/logits": 0.0025293552316725254,
"step": 60
},
{
"epoch": 0.061,
"grad_norm": 1.09375,
"grad_norm_var": 0.2708051045735677,
"learning_rate": 1.22e-05,
"loss": 0.0183,
"loss/crossentropy": 0.9290539920330048,
"loss/hidden": 0.016754150390625,
"loss/logits": 0.0015562092885375023,
"step": 61
},
{
"epoch": 0.062,
"grad_norm": 0.453125,
"grad_norm_var": 0.2708693027496338,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.0227,
"loss/crossentropy": 2.1691651344299316,
"loss/hidden": 0.01995849609375,
"loss/logits": 0.002767750178463757,
"step": 62
},
{
"epoch": 0.063,
"grad_norm": 0.4765625,
"grad_norm_var": 0.10790785153706868,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.0233,
"loss/crossentropy": 2.1545491218566895,
"loss/hidden": 0.0205078125,
"loss/logits": 0.002785824006423354,
"step": 63
},
{
"epoch": 0.064,
"grad_norm": 0.47265625,
"grad_norm_var": 0.10749700864156088,
"learning_rate": 1.2800000000000001e-05,
"loss": 0.0223,
"loss/crossentropy": 1.9527725577354431,
"loss/hidden": 0.01971435546875,
"loss/logits": 0.0025634407065808773,
"step": 64
},
{
"epoch": 0.065,
"grad_norm": 0.55078125,
"grad_norm_var": 0.10599034627278646,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.0256,
"loss/crossentropy": 1.8496606945991516,
"loss/hidden": 0.02288818359375,
"loss/logits": 0.0027499888092279434,
"step": 65
},
{
"epoch": 0.066,
"grad_norm": 0.55859375,
"grad_norm_var": 0.1012465794881185,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.0221,
"loss/crossentropy": 1.9440131187438965,
"loss/hidden": 0.01971435546875,
"loss/logits": 0.002431391447316855,
"step": 66
},
{
"epoch": 0.067,
"grad_norm": 0.498046875,
"grad_norm_var": 0.10036614735921225,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.0241,
"loss/crossentropy": 1.7777947187423706,
"loss/hidden": 0.02142333984375,
"loss/logits": 0.0026856372132897377,
"step": 67
},
{
"epoch": 0.068,
"grad_norm": 0.66015625,
"grad_norm_var": 0.09977563222249348,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.0241,
"loss/crossentropy": 1.6634170711040497,
"loss/hidden": 0.02178955078125,
"loss/logits": 0.002268874435685575,
"step": 68
},
{
"epoch": 0.069,
"grad_norm": 0.359375,
"grad_norm_var": 0.1039443333943685,
"learning_rate": 1.38e-05,
"loss": 0.0217,
"loss/crossentropy": 1.9945446252822876,
"loss/hidden": 0.019287109375,
"loss/logits": 0.0024602848570793867,
"step": 69
},
{
"epoch": 0.07,
"grad_norm": 0.546875,
"grad_norm_var": 0.10354207356770834,
"learning_rate": 1.4e-05,
"loss": 0.0212,
"loss/crossentropy": 2.234881281852722,
"loss/hidden": 0.0185546875,
"loss/logits": 0.0026649613864719868,
"step": 70
},
{
"epoch": 0.071,
"grad_norm": 0.5390625,
"grad_norm_var": 0.1000130812327067,
"learning_rate": 1.4200000000000001e-05,
"loss": 0.0235,
"loss/crossentropy": 2.3283374309539795,
"loss/hidden": 0.0206298828125,
"loss/logits": 0.0028440920868888497,
"step": 71
},
{
"epoch": 0.072,
"grad_norm": 0.96484375,
"grad_norm_var": 0.10530134836832682,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.0273,
"loss/crossentropy": 2.446515917778015,
"loss/hidden": 0.0244140625,
"loss/logits": 0.002847215859219432,
"step": 72
},
{
"epoch": 0.073,
"grad_norm": 0.66015625,
"grad_norm_var": 0.10331465403238932,
"learning_rate": 1.46e-05,
"loss": 0.0313,
"loss/crossentropy": 1.8365015387535095,
"loss/hidden": 0.0277099609375,
"loss/logits": 0.003543111262843013,
"step": 73
},
{
"epoch": 0.074,
"grad_norm": 0.58203125,
"grad_norm_var": 0.0997507095336914,
"learning_rate": 1.48e-05,
"loss": 0.0275,
"loss/crossentropy": 1.8750606179237366,
"loss/hidden": 0.0244140625,
"loss/logits": 0.0030850095208734274,
"step": 74
},
{
"epoch": 0.075,
"grad_norm": 0.6171875,
"grad_norm_var": 0.03528436024983724,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0285,
"loss/crossentropy": 1.6197695136070251,
"loss/hidden": 0.02557373046875,
"loss/logits": 0.002948817447759211,
"step": 75
},
{
"epoch": 0.076,
"grad_norm": 0.5546875,
"grad_norm_var": 0.034586191177368164,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.0253,
"loss/crossentropy": 2.139370322227478,
"loss/hidden": 0.0225830078125,
"loss/logits": 0.002709153341129422,
"step": 76
},
{
"epoch": 0.077,
"grad_norm": 0.78125,
"grad_norm_var": 0.020085255304972332,
"learning_rate": 1.54e-05,
"loss": 0.0308,
"loss/crossentropy": 1.5335928797721863,
"loss/hidden": 0.02777099609375,
"loss/logits": 0.00305762467905879,
"step": 77
},
{
"epoch": 0.078,
"grad_norm": 0.5078125,
"grad_norm_var": 0.019349145889282226,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.0273,
"loss/crossentropy": 2.623558282852173,
"loss/hidden": 0.024169921875,
"loss/logits": 0.0031643210677430034,
"step": 78
},
{
"epoch": 0.079,
"grad_norm": 0.470703125,
"grad_norm_var": 0.019434547424316405,
"learning_rate": 1.58e-05,
"loss": 0.0275,
"loss/crossentropy": 2.3246337175369263,
"loss/hidden": 0.0242919921875,
"loss/logits": 0.0031679703388363123,
"step": 79
},
{
"epoch": 0.08,
"grad_norm": 0.431640625,
"grad_norm_var": 0.0201418399810791,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0254,
"loss/crossentropy": 1.801970660686493,
"loss/hidden": 0.0228271484375,
"loss/logits": 0.0025987064000219107,
"step": 80
},
{
"epoch": 0.081,
"grad_norm": 0.44921875,
"grad_norm_var": 0.021184905370076498,
"learning_rate": 1.62e-05,
"loss": 0.0265,
"loss/crossentropy": 1.9489317536354065,
"loss/hidden": 0.02374267578125,
"loss/logits": 0.0027701087528839707,
"step": 81
},
{
"epoch": 0.082,
"grad_norm": 0.67578125,
"grad_norm_var": 0.02180479367574056,
"learning_rate": 1.64e-05,
"loss": 0.034,
"loss/crossentropy": 1.7697851061820984,
"loss/hidden": 0.03070068359375,
"loss/logits": 0.003283574478700757,
"step": 82
},
{
"epoch": 0.083,
"grad_norm": 0.57421875,
"grad_norm_var": 0.021323140462239584,
"learning_rate": 1.66e-05,
"loss": 0.0309,
"loss/crossentropy": 1.5783970654010773,
"loss/hidden": 0.028076171875,
"loss/logits": 0.002809713245369494,
"step": 83
},
{
"epoch": 0.084,
"grad_norm": 0.53125,
"grad_norm_var": 0.02108605702718099,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.0332,
"loss/crossentropy": 1.460361659526825,
"loss/hidden": 0.0303955078125,
"loss/logits": 0.0027706819819286466,
"step": 84
},
{
"epoch": 0.085,
"grad_norm": 0.6015625,
"grad_norm_var": 0.017696062723795574,
"learning_rate": 1.7e-05,
"loss": 0.0324,
"loss/crossentropy": 2.1110434532165527,
"loss/hidden": 0.02911376953125,
"loss/logits": 0.0033112409291788936,
"step": 85
},
{
"epoch": 0.086,
"grad_norm": 0.451171875,
"grad_norm_var": 0.018857304255167642,
"learning_rate": 1.72e-05,
"loss": 0.0291,
"loss/crossentropy": 1.7163687944412231,
"loss/hidden": 0.02630615234375,
"loss/logits": 0.0027680074563249946,
"step": 86
},
{
"epoch": 0.087,
"grad_norm": 0.5703125,
"grad_norm_var": 0.018718449274698894,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.0339,
"loss/crossentropy": 1.8893783688545227,
"loss/hidden": 0.03021240234375,
"loss/logits": 0.0037144168745726347,
"step": 87
},
{
"epoch": 0.088,
"grad_norm": 1.75,
"grad_norm_var": 0.0965951124827067,
"learning_rate": 1.76e-05,
"loss": 0.0293,
"loss/crossentropy": 1.0857177823781967,
"loss/hidden": 0.02716064453125,
"loss/logits": 0.002114512084517628,
"step": 88
},
{
"epoch": 0.089,
"grad_norm": 0.4609375,
"grad_norm_var": 0.09848872820536296,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.0278,
"loss/crossentropy": 2.1670188307762146,
"loss/hidden": 0.0250244140625,
"loss/logits": 0.0027708488050848246,
"step": 89
},
{
"epoch": 0.09,
"grad_norm": 2.984375,
"grad_norm_var": 0.4452332655588786,
"learning_rate": 1.8e-05,
"loss": 0.034,
"loss/crossentropy": 0.8697951380163431,
"loss/hidden": 0.0322265625,
"loss/logits": 0.0017659573932178319,
"step": 90
},
{
"epoch": 0.091,
"grad_norm": 0.58984375,
"grad_norm_var": 0.44585811297098793,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.0315,
"loss/crossentropy": 2.0653520226478577,
"loss/hidden": 0.02813720703125,
"loss/logits": 0.003313788794912398,
"step": 91
},
{
"epoch": 0.092,
"grad_norm": 0.66015625,
"grad_norm_var": 0.44346858660380045,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.0352,
"loss/crossentropy": 2.1175276041030884,
"loss/hidden": 0.0318603515625,
"loss/logits": 0.003378898836672306,
"step": 92
},
{
"epoch": 0.093,
"grad_norm": 0.478515625,
"grad_norm_var": 0.44917195638020835,
"learning_rate": 1.86e-05,
"loss": 0.0328,
"loss/crossentropy": 2.192784309387207,
"loss/hidden": 0.029296875,
"loss/logits": 0.003497788915410638,
"step": 93
},
{
"epoch": 0.094,
"grad_norm": 0.50390625,
"grad_norm_var": 0.4493051528930664,
"learning_rate": 1.88e-05,
"loss": 0.0342,
"loss/crossentropy": 1.8000940680503845,
"loss/hidden": 0.0308837890625,
"loss/logits": 0.003295119386166334,
"step": 94
},
{
"epoch": 0.095,
"grad_norm": 0.86328125,
"grad_norm_var": 0.44371743202209474,
"learning_rate": 1.9e-05,
"loss": 0.0376,
"loss/crossentropy": 1.9514374732971191,
"loss/hidden": 0.0340576171875,
"loss/logits": 0.0035327656660228968,
"step": 95
},
{
"epoch": 0.096,
"grad_norm": 0.55859375,
"grad_norm_var": 0.4387262980143229,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.0334,
"loss/crossentropy": 1.7834157943725586,
"loss/hidden": 0.03021240234375,
"loss/logits": 0.003167669870890677,
"step": 96
},
{
"epoch": 0.097,
"grad_norm": 0.71484375,
"grad_norm_var": 0.4309270222981771,
"learning_rate": 1.94e-05,
"loss": 0.0327,
"loss/crossentropy": 1.6889591813087463,
"loss/hidden": 0.02972412109375,
"loss/logits": 0.0029616469983011484,
"step": 97
},
{
"epoch": 0.098,
"grad_norm": 0.56640625,
"grad_norm_var": 0.4336400349934896,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.0354,
"loss/crossentropy": 1.7813147902488708,
"loss/hidden": 0.031982421875,
"loss/logits": 0.003417789936065674,
"step": 98
},
{
"epoch": 0.099,
"grad_norm": 0.9140625,
"grad_norm_var": 0.43045953114827473,
"learning_rate": 1.98e-05,
"loss": 0.0376,
"loss/crossentropy": 1.3951178789138794,
"loss/hidden": 0.0345458984375,
"loss/logits": 0.0030310061993077397,
"step": 99
},
{
"epoch": 0.1,
"grad_norm": 0.56640625,
"grad_norm_var": 0.4291600545247396,
"learning_rate": 2e-05,
"loss": 0.0364,
"loss/crossentropy": 2.255498170852661,
"loss/hidden": 0.03277587890625,
"loss/logits": 0.0036420804681256413,
"step": 100
},
{
"epoch": 0.101,
"grad_norm": 0.58984375,
"grad_norm_var": 0.429521115620931,
"learning_rate": 2e-05,
"loss": 0.033,
"loss/crossentropy": 2.4104394912719727,
"loss/hidden": 0.02960205078125,
"loss/logits": 0.0033488960471004248,
"step": 101
},
{
"epoch": 0.102,
"grad_norm": 4.8125,
"grad_norm_var": 1.4001366774241129,
"learning_rate": 2e-05,
"loss": 0.0477,
"loss/crossentropy": 1.0830636993050575,
"loss/hidden": 0.0452880859375,
"loss/logits": 0.0023841604124754667,
"step": 102
},
{
"epoch": 0.103,
"grad_norm": 4.1875,
"grad_norm_var": 1.9629084110260009,
"learning_rate": 2e-05,
"loss": 0.0475,
"loss/crossentropy": 0.7437883876264095,
"loss/hidden": 0.0455322265625,
"loss/logits": 0.0019981139339506626,
"step": 103
},
{
"epoch": 0.104,
"grad_norm": 0.77734375,
"grad_norm_var": 1.9669294834136963,
"learning_rate": 2e-05,
"loss": 0.0387,
"loss/crossentropy": 2.1284059882164,
"loss/hidden": 0.0345458984375,
"loss/logits": 0.00411223981063813,
"step": 104
},
{
"epoch": 0.105,
"grad_norm": 1.6796875,
"grad_norm_var": 1.92922043800354,
"learning_rate": 2e-05,
"loss": 0.0459,
"loss/crossentropy": 2.1119471192359924,
"loss/hidden": 0.0411376953125,
"loss/logits": 0.0047579677775502205,
"step": 105
},
{
"epoch": 0.106,
"grad_norm": 0.90234375,
"grad_norm_var": 1.7437895298004151,
"learning_rate": 2e-05,
"loss": 0.044,
"loss/crossentropy": 2.391239643096924,
"loss/hidden": 0.0390625,
"loss/logits": 0.004930721828714013,
"step": 106
},
{
"epoch": 0.107,
"grad_norm": 1.6875,
"grad_norm_var": 1.7282822767893473,
"learning_rate": 2e-05,
"loss": 0.0451,
"loss/crossentropy": 1.7602136731147766,
"loss/hidden": 0.040283203125,
"loss/logits": 0.004797366913408041,
"step": 107
},
{
"epoch": 0.108,
"grad_norm": 0.8828125,
"grad_norm_var": 1.7130108992258708,
"learning_rate": 2e-05,
"loss": 0.0428,
"loss/crossentropy": 2.0745638012886047,
"loss/hidden": 0.0386962890625,
"loss/logits": 0.004113797098398209,
"step": 108
},
{
"epoch": 0.109,
"grad_norm": 0.82421875,
"grad_norm_var": 1.6829447428385416,
"learning_rate": 2e-05,
"loss": 0.0422,
"loss/crossentropy": 1.685157299041748,
"loss/hidden": 0.03857421875,
"loss/logits": 0.0036494951928034425,
"step": 109
},
{
"epoch": 0.11,
"grad_norm": 1.5703125,
"grad_norm_var": 1.6387715021769205,
"learning_rate": 2e-05,
"loss": 0.0376,
"loss/crossentropy": 2.625019073486328,
"loss/hidden": 0.03369140625,
"loss/logits": 0.0039150441298261285,
"step": 110
},
{
"epoch": 0.111,
"grad_norm": 1.5234375,
"grad_norm_var": 1.6204302469889322,
"learning_rate": 2e-05,
"loss": 0.0422,
"loss/crossentropy": 0.676440417766571,
"loss/hidden": 0.0401611328125,
"loss/logits": 0.0020512532209977508,
"step": 111
},
{
"epoch": 0.112,
"grad_norm": 0.65234375,
"grad_norm_var": 1.6101824442545574,
"learning_rate": 2e-05,
"loss": 0.0479,
"loss/crossentropy": 1.8928841352462769,
"loss/hidden": 0.0435791015625,
"loss/logits": 0.00434900657273829,
"step": 112
},
{
"epoch": 0.113,
"grad_norm": 1.09375,
"grad_norm_var": 1.5831150690714517,
"learning_rate": 2e-05,
"loss": 0.0498,
"loss/crossentropy": 1.2006176710128784,
"loss/hidden": 0.04638671875,
"loss/logits": 0.0034257903462275863,
"step": 113
},
{
"epoch": 0.114,
"grad_norm": 0.84375,
"grad_norm_var": 1.5551775614420573,
"learning_rate": 2e-05,
"loss": 0.0437,
"loss/crossentropy": 2.164067029953003,
"loss/hidden": 0.03955078125,
"loss/logits": 0.004164737183600664,
"step": 114
},
{
"epoch": 0.115,
"grad_norm": 0.875,
"grad_norm_var": 1.5581644694010417,
"learning_rate": 2e-05,
"loss": 0.0469,
"loss/crossentropy": 1.963140070438385,
"loss/hidden": 0.0419921875,
"loss/logits": 0.004867425188422203,
"step": 115
},
{
"epoch": 0.116,
"grad_norm": 0.83984375,
"grad_norm_var": 1.530010732014974,
"learning_rate": 2e-05,
"loss": 0.0469,
"loss/crossentropy": 1.936423420906067,
"loss/hidden": 0.04248046875,
"loss/logits": 0.004457900300621986,
"step": 116
},
{
"epoch": 0.117,
"grad_norm": 1.0,
"grad_norm_var": 1.4916320164998373,
"learning_rate": 2e-05,
"loss": 0.044,
"loss/crossentropy": 1.9027796387672424,
"loss/hidden": 0.0396728515625,
"loss/logits": 0.004306067014113069,
"step": 117
},
{
"epoch": 0.118,
"grad_norm": 0.921875,
"grad_norm_var": 0.724272092183431,
"learning_rate": 2e-05,
"loss": 0.048,
"loss/crossentropy": 1.4962169528007507,
"loss/hidden": 0.043212890625,
"loss/logits": 0.004831232130527496,
"step": 118
},
{
"epoch": 0.119,
"grad_norm": 1.3046875,
"grad_norm_var": 0.12087090810139973,
"learning_rate": 2e-05,
"loss": 0.0458,
"loss/crossentropy": 1.8558754324913025,
"loss/hidden": 0.04150390625,
"loss/logits": 0.004260358400642872,
"step": 119
},
{
"epoch": 0.12,
"grad_norm": 0.7421875,
"grad_norm_var": 0.12239583333333333,
"learning_rate": 2e-05,
"loss": 0.0467,
"loss/crossentropy": 2.163163900375366,
"loss/hidden": 0.042236328125,
"loss/logits": 0.0044949238654226065,
"step": 120
},
{
"epoch": 0.121,
"grad_norm": 0.66796875,
"grad_norm_var": 0.10601139068603516,
"learning_rate": 2e-05,
"loss": 0.0429,
"loss/crossentropy": 1.875292718410492,
"loss/hidden": 0.0389404296875,
"loss/logits": 0.003972187405452132,
"step": 121
},
{
"epoch": 0.122,
"grad_norm": 0.97265625,
"grad_norm_var": 0.1052103042602539,
"learning_rate": 2e-05,
"loss": 0.0504,
"loss/crossentropy": 1.581692636013031,
"loss/hidden": 0.0462646484375,
"loss/logits": 0.0040856958366930485,
"step": 122
},
{
"epoch": 0.123,
"grad_norm": 0.77734375,
"grad_norm_var": 0.07660497029622396,
"learning_rate": 2e-05,
"loss": 0.0467,
"loss/crossentropy": 2.185007333755493,
"loss/hidden": 0.0419921875,
"loss/logits": 0.0047312104143202305,
"step": 123
},
{
"epoch": 0.124,
"grad_norm": 0.70703125,
"grad_norm_var": 0.08053887685139974,
"learning_rate": 2e-05,
"loss": 0.0527,
"loss/crossentropy": 1.7746418118476868,
"loss/hidden": 0.0482177734375,
"loss/logits": 0.004488097038120031,
"step": 124
},
{
"epoch": 0.125,
"grad_norm": 0.82421875,
"grad_norm_var": 0.08053887685139974,
"learning_rate": 2e-05,
"loss": 0.0483,
"loss/crossentropy": 1.8139249682426453,
"loss/hidden": 0.044189453125,
"loss/logits": 0.00407675513997674,
"step": 125
},
{
"epoch": 0.126,
"grad_norm": 0.80078125,
"grad_norm_var": 0.05464986165364583,
"learning_rate": 2e-05,
"loss": 0.0536,
"loss/crossentropy": 1.8078742623329163,
"loss/hidden": 0.0489501953125,
"loss/logits": 0.004657944664359093,
"step": 126
},
{
"epoch": 0.127,
"grad_norm": 1.09375,
"grad_norm_var": 0.030997467041015626,
"learning_rate": 2e-05,
"loss": 0.0496,
"loss/crossentropy": 2.0267322659492493,
"loss/hidden": 0.0447998046875,
"loss/logits": 0.0047590641770511866,
"step": 127
},
{
"epoch": 0.128,
"grad_norm": 0.85546875,
"grad_norm_var": 0.027347564697265625,
"learning_rate": 2e-05,
"loss": 0.0587,
"loss/crossentropy": 1.6603793501853943,
"loss/hidden": 0.052978515625,
"loss/logits": 0.005712392507120967,
"step": 128
},
{
"epoch": 0.129,
"grad_norm": 5.375,
"grad_norm_var": 1.286358388264974,
"learning_rate": 2e-05,
"loss": 0.0577,
"loss/crossentropy": 0.8844976872205734,
"loss/hidden": 0.0550537109375,
"loss/logits": 0.0026012896560132504,
"step": 129
},
{
"epoch": 0.13,
"grad_norm": 0.94140625,
"grad_norm_var": 1.2828027725219726,
"learning_rate": 2e-05,
"loss": 0.0532,
"loss/crossentropy": 2.151723265647888,
"loss/hidden": 0.04833984375,
"loss/logits": 0.0048982377629727125,
"step": 130
},
{
"epoch": 0.131,
"grad_norm": 0.92578125,
"grad_norm_var": 1.280975341796875,
"learning_rate": 2e-05,
"loss": 0.048,
"loss/crossentropy": 2.190707802772522,
"loss/hidden": 0.0435791015625,
"loss/logits": 0.004458446754142642,
"step": 131
},
{
"epoch": 0.132,
"grad_norm": 0.73828125,
"grad_norm_var": 1.2861162821451824,
"learning_rate": 2e-05,
"loss": 0.0562,
"loss/crossentropy": 2.0854132175445557,
"loss/hidden": 0.0511474609375,
"loss/logits": 0.005020990269258618,
"step": 132
},
{
"epoch": 0.133,
"grad_norm": 0.6796875,
"grad_norm_var": 1.299598185221354,
"learning_rate": 2e-05,
"loss": 0.0509,
"loss/crossentropy": 2.0993438959121704,
"loss/hidden": 0.046142578125,
"loss/logits": 0.004787095822393894,
"step": 133
},
{
"epoch": 0.134,
"grad_norm": 0.96875,
"grad_norm_var": 1.2983378092447917,
"learning_rate": 2e-05,
"loss": 0.0491,
"loss/crossentropy": 2.2328933477401733,
"loss/hidden": 0.0445556640625,
"loss/logits": 0.004536583088338375,
"step": 134
},
{
"epoch": 0.135,
"grad_norm": 1.0625,
"grad_norm_var": 1.2969581604003906,
"learning_rate": 2e-05,
"loss": 0.0638,
"loss/crossentropy": 1.9981300234794617,
"loss/hidden": 0.0579833984375,
"loss/logits": 0.00582107319496572,
"step": 135
},
{
"epoch": 0.136,
"grad_norm": 0.6796875,
"grad_norm_var": 1.3004615783691407,
"learning_rate": 2e-05,
"loss": 0.0542,
"loss/crossentropy": 2.1993343830108643,
"loss/hidden": 0.049072265625,
"loss/logits": 0.005134769715368748,
"step": 136
},
{
"epoch": 0.137,
"grad_norm": 3.5,
"grad_norm_var": 1.627500343322754,
"learning_rate": 2e-05,
"loss": 0.0595,
"loss/crossentropy": 1.469780683517456,
"loss/hidden": 0.0552978515625,
"loss/logits": 0.0042177007999271154,
"step": 137
},
{
"epoch": 0.138,
"grad_norm": 0.87109375,
"grad_norm_var": 1.632664426167806,
"learning_rate": 2e-05,
"loss": 0.0554,
"loss/crossentropy": 1.8814529180526733,
"loss/hidden": 0.0506591796875,
"loss/logits": 0.004711252404376864,
"step": 138
},
{
"epoch": 0.139,
"grad_norm": 0.9140625,
"grad_norm_var": 1.62430419921875,
"learning_rate": 2e-05,
"loss": 0.0542,
"loss/crossentropy": 1.9769226908683777,
"loss/hidden": 0.049560546875,
"loss/logits": 0.004602615023031831,
"step": 139
},
{
"epoch": 0.14,
"grad_norm": 1.296875,
"grad_norm_var": 1.5987385431925456,
"learning_rate": 2e-05,
"loss": 0.0562,
"loss/crossentropy": 1.3646953105926514,
"loss/hidden": 0.0516357421875,
"loss/logits": 0.0045162534806877375,
"step": 140
},
{
"epoch": 0.141,
"grad_norm": 0.91796875,
"grad_norm_var": 1.592772356669108,
"learning_rate": 2e-05,
"loss": 0.0586,
"loss/crossentropy": 1.5901939272880554,
"loss/hidden": 0.0538330078125,
"loss/logits": 0.004788138438016176,
"step": 141
},
{
"epoch": 0.142,
"grad_norm": 1.109375,
"grad_norm_var": 1.5760719299316406,
"learning_rate": 2e-05,
"loss": 0.0686,
"loss/crossentropy": 1.8436982035636902,
"loss/hidden": 0.062744140625,
"loss/logits": 0.005897135473787785,
"step": 142
},
{
"epoch": 0.143,
"grad_norm": 1.0,
"grad_norm_var": 1.5800819396972656,
"learning_rate": 2e-05,
"loss": 0.0677,
"loss/crossentropy": 1.7922558188438416,
"loss/hidden": 0.06103515625,
"loss/logits": 0.006622593384236097,
"step": 143
},
{
"epoch": 0.144,
"grad_norm": 1.046875,
"grad_norm_var": 1.5693745295206705,
"learning_rate": 2e-05,
"loss": 0.0626,
"loss/crossentropy": 1.8654756546020508,
"loss/hidden": 0.05712890625,
"loss/logits": 0.005447414005175233,
"step": 144
},
{
"epoch": 0.145,
"grad_norm": 0.8046875,
"grad_norm_var": 0.43840071360270183,
"learning_rate": 2e-05,
"loss": 0.0653,
"loss/crossentropy": 2.023370146751404,
"loss/hidden": 0.0596923828125,
"loss/logits": 0.005567178362980485,
"step": 145
},
{
"epoch": 0.146,
"grad_norm": 1.7265625,
"grad_norm_var": 0.4612627665201823,
"learning_rate": 2e-05,
"loss": 0.0718,
"loss/crossentropy": 1.2652358412742615,
"loss/hidden": 0.066162109375,
"loss/logits": 0.00563872791826725,
"step": 146
},
{
"epoch": 0.147,
"grad_norm": 0.8359375,
"grad_norm_var": 0.4643350601196289,
"learning_rate": 2e-05,
"loss": 0.0579,
"loss/crossentropy": 2.181838572025299,
"loss/hidden": 0.0528564453125,
"loss/logits": 0.0050070807337760925,
"step": 147
},
{
"epoch": 0.148,
"grad_norm": 1.65625,
"grad_norm_var": 0.4685035705566406,
"learning_rate": 2e-05,
"loss": 0.0653,
"loss/crossentropy": 1.6760476231575012,
"loss/hidden": 0.059814453125,
"loss/logits": 0.005448109935969114,
"step": 148
},
{
"epoch": 0.149,
"grad_norm": 0.875,
"grad_norm_var": 0.45754903157552085,
"learning_rate": 2e-05,
"loss": 0.0608,
"loss/crossentropy": 1.9610846042633057,
"loss/hidden": 0.05517578125,
"loss/logits": 0.0055898819118738174,
"step": 149
},
{
"epoch": 0.15,
"grad_norm": 1.2890625,
"grad_norm_var": 0.45391006469726564,
"learning_rate": 2e-05,
"loss": 0.0607,
"loss/crossentropy": 2.0354663729667664,
"loss/hidden": 0.054931640625,
"loss/logits": 0.005750466603785753,
"step": 150
},
{
"epoch": 0.151,
"grad_norm": 0.91015625,
"grad_norm_var": 0.4586435317993164,
"learning_rate": 2e-05,
"loss": 0.061,
"loss/crossentropy": 1.5509551763534546,
"loss/hidden": 0.05615234375,
"loss/logits": 0.00486933346837759,
"step": 151
},
{
"epoch": 0.152,
"grad_norm": 1.8203125,
"grad_norm_var": 0.45860640207926434,
"learning_rate": 2e-05,
"loss": 0.0823,
"loss/crossentropy": 1.3190861344337463,
"loss/hidden": 0.076171875,
"loss/logits": 0.006146557629108429,
"step": 152
},
{
"epoch": 0.153,
"grad_norm": 1.65625,
"grad_norm_var": 0.12676741282145182,
"learning_rate": 2e-05,
"loss": 0.0689,
"loss/crossentropy": 2.0075970888137817,
"loss/hidden": 0.0626220703125,
"loss/logits": 0.0062951259315013885,
"step": 153
},
{
"epoch": 0.154,
"grad_norm": 0.84375,
"grad_norm_var": 0.12790629069010417,
"learning_rate": 2e-05,
"loss": 0.0645,
"loss/crossentropy": 2.5025904178619385,
"loss/hidden": 0.0584716796875,
"loss/logits": 0.005998906912282109,
"step": 154
},
{
"epoch": 0.155,
"grad_norm": 1.75,
"grad_norm_var": 0.14317194620768228,
"learning_rate": 2e-05,
"loss": 0.0673,
"loss/crossentropy": 1.7674061059951782,
"loss/hidden": 0.0618896484375,
"loss/logits": 0.005377188790589571,
"step": 155
},
{
"epoch": 0.156,
"grad_norm": 1.046875,
"grad_norm_var": 0.14455540974934897,
"learning_rate": 2e-05,
"loss": 0.0696,
"loss/crossentropy": 1.4891575574874878,
"loss/hidden": 0.0640869140625,
"loss/logits": 0.005491052754223347,
"step": 156
},
{
"epoch": 0.157,
"grad_norm": 1.0078125,
"grad_norm_var": 0.1416147232055664,
"learning_rate": 2e-05,
"loss": 0.0656,
"loss/crossentropy": 1.4295508861541748,
"loss/hidden": 0.060546875,
"loss/logits": 0.005026416387408972,
"step": 157
},
{
"epoch": 0.158,
"grad_norm": 8.5,
"grad_norm_var": 3.4551263809204102,
"learning_rate": 2e-05,
"loss": 0.1047,
"loss/crossentropy": 1.6207728683948517,
"loss/hidden": 0.09716796875,
"loss/logits": 0.007503823610022664,
"step": 158
},
{
"epoch": 0.159,
"grad_norm": 1.3125,
"grad_norm_var": 3.4331842422485352,
"learning_rate": 2e-05,
"loss": 0.0663,
"loss/crossentropy": 1.838720440864563,
"loss/hidden": 0.06103515625,
"loss/logits": 0.0052408319897949696,
"step": 159
},
{
"epoch": 0.16,
"grad_norm": 1.765625,
"grad_norm_var": 3.403587277730306,
"learning_rate": 2e-05,
"loss": 0.0729,
"loss/crossentropy": 1.9572261571884155,
"loss/hidden": 0.06640625,
"loss/logits": 0.00649917172268033,
"step": 160
},
{
"epoch": 0.161,
"grad_norm": 7.71875,
"grad_norm_var": 5.5313720067342125,
"learning_rate": 2e-05,
"loss": 0.0873,
"loss/crossentropy": 0.06751747522503138,
"loss/hidden": 0.086181640625,
"loss/logits": 0.001096382096875459,
"step": 161
},
{
"epoch": 0.162,
"grad_norm": 1.65625,
"grad_norm_var": 5.535835202534994,
"learning_rate": 2e-05,
"loss": 0.0753,
"loss/crossentropy": 1.9767259359359741,
"loss/hidden": 0.06884765625,
"loss/logits": 0.006433435715734959,
"step": 162
},
{
"epoch": 0.163,
"grad_norm": 1.2734375,
"grad_norm_var": 5.470252927144369,
"learning_rate": 2e-05,
"loss": 0.0742,
"loss/crossentropy": 1.6337787508964539,
"loss/hidden": 0.068359375,
"loss/logits": 0.0058679585345089436,
"step": 163
},
{
"epoch": 0.164,
"grad_norm": 1.171875,
"grad_norm_var": 5.519557634989421,
"learning_rate": 2e-05,
"loss": 0.0791,
"loss/crossentropy": 1.5085630416870117,
"loss/hidden": 0.0732421875,
"loss/logits": 0.00587455416098237,
"step": 164
},
{
"epoch": 0.165,
"grad_norm": 1.328125,
"grad_norm_var": 5.454612668355306,
"learning_rate": 2e-05,
"loss": 0.0733,
"loss/crossentropy": 2.1295101046562195,
"loss/hidden": 0.0665283203125,
"loss/logits": 0.006821601651608944,
"step": 165
},
{
"epoch": 0.166,
"grad_norm": 0.828125,
"grad_norm_var": 5.523303159077963,
"learning_rate": 2e-05,
"loss": 0.0681,
"loss/crossentropy": 2.1514192819595337,
"loss/hidden": 0.061767578125,
"loss/logits": 0.0063285790383815765,
"step": 166
},
{
"epoch": 0.167,
"grad_norm": 0.9140625,
"grad_norm_var": 5.522652180989583,
"learning_rate": 2e-05,
"loss": 0.0799,
"loss/crossentropy": 1.907168447971344,
"loss/hidden": 0.072509765625,
"loss/logits": 0.0073654367588460445,
"step": 167
},
{
"epoch": 0.168,
"grad_norm": 0.70703125,
"grad_norm_var": 5.650849850972493,
"learning_rate": 2e-05,
"loss": 0.0665,
"loss/crossentropy": 2.490573525428772,
"loss/hidden": 0.0604248046875,
"loss/logits": 0.006123463856056333,
"step": 168
},
{
"epoch": 0.169,
"grad_norm": 0.921875,
"grad_norm_var": 5.727275530497233,
"learning_rate": 2e-05,
"loss": 0.0686,
"loss/crossentropy": 2.1971182823181152,
"loss/hidden": 0.0625,
"loss/logits": 0.006081034895032644,
"step": 169
},
{
"epoch": 0.17,
"grad_norm": 0.84375,
"grad_norm_var": 5.727275530497233,
"learning_rate": 2e-05,
"loss": 0.0723,
"loss/crossentropy": 1.9449633955955505,
"loss/hidden": 0.06591796875,
"loss/logits": 0.00633727153763175,
"step": 170
},
{
"epoch": 0.171,
"grad_norm": 0.80078125,
"grad_norm_var": 5.8211313883463545,
"learning_rate": 2e-05,
"loss": 0.0721,
"loss/crossentropy": 1.8933625221252441,
"loss/hidden": 0.066162109375,
"loss/logits": 0.005927694728597999,
"step": 171
},
{
"epoch": 0.172,
"grad_norm": 0.734375,
"grad_norm_var": 5.8664194742838545,
"learning_rate": 2e-05,
"loss": 0.0756,
"loss/crossentropy": 2.2961581349372864,
"loss/hidden": 0.069091796875,
"loss/logits": 0.00650426116771996,
"step": 172
},
{
"epoch": 0.173,
"grad_norm": 1.0859375,
"grad_norm_var": 5.856801350911458,
"learning_rate": 2e-05,
"loss": 0.0876,
"loss/crossentropy": 1.5580723285675049,
"loss/hidden": 0.080322265625,
"loss/logits": 0.00728521216660738,
"step": 173
},
{
"epoch": 0.174,
"grad_norm": 0.87109375,
"grad_norm_var": 2.8547820409138995,
"learning_rate": 2e-05,
"loss": 0.0785,
"loss/crossentropy": 2.4996918439865112,
"loss/hidden": 0.07080078125,
"loss/logits": 0.0076872315257787704,
"step": 174
},
{
"epoch": 0.175,
"grad_norm": 1.09375,
"grad_norm_var": 2.863120460510254,
"learning_rate": 2e-05,
"loss": 0.0842,
"loss/crossentropy": 2.341306686401367,
"loss/hidden": 0.075927734375,
"loss/logits": 0.008260179311037064,
"step": 175
},
{
"epoch": 0.176,
"grad_norm": 1.2734375,
"grad_norm_var": 2.859659767150879,
"learning_rate": 2e-05,
"loss": 0.0839,
"loss/crossentropy": 2.0976521968841553,
"loss/hidden": 0.075927734375,
"loss/logits": 0.007956868037581444,
"step": 176
},
{
"epoch": 0.177,
"grad_norm": 1.6640625,
"grad_norm_var": 0.09129581451416016,
"learning_rate": 2e-05,
"loss": 0.0854,
"loss/crossentropy": 1.5655289888381958,
"loss/hidden": 0.078857421875,
"loss/logits": 0.006505638128146529,
"step": 177
},
{
"epoch": 0.178,
"grad_norm": 0.96484375,
"grad_norm_var": 0.06740493774414062,
"learning_rate": 2e-05,
"loss": 0.0832,
"loss/crossentropy": 1.947506844997406,
"loss/hidden": 0.076171875,
"loss/logits": 0.0070168147794902325,
"step": 178
},
{
"epoch": 0.179,
"grad_norm": 4.5625,
"grad_norm_var": 0.8503774007161459,
"learning_rate": 2e-05,
"loss": 0.0965,
"loss/crossentropy": 1.557403326034546,
"loss/hidden": 0.087158203125,
"loss/logits": 0.009354921989142895,
"step": 179
},
{
"epoch": 0.18,
"grad_norm": 8.3125,
"grad_norm_var": 3.9767252604166665,
"learning_rate": 2e-05,
"loss": 0.1122,
"loss/crossentropy": 0.45333431661129,
"loss/hidden": 0.109375,
"loss/logits": 0.0027967533096671104,
"step": 180
},
{
"epoch": 0.181,
"grad_norm": 1.546875,
"grad_norm_var": 3.969405110677083,
"learning_rate": 2e-05,
"loss": 0.0829,
"loss/crossentropy": 2.005882978439331,
"loss/hidden": 0.075439453125,
"loss/logits": 0.007453362224623561,
"step": 181
},
{
"epoch": 0.182,
"grad_norm": 1.3515625,
"grad_norm_var": 3.926006825764974,
"learning_rate": 2e-05,
"loss": 0.0849,
"loss/crossentropy": 2.199571132659912,
"loss/hidden": 0.077880859375,
"loss/logits": 0.0069826748222112656,
"step": 182
},
{
"epoch": 0.183,
"grad_norm": 1.5703125,
"grad_norm_var": 3.8817014058430988,
"learning_rate": 2e-05,
"loss": 0.0921,
"loss/crossentropy": 1.6926537156105042,
"loss/hidden": 0.085205078125,
"loss/logits": 0.006879956694319844,
"step": 183
},
{
"epoch": 0.184,
"grad_norm": 1.203125,
"grad_norm_var": 3.826835568745931,
"learning_rate": 2e-05,
"loss": 0.0964,
"loss/crossentropy": 1.509221613407135,
"loss/hidden": 0.087890625,
"loss/logits": 0.00847849901765585,
"step": 184
},
{
"epoch": 0.185,
"grad_norm": 0.703125,
"grad_norm_var": 3.8554396947224934,
"learning_rate": 2e-05,
"loss": 0.0788,
"loss/crossentropy": 2.4337867498397827,
"loss/hidden": 0.072021484375,
"loss/logits": 0.0067423065192997456,
"step": 185
},
{
"epoch": 0.186,
"grad_norm": 1.234375,
"grad_norm_var": 3.815881284077962,
"learning_rate": 2e-05,
"loss": 0.0966,
"loss/crossentropy": 1.7458332180976868,
"loss/hidden": 0.08837890625,
"loss/logits": 0.008262162329629064,
"step": 186
},
{
"epoch": 0.187,
"grad_norm": 6.59375,
"grad_norm_var": 5.133159383138021,
"learning_rate": 2e-05,
"loss": 0.0928,
"loss/crossentropy": 2.116236627101898,
"loss/hidden": 0.0830078125,
"loss/logits": 0.00975541677325964,
"step": 187
},
{
"epoch": 0.188,
"grad_norm": 1.8203125,
"grad_norm_var": 4.998583730061849,
"learning_rate": 2e-05,
"loss": 0.0831,
"loss/crossentropy": 2.324514389038086,
"loss/hidden": 0.075439453125,
"loss/logits": 0.007644579978659749,
"step": 188
},
{
"epoch": 0.189,
"grad_norm": 0.796875,
"grad_norm_var": 5.048313395182292,
"learning_rate": 2e-05,
"loss": 0.0867,
"loss/crossentropy": 1.9479625821113586,
"loss/hidden": 0.0791015625,
"loss/logits": 0.0075566458981484175,
"step": 189
},
{
"epoch": 0.19,
"grad_norm": 15.875,
"grad_norm_var": 16.414309628804524,
"learning_rate": 2e-05,
"loss": 0.1592,
"loss/crossentropy": 1.5863521695137024,
"loss/hidden": 0.1494140625,
"loss/logits": 0.009787225630134344,
"step": 190
},
{
"epoch": 0.191,
"grad_norm": 2.046875,
"grad_norm_var": 16.208450762430825,
"learning_rate": 2e-05,
"loss": 0.0784,
"loss/crossentropy": 0.8779918029904366,
"loss/hidden": 0.073974609375,
"loss/logits": 0.004391094436869025,
"step": 191
},
{
"epoch": 0.192,
"grad_norm": 1.375,
"grad_norm_var": 16.1827361424764,
"learning_rate": 2e-05,
"loss": 0.0931,
"loss/crossentropy": 2.1567060947418213,
"loss/hidden": 0.085693359375,
"loss/logits": 0.007449513301253319,
"step": 192
},
{
"epoch": 0.193,
"grad_norm": 0.875,
"grad_norm_var": 16.386012204488118,
"learning_rate": 2e-05,
"loss": 0.0898,
"loss/crossentropy": 1.8178179860115051,
"loss/hidden": 0.08251953125,
"loss/logits": 0.007294924231246114,
"step": 193
},
{
"epoch": 0.194,
"grad_norm": 2.21875,
"grad_norm_var": 16.114434560139973,
"learning_rate": 2e-05,
"loss": 0.1014,
"loss/crossentropy": 1.8806178569793701,
"loss/hidden": 0.09375,
"loss/logits": 0.0076924534514546394,
"step": 194
},
{
"epoch": 0.195,
"grad_norm": 1.8671875,
"grad_norm_var": 16.098729451497395,
"learning_rate": 2e-05,
"loss": 0.1048,
"loss/crossentropy": 1.6054936051368713,
"loss/hidden": 0.096435546875,
"loss/logits": 0.008354771416634321,
"step": 195
},
{
"epoch": 0.196,
"grad_norm": 1.90625,
"grad_norm_var": 14.200210571289062,
"learning_rate": 2e-05,
"loss": 0.0851,
"loss/crossentropy": 1.1937458366155624,
"loss/hidden": 0.079833984375,
"loss/logits": 0.005313969450071454,
"step": 196
},
{
"epoch": 0.197,
"grad_norm": 2.453125,
"grad_norm_var": 14.113833618164062,
"learning_rate": 2e-05,
"loss": 0.1056,
"loss/crossentropy": 1.9973903894424438,
"loss/hidden": 0.09619140625,
"loss/logits": 0.00938287889584899,
"step": 197
},
{
"epoch": 0.198,
"grad_norm": 1.5546875,
"grad_norm_var": 14.07872314453125,
"learning_rate": 2e-05,
"loss": 0.087,
"loss/crossentropy": 2.0422087907791138,
"loss/hidden": 0.07958984375,
"loss/logits": 0.007449948927387595,
"step": 198
},
{
"epoch": 0.199,
"grad_norm": 0.875,
"grad_norm_var": 14.218849436442058,
"learning_rate": 2e-05,
"loss": 0.0908,
"loss/crossentropy": 2.040232002735138,
"loss/hidden": 0.08349609375,
"loss/logits": 0.007334771566092968,
"step": 199
},
{
"epoch": 0.2,
"grad_norm": 3.6875,
"grad_norm_var": 14.104658762613932,
"learning_rate": 2e-05,
"loss": 0.0996,
"loss/crossentropy": 1.7977141737937927,
"loss/hidden": 0.09130859375,
"loss/logits": 0.008285259362310171,
"step": 200
},
{
"epoch": 0.201,
"grad_norm": 1.1640625,
"grad_norm_var": 13.984908040364584,
"learning_rate": 2e-05,
"loss": 0.0923,
"loss/crossentropy": 1.960830569267273,
"loss/hidden": 0.0849609375,
"loss/logits": 0.007373227505013347,
"step": 201
},
{
"epoch": 0.202,
"grad_norm": 1.2109375,
"grad_norm_var": 13.99013646443685,
"learning_rate": 2e-05,
"loss": 0.1063,
"loss/crossentropy": 1.5903997421264648,
"loss/hidden": 0.098876953125,
"loss/logits": 0.007376475026831031,
"step": 202
},
{
"epoch": 0.203,
"grad_norm": 2.015625,
"grad_norm_var": 13.0423215230306,
"learning_rate": 2e-05,
"loss": 0.0958,
"loss/crossentropy": 1.1866007596254349,
"loss/hidden": 0.0908203125,
"loss/logits": 0.0049855056568048894,
"step": 203
},
{
"epoch": 0.204,
"grad_norm": 2.203125,
"grad_norm_var": 13.01123046875,
"learning_rate": 2e-05,
"loss": 0.1001,
"loss/crossentropy": 2.016387164592743,
"loss/hidden": 0.092529296875,
"loss/logits": 0.0076178074814379215,
"step": 204
},
{
"epoch": 0.205,
"grad_norm": 0.98828125,
"grad_norm_var": 12.966665585835775,
"learning_rate": 2e-05,
"loss": 0.1017,
"loss/crossentropy": 1.9937080144882202,
"loss/hidden": 0.09326171875,
"loss/logits": 0.008388462010771036,
"step": 205
},
{
"epoch": 0.206,
"grad_norm": 1.65625,
"grad_norm_var": 0.5201679865519205,
"learning_rate": 2e-05,
"loss": 0.1012,
"loss/crossentropy": 1.8353246450424194,
"loss/hidden": 0.09326171875,
"loss/logits": 0.00795629364438355,
"step": 206
},
{
"epoch": 0.207,
"grad_norm": 1.6875,
"grad_norm_var": 0.5143070856730143,
"learning_rate": 2e-05,
"loss": 0.0918,
"loss/crossentropy": 1.0499791204929352,
"loss/hidden": 0.08740234375,
"loss/logits": 0.004438678151927888,
"step": 207
},
{
"epoch": 0.208,
"grad_norm": 1.0625,
"grad_norm_var": 0.5353540420532227,
"learning_rate": 2e-05,
"loss": 0.107,
"loss/crossentropy": 1.8614663481712341,
"loss/hidden": 0.09814453125,
"loss/logits": 0.008855776861310005,
"step": 208
},
{
"epoch": 0.209,
"grad_norm": 2.390625,
"grad_norm_var": 0.5093535741170248,
"learning_rate": 2e-05,
"loss": 0.1072,
"loss/crossentropy": 2.363565683364868,
"loss/hidden": 0.096923828125,
"loss/logits": 0.010271006729453802,
"step": 209
},
{
"epoch": 0.21,
"grad_norm": 2.171875,
"grad_norm_var": 0.5069289525349935,
"learning_rate": 2e-05,
"loss": 0.1086,
"loss/crossentropy": 1.955030083656311,
"loss/hidden": 0.099365234375,
"loss/logits": 0.0092296302318573,
"step": 210
},
{
"epoch": 0.211,
"grad_norm": 1.2265625,
"grad_norm_var": 0.5273447036743164,
"learning_rate": 2e-05,
"loss": 0.1062,
"loss/crossentropy": 1.774095892906189,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007574398070573807,
"step": 211
},
{
"epoch": 0.212,
"grad_norm": 1.2890625,
"grad_norm_var": 0.5396000544230143,
"learning_rate": 2e-05,
"loss": 0.1117,
"loss/crossentropy": 1.8405153155326843,
"loss/hidden": 0.10302734375,
"loss/logits": 0.008719130419194698,
"step": 212
},
{
"epoch": 0.213,
"grad_norm": 1.40625,
"grad_norm_var": 0.5067829767862956,
"learning_rate": 2e-05,
"loss": 0.1045,
"loss/crossentropy": 2.0069875717163086,
"loss/hidden": 0.095947265625,
"loss/logits": 0.008583055343478918,
"step": 213
},
{
"epoch": 0.214,
"grad_norm": 1.1640625,
"grad_norm_var": 0.5219018936157227,
"learning_rate": 2e-05,
"loss": 0.1103,
"loss/crossentropy": 1.670526921749115,
"loss/hidden": 0.102294921875,
"loss/logits": 0.008038338739424944,
"step": 214
},
{
"epoch": 0.215,
"grad_norm": 1.8828125,
"grad_norm_var": 0.48292789459228513,
"learning_rate": 2e-05,
"loss": 0.1121,
"loss/crossentropy": 1.795514464378357,
"loss/hidden": 0.103759765625,
"loss/logits": 0.008318986743688583,
"step": 215
},
{
"epoch": 0.216,
"grad_norm": 1.1328125,
"grad_norm_var": 0.2139871597290039,
"learning_rate": 2e-05,
"loss": 0.1066,
"loss/crossentropy": 2.180332064628601,
"loss/hidden": 0.09716796875,
"loss/logits": 0.009391986764967442,
"step": 216
},
{
"epoch": 0.217,
"grad_norm": 1.9375,
"grad_norm_var": 0.21252689361572266,
"learning_rate": 2e-05,
"loss": 0.1234,
"loss/crossentropy": 1.8504464030265808,
"loss/hidden": 0.11181640625,
"loss/logits": 0.011583337560296059,
"step": 217
},
{
"epoch": 0.218,
"grad_norm": 1.046875,
"grad_norm_var": 0.22248172760009766,
"learning_rate": 2e-05,
"loss": 0.1098,
"loss/crossentropy": 1.6542016863822937,
"loss/hidden": 0.101806640625,
"loss/logits": 0.007953221211209893,
"step": 218
},
{
"epoch": 0.219,
"grad_norm": 1.1484375,
"grad_norm_var": 0.21898136138916016,
"learning_rate": 2e-05,
"loss": 0.1185,
"loss/crossentropy": 1.8401342630386353,
"loss/hidden": 0.107421875,
"loss/logits": 0.011056106071919203,
"step": 219
},
{
"epoch": 0.22,
"grad_norm": 1.2578125,
"grad_norm_var": 0.18931725819905598,
"learning_rate": 2e-05,
"loss": 0.1082,
"loss/crossentropy": 1.8265935778617859,
"loss/hidden": 0.09912109375,
"loss/logits": 0.009068313986063004,
"step": 220
},
{
"epoch": 0.221,
"grad_norm": 52.25,
"grad_norm_var": 161.16229426066081,
"learning_rate": 2e-05,
"loss": 0.1937,
"loss/crossentropy": 1.5437742471694946,
"loss/hidden": 0.170654296875,
"loss/logits": 0.023064299020916224,
"step": 221
},
{
"epoch": 0.222,
"grad_norm": 2.28125,
"grad_norm_var": 160.93560969034831,
"learning_rate": 2e-05,
"loss": 0.1246,
"loss/crossentropy": 1.227450430393219,
"loss/hidden": 0.11572265625,
"loss/logits": 0.008849140722304583,
"step": 222
},
{
"epoch": 0.223,
"grad_norm": 1.28125,
"grad_norm_var": 161.10956192016602,
"learning_rate": 2e-05,
"loss": 0.1196,
"loss/crossentropy": 1.9892451167106628,
"loss/hidden": 0.1103515625,
"loss/logits": 0.009212612174451351,
"step": 223
},
{
"epoch": 0.224,
"grad_norm": 1.0625,
"grad_norm_var": 161.10956192016602,
"learning_rate": 2e-05,
"loss": 0.1208,
"loss/crossentropy": 1.9727575778961182,
"loss/hidden": 0.111328125,
"loss/logits": 0.009519532322883606,
"step": 224
},
{
"epoch": 0.225,
"grad_norm": 1.9140625,
"grad_norm_var": 161.26942443847656,
"learning_rate": 2e-05,
"loss": 0.1112,
"loss/crossentropy": 2.20854651927948,
"loss/hidden": 0.1025390625,
"loss/logits": 0.008704130537807941,
"step": 225
},
{
"epoch": 0.226,
"grad_norm": 1.703125,
"grad_norm_var": 161.43824768066406,
"learning_rate": 2e-05,
"loss": 0.1249,
"loss/crossentropy": 1.8244708180427551,
"loss/hidden": 0.115478515625,
"loss/logits": 0.009438233450055122,
"step": 226
},
{
"epoch": 0.227,
"grad_norm": 1.9921875,
"grad_norm_var": 161.12805989583333,
"learning_rate": 2e-05,
"loss": 0.1264,
"loss/crossentropy": 1.6184683442115784,
"loss/hidden": 0.117431640625,
"loss/logits": 0.008998575620353222,
"step": 227
},
{
"epoch": 0.228,
"grad_norm": 1.40625,
"grad_norm_var": 161.0760617574056,
"learning_rate": 2e-05,
"loss": 0.1427,
"loss/crossentropy": 1.9090940952301025,
"loss/hidden": 0.1298828125,
"loss/logits": 0.01286676386371255,
"step": 228
},
{
"epoch": 0.229,
"grad_norm": 1.5078125,
"grad_norm_var": 161.03238525390626,
"learning_rate": 2e-05,
"loss": 0.1191,
"loss/crossentropy": 1.7622392773628235,
"loss/hidden": 0.109619140625,
"loss/logits": 0.009484861977398396,
"step": 229
},
{
"epoch": 0.23,
"grad_norm": 1.3671875,
"grad_norm_var": 160.93959045410156,
"learning_rate": 2e-05,
"loss": 0.1185,
"loss/crossentropy": 1.7633178234100342,
"loss/hidden": 0.109130859375,
"loss/logits": 0.009330280125141144,
"step": 230
},
{
"epoch": 0.231,
"grad_norm": 0.98828125,
"grad_norm_var": 161.32540073394776,
"learning_rate": 2e-05,
"loss": 0.1188,
"loss/crossentropy": 2.186140298843384,
"loss/hidden": 0.108154296875,
"loss/logits": 0.010631876531988382,
"step": 231
},
{
"epoch": 0.232,
"grad_norm": 3.28125,
"grad_norm_var": 160.60855553944904,
"learning_rate": 2e-05,
"loss": 0.1224,
"loss/crossentropy": 0.8389374911785126,
"loss/hidden": 0.1171875,
"loss/logits": 0.005214276316110045,
"step": 232
},
{
"epoch": 0.233,
"grad_norm": 1.0703125,
"grad_norm_var": 160.98382867177327,
"learning_rate": 2e-05,
"loss": 0.116,
"loss/crossentropy": 2.1515474915504456,
"loss/hidden": 0.107421875,
"loss/logits": 0.00860951654613018,
"step": 233
},
{
"epoch": 0.234,
"grad_norm": 4.5,
"grad_norm_var": 160.03680464426677,
"learning_rate": 2e-05,
"loss": 0.1312,
"loss/crossentropy": 1.6820667684078217,
"loss/hidden": 0.123046875,
"loss/logits": 0.008124232292175293,
"step": 234
},
{
"epoch": 0.235,
"grad_norm": 2.40625,
"grad_norm_var": 159.50010522206625,
"learning_rate": 2e-05,
"loss": 0.1056,
"loss/crossentropy": 0.9079534839838743,
"loss/hidden": 0.10107421875,
"loss/logits": 0.004542189242783934,
"step": 235
},
{
"epoch": 0.236,
"grad_norm": 0.984375,
"grad_norm_var": 159.64182631174722,
"learning_rate": 2e-05,
"loss": 0.1192,
"loss/crossentropy": 2.261181592941284,
"loss/hidden": 0.109619140625,
"loss/logits": 0.009581252932548523,
"step": 236
},
{
"epoch": 0.237,
"grad_norm": 0.9921875,
"grad_norm_var": 0.9261479059855143,
"learning_rate": 2e-05,
"loss": 0.1281,
"loss/crossentropy": 1.9553669095039368,
"loss/hidden": 0.116943359375,
"loss/logits": 0.011152476072311401,
"step": 237
},
{
"epoch": 0.238,
"grad_norm": 1.640625,
"grad_norm_var": 0.9103616714477539,
"learning_rate": 2e-05,
"loss": 0.1466,
"loss/crossentropy": 1.6360890865325928,
"loss/hidden": 0.13525390625,
"loss/logits": 0.011308418586850166,
"step": 238
},
{
"epoch": 0.239,
"grad_norm": 2.265625,
"grad_norm_var": 0.9085992813110352,
"learning_rate": 2e-05,
"loss": 0.133,
"loss/crossentropy": 1.0788212679326534,
"loss/hidden": 0.125732421875,
"loss/logits": 0.007256039883941412,
"step": 239
},
{
"epoch": 0.24,
"grad_norm": 1.7578125,
"grad_norm_var": 0.8688089370727539,
"learning_rate": 2e-05,
"loss": 0.1296,
"loss/crossentropy": 1.6809419393539429,
"loss/hidden": 0.119873046875,
"loss/logits": 0.009761545807123184,
"step": 240
},
{
"epoch": 0.241,
"grad_norm": 1.4921875,
"grad_norm_var": 0.8769525527954102,
"learning_rate": 2e-05,
"loss": 0.1298,
"loss/crossentropy": 2.1073160767555237,
"loss/hidden": 0.1201171875,
"loss/logits": 0.009713000617921352,
"step": 241
},
{
"epoch": 0.242,
"grad_norm": 3.3125,
"grad_norm_var": 1.0105956395467122,
"learning_rate": 2e-05,
"loss": 0.1851,
"loss/crossentropy": 1.7140259146690369,
"loss/hidden": 0.168212890625,
"loss/logits": 0.01692299358546734,
"step": 242
},
{
"epoch": 0.243,
"grad_norm": 1.3203125,
"grad_norm_var": 1.0337132136027019,
"learning_rate": 2e-05,
"loss": 0.141,
"loss/crossentropy": 1.70401269197464,
"loss/hidden": 0.13037109375,
"loss/logits": 0.010653213132172823,
"step": 243
},
{
"epoch": 0.244,
"grad_norm": 2.015625,
"grad_norm_var": 1.0173481623331706,
"learning_rate": 2e-05,
"loss": 0.1561,
"loss/crossentropy": 1.9086145758628845,
"loss/hidden": 0.1416015625,
"loss/logits": 0.01448416942730546,
"step": 244
},
{
"epoch": 0.245,
"grad_norm": 1.890625,
"grad_norm_var": 1.0048868179321289,
"learning_rate": 2e-05,
"loss": 0.1751,
"loss/crossentropy": 1.5015806555747986,
"loss/hidden": 0.16064453125,
"loss/logits": 0.014442750252783298,
"step": 245
},
{
"epoch": 0.246,
"grad_norm": 1.6796875,
"grad_norm_var": 0.9864847183227539,
"learning_rate": 2e-05,
"loss": 0.1323,
"loss/crossentropy": 1.9546470642089844,
"loss/hidden": 0.12255859375,
"loss/logits": 0.009766705334186554,
"step": 246
},
{
"epoch": 0.247,
"grad_norm": 1.203125,
"grad_norm_var": 0.9611083984375,
"learning_rate": 2e-05,
"loss": 0.1539,
"loss/crossentropy": 1.7062721848487854,
"loss/hidden": 0.1416015625,
"loss/logits": 0.01230617519468069,
"step": 247
},
{
"epoch": 0.248,
"grad_norm": 4.21875,
"grad_norm_var": 1.1776611328125,
"learning_rate": 2e-05,
"loss": 0.1515,
"loss/crossentropy": 1.740279734134674,
"loss/hidden": 0.14013671875,
"loss/logits": 0.011402689386159182,
"step": 248
},
{
"epoch": 0.249,
"grad_norm": 2.3125,
"grad_norm_var": 1.1123573303222656,
"learning_rate": 2e-05,
"loss": 0.1504,
"loss/crossentropy": 1.640882670879364,
"loss/hidden": 0.1396484375,
"loss/logits": 0.01071554934605956,
"step": 249
},
{
"epoch": 0.25,
"grad_norm": 2.796875,
"grad_norm_var": 0.7542132059733073,
"learning_rate": 2e-05,
"loss": 0.1364,
"loss/crossentropy": 1.4670004844665527,
"loss/hidden": 0.126708984375,
"loss/logits": 0.0096431621350348,
"step": 250
},
{
"epoch": 0.251,
"grad_norm": 1.1796875,
"grad_norm_var": 0.7847574869791667,
"learning_rate": 2e-05,
"loss": 0.14,
"loss/crossentropy": 2.2024736404418945,
"loss/hidden": 0.127197265625,
"loss/logits": 0.012759591452777386,
"step": 251
},
{
"epoch": 0.252,
"grad_norm": 3.53125,
"grad_norm_var": 0.8651763916015625,
"learning_rate": 2e-05,
"loss": 0.1539,
"loss/crossentropy": 2.0269722938537598,
"loss/hidden": 0.14208984375,
"loss/logits": 0.011817097198218107,
"step": 252
},
{
"epoch": 0.253,
"grad_norm": 9.375,
"grad_norm_var": 4.018281809488932,
"learning_rate": 2e-05,
"loss": 0.1661,
"loss/crossentropy": 0.34899202920496464,
"loss/hidden": 0.163818359375,
"loss/logits": 0.0022718849941156805,
"step": 253
},
{
"epoch": 0.254,
"grad_norm": 1.9921875,
"grad_norm_var": 3.9798868815104167,
"learning_rate": 2e-05,
"loss": 0.1441,
"loss/crossentropy": 2.2475985288619995,
"loss/hidden": 0.1318359375,
"loss/logits": 0.012224531266838312,
"step": 254
},
{
"epoch": 0.255,
"grad_norm": 1.6328125,
"grad_norm_var": 4.037050120035807,
"learning_rate": 2e-05,
"loss": 0.1497,
"loss/crossentropy": 2.8270416259765625,
"loss/hidden": 0.13623046875,
"loss/logits": 0.013480226043611765,
"step": 255
},
{
"epoch": 0.256,
"grad_norm": 1.4609375,
"grad_norm_var": 4.07616958618164,
"learning_rate": 2e-05,
"loss": 0.1668,
"loss/crossentropy": 1.3126854300498962,
"loss/hidden": 0.15576171875,
"loss/logits": 0.01107651786878705,
"step": 256
},
{
"epoch": 0.257,
"grad_norm": 1.9140625,
"grad_norm_var": 4.02563247680664,
"learning_rate": 2e-05,
"loss": 0.1502,
"loss/crossentropy": 1.4198355078697205,
"loss/hidden": 0.1396484375,
"loss/logits": 0.01056258101016283,
"step": 257
},
{
"epoch": 0.258,
"grad_norm": 1.3671875,
"grad_norm_var": 4.081167602539063,
"learning_rate": 2e-05,
"loss": 0.1421,
"loss/crossentropy": 1.657827377319336,
"loss/hidden": 0.13232421875,
"loss/logits": 0.009755304548889399,
"step": 258
},
{
"epoch": 0.259,
"grad_norm": 1.75,
"grad_norm_var": 4.025512440999349,
"learning_rate": 2e-05,
"loss": 0.1352,
"loss/crossentropy": 2.3775731325149536,
"loss/hidden": 0.12548828125,
"loss/logits": 0.0096644451841712,
"step": 259
},
{
"epoch": 0.26,
"grad_norm": 1.40625,
"grad_norm_var": 4.089703114827474,
"learning_rate": 2e-05,
"loss": 0.1442,
"loss/crossentropy": 2.2461366653442383,
"loss/hidden": 0.13232421875,
"loss/logits": 0.011895926669239998,
"step": 260
},
{
"epoch": 0.261,
"grad_norm": 2.578125,
"grad_norm_var": 4.065040842692057,
"learning_rate": 2e-05,
"loss": 0.1474,
"loss/crossentropy": 1.560776025056839,
"loss/hidden": 0.1337890625,
"loss/logits": 0.013578795362263918,
"step": 261
},
{
"epoch": 0.262,
"grad_norm": 1.5390625,
"grad_norm_var": 4.082124582926432,
"learning_rate": 2e-05,
"loss": 0.1556,
"loss/crossentropy": 1.9976117014884949,
"loss/hidden": 0.14404296875,
"loss/logits": 0.011512083932757378,
"step": 262
},
{
"epoch": 0.263,
"grad_norm": 1.6328125,
"grad_norm_var": 4.018440755208333,
"learning_rate": 2e-05,
"loss": 0.1759,
"loss/crossentropy": 1.705672264099121,
"loss/hidden": 0.16162109375,
"loss/logits": 0.014301342889666557,
"step": 263
},
{
"epoch": 0.264,
"grad_norm": 1.765625,
"grad_norm_var": 3.8464345296223956,
"learning_rate": 2e-05,
"loss": 0.1864,
"loss/crossentropy": 1.7075408101081848,
"loss/hidden": 0.171875,
"loss/logits": 0.01456779520958662,
"step": 264
},
{
"epoch": 0.265,
"grad_norm": 1.859375,
"grad_norm_var": 3.86392822265625,
"learning_rate": 2e-05,
"loss": 0.1677,
"loss/crossentropy": 2.094871759414673,
"loss/hidden": 0.15380859375,
"loss/logits": 0.013906504027545452,
"step": 265
},
{
"epoch": 0.266,
"grad_norm": 2.578125,
"grad_norm_var": 3.8542154947916667,
"learning_rate": 2e-05,
"loss": 0.1591,
"loss/crossentropy": 2.166890859603882,
"loss/hidden": 0.146484375,
"loss/logits": 0.012606294360011816,
"step": 266
},
{
"epoch": 0.267,
"grad_norm": 3.859375,
"grad_norm_var": 3.885705312093099,
"learning_rate": 2e-05,
"loss": 0.1763,
"loss/crossentropy": 1.674479365348816,
"loss/hidden": 0.162109375,
"loss/logits": 0.01416744152083993,
"step": 267
},
{
"epoch": 0.268,
"grad_norm": 2.625,
"grad_norm_var": 3.8142555236816404,
"learning_rate": 2e-05,
"loss": 0.2022,
"loss/crossentropy": 1.0146620571613312,
"loss/hidden": 0.1904296875,
"loss/logits": 0.01172702293843031,
"step": 268
},
{
"epoch": 0.269,
"grad_norm": 1.21875,
"grad_norm_var": 0.4503334045410156,
"learning_rate": 2e-05,
"loss": 0.1457,
"loss/crossentropy": 1.8024365305900574,
"loss/hidden": 0.13427734375,
"loss/logits": 0.011465264018625021,
"step": 269
},
{
"epoch": 0.27,
"grad_norm": 1.4296875,
"grad_norm_var": 0.46684951782226564,
"learning_rate": 2e-05,
"loss": 0.161,
"loss/crossentropy": 1.7421787977218628,
"loss/hidden": 0.14892578125,
"loss/logits": 0.012049074750393629,
"step": 270
},
{
"epoch": 0.271,
"grad_norm": 2.21875,
"grad_norm_var": 0.4663726806640625,
"learning_rate": 2e-05,
"loss": 0.1519,
"loss/crossentropy": 1.1601504981517792,
"loss/hidden": 0.14404296875,
"loss/logits": 0.007814974524080753,
"step": 271
},
{
"epoch": 0.272,
"grad_norm": 1.7421875,
"grad_norm_var": 0.4529693603515625,
"learning_rate": 2e-05,
"loss": 0.1693,
"loss/crossentropy": 1.9806629419326782,
"loss/hidden": 0.15625,
"loss/logits": 0.01302909990772605,
"step": 272
},
{
"epoch": 0.273,
"grad_norm": 1.1796875,
"grad_norm_var": 0.4919352213541667,
"learning_rate": 2e-05,
"loss": 0.1724,
"loss/crossentropy": 2.005366265773773,
"loss/hidden": 0.158203125,
"loss/logits": 0.014153223484754562,
"step": 273
},
{
"epoch": 0.274,
"grad_norm": 1.765625,
"grad_norm_var": 0.4723894755045573,
"learning_rate": 2e-05,
"loss": 0.1808,
"loss/crossentropy": 1.7814961075782776,
"loss/hidden": 0.166015625,
"loss/logits": 0.014784782659262419,
"step": 274
},
{
"epoch": 0.275,
"grad_norm": 1.9921875,
"grad_norm_var": 0.4697011311848958,
"learning_rate": 2e-05,
"loss": 0.1963,
"loss/crossentropy": 1.5670437216758728,
"loss/hidden": 0.1796875,
"loss/logits": 0.016570267733186483,
"step": 275
},
{
"epoch": 0.276,
"grad_norm": 1.4765625,
"grad_norm_var": 0.464800771077474,
"learning_rate": 2e-05,
"loss": 0.1604,
"loss/crossentropy": 2.009281039237976,
"loss/hidden": 0.1494140625,
"loss/logits": 0.010985464788973331,
"step": 276
},
{
"epoch": 0.277,
"grad_norm": 1.4453125,
"grad_norm_var": 0.45259501139322916,
"learning_rate": 2e-05,
"loss": 0.168,
"loss/crossentropy": 1.7085555791854858,
"loss/hidden": 0.15625,
"loss/logits": 0.011709913145750761,
"step": 277
},
{
"epoch": 0.278,
"grad_norm": 1.3828125,
"grad_norm_var": 0.46154683430989585,
"learning_rate": 2e-05,
"loss": 0.1456,
"loss/crossentropy": 2.789747476577759,
"loss/hidden": 0.1337890625,
"loss/logits": 0.011802888009697199,
"step": 278
},
{
"epoch": 0.279,
"grad_norm": 1.859375,
"grad_norm_var": 0.45711441040039064,
"learning_rate": 2e-05,
"loss": 0.1881,
"loss/crossentropy": 1.5918955504894257,
"loss/hidden": 0.173828125,
"loss/logits": 0.014291070867329836,
"step": 279
},
{
"epoch": 0.28,
"grad_norm": 2.8125,
"grad_norm_var": 0.5068682352701823,
"learning_rate": 2e-05,
"loss": 0.1458,
"loss/crossentropy": 0.8236657343804836,
"loss/hidden": 0.139404296875,
"loss/logits": 0.00643135339487344,
"step": 280
},
{
"epoch": 0.281,
"grad_norm": 4.125,
"grad_norm_var": 0.7956764221191406,
"learning_rate": 2e-05,
"loss": 0.1714,
"loss/crossentropy": 2.1279306411743164,
"loss/hidden": 0.15625,
"loss/logits": 0.015115040354430676,
"step": 281
},
{
"epoch": 0.282,
"grad_norm": 1.296875,
"grad_norm_var": 0.8177813212076823,
"learning_rate": 2e-05,
"loss": 0.1669,
"loss/crossentropy": 2.2272568941116333,
"loss/hidden": 0.1533203125,
"loss/logits": 0.0135371801443398,
"step": 282
},
{
"epoch": 0.283,
"grad_norm": 2.515625,
"grad_norm_var": 0.6023089090983073,
"learning_rate": 2e-05,
"loss": 0.1781,
"loss/crossentropy": 2.2013776302337646,
"loss/hidden": 0.16259765625,
"loss/logits": 0.015500886365771294,
"step": 283
},
{
"epoch": 0.284,
"grad_norm": 1.96875,
"grad_norm_var": 0.5695391337076823,
"learning_rate": 2e-05,
"loss": 0.1822,
"loss/crossentropy": 1.6315099596977234,
"loss/hidden": 0.1689453125,
"loss/logits": 0.013229990843683481,
"step": 284
},
{
"epoch": 0.285,
"grad_norm": 2.421875,
"grad_norm_var": 0.550426991780599,
"learning_rate": 2e-05,
"loss": 0.1877,
"loss/crossentropy": 1.329133152961731,
"loss/hidden": 0.1748046875,
"loss/logits": 0.012850106693804264,
"step": 285
},
{
"epoch": 0.286,
"grad_norm": 2.78125,
"grad_norm_var": 0.5659576416015625,
"learning_rate": 2e-05,
"loss": 0.1725,
"loss/crossentropy": 2.0431485772132874,
"loss/hidden": 0.15966796875,
"loss/logits": 0.01284833624958992,
"step": 286
},
{
"epoch": 0.287,
"grad_norm": 2.15625,
"grad_norm_var": 0.5648915608723958,
"learning_rate": 2e-05,
"loss": 0.2173,
"loss/crossentropy": 1.6292879581451416,
"loss/hidden": 0.19970703125,
"loss/logits": 0.017579292878508568,
"step": 287
},
{
"epoch": 0.288,
"grad_norm": 1.4296875,
"grad_norm_var": 0.5841379801432292,
"learning_rate": 2e-05,
"loss": 0.1632,
"loss/crossentropy": 2.0630630254745483,
"loss/hidden": 0.14990234375,
"loss/logits": 0.013251845724880695,
"step": 288
},
{
"epoch": 0.289,
"grad_norm": 1.8203125,
"grad_norm_var": 0.5364664713541667,
"learning_rate": 2e-05,
"loss": 0.2067,
"loss/crossentropy": 2.168562591075897,
"loss/hidden": 0.18798828125,
"loss/logits": 0.01867722487077117,
"step": 289
},
{
"epoch": 0.29,
"grad_norm": 1.21875,
"grad_norm_var": 0.5779449462890625,
"learning_rate": 2e-05,
"loss": 0.166,
"loss/crossentropy": 1.8953060507774353,
"loss/hidden": 0.15380859375,
"loss/logits": 0.01215141685679555,
"step": 290
},
{
"epoch": 0.291,
"grad_norm": 1.7109375,
"grad_norm_var": 0.5848297119140625,
"learning_rate": 2e-05,
"loss": 0.187,
"loss/crossentropy": 1.6148796081542969,
"loss/hidden": 0.173828125,
"loss/logits": 0.013202093075960875,
"step": 291
},
{
"epoch": 0.292,
"grad_norm": 1.6328125,
"grad_norm_var": 0.5749013264973958,
"learning_rate": 2e-05,
"loss": 0.197,
"loss/crossentropy": 1.7814635038375854,
"loss/hidden": 0.1826171875,
"loss/logits": 0.014429094269871712,
"step": 292
},
{
"epoch": 0.293,
"grad_norm": 2.015625,
"grad_norm_var": 0.5503028869628906,
"learning_rate": 2e-05,
"loss": 0.1814,
"loss/crossentropy": 2.1830875873565674,
"loss/hidden": 0.16748046875,
"loss/logits": 0.013968405313789845,
"step": 293
},
{
"epoch": 0.294,
"grad_norm": 1.7109375,
"grad_norm_var": 0.5268898010253906,
"learning_rate": 2e-05,
"loss": 0.2098,
"loss/crossentropy": 1.681401550769806,
"loss/hidden": 0.19482421875,
"loss/logits": 0.01494319923222065,
"step": 294
},
{
"epoch": 0.295,
"grad_norm": 1.3046875,
"grad_norm_var": 0.5633453369140625,
"learning_rate": 2e-05,
"loss": 0.1884,
"loss/crossentropy": 1.953886091709137,
"loss/hidden": 0.173828125,
"loss/logits": 0.014602533541619778,
"step": 295
},
{
"epoch": 0.296,
"grad_norm": 1.6875,
"grad_norm_var": 0.5292144775390625,
"learning_rate": 2e-05,
"loss": 0.1987,
"loss/crossentropy": 1.6944631338119507,
"loss/hidden": 0.18603515625,
"loss/logits": 0.012617598287761211,
"step": 296
},
{
"epoch": 0.297,
"grad_norm": 1.8359375,
"grad_norm_var": 0.20425999959309896,
"learning_rate": 2e-05,
"loss": 0.2261,
"loss/crossentropy": 2.214042544364929,
"loss/hidden": 0.205078125,
"loss/logits": 0.020975200459361076,
"step": 297
},
{
"epoch": 0.298,
"grad_norm": 1.1484375,
"grad_norm_var": 0.2164703369140625,
"learning_rate": 2e-05,
"loss": 0.1842,
"loss/crossentropy": 2.1237878799438477,
"loss/hidden": 0.16943359375,
"loss/logits": 0.014801782555878162,
"step": 298
},
{
"epoch": 0.299,
"grad_norm": 1.4765625,
"grad_norm_var": 0.18964818318684895,
"learning_rate": 2e-05,
"loss": 0.1814,
"loss/crossentropy": 1.492847979068756,
"loss/hidden": 0.16845703125,
"loss/logits": 0.012967187445610762,
"step": 299
},
{
"epoch": 0.3,
"grad_norm": 1.890625,
"grad_norm_var": 0.1879595438639323,
"learning_rate": 2e-05,
"loss": 0.1776,
"loss/crossentropy": 2.2924291491508484,
"loss/hidden": 0.16357421875,
"loss/logits": 0.014043833129107952,
"step": 300
},
{
"epoch": 0.301,
"grad_norm": 4.1875,
"grad_norm_var": 0.5374061584472656,
"learning_rate": 2e-05,
"loss": 0.2062,
"loss/crossentropy": 1.607342541217804,
"loss/hidden": 0.18994140625,
"loss/logits": 0.016273885034024715,
"step": 301
},
{
"epoch": 0.302,
"grad_norm": 1.5859375,
"grad_norm_var": 0.4823486328125,
"learning_rate": 2e-05,
"loss": 0.2143,
"loss/crossentropy": 1.8559609055519104,
"loss/hidden": 0.197265625,
"loss/logits": 0.017047187313437462,
"step": 302
},
{
"epoch": 0.303,
"grad_norm": 1.2265625,
"grad_norm_var": 0.4923052469889323,
"learning_rate": 2e-05,
"loss": 0.1814,
"loss/crossentropy": 2.4204115867614746,
"loss/hidden": 0.16796875,
"loss/logits": 0.013407074846327305,
"step": 303
},
{
"epoch": 0.304,
"grad_norm": 2.15625,
"grad_norm_var": 0.49497782389322914,
"learning_rate": 2e-05,
"loss": 0.2058,
"loss/crossentropy": 1.7306669354438782,
"loss/hidden": 0.189453125,
"loss/logits": 0.016323519870638847,
"step": 304
},
{
"epoch": 0.305,
"grad_norm": 1.6484375,
"grad_norm_var": 0.4960856119791667,
"learning_rate": 2e-05,
"loss": 0.1877,
"loss/crossentropy": 2.212082266807556,
"loss/hidden": 0.171875,
"loss/logits": 0.015811644960194826,
"step": 305
},
{
"epoch": 0.306,
"grad_norm": 1.3046875,
"grad_norm_var": 0.4901466369628906,
"learning_rate": 2e-05,
"loss": 0.1902,
"loss/crossentropy": 1.9250993132591248,
"loss/hidden": 0.17626953125,
"loss/logits": 0.013882125727832317,
"step": 306
},
{
"epoch": 0.307,
"grad_norm": 5.75,
"grad_norm_var": 1.4711181640625,
"learning_rate": 2e-05,
"loss": 0.1934,
"loss/crossentropy": 0.4879331737756729,
"loss/hidden": 0.18701171875,
"loss/logits": 0.006413323106244206,
"step": 307
},
{
"epoch": 0.308,
"grad_norm": 3.046875,
"grad_norm_var": 1.520232899983724,
"learning_rate": 2e-05,
"loss": 0.1973,
"loss/crossentropy": 1.4504847526550293,
"loss/hidden": 0.1875,
"loss/logits": 0.009785078698769212,
"step": 308
},
{
"epoch": 0.309,
"grad_norm": 1.40625,
"grad_norm_var": 1.5522092183430989,
"learning_rate": 2e-05,
"loss": 0.2057,
"loss/crossentropy": 2.149027943611145,
"loss/hidden": 0.189453125,
"loss/logits": 0.01620970480144024,
"step": 309
},
{
"epoch": 0.31,
"grad_norm": 1.7578125,
"grad_norm_var": 1.550005849202474,
"learning_rate": 2e-05,
"loss": 0.2027,
"loss/crossentropy": 2.1503273248672485,
"loss/hidden": 0.185546875,
"loss/logits": 0.01712088193744421,
"step": 310
},
{
"epoch": 0.311,
"grad_norm": 1.4375,
"grad_norm_var": 1.5372304280598958,
"learning_rate": 2e-05,
"loss": 0.1888,
"loss/crossentropy": 2.1748342514038086,
"loss/hidden": 0.17333984375,
"loss/logits": 0.01546872965991497,
"step": 311
},
{
"epoch": 0.312,
"grad_norm": 1.4921875,
"grad_norm_var": 1.5502703348795572,
"learning_rate": 2e-05,
"loss": 0.2158,
"loss/crossentropy": 1.3706732988357544,
"loss/hidden": 0.20166015625,
"loss/logits": 0.014161557890474796,
"step": 312
},
{
"epoch": 0.313,
"grad_norm": 2.421875,
"grad_norm_var": 1.5523111979166666,
"learning_rate": 2e-05,
"loss": 0.2021,
"loss/crossentropy": 1.8907567262649536,
"loss/hidden": 0.18701171875,
"loss/logits": 0.015071831177920103,
"step": 313
},
{
"epoch": 0.314,
"grad_norm": 1.296875,
"grad_norm_var": 1.5344378153483074,
"learning_rate": 2e-05,
"loss": 0.201,
"loss/crossentropy": 1.7888588905334473,
"loss/hidden": 0.1875,
"loss/logits": 0.013532605487853289,
"step": 314
},
{
"epoch": 0.315,
"grad_norm": 1.5859375,
"grad_norm_var": 1.5256507873535157,
"learning_rate": 2e-05,
"loss": 0.2166,
"loss/crossentropy": 1.5358025133609772,
"loss/hidden": 0.2021484375,
"loss/logits": 0.014410331379622221,
"step": 315
},
{
"epoch": 0.316,
"grad_norm": 56.0,
"grad_norm_var": 182.73569310506184,
"learning_rate": 2e-05,
"loss": 0.2529,
"loss/crossentropy": 2.1001065373420715,
"loss/hidden": 0.234375,
"loss/logits": 0.01847642147913575,
"step": 316
},
{
"epoch": 0.317,
"grad_norm": 1.296875,
"grad_norm_var": 183.77112401326497,
"learning_rate": 2e-05,
"loss": 0.1958,
"loss/crossentropy": 2.3731868267059326,
"loss/hidden": 0.1796875,
"loss/logits": 0.01615766156464815,
"step": 317
},
{
"epoch": 0.318,
"grad_norm": 1.53125,
"grad_norm_var": 183.79867248535157,
"learning_rate": 2e-05,
"loss": 0.2212,
"loss/crossentropy": 1.8716753125190735,
"loss/hidden": 0.2041015625,
"loss/logits": 0.017116894014179707,
"step": 318
},
{
"epoch": 0.319,
"grad_norm": 1.9921875,
"grad_norm_var": 183.41590983072916,
"learning_rate": 2e-05,
"loss": 0.1938,
"loss/crossentropy": 1.2205194532871246,
"loss/hidden": 0.18115234375,
"loss/logits": 0.012608660385012627,
"step": 319
},
{
"epoch": 0.32,
"grad_norm": 1.203125,
"grad_norm_var": 183.88273010253906,
"learning_rate": 2e-05,
"loss": 0.1822,
"loss/crossentropy": 2.3611029386520386,
"loss/hidden": 0.1689453125,
"loss/logits": 0.013240456581115723,
"step": 320
},
{
"epoch": 0.321,
"grad_norm": 1.3046875,
"grad_norm_var": 184.05854390462238,
"learning_rate": 2e-05,
"loss": 0.193,
"loss/crossentropy": 1.8402240872383118,
"loss/hidden": 0.18017578125,
"loss/logits": 0.012811433058232069,
"step": 321
},
{
"epoch": 0.322,
"grad_norm": 1.3671875,
"grad_norm_var": 184.02547912597657,
"learning_rate": 2e-05,
"loss": 0.2238,
"loss/crossentropy": 1.9131136536598206,
"loss/hidden": 0.20751953125,
"loss/logits": 0.016317113302648067,
"step": 322
},
{
"epoch": 0.323,
"grad_norm": 1.9765625,
"grad_norm_var": 184.69184951782228,
"learning_rate": 2e-05,
"loss": 0.2509,
"loss/crossentropy": 1.4010455012321472,
"loss/hidden": 0.23193359375,
"loss/logits": 0.018928353674709797,
"step": 323
},
{
"epoch": 0.324,
"grad_norm": 2.234375,
"grad_norm_var": 184.9522621154785,
"learning_rate": 2e-05,
"loss": 0.1929,
"loss/crossentropy": 1.9659223556518555,
"loss/hidden": 0.1796875,
"loss/logits": 0.013216304127126932,
"step": 324
},
{
"epoch": 0.325,
"grad_norm": 1.75,
"grad_norm_var": 184.79406102498373,
"learning_rate": 2e-05,
"loss": 0.1877,
"loss/crossentropy": 1.5221052765846252,
"loss/hidden": 0.17626953125,
"loss/logits": 0.011447824770584702,
"step": 325
},
{
"epoch": 0.326,
"grad_norm": 1.5546875,
"grad_norm_var": 184.88554662068685,
"learning_rate": 2e-05,
"loss": 0.2212,
"loss/crossentropy": 2.06081086397171,
"loss/hidden": 0.20361328125,
"loss/logits": 0.017567144706845284,
"step": 326
},
{
"epoch": 0.327,
"grad_norm": 3.578125,
"grad_norm_var": 184.14719823201497,
"learning_rate": 2e-05,
"loss": 0.1707,
"loss/crossentropy": 0.8908511102199554,
"loss/hidden": 0.1640625,
"loss/logits": 0.006589735276065767,
"step": 327
},
{
"epoch": 0.328,
"grad_norm": 2.1875,
"grad_norm_var": 183.83722737630208,
"learning_rate": 2e-05,
"loss": 0.2041,
"loss/crossentropy": 1.4793621897697449,
"loss/hidden": 0.19384765625,
"loss/logits": 0.010210367618128657,
"step": 328
},
{
"epoch": 0.329,
"grad_norm": 1.5703125,
"grad_norm_var": 184.19855931599935,
"learning_rate": 2e-05,
"loss": 0.2174,
"loss/crossentropy": 1.5629376769065857,
"loss/hidden": 0.20166015625,
"loss/logits": 0.015733799897134304,
"step": 329
},
{
"epoch": 0.33,
"grad_norm": 1.4609375,
"grad_norm_var": 184.11591389973958,
"learning_rate": 2e-05,
"loss": 0.2297,
"loss/crossentropy": 2.016783118247986,
"loss/hidden": 0.2119140625,
"loss/logits": 0.017778108827769756,
"step": 330
},
{
"epoch": 0.331,
"grad_norm": 1.3671875,
"grad_norm_var": 184.22320963541668,
"learning_rate": 2e-05,
"loss": 0.2183,
"loss/crossentropy": 2.3946865797042847,
"loss/hidden": 0.2001953125,
"loss/logits": 0.01807898748666048,
"step": 331
},
{
"epoch": 0.332,
"grad_norm": 1.234375,
"grad_norm_var": 0.35546773274739585,
"learning_rate": 2e-05,
"loss": 0.2244,
"loss/crossentropy": 1.6463975310325623,
"loss/hidden": 0.2099609375,
"loss/logits": 0.014466887805610895,
"step": 332
},
{
"epoch": 0.333,
"grad_norm": 1.703125,
"grad_norm_var": 0.34256083170572915,
"learning_rate": 2e-05,
"loss": 0.2653,
"loss/crossentropy": 1.727737307548523,
"loss/hidden": 0.24462890625,
"loss/logits": 0.020694734528660774,
"step": 333
},
{
"epoch": 0.334,
"grad_norm": 2.34375,
"grad_norm_var": 0.36001688639322915,
"learning_rate": 2e-05,
"loss": 0.2636,
"loss/crossentropy": 1.8381291031837463,
"loss/hidden": 0.244140625,
"loss/logits": 0.019478057511150837,
"step": 334
},
{
"epoch": 0.335,
"grad_norm": 5.5,
"grad_norm_var": 1.2181292215983073,
"learning_rate": 2e-05,
"loss": 0.2789,
"loss/crossentropy": 1.395434319972992,
"loss/hidden": 0.25732421875,
"loss/logits": 0.02152822446078062,
"step": 335
},
{
"epoch": 0.336,
"grad_norm": 1.7578125,
"grad_norm_var": 1.1768707275390624,
"learning_rate": 2e-05,
"loss": 0.2301,
"loss/crossentropy": 1.7802979946136475,
"loss/hidden": 0.212890625,
"loss/logits": 0.01717265695333481,
"step": 336
},
{
"epoch": 0.337,
"grad_norm": 1.2265625,
"grad_norm_var": 1.1850748697916667,
"learning_rate": 2e-05,
"loss": 0.2195,
"loss/crossentropy": 1.864999234676361,
"loss/hidden": 0.20361328125,
"loss/logits": 0.015909720212221146,
"step": 337
},
{
"epoch": 0.338,
"grad_norm": 1.5078125,
"grad_norm_var": 1.1734934488932292,
"learning_rate": 2e-05,
"loss": 0.2322,
"loss/crossentropy": 1.9171935319900513,
"loss/hidden": 0.2138671875,
"loss/logits": 0.01834118738770485,
"step": 338
},
{
"epoch": 0.339,
"grad_norm": 1.7109375,
"grad_norm_var": 1.1808430989583334,
"learning_rate": 2e-05,
"loss": 0.2546,
"loss/crossentropy": 2.232408821582794,
"loss/hidden": 0.23388671875,
"loss/logits": 0.02068551816046238,
"step": 339
},
{
"epoch": 0.34,
"grad_norm": 4.40625,
"grad_norm_var": 1.531086222330729,
"learning_rate": 2e-05,
"loss": 0.2209,
"loss/crossentropy": 0.885938722640276,
"loss/hidden": 0.2138671875,
"loss/logits": 0.0069831793662160635,
"step": 340
},
{
"epoch": 0.341,
"grad_norm": 1.8046875,
"grad_norm_var": 1.5281471252441405,
"learning_rate": 2e-05,
"loss": 0.274,
"loss/crossentropy": 2.053671360015869,
"loss/hidden": 0.25,
"loss/logits": 0.024039674550294876,
"step": 341
},
{
"epoch": 0.342,
"grad_norm": 1.4765625,
"grad_norm_var": 1.535064442952474,
"learning_rate": 2e-05,
"loss": 0.248,
"loss/crossentropy": 2.1628893613815308,
"loss/hidden": 0.22900390625,
"loss/logits": 0.01902489084750414,
"step": 342
},
{
"epoch": 0.343,
"grad_norm": 1.6875,
"grad_norm_var": 1.4053301493326822,
"learning_rate": 2e-05,
"loss": 0.2355,
"loss/crossentropy": 1.9784727692604065,
"loss/hidden": 0.216796875,
"loss/logits": 0.018667724914848804,
"step": 343
},
{
"epoch": 0.344,
"grad_norm": 1.9453125,
"grad_norm_var": 1.4048492431640625,
"learning_rate": 2e-05,
"loss": 0.2215,
"loss/crossentropy": 2.1430813670158386,
"loss/hidden": 0.205078125,
"loss/logits": 0.016372697427868843,
"step": 344
},
{
"epoch": 0.345,
"grad_norm": 3.34375,
"grad_norm_var": 1.489422353108724,
"learning_rate": 2e-05,
"loss": 0.2828,
"loss/crossentropy": 1.4574592113494873,
"loss/hidden": 0.259765625,
"loss/logits": 0.02300189435482025,
"step": 345
},
{
"epoch": 0.346,
"grad_norm": 4.59375,
"grad_norm_var": 1.8130035400390625,
"learning_rate": 2e-05,
"loss": 0.2555,
"loss/crossentropy": 2.1325125694274902,
"loss/hidden": 0.234375,
"loss/logits": 0.021130304783582687,
"step": 346
},
{
"epoch": 0.347,
"grad_norm": 1.4453125,
"grad_norm_var": 1.8031412760416667,
"learning_rate": 2e-05,
"loss": 0.233,
"loss/crossentropy": 2.6941460371017456,
"loss/hidden": 0.21435546875,
"loss/logits": 0.01859632506966591,
"step": 347
},
{
"epoch": 0.348,
"grad_norm": 1.6171875,
"grad_norm_var": 1.755077870686849,
"learning_rate": 2e-05,
"loss": 0.2562,
"loss/crossentropy": 1.8957814574241638,
"loss/hidden": 0.236328125,
"loss/logits": 0.019866405054926872,
"step": 348
},
{
"epoch": 0.349,
"grad_norm": 1.953125,
"grad_norm_var": 1.7364418029785156,
"learning_rate": 2e-05,
"loss": 0.2507,
"loss/crossentropy": 2.5658878087997437,
"loss/hidden": 0.2294921875,
"loss/logits": 0.02118699811398983,
"step": 349
},
{
"epoch": 0.35,
"grad_norm": 1.890625,
"grad_norm_var": 1.7523719787597656,
"learning_rate": 2e-05,
"loss": 0.233,
"loss/crossentropy": 1.9111933708190918,
"loss/hidden": 0.21533203125,
"loss/logits": 0.01770856324583292,
"step": 350
},
{
"epoch": 0.351,
"grad_norm": 2.625,
"grad_norm_var": 1.0678749084472656,
"learning_rate": 2e-05,
"loss": 0.2712,
"loss/crossentropy": 1.5525288581848145,
"loss/hidden": 0.25244140625,
"loss/logits": 0.01877846010029316,
"step": 351
},
{
"epoch": 0.352,
"grad_norm": 1.671875,
"grad_norm_var": 1.07325439453125,
"learning_rate": 2e-05,
"loss": 0.2398,
"loss/crossentropy": 1.47780179977417,
"loss/hidden": 0.224609375,
"loss/logits": 0.015163760632276535,
"step": 352
},
{
"epoch": 0.353,
"grad_norm": 1.40625,
"grad_norm_var": 1.0523902893066406,
"learning_rate": 2e-05,
"loss": 0.2579,
"loss/crossentropy": 1.6976242065429688,
"loss/hidden": 0.240234375,
"loss/logits": 0.01768268644809723,
"step": 353
},
{
"epoch": 0.354,
"grad_norm": 1.375,
"grad_norm_var": 1.065623982747396,
"learning_rate": 2e-05,
"loss": 0.2594,
"loss/crossentropy": 1.5402989983558655,
"loss/hidden": 0.24169921875,
"loss/logits": 0.017742513678967953,
"step": 354
},
{
"epoch": 0.355,
"grad_norm": 2.609375,
"grad_norm_var": 1.0593360900878905,
"learning_rate": 2e-05,
"loss": 0.2983,
"loss/crossentropy": 1.7891557812690735,
"loss/hidden": 0.2744140625,
"loss/logits": 0.023881751112639904,
"step": 355
},
{
"epoch": 0.356,
"grad_norm": 1.5859375,
"grad_norm_var": 0.7421427408854167,
"learning_rate": 2e-05,
"loss": 0.2353,
"loss/crossentropy": 2.255465269088745,
"loss/hidden": 0.2177734375,
"loss/logits": 0.01755282748490572,
"step": 356
},
{
"epoch": 0.357,
"grad_norm": 1.4375,
"grad_norm_var": 0.763287099202474,
"learning_rate": 2e-05,
"loss": 0.2388,
"loss/crossentropy": 2.2716734409332275,
"loss/hidden": 0.22021484375,
"loss/logits": 0.018602201715111732,
"step": 357
},
{
"epoch": 0.358,
"grad_norm": 2.34375,
"grad_norm_var": 0.7449666341145833,
"learning_rate": 2e-05,
"loss": 0.2737,
"loss/crossentropy": 1.8382077813148499,
"loss/hidden": 0.2548828125,
"loss/logits": 0.018825003411620855,
"step": 358
},
{
"epoch": 0.359,
"grad_norm": 1.5546875,
"grad_norm_var": 0.7532976786295573,
"learning_rate": 2e-05,
"loss": 0.2391,
"loss/crossentropy": 1.6230210661888123,
"loss/hidden": 0.224609375,
"loss/logits": 0.014487342443317175,
"step": 359
},
{
"epoch": 0.36,
"grad_norm": 1.421875,
"grad_norm_var": 0.7803385416666667,
"learning_rate": 2e-05,
"loss": 0.2519,
"loss/crossentropy": 1.6961406469345093,
"loss/hidden": 0.234375,
"loss/logits": 0.017499960027635098,
"step": 360
},
{
"epoch": 0.361,
"grad_norm": 1.5703125,
"grad_norm_var": 0.6720965067545573,
"learning_rate": 2e-05,
"loss": 0.2623,
"loss/crossentropy": 2.1821005940437317,
"loss/hidden": 0.24072265625,
"loss/logits": 0.021556712687015533,
"step": 361
},
{
"epoch": 0.362,
"grad_norm": 1.6484375,
"grad_norm_var": 0.17363688151041667,
"learning_rate": 2e-05,
"loss": 0.2759,
"loss/crossentropy": 1.7173206806182861,
"loss/hidden": 0.255859375,
"loss/logits": 0.020033356733620167,
"step": 362
},
{
"epoch": 0.363,
"grad_norm": 1.5859375,
"grad_norm_var": 0.16897684733072918,
"learning_rate": 2e-05,
"loss": 0.2552,
"loss/crossentropy": 1.8281689882278442,
"loss/hidden": 0.23681640625,
"loss/logits": 0.018404729664325714,
"step": 363
},
{
"epoch": 0.364,
"grad_norm": 1.3125,
"grad_norm_var": 0.1809282938639323,
"learning_rate": 2e-05,
"loss": 0.2546,
"loss/crossentropy": 2.181256651878357,
"loss/hidden": 0.23486328125,
"loss/logits": 0.01975287776440382,
"step": 364
},
{
"epoch": 0.365,
"grad_norm": 3.796875,
"grad_norm_var": 0.4434466044108073,
"learning_rate": 2e-05,
"loss": 0.2803,
"loss/crossentropy": 1.4486916065216064,
"loss/hidden": 0.2607421875,
"loss/logits": 0.01950985286384821,
"step": 365
},
{
"epoch": 0.366,
"grad_norm": 1.234375,
"grad_norm_var": 0.4680987040201823,
"learning_rate": 2e-05,
"loss": 0.2504,
"loss/crossentropy": 2.026048183441162,
"loss/hidden": 0.232421875,
"loss/logits": 0.017978372983634472,
"step": 366
},
{
"epoch": 0.367,
"grad_norm": 4.3125,
"grad_norm_var": 0.8263628641764323,
"learning_rate": 2e-05,
"loss": 0.2579,
"loss/crossentropy": 1.4382375180721283,
"loss/hidden": 0.2412109375,
"loss/logits": 0.016655512619763613,
"step": 367
},
{
"epoch": 0.368,
"grad_norm": 2.25,
"grad_norm_var": 0.827416737874349,
"learning_rate": 2e-05,
"loss": 0.3072,
"loss/crossentropy": 1.57509446144104,
"loss/hidden": 0.2880859375,
"loss/logits": 0.019162926822900772,
"step": 368
},
{
"epoch": 0.369,
"grad_norm": 1.78125,
"grad_norm_var": 0.808251698811849,
"learning_rate": 2e-05,
"loss": 0.2383,
"loss/crossentropy": 2.0060970187187195,
"loss/hidden": 0.22021484375,
"loss/logits": 0.018060280941426754,
"step": 369
},
{
"epoch": 0.37,
"grad_norm": 2.546875,
"grad_norm_var": 0.798180898030599,
"learning_rate": 2e-05,
"loss": 0.2523,
"loss/crossentropy": 1.2137621641159058,
"loss/hidden": 0.24072265625,
"loss/logits": 0.011561613995581865,
"step": 370
},
{
"epoch": 0.371,
"grad_norm": 1.703125,
"grad_norm_var": 0.7833717346191407,
"learning_rate": 2e-05,
"loss": 0.2561,
"loss/crossentropy": 1.764179289340973,
"loss/hidden": 0.240234375,
"loss/logits": 0.015869705006480217,
"step": 371
},
{
"epoch": 0.372,
"grad_norm": 1.5625,
"grad_norm_var": 0.784716796875,
"learning_rate": 2e-05,
"loss": 0.2642,
"loss/crossentropy": 2.1394487619400024,
"loss/hidden": 0.2451171875,
"loss/logits": 0.01907090563327074,
"step": 372
},
{
"epoch": 0.373,
"grad_norm": 1.9765625,
"grad_norm_var": 0.7621681213378906,
"learning_rate": 2e-05,
"loss": 0.2496,
"loss/crossentropy": 2.151320219039917,
"loss/hidden": 0.23095703125,
"loss/logits": 0.018605505116283894,
"step": 373
},
{
"epoch": 0.374,
"grad_norm": 1.5234375,
"grad_norm_var": 0.77073974609375,
"learning_rate": 2e-05,
"loss": 0.2426,
"loss/crossentropy": 2.291616916656494,
"loss/hidden": 0.2255859375,
"loss/logits": 0.01696862932294607,
"step": 374
},
{
"epoch": 0.375,
"grad_norm": 1.1640625,
"grad_norm_var": 0.8027577718098958,
"learning_rate": 2e-05,
"loss": 0.2482,
"loss/crossentropy": 2.1597548127174377,
"loss/hidden": 0.228515625,
"loss/logits": 0.019656311720609665,
"step": 375
},
{
"epoch": 0.376,
"grad_norm": 4.5625,
"grad_norm_var": 1.1930867513020833,
"learning_rate": 2e-05,
"loss": 0.2546,
"loss/crossentropy": 0.7966546472162008,
"loss/hidden": 0.24609375,
"loss/logits": 0.008532016014214605,
"step": 376
},
{
"epoch": 0.377,
"grad_norm": 1.25,
"grad_norm_var": 1.2246070861816407,
"learning_rate": 2e-05,
"loss": 0.2394,
"loss/crossentropy": 1.730500340461731,
"loss/hidden": 0.22314453125,
"loss/logits": 0.016217158176004887,
"step": 377
},
{
"epoch": 0.378,
"grad_norm": 1.9453125,
"grad_norm_var": 1.210729726155599,
"learning_rate": 2e-05,
"loss": 0.2672,
"loss/crossentropy": 2.0575554966926575,
"loss/hidden": 0.2470703125,
"loss/logits": 0.02009457629173994,
"step": 378
},
{
"epoch": 0.379,
"grad_norm": 4.15625,
"grad_norm_var": 1.4280181884765626,
"learning_rate": 2e-05,
"loss": 0.3649,
"loss/crossentropy": 2.409613251686096,
"loss/hidden": 0.330078125,
"loss/logits": 0.034814249724149704,
"step": 379
},
{
"epoch": 0.38,
"grad_norm": 2.34375,
"grad_norm_var": 1.3563140869140624,
"learning_rate": 2e-05,
"loss": 0.2651,
"loss/crossentropy": 1.4721761345863342,
"loss/hidden": 0.2490234375,
"loss/logits": 0.016095119062811136,
"step": 380
},
{
"epoch": 0.381,
"grad_norm": 1.2265625,
"grad_norm_var": 1.2842750549316406,
"learning_rate": 2e-05,
"loss": 0.2538,
"loss/crossentropy": 2.51900315284729,
"loss/hidden": 0.2314453125,
"loss/logits": 0.022326381877064705,
"step": 381
},
{
"epoch": 0.382,
"grad_norm": 2.234375,
"grad_norm_var": 1.2151995340983073,
"learning_rate": 2e-05,
"loss": 0.2743,
"loss/crossentropy": 2.030519187450409,
"loss/hidden": 0.2548828125,
"loss/logits": 0.01944338995963335,
"step": 382
},
{
"epoch": 0.383,
"grad_norm": 3.859375,
"grad_norm_var": 1.1054583231608073,
"learning_rate": 2e-05,
"loss": 0.3105,
"loss/crossentropy": 0.7516276463866234,
"loss/hidden": 0.2978515625,
"loss/logits": 0.012636175146326423,
"step": 383
},
{
"epoch": 0.384,
"grad_norm": 2.296875,
"grad_norm_var": 1.1055620829264323,
"learning_rate": 2e-05,
"loss": 0.2867,
"loss/crossentropy": 1.9317356944084167,
"loss/hidden": 0.263671875,
"loss/logits": 0.023075740784406662,
"step": 384
},
{
"epoch": 0.385,
"grad_norm": 2.09375,
"grad_norm_var": 1.0917884826660156,
"learning_rate": 2e-05,
"loss": 0.3261,
"loss/crossentropy": 2.1155296564102173,
"loss/hidden": 0.2998046875,
"loss/logits": 0.02629261091351509,
"step": 385
},
{
"epoch": 0.386,
"grad_norm": 1.7734375,
"grad_norm_var": 1.1014312744140624,
"learning_rate": 2e-05,
"loss": 0.287,
"loss/crossentropy": 2.1998232007026672,
"loss/hidden": 0.265625,
"loss/logits": 0.021336179226636887,
"step": 386
},
{
"epoch": 0.387,
"grad_norm": 1.8671875,
"grad_norm_var": 1.0915992736816407,
"learning_rate": 2e-05,
"loss": 0.2608,
"loss/crossentropy": 1.9437836408615112,
"loss/hidden": 0.2412109375,
"loss/logits": 0.019607914611697197,
"step": 387
},
{
"epoch": 0.388,
"grad_norm": 2.125,
"grad_norm_var": 1.0605812072753906,
"learning_rate": 2e-05,
"loss": 0.2871,
"loss/crossentropy": 1.7142232656478882,
"loss/hidden": 0.2666015625,
"loss/logits": 0.020461218431591988,
"step": 388
},
{
"epoch": 0.389,
"grad_norm": 1.640625,
"grad_norm_var": 1.0809977213541666,
"learning_rate": 2e-05,
"loss": 0.2863,
"loss/crossentropy": 2.236941933631897,
"loss/hidden": 0.2626953125,
"loss/logits": 0.023648610338568687,
"step": 389
},
{
"epoch": 0.39,
"grad_norm": 3.125,
"grad_norm_var": 1.0853248596191407,
"learning_rate": 2e-05,
"loss": 0.2733,
"loss/crossentropy": 1.2834028005599976,
"loss/hidden": 0.2607421875,
"loss/logits": 0.01257804874330759,
"step": 390
},
{
"epoch": 0.391,
"grad_norm": 1.5,
"grad_norm_var": 1.0390787760416667,
"learning_rate": 2e-05,
"loss": 0.3026,
"loss/crossentropy": 1.5867803692817688,
"loss/hidden": 0.2822265625,
"loss/logits": 0.020396556705236435,
"step": 391
},
{
"epoch": 0.392,
"grad_norm": 1.53125,
"grad_norm_var": 0.7292439778645833,
"learning_rate": 2e-05,
"loss": 0.297,
"loss/crossentropy": 1.4337636232376099,
"loss/hidden": 0.2783203125,
"loss/logits": 0.01866168435662985,
"step": 392
},
{
"epoch": 0.393,
"grad_norm": 1.71875,
"grad_norm_var": 0.6845052083333333,
"learning_rate": 2e-05,
"loss": 0.2642,
"loss/crossentropy": 2.1386572122573853,
"loss/hidden": 0.24462890625,
"loss/logits": 0.019583708606660366,
"step": 393
},
{
"epoch": 0.394,
"grad_norm": 2.9375,
"grad_norm_var": 0.710375722249349,
"learning_rate": 2e-05,
"loss": 0.3313,
"loss/crossentropy": 1.936402440071106,
"loss/hidden": 0.3046875,
"loss/logits": 0.026638174429535866,
"step": 394
},
{
"epoch": 0.395,
"grad_norm": 1.8671875,
"grad_norm_var": 0.4642567952473958,
"learning_rate": 2e-05,
"loss": 0.2699,
"loss/crossentropy": 2.2741682529449463,
"loss/hidden": 0.248046875,
"loss/logits": 0.021812792867422104,
"step": 395
},
{
"epoch": 0.396,
"grad_norm": 4.84375,
"grad_norm_var": 0.9248687744140625,
"learning_rate": 2e-05,
"loss": 0.3035,
"loss/crossentropy": 1.1322659850120544,
"loss/hidden": 0.291015625,
"loss/logits": 0.01252604997716844,
"step": 396
},
{
"epoch": 0.397,
"grad_norm": 2.53125,
"grad_norm_var": 0.8462562561035156,
"learning_rate": 2e-05,
"loss": 0.3108,
"loss/crossentropy": 1.358659565448761,
"loss/hidden": 0.2900390625,
"loss/logits": 0.02074052207171917,
"step": 397
},
{
"epoch": 0.398,
"grad_norm": 1.84375,
"grad_norm_var": 0.862939198811849,
"learning_rate": 2e-05,
"loss": 0.3,
"loss/crossentropy": 1.9806614518165588,
"loss/hidden": 0.2783203125,
"loss/logits": 0.02170161809772253,
"step": 398
},
{
"epoch": 0.399,
"grad_norm": 1.9296875,
"grad_norm_var": 0.706591796875,
"learning_rate": 2e-05,
"loss": 0.2984,
"loss/crossentropy": 2.3857691287994385,
"loss/hidden": 0.2744140625,
"loss/logits": 0.023968273773789406,
"step": 399
},
{
"epoch": 0.4,
"grad_norm": 1.9140625,
"grad_norm_var": 0.7121620178222656,
"learning_rate": 2e-05,
"loss": 0.2732,
"loss/crossentropy": 2.006265163421631,
"loss/hidden": 0.2509765625,
"loss/logits": 0.02220850996673107,
"step": 400
},
{
"epoch": 0.401,
"grad_norm": 1.8046875,
"grad_norm_var": 0.7215810139973958,
"learning_rate": 2e-05,
"loss": 0.2935,
"loss/crossentropy": 1.7221473455429077,
"loss/hidden": 0.275390625,
"loss/logits": 0.018067960627377033,
"step": 401
},
{
"epoch": 0.402,
"grad_norm": 2.421875,
"grad_norm_var": 0.7123146057128906,
"learning_rate": 2e-05,
"loss": 0.2923,
"loss/crossentropy": 2.0756383538246155,
"loss/hidden": 0.275390625,
"loss/logits": 0.016928995959460735,
"step": 402
},
{
"epoch": 0.403,
"grad_norm": 1.53125,
"grad_norm_var": 0.7353993733723958,
"learning_rate": 2e-05,
"loss": 0.2972,
"loss/crossentropy": 1.6683465242385864,
"loss/hidden": 0.2783203125,
"loss/logits": 0.018839839845895767,
"step": 403
},
{
"epoch": 0.404,
"grad_norm": 1.8125,
"grad_norm_var": 0.7447987874348958,
"learning_rate": 2e-05,
"loss": 0.2966,
"loss/crossentropy": 1.737410545349121,
"loss/hidden": 0.2763671875,
"loss/logits": 0.02023144531995058,
"step": 404
},
{
"epoch": 0.405,
"grad_norm": 1.3046875,
"grad_norm_var": 0.7762163798014323,
"learning_rate": 2e-05,
"loss": 0.2855,
"loss/crossentropy": 2.2183534502983093,
"loss/hidden": 0.26513671875,
"loss/logits": 0.02036190778017044,
"step": 405
},
{
"epoch": 0.406,
"grad_norm": 1.5,
"grad_norm_var": 0.7329465230305989,
"learning_rate": 2e-05,
"loss": 0.3193,
"loss/crossentropy": 1.8786720633506775,
"loss/hidden": 0.294921875,
"loss/logits": 0.024385149590671062,
"step": 406
},
{
"epoch": 0.407,
"grad_norm": 1.5,
"grad_norm_var": 0.7329465230305989,
"learning_rate": 2e-05,
"loss": 0.3099,
"loss/crossentropy": 1.8731706738471985,
"loss/hidden": 0.2861328125,
"loss/logits": 0.023721362464129925,
"step": 407
},
{
"epoch": 0.408,
"grad_norm": 1.953125,
"grad_norm_var": 0.714214833577474,
"learning_rate": 2e-05,
"loss": 0.2993,
"loss/crossentropy": 2.0363497734069824,
"loss/hidden": 0.2763671875,
"loss/logits": 0.02292494662106037,
"step": 408
},
{
"epoch": 0.409,
"grad_norm": 1.421875,
"grad_norm_var": 0.7343544006347656,
"learning_rate": 2e-05,
"loss": 0.2919,
"loss/crossentropy": 1.7596482038497925,
"loss/hidden": 0.2705078125,
"loss/logits": 0.021396052092313766,
"step": 409
},
{
"epoch": 0.41,
"grad_norm": 2.84375,
"grad_norm_var": 0.7240577697753906,
"learning_rate": 2e-05,
"loss": 0.3154,
"loss/crossentropy": 1.080414205789566,
"loss/hidden": 0.29736328125,
"loss/logits": 0.018078335095196962,
"step": 410
},
{
"epoch": 0.411,
"grad_norm": 1.5625,
"grad_norm_var": 0.73785400390625,
"learning_rate": 2e-05,
"loss": 0.2928,
"loss/crossentropy": 2.527972936630249,
"loss/hidden": 0.26953125,
"loss/logits": 0.02323300577700138,
"step": 411
},
{
"epoch": 0.412,
"grad_norm": 1.5078125,
"grad_norm_var": 0.18848851521809895,
"learning_rate": 2e-05,
"loss": 0.2989,
"loss/crossentropy": 1.5808929204940796,
"loss/hidden": 0.28125,
"loss/logits": 0.01763766910880804,
"step": 412
},
{
"epoch": 0.413,
"grad_norm": 1.6328125,
"grad_norm_var": 0.1557037353515625,
"learning_rate": 2e-05,
"loss": 0.3052,
"loss/crossentropy": 2.073564648628235,
"loss/hidden": 0.2841796875,
"loss/logits": 0.021017897874116898,
"step": 413
},
{
"epoch": 0.414,
"grad_norm": 1.703125,
"grad_norm_var": 0.15574951171875,
"learning_rate": 2e-05,
"loss": 0.3341,
"loss/crossentropy": 1.5968445539474487,
"loss/hidden": 0.310546875,
"loss/logits": 0.023572119884192944,
"step": 414
},
{
"epoch": 0.415,
"grad_norm": 1.65625,
"grad_norm_var": 0.15465469360351564,
"learning_rate": 2e-05,
"loss": 0.3319,
"loss/crossentropy": 2.13019335269928,
"loss/hidden": 0.3037109375,
"loss/logits": 0.028160166926681995,
"step": 415
},
{
"epoch": 0.416,
"grad_norm": 1.8828125,
"grad_norm_var": 0.15405044555664063,
"learning_rate": 2e-05,
"loss": 0.2928,
"loss/crossentropy": 1.3558663129806519,
"loss/hidden": 0.2744140625,
"loss/logits": 0.018423012923449278,
"step": 416
},
{
"epoch": 0.417,
"grad_norm": 2.15625,
"grad_norm_var": 0.1642242431640625,
"learning_rate": 2e-05,
"loss": 0.3349,
"loss/crossentropy": 1.556907832622528,
"loss/hidden": 0.310546875,
"loss/logits": 0.0243788855150342,
"step": 417
},
{
"epoch": 0.418,
"grad_norm": 1.765625,
"grad_norm_var": 0.1344879150390625,
"learning_rate": 2e-05,
"loss": 0.293,
"loss/crossentropy": 2.18166720867157,
"loss/hidden": 0.2705078125,
"loss/logits": 0.022501694969832897,
"step": 418
},
{
"epoch": 0.419,
"grad_norm": 5.0,
"grad_norm_var": 0.7930084228515625,
"learning_rate": 2e-05,
"loss": 0.306,
"loss/crossentropy": 1.875123679637909,
"loss/hidden": 0.2841796875,
"loss/logits": 0.021816120482981205,
"step": 419
},
{
"epoch": 0.42,
"grad_norm": 2.0,
"grad_norm_var": 0.7917633056640625,
"learning_rate": 2e-05,
"loss": 0.3207,
"loss/crossentropy": 2.1878353357315063,
"loss/hidden": 0.29296875,
"loss/logits": 0.027718784287571907,
"step": 420
},
{
"epoch": 0.421,
"grad_norm": 2.5,
"grad_norm_var": 0.7763160705566406,
"learning_rate": 2e-05,
"loss": 0.3106,
"loss/crossentropy": 2.46438992023468,
"loss/hidden": 0.2841796875,
"loss/logits": 0.026430321857333183,
"step": 421
},
{
"epoch": 0.422,
"grad_norm": 1.59375,
"grad_norm_var": 0.7701576232910157,
"learning_rate": 2e-05,
"loss": 0.2847,
"loss/crossentropy": 1.991809368133545,
"loss/hidden": 0.265625,
"loss/logits": 0.019083392806351185,
"step": 422
},
{
"epoch": 0.423,
"grad_norm": 2.421875,
"grad_norm_var": 0.7565935770670573,
"learning_rate": 2e-05,
"loss": 0.415,
"loss/crossentropy": 1.6859049797058105,
"loss/hidden": 0.3818359375,
"loss/logits": 0.03313039615750313,
"step": 423
},
{
"epoch": 0.424,
"grad_norm": 1.859375,
"grad_norm_var": 0.7589800516764323,
"learning_rate": 2e-05,
"loss": 0.3098,
"loss/crossentropy": 1.8961586952209473,
"loss/hidden": 0.2900390625,
"loss/logits": 0.019725864753127098,
"step": 424
},
{
"epoch": 0.425,
"grad_norm": 1.6171875,
"grad_norm_var": 0.7438547770182292,
"learning_rate": 2e-05,
"loss": 0.3427,
"loss/crossentropy": 2.085192084312439,
"loss/hidden": 0.31640625,
"loss/logits": 0.026326753199100494,
"step": 425
},
{
"epoch": 0.426,
"grad_norm": 2.078125,
"grad_norm_var": 0.705224609375,
"learning_rate": 2e-05,
"loss": 0.3321,
"loss/crossentropy": 1.912731111049652,
"loss/hidden": 0.3076171875,
"loss/logits": 0.02450721152126789,
"step": 426
},
{
"epoch": 0.427,
"grad_norm": 1.8359375,
"grad_norm_var": 0.6918108622233073,
"learning_rate": 2e-05,
"loss": 0.3396,
"loss/crossentropy": 2.1176230907440186,
"loss/hidden": 0.310546875,
"loss/logits": 0.029072879813611507,
"step": 427
},
{
"epoch": 0.428,
"grad_norm": 1.6015625,
"grad_norm_var": 0.6852617899576823,
"learning_rate": 2e-05,
"loss": 0.318,
"loss/crossentropy": 2.351975202560425,
"loss/hidden": 0.291015625,
"loss/logits": 0.026953624561429024,
"step": 428
},
{
"epoch": 0.429,
"grad_norm": 2.3125,
"grad_norm_var": 0.6734690348307292,
"learning_rate": 2e-05,
"loss": 0.4069,
"loss/crossentropy": 1.6036078929901123,
"loss/hidden": 0.37109375,
"loss/logits": 0.03581710997968912,
"step": 429
},
{
"epoch": 0.43,
"grad_norm": 2.46875,
"grad_norm_var": 0.667138671875,
"learning_rate": 2e-05,
"loss": 0.3472,
"loss/crossentropy": 1.881849765777588,
"loss/hidden": 0.3232421875,
"loss/logits": 0.023961665108799934,
"step": 430
},
{
"epoch": 0.431,
"grad_norm": 3.625,
"grad_norm_var": 0.77403564453125,
"learning_rate": 2e-05,
"loss": 0.3121,
"loss/crossentropy": 2.3671000599861145,
"loss/hidden": 0.2900390625,
"loss/logits": 0.022101588547229767,
"step": 431
},
{
"epoch": 0.432,
"grad_norm": 2.4375,
"grad_norm_var": 0.7627866109212239,
"learning_rate": 2e-05,
"loss": 0.3151,
"loss/crossentropy": 1.1575224101543427,
"loss/hidden": 0.298828125,
"loss/logits": 0.016257786191999912,
"step": 432
},
{
"epoch": 0.433,
"grad_norm": 5.40625,
"grad_norm_var": 1.3478289286295573,
"learning_rate": 2e-05,
"loss": 0.3283,
"loss/crossentropy": 1.3821857124567032,
"loss/hidden": 0.3115234375,
"loss/logits": 0.016785149462521076,
"step": 433
},
{
"epoch": 0.434,
"grad_norm": 2.140625,
"grad_norm_var": 1.3182634989420572,
"learning_rate": 2e-05,
"loss": 0.3499,
"loss/crossentropy": 1.4704007506370544,
"loss/hidden": 0.326171875,
"loss/logits": 0.02373607736080885,
"step": 434
},
{
"epoch": 0.435,
"grad_norm": 1.7890625,
"grad_norm_var": 0.9163736979166667,
"learning_rate": 2e-05,
"loss": 0.3453,
"loss/crossentropy": 1.7521992325782776,
"loss/hidden": 0.322265625,
"loss/logits": 0.023045840673148632,
"step": 435
},
{
"epoch": 0.436,
"grad_norm": 2.203125,
"grad_norm_var": 0.9093251546223958,
"learning_rate": 2e-05,
"loss": 0.3079,
"loss/crossentropy": 1.4147529304027557,
"loss/hidden": 0.2919921875,
"loss/logits": 0.01587154157459736,
"step": 436
},
{
"epoch": 0.437,
"grad_norm": 1.78125,
"grad_norm_var": 0.9289784749348958,
"learning_rate": 2e-05,
"loss": 0.3572,
"loss/crossentropy": 2.1589527130126953,
"loss/hidden": 0.330078125,
"loss/logits": 0.027110325172543526,
"step": 437
},
{
"epoch": 0.438,
"grad_norm": 1.546875,
"grad_norm_var": 0.9336751302083334,
"learning_rate": 2e-05,
"loss": 0.3112,
"loss/crossentropy": 2.0695826411247253,
"loss/hidden": 0.2890625,
"loss/logits": 0.022175450809299946,
"step": 438
},
{
"epoch": 0.439,
"grad_norm": 8.6875,
"grad_norm_var": 3.472150675455729,
"learning_rate": 2e-05,
"loss": 0.3174,
"loss/crossentropy": 2.715834140777588,
"loss/hidden": 0.2919921875,
"loss/logits": 0.02542768605053425,
"step": 439
},
{
"epoch": 0.44,
"grad_norm": 2.0625,
"grad_norm_var": 3.4516398111979165,
"learning_rate": 2e-05,
"loss": 0.3531,
"loss/crossentropy": 2.089130699634552,
"loss/hidden": 0.326171875,
"loss/logits": 0.026951050385832787,
"step": 440
},
{
"epoch": 0.441,
"grad_norm": 5.6875,
"grad_norm_var": 3.8860979715983075,
"learning_rate": 2e-05,
"loss": 0.352,
"loss/crossentropy": 1.6687681376934052,
"loss/hidden": 0.3330078125,
"loss/logits": 0.018973306752741337,
"step": 441
},
{
"epoch": 0.442,
"grad_norm": 1.6953125,
"grad_norm_var": 3.941239420572917,
"learning_rate": 2e-05,
"loss": 0.354,
"loss/crossentropy": 1.4019538760185242,
"loss/hidden": 0.33203125,
"loss/logits": 0.021962410770356655,
"step": 442
},
{
"epoch": 0.443,
"grad_norm": 2.453125,
"grad_norm_var": 3.8729509989420574,
"learning_rate": 2e-05,
"loss": 0.3591,
"loss/crossentropy": 2.068819046020508,
"loss/hidden": 0.328125,
"loss/logits": 0.03100405167788267,
"step": 443
},
{
"epoch": 0.444,
"grad_norm": 2.625,
"grad_norm_var": 3.7484527587890626,
"learning_rate": 2e-05,
"loss": 0.3207,
"loss/crossentropy": 1.2215966582298279,
"loss/hidden": 0.306640625,
"loss/logits": 0.014033652492798865,
"step": 444
},
{
"epoch": 0.445,
"grad_norm": 2.796875,
"grad_norm_var": 3.7149943033854167,
"learning_rate": 2e-05,
"loss": 0.2843,
"loss/crossentropy": 0.8393277078866959,
"loss/hidden": 0.2734375,
"loss/logits": 0.010860613780096173,
"step": 445
},
{
"epoch": 0.446,
"grad_norm": 3.6875,
"grad_norm_var": 3.7072184244791666,
"learning_rate": 2e-05,
"loss": 0.3369,
"loss/crossentropy": 0.8106656819581985,
"loss/hidden": 0.32421875,
"loss/logits": 0.01267361780628562,
"step": 446
},
{
"epoch": 0.447,
"grad_norm": 4.28125,
"grad_norm_var": 3.774466959635417,
"learning_rate": 2e-05,
"loss": 0.3246,
"loss/crossentropy": 1.0552468746900558,
"loss/hidden": 0.3095703125,
"loss/logits": 0.015042064245790243,
"step": 447
},
{
"epoch": 0.448,
"grad_norm": 2.734375,
"grad_norm_var": 3.749592081705729,
"learning_rate": 2e-05,
"loss": 0.3734,
"loss/crossentropy": 2.4344149827957153,
"loss/hidden": 0.3427734375,
"loss/logits": 0.030597456730902195,
"step": 448
},
{
"epoch": 0.449,
"grad_norm": 3.984375,
"grad_norm_var": 3.4621622721354166,
"learning_rate": 2e-05,
"loss": 0.3036,
"loss/crossentropy": 1.054320715367794,
"loss/hidden": 0.28857421875,
"loss/logits": 0.014980267733335495,
"step": 449
},
{
"epoch": 0.45,
"grad_norm": 1.8359375,
"grad_norm_var": 3.5083513895670575,
"learning_rate": 2e-05,
"loss": 0.3366,
"loss/crossentropy": 2.0155181288719177,
"loss/hidden": 0.310546875,
"loss/logits": 0.02600990142673254,
"step": 450
},
{
"epoch": 0.451,
"grad_norm": 2.0,
"grad_norm_var": 3.4738199869791666,
"learning_rate": 2e-05,
"loss": 0.3511,
"loss/crossentropy": 1.755088448524475,
"loss/hidden": 0.3271484375,
"loss/logits": 0.023935355246067047,
"step": 451
},
{
"epoch": 0.452,
"grad_norm": 2.046875,
"grad_norm_var": 3.4946329752604166,
"learning_rate": 2e-05,
"loss": 0.3499,
"loss/crossentropy": 1.7622599005699158,
"loss/hidden": 0.326171875,
"loss/logits": 0.023745747283101082,
"step": 452
},
{
"epoch": 0.453,
"grad_norm": 1.7890625,
"grad_norm_var": 3.4932431538899738,
"learning_rate": 2e-05,
"loss": 0.3215,
"loss/crossentropy": 2.3116530179977417,
"loss/hidden": 0.298828125,
"loss/logits": 0.022703303024172783,
"step": 453
},
{
"epoch": 0.454,
"grad_norm": 1.6875,
"grad_norm_var": 3.464989980061849,
"learning_rate": 2e-05,
"loss": 0.3673,
"loss/crossentropy": 1.5556917786598206,
"loss/hidden": 0.3408203125,
"loss/logits": 0.026494111865758896,
"step": 454
},
{
"epoch": 0.455,
"grad_norm": 2.0,
"grad_norm_var": 1.3033078511555989,
"learning_rate": 2e-05,
"loss": 0.3715,
"loss/crossentropy": 1.7844219207763672,
"loss/hidden": 0.345703125,
"loss/logits": 0.02580021321773529,
"step": 455
},
{
"epoch": 0.456,
"grad_norm": 2.53125,
"grad_norm_var": 1.2765439351399739,
"learning_rate": 2e-05,
"loss": 0.448,
"loss/crossentropy": 1.2347650527954102,
"loss/hidden": 0.4150390625,
"loss/logits": 0.0329879354685545,
"step": 456
},
{
"epoch": 0.457,
"grad_norm": 1.4375,
"grad_norm_var": 0.7350563049316406,
"learning_rate": 2e-05,
"loss": 0.3455,
"loss/crossentropy": 1.9715585112571716,
"loss/hidden": 0.318359375,
"loss/logits": 0.02718514297157526,
"step": 457
},
{
"epoch": 0.458,
"grad_norm": 1.5859375,
"grad_norm_var": 0.7471616109212239,
"learning_rate": 2e-05,
"loss": 0.3339,
"loss/crossentropy": 2.389525294303894,
"loss/hidden": 0.30859375,
"loss/logits": 0.025292156264185905,
"step": 458
},
{
"epoch": 0.459,
"grad_norm": 1.4921875,
"grad_norm_var": 0.8066884358723958,
"learning_rate": 2e-05,
"loss": 0.3166,
"loss/crossentropy": 1.7892733812332153,
"loss/hidden": 0.29296875,
"loss/logits": 0.023592060431838036,
"step": 459
},
{
"epoch": 0.46,
"grad_norm": 1.8125,
"grad_norm_var": 0.8243560791015625,
"learning_rate": 2e-05,
"loss": 0.3353,
"loss/crossentropy": 1.9092342853546143,
"loss/hidden": 0.3115234375,
"loss/logits": 0.02376522123813629,
"step": 460
},
{
"epoch": 0.461,
"grad_norm": 1.34375,
"grad_norm_var": 0.87099609375,
"learning_rate": 2e-05,
"loss": 0.349,
"loss/crossentropy": 1.9013403058052063,
"loss/hidden": 0.3251953125,
"loss/logits": 0.02381738182157278,
"step": 461
},
{
"epoch": 0.462,
"grad_norm": 2.9375,
"grad_norm_var": 0.76396484375,
"learning_rate": 2e-05,
"loss": 0.3492,
"loss/crossentropy": 0.9097070023417473,
"loss/hidden": 0.330078125,
"loss/logits": 0.01913693710230291,
"step": 462
},
{
"epoch": 0.463,
"grad_norm": 2.828125,
"grad_norm_var": 0.4963287353515625,
"learning_rate": 2e-05,
"loss": 0.4669,
"loss/crossentropy": 1.9413211345672607,
"loss/hidden": 0.427734375,
"loss/logits": 0.03912976011633873,
"step": 463
},
{
"epoch": 0.464,
"grad_norm": 1.9296875,
"grad_norm_var": 0.47173233032226564,
"learning_rate": 2e-05,
"loss": 0.3569,
"loss/crossentropy": 2.3746496438980103,
"loss/hidden": 0.326171875,
"loss/logits": 0.030762989073991776,
"step": 464
},
{
"epoch": 0.465,
"grad_norm": 1.796875,
"grad_norm_var": 0.21467259724934895,
"learning_rate": 2e-05,
"loss": 0.3875,
"loss/crossentropy": 1.920172929763794,
"loss/hidden": 0.359375,
"loss/logits": 0.028154666535556316,
"step": 465
},
{
"epoch": 0.466,
"grad_norm": 2.59375,
"grad_norm_var": 0.23995768229166667,
"learning_rate": 2e-05,
"loss": 0.4173,
"loss/crossentropy": 2.1804317831993103,
"loss/hidden": 0.3828125,
"loss/logits": 0.03448019549250603,
"step": 466
},
{
"epoch": 0.467,
"grad_norm": 2.453125,
"grad_norm_var": 0.25349833170572916,
"learning_rate": 2e-05,
"loss": 0.3635,
"loss/crossentropy": 2.1129865646362305,
"loss/hidden": 0.3369140625,
"loss/logits": 0.026613284833729267,
"step": 467
},
{
"epoch": 0.468,
"grad_norm": 3.4375,
"grad_norm_var": 0.37997639973958336,
"learning_rate": 2e-05,
"loss": 0.3892,
"loss/crossentropy": 1.6438812613487244,
"loss/hidden": 0.3623046875,
"loss/logits": 0.026910429820418358,
"step": 468
},
{
"epoch": 0.469,
"grad_norm": 13.125,
"grad_norm_var": 7.936161041259766,
"learning_rate": 2e-05,
"loss": 0.4187,
"loss/crossentropy": 1.8062403798103333,
"loss/hidden": 0.3857421875,
"loss/logits": 0.03291827440261841,
"step": 469
},
{
"epoch": 0.47,
"grad_norm": 3.421875,
"grad_norm_var": 7.8641212463378904,
"learning_rate": 2e-05,
"loss": 0.4157,
"loss/crossentropy": 1.2208881378173828,
"loss/hidden": 0.39453125,
"loss/logits": 0.02117818035185337,
"step": 470
},
{
"epoch": 0.471,
"grad_norm": 1.953125,
"grad_norm_var": 7.8700111389160154,
"learning_rate": 2e-05,
"loss": 0.3306,
"loss/crossentropy": 2.474324107170105,
"loss/hidden": 0.3037109375,
"loss/logits": 0.026909410022199154,
"step": 471
},
{
"epoch": 0.472,
"grad_norm": 2.796875,
"grad_norm_var": 7.860741933186849,
"learning_rate": 2e-05,
"loss": 0.4071,
"loss/crossentropy": 1.8907885551452637,
"loss/hidden": 0.3740234375,
"loss/logits": 0.03311134688556194,
"step": 472
},
{
"epoch": 0.473,
"grad_norm": 5.40625,
"grad_norm_var": 8.053236643473307,
"learning_rate": 2e-05,
"loss": 0.482,
"loss/crossentropy": 1.851112186908722,
"loss/hidden": 0.4287109375,
"loss/logits": 0.0532735763117671,
"step": 473
},
{
"epoch": 0.474,
"grad_norm": 1.8125,
"grad_norm_var": 8.008226521809895,
"learning_rate": 2e-05,
"loss": 0.4011,
"loss/crossentropy": 2.0893144607543945,
"loss/hidden": 0.37109375,
"loss/logits": 0.03000558167695999,
"step": 474
},
{
"epoch": 0.475,
"grad_norm": 1.84375,
"grad_norm_var": 7.936071523030599,
"learning_rate": 2e-05,
"loss": 0.4086,
"loss/crossentropy": 1.692557156085968,
"loss/hidden": 0.37890625,
"loss/logits": 0.029658248648047447,
"step": 475
},
{
"epoch": 0.476,
"grad_norm": 1.734375,
"grad_norm_var": 7.9510963439941404,
"learning_rate": 2e-05,
"loss": 0.3369,
"loss/crossentropy": 2.7231298685073853,
"loss/hidden": 0.3095703125,
"loss/logits": 0.027365448884665966,
"step": 476
},
{
"epoch": 0.477,
"grad_norm": 122.5,
"grad_norm_var": 895.1761065165202,
"learning_rate": 2e-05,
"loss": 1.8739,
"loss/crossentropy": 1.9931391477584839,
"loss/hidden": 1.73828125,
"loss/logits": 0.13565433584153652,
"step": 477
},
{
"epoch": 0.478,
"grad_norm": 18.75,
"grad_norm_var": 894.2567990620931,
"learning_rate": 2e-05,
"loss": 0.4467,
"loss/crossentropy": 1.0818050801753998,
"loss/hidden": 0.423828125,
"loss/logits": 0.022886332124471664,
"step": 478
},
{
"epoch": 0.479,
"grad_norm": 1.9609375,
"grad_norm_var": 895.3381581624349,
"learning_rate": 2e-05,
"loss": 0.3744,
"loss/crossentropy": 2.382234215736389,
"loss/hidden": 0.3447265625,
"loss/logits": 0.029717115685343742,
"step": 479
},
{
"epoch": 0.48,
"grad_norm": 1.71875,
"grad_norm_var": 895.6162839253743,
"learning_rate": 2e-05,
"loss": 0.3323,
"loss/crossentropy": 2.0683305859565735,
"loss/hidden": 0.30859375,
"loss/logits": 0.023680799640715122,
"step": 480
},
{
"epoch": 0.481,
"grad_norm": 2.546875,
"grad_norm_var": 894.6604733784993,
"learning_rate": 2e-05,
"loss": 0.3756,
"loss/crossentropy": 2.154377818107605,
"loss/hidden": 0.34765625,
"loss/logits": 0.02795298583805561,
"step": 481
},
{
"epoch": 0.482,
"grad_norm": 3.1875,
"grad_norm_var": 893.9573666890462,
"learning_rate": 2e-05,
"loss": 0.4124,
"loss/crossentropy": 1.9701088666915894,
"loss/hidden": 0.3779296875,
"loss/logits": 0.03451960347592831,
"step": 482
},
{
"epoch": 0.483,
"grad_norm": 2.5,
"grad_norm_var": 893.8991452534993,
"learning_rate": 2e-05,
"loss": 0.4523,
"loss/crossentropy": 0.9486123919487,
"loss/hidden": 0.4306640625,
"loss/logits": 0.02167674619704485,
"step": 483
},
{
"epoch": 0.484,
"grad_norm": 2.578125,
"grad_norm_var": 894.9027565002441,
"learning_rate": 2e-05,
"loss": 0.3955,
"loss/crossentropy": 1.7118502855300903,
"loss/hidden": 0.365234375,
"loss/logits": 0.030311796814203262,
"step": 484
},
{
"epoch": 0.485,
"grad_norm": 1.890625,
"grad_norm_var": 900.7159604390462,
"learning_rate": 2e-05,
"loss": 0.3914,
"loss/crossentropy": 1.7511045932769775,
"loss/hidden": 0.36328125,
"loss/logits": 0.02810109406709671,
"step": 485
},
{
"epoch": 0.486,
"grad_norm": 2.203125,
"grad_norm_var": 902.0463498433431,
"learning_rate": 2e-05,
"loss": 0.3893,
"loss/crossentropy": 1.9742628931999207,
"loss/hidden": 0.3603515625,
"loss/logits": 0.028935128822922707,
"step": 486
},
{
"epoch": 0.487,
"grad_norm": 2.609375,
"grad_norm_var": 901.28504002889,
"learning_rate": 2e-05,
"loss": 0.338,
"loss/crossentropy": 1.5944682955741882,
"loss/hidden": 0.31640625,
"loss/logits": 0.02155130822211504,
"step": 487
},
{
"epoch": 0.488,
"grad_norm": 2.0,
"grad_norm_var": 902.1965695699056,
"learning_rate": 2e-05,
"loss": 0.3749,
"loss/crossentropy": 2.109809994697571,
"loss/hidden": 0.3486328125,
"loss/logits": 0.026237317360937595,
"step": 488
},
{
"epoch": 0.489,
"grad_norm": 2.828125,
"grad_norm_var": 904.5185605367025,
"learning_rate": 2e-05,
"loss": 0.3601,
"loss/crossentropy": 2.371906280517578,
"loss/hidden": 0.33203125,
"loss/logits": 0.0280781090259552,
"step": 489
},
{
"epoch": 0.49,
"grad_norm": 2.25,
"grad_norm_var": 904.0067481994629,
"learning_rate": 2e-05,
"loss": 0.3881,
"loss/crossentropy": 2.3074965476989746,
"loss/hidden": 0.3583984375,
"loss/logits": 0.029700559563934803,
"step": 490
},
{
"epoch": 0.491,
"grad_norm": 1.609375,
"grad_norm_var": 904.2906532287598,
"learning_rate": 2e-05,
"loss": 0.3533,
"loss/crossentropy": 2.0604811906814575,
"loss/hidden": 0.3271484375,
"loss/logits": 0.026149596087634563,
"step": 491
},
{
"epoch": 0.492,
"grad_norm": 2.203125,
"grad_norm_var": 903.7375221252441,
"learning_rate": 2e-05,
"loss": 0.3982,
"loss/crossentropy": 2.0394086837768555,
"loss/hidden": 0.3671875,
"loss/logits": 0.030979415401816368,
"step": 492
},
{
"epoch": 0.493,
"grad_norm": 1.53125,
"grad_norm_var": 17.239774322509767,
"learning_rate": 2e-05,
"loss": 0.3721,
"loss/crossentropy": 1.992867350578308,
"loss/hidden": 0.3447265625,
"loss/logits": 0.02732760366052389,
"step": 493
},
{
"epoch": 0.494,
"grad_norm": 1.5,
"grad_norm_var": 0.24021377563476562,
"learning_rate": 2e-05,
"loss": 0.3607,
"loss/crossentropy": 2.0647668838500977,
"loss/hidden": 0.3349609375,
"loss/logits": 0.02573198452591896,
"step": 494
},
{
"epoch": 0.495,
"grad_norm": 3.265625,
"grad_norm_var": 0.3059153238932292,
"learning_rate": 2e-05,
"loss": 0.4332,
"loss/crossentropy": 2.0061678886413574,
"loss/hidden": 0.4033203125,
"loss/logits": 0.029847824946045876,
"step": 495
},
{
"epoch": 0.496,
"grad_norm": 1.671875,
"grad_norm_var": 0.30953776041666664,
"learning_rate": 2e-05,
"loss": 0.3677,
"loss/crossentropy": 2.029963493347168,
"loss/hidden": 0.3408203125,
"loss/logits": 0.026841914281249046,
"step": 496
},
{
"epoch": 0.497,
"grad_norm": 2.1875,
"grad_norm_var": 0.3045074462890625,
"learning_rate": 2e-05,
"loss": 0.3773,
"loss/crossentropy": 1.836094081401825,
"loss/hidden": 0.3505859375,
"loss/logits": 0.026703315787017345,
"step": 497
},
{
"epoch": 0.498,
"grad_norm": 1.8984375,
"grad_norm_var": 0.24739761352539064,
"learning_rate": 2e-05,
"loss": 0.3934,
"loss/crossentropy": 2.284022331237793,
"loss/hidden": 0.36328125,
"loss/logits": 0.030102317221462727,
"step": 498
},
{
"epoch": 0.499,
"grad_norm": 1.609375,
"grad_norm_var": 0.25783462524414064,
"learning_rate": 2e-05,
"loss": 0.422,
"loss/crossentropy": 1.7640503644943237,
"loss/hidden": 0.388671875,
"loss/logits": 0.03330034948885441,
"step": 499
},
{
"epoch": 0.5,
"grad_norm": 2.40625,
"grad_norm_var": 0.2490618387858073,
"learning_rate": 2e-05,
"loss": 0.4409,
"loss/crossentropy": 1.4432637095451355,
"loss/hidden": 0.4130859375,
"loss/logits": 0.027862844988703728,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2202930782208e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}