| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5, |
| "eval_steps": 250, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001, |
| "grad_norm": 0.000537872314453125, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.0002, |
| "loss/crossentropy": 0.8766392022371292, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.00021765431665698998, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 0.384765625, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.0055, |
| "loss/crossentropy": 1.988494873046875, |
| "loss/hidden": 0.0043487548828125, |
| "loss/logits": 0.0011881994432769716, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 0.328125, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.0056, |
| "loss/crossentropy": 1.8016360402107239, |
| "loss/hidden": 0.0044403076171875, |
| "loss/logits": 0.0011122562573291361, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 0.490234375, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.0053, |
| "loss/crossentropy": 1.0765393376350403, |
| "loss/hidden": 0.004302978515625, |
| "loss/logits": 0.000948212924413383, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 0.28125, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0052, |
| "loss/crossentropy": 1.7854897379875183, |
| "loss/hidden": 0.004058837890625, |
| "loss/logits": 0.0011507467716000974, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 0.3828125, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.0055, |
| "loss/crossentropy": 2.4101182222366333, |
| "loss/hidden": 0.004180908203125, |
| "loss/logits": 0.0012829686747863889, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 0.61328125, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.0058, |
| "loss/crossentropy": 1.992232859134674, |
| "loss/hidden": 0.004608154296875, |
| "loss/logits": 0.0011995871318504214, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 0.283203125, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.0051, |
| "loss/crossentropy": 2.268880248069763, |
| "loss/hidden": 0.00394439697265625, |
| "loss/logits": 0.0012029792997054756, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 0.384765625, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0055, |
| "loss/crossentropy": 2.190282464027405, |
| "loss/hidden": 0.00421142578125, |
| "loss/logits": 0.0012559492606669664, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.4765625, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0057, |
| "loss/crossentropy": 1.7616363763809204, |
| "loss/hidden": 0.0045318603515625, |
| "loss/logits": 0.0011331779533065856, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 0.328125, |
| "learning_rate": 2.2e-06, |
| "loss": 0.0053, |
| "loss/crossentropy": 2.4380578994750977, |
| "loss/hidden": 0.00406646728515625, |
| "loss/logits": 0.0012633077567443252, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 0.60546875, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0083, |
| "loss/crossentropy": 1.8881370425224304, |
| "loss/hidden": 0.006866455078125, |
| "loss/logits": 0.0014203558093868196, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 0.7734375, |
| "learning_rate": 2.6e-06, |
| "loss": 0.0096, |
| "loss/crossentropy": 1.7407108545303345, |
| "loss/hidden": 0.008087158203125, |
| "loss/logits": 0.0015171858831308782, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 0.6171875, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.0088, |
| "loss/crossentropy": 2.006006360054016, |
| "loss/hidden": 0.0072479248046875, |
| "loss/logits": 0.0015724042314104736, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3e-06, |
| "loss": 0.008, |
| "loss/crossentropy": 1.985671579837799, |
| "loss/hidden": 0.006500244140625, |
| "loss/logits": 0.001542731188237667, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.470703125, |
| "grad_norm_var": 0.032565849785108486, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0078, |
| "loss/crossentropy": 2.473353385925293, |
| "loss/hidden": 0.0061492919921875, |
| "loss/logits": 0.0016758597921580076, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 0.4296875, |
| "grad_norm_var": 0.019274123509724937, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.008, |
| "loss/crossentropy": 1.7784300446510315, |
| "loss/hidden": 0.0065765380859375, |
| "loss/logits": 0.0014022670220583677, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 0.7109375, |
| "grad_norm_var": 0.022616004943847655, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0077, |
| "loss/crossentropy": 1.1124538108706474, |
| "loss/hidden": 0.0066070556640625, |
| "loss/logits": 0.0011093285284005105, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 0.392578125, |
| "grad_norm_var": 0.021560144424438477, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.0077, |
| "loss/crossentropy": 1.8864194750785828, |
| "loss/hidden": 0.006195068359375, |
| "loss/logits": 0.0014855173067189753, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.5234375, |
| "grad_norm_var": 0.021651204427083334, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0077, |
| "loss/crossentropy": 1.7746207118034363, |
| "loss/hidden": 0.00634765625, |
| "loss/logits": 0.0013394113047979772, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 0.44140625, |
| "grad_norm_var": 0.018854204813639322, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.0076, |
| "loss/crossentropy": 2.1028923988342285, |
| "loss/hidden": 0.0061798095703125, |
| "loss/logits": 0.0014341563801281154, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 0.51171875, |
| "grad_norm_var": 0.017924753824869792, |
| "learning_rate": 4.4e-06, |
| "loss": 0.0115, |
| "loss/crossentropy": 1.9447378516197205, |
| "loss/hidden": 0.00946044921875, |
| "loss/logits": 0.0020560644334182143, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 0.6640625, |
| "grad_norm_var": 0.018816566467285155, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.0134, |
| "loss/crossentropy": 1.6007800102233887, |
| "loss/hidden": 0.011474609375, |
| "loss/logits": 0.0019165858393535018, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 0.91796875, |
| "grad_norm_var": 0.024927632013956705, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0117, |
| "loss/crossentropy": 1.1415547728538513, |
| "loss/hidden": 0.010284423828125, |
| "loss/logits": 0.0013778514403384179, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 0.474609375, |
| "grad_norm_var": 0.02347410519917806, |
| "learning_rate": 5e-06, |
| "loss": 0.0107, |
| "loss/crossentropy": 2.0954560041427612, |
| "loss/hidden": 0.00872802734375, |
| "loss/logits": 0.0019610102754086256, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 0.03698919614156087, |
| "learning_rate": 5.2e-06, |
| "loss": 0.0129, |
| "loss/crossentropy": 0.9556205421686172, |
| "loss/hidden": 0.0118408203125, |
| "loss/logits": 0.0010545893164817244, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 0.65234375, |
| "grad_norm_var": 0.03230322202046712, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 0.0106, |
| "loss/crossentropy": 1.372634619474411, |
| "loss/hidden": 0.009185791015625, |
| "loss/logits": 0.0014620812726207078, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 0.64453125, |
| "grad_norm_var": 0.03238142331441243, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 0.0121, |
| "loss/crossentropy": 1.6492629051208496, |
| "loss/hidden": 0.010223388671875, |
| "loss/logits": 0.0018439113046042621, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 0.546875, |
| "grad_norm_var": 0.03068884213765462, |
| "learning_rate": 5.8e-06, |
| "loss": 0.0108, |
| "loss/crossentropy": 1.8000301718711853, |
| "loss/hidden": 0.009033203125, |
| "loss/logits": 0.0017924956628121436, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.388671875, |
| "grad_norm_var": 0.03333886464436849, |
| "learning_rate": 6e-06, |
| "loss": 0.0105, |
| "loss/crossentropy": 1.7418496012687683, |
| "loss/hidden": 0.00885009765625, |
| "loss/logits": 0.0016374444239772856, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 0.65234375, |
| "grad_norm_var": 0.03335774739583333, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 0.0108, |
| "loss/crossentropy": 1.4579273164272308, |
| "loss/hidden": 0.009185791015625, |
| "loss/logits": 0.001588685205206275, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.56640625, |
| "grad_norm_var": 0.03239744504292806, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.0154, |
| "loss/crossentropy": 1.6372994184494019, |
| "loss/hidden": 0.013214111328125, |
| "loss/logits": 0.0022216038778424263, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 1.3828125, |
| "grad_norm_var": 0.06793796221415202, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.0166, |
| "loss/crossentropy": 1.036409616470337, |
| "loss/hidden": 0.015045166015625, |
| "loss/logits": 0.0015461337170563638, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 0.56640625, |
| "grad_norm_var": 0.06819202105204264, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 0.0148, |
| "loss/crossentropy": 2.006142556667328, |
| "loss/hidden": 0.012542724609375, |
| "loss/logits": 0.002277214080095291, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 0.07729434967041016, |
| "learning_rate": 7e-06, |
| "loss": 0.016, |
| "loss/crossentropy": 1.842608094215393, |
| "loss/hidden": 0.013641357421875, |
| "loss/logits": 0.0023830662248656154, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 0.7265625, |
| "grad_norm_var": 0.07526442209879557, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.016, |
| "loss/crossentropy": 1.9097451567649841, |
| "loss/hidden": 0.013580322265625, |
| "loss/logits": 0.0024145807838067412, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 57.0, |
| "grad_norm_var": 198.00732879638673, |
| "learning_rate": 7.4e-06, |
| "loss": 0.0439, |
| "loss/crossentropy": 1.5945889949798584, |
| "loss/hidden": 0.0399169921875, |
| "loss/logits": 0.003960137953981757, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 0.5078125, |
| "grad_norm_var": 198.00927219390869, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 0.0149, |
| "loss/crossentropy": 2.260584235191345, |
| "loss/hidden": 0.01251220703125, |
| "loss/logits": 0.002418684889562428, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 0.53515625, |
| "grad_norm_var": 198.0717887878418, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.0143, |
| "loss/crossentropy": 2.127864420413971, |
| "loss/hidden": 0.012115478515625, |
| "loss/logits": 0.0022204924607649446, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.546875, |
| "grad_norm_var": 198.24441623687744, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0155, |
| "loss/crossentropy": 1.4864744544029236, |
| "loss/hidden": 0.0135498046875, |
| "loss/logits": 0.0019313275697641075, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 0.337890625, |
| "grad_norm_var": 198.3136723836263, |
| "learning_rate": 8.2e-06, |
| "loss": 0.0126, |
| "loss/crossentropy": 2.0884488821029663, |
| "loss/hidden": 0.0106201171875, |
| "loss/logits": 0.0020087960874661803, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 198.51614983876547, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 0.02, |
| "loss/crossentropy": 1.6123111844062805, |
| "loss/hidden": 0.0174560546875, |
| "loss/logits": 0.0025139962090179324, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 0.69921875, |
| "grad_norm_var": 198.49428246815998, |
| "learning_rate": 8.6e-06, |
| "loss": 0.0191, |
| "loss/crossentropy": 1.899111807346344, |
| "loss/hidden": 0.01654052734375, |
| "loss/logits": 0.0025533820735290647, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 0.6015625, |
| "grad_norm_var": 198.51463038126627, |
| "learning_rate": 8.8e-06, |
| "loss": 0.0209, |
| "loss/crossentropy": 1.410634160041809, |
| "loss/hidden": 0.01812744140625, |
| "loss/logits": 0.002780333859845996, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 0.4609375, |
| "grad_norm_var": 198.55664520263673, |
| "learning_rate": 9e-06, |
| "loss": 0.0181, |
| "loss/crossentropy": 1.6974017024040222, |
| "loss/hidden": 0.01580810546875, |
| "loss/logits": 0.0023181557189673185, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 0.478515625, |
| "grad_norm_var": 198.51187686920167, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 0.0187, |
| "loss/crossentropy": 2.1352469325065613, |
| "loss/hidden": 0.01611328125, |
| "loss/logits": 0.002572571625933051, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 198.18177642822266, |
| "learning_rate": 9.4e-06, |
| "loss": 0.0191, |
| "loss/crossentropy": 1.6112200021743774, |
| "loss/hidden": 0.01678466796875, |
| "loss/logits": 0.0022820517187938094, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.462890625, |
| "grad_norm_var": 198.23291001319885, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 0.0176, |
| "loss/crossentropy": 2.0570507049560547, |
| "loss/hidden": 0.01519775390625, |
| "loss/logits": 0.002418220858089626, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 0.5078125, |
| "grad_norm_var": 198.61132187843322, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.0181, |
| "loss/crossentropy": 1.5905100107192993, |
| "loss/hidden": 0.0157470703125, |
| "loss/logits": 0.0023602789733558893, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.48046875, |
| "grad_norm_var": 198.65297722816467, |
| "learning_rate": 1e-05, |
| "loss": 0.0174, |
| "loss/crossentropy": 2.3879631757736206, |
| "loss/hidden": 0.014984130859375, |
| "loss/logits": 0.0024567440850660205, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 0.6875, |
| "grad_norm_var": 198.84488053321837, |
| "learning_rate": 1.02e-05, |
| "loss": 0.0188, |
| "loss/crossentropy": 2.015933036804199, |
| "loss/hidden": 0.0164794921875, |
| "loss/logits": 0.0023623716551810503, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 0.52734375, |
| "grad_norm_var": 198.93771958351135, |
| "learning_rate": 1.04e-05, |
| "loss": 0.0237, |
| "loss/crossentropy": 1.9334338307380676, |
| "loss/hidden": 0.02069091796875, |
| "loss/logits": 0.0029913606122136116, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 0.73828125, |
| "grad_norm_var": 0.0602358341217041, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 0.0234, |
| "loss/crossentropy": 1.988130509853363, |
| "loss/hidden": 0.02056884765625, |
| "loss/logits": 0.0028755036182701588, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 0.87890625, |
| "grad_norm_var": 0.06430675188700358, |
| "learning_rate": 1.0800000000000002e-05, |
| "loss": 0.0225, |
| "loss/crossentropy": 1.4915976524353027, |
| "loss/hidden": 0.02008056640625, |
| "loss/logits": 0.0023959834361448884, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 0.5859375, |
| "grad_norm_var": 0.0638753096262614, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.0212, |
| "loss/crossentropy": 1.7327674627304077, |
| "loss/hidden": 0.0186767578125, |
| "loss/logits": 0.002566903829574585, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 0.53515625, |
| "grad_norm_var": 0.06400729815165201, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 0.0221, |
| "loss/crossentropy": 1.8408621549606323, |
| "loss/hidden": 0.01947021484375, |
| "loss/logits": 0.0026238159043714404, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 0.8984375, |
| "grad_norm_var": 0.06217803955078125, |
| "learning_rate": 1.14e-05, |
| "loss": 0.0213, |
| "loss/crossentropy": 1.32709925994277, |
| "loss/hidden": 0.01947021484375, |
| "loss/logits": 0.001822101214202121, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 0.482421875, |
| "grad_norm_var": 0.06383576393127441, |
| "learning_rate": 1.16e-05, |
| "loss": 0.0211, |
| "loss/crossentropy": 2.5516231060028076, |
| "loss/hidden": 0.018310546875, |
| "loss/logits": 0.002763173426501453, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 0.11474061012268066, |
| "learning_rate": 1.18e-05, |
| "loss": 0.0216, |
| "loss/crossentropy": 1.093215293250978, |
| "loss/hidden": 0.02008056640625, |
| "loss/logits": 0.001494493626523763, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 0.12085061073303223, |
| "learning_rate": 1.2e-05, |
| "loss": 0.023, |
| "loss/crossentropy": 2.0825194716453552, |
| "loss/hidden": 0.02008056640625, |
| "loss/logits": 0.002906191977672279, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 0.1564039707183838, |
| "learning_rate": 1.22e-05, |
| "loss": 0.0219, |
| "loss/crossentropy": 0.930735819041729, |
| "loss/hidden": 0.02020263671875, |
| "loss/logits": 0.0017190971411764622, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 0.60546875, |
| "grad_norm_var": 0.15190048217773439, |
| "learning_rate": 1.2400000000000002e-05, |
| "loss": 0.026, |
| "loss/crossentropy": 2.1702520847320557, |
| "loss/hidden": 0.0228271484375, |
| "loss/logits": 0.0031759394332766533, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 0.90234375, |
| "grad_norm_var": 0.1251688003540039, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 0.0268, |
| "loss/crossentropy": 2.155192196369171, |
| "loss/hidden": 0.02337646484375, |
| "loss/logits": 0.0033858821261674166, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.66015625, |
| "grad_norm_var": 0.11929802894592285, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 0.0261, |
| "loss/crossentropy": 1.952944815158844, |
| "loss/hidden": 0.02276611328125, |
| "loss/logits": 0.0033275720197707415, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 0.81640625, |
| "grad_norm_var": 0.11360230445861816, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.0294, |
| "loss/crossentropy": 1.8505353331565857, |
| "loss/hidden": 0.02606201171875, |
| "loss/logits": 0.0032885426189750433, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 0.10978213946024577, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": 0.0254, |
| "loss/crossentropy": 1.9442384243011475, |
| "loss/hidden": 0.02252197265625, |
| "loss/logits": 0.002852104022167623, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 0.63671875, |
| "grad_norm_var": 0.11081693967183431, |
| "learning_rate": 1.3400000000000002e-05, |
| "loss": 0.0272, |
| "loss/crossentropy": 1.7780007719993591, |
| "loss/hidden": 0.024169921875, |
| "loss/logits": 0.0030432826606556773, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 0.6875, |
| "grad_norm_var": 0.10631254514058432, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 0.0271, |
| "loss/crossentropy": 1.6640018224716187, |
| "loss/hidden": 0.0244140625, |
| "loss/logits": 0.002680669422261417, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 0.48046875, |
| "grad_norm_var": 0.11339147885640462, |
| "learning_rate": 1.38e-05, |
| "loss": 0.0249, |
| "loss/crossentropy": 1.9946751594543457, |
| "loss/hidden": 0.0220947265625, |
| "loss/logits": 0.002759344642981887, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.4765625, |
| "grad_norm_var": 0.11966500282287598, |
| "learning_rate": 1.4e-05, |
| "loss": 0.0236, |
| "loss/crossentropy": 2.234663248062134, |
| "loss/hidden": 0.02069091796875, |
| "loss/logits": 0.0029394502053037286, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 0.52734375, |
| "grad_norm_var": 0.12141213417053223, |
| "learning_rate": 1.4200000000000001e-05, |
| "loss": 0.0261, |
| "loss/crossentropy": 2.327102780342102, |
| "loss/hidden": 0.0230712890625, |
| "loss/logits": 0.0030780097004026175, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 0.14494843482971193, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 0.0311, |
| "loss/crossentropy": 2.447921633720398, |
| "loss/hidden": 0.0277099609375, |
| "loss/logits": 0.0034353630617260933, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 0.6953125, |
| "grad_norm_var": 0.1458443800608317, |
| "learning_rate": 1.46e-05, |
| "loss": 0.0348, |
| "loss/crossentropy": 1.8365623950958252, |
| "loss/hidden": 0.03082275390625, |
| "loss/logits": 0.003995993640273809, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 0.6328125, |
| "grad_norm_var": 0.14041646321614584, |
| "learning_rate": 1.48e-05, |
| "loss": 0.0309, |
| "loss/crossentropy": 1.8763534426689148, |
| "loss/hidden": 0.02752685546875, |
| "loss/logits": 0.0034082168713212013, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.57421875, |
| "grad_norm_var": 0.10615431467692057, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.0323, |
| "loss/crossentropy": 1.6230891346931458, |
| "loss/hidden": 0.02899169921875, |
| "loss/logits": 0.0033399080857634544, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 0.494140625, |
| "grad_norm_var": 0.10497129758199056, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 0.0287, |
| "loss/crossentropy": 2.1396928429603577, |
| "loss/hidden": 0.02557373046875, |
| "loss/logits": 0.003119353437796235, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 0.80859375, |
| "grad_norm_var": 0.05753312110900879, |
| "learning_rate": 1.54e-05, |
| "loss": 0.0347, |
| "loss/crossentropy": 1.532863974571228, |
| "loss/hidden": 0.03131103515625, |
| "loss/logits": 0.0034041637554764748, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 0.490234375, |
| "grad_norm_var": 0.059662818908691406, |
| "learning_rate": 1.5600000000000003e-05, |
| "loss": 0.0306, |
| "loss/crossentropy": 2.6230881214141846, |
| "loss/hidden": 0.027099609375, |
| "loss/logits": 0.0035023156087845564, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 0.5078125, |
| "grad_norm_var": 0.05784556070963542, |
| "learning_rate": 1.58e-05, |
| "loss": 0.0308, |
| "loss/crossentropy": 2.324823498725891, |
| "loss/hidden": 0.02716064453125, |
| "loss/logits": 0.0035959234228357673, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.451171875, |
| "grad_norm_var": 0.06052079200744629, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.029, |
| "loss/crossentropy": 1.8020533919334412, |
| "loss/hidden": 0.026123046875, |
| "loss/logits": 0.0029267214704304934, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 0.462890625, |
| "grad_norm_var": 0.06025899251302083, |
| "learning_rate": 1.62e-05, |
| "loss": 0.0294, |
| "loss/crossentropy": 1.9489082098007202, |
| "loss/hidden": 0.02642822265625, |
| "loss/logits": 0.0030203944770619273, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 0.6796875, |
| "grad_norm_var": 0.06032098134358724, |
| "learning_rate": 1.64e-05, |
| "loss": 0.0386, |
| "loss/crossentropy": 1.7716471552848816, |
| "loss/hidden": 0.03466796875, |
| "loss/logits": 0.003940345952287316, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 0.71484375, |
| "grad_norm_var": 0.06078128814697266, |
| "learning_rate": 1.66e-05, |
| "loss": 0.0357, |
| "loss/crossentropy": 1.580382227897644, |
| "loss/hidden": 0.0322265625, |
| "loss/logits": 0.003445054404437542, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 0.66015625, |
| "grad_norm_var": 0.060633087158203126, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 0.0378, |
| "loss/crossentropy": 1.4624913334846497, |
| "loss/hidden": 0.0345458984375, |
| "loss/logits": 0.0032839860068634152, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.6015625, |
| "grad_norm_var": 0.05909773508707682, |
| "learning_rate": 1.7e-05, |
| "loss": 0.0362, |
| "loss/crossentropy": 2.1083823442459106, |
| "loss/hidden": 0.0325927734375, |
| "loss/logits": 0.0036526399198919535, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 0.52734375, |
| "grad_norm_var": 0.058153025309244794, |
| "learning_rate": 1.72e-05, |
| "loss": 0.0325, |
| "loss/crossentropy": 1.717766523361206, |
| "loss/hidden": 0.02947998046875, |
| "loss/logits": 0.0030690066050738096, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 0.66796875, |
| "grad_norm_var": 0.05721918741861979, |
| "learning_rate": 1.7400000000000003e-05, |
| "loss": 0.0378, |
| "loss/crossentropy": 1.8904065489768982, |
| "loss/hidden": 0.0335693359375, |
| "loss/logits": 0.0042799420189112425, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 0.047654978434244794, |
| "learning_rate": 1.76e-05, |
| "loss": 0.0335, |
| "loss/crossentropy": 1.0870572477579117, |
| "loss/hidden": 0.03125, |
| "loss/logits": 0.0022940505295991898, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 0.5234375, |
| "grad_norm_var": 0.04837112426757813, |
| "learning_rate": 1.7800000000000002e-05, |
| "loss": 0.0321, |
| "loss/crossentropy": 2.1679897904396057, |
| "loss/hidden": 0.02886962890625, |
| "loss/logits": 0.0031942062778398395, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.22415873209635417, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0383, |
| "loss/crossentropy": 0.8709187796339393, |
| "loss/hidden": 0.036376953125, |
| "loss/logits": 0.00189178493747022, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 0.609375, |
| "grad_norm_var": 0.22345778147379558, |
| "learning_rate": 1.8200000000000002e-05, |
| "loss": 0.0351, |
| "loss/crossentropy": 2.064531624317169, |
| "loss/hidden": 0.031494140625, |
| "loss/logits": 0.003647907287813723, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 0.76171875, |
| "grad_norm_var": 0.2190743605295817, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 0.0397, |
| "loss/crossentropy": 2.1158279180526733, |
| "loss/hidden": 0.0357666015625, |
| "loss/logits": 0.003927323617972434, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 0.22078906695048015, |
| "learning_rate": 1.86e-05, |
| "loss": 0.037, |
| "loss/crossentropy": 2.194046139717102, |
| "loss/hidden": 0.03289794921875, |
| "loss/logits": 0.004065982066094875, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 0.55078125, |
| "grad_norm_var": 0.2189615249633789, |
| "learning_rate": 1.88e-05, |
| "loss": 0.0387, |
| "loss/crossentropy": 1.7997042536735535, |
| "loss/hidden": 0.034912109375, |
| "loss/logits": 0.003757980652153492, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 0.96875, |
| "grad_norm_var": 0.21743106842041016, |
| "learning_rate": 1.9e-05, |
| "loss": 0.0424, |
| "loss/crossentropy": 1.9515060782432556, |
| "loss/hidden": 0.038330078125, |
| "loss/logits": 0.004115240182727575, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 0.21280604998270672, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 0.0383, |
| "loss/crossentropy": 1.7843334674835205, |
| "loss/hidden": 0.0345458984375, |
| "loss/logits": 0.0037059325259178877, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 0.76171875, |
| "grad_norm_var": 0.20552260080973309, |
| "learning_rate": 1.94e-05, |
| "loss": 0.0366, |
| "loss/crossentropy": 1.6897225379943848, |
| "loss/hidden": 0.0333251953125, |
| "loss/logits": 0.003316206973977387, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 0.5625, |
| "grad_norm_var": 0.20833021799723309, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 0.0392, |
| "loss/crossentropy": 1.7818755507469177, |
| "loss/hidden": 0.0352783203125, |
| "loss/logits": 0.0038841436617076397, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 0.7421875, |
| "grad_norm_var": 0.20807698567708333, |
| "learning_rate": 1.98e-05, |
| "loss": 0.0414, |
| "loss/crossentropy": 1.3952041864395142, |
| "loss/hidden": 0.0379638671875, |
| "loss/logits": 0.0034202728420495987, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.61328125, |
| "grad_norm_var": 0.20908101399739584, |
| "learning_rate": 2e-05, |
| "loss": 0.0411, |
| "loss/crossentropy": 2.257350206375122, |
| "loss/hidden": 0.0369873046875, |
| "loss/logits": 0.004119190853089094, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 94.0, |
| "grad_norm_var": 542.9932492574056, |
| "learning_rate": 2e-05, |
| "loss": 0.049, |
| "loss/crossentropy": 2.412990689277649, |
| "loss/hidden": 0.044677734375, |
| "loss/logits": 0.004347695037722588, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 5.71875, |
| "grad_norm_var": 540.451200803121, |
| "learning_rate": 2e-05, |
| "loss": 0.0655, |
| "loss/crossentropy": 1.082998514175415, |
| "loss/hidden": 0.0626220703125, |
| "loss/logits": 0.002859282889403403, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 3.828125, |
| "grad_norm_var": 538.4251312255859, |
| "learning_rate": 2e-05, |
| "loss": 0.0533, |
| "loss/crossentropy": 0.7443170174956322, |
| "loss/hidden": 0.0509033203125, |
| "loss/logits": 0.0023913229815661907, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 0.7734375, |
| "grad_norm_var": 538.9053883870442, |
| "learning_rate": 2e-05, |
| "loss": 0.0443, |
| "loss/crossentropy": 2.128484547138214, |
| "loss/hidden": 0.0394287109375, |
| "loss/logits": 0.004840084817260504, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 0.9609375, |
| "grad_norm_var": 538.5326588948568, |
| "learning_rate": 2e-05, |
| "loss": 0.0488, |
| "loss/crossentropy": 2.1107423305511475, |
| "loss/hidden": 0.0438232421875, |
| "loss/logits": 0.004928479436784983, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 539.2818234761556, |
| "learning_rate": 2e-05, |
| "loss": 0.0503, |
| "loss/crossentropy": 2.3903501629829407, |
| "loss/hidden": 0.044677734375, |
| "loss/logits": 0.0056252507492899895, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 43.75, |
| "grad_norm_var": 618.3842038472493, |
| "learning_rate": 2e-05, |
| "loss": 0.0608, |
| "loss/crossentropy": 1.7641326189041138, |
| "loss/hidden": 0.0543212890625, |
| "loss/logits": 0.006453194189816713, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 0.76953125, |
| "grad_norm_var": 618.3748179117839, |
| "learning_rate": 2e-05, |
| "loss": 0.0477, |
| "loss/crossentropy": 2.0768980383872986, |
| "loss/hidden": 0.043212890625, |
| "loss/logits": 0.004535661078989506, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 0.69140625, |
| "grad_norm_var": 618.2414815266927, |
| "learning_rate": 2e-05, |
| "loss": 0.0473, |
| "loss/crossentropy": 1.6824833154678345, |
| "loss/hidden": 0.043212890625, |
| "loss/logits": 0.004120671423152089, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 1.15625, |
| "grad_norm_var": 617.5190678278606, |
| "learning_rate": 2e-05, |
| "loss": 0.0434, |
| "loss/crossentropy": 2.627159595489502, |
| "loss/hidden": 0.03857421875, |
| "loss/logits": 0.004800508031621575, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 3.78125, |
| "grad_norm_var": 614.6938419977824, |
| "learning_rate": 2e-05, |
| "loss": 0.0566, |
| "loss/crossentropy": 0.6765162199735641, |
| "loss/hidden": 0.054443359375, |
| "loss/logits": 0.0021908242197241634, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.734375, |
| "grad_norm_var": 614.5040545145671, |
| "learning_rate": 2e-05, |
| "loss": 0.053, |
| "loss/crossentropy": 1.8953843116760254, |
| "loss/hidden": 0.0479736328125, |
| "loss/logits": 0.00501963822171092, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 0.94140625, |
| "grad_norm_var": 614.2845865885416, |
| "learning_rate": 2e-05, |
| "loss": 0.0549, |
| "loss/crossentropy": 1.2002166509628296, |
| "loss/hidden": 0.051025390625, |
| "loss/logits": 0.0038950731977820396, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 0.796875, |
| "grad_norm_var": 613.9925486246744, |
| "learning_rate": 2e-05, |
| "loss": 0.049, |
| "loss/crossentropy": 2.166276276111603, |
| "loss/hidden": 0.044189453125, |
| "loss/logits": 0.004782476229593158, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.73828125, |
| "grad_norm_var": 613.9973881403605, |
| "learning_rate": 2e-05, |
| "loss": 0.0516, |
| "loss/crossentropy": 1.9635725021362305, |
| "loss/hidden": 0.046142578125, |
| "loss/logits": 0.005422875052317977, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 1.015625, |
| "grad_norm_var": 613.5022315979004, |
| "learning_rate": 2e-05, |
| "loss": 0.0531, |
| "loss/crossentropy": 1.9369041323661804, |
| "loss/hidden": 0.048095703125, |
| "loss/logits": 0.004988969303667545, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 0.88671875, |
| "grad_norm_var": 113.22293949127197, |
| "learning_rate": 2e-05, |
| "loss": 0.0487, |
| "loss/crossentropy": 1.904948651790619, |
| "loss/hidden": 0.0438232421875, |
| "loss/logits": 0.004903967492282391, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 113.6704797744751, |
| "learning_rate": 2e-05, |
| "loss": 0.0531, |
| "loss/crossentropy": 1.4953274130821228, |
| "loss/hidden": 0.0477294921875, |
| "loss/logits": 0.0053821399342268705, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 0.80078125, |
| "grad_norm_var": 114.29028701782227, |
| "learning_rate": 2e-05, |
| "loss": 0.0499, |
| "loss/crossentropy": 1.8553323149681091, |
| "loss/hidden": 0.0455322265625, |
| "loss/logits": 0.004347938811406493, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.65234375, |
| "grad_norm_var": 114.33934930165609, |
| "learning_rate": 2e-05, |
| "loss": 0.0507, |
| "loss/crossentropy": 2.1623928546905518, |
| "loss/hidden": 0.0458984375, |
| "loss/logits": 0.004752044100314379, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 0.60546875, |
| "grad_norm_var": 114.47933247884114, |
| "learning_rate": 2e-05, |
| "loss": 0.048, |
| "loss/crossentropy": 1.8770819902420044, |
| "loss/hidden": 0.04345703125, |
| "loss/logits": 0.004505418939515948, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 0.84375, |
| "grad_norm_var": 114.62628962198893, |
| "learning_rate": 2e-05, |
| "loss": 0.056, |
| "loss/crossentropy": 1.5817698240280151, |
| "loss/hidden": 0.0513916015625, |
| "loss/logits": 0.004640725441277027, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 0.62109375, |
| "grad_norm_var": 0.5726551691691081, |
| "learning_rate": 2e-05, |
| "loss": 0.0525, |
| "loss/crossentropy": 2.1861724853515625, |
| "loss/hidden": 0.04736328125, |
| "loss/logits": 0.005103343166410923, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 0.75, |
| "grad_norm_var": 0.5732899983723958, |
| "learning_rate": 2e-05, |
| "loss": 0.0588, |
| "loss/crossentropy": 1.7791939973831177, |
| "loss/hidden": 0.0538330078125, |
| "loss/logits": 0.004983734572306275, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 0.859375, |
| "grad_norm_var": 0.5680765151977539, |
| "learning_rate": 2e-05, |
| "loss": 0.0538, |
| "loss/crossentropy": 1.8149547576904297, |
| "loss/hidden": 0.0491943359375, |
| "loss/logits": 0.004628779133781791, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 0.796875, |
| "grad_norm_var": 0.5693048477172852, |
| "learning_rate": 2e-05, |
| "loss": 0.0599, |
| "loss/crossentropy": 1.8067073822021484, |
| "loss/hidden": 0.0546875, |
| "loss/logits": 0.005181350978091359, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.02847436269124349, |
| "learning_rate": 2e-05, |
| "loss": 0.0568, |
| "loss/crossentropy": 2.025430202484131, |
| "loss/hidden": 0.05126953125, |
| "loss/logits": 0.005537106888368726, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 0.03675225575764974, |
| "learning_rate": 2e-05, |
| "loss": 0.0643, |
| "loss/crossentropy": 1.6609545350074768, |
| "loss/hidden": 0.0582275390625, |
| "loss/logits": 0.006046091904863715, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 0.7955790201822917, |
| "learning_rate": 2e-05, |
| "loss": 0.0594, |
| "loss/crossentropy": 0.885542593896389, |
| "loss/hidden": 0.0565185546875, |
| "loss/logits": 0.0028679996030405164, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.015625, |
| "grad_norm_var": 0.790423583984375, |
| "learning_rate": 2e-05, |
| "loss": 0.0594, |
| "loss/crossentropy": 2.1523451805114746, |
| "loss/hidden": 0.053955078125, |
| "loss/logits": 0.0054600914008915424, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 0.7734375, |
| "grad_norm_var": 0.7888528823852539, |
| "learning_rate": 2e-05, |
| "loss": 0.0534, |
| "loss/crossentropy": 2.189045548439026, |
| "loss/hidden": 0.04833984375, |
| "loss/logits": 0.005026256432756782, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 0.7265625, |
| "grad_norm_var": 0.7970204035441081, |
| "learning_rate": 2e-05, |
| "loss": 0.0629, |
| "loss/crossentropy": 2.0893185138702393, |
| "loss/hidden": 0.05712890625, |
| "loss/logits": 0.005753096425905824, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 0.703125, |
| "grad_norm_var": 0.8037109375, |
| "learning_rate": 2e-05, |
| "loss": 0.0573, |
| "loss/crossentropy": 2.1007819771766663, |
| "loss/hidden": 0.0517578125, |
| "loss/logits": 0.005532474257051945, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 2.8125, |
| "grad_norm_var": 0.99459228515625, |
| "learning_rate": 2e-05, |
| "loss": 0.0557, |
| "loss/crossentropy": 2.234036445617676, |
| "loss/hidden": 0.050537109375, |
| "loss/logits": 0.00520496373064816, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 0.9849674860636394, |
| "learning_rate": 2e-05, |
| "loss": 0.0724, |
| "loss/crossentropy": 2.0017648339271545, |
| "loss/hidden": 0.065673828125, |
| "loss/logits": 0.006751159438863397, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 0.828125, |
| "grad_norm_var": 0.9741900126139323, |
| "learning_rate": 2e-05, |
| "loss": 0.0606, |
| "loss/crossentropy": 2.2030688524246216, |
| "loss/hidden": 0.0545654296875, |
| "loss/logits": 0.006013393867760897, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 1.1051111221313477, |
| "learning_rate": 2e-05, |
| "loss": 0.0666, |
| "loss/crossentropy": 1.4665716886520386, |
| "loss/hidden": 0.0615234375, |
| "loss/logits": 0.005048300372436643, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 0.87890625, |
| "grad_norm_var": 1.1028411865234375, |
| "learning_rate": 2e-05, |
| "loss": 0.0601, |
| "loss/crossentropy": 1.8831827044487, |
| "loss/hidden": 0.0545654296875, |
| "loss/logits": 0.005558329168707132, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 0.890625, |
| "grad_norm_var": 1.0813058853149413, |
| "learning_rate": 2e-05, |
| "loss": 0.0597, |
| "loss/crossentropy": 1.9754237532615662, |
| "loss/hidden": 0.054443359375, |
| "loss/logits": 0.005236838245764375, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.8671875, |
| "grad_norm_var": 1.0725778579711913, |
| "learning_rate": 2e-05, |
| "loss": 0.0606, |
| "loss/crossentropy": 1.365915298461914, |
| "loss/hidden": 0.0557861328125, |
| "loss/logits": 0.004818538203835487, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 0.99609375, |
| "grad_norm_var": 1.064422353108724, |
| "learning_rate": 2e-05, |
| "loss": 0.065, |
| "loss/crossentropy": 1.5900118350982666, |
| "loss/hidden": 0.0596923828125, |
| "loss/logits": 0.005343996454030275, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 1.0408915201822917, |
| "learning_rate": 2e-05, |
| "loss": 0.0752, |
| "loss/crossentropy": 1.846006989479065, |
| "loss/hidden": 0.068603515625, |
| "loss/logits": 0.006600282387807965, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 1.0392313639322917, |
| "learning_rate": 2e-05, |
| "loss": 0.0767, |
| "loss/crossentropy": 1.7931447625160217, |
| "loss/hidden": 0.069091796875, |
| "loss/logits": 0.00764817837625742, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 1.0420427958170573, |
| "learning_rate": 2e-05, |
| "loss": 0.069, |
| "loss/crossentropy": 1.8643503785133362, |
| "loss/hidden": 0.0628662109375, |
| "loss/logits": 0.006141298217698932, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.828125, |
| "grad_norm_var": 0.4432879130045573, |
| "learning_rate": 2e-05, |
| "loss": 0.073, |
| "loss/crossentropy": 2.024896204471588, |
| "loss/hidden": 0.066650390625, |
| "loss/logits": 0.006367244059219956, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 0.44213968912760415, |
| "learning_rate": 2e-05, |
| "loss": 0.0771, |
| "loss/crossentropy": 1.2655363082885742, |
| "loss/hidden": 0.071044921875, |
| "loss/logits": 0.00609672162681818, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 0.9140625, |
| "grad_norm_var": 0.4350077311197917, |
| "learning_rate": 2e-05, |
| "loss": 0.065, |
| "loss/crossentropy": 2.1826120615005493, |
| "loss/hidden": 0.0592041015625, |
| "loss/logits": 0.005788894835859537, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 3.0625, |
| "grad_norm_var": 0.619707997639974, |
| "learning_rate": 2e-05, |
| "loss": 0.0808, |
| "loss/crossentropy": 1.6820347905158997, |
| "loss/hidden": 0.072021484375, |
| "loss/logits": 0.008779237512499094, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 0.953125, |
| "grad_norm_var": 0.6012346903483073, |
| "learning_rate": 2e-05, |
| "loss": 0.0695, |
| "loss/crossentropy": 1.9632675051689148, |
| "loss/hidden": 0.0626220703125, |
| "loss/logits": 0.006908563897013664, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 0.45773493448893227, |
| "learning_rate": 2e-05, |
| "loss": 0.0678, |
| "loss/crossentropy": 2.0338906049728394, |
| "loss/hidden": 0.061279296875, |
| "loss/logits": 0.006486581405624747, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.4560829162597656, |
| "learning_rate": 2e-05, |
| "loss": 0.0681, |
| "loss/crossentropy": 1.5496352314949036, |
| "loss/hidden": 0.0623779296875, |
| "loss/logits": 0.0057510188780725, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 0.4405067443847656, |
| "learning_rate": 2e-05, |
| "loss": 0.0867, |
| "loss/crossentropy": 1.3210809230804443, |
| "loss/hidden": 0.080078125, |
| "loss/logits": 0.006626329850405455, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 0.96484375, |
| "grad_norm_var": 0.28447513580322265, |
| "learning_rate": 2e-05, |
| "loss": 0.0734, |
| "loss/crossentropy": 2.0116345286369324, |
| "loss/hidden": 0.06689453125, |
| "loss/logits": 0.00652907881885767, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 0.72265625, |
| "grad_norm_var": 0.2929030736287435, |
| "learning_rate": 2e-05, |
| "loss": 0.0705, |
| "loss/crossentropy": 2.5001909732818604, |
| "loss/hidden": 0.063720703125, |
| "loss/logits": 0.006769401952624321, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 0.2867934544881185, |
| "learning_rate": 2e-05, |
| "loss": 0.0728, |
| "loss/crossentropy": 1.7657602429389954, |
| "loss/hidden": 0.067138671875, |
| "loss/logits": 0.005616480251774192, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 1.75, |
| "grad_norm_var": 0.2930582046508789, |
| "learning_rate": 2e-05, |
| "loss": 0.0768, |
| "loss/crossentropy": 1.4872492551803589, |
| "loss/hidden": 0.070556640625, |
| "loss/logits": 0.006273286417126656, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 0.99609375, |
| "grad_norm_var": 0.2930582046508789, |
| "learning_rate": 2e-05, |
| "loss": 0.0716, |
| "loss/crossentropy": 1.4289509057998657, |
| "loss/hidden": 0.066162109375, |
| "loss/logits": 0.0054210335947573185, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 3.390625, |
| "grad_norm_var": 0.571256446838379, |
| "learning_rate": 2e-05, |
| "loss": 0.0936, |
| "loss/crossentropy": 1.6232830286026, |
| "loss/hidden": 0.08740234375, |
| "loss/logits": 0.006192366126924753, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 1.1875, |
| "grad_norm_var": 0.574277687072754, |
| "learning_rate": 2e-05, |
| "loss": 0.071, |
| "loss/crossentropy": 1.8413723707199097, |
| "loss/hidden": 0.06494140625, |
| "loss/logits": 0.006042992230504751, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.85546875, |
| "grad_norm_var": 0.5888264973958334, |
| "learning_rate": 2e-05, |
| "loss": 0.0768, |
| "loss/crossentropy": 1.9591792821884155, |
| "loss/hidden": 0.0697021484375, |
| "loss/logits": 0.007108409656211734, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 7.59375, |
| "grad_norm_var": 2.952831013997396, |
| "learning_rate": 2e-05, |
| "loss": 0.0979, |
| "loss/crossentropy": 0.07059483416378498, |
| "loss/hidden": 0.096435546875, |
| "loss/logits": 0.0014511747285723686, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 2.9384429931640623, |
| "learning_rate": 2e-05, |
| "loss": 0.0837, |
| "loss/crossentropy": 1.9789559841156006, |
| "loss/hidden": 0.075927734375, |
| "loss/logits": 0.007821006467565894, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 2.8888933817545572, |
| "learning_rate": 2e-05, |
| "loss": 0.0802, |
| "loss/crossentropy": 1.6365671753883362, |
| "loss/hidden": 0.07373046875, |
| "loss/logits": 0.006428756983950734, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 1.0859375, |
| "grad_norm_var": 2.818439737955729, |
| "learning_rate": 2e-05, |
| "loss": 0.0854, |
| "loss/crossentropy": 1.509697675704956, |
| "loss/hidden": 0.078857421875, |
| "loss/logits": 0.006509122438728809, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 2.7942380269368488, |
| "learning_rate": 2e-05, |
| "loss": 0.0805, |
| "loss/crossentropy": 2.1344351172447205, |
| "loss/hidden": 0.072265625, |
| "loss/logits": 0.008192164823412895, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 2.7996419270833335, |
| "learning_rate": 2e-05, |
| "loss": 0.0762, |
| "loss/crossentropy": 2.1507211923599243, |
| "loss/hidden": 0.06884765625, |
| "loss/logits": 0.007326696766540408, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 1.0546875, |
| "grad_norm_var": 2.8157623291015623, |
| "learning_rate": 2e-05, |
| "loss": 0.0877, |
| "loss/crossentropy": 1.9121323823928833, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.008159820456057787, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 0.71484375, |
| "grad_norm_var": 2.8708449681599935, |
| "learning_rate": 2e-05, |
| "loss": 0.0734, |
| "loss/crossentropy": 2.492128014564514, |
| "loss/hidden": 0.06640625, |
| "loss/logits": 0.006960670696571469, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 2.8645253499348957, |
| "learning_rate": 2e-05, |
| "loss": 0.0759, |
| "loss/crossentropy": 2.199109196662903, |
| "loss/hidden": 0.06884765625, |
| "loss/logits": 0.007038923678919673, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 0.88671875, |
| "grad_norm_var": 2.844524892171224, |
| "learning_rate": 2e-05, |
| "loss": 0.0777, |
| "loss/crossentropy": 1.9425055384635925, |
| "loss/hidden": 0.07080078125, |
| "loss/logits": 0.0068553604651242495, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 0.890625, |
| "grad_norm_var": 2.8795875549316405, |
| "learning_rate": 2e-05, |
| "loss": 0.0786, |
| "loss/crossentropy": 1.8932055234909058, |
| "loss/hidden": 0.072021484375, |
| "loss/logits": 0.006576135754585266, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 0.8671875, |
| "grad_norm_var": 2.922032674153646, |
| "learning_rate": 2e-05, |
| "loss": 0.0836, |
| "loss/crossentropy": 2.2985092401504517, |
| "loss/hidden": 0.075927734375, |
| "loss/logits": 0.00770363537594676, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 0.9453125, |
| "grad_norm_var": 2.9265644709269205, |
| "learning_rate": 2e-05, |
| "loss": 0.094, |
| "loss/crossentropy": 1.5614506006240845, |
| "loss/hidden": 0.085693359375, |
| "loss/logits": 0.00831273477524519, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 0.89453125, |
| "grad_norm_var": 2.7328165690104167, |
| "learning_rate": 2e-05, |
| "loss": 0.0862, |
| "loss/crossentropy": 2.5026639699935913, |
| "loss/hidden": 0.07763671875, |
| "loss/logits": 0.008605414070189, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.8828125, |
| "grad_norm_var": 2.7505999247233075, |
| "learning_rate": 2e-05, |
| "loss": 0.0919, |
| "loss/crossentropy": 2.3450429439544678, |
| "loss/hidden": 0.0830078125, |
| "loss/logits": 0.008909917902201414, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 2.724916521708171, |
| "learning_rate": 2e-05, |
| "loss": 0.0915, |
| "loss/crossentropy": 2.098711371421814, |
| "loss/hidden": 0.0830078125, |
| "loss/logits": 0.008489594794809818, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 0.09405256907145182, |
| "learning_rate": 2e-05, |
| "loss": 0.0933, |
| "loss/crossentropy": 1.5701825618743896, |
| "loss/hidden": 0.0859375, |
| "loss/logits": 0.0073861428536474705, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 0.80078125, |
| "grad_norm_var": 0.060343424479166664, |
| "learning_rate": 2e-05, |
| "loss": 0.0898, |
| "loss/crossentropy": 1.9461122155189514, |
| "loss/hidden": 0.08203125, |
| "loss/logits": 0.007745326962321997, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.05778299967447917, |
| "learning_rate": 2e-05, |
| "loss": 0.0995, |
| "loss/crossentropy": 1.5463888049125671, |
| "loss/hidden": 0.090576171875, |
| "loss/logits": 0.008878839667886496, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 4.15625, |
| "grad_norm_var": 0.6617510477701823, |
| "learning_rate": 2e-05, |
| "loss": 0.1124, |
| "loss/crossentropy": 0.4524843990802765, |
| "loss/hidden": 0.109130859375, |
| "loss/logits": 0.0032549845636822283, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.661974843343099, |
| "learning_rate": 2e-05, |
| "loss": 0.0863, |
| "loss/crossentropy": 2.0084250569343567, |
| "loss/hidden": 0.07861328125, |
| "loss/logits": 0.0077257591765373945, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 1.734375, |
| "grad_norm_var": 0.6757649739583333, |
| "learning_rate": 2e-05, |
| "loss": 0.0932, |
| "loss/crossentropy": 2.200223922729492, |
| "loss/hidden": 0.08544921875, |
| "loss/logits": 0.007743534166365862, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 1.828125, |
| "grad_norm_var": 0.689587148030599, |
| "learning_rate": 2e-05, |
| "loss": 0.101, |
| "loss/crossentropy": 1.6961349844932556, |
| "loss/hidden": 0.092529296875, |
| "loss/logits": 0.008438969030976295, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.6652617772420247, |
| "learning_rate": 2e-05, |
| "loss": 0.1007, |
| "loss/crossentropy": 1.51398366689682, |
| "loss/hidden": 0.091796875, |
| "loss/logits": 0.00888208020478487, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.734375, |
| "grad_norm_var": 0.6837681452433269, |
| "learning_rate": 2e-05, |
| "loss": 0.087, |
| "loss/crossentropy": 2.4364923238754272, |
| "loss/hidden": 0.079345703125, |
| "loss/logits": 0.007686517434194684, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 1.25, |
| "grad_norm_var": 0.6700091044108073, |
| "learning_rate": 2e-05, |
| "loss": 0.1027, |
| "loss/crossentropy": 1.7433177828788757, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.008915970101952553, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.6547747294108073, |
| "learning_rate": 2e-05, |
| "loss": 0.0937, |
| "loss/crossentropy": 2.1145116686820984, |
| "loss/hidden": 0.08447265625, |
| "loss/logits": 0.009221571031957865, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 0.84375, |
| "grad_norm_var": 0.6564798990885417, |
| "learning_rate": 2e-05, |
| "loss": 0.089, |
| "loss/crossentropy": 2.3255231976509094, |
| "loss/hidden": 0.080810546875, |
| "loss/logits": 0.008193989749997854, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 0.7421875, |
| "grad_norm_var": 0.6713836669921875, |
| "learning_rate": 2e-05, |
| "loss": 0.0937, |
| "loss/crossentropy": 1.9500952363014221, |
| "loss/hidden": 0.085205078125, |
| "loss/logits": 0.008466629311442375, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 12.875, |
| "grad_norm_var": 8.854332415262858, |
| "learning_rate": 2e-05, |
| "loss": 0.1398, |
| "loss/crossentropy": 1.5803423523902893, |
| "loss/hidden": 0.130126953125, |
| "loss/logits": 0.009700319729745388, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 8.742569414774577, |
| "learning_rate": 2e-05, |
| "loss": 0.0831, |
| "loss/crossentropy": 0.8776814043521881, |
| "loss/hidden": 0.07861328125, |
| "loss/logits": 0.0044753485126420856, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 8.79083449045817, |
| "learning_rate": 2e-05, |
| "loss": 0.0981, |
| "loss/crossentropy": 2.157645583152771, |
| "loss/hidden": 0.08984375, |
| "loss/logits": 0.00824650377035141, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 0.828125, |
| "grad_norm_var": 8.883497556050619, |
| "learning_rate": 2e-05, |
| "loss": 0.095, |
| "loss/crossentropy": 1.819543182849884, |
| "loss/hidden": 0.087158203125, |
| "loss/logits": 0.007819573860615492, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 8.828344472249348, |
| "learning_rate": 2e-05, |
| "loss": 0.1045, |
| "loss/crossentropy": 1.8836165070533752, |
| "loss/hidden": 0.095947265625, |
| "loss/logits": 0.008551866048946977, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 0.9296875, |
| "grad_norm_var": 8.89441630045573, |
| "learning_rate": 2e-05, |
| "loss": 0.1058, |
| "loss/crossentropy": 1.6061448454856873, |
| "loss/hidden": 0.097412109375, |
| "loss/logits": 0.008399839047342539, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 2.375, |
| "grad_norm_var": 8.61470438639323, |
| "learning_rate": 2e-05, |
| "loss": 0.0946, |
| "loss/crossentropy": 1.1957413852214813, |
| "loss/hidden": 0.0888671875, |
| "loss/logits": 0.005780589068308473, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 1.921875, |
| "grad_norm_var": 8.578641510009765, |
| "learning_rate": 2e-05, |
| "loss": 0.1091, |
| "loss/crossentropy": 1.9956754446029663, |
| "loss/hidden": 0.099609375, |
| "loss/logits": 0.009533480275422335, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 8.593761952718099, |
| "learning_rate": 2e-05, |
| "loss": 0.0932, |
| "loss/crossentropy": 2.0423865914344788, |
| "loss/hidden": 0.084716796875, |
| "loss/logits": 0.008501260075718164, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 0.9765625, |
| "grad_norm_var": 8.664864095052083, |
| "learning_rate": 2e-05, |
| "loss": 0.0965, |
| "loss/crossentropy": 2.0403915643692017, |
| "loss/hidden": 0.088134765625, |
| "loss/logits": 0.008330879732966423, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 8.658941396077473, |
| "learning_rate": 2e-05, |
| "loss": 0.1046, |
| "loss/crossentropy": 1.79813152551651, |
| "loss/hidden": 0.095703125, |
| "loss/logits": 0.008865772746503353, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 0.984375, |
| "grad_norm_var": 8.62048110961914, |
| "learning_rate": 2e-05, |
| "loss": 0.0998, |
| "loss/crossentropy": 1.9608579874038696, |
| "loss/hidden": 0.09130859375, |
| "loss/logits": 0.00849946541711688, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 0.93359375, |
| "grad_norm_var": 8.65926456451416, |
| "learning_rate": 2e-05, |
| "loss": 0.1103, |
| "loss/crossentropy": 1.5905942916870117, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.00825613015331328, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 8.711507606506348, |
| "learning_rate": 2e-05, |
| "loss": 0.1068, |
| "loss/crossentropy": 1.1885337010025978, |
| "loss/hidden": 0.101318359375, |
| "loss/logits": 0.005469106370583177, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 8.680040423075358, |
| "learning_rate": 2e-05, |
| "loss": 0.1043, |
| "loss/crossentropy": 2.019789695739746, |
| "loss/hidden": 0.095703125, |
| "loss/logits": 0.0086362911388278, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 0.953125, |
| "grad_norm_var": 8.64425245920817, |
| "learning_rate": 2e-05, |
| "loss": 0.1081, |
| "loss/crossentropy": 1.9956069588661194, |
| "loss/hidden": 0.0986328125, |
| "loss/logits": 0.009510128758847713, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 0.90234375, |
| "grad_norm_var": 0.4452044169108073, |
| "learning_rate": 2e-05, |
| "loss": 0.1047, |
| "loss/crossentropy": 1.838368535041809, |
| "loss/hidden": 0.09619140625, |
| "loss/logits": 0.008526989258825779, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 0.399859619140625, |
| "learning_rate": 2e-05, |
| "loss": 0.0996, |
| "loss/crossentropy": 1.0530972927808762, |
| "loss/hidden": 0.094482421875, |
| "loss/logits": 0.0051099995616823435, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 0.3978533426920573, |
| "learning_rate": 2e-05, |
| "loss": 0.1135, |
| "loss/crossentropy": 1.8649645447731018, |
| "loss/hidden": 0.10400390625, |
| "loss/logits": 0.00953456899151206, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 0.4300188700358073, |
| "learning_rate": 2e-05, |
| "loss": 0.1115, |
| "loss/crossentropy": 2.3657619953155518, |
| "loss/hidden": 0.100830078125, |
| "loss/logits": 0.01067457627505064, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.4286265055338542, |
| "learning_rate": 2e-05, |
| "loss": 0.1102, |
| "loss/crossentropy": 1.952780544757843, |
| "loss/hidden": 0.1005859375, |
| "loss/logits": 0.009660405106842518, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 0.97265625, |
| "grad_norm_var": 0.4256479263305664, |
| "learning_rate": 2e-05, |
| "loss": 0.1107, |
| "loss/crossentropy": 1.7751940488815308, |
| "loss/hidden": 0.10205078125, |
| "loss/logits": 0.008633972378447652, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 0.92578125, |
| "grad_norm_var": 0.38250630696614585, |
| "learning_rate": 2e-05, |
| "loss": 0.1148, |
| "loss/crossentropy": 1.8458402156829834, |
| "loss/hidden": 0.105224609375, |
| "loss/logits": 0.009561538230627775, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 1.0546875, |
| "grad_norm_var": 0.36706517537434896, |
| "learning_rate": 2e-05, |
| "loss": 0.1085, |
| "loss/crossentropy": 2.008933424949646, |
| "loss/hidden": 0.098876953125, |
| "loss/logits": 0.009649577550590038, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 1.109375, |
| "grad_norm_var": 0.368017323811849, |
| "learning_rate": 2e-05, |
| "loss": 0.1136, |
| "loss/crossentropy": 1.6732726097106934, |
| "loss/hidden": 0.104736328125, |
| "loss/logits": 0.008881766349077225, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 1.46875, |
| "grad_norm_var": 0.3616566975911458, |
| "learning_rate": 2e-05, |
| "loss": 0.1176, |
| "loss/crossentropy": 1.7936006784439087, |
| "loss/hidden": 0.1083984375, |
| "loss/logits": 0.009185456205159426, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 0.765625, |
| "grad_norm_var": 0.3815104166666667, |
| "learning_rate": 2e-05, |
| "loss": 0.1129, |
| "loss/crossentropy": 2.183436155319214, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.01038127625361085, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 1.3984375, |
| "grad_norm_var": 0.3744341532389323, |
| "learning_rate": 2e-05, |
| "loss": 0.1261, |
| "loss/crossentropy": 1.854296088218689, |
| "loss/hidden": 0.113525390625, |
| "loss/logits": 0.012611255049705505, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 0.3695194880167643, |
| "learning_rate": 2e-05, |
| "loss": 0.1126, |
| "loss/crossentropy": 1.6531599760055542, |
| "loss/hidden": 0.10400390625, |
| "loss/logits": 0.008608407340943813, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 1.625, |
| "grad_norm_var": 0.16072940826416016, |
| "learning_rate": 2e-05, |
| "loss": 0.1235, |
| "loss/crossentropy": 1.841816484928131, |
| "loss/hidden": 0.111083984375, |
| "loss/logits": 0.012444535735994577, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 0.159342892964681, |
| "learning_rate": 2e-05, |
| "loss": 0.1123, |
| "loss/crossentropy": 1.8304393887519836, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.009785267058759928, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 48.75, |
| "grad_norm_var": 140.92207330067953, |
| "learning_rate": 2e-05, |
| "loss": 0.1595, |
| "loss/crossentropy": 1.5171640515327454, |
| "loss/hidden": 0.1435546875, |
| "loss/logits": 0.01598053053021431, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 3.0, |
| "grad_norm_var": 140.2586690266927, |
| "learning_rate": 2e-05, |
| "loss": 0.1352, |
| "loss/crossentropy": 1.230861783027649, |
| "loss/hidden": 0.12548828125, |
| "loss/logits": 0.00975158391520381, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 0.96875, |
| "grad_norm_var": 140.51885960896809, |
| "learning_rate": 2e-05, |
| "loss": 0.1252, |
| "loss/crossentropy": 1.9903615713119507, |
| "loss/hidden": 0.11474609375, |
| "loss/logits": 0.010449206922203302, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.96875, |
| "grad_norm_var": 140.57409235636393, |
| "learning_rate": 2e-05, |
| "loss": 0.1248, |
| "loss/crossentropy": 1.9731428623199463, |
| "loss/hidden": 0.114501953125, |
| "loss/logits": 0.010306693147867918, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 140.74032084147134, |
| "learning_rate": 2e-05, |
| "loss": 0.1181, |
| "loss/crossentropy": 2.2073622941970825, |
| "loss/hidden": 0.1083984375, |
| "loss/logits": 0.009740452282130718, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 140.84830525716146, |
| "learning_rate": 2e-05, |
| "loss": 0.1331, |
| "loss/crossentropy": 1.8276602029800415, |
| "loss/hidden": 0.12158203125, |
| "loss/logits": 0.011564167682081461, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 140.60635369618734, |
| "learning_rate": 2e-05, |
| "loss": 0.1277, |
| "loss/crossentropy": 1.6165171265602112, |
| "loss/hidden": 0.11865234375, |
| "loss/logits": 0.009028040803968906, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 140.31416829427084, |
| "learning_rate": 2e-05, |
| "loss": 0.1452, |
| "loss/crossentropy": 1.9070194959640503, |
| "loss/hidden": 0.132080078125, |
| "loss/logits": 0.01316736824810505, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 1.2421875, |
| "grad_norm_var": 140.23345540364582, |
| "learning_rate": 2e-05, |
| "loss": 0.1219, |
| "loss/crossentropy": 1.7599803805351257, |
| "loss/hidden": 0.11181640625, |
| "loss/logits": 0.010045101400464773, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.2109375, |
| "grad_norm_var": 140.18977228800455, |
| "learning_rate": 2e-05, |
| "loss": 0.1235, |
| "loss/crossentropy": 1.7666748762130737, |
| "loss/hidden": 0.11328125, |
| "loss/logits": 0.010186517611145973, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 0.90625, |
| "grad_norm_var": 140.4285784403483, |
| "learning_rate": 2e-05, |
| "loss": 0.1227, |
| "loss/crossentropy": 2.1898998022079468, |
| "loss/hidden": 0.111083984375, |
| "loss/logits": 0.011586464941501617, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 3.34375, |
| "grad_norm_var": 139.61049372355143, |
| "learning_rate": 2e-05, |
| "loss": 0.1371, |
| "loss/crossentropy": 0.8376815989613533, |
| "loss/hidden": 0.13134765625, |
| "loss/logits": 0.005797527148388326, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 0.97265625, |
| "grad_norm_var": 139.79876194000244, |
| "learning_rate": 2e-05, |
| "loss": 0.1231, |
| "loss/crossentropy": 2.154396891593933, |
| "loss/hidden": 0.11328125, |
| "loss/logits": 0.00980774499475956, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 3.96875, |
| "grad_norm_var": 138.98775730133056, |
| "learning_rate": 2e-05, |
| "loss": 0.1401, |
| "loss/crossentropy": 1.683815360069275, |
| "loss/hidden": 0.130859375, |
| "loss/logits": 0.009223262313753366, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 3.046875, |
| "grad_norm_var": 138.5365248998006, |
| "learning_rate": 2e-05, |
| "loss": 0.1141, |
| "loss/crossentropy": 0.9084681533277035, |
| "loss/hidden": 0.109130859375, |
| "loss/logits": 0.004976746684405953, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 138.68206322987874, |
| "learning_rate": 2e-05, |
| "loss": 0.125, |
| "loss/crossentropy": 2.260706901550293, |
| "loss/hidden": 0.114501953125, |
| "loss/logits": 0.010530740953981876, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 0.96484375, |
| "grad_norm_var": 0.9987485249837239, |
| "learning_rate": 2e-05, |
| "loss": 0.1338, |
| "loss/crossentropy": 1.9574591517448425, |
| "loss/hidden": 0.121337890625, |
| "loss/logits": 0.012480344157665968, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.8951250712076823, |
| "learning_rate": 2e-05, |
| "loss": 0.1468, |
| "loss/crossentropy": 1.6318422555923462, |
| "loss/hidden": 0.13525390625, |
| "loss/logits": 0.011589228175580502, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 3.15625, |
| "grad_norm_var": 0.9952430725097656, |
| "learning_rate": 2e-05, |
| "loss": 0.146, |
| "loss/crossentropy": 1.0812250077724457, |
| "loss/hidden": 0.1376953125, |
| "loss/logits": 0.00828252313658595, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.9706520080566406, |
| "learning_rate": 2e-05, |
| "loss": 0.1321, |
| "loss/crossentropy": 1.6809369325637817, |
| "loss/hidden": 0.12109375, |
| "loss/logits": 0.011021927930414677, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 0.970379384358724, |
| "learning_rate": 2e-05, |
| "loss": 0.1399, |
| "loss/crossentropy": 2.109734356403351, |
| "loss/hidden": 0.129150390625, |
| "loss/logits": 0.010797888273373246, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 0.9600990295410157, |
| "learning_rate": 2e-05, |
| "loss": 0.1826, |
| "loss/crossentropy": 1.712592601776123, |
| "loss/hidden": 0.164794921875, |
| "loss/logits": 0.017772471997886896, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 0.9821624755859375, |
| "learning_rate": 2e-05, |
| "loss": 0.1467, |
| "loss/crossentropy": 1.7069213390350342, |
| "loss/hidden": 0.13525390625, |
| "loss/logits": 0.01143598323687911, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 1.3828125, |
| "grad_norm_var": 0.9951454162597656, |
| "learning_rate": 2e-05, |
| "loss": 0.1557, |
| "loss/crossentropy": 1.9101948738098145, |
| "loss/hidden": 0.14111328125, |
| "loss/logits": 0.01458168076351285, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 0.9671040852864583, |
| "learning_rate": 2e-05, |
| "loss": 0.182, |
| "loss/crossentropy": 1.500315010547638, |
| "loss/hidden": 0.16650390625, |
| "loss/logits": 0.015495330560952425, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 0.9601409912109375, |
| "learning_rate": 2e-05, |
| "loss": 0.1356, |
| "loss/crossentropy": 1.9563832879066467, |
| "loss/hidden": 0.125, |
| "loss/logits": 0.010621066205203533, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 0.9148394266764323, |
| "learning_rate": 2e-05, |
| "loss": 0.1562, |
| "loss/crossentropy": 1.7067843675613403, |
| "loss/hidden": 0.14306640625, |
| "loss/logits": 0.013140381313860416, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 5.65625, |
| "grad_norm_var": 1.6798011779785156, |
| "learning_rate": 2e-05, |
| "loss": 0.1564, |
| "loss/crossentropy": 1.740093469619751, |
| "loss/hidden": 0.14404296875, |
| "loss/logits": 0.012368456460535526, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 1.6712762832641601, |
| "learning_rate": 2e-05, |
| "loss": 0.1465, |
| "loss/crossentropy": 1.6390604376792908, |
| "loss/hidden": 0.13525390625, |
| "loss/logits": 0.011263488791882992, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 1.4282775243123373, |
| "learning_rate": 2e-05, |
| "loss": 0.138, |
| "loss/crossentropy": 1.4673374891281128, |
| "loss/hidden": 0.12744140625, |
| "loss/logits": 0.010537488851696253, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.251, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 1.3907897313435873, |
| "learning_rate": 2e-05, |
| "loss": 0.1444, |
| "loss/crossentropy": 2.2028943300247192, |
| "loss/hidden": 0.130859375, |
| "loss/logits": 0.013547219336032867, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.252, |
| "grad_norm": 6.09375, |
| "grad_norm_var": 2.4376540501912434, |
| "learning_rate": 2e-05, |
| "loss": 0.1647, |
| "loss/crossentropy": 2.0260573029518127, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.01333728851750493, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.253, |
| "grad_norm": 6.03125, |
| "grad_norm_var": 3.2204566955566407, |
| "learning_rate": 2e-05, |
| "loss": 0.1512, |
| "loss/crossentropy": 0.34831926599144936, |
| "loss/hidden": 0.1484375, |
| "loss/logits": 0.0027739905344787985, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.254, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 3.1469797770182293, |
| "learning_rate": 2e-05, |
| "loss": 0.1476, |
| "loss/crossentropy": 2.248442053794861, |
| "loss/hidden": 0.13427734375, |
| "loss/logits": 0.013290174771100283, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 3.1591837565104166, |
| "learning_rate": 2e-05, |
| "loss": 0.1564, |
| "loss/crossentropy": 2.827474355697632, |
| "loss/hidden": 0.14208984375, |
| "loss/logits": 0.014359638560563326, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 3.0787424723307293, |
| "learning_rate": 2e-05, |
| "loss": 0.1662, |
| "loss/crossentropy": 1.315816342830658, |
| "loss/hidden": 0.15380859375, |
| "loss/logits": 0.012355703860521317, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.257, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 3.121760050455729, |
| "learning_rate": 2e-05, |
| "loss": 0.1513, |
| "loss/crossentropy": 1.420203685760498, |
| "loss/hidden": 0.14013671875, |
| "loss/logits": 0.011188656091690063, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.258, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 3.2089617411295572, |
| "learning_rate": 2e-05, |
| "loss": 0.1473, |
| "loss/crossentropy": 1.659629762172699, |
| "loss/hidden": 0.13623046875, |
| "loss/logits": 0.011019795201718807, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.259, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 3.2086260477701822, |
| "learning_rate": 2e-05, |
| "loss": 0.1416, |
| "loss/crossentropy": 2.383759617805481, |
| "loss/hidden": 0.130859375, |
| "loss/logits": 0.01075922092422843, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 3.232770792643229, |
| "learning_rate": 2e-05, |
| "loss": 0.1489, |
| "loss/crossentropy": 2.2486014366149902, |
| "loss/hidden": 0.1357421875, |
| "loss/logits": 0.013177596032619476, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.261, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 3.2203529357910154, |
| "learning_rate": 2e-05, |
| "loss": 0.1598, |
| "loss/crossentropy": 1.5499208569526672, |
| "loss/hidden": 0.1435546875, |
| "loss/logits": 0.01623948197811842, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.262, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 3.2134803771972655, |
| "learning_rate": 2e-05, |
| "loss": 0.1576, |
| "loss/crossentropy": 2.002042293548584, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.013080449774861336, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.263, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 3.2100982666015625, |
| "learning_rate": 2e-05, |
| "loss": 0.1644, |
| "loss/crossentropy": 1.708718717098236, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.013069577515125275, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 1.7890625, |
| "grad_norm_var": 2.4735450744628906, |
| "learning_rate": 2e-05, |
| "loss": 0.1842, |
| "loss/crossentropy": 1.708518147468567, |
| "loss/hidden": 0.1689453125, |
| "loss/logits": 0.015263590961694717, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 1.75, |
| "grad_norm_var": 2.3963823954264325, |
| "learning_rate": 2e-05, |
| "loss": 0.1611, |
| "loss/crossentropy": 2.0935128927230835, |
| "loss/hidden": 0.14794921875, |
| "loss/logits": 0.013175863306969404, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.266, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 2.4529693603515623, |
| "learning_rate": 2e-05, |
| "loss": 0.1585, |
| "loss/crossentropy": 2.1675453186035156, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.014018273446708918, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.267, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 2.3917388916015625, |
| "learning_rate": 2e-05, |
| "loss": 0.1635, |
| "loss/crossentropy": 1.6719801425933838, |
| "loss/hidden": 0.15087890625, |
| "loss/logits": 0.012653316371142864, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.268, |
| "grad_norm": 1.984375, |
| "grad_norm_var": 1.3084798177083334, |
| "learning_rate": 2e-05, |
| "loss": 0.1916, |
| "loss/crossentropy": 1.020781397819519, |
| "loss/hidden": 0.1787109375, |
| "loss/logits": 0.012872546911239624, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.269, |
| "grad_norm": 0.9765625, |
| "grad_norm_var": 0.1436968485514323, |
| "learning_rate": 2e-05, |
| "loss": 0.1506, |
| "loss/crossentropy": 1.80218106508255, |
| "loss/hidden": 0.1376953125, |
| "loss/logits": 0.01287886407226324, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.14073257446289061, |
| "learning_rate": 2e-05, |
| "loss": 0.1604, |
| "loss/crossentropy": 1.7428264021873474, |
| "loss/hidden": 0.14794921875, |
| "loss/logits": 0.012486261315643787, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.271, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 0.13996175130208333, |
| "learning_rate": 2e-05, |
| "loss": 0.1521, |
| "loss/crossentropy": 1.1585648953914642, |
| "loss/hidden": 0.14404296875, |
| "loss/logits": 0.008048155345022678, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 0.13702774047851562, |
| "learning_rate": 2e-05, |
| "loss": 0.1655, |
| "loss/crossentropy": 1.9835703372955322, |
| "loss/hidden": 0.15185546875, |
| "loss/logits": 0.013653023168444633, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.273, |
| "grad_norm": 0.984375, |
| "grad_norm_var": 0.1552490234375, |
| "learning_rate": 2e-05, |
| "loss": 0.1703, |
| "loss/crossentropy": 2.008173108100891, |
| "loss/hidden": 0.1552734375, |
| "loss/logits": 0.015069750137627125, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.274, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 0.15155614217122396, |
| "learning_rate": 2e-05, |
| "loss": 0.1827, |
| "loss/crossentropy": 1.7864757776260376, |
| "loss/hidden": 0.16650390625, |
| "loss/logits": 0.016235563438385725, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 0.10174128214518229, |
| "learning_rate": 2e-05, |
| "loss": 0.1897, |
| "loss/crossentropy": 1.565641164779663, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.01689150184392929, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.276, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.09888483683268229, |
| "learning_rate": 2e-05, |
| "loss": 0.1613, |
| "loss/crossentropy": 2.0091532468795776, |
| "loss/hidden": 0.14892578125, |
| "loss/logits": 0.01232942147180438, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.277, |
| "grad_norm": 1.0703125, |
| "grad_norm_var": 0.08737970987955729, |
| "learning_rate": 2e-05, |
| "loss": 0.1649, |
| "loss/crossentropy": 1.713157057762146, |
| "loss/hidden": 0.15283203125, |
| "loss/logits": 0.012066506315022707, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.278, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 0.1059234619140625, |
| "learning_rate": 2e-05, |
| "loss": 0.1488, |
| "loss/crossentropy": 2.7945786714553833, |
| "loss/hidden": 0.1357421875, |
| "loss/logits": 0.01302829384803772, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.279, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.1059234619140625, |
| "learning_rate": 2e-05, |
| "loss": 0.1874, |
| "loss/crossentropy": 1.5954543948173523, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.014592200517654419, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 3.875, |
| "grad_norm_var": 0.4753761291503906, |
| "learning_rate": 2e-05, |
| "loss": 0.1574, |
| "loss/crossentropy": 0.8256451673805714, |
| "loss/hidden": 0.14990234375, |
| "loss/logits": 0.007495811965782195, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.281, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 0.47818984985351565, |
| "learning_rate": 2e-05, |
| "loss": 0.1664, |
| "loss/crossentropy": 2.1294795870780945, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.015040545724332333, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.282, |
| "grad_norm": 1.125, |
| "grad_norm_var": 0.4849039713541667, |
| "learning_rate": 2e-05, |
| "loss": 0.1653, |
| "loss/crossentropy": 2.227915644645691, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.013931581284850836, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.283, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 0.48440729777018227, |
| "learning_rate": 2e-05, |
| "loss": 0.1845, |
| "loss/crossentropy": 2.2042417526245117, |
| "loss/hidden": 0.1669921875, |
| "loss/logits": 0.017517327796667814, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.284, |
| "grad_norm": 1.25, |
| "grad_norm_var": 0.4757057189941406, |
| "learning_rate": 2e-05, |
| "loss": 0.1747, |
| "loss/crossentropy": 1.6324898600578308, |
| "loss/hidden": 0.16162109375, |
| "loss/logits": 0.013037709519267082, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 1.984375, |
| "grad_norm_var": 0.46812744140625, |
| "learning_rate": 2e-05, |
| "loss": 0.184, |
| "loss/crossentropy": 1.3294448852539062, |
| "loss/hidden": 0.1708984375, |
| "loss/logits": 0.01308374060317874, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.286, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 0.4612701416015625, |
| "learning_rate": 2e-05, |
| "loss": 0.1663, |
| "loss/crossentropy": 2.046096980571747, |
| "loss/hidden": 0.1533203125, |
| "loss/logits": 0.012951537501066923, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.287, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 0.462103017171224, |
| "learning_rate": 2e-05, |
| "loss": 0.2072, |
| "loss/crossentropy": 1.6311957836151123, |
| "loss/hidden": 0.18994140625, |
| "loss/logits": 0.017279242165386677, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.46663792928059894, |
| "learning_rate": 2e-05, |
| "loss": 0.1617, |
| "loss/crossentropy": 2.061966359615326, |
| "loss/hidden": 0.1474609375, |
| "loss/logits": 0.014273086562752724, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.289, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 0.44436823527018227, |
| "learning_rate": 2e-05, |
| "loss": 0.2079, |
| "loss/crossentropy": 2.167171001434326, |
| "loss/hidden": 0.1884765625, |
| "loss/logits": 0.01938449963927269, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 0.4368235270182292, |
| "learning_rate": 2e-05, |
| "loss": 0.1681, |
| "loss/crossentropy": 1.8964014649391174, |
| "loss/hidden": 0.15478515625, |
| "loss/logits": 0.01332055265083909, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.291, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 0.4352801005045573, |
| "learning_rate": 2e-05, |
| "loss": 0.1799, |
| "loss/crossentropy": 1.615691602230072, |
| "loss/hidden": 0.16650390625, |
| "loss/logits": 0.013393020257353783, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.292, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 0.4498146057128906, |
| "learning_rate": 2e-05, |
| "loss": 0.1902, |
| "loss/crossentropy": 1.7846480011940002, |
| "loss/hidden": 0.17529296875, |
| "loss/logits": 0.014929667580872774, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.293, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 0.8888509114583333, |
| "learning_rate": 2e-05, |
| "loss": 0.1837, |
| "loss/crossentropy": 2.181917905807495, |
| "loss/hidden": 0.16845703125, |
| "loss/logits": 0.015237171668559313, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.294, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 0.9238189697265625, |
| "learning_rate": 2e-05, |
| "loss": 0.2001, |
| "loss/crossentropy": 1.679999053478241, |
| "loss/hidden": 0.18505859375, |
| "loss/logits": 0.015001565217971802, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 0.9490061442057292, |
| "learning_rate": 2e-05, |
| "loss": 0.1837, |
| "loss/crossentropy": 1.9540930390357971, |
| "loss/hidden": 0.16845703125, |
| "loss/logits": 0.015270334668457508, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 0.6432838439941406, |
| "learning_rate": 2e-05, |
| "loss": 0.1955, |
| "loss/crossentropy": 1.6934563517570496, |
| "loss/hidden": 0.181640625, |
| "loss/logits": 0.013896575663238764, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.297, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 0.6560015360514323, |
| "learning_rate": 2e-05, |
| "loss": 0.2318, |
| "loss/crossentropy": 2.22105610370636, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.0237618088722229, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.298, |
| "grad_norm": 0.9140625, |
| "grad_norm_var": 0.6743967692057292, |
| "learning_rate": 2e-05, |
| "loss": 0.1801, |
| "loss/crossentropy": 2.1219042539596558, |
| "loss/hidden": 0.16455078125, |
| "loss/logits": 0.015568919479846954, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.299, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 0.6894114176432292, |
| "learning_rate": 2e-05, |
| "loss": 0.1764, |
| "loss/crossentropy": 1.493699312210083, |
| "loss/hidden": 0.16357421875, |
| "loss/logits": 0.012840181589126587, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 2.234375, |
| "grad_norm_var": 0.6978068033854167, |
| "learning_rate": 2e-05, |
| "loss": 0.1802, |
| "loss/crossentropy": 2.2966129779815674, |
| "loss/hidden": 0.16455078125, |
| "loss/logits": 0.015650255605578423, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.301, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 0.7451700846354167, |
| "learning_rate": 2e-05, |
| "loss": 0.1937, |
| "loss/crossentropy": 1.6076778769493103, |
| "loss/hidden": 0.17919921875, |
| "loss/logits": 0.014486700296401978, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.302, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 0.7503985087076823, |
| "learning_rate": 2e-05, |
| "loss": 0.2056, |
| "loss/crossentropy": 1.8623589277267456, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.018080852460116148, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.303, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 0.7714670817057292, |
| "learning_rate": 2e-05, |
| "loss": 0.1809, |
| "loss/crossentropy": 2.4212971925735474, |
| "loss/hidden": 0.166015625, |
| "loss/logits": 0.014885799959301949, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 2.90625, |
| "grad_norm_var": 0.8329994201660156, |
| "learning_rate": 2e-05, |
| "loss": 0.2033, |
| "loss/crossentropy": 1.7328632473945618, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.018764227628707886, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 1.359375, |
| "grad_norm_var": 0.8465858459472656, |
| "learning_rate": 2e-05, |
| "loss": 0.1893, |
| "loss/crossentropy": 2.2132304906845093, |
| "loss/hidden": 0.1728515625, |
| "loss/logits": 0.01644426677376032, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.306, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 0.8658098856608073, |
| "learning_rate": 2e-05, |
| "loss": 0.1886, |
| "loss/crossentropy": 1.9263676404953003, |
| "loss/hidden": 0.17333984375, |
| "loss/logits": 0.015250771306455135, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.307, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 0.8772369384765625, |
| "learning_rate": 2e-05, |
| "loss": 0.1854, |
| "loss/crossentropy": 0.4892140328884125, |
| "loss/hidden": 0.17919921875, |
| "loss/logits": 0.006185333244502544, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.308, |
| "grad_norm": 2.109375, |
| "grad_norm_var": 0.84609375, |
| "learning_rate": 2e-05, |
| "loss": 0.1979, |
| "loss/crossentropy": 1.4508822858333588, |
| "loss/hidden": 0.1875, |
| "loss/logits": 0.010406092507764697, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.309, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 0.3905982971191406, |
| "learning_rate": 2e-05, |
| "loss": 0.202, |
| "loss/crossentropy": 2.148550570011139, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.01741566974669695, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 0.36989720662434894, |
| "learning_rate": 2e-05, |
| "loss": 0.1966, |
| "loss/crossentropy": 2.151822566986084, |
| "loss/hidden": 0.1787109375, |
| "loss/logits": 0.0178435780107975, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.311, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 0.341662343343099, |
| "learning_rate": 2e-05, |
| "loss": 0.1841, |
| "loss/crossentropy": 2.1770130395889282, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.016128853894770145, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 0.352129872639974, |
| "learning_rate": 2e-05, |
| "loss": 0.207, |
| "loss/crossentropy": 1.3690854907035828, |
| "loss/hidden": 0.19189453125, |
| "loss/logits": 0.015079677104949951, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.313, |
| "grad_norm": 0.921875, |
| "grad_norm_var": 0.376012929280599, |
| "learning_rate": 2e-05, |
| "loss": 0.1903, |
| "loss/crossentropy": 1.8874292969703674, |
| "loss/hidden": 0.17578125, |
| "loss/logits": 0.014515384566038847, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.314, |
| "grad_norm": 1.125, |
| "grad_norm_var": 0.3583730061848958, |
| "learning_rate": 2e-05, |
| "loss": 0.194, |
| "loss/crossentropy": 1.7909797430038452, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.014313298743218184, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 1.6640625, |
| "grad_norm_var": 0.33971532185872394, |
| "learning_rate": 2e-05, |
| "loss": 0.21, |
| "loss/crossentropy": 1.5393443405628204, |
| "loss/hidden": 0.19482421875, |
| "loss/logits": 0.015221260488033295, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.316, |
| "grad_norm": 26.0, |
| "grad_norm_var": 37.3775754292806, |
| "learning_rate": 2e-05, |
| "loss": 0.2622, |
| "loss/crossentropy": 2.1051180362701416, |
| "loss/hidden": 0.2412109375, |
| "loss/logits": 0.021030566655099392, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.317, |
| "grad_norm": 1.375, |
| "grad_norm_var": 37.565303293863934, |
| "learning_rate": 2e-05, |
| "loss": 0.1906, |
| "loss/crossentropy": 2.3774940967559814, |
| "loss/hidden": 0.17333984375, |
| "loss/logits": 0.01725342869758606, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.318, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 37.433223215738934, |
| "learning_rate": 2e-05, |
| "loss": 0.2138, |
| "loss/crossentropy": 1.8774001598358154, |
| "loss/hidden": 0.19580078125, |
| "loss/logits": 0.01802137354388833, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.319, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 37.271480305989584, |
| "learning_rate": 2e-05, |
| "loss": 0.1798, |
| "loss/crossentropy": 1.2233986854553223, |
| "loss/hidden": 0.16748046875, |
| "loss/logits": 0.01230758335441351, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 37.53408991495768, |
| "learning_rate": 2e-05, |
| "loss": 0.1821, |
| "loss/crossentropy": 2.3604530096054077, |
| "loss/hidden": 0.16748046875, |
| "loss/logits": 0.014640996232628822, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.321, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 37.58511454264323, |
| "learning_rate": 2e-05, |
| "loss": 0.1881, |
| "loss/crossentropy": 1.8407636880874634, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.013287198729813099, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.322, |
| "grad_norm": 1.2890625, |
| "grad_norm_var": 37.56233622233073, |
| "learning_rate": 2e-05, |
| "loss": 0.2141, |
| "loss/crossentropy": 1.9129706621170044, |
| "loss/hidden": 0.197265625, |
| "loss/logits": 0.01678755320608616, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.323, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 37.579777018229166, |
| "learning_rate": 2e-05, |
| "loss": 0.2385, |
| "loss/crossentropy": 1.4077640175819397, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.01974598690867424, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.324, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 37.52666422526042, |
| "learning_rate": 2e-05, |
| "loss": 0.1895, |
| "loss/crossentropy": 1.964367389678955, |
| "loss/hidden": 0.17529296875, |
| "loss/logits": 0.014245324768126011, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 37.459093983968096, |
| "learning_rate": 2e-05, |
| "loss": 0.182, |
| "loss/crossentropy": 1.522091805934906, |
| "loss/hidden": 0.16943359375, |
| "loss/logits": 0.012529378291219473, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.326, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 37.5768430074056, |
| "learning_rate": 2e-05, |
| "loss": 0.2034, |
| "loss/crossentropy": 2.062265932559967, |
| "loss/hidden": 0.18603515625, |
| "loss/logits": 0.01738209556788206, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.327, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 37.47470677693685, |
| "learning_rate": 2e-05, |
| "loss": 0.162, |
| "loss/crossentropy": 0.8921825066208839, |
| "loss/hidden": 0.15478515625, |
| "loss/logits": 0.007188015151768923, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 37.25564676920573, |
| "learning_rate": 2e-05, |
| "loss": 0.2076, |
| "loss/crossentropy": 1.4789501875638962, |
| "loss/hidden": 0.19677734375, |
| "loss/logits": 0.010850622318685055, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.329, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 36.95995178222656, |
| "learning_rate": 2e-05, |
| "loss": 0.2166, |
| "loss/crossentropy": 1.5635761618614197, |
| "loss/hidden": 0.19921875, |
| "loss/logits": 0.017354733310639858, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 1.359375, |
| "grad_norm_var": 36.895849609375, |
| "learning_rate": 2e-05, |
| "loss": 0.2199, |
| "loss/crossentropy": 2.017998516559601, |
| "loss/hidden": 0.20166015625, |
| "loss/logits": 0.018201622180640697, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.331, |
| "grad_norm": 1.1171875, |
| "grad_norm_var": 37.0338857014974, |
| "learning_rate": 2e-05, |
| "loss": 0.2122, |
| "loss/crossentropy": 2.3959954977035522, |
| "loss/hidden": 0.19287109375, |
| "loss/logits": 0.01937100477516651, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.332, |
| "grad_norm": 1.2109375, |
| "grad_norm_var": 0.3013689676920573, |
| "learning_rate": 2e-05, |
| "loss": 0.2154, |
| "loss/crossentropy": 1.6444975137710571, |
| "loss/hidden": 0.19970703125, |
| "loss/logits": 0.015708534978330135, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.333, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.295763905843099, |
| "learning_rate": 2e-05, |
| "loss": 0.2496, |
| "loss/crossentropy": 1.7237208485603333, |
| "loss/hidden": 0.2294921875, |
| "loss/logits": 0.020079893060028553, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.334, |
| "grad_norm": 2.25, |
| "grad_norm_var": 0.3007789611816406, |
| "learning_rate": 2e-05, |
| "loss": 0.2527, |
| "loss/crossentropy": 1.8400374054908752, |
| "loss/hidden": 0.23193359375, |
| "loss/logits": 0.020749946124851704, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 4.4375, |
| "grad_norm_var": 0.7596412658691406, |
| "learning_rate": 2e-05, |
| "loss": 0.2364, |
| "loss/crossentropy": 1.390014111995697, |
| "loss/hidden": 0.2197265625, |
| "loss/logits": 0.016656511463224888, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 0.7201026916503906, |
| "learning_rate": 2e-05, |
| "loss": 0.2185, |
| "loss/crossentropy": 1.7787038087844849, |
| "loss/hidden": 0.201171875, |
| "loss/logits": 0.017284206114709377, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.337, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 0.6773020426432291, |
| "learning_rate": 2e-05, |
| "loss": 0.2114, |
| "loss/crossentropy": 1.867686927318573, |
| "loss/hidden": 0.1943359375, |
| "loss/logits": 0.017048891633749008, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.338, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.6897857666015625, |
| "learning_rate": 2e-05, |
| "loss": 0.2197, |
| "loss/crossentropy": 1.9208934307098389, |
| "loss/hidden": 0.2021484375, |
| "loss/logits": 0.01750459522008896, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.339, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 0.7041481018066407, |
| "learning_rate": 2e-05, |
| "loss": 0.242, |
| "loss/crossentropy": 2.2355746626853943, |
| "loss/hidden": 0.2216796875, |
| "loss/logits": 0.020301150158047676, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 3.9375, |
| "grad_norm_var": 0.9257891337076823, |
| "learning_rate": 2e-05, |
| "loss": 0.2155, |
| "loss/crossentropy": 0.8867910504341125, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.007481162436306477, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.341, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 0.9399798075358073, |
| "learning_rate": 2e-05, |
| "loss": 0.2585, |
| "loss/crossentropy": 2.052453339099884, |
| "loss/hidden": 0.23486328125, |
| "loss/logits": 0.023593857884407043, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.342, |
| "grad_norm": 2.203125, |
| "grad_norm_var": 0.9115577697753906, |
| "learning_rate": 2e-05, |
| "loss": 0.2369, |
| "loss/crossentropy": 2.1617825031280518, |
| "loss/hidden": 0.21728515625, |
| "loss/logits": 0.0196513207629323, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.343, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 0.9168365478515625, |
| "learning_rate": 2e-05, |
| "loss": 0.227, |
| "loss/crossentropy": 1.9764072895050049, |
| "loss/hidden": 0.20703125, |
| "loss/logits": 0.0199996093288064, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 0.8982887268066406, |
| "learning_rate": 2e-05, |
| "loss": 0.2182, |
| "loss/crossentropy": 2.143546998500824, |
| "loss/hidden": 0.2001953125, |
| "loss/logits": 0.017965962179005146, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.8949989318847656, |
| "learning_rate": 2e-05, |
| "loss": 0.2493, |
| "loss/crossentropy": 1.4641490578651428, |
| "loss/hidden": 0.23095703125, |
| "loss/logits": 0.018342602998018265, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.346, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.8710731506347656, |
| "learning_rate": 2e-05, |
| "loss": 0.2466, |
| "loss/crossentropy": 2.1338253021240234, |
| "loss/hidden": 0.22509765625, |
| "loss/logits": 0.021518733352422714, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.347, |
| "grad_norm": 1.2421875, |
| "grad_norm_var": 0.8576047261555989, |
| "learning_rate": 2e-05, |
| "loss": 0.225, |
| "loss/crossentropy": 2.6952072381973267, |
| "loss/hidden": 0.205078125, |
| "loss/logits": 0.019927838817238808, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.348, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 0.815874989827474, |
| "learning_rate": 2e-05, |
| "loss": 0.2426, |
| "loss/crossentropy": 1.8981314897537231, |
| "loss/hidden": 0.22265625, |
| "loss/logits": 0.01995532214641571, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.349, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 0.7987363179524739, |
| "learning_rate": 2e-05, |
| "loss": 0.2394, |
| "loss/crossentropy": 2.5630762577056885, |
| "loss/hidden": 0.21826171875, |
| "loss/logits": 0.021146751008927822, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.8374834696451823, |
| "learning_rate": 2e-05, |
| "loss": 0.2219, |
| "loss/crossentropy": 1.913558542728424, |
| "loss/hidden": 0.20458984375, |
| "loss/logits": 0.017343497835099697, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.351, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 0.44841893513997394, |
| "learning_rate": 2e-05, |
| "loss": 0.2603, |
| "loss/crossentropy": 1.54945570230484, |
| "loss/hidden": 0.24072265625, |
| "loss/logits": 0.019581012427806854, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 1.6875, |
| "grad_norm_var": 0.45010579427083336, |
| "learning_rate": 2e-05, |
| "loss": 0.2255, |
| "loss/crossentropy": 1.4761220812797546, |
| "loss/hidden": 0.2099609375, |
| "loss/logits": 0.01557510020211339, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.353, |
| "grad_norm": 1.625, |
| "grad_norm_var": 0.45400797526041664, |
| "learning_rate": 2e-05, |
| "loss": 0.2422, |
| "loss/crossentropy": 1.6972084641456604, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.018569067120552063, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.354, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 0.4398752848307292, |
| "learning_rate": 2e-05, |
| "loss": 0.243, |
| "loss/crossentropy": 1.5384193658828735, |
| "loss/hidden": 0.224609375, |
| "loss/logits": 0.018395395018160343, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 0.4311358133951823, |
| "learning_rate": 2e-05, |
| "loss": 0.2595, |
| "loss/crossentropy": 1.7909427881240845, |
| "loss/hidden": 0.2373046875, |
| "loss/logits": 0.022242317907512188, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.356, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 0.13862889607747395, |
| "learning_rate": 2e-05, |
| "loss": 0.2252, |
| "loss/crossentropy": 2.2541953325271606, |
| "loss/hidden": 0.20654296875, |
| "loss/logits": 0.018652436323463917, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.357, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 0.16027425130208334, |
| "learning_rate": 2e-05, |
| "loss": 0.2262, |
| "loss/crossentropy": 2.2753015756607056, |
| "loss/hidden": 0.20654296875, |
| "loss/logits": 0.019669558852910995, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.358, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 0.14294408162434896, |
| "learning_rate": 2e-05, |
| "loss": 0.2585, |
| "loss/crossentropy": 1.8439677953720093, |
| "loss/hidden": 0.23974609375, |
| "loss/logits": 0.018735644407570362, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.359, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.15852228800455728, |
| "learning_rate": 2e-05, |
| "loss": 0.2257, |
| "loss/crossentropy": 1.6270447373390198, |
| "loss/hidden": 0.21044921875, |
| "loss/logits": 0.015203338116407394, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.21875, |
| "grad_norm_var": 0.16902567545572916, |
| "learning_rate": 2e-05, |
| "loss": 0.2411, |
| "loss/crossentropy": 1.6992469429969788, |
| "loss/hidden": 0.22314453125, |
| "loss/logits": 0.017964603379368782, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.361, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 0.14740397135416666, |
| "learning_rate": 2e-05, |
| "loss": 0.2516, |
| "loss/crossentropy": 2.1820708513259888, |
| "loss/hidden": 0.2294921875, |
| "loss/logits": 0.022074894048273563, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.362, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 0.14517186482747396, |
| "learning_rate": 2e-05, |
| "loss": 0.2587, |
| "loss/crossentropy": 1.7148206233978271, |
| "loss/hidden": 0.2392578125, |
| "loss/logits": 0.019486463628709316, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.363, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 0.14311930338541667, |
| "learning_rate": 2e-05, |
| "loss": 0.2428, |
| "loss/crossentropy": 1.8336674571037292, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.019144260324537754, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.364, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 0.14488296508789061, |
| "learning_rate": 2e-05, |
| "loss": 0.2436, |
| "loss/crossentropy": 2.184293031692505, |
| "loss/hidden": 0.22265625, |
| "loss/logits": 0.020982088521122932, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 0.13069229125976561, |
| "learning_rate": 2e-05, |
| "loss": 0.253, |
| "loss/crossentropy": 1.4488345384597778, |
| "loss/hidden": 0.23388671875, |
| "loss/logits": 0.01907930802553892, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.366, |
| "grad_norm": 1.0546875, |
| "grad_norm_var": 0.13852437337239584, |
| "learning_rate": 2e-05, |
| "loss": 0.2372, |
| "loss/crossentropy": 2.027945578098297, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.01842686627060175, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.367, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 0.056306711832682294, |
| "learning_rate": 2e-05, |
| "loss": 0.2299, |
| "loss/crossentropy": 1.4406660199165344, |
| "loss/hidden": 0.21337890625, |
| "loss/logits": 0.016529593151062727, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 0.11914850870768229, |
| "learning_rate": 2e-05, |
| "loss": 0.2864, |
| "loss/crossentropy": 1.579603135585785, |
| "loss/hidden": 0.2666015625, |
| "loss/logits": 0.019802499562501907, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.369, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 0.11738993326822916, |
| "learning_rate": 2e-05, |
| "loss": 0.2232, |
| "loss/crossentropy": 2.0089566707611084, |
| "loss/hidden": 0.20556640625, |
| "loss/logits": 0.017678971402347088, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 0.13430887858072918, |
| "learning_rate": 2e-05, |
| "loss": 0.2358, |
| "loss/crossentropy": 1.216122329235077, |
| "loss/hidden": 0.22412109375, |
| "loss/logits": 0.011710493825376034, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.371, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 0.13136367797851561, |
| "learning_rate": 2e-05, |
| "loss": 0.2407, |
| "loss/crossentropy": 1.7659806609153748, |
| "loss/hidden": 0.22412109375, |
| "loss/logits": 0.016615580767393112, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.372, |
| "grad_norm": 1.515625, |
| "grad_norm_var": 0.13127212524414061, |
| "learning_rate": 2e-05, |
| "loss": 0.252, |
| "loss/crossentropy": 2.137717843055725, |
| "loss/hidden": 0.2314453125, |
| "loss/logits": 0.020573250949382782, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.373, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.12837092081705728, |
| "learning_rate": 2e-05, |
| "loss": 0.2294, |
| "loss/crossentropy": 2.1520731449127197, |
| "loss/hidden": 0.21044921875, |
| "loss/logits": 0.018917559646070004, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.374, |
| "grad_norm": 1.140625, |
| "grad_norm_var": 0.13019205729166666, |
| "learning_rate": 2e-05, |
| "loss": 0.2299, |
| "loss/crossentropy": 2.2924171090126038, |
| "loss/hidden": 0.2119140625, |
| "loss/logits": 0.017971434630453587, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 0.12962137858072917, |
| "learning_rate": 2e-05, |
| "loss": 0.2368, |
| "loss/crossentropy": 2.1585444808006287, |
| "loss/hidden": 0.216796875, |
| "loss/logits": 0.02000956330448389, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 8.25, |
| "grad_norm_var": 3.006208292643229, |
| "learning_rate": 2e-05, |
| "loss": 0.2451, |
| "loss/crossentropy": 0.7908148150891066, |
| "loss/hidden": 0.23583984375, |
| "loss/logits": 0.009249582537449896, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.377, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 3.017010243733724, |
| "learning_rate": 2e-05, |
| "loss": 0.2249, |
| "loss/crossentropy": 1.7327390313148499, |
| "loss/hidden": 0.20751953125, |
| "loss/logits": 0.017398852854967117, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.378, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 3.005767567952474, |
| "learning_rate": 2e-05, |
| "loss": 0.2485, |
| "loss/crossentropy": 2.0554267168045044, |
| "loss/hidden": 0.2275390625, |
| "loss/logits": 0.020936082117259502, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.379, |
| "grad_norm": 4.25, |
| "grad_norm_var": 3.319152577718099, |
| "learning_rate": 2e-05, |
| "loss": 0.3606, |
| "loss/crossentropy": 2.4205610156059265, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.0383535772562027, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 3.262939198811849, |
| "learning_rate": 2e-05, |
| "loss": 0.2521, |
| "loss/crossentropy": 1.477292001247406, |
| "loss/hidden": 0.23583984375, |
| "loss/logits": 0.016257247421890497, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.381, |
| "grad_norm": 1.0546875, |
| "grad_norm_var": 3.3105377197265624, |
| "learning_rate": 2e-05, |
| "loss": 0.2395, |
| "loss/crossentropy": 2.5222301483154297, |
| "loss/hidden": 0.21630859375, |
| "loss/logits": 0.023189062252640724, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.382, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 3.278389485677083, |
| "learning_rate": 2e-05, |
| "loss": 0.259, |
| "loss/crossentropy": 2.0298832058906555, |
| "loss/hidden": 0.23828125, |
| "loss/logits": 0.020761173218488693, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.383, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 3.298315175374349, |
| "learning_rate": 2e-05, |
| "loss": 0.2819, |
| "loss/crossentropy": 0.7599294036626816, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.012356668477877975, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 3.321117146809896, |
| "learning_rate": 2e-05, |
| "loss": 0.2588, |
| "loss/crossentropy": 1.9294875860214233, |
| "loss/hidden": 0.23583984375, |
| "loss/logits": 0.02299057226628065, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 2.375, |
| "grad_norm_var": 3.297771962483724, |
| "learning_rate": 2e-05, |
| "loss": 0.3085, |
| "loss/crossentropy": 2.11811500787735, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.027242762967944145, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.386, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 3.309399159749349, |
| "learning_rate": 2e-05, |
| "loss": 0.2698, |
| "loss/crossentropy": 2.2003984451293945, |
| "loss/hidden": 0.24853515625, |
| "loss/logits": 0.021241911686956882, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.387, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 3.2902903238932293, |
| "learning_rate": 2e-05, |
| "loss": 0.2405, |
| "loss/crossentropy": 1.9465845227241516, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.019845230504870415, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.388, |
| "grad_norm": 1.8046875, |
| "grad_norm_var": 3.270407867431641, |
| "learning_rate": 2e-05, |
| "loss": 0.264, |
| "loss/crossentropy": 1.718444287776947, |
| "loss/hidden": 0.244140625, |
| "loss/logits": 0.01988315861672163, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.389, |
| "grad_norm": 1.5, |
| "grad_norm_var": 3.2317291259765626, |
| "learning_rate": 2e-05, |
| "loss": 0.2673, |
| "loss/crossentropy": 2.229490637779236, |
| "loss/hidden": 0.2431640625, |
| "loss/logits": 0.02413833886384964, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 3.1607236226399738, |
| "learning_rate": 2e-05, |
| "loss": 0.2541, |
| "loss/crossentropy": 1.289111077785492, |
| "loss/hidden": 0.24072265625, |
| "loss/logits": 0.013331972528249025, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.391, |
| "grad_norm": 1.2109375, |
| "grad_norm_var": 3.1563148498535156, |
| "learning_rate": 2e-05, |
| "loss": 0.2772, |
| "loss/crossentropy": 1.5863260626792908, |
| "loss/hidden": 0.2568359375, |
| "loss/logits": 0.020383677445352077, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 0.6135231018066406, |
| "learning_rate": 2e-05, |
| "loss": 0.2698, |
| "loss/crossentropy": 1.4361680746078491, |
| "loss/hidden": 0.2509765625, |
| "loss/logits": 0.01877568569034338, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.393, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 0.6045562744140625, |
| "learning_rate": 2e-05, |
| "loss": 0.2426, |
| "loss/crossentropy": 2.1396487951278687, |
| "loss/hidden": 0.22265625, |
| "loss/logits": 0.019969161599874496, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.394, |
| "grad_norm": 3.90625, |
| "grad_norm_var": 0.8598243713378906, |
| "learning_rate": 2e-05, |
| "loss": 0.3119, |
| "loss/crossentropy": 1.9341546297073364, |
| "loss/hidden": 0.2841796875, |
| "loss/logits": 0.02770126238465309, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 1.171875, |
| "grad_norm_var": 0.522753651936849, |
| "learning_rate": 2e-05, |
| "loss": 0.2504, |
| "loss/crossentropy": 2.2734580039978027, |
| "loss/hidden": 0.228515625, |
| "loss/logits": 0.02187348995357752, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.396, |
| "grad_norm": 4.375, |
| "grad_norm_var": 0.940179189046224, |
| "learning_rate": 2e-05, |
| "loss": 0.2926, |
| "loss/crossentropy": 1.1335339732468128, |
| "loss/hidden": 0.279296875, |
| "loss/logits": 0.013279704377055168, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.397, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.8845743815104167, |
| "learning_rate": 2e-05, |
| "loss": 0.2879, |
| "loss/crossentropy": 1.3619316220283508, |
| "loss/hidden": 0.2666015625, |
| "loss/logits": 0.02124913316220045, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.398, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.8496192932128906, |
| "learning_rate": 2e-05, |
| "loss": 0.2811, |
| "loss/crossentropy": 1.98111492395401, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.02325397450476885, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.399, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 0.8207435607910156, |
| "learning_rate": 2e-05, |
| "loss": 0.2818, |
| "loss/crossentropy": 2.389267683029175, |
| "loss/hidden": 0.2568359375, |
| "loss/logits": 0.02493403758853674, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 0.811944325764974, |
| "learning_rate": 2e-05, |
| "loss": 0.2543, |
| "loss/crossentropy": 2.0064558386802673, |
| "loss/hidden": 0.232421875, |
| "loss/logits": 0.021828239783644676, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.401, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.8026120503743489, |
| "learning_rate": 2e-05, |
| "loss": 0.2706, |
| "loss/crossentropy": 1.7220072150230408, |
| "loss/hidden": 0.2529296875, |
| "loss/logits": 0.01771449577063322, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.402, |
| "grad_norm": 77.0, |
| "grad_norm_var": 352.52654520670575, |
| "learning_rate": 2e-05, |
| "loss": 0.3571, |
| "loss/crossentropy": 2.084269881248474, |
| "loss/hidden": 0.3349609375, |
| "loss/logits": 0.022124722599983215, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.403, |
| "grad_norm": 3.765625, |
| "grad_norm_var": 351.38352762858074, |
| "learning_rate": 2e-05, |
| "loss": 0.2745, |
| "loss/crossentropy": 1.6723448634147644, |
| "loss/hidden": 0.2548828125, |
| "loss/logits": 0.01958293654024601, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.404, |
| "grad_norm": 2.125, |
| "grad_norm_var": 351.17644017537435, |
| "learning_rate": 2e-05, |
| "loss": 0.2731, |
| "loss/crossentropy": 1.742283046245575, |
| "loss/hidden": 0.25146484375, |
| "loss/logits": 0.021613112650811672, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 1.1328125, |
| "grad_norm_var": 351.4455078125, |
| "learning_rate": 2e-05, |
| "loss": 0.2683, |
| "loss/crossentropy": 2.223657548427582, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.02124713361263275, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.406, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 351.81150309244794, |
| "learning_rate": 2e-05, |
| "loss": 0.2949, |
| "loss/crossentropy": 1.8797453045845032, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.02535920962691307, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.407, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 351.68039321899414, |
| "learning_rate": 2e-05, |
| "loss": 0.2861, |
| "loss/crossentropy": 1.8714227080345154, |
| "loss/hidden": 0.26171875, |
| "loss/logits": 0.024350603111088276, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.408, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 351.53704198201496, |
| "learning_rate": 2e-05, |
| "loss": 0.2751, |
| "loss/crossentropy": 2.038064181804657, |
| "loss/hidden": 0.25244140625, |
| "loss/logits": 0.022680481895804405, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.409, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 351.04773534138997, |
| "learning_rate": 2e-05, |
| "loss": 0.2719, |
| "loss/crossentropy": 1.7517433166503906, |
| "loss/hidden": 0.25, |
| "loss/logits": 0.021894831210374832, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 4.0, |
| "grad_norm_var": 351.0116330464681, |
| "learning_rate": 2e-05, |
| "loss": 0.3178, |
| "loss/crossentropy": 1.0839223191142082, |
| "loss/hidden": 0.29833984375, |
| "loss/logits": 0.019504179246723652, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.411, |
| "grad_norm": 1.3125, |
| "grad_norm_var": 350.9065121968587, |
| "learning_rate": 2e-05, |
| "loss": 0.2768, |
| "loss/crossentropy": 2.5323891639709473, |
| "loss/hidden": 0.2529296875, |
| "loss/logits": 0.023888778872787952, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.412, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 352.3142079671224, |
| "learning_rate": 2e-05, |
| "loss": 0.2754, |
| "loss/crossentropy": 1.5777837038040161, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.017572961747646332, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.413, |
| "grad_norm": 1.359375, |
| "grad_norm_var": 352.8437082926432, |
| "learning_rate": 2e-05, |
| "loss": 0.2789, |
| "loss/crossentropy": 2.079995810985565, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.02109308261424303, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.414, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 352.9843584696452, |
| "learning_rate": 2e-05, |
| "loss": 0.3105, |
| "loss/crossentropy": 1.5966813564300537, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.025356116704642773, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 353.1186930338542, |
| "learning_rate": 2e-05, |
| "loss": 0.3049, |
| "loss/crossentropy": 2.1346817016601562, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.027529660612344742, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 2.15625, |
| "grad_norm_var": 352.79944229125977, |
| "learning_rate": 2e-05, |
| "loss": 0.2757, |
| "loss/crossentropy": 1.3576586246490479, |
| "loss/hidden": 0.25634765625, |
| "loss/logits": 0.01940206252038479, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.417, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 353.089884185791, |
| "learning_rate": 2e-05, |
| "loss": 0.3062, |
| "loss/crossentropy": 1.5645692944526672, |
| "loss/hidden": 0.2822265625, |
| "loss/logits": 0.02394524496048689, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.418, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 0.7133056640625, |
| "learning_rate": 2e-05, |
| "loss": 0.2676, |
| "loss/crossentropy": 2.1792179346084595, |
| "loss/hidden": 0.24560546875, |
| "loss/logits": 0.022001913748681545, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.419, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 0.4785552978515625, |
| "learning_rate": 2e-05, |
| "loss": 0.2776, |
| "loss/crossentropy": 1.8772451281547546, |
| "loss/hidden": 0.255859375, |
| "loss/logits": 0.02171818818897009, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.4763580322265625, |
| "learning_rate": 2e-05, |
| "loss": 0.2993, |
| "loss/crossentropy": 2.191072165966034, |
| "loss/hidden": 0.2705078125, |
| "loss/logits": 0.0287649966776371, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.421, |
| "grad_norm": 3.328125, |
| "grad_norm_var": 0.5920550028483073, |
| "learning_rate": 2e-05, |
| "loss": 0.2893, |
| "loss/crossentropy": 2.4681068658828735, |
| "loss/hidden": 0.2626953125, |
| "loss/logits": 0.02658757194876671, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.422, |
| "grad_norm": 1.953125, |
| "grad_norm_var": 0.5703776041666667, |
| "learning_rate": 2e-05, |
| "loss": 0.2574, |
| "loss/crossentropy": 1.9942094683647156, |
| "loss/hidden": 0.2392578125, |
| "loss/logits": 0.018155298195779324, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.423, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 0.5532671610514323, |
| "learning_rate": 2e-05, |
| "loss": 0.379, |
| "loss/crossentropy": 1.6838626861572266, |
| "loss/hidden": 0.3466796875, |
| "loss/logits": 0.0323002003133297, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.424, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 0.5469065348307292, |
| "learning_rate": 2e-05, |
| "loss": 0.2931, |
| "loss/crossentropy": 1.897689163684845, |
| "loss/hidden": 0.2724609375, |
| "loss/logits": 0.020671049132943153, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 1.375, |
| "grad_norm_var": 0.5600504557291667, |
| "learning_rate": 2e-05, |
| "loss": 0.3171, |
| "loss/crossentropy": 2.0812936425209045, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.02603732794523239, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.426, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 0.2536699930826823, |
| "learning_rate": 2e-05, |
| "loss": 0.3041, |
| "loss/crossentropy": 1.9111879467964172, |
| "loss/hidden": 0.2802734375, |
| "loss/logits": 0.02386578731238842, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.427, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 0.2541412353515625, |
| "learning_rate": 2e-05, |
| "loss": 0.3047, |
| "loss/crossentropy": 2.118652582168579, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.028314806520938873, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.428, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 0.2670448303222656, |
| "learning_rate": 2e-05, |
| "loss": 0.2985, |
| "loss/crossentropy": 2.3579596281051636, |
| "loss/hidden": 0.2705078125, |
| "loss/logits": 0.02802193909883499, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.429, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 0.2621009826660156, |
| "learning_rate": 2e-05, |
| "loss": 0.3687, |
| "loss/crossentropy": 1.604810118675232, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.03476274199783802, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 0.2784088134765625, |
| "learning_rate": 2e-05, |
| "loss": 0.3199, |
| "loss/crossentropy": 1.8773449063301086, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.023035001009702682, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.431, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.26665420532226564, |
| "learning_rate": 2e-05, |
| "loss": 0.291, |
| "loss/crossentropy": 2.3691608905792236, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.023443943820893764, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.2634429931640625, |
| "learning_rate": 2e-05, |
| "loss": 0.2813, |
| "loss/crossentropy": 1.1586915850639343, |
| "loss/hidden": 0.26513671875, |
| "loss/logits": 0.01613916177302599, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.433, |
| "grad_norm": 6.21875, |
| "grad_norm_var": 1.4436116536458334, |
| "learning_rate": 2e-05, |
| "loss": 0.3126, |
| "loss/crossentropy": 1.3897653669118881, |
| "loss/hidden": 0.2958984375, |
| "loss/logits": 0.016718640457838774, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.434, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 1.4577677408854166, |
| "learning_rate": 2e-05, |
| "loss": 0.3143, |
| "loss/crossentropy": 1.4730547070503235, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.023296916857361794, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 1.4572794596354166, |
| "learning_rate": 2e-05, |
| "loss": 0.315, |
| "loss/crossentropy": 1.745673418045044, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.02297977078706026, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.436, |
| "grad_norm": 2.5, |
| "grad_norm_var": 1.43983154296875, |
| "learning_rate": 2e-05, |
| "loss": 0.2914, |
| "loss/crossentropy": 1.4162335693836212, |
| "loss/hidden": 0.275390625, |
| "loss/logits": 0.016047537326812744, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.437, |
| "grad_norm": 1.4609375, |
| "grad_norm_var": 1.3674415588378905, |
| "learning_rate": 2e-05, |
| "loss": 0.3182, |
| "loss/crossentropy": 2.154849946498871, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.026252766139805317, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.438, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 1.3746986389160156, |
| "learning_rate": 2e-05, |
| "loss": 0.2885, |
| "loss/crossentropy": 2.0660600662231445, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.022861075587570667, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.439, |
| "grad_norm": 7.15625, |
| "grad_norm_var": 2.9645100911458333, |
| "learning_rate": 2e-05, |
| "loss": 0.3, |
| "loss/crossentropy": 2.717895984649658, |
| "loss/hidden": 0.27392578125, |
| "loss/logits": 0.026059124618768692, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 13.0625, |
| "grad_norm_var": 9.962597401936849, |
| "learning_rate": 2e-05, |
| "loss": 0.3237, |
| "loss/crossentropy": 2.089266359806061, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.02680843137204647, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.441, |
| "grad_norm": 3.46875, |
| "grad_norm_var": 9.749269358317058, |
| "learning_rate": 2e-05, |
| "loss": 0.3274, |
| "loss/crossentropy": 1.671483427286148, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.018828653264790773, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.442, |
| "grad_norm": 3.171875, |
| "grad_norm_var": 9.54685770670573, |
| "learning_rate": 2e-05, |
| "loss": 0.3257, |
| "loss/crossentropy": 1.40052130818367, |
| "loss/hidden": 0.3037109375, |
| "loss/logits": 0.021975211799144745, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.443, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 9.41304423014323, |
| "learning_rate": 2e-05, |
| "loss": 0.3262, |
| "loss/crossentropy": 2.070408821105957, |
| "loss/hidden": 0.2958984375, |
| "loss/logits": 0.030301526188850403, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.444, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 9.161588541666667, |
| "learning_rate": 2e-05, |
| "loss": 0.3014, |
| "loss/crossentropy": 1.219970703125, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.014268356142565608, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 9.067455037434895, |
| "learning_rate": 2e-05, |
| "loss": 0.2696, |
| "loss/crossentropy": 0.8407798707485199, |
| "loss/hidden": 0.2578125, |
| "loss/logits": 0.011814095778390765, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.446, |
| "grad_norm": 3.25, |
| "grad_norm_var": 8.97071533203125, |
| "learning_rate": 2e-05, |
| "loss": 0.3224, |
| "loss/crossentropy": 0.8129114657640457, |
| "loss/hidden": 0.30859375, |
| "loss/logits": 0.013845205074176192, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.447, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 8.84253641764323, |
| "learning_rate": 2e-05, |
| "loss": 0.3003, |
| "loss/crossentropy": 1.0620581209659576, |
| "loss/hidden": 0.2841796875, |
| "loss/logits": 0.016078725922852755, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 8.682106272379558, |
| "learning_rate": 2e-05, |
| "loss": 0.3515, |
| "loss/crossentropy": 2.4432766437530518, |
| "loss/hidden": 0.3193359375, |
| "loss/logits": 0.03212443180382252, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.449, |
| "grad_norm": 3.0, |
| "grad_norm_var": 8.2694943745931, |
| "learning_rate": 2e-05, |
| "loss": 0.2823, |
| "loss/crossentropy": 1.0543333142995834, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.014680951833724976, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.3125, |
| "grad_norm_var": 8.329198201497396, |
| "learning_rate": 2e-05, |
| "loss": 0.3061, |
| "loss/crossentropy": 2.019322693347931, |
| "loss/hidden": 0.2802734375, |
| "loss/logits": 0.025797588750720024, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.451, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 8.470213826497396, |
| "learning_rate": 2e-05, |
| "loss": 0.3175, |
| "loss/crossentropy": 1.7490596175193787, |
| "loss/hidden": 0.2939453125, |
| "loss/logits": 0.023543373681604862, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.452, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 8.466120402018229, |
| "learning_rate": 2e-05, |
| "loss": 0.3217, |
| "loss/crossentropy": 1.759113371372223, |
| "loss/hidden": 0.2978515625, |
| "loss/logits": 0.02383426111191511, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.453, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 8.433128865559896, |
| "learning_rate": 2e-05, |
| "loss": 0.3027, |
| "loss/crossentropy": 2.3126111030578613, |
| "loss/hidden": 0.2783203125, |
| "loss/logits": 0.024376308545470238, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.454, |
| "grad_norm": 1.703125, |
| "grad_norm_var": 8.567207845052083, |
| "learning_rate": 2e-05, |
| "loss": 0.3403, |
| "loss/crossentropy": 1.5542563199996948, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.026786498725414276, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 7.744832102457682, |
| "learning_rate": 2e-05, |
| "loss": 0.3387, |
| "loss/crossentropy": 1.7838309407234192, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.02520835865288973, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.456, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 0.7266741434733073, |
| "learning_rate": 2e-05, |
| "loss": 0.389, |
| "loss/crossentropy": 1.2347190976142883, |
| "loss/hidden": 0.3583984375, |
| "loss/logits": 0.030589699745178223, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.457, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 0.69765625, |
| "learning_rate": 2e-05, |
| "loss": 0.3172, |
| "loss/crossentropy": 1.9718384146690369, |
| "loss/hidden": 0.2900390625, |
| "loss/logits": 0.02716031763702631, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.458, |
| "grad_norm": 1.3984375, |
| "grad_norm_var": 0.6900937398274739, |
| "learning_rate": 2e-05, |
| "loss": 0.311, |
| "loss/crossentropy": 2.390895366668701, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.02580439206212759, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.459, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 0.7320149739583334, |
| "learning_rate": 2e-05, |
| "loss": 0.2917, |
| "loss/crossentropy": 1.794322669506073, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.02415597066283226, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 0.727898915608724, |
| "learning_rate": 2e-05, |
| "loss": 0.3103, |
| "loss/crossentropy": 1.9104264378547668, |
| "loss/hidden": 0.2861328125, |
| "loss/logits": 0.024162941612303257, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.461, |
| "grad_norm": 1.0625, |
| "grad_norm_var": 0.769341786702474, |
| "learning_rate": 2e-05, |
| "loss": 0.314, |
| "loss/crossentropy": 1.9037153720855713, |
| "loss/hidden": 0.2900390625, |
| "loss/logits": 0.02395364549010992, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.462, |
| "grad_norm": 3.59375, |
| "grad_norm_var": 0.833056386311849, |
| "learning_rate": 2e-05, |
| "loss": 0.3334, |
| "loss/crossentropy": 0.9103630632162094, |
| "loss/hidden": 0.314453125, |
| "loss/logits": 0.018936143023893237, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.463, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 0.48889134724934896, |
| "learning_rate": 2e-05, |
| "loss": 0.4235, |
| "loss/crossentropy": 1.9410001635551453, |
| "loss/hidden": 0.384765625, |
| "loss/logits": 0.03869070205837488, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.5080523173014323, |
| "learning_rate": 2e-05, |
| "loss": 0.3223, |
| "loss/crossentropy": 2.3750853538513184, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.030337156727910042, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 0.42988688151041665, |
| "learning_rate": 2e-05, |
| "loss": 0.3482, |
| "loss/crossentropy": 1.9180442690849304, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.025961963459849358, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.466, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 0.41601740519205727, |
| "learning_rate": 2e-05, |
| "loss": 0.3814, |
| "loss/crossentropy": 2.180357873439789, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.03377598337829113, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.467, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 0.4153032938639323, |
| "learning_rate": 2e-05, |
| "loss": 0.3298, |
| "loss/crossentropy": 2.112913489341736, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.027080713771283627, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.468, |
| "grad_norm": 2.890625, |
| "grad_norm_var": 0.4589617411295573, |
| "learning_rate": 2e-05, |
| "loss": 0.3384, |
| "loss/crossentropy": 1.643601417541504, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.024958825670182705, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.469, |
| "grad_norm": 6.375, |
| "grad_norm_var": 1.7486724853515625, |
| "learning_rate": 2e-05, |
| "loss": 0.3684, |
| "loss/crossentropy": 1.8053930401802063, |
| "loss/hidden": 0.3369140625, |
| "loss/logits": 0.03144758567214012, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 3.109375, |
| "grad_norm_var": 1.7959136962890625, |
| "learning_rate": 2e-05, |
| "loss": 0.3813, |
| "loss/crossentropy": 1.2240911722183228, |
| "loss/hidden": 0.359375, |
| "loss/logits": 0.021923545747995377, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.471, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 1.8306304931640625, |
| "learning_rate": 2e-05, |
| "loss": 0.3026, |
| "loss/crossentropy": 2.474206805229187, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.02618865016847849, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.472, |
| "grad_norm": 3.6875, |
| "grad_norm_var": 1.9708740234375, |
| "learning_rate": 2e-05, |
| "loss": 0.3763, |
| "loss/crossentropy": 1.8918054699897766, |
| "loss/hidden": 0.3447265625, |
| "loss/logits": 0.03158373944461346, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.473, |
| "grad_norm": 5.21875, |
| "grad_norm_var": 2.4487037658691406, |
| "learning_rate": 2e-05, |
| "loss": 0.4363, |
| "loss/crossentropy": 1.8472670912742615, |
| "loss/hidden": 0.3857421875, |
| "loss/logits": 0.05057228542864323, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.474, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 2.44078369140625, |
| "learning_rate": 2e-05, |
| "loss": 0.3664, |
| "loss/crossentropy": 2.0877062678337097, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.030460949055850506, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 2.3744466145833334, |
| "learning_rate": 2e-05, |
| "loss": 0.3774, |
| "loss/crossentropy": 1.695314645767212, |
| "loss/hidden": 0.34765625, |
| "loss/logits": 0.02971694804728031, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.476, |
| "grad_norm": 1.375, |
| "grad_norm_var": 2.4341916402180988, |
| "learning_rate": 2e-05, |
| "loss": 0.314, |
| "loss/crossentropy": 2.7232651710510254, |
| "loss/hidden": 0.2861328125, |
| "loss/logits": 0.02784702740609646, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.477, |
| "grad_norm": 20.375, |
| "grad_norm_var": 22.00192845662435, |
| "learning_rate": 2e-05, |
| "loss": 0.4724, |
| "loss/crossentropy": 1.8712067008018494, |
| "loss/hidden": 0.4365234375, |
| "loss/logits": 0.03585300501435995, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.478, |
| "grad_norm": 17.125, |
| "grad_norm_var": 33.21189956665039, |
| "learning_rate": 2e-05, |
| "loss": 0.4076, |
| "loss/crossentropy": 1.0799504667520523, |
| "loss/hidden": 0.3857421875, |
| "loss/logits": 0.021893550641834736, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.479, |
| "grad_norm": 11.3125, |
| "grad_norm_var": 35.67211888631185, |
| "learning_rate": 2e-05, |
| "loss": 0.3711, |
| "loss/crossentropy": 2.385028600692749, |
| "loss/hidden": 0.3408203125, |
| "loss/logits": 0.030241595581173897, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 35.516621653238936, |
| "learning_rate": 2e-05, |
| "loss": 0.3033, |
| "loss/crossentropy": 2.070383071899414, |
| "loss/hidden": 0.2783203125, |
| "loss/logits": 0.02501996699720621, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.481, |
| "grad_norm": 1.78125, |
| "grad_norm_var": 35.30360514322917, |
| "learning_rate": 2e-05, |
| "loss": 0.342, |
| "loss/crossentropy": 2.150891959667206, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.028521432541310787, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.482, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 35.091365305582684, |
| "learning_rate": 2e-05, |
| "loss": 0.3651, |
| "loss/crossentropy": 1.959929347038269, |
| "loss/hidden": 0.3330078125, |
| "loss/logits": 0.032120613381266594, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.483, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 34.89572347005208, |
| "learning_rate": 2e-05, |
| "loss": 0.4069, |
| "loss/crossentropy": 0.9524770379066467, |
| "loss/hidden": 0.3857421875, |
| "loss/logits": 0.02115157339721918, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.484, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 34.99875081380208, |
| "learning_rate": 2e-05, |
| "loss": 0.365, |
| "loss/crossentropy": 1.7186467051506042, |
| "loss/hidden": 0.3349609375, |
| "loss/logits": 0.030065175145864487, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 1.7109375, |
| "grad_norm_var": 35.6259396870931, |
| "learning_rate": 2e-05, |
| "loss": 0.3548, |
| "loss/crossentropy": 1.7490887641906738, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.02866003941744566, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.486, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 36.24727783203125, |
| "learning_rate": 2e-05, |
| "loss": 0.351, |
| "loss/crossentropy": 1.9811639785766602, |
| "loss/hidden": 0.322265625, |
| "loss/logits": 0.028715823777019978, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.487, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 35.993001302083336, |
| "learning_rate": 2e-05, |
| "loss": 0.3115, |
| "loss/crossentropy": 1.5961838364601135, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.02241756021976471, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.488, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 36.615944163004556, |
| "learning_rate": 2e-05, |
| "loss": 0.3399, |
| "loss/crossentropy": 2.1186224818229675, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.026388862170279026, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.489, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 36.75027847290039, |
| "learning_rate": 2e-05, |
| "loss": 0.3319, |
| "loss/crossentropy": 2.3766279220581055, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.029136340133845806, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 3.1875, |
| "grad_norm_var": 36.21890029907227, |
| "learning_rate": 2e-05, |
| "loss": 0.3622, |
| "loss/crossentropy": 2.3046228289604187, |
| "loss/hidden": 0.3330078125, |
| "loss/logits": 0.029148480854928493, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.491, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 36.4323476155599, |
| "learning_rate": 2e-05, |
| "loss": 0.3253, |
| "loss/crossentropy": 2.0606563687324524, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.026501288637518883, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.492, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 36.22162653605143, |
| "learning_rate": 2e-05, |
| "loss": 0.3576, |
| "loss/crossentropy": 2.0364081263542175, |
| "loss/hidden": 0.3271484375, |
| "loss/logits": 0.030420588329434395, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.493, |
| "grad_norm": 1.25, |
| "grad_norm_var": 19.040254465738933, |
| "learning_rate": 2e-05, |
| "loss": 0.3372, |
| "loss/crossentropy": 1.996462881565094, |
| "loss/hidden": 0.3095703125, |
| "loss/logits": 0.027610108256340027, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.494, |
| "grad_norm": 1.3671875, |
| "grad_norm_var": 5.884635416666667, |
| "learning_rate": 2e-05, |
| "loss": 0.3332, |
| "loss/crossentropy": 2.0653313398361206, |
| "loss/hidden": 0.3076171875, |
| "loss/logits": 0.025608508847653866, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 11.1875, |
| "grad_norm_var": 5.738606770833333, |
| "learning_rate": 2e-05, |
| "loss": 0.4247, |
| "loss/crossentropy": 2.0056963562965393, |
| "loss/hidden": 0.39453125, |
| "loss/logits": 0.0301496759057045, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 5.756310780843099, |
| "learning_rate": 2e-05, |
| "loss": 0.3424, |
| "loss/crossentropy": 2.031468689441681, |
| "loss/hidden": 0.3154296875, |
| "loss/logits": 0.026999052613973618, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.497, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 5.809959920247396, |
| "learning_rate": 2e-05, |
| "loss": 0.3332, |
| "loss/crossentropy": 1.8416547179222107, |
| "loss/hidden": 0.3076171875, |
| "loss/logits": 0.025574706494808197, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.498, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 5.882696278889974, |
| "learning_rate": 2e-05, |
| "loss": 0.3496, |
| "loss/crossentropy": 2.2838199138641357, |
| "loss/hidden": 0.3193359375, |
| "loss/logits": 0.030256418511271477, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.499, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 5.922606404622396, |
| "learning_rate": 2e-05, |
| "loss": 0.3806, |
| "loss/crossentropy": 1.759089708328247, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.031000351533293724, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 5.930489095052083, |
| "learning_rate": 2e-05, |
| "loss": 0.3987, |
| "loss/crossentropy": 1.4412594437599182, |
| "loss/hidden": 0.37109375, |
| "loss/logits": 0.027633181773126125, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|