| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5, |
| "eval_steps": 250, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001, |
| "grad_norm": 0.000606536865234375, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.0002, |
| "loss/crossentropy": 0.8867233544588089, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.00023235346816363744, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 0.2392578125, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.0051, |
| "loss/crossentropy": 2.0406004190444946, |
| "loss/hidden": 0.003997802734375, |
| "loss/logits": 0.0010988512658514082, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 0.234375, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.0051, |
| "loss/crossentropy": 1.8371902108192444, |
| "loss/hidden": 0.0041351318359375, |
| "loss/logits": 0.0009956851426977664, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 0.263671875, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.0049, |
| "loss/crossentropy": 1.105260580778122, |
| "loss/hidden": 0.00402069091796875, |
| "loss/logits": 0.000880763225723058, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 0.2265625, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0049, |
| "loss/crossentropy": 1.8291524648666382, |
| "loss/hidden": 0.003875732421875, |
| "loss/logits": 0.0010709252674132586, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 0.2890625, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.0051, |
| "loss/crossentropy": 2.4322463274002075, |
| "loss/hidden": 0.00389862060546875, |
| "loss/logits": 0.0012143508065491915, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 0.5, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.0056, |
| "loss/crossentropy": 2.0112226605415344, |
| "loss/hidden": 0.004486083984375, |
| "loss/logits": 0.0011396315530873835, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 1.2109375, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.005, |
| "loss/crossentropy": 2.319802165031433, |
| "loss/hidden": 0.00384521484375, |
| "loss/logits": 0.0011385786347091198, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.005, |
| "loss/crossentropy": 2.2094499468803406, |
| "loss/hidden": 0.0038909912109375, |
| "loss/logits": 0.001147054077591747, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 0.30859375, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0053, |
| "loss/crossentropy": 1.7928503155708313, |
| "loss/hidden": 0.004180908203125, |
| "loss/logits": 0.0010775169357657433, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 2.2e-06, |
| "loss": 0.0049, |
| "loss/crossentropy": 2.4660322666168213, |
| "loss/hidden": 0.00377655029296875, |
| "loss/logits": 0.0011588135384954512, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 0.4140625, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0071, |
| "loss/crossentropy": 1.9250787496566772, |
| "loss/hidden": 0.00592041015625, |
| "loss/logits": 0.001213293697219342, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 0.5859375, |
| "learning_rate": 2.6e-06, |
| "loss": 0.0083, |
| "loss/crossentropy": 1.7589636445045471, |
| "loss/hidden": 0.0070343017578125, |
| "loss/logits": 0.0012634693994186819, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 0.3984375, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.0076, |
| "loss/crossentropy": 2.017001152038574, |
| "loss/hidden": 0.0063629150390625, |
| "loss/logits": 0.0012496507843025029, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 0.453125, |
| "learning_rate": 3e-06, |
| "loss": 0.0072, |
| "loss/crossentropy": 2.017792582511902, |
| "loss/hidden": 0.0059356689453125, |
| "loss/logits": 0.0012571557890623808, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.337890625, |
| "grad_norm_var": 0.06922563040176707, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0071, |
| "loss/crossentropy": 2.4987382888793945, |
| "loss/hidden": 0.0056915283203125, |
| "loss/logits": 0.0014035521890036762, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 0.73828125, |
| "grad_norm_var": 0.06696637074152628, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.0076, |
| "loss/crossentropy": 1.826792299747467, |
| "loss/hidden": 0.006317138671875, |
| "loss/logits": 0.0013136997004039586, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 0.55078125, |
| "grad_norm_var": 0.06571272214253744, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0068, |
| "loss/crossentropy": 1.126530610024929, |
| "loss/hidden": 0.005828857421875, |
| "loss/logits": 0.000935775664402172, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 0.32421875, |
| "grad_norm_var": 0.06381465593973795, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.0069, |
| "loss/crossentropy": 1.908576250076294, |
| "loss/hidden": 0.005615234375, |
| "loss/logits": 0.0012609789846464992, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.390625, |
| "grad_norm_var": 0.06182791392008464, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.007, |
| "loss/crossentropy": 1.7991210222244263, |
| "loss/hidden": 0.0058441162109375, |
| "loss/logits": 0.0011686266516335309, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 0.3046875, |
| "grad_norm_var": 0.05989767710367839, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.007, |
| "loss/crossentropy": 2.1353321075439453, |
| "loss/hidden": 0.005706787109375, |
| "loss/logits": 0.0012753225746564567, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 0.443359375, |
| "grad_norm_var": 0.05800538063049317, |
| "learning_rate": 4.4e-06, |
| "loss": 0.0102, |
| "loss/crossentropy": 1.9959095120429993, |
| "loss/hidden": 0.008544921875, |
| "loss/logits": 0.0016535300528630614, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 0.58984375, |
| "grad_norm_var": 0.058952951431274415, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.0117, |
| "loss/crossentropy": 1.6306341290473938, |
| "loss/hidden": 0.0101318359375, |
| "loss/logits": 0.0015702435630373657, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 0.609375, |
| "grad_norm_var": 0.0220308780670166, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0101, |
| "loss/crossentropy": 1.1691604554653168, |
| "loss/hidden": 0.009002685546875, |
| "loss/logits": 0.0011398608330637217, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 0.404296875, |
| "grad_norm_var": 0.01901772419611613, |
| "learning_rate": 5e-06, |
| "loss": 0.0094, |
| "loss/crossentropy": 2.1158002614974976, |
| "loss/hidden": 0.0078582763671875, |
| "loss/logits": 0.001586797763593495, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.026, |
| "grad_norm": 0.9375, |
| "grad_norm_var": 0.03251402775446574, |
| "learning_rate": 5.2e-06, |
| "loss": 0.0112, |
| "loss/crossentropy": 0.9760683104395866, |
| "loss/hidden": 0.010345458984375, |
| "loss/logits": 0.0009004934981931001, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.027, |
| "grad_norm": 0.59375, |
| "grad_norm_var": 0.028410832087198894, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 0.0092, |
| "loss/crossentropy": 1.3714375793933868, |
| "loss/hidden": 0.00799560546875, |
| "loss/logits": 0.0011652775574475527, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 0.0805971622467041, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 0.0108, |
| "loss/crossentropy": 1.6821632981300354, |
| "loss/hidden": 0.00921630859375, |
| "loss/logits": 0.0015629479894414544, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.029, |
| "grad_norm": 0.439453125, |
| "grad_norm_var": 0.08159255981445312, |
| "learning_rate": 5.8e-06, |
| "loss": 0.0096, |
| "loss/crossentropy": 1.814743161201477, |
| "loss/hidden": 0.0081329345703125, |
| "loss/logits": 0.001480952720157802, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.03, |
| "grad_norm": 0.306640625, |
| "grad_norm_var": 0.08408544858296713, |
| "learning_rate": 6e-06, |
| "loss": 0.0095, |
| "loss/crossentropy": 1.7807061672210693, |
| "loss/hidden": 0.00811767578125, |
| "loss/logits": 0.0013878631871193647, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.031, |
| "grad_norm": 0.45703125, |
| "grad_norm_var": 0.08403420448303223, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 0.0095, |
| "loss/crossentropy": 1.4871117770671844, |
| "loss/hidden": 0.0081329345703125, |
| "loss/logits": 0.0013768199132755399, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.66015625, |
| "grad_norm_var": 0.08125686645507812, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.0139, |
| "loss/crossentropy": 1.6797687411308289, |
| "loss/hidden": 0.012054443359375, |
| "loss/logits": 0.001833194401115179, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.033, |
| "grad_norm": 0.7890625, |
| "grad_norm_var": 0.08253218332926432, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.0144, |
| "loss/crossentropy": 1.0567285418510437, |
| "loss/hidden": 0.01312255859375, |
| "loss/logits": 0.00127064943080768, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.034, |
| "grad_norm": 0.40234375, |
| "grad_norm_var": 0.08442630767822265, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 0.0133, |
| "loss/crossentropy": 2.0356476306915283, |
| "loss/hidden": 0.011383056640625, |
| "loss/logits": 0.0018824898870661855, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.035, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 0.12089309692382813, |
| "learning_rate": 7e-06, |
| "loss": 0.0141, |
| "loss/crossentropy": 1.8682858347892761, |
| "loss/hidden": 0.012115478515625, |
| "loss/logits": 0.00193694734480232, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 0.451171875, |
| "grad_norm_var": 0.11915523211161295, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.0137, |
| "loss/crossentropy": 1.9475785493850708, |
| "loss/hidden": 0.011871337890625, |
| "loss/logits": 0.0018277816707268357, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.037, |
| "grad_norm": 72.5, |
| "grad_norm_var": 322.67027967770895, |
| "learning_rate": 7.4e-06, |
| "loss": 0.0547, |
| "loss/crossentropy": 1.6348857879638672, |
| "loss/hidden": 0.05029296875, |
| "loss/logits": 0.004432929214090109, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.038, |
| "grad_norm": 0.37109375, |
| "grad_norm_var": 322.7159591039022, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 0.0133, |
| "loss/crossentropy": 2.290273904800415, |
| "loss/hidden": 0.011260986328125, |
| "loss/logits": 0.0020360149210318923, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.039, |
| "grad_norm": 0.4609375, |
| "grad_norm_var": 322.79530232747396, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.0128, |
| "loss/crossentropy": 2.156642735004425, |
| "loss/hidden": 0.0108642578125, |
| "loss/logits": 0.0018967647338286042, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.68359375, |
| "grad_norm_var": 322.75083510080975, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.0143, |
| "loss/crossentropy": 1.5011438727378845, |
| "loss/hidden": 0.012603759765625, |
| "loss/logits": 0.00171089411014691, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.041, |
| "grad_norm": 0.279296875, |
| "grad_norm_var": 322.830778948466, |
| "learning_rate": 8.2e-06, |
| "loss": 0.0113, |
| "loss/crossentropy": 2.1166284680366516, |
| "loss/hidden": 0.009552001953125, |
| "loss/logits": 0.0017564677400514483, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.042, |
| "grad_norm": 0.462890625, |
| "grad_norm_var": 323.11045009295145, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 0.0177, |
| "loss/crossentropy": 1.6626213192939758, |
| "loss/hidden": 0.015655517578125, |
| "loss/logits": 0.0020019301446154714, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.043, |
| "grad_norm": 0.451171875, |
| "grad_norm_var": 323.19747867584226, |
| "learning_rate": 8.6e-06, |
| "loss": 0.0166, |
| "loss/crossentropy": 1.9376201629638672, |
| "loss/hidden": 0.0145263671875, |
| "loss/logits": 0.002024749875999987, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 0.6796875, |
| "grad_norm_var": 323.5992609024048, |
| "learning_rate": 8.8e-06, |
| "loss": 0.0189, |
| "loss/crossentropy": 1.4491004347801208, |
| "loss/hidden": 0.01654052734375, |
| "loss/logits": 0.0023616516264155507, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.045, |
| "grad_norm": 0.41015625, |
| "grad_norm_var": 323.6173208713532, |
| "learning_rate": 9e-06, |
| "loss": 0.0162, |
| "loss/crossentropy": 1.7365482449531555, |
| "loss/hidden": 0.014251708984375, |
| "loss/logits": 0.0019506089156493545, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.046, |
| "grad_norm": 0.4765625, |
| "grad_norm_var": 323.51172122955325, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 0.0166, |
| "loss/crossentropy": 2.1552224159240723, |
| "loss/hidden": 0.014404296875, |
| "loss/logits": 0.002185105928219855, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.047, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 323.1716298421224, |
| "learning_rate": 9.4e-06, |
| "loss": 0.0165, |
| "loss/crossentropy": 1.6503152251243591, |
| "loss/hidden": 0.014617919921875, |
| "loss/logits": 0.0018592309206724167, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.4609375, |
| "grad_norm_var": 323.29190362294514, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 0.0159, |
| "loss/crossentropy": 2.1065614819526672, |
| "loss/hidden": 0.013824462890625, |
| "loss/logits": 0.002064164378680289, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.049, |
| "grad_norm": 0.474609375, |
| "grad_norm_var": 323.478085565567, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.0163, |
| "loss/crossentropy": 1.6366270780563354, |
| "loss/hidden": 0.014190673828125, |
| "loss/logits": 0.00208114180713892, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 0.369140625, |
| "grad_norm_var": 323.49878590901693, |
| "learning_rate": 1e-05, |
| "loss": 0.0153, |
| "loss/crossentropy": 2.427204728126526, |
| "loss/hidden": 0.01318359375, |
| "loss/logits": 0.0021336106583476067, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.051, |
| "grad_norm": 0.435546875, |
| "grad_norm_var": 324.0231384118398, |
| "learning_rate": 1.02e-05, |
| "loss": 0.0169, |
| "loss/crossentropy": 2.065056622028351, |
| "loss/hidden": 0.014801025390625, |
| "loss/logits": 0.0020794584415853024, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 0.61328125, |
| "grad_norm_var": 323.9264413833618, |
| "learning_rate": 1.04e-05, |
| "loss": 0.0214, |
| "loss/crossentropy": 1.9746105670928955, |
| "loss/hidden": 0.01885986328125, |
| "loss/logits": 0.002499670721590519, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.053, |
| "grad_norm": 0.6171875, |
| "grad_norm_var": 0.032596778869628903, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 0.0209, |
| "loss/crossentropy": 2.029407024383545, |
| "loss/hidden": 0.0185546875, |
| "loss/logits": 0.0023920219391584396, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.054, |
| "grad_norm": 0.52734375, |
| "grad_norm_var": 0.031055641174316407, |
| "learning_rate": 1.0800000000000002e-05, |
| "loss": 0.0196, |
| "loss/crossentropy": 1.5256073474884033, |
| "loss/hidden": 0.017578125, |
| "loss/logits": 0.0019834558479487896, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.055, |
| "grad_norm": 0.51953125, |
| "grad_norm_var": 0.030745697021484376, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.019, |
| "loss/crossentropy": 1.781435489654541, |
| "loss/hidden": 0.016845703125, |
| "loss/logits": 0.0021612788550555706, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 0.5390625, |
| "grad_norm_var": 0.029124895731608074, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 0.0199, |
| "loss/crossentropy": 1.8619230389595032, |
| "loss/hidden": 0.01763916015625, |
| "loss/logits": 0.002292199060320854, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.057, |
| "grad_norm": 0.8984375, |
| "grad_norm_var": 0.03298948605855306, |
| "learning_rate": 1.14e-05, |
| "loss": 0.0182, |
| "loss/crossentropy": 1.3362514935433865, |
| "loss/hidden": 0.0166015625, |
| "loss/logits": 0.0015543379704467952, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.058, |
| "grad_norm": 0.423828125, |
| "grad_norm_var": 0.033597930272420244, |
| "learning_rate": 1.16e-05, |
| "loss": 0.0186, |
| "loss/crossentropy": 2.560052752494812, |
| "loss/hidden": 0.01617431640625, |
| "loss/logits": 0.0024208942195400596, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.059, |
| "grad_norm": 1.6640625, |
| "grad_norm_var": 0.10811055501302083, |
| "learning_rate": 1.18e-05, |
| "loss": 0.0183, |
| "loss/crossentropy": 1.0995870353654027, |
| "loss/hidden": 0.0169677734375, |
| "loss/logits": 0.0013089054118609056, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.5234375, |
| "grad_norm_var": 0.1087005615234375, |
| "learning_rate": 1.2e-05, |
| "loss": 0.02, |
| "loss/crossentropy": 2.119443416595459, |
| "loss/hidden": 0.0174560546875, |
| "loss/logits": 0.0025522300275042653, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.061, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.15513909657796224, |
| "learning_rate": 1.22e-05, |
| "loss": 0.0187, |
| "loss/crossentropy": 0.9516562968492508, |
| "loss/hidden": 0.01715087890625, |
| "loss/logits": 0.0015256062615662813, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.062, |
| "grad_norm": 0.458984375, |
| "grad_norm_var": 0.15567053159077962, |
| "learning_rate": 1.2400000000000002e-05, |
| "loss": 0.0229, |
| "loss/crossentropy": 2.208059787750244, |
| "loss/hidden": 0.02008056640625, |
| "loss/logits": 0.002771534491330385, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.063, |
| "grad_norm": 0.52734375, |
| "grad_norm_var": 0.1480940341949463, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 0.0234, |
| "loss/crossentropy": 2.1920509338378906, |
| "loss/hidden": 0.02056884765625, |
| "loss/logits": 0.0028626667335629463, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.4296875, |
| "grad_norm_var": 0.1489907423655192, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 0.0222, |
| "loss/crossentropy": 1.986942172050476, |
| "loss/hidden": 0.0196533203125, |
| "loss/logits": 0.002575715654529631, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.065, |
| "grad_norm": 0.69140625, |
| "grad_norm_var": 0.14658247629801432, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.0259, |
| "loss/crossentropy": 1.8861247301101685, |
| "loss/hidden": 0.0230712890625, |
| "loss/logits": 0.0027794617926701903, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.066, |
| "grad_norm": 0.55859375, |
| "grad_norm_var": 0.1411477247873942, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": 0.0222, |
| "loss/crossentropy": 1.9872633218765259, |
| "loss/hidden": 0.019775390625, |
| "loss/logits": 0.002420680597424507, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.067, |
| "grad_norm": 1.0, |
| "grad_norm_var": 0.14229151407877605, |
| "learning_rate": 1.3400000000000002e-05, |
| "loss": 0.0248, |
| "loss/crossentropy": 1.8202188611030579, |
| "loss/hidden": 0.02203369140625, |
| "loss/logits": 0.0027603432536125183, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 0.44921875, |
| "grad_norm_var": 0.1463129679361979, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 0.024, |
| "loss/crossentropy": 1.7114304900169373, |
| "loss/hidden": 0.02166748046875, |
| "loss/logits": 0.002324871253222227, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.069, |
| "grad_norm": 0.474609375, |
| "grad_norm_var": 0.1493471622467041, |
| "learning_rate": 1.38e-05, |
| "loss": 0.0221, |
| "loss/crossentropy": 2.0404117703437805, |
| "loss/hidden": 0.01959228515625, |
| "loss/logits": 0.002503333264030516, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.07, |
| "grad_norm": 0.7890625, |
| "grad_norm_var": 0.14756658871968586, |
| "learning_rate": 1.4e-05, |
| "loss": 0.0216, |
| "loss/crossentropy": 2.26701283454895, |
| "loss/hidden": 0.01885986328125, |
| "loss/logits": 0.0027175976429134607, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.071, |
| "grad_norm": 0.6015625, |
| "grad_norm_var": 0.14582289059956868, |
| "learning_rate": 1.4200000000000001e-05, |
| "loss": 0.0238, |
| "loss/crossentropy": 2.3501675128936768, |
| "loss/hidden": 0.0208740234375, |
| "loss/logits": 0.0028861036989837885, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 0.19923399289449056, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 0.0284, |
| "loss/crossentropy": 2.463997960090637, |
| "loss/hidden": 0.0255126953125, |
| "loss/logits": 0.002903890563175082, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.073, |
| "grad_norm": 0.640625, |
| "grad_norm_var": 0.19979208310445148, |
| "learning_rate": 1.46e-05, |
| "loss": 0.0315, |
| "loss/crossentropy": 1.8753514885902405, |
| "loss/hidden": 0.02789306640625, |
| "loss/logits": 0.0035823375219479203, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.074, |
| "grad_norm": 0.625, |
| "grad_norm_var": 0.19282932281494142, |
| "learning_rate": 1.48e-05, |
| "loss": 0.0277, |
| "loss/crossentropy": 1.9195038080215454, |
| "loss/hidden": 0.02459716796875, |
| "loss/logits": 0.003096952917985618, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 0.7890625, |
| "grad_norm_var": 0.1387399673461914, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.0291, |
| "loss/crossentropy": 1.6664665341377258, |
| "loss/hidden": 0.026123046875, |
| "loss/logits": 0.003020121017470956, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 0.53125, |
| "grad_norm_var": 0.1385227839152018, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 0.0256, |
| "loss/crossentropy": 2.158577561378479, |
| "loss/hidden": 0.02288818359375, |
| "loss/logits": 0.0027230100240558386, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.077, |
| "grad_norm": 0.8515625, |
| "grad_norm_var": 0.09533430735270182, |
| "learning_rate": 1.54e-05, |
| "loss": 0.0311, |
| "loss/crossentropy": 1.5809077620506287, |
| "loss/hidden": 0.02801513671875, |
| "loss/logits": 0.0030948739731684327, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.078, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 0.09243137041727702, |
| "learning_rate": 1.5600000000000003e-05, |
| "loss": 0.0278, |
| "loss/crossentropy": 2.628559112548828, |
| "loss/hidden": 0.02459716796875, |
| "loss/logits": 0.0032311297254636884, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.079, |
| "grad_norm": 0.57421875, |
| "grad_norm_var": 0.09148151079813639, |
| "learning_rate": 1.58e-05, |
| "loss": 0.0278, |
| "loss/crossentropy": 2.3653043508529663, |
| "loss/hidden": 0.0245361328125, |
| "loss/logits": 0.0032503672409802675, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.419921875, |
| "grad_norm_var": 0.09184494018554687, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.0255, |
| "loss/crossentropy": 1.8367934226989746, |
| "loss/hidden": 0.02288818359375, |
| "loss/logits": 0.002621771185658872, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.081, |
| "grad_norm": 0.5234375, |
| "grad_norm_var": 0.09388167063395182, |
| "learning_rate": 1.62e-05, |
| "loss": 0.0269, |
| "loss/crossentropy": 1.9675615429878235, |
| "loss/hidden": 0.02410888671875, |
| "loss/logits": 0.0028106847312301397, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.082, |
| "grad_norm": 0.66796875, |
| "grad_norm_var": 0.0926675796508789, |
| "learning_rate": 1.64e-05, |
| "loss": 0.034, |
| "loss/crossentropy": 1.8206292390823364, |
| "loss/hidden": 0.03070068359375, |
| "loss/logits": 0.0033048836048692465, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.083, |
| "grad_norm": 0.62890625, |
| "grad_norm_var": 0.08642832438151042, |
| "learning_rate": 1.66e-05, |
| "loss": 0.0311, |
| "loss/crossentropy": 1.6110271513462067, |
| "loss/hidden": 0.02825927734375, |
| "loss/logits": 0.0028002122417092323, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 0.69140625, |
| "grad_norm_var": 0.0827466328938802, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 0.0337, |
| "loss/crossentropy": 1.5017576217651367, |
| "loss/hidden": 0.0308837890625, |
| "loss/logits": 0.002857009880244732, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.085, |
| "grad_norm": 0.55078125, |
| "grad_norm_var": 0.08090246518452962, |
| "learning_rate": 1.7e-05, |
| "loss": 0.0327, |
| "loss/crossentropy": 2.151441216468811, |
| "loss/hidden": 0.02935791015625, |
| "loss/logits": 0.0033320217626169324, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.086, |
| "grad_norm": 0.51953125, |
| "grad_norm_var": 0.08212202390034994, |
| "learning_rate": 1.72e-05, |
| "loss": 0.0294, |
| "loss/crossentropy": 1.74313086271286, |
| "loss/hidden": 0.026611328125, |
| "loss/logits": 0.0027418186655268073, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.087, |
| "grad_norm": 0.640625, |
| "grad_norm_var": 0.0818098545074463, |
| "learning_rate": 1.7400000000000003e-05, |
| "loss": 0.0344, |
| "loss/crossentropy": 1.9408629536628723, |
| "loss/hidden": 0.0306396484375, |
| "loss/logits": 0.003764115506783128, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.04418638547261556, |
| "learning_rate": 1.76e-05, |
| "loss": 0.0287, |
| "loss/crossentropy": 1.097998559474945, |
| "loss/hidden": 0.02667236328125, |
| "loss/logits": 0.0020423351670615375, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.089, |
| "grad_norm": 0.48828125, |
| "grad_norm_var": 0.04605596860249837, |
| "learning_rate": 1.7800000000000002e-05, |
| "loss": 0.0283, |
| "loss/crossentropy": 2.197614073753357, |
| "loss/hidden": 0.0255126953125, |
| "loss/logits": 0.002818369073793292, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.09, |
| "grad_norm": 3.328125, |
| "grad_norm_var": 0.49310110410054525, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0349, |
| "loss/crossentropy": 0.8821299159899354, |
| "loss/hidden": 0.03314208984375, |
| "loss/logits": 0.0018038563139270991, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.091, |
| "grad_norm": 0.609375, |
| "grad_norm_var": 0.4958765506744385, |
| "learning_rate": 1.8200000000000002e-05, |
| "loss": 0.0316, |
| "loss/crossentropy": 2.084783136844635, |
| "loss/hidden": 0.02825927734375, |
| "loss/logits": 0.003339589573442936, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 0.8828125, |
| "grad_norm_var": 0.49056077003479004, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 0.036, |
| "loss/crossentropy": 2.17539119720459, |
| "loss/hidden": 0.032470703125, |
| "loss/logits": 0.00348565389867872, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.093, |
| "grad_norm": 0.55859375, |
| "grad_norm_var": 0.49513840675354004, |
| "learning_rate": 1.86e-05, |
| "loss": 0.0329, |
| "loss/crossentropy": 2.2177764177322388, |
| "loss/hidden": 0.0294189453125, |
| "loss/logits": 0.0035150039475411177, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.094, |
| "grad_norm": 0.578125, |
| "grad_norm_var": 0.4952597141265869, |
| "learning_rate": 1.88e-05, |
| "loss": 0.0349, |
| "loss/crossentropy": 1.8453172445297241, |
| "loss/hidden": 0.03143310546875, |
| "loss/logits": 0.003429048229008913, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.095, |
| "grad_norm": 1.0234375, |
| "grad_norm_var": 0.49357806841532387, |
| "learning_rate": 1.9e-05, |
| "loss": 0.0372, |
| "loss/crossentropy": 1.9843305349349976, |
| "loss/hidden": 0.03369140625, |
| "loss/logits": 0.003536572912707925, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.6328125, |
| "grad_norm_var": 0.48445987701416016, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 0.0336, |
| "loss/crossentropy": 1.8042800426483154, |
| "loss/hidden": 0.03033447265625, |
| "loss/logits": 0.0032234484096989036, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.097, |
| "grad_norm": 0.78125, |
| "grad_norm_var": 0.47724246978759766, |
| "learning_rate": 1.94e-05, |
| "loss": 0.0335, |
| "loss/crossentropy": 1.747575581073761, |
| "loss/hidden": 0.0303955078125, |
| "loss/logits": 0.003057013265788555, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.098, |
| "grad_norm": 0.85546875, |
| "grad_norm_var": 0.4743799209594727, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 0.0359, |
| "loss/crossentropy": 1.8212959170341492, |
| "loss/hidden": 0.0323486328125, |
| "loss/logits": 0.00355940917506814, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.099, |
| "grad_norm": 0.78515625, |
| "grad_norm_var": 0.4706313451131185, |
| "learning_rate": 1.98e-05, |
| "loss": 0.038, |
| "loss/crossentropy": 1.421421229839325, |
| "loss/hidden": 0.034912109375, |
| "loss/logits": 0.003108043922111392, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.58203125, |
| "grad_norm_var": 0.4743021011352539, |
| "learning_rate": 2e-05, |
| "loss": 0.0368, |
| "loss/crossentropy": 2.2832354307174683, |
| "loss/hidden": 0.0330810546875, |
| "loss/logits": 0.0037010950036346912, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.101, |
| "grad_norm": 0.60546875, |
| "grad_norm_var": 0.4720519383748372, |
| "learning_rate": 2e-05, |
| "loss": 0.0331, |
| "loss/crossentropy": 2.4297910928726196, |
| "loss/hidden": 0.02978515625, |
| "loss/logits": 0.00334281416144222, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.102, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 1.1979937235514322, |
| "learning_rate": 2e-05, |
| "loss": 0.0458, |
| "loss/crossentropy": 1.0970591604709625, |
| "loss/hidden": 0.04345703125, |
| "loss/logits": 0.002338708785828203, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.103, |
| "grad_norm": 3.90625, |
| "grad_norm_var": 1.652544911702474, |
| "learning_rate": 2e-05, |
| "loss": 0.047, |
| "loss/crossentropy": 0.7569917887449265, |
| "loss/hidden": 0.0450439453125, |
| "loss/logits": 0.0019756766268983483, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 0.92578125, |
| "grad_norm_var": 1.6627832412719727, |
| "learning_rate": 2e-05, |
| "loss": 0.0397, |
| "loss/crossentropy": 2.1630712747573853, |
| "loss/hidden": 0.0355224609375, |
| "loss/logits": 0.004194425302557647, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.105, |
| "grad_norm": 2.25, |
| "grad_norm_var": 1.6648190816243489, |
| "learning_rate": 2e-05, |
| "loss": 0.0467, |
| "loss/crossentropy": 2.131657838821411, |
| "loss/hidden": 0.0418701171875, |
| "loss/logits": 0.004862830974161625, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.106, |
| "grad_norm": 0.8828125, |
| "grad_norm_var": 1.414954630533854, |
| "learning_rate": 2e-05, |
| "loss": 0.0442, |
| "loss/crossentropy": 2.395767092704773, |
| "loss/hidden": 0.039306640625, |
| "loss/logits": 0.004875035956501961, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.107, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 1.431434122721354, |
| "learning_rate": 2e-05, |
| "loss": 0.0467, |
| "loss/crossentropy": 1.8139055967330933, |
| "loss/hidden": 0.0416259765625, |
| "loss/logits": 0.005075734108686447, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 0.90234375, |
| "grad_norm_var": 1.4302143732706705, |
| "learning_rate": 2e-05, |
| "loss": 0.0425, |
| "loss/crossentropy": 2.0919321179389954, |
| "loss/hidden": 0.0384521484375, |
| "loss/logits": 0.004045868292450905, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.109, |
| "grad_norm": 0.77734375, |
| "grad_norm_var": 1.4097848892211915, |
| "learning_rate": 2e-05, |
| "loss": 0.0431, |
| "loss/crossentropy": 1.7248334288597107, |
| "loss/hidden": 0.039306640625, |
| "loss/logits": 0.0037899790331721306, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.11, |
| "grad_norm": 1.5859375, |
| "grad_norm_var": 1.3661523818969727, |
| "learning_rate": 2e-05, |
| "loss": 0.0383, |
| "loss/crossentropy": 2.6418296098709106, |
| "loss/hidden": 0.0343017578125, |
| "loss/logits": 0.003979154396802187, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.111, |
| "grad_norm": 1.5, |
| "grad_norm_var": 1.353990109761556, |
| "learning_rate": 2e-05, |
| "loss": 0.0416, |
| "loss/crossentropy": 0.6973606944084167, |
| "loss/hidden": 0.0396728515625, |
| "loss/logits": 0.0019382394093554467, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.859375, |
| "grad_norm_var": 1.331968116760254, |
| "learning_rate": 2e-05, |
| "loss": 0.049, |
| "loss/crossentropy": 1.9394667744636536, |
| "loss/hidden": 0.04443359375, |
| "loss/logits": 0.0046083766501396894, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.113, |
| "grad_norm": 1.0625, |
| "grad_norm_var": 1.3106271743774414, |
| "learning_rate": 2e-05, |
| "loss": 0.0504, |
| "loss/crossentropy": 1.2368495762348175, |
| "loss/hidden": 0.046875, |
| "loss/logits": 0.00347616511862725, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.114, |
| "grad_norm": 1.203125, |
| "grad_norm_var": 1.2883158365885417, |
| "learning_rate": 2e-05, |
| "loss": 0.045, |
| "loss/crossentropy": 2.196173131465912, |
| "loss/hidden": 0.04052734375, |
| "loss/logits": 0.004511563340201974, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.115, |
| "grad_norm": 0.78515625, |
| "grad_norm_var": 1.2883158365885417, |
| "learning_rate": 2e-05, |
| "loss": 0.047, |
| "loss/crossentropy": 1.9997879266738892, |
| "loss/hidden": 0.0421142578125, |
| "loss/logits": 0.004871049430221319, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 1.375, |
| "grad_norm_var": 1.228288205464681, |
| "learning_rate": 2e-05, |
| "loss": 0.0521, |
| "loss/crossentropy": 1.9844006299972534, |
| "loss/hidden": 0.046875, |
| "loss/logits": 0.0051891920156776905, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.117, |
| "grad_norm": 0.65625, |
| "grad_norm_var": 1.22191162109375, |
| "learning_rate": 2e-05, |
| "loss": 0.0435, |
| "loss/crossentropy": 1.9568707346916199, |
| "loss/hidden": 0.039306640625, |
| "loss/logits": 0.004212545929476619, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.118, |
| "grad_norm": 0.89453125, |
| "grad_norm_var": 0.6917851130167644, |
| "learning_rate": 2e-05, |
| "loss": 0.0481, |
| "loss/crossentropy": 1.546873390674591, |
| "loss/hidden": 0.043212890625, |
| "loss/logits": 0.004850989207625389, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.119, |
| "grad_norm": 0.6875, |
| "grad_norm_var": 0.24595890045166016, |
| "learning_rate": 2e-05, |
| "loss": 0.0452, |
| "loss/crossentropy": 1.89975243806839, |
| "loss/hidden": 0.041259765625, |
| "loss/logits": 0.0039442547131329775, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.75, |
| "grad_norm_var": 0.2533203125, |
| "learning_rate": 2e-05, |
| "loss": 0.047, |
| "loss/crossentropy": 2.1915535926818848, |
| "loss/hidden": 0.0423583984375, |
| "loss/logits": 0.004610191797837615, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.121, |
| "grad_norm": 0.65234375, |
| "grad_norm_var": 0.17778008778889973, |
| "learning_rate": 2e-05, |
| "loss": 0.0436, |
| "loss/crossentropy": 1.9114365577697754, |
| "loss/hidden": 0.03955078125, |
| "loss/logits": 0.0040131560526788235, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.122, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 0.3233657201131185, |
| "learning_rate": 2e-05, |
| "loss": 0.053, |
| "loss/crossentropy": 1.6134037971496582, |
| "loss/hidden": 0.048583984375, |
| "loss/logits": 0.004459647228941321, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.123, |
| "grad_norm": 0.91015625, |
| "grad_norm_var": 0.2515520731608073, |
| "learning_rate": 2e-05, |
| "loss": 0.0471, |
| "loss/crossentropy": 2.2170268297195435, |
| "loss/hidden": 0.04248046875, |
| "loss/logits": 0.004627523710951209, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 0.75, |
| "grad_norm_var": 0.25650374094645184, |
| "learning_rate": 2e-05, |
| "loss": 0.054, |
| "loss/crossentropy": 1.8259931206703186, |
| "loss/hidden": 0.04931640625, |
| "loss/logits": 0.004666820866987109, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 1.0859375, |
| "grad_norm_var": 0.25061213175455727, |
| "learning_rate": 2e-05, |
| "loss": 0.0499, |
| "loss/crossentropy": 1.8636714816093445, |
| "loss/hidden": 0.045654296875, |
| "loss/logits": 0.004241484683007002, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.126, |
| "grad_norm": 1.125, |
| "grad_norm_var": 0.2330718994140625, |
| "learning_rate": 2e-05, |
| "loss": 0.0553, |
| "loss/crossentropy": 1.8325402736663818, |
| "loss/hidden": 0.0504150390625, |
| "loss/logits": 0.004864088725298643, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.127, |
| "grad_norm": 0.78515625, |
| "grad_norm_var": 0.22265872955322266, |
| "learning_rate": 2e-05, |
| "loss": 0.0502, |
| "loss/crossentropy": 2.0756123661994934, |
| "loss/hidden": 0.04541015625, |
| "loss/logits": 0.004794843029230833, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 0.2456216812133789, |
| "learning_rate": 2e-05, |
| "loss": 0.06, |
| "loss/crossentropy": 1.6910657286643982, |
| "loss/hidden": 0.0540771484375, |
| "loss/logits": 0.005873451940715313, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.129, |
| "grad_norm": 5.375, |
| "grad_norm_var": 1.409238624572754, |
| "learning_rate": 2e-05, |
| "loss": 0.0589, |
| "loss/crossentropy": 0.8958628624677658, |
| "loss/hidden": 0.0562744140625, |
| "loss/logits": 0.0026033871108666062, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 1.4136926651000976, |
| "learning_rate": 2e-05, |
| "loss": 0.0542, |
| "loss/crossentropy": 2.181567072868347, |
| "loss/hidden": 0.0491943359375, |
| "loss/logits": 0.005030616419389844, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.131, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 1.3944170633951822, |
| "learning_rate": 2e-05, |
| "loss": 0.0492, |
| "loss/crossentropy": 2.2363303899765015, |
| "loss/hidden": 0.04443359375, |
| "loss/logits": 0.004747522529214621, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 0.7890625, |
| "grad_norm_var": 1.4137003580729166, |
| "learning_rate": 2e-05, |
| "loss": 0.0567, |
| "loss/crossentropy": 2.1101560592651367, |
| "loss/hidden": 0.051513671875, |
| "loss/logits": 0.00514927739277482, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.133, |
| "grad_norm": 0.6796875, |
| "grad_norm_var": 1.4116900126139322, |
| "learning_rate": 2e-05, |
| "loss": 0.0515, |
| "loss/crossentropy": 2.1291067600250244, |
| "loss/hidden": 0.046630859375, |
| "loss/logits": 0.004876742837950587, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.134, |
| "grad_norm": 0.984375, |
| "grad_norm_var": 1.4071934382120768, |
| "learning_rate": 2e-05, |
| "loss": 0.0492, |
| "loss/crossentropy": 2.2621551752090454, |
| "loss/hidden": 0.044677734375, |
| "loss/logits": 0.00452903937548399, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.135, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 1.4250879287719727, |
| "learning_rate": 2e-05, |
| "loss": 0.069, |
| "loss/crossentropy": 2.0280712842941284, |
| "loss/hidden": 0.062744140625, |
| "loss/logits": 0.006305122980847955, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 0.828125, |
| "grad_norm_var": 1.4185597101847331, |
| "learning_rate": 2e-05, |
| "loss": 0.0549, |
| "loss/crossentropy": 2.2339359521865845, |
| "loss/hidden": 0.0496826171875, |
| "loss/logits": 0.005226947134360671, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.137, |
| "grad_norm": 4.34375, |
| "grad_norm_var": 1.8932634989420574, |
| "learning_rate": 2e-05, |
| "loss": 0.0615, |
| "loss/crossentropy": 1.514957308769226, |
| "loss/hidden": 0.056884765625, |
| "loss/logits": 0.0045981991570442915, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.138, |
| "grad_norm": 0.921875, |
| "grad_norm_var": 1.8573443094889324, |
| "learning_rate": 2e-05, |
| "loss": 0.0557, |
| "loss/crossentropy": 1.9399707913398743, |
| "loss/hidden": 0.05078125, |
| "loss/logits": 0.004873000085353851, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.139, |
| "grad_norm": 0.93359375, |
| "grad_norm_var": 1.855396525065104, |
| "learning_rate": 2e-05, |
| "loss": 0.0543, |
| "loss/crossentropy": 1.995535969734192, |
| "loss/hidden": 0.049560546875, |
| "loss/logits": 0.004754737950861454, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.8828125, |
| "grad_norm_var": 1.8424049377441407, |
| "learning_rate": 2e-05, |
| "loss": 0.0567, |
| "loss/crossentropy": 1.4075381755828857, |
| "loss/hidden": 0.05224609375, |
| "loss/logits": 0.004473250824958086, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.141, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 1.8454689025878905, |
| "learning_rate": 2e-05, |
| "loss": 0.0591, |
| "loss/crossentropy": 1.6278843879699707, |
| "loss/hidden": 0.05419921875, |
| "loss/logits": 0.004886955488473177, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.142, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 1.8400001525878906, |
| "learning_rate": 2e-05, |
| "loss": 0.0696, |
| "loss/crossentropy": 1.8771538138389587, |
| "loss/hidden": 0.063720703125, |
| "loss/logits": 0.005841289181262255, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.143, |
| "grad_norm": 1.1875, |
| "grad_norm_var": 1.8086521784464518, |
| "learning_rate": 2e-05, |
| "loss": 0.0689, |
| "loss/crossentropy": 1.8481554985046387, |
| "loss/hidden": 0.0621337890625, |
| "loss/logits": 0.006735805422067642, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 1.8084919611612955, |
| "learning_rate": 2e-05, |
| "loss": 0.065, |
| "loss/crossentropy": 1.8980860114097595, |
| "loss/hidden": 0.0592041015625, |
| "loss/logits": 0.005796665325760841, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.145, |
| "grad_norm": 0.82421875, |
| "grad_norm_var": 0.7976763407389323, |
| "learning_rate": 2e-05, |
| "loss": 0.0661, |
| "loss/crossentropy": 2.0533955097198486, |
| "loss/hidden": 0.0604248046875, |
| "loss/logits": 0.005705197807401419, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.146, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 0.7931691487630208, |
| "learning_rate": 2e-05, |
| "loss": 0.0704, |
| "loss/crossentropy": 1.3036039471626282, |
| "loss/hidden": 0.065185546875, |
| "loss/logits": 0.00520844548009336, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.147, |
| "grad_norm": 0.76171875, |
| "grad_norm_var": 0.8118188858032227, |
| "learning_rate": 2e-05, |
| "loss": 0.059, |
| "loss/crossentropy": 2.2193727493286133, |
| "loss/hidden": 0.0538330078125, |
| "loss/logits": 0.005119079956784844, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 1.46875, |
| "grad_norm_var": 0.7961542129516601, |
| "learning_rate": 2e-05, |
| "loss": 0.0648, |
| "loss/crossentropy": 1.7154983878135681, |
| "loss/hidden": 0.05908203125, |
| "loss/logits": 0.005755244754254818, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.149, |
| "grad_norm": 0.9296875, |
| "grad_norm_var": 0.7786167780558269, |
| "learning_rate": 2e-05, |
| "loss": 0.0618, |
| "loss/crossentropy": 2.0040605664253235, |
| "loss/hidden": 0.05615234375, |
| "loss/logits": 0.0056907604448497295, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 1.109375, |
| "grad_norm_var": 0.7736892064412435, |
| "learning_rate": 2e-05, |
| "loss": 0.0616, |
| "loss/crossentropy": 2.0739275217056274, |
| "loss/hidden": 0.055908203125, |
| "loss/logits": 0.00570546486414969, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.151, |
| "grad_norm": 1.171875, |
| "grad_norm_var": 0.7204253514607747, |
| "learning_rate": 2e-05, |
| "loss": 0.0627, |
| "loss/crossentropy": 1.594135582447052, |
| "loss/hidden": 0.0576171875, |
| "loss/logits": 0.005040129646658897, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 0.7077147801717122, |
| "learning_rate": 2e-05, |
| "loss": 0.0804, |
| "loss/crossentropy": 1.3758890628814697, |
| "loss/hidden": 0.074462890625, |
| "loss/logits": 0.005972953978925943, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.153, |
| "grad_norm": 1.3828125, |
| "grad_norm_var": 0.06270847320556641, |
| "learning_rate": 2e-05, |
| "loss": 0.0681, |
| "loss/crossentropy": 2.049718499183655, |
| "loss/hidden": 0.062255859375, |
| "loss/logits": 0.005824308842420578, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.154, |
| "grad_norm": 0.79296875, |
| "grad_norm_var": 0.06744384765625, |
| "learning_rate": 2e-05, |
| "loss": 0.0648, |
| "loss/crossentropy": 2.5180286169052124, |
| "loss/hidden": 0.058837890625, |
| "loss/logits": 0.005995590705424547, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.155, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 0.06623172760009766, |
| "learning_rate": 2e-05, |
| "loss": 0.0672, |
| "loss/crossentropy": 1.8163335919380188, |
| "loss/hidden": 0.06201171875, |
| "loss/logits": 0.005222021602094173, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 1.0859375, |
| "grad_norm_var": 0.06153049468994141, |
| "learning_rate": 2e-05, |
| "loss": 0.0701, |
| "loss/crossentropy": 1.5288804769515991, |
| "loss/hidden": 0.064453125, |
| "loss/logits": 0.0056282891891896725, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.157, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 0.060451698303222653, |
| "learning_rate": 2e-05, |
| "loss": 0.0673, |
| "loss/crossentropy": 1.4844446778297424, |
| "loss/hidden": 0.0621337890625, |
| "loss/logits": 0.005207852926105261, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.158, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 0.13561344146728516, |
| "learning_rate": 2e-05, |
| "loss": 0.0765, |
| "loss/crossentropy": 1.644486278295517, |
| "loss/hidden": 0.071044921875, |
| "loss/logits": 0.00542254070751369, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.159, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 0.13676395416259765, |
| "learning_rate": 2e-05, |
| "loss": 0.0659, |
| "loss/crossentropy": 1.871021330356598, |
| "loss/hidden": 0.0606689453125, |
| "loss/logits": 0.005253549199551344, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.14649906158447265, |
| "learning_rate": 2e-05, |
| "loss": 0.0734, |
| "loss/crossentropy": 1.9814706444740295, |
| "loss/hidden": 0.0667724609375, |
| "loss/logits": 0.006675436161458492, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.161, |
| "grad_norm": 5.71875, |
| "grad_norm_var": 1.3714861551920572, |
| "learning_rate": 2e-05, |
| "loss": 0.0761, |
| "loss/crossentropy": 0.06814652029424906, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.0008854267362039536, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.162, |
| "grad_norm": 1.046875, |
| "grad_norm_var": 1.3846514383951822, |
| "learning_rate": 2e-05, |
| "loss": 0.0736, |
| "loss/crossentropy": 2.006903052330017, |
| "loss/hidden": 0.0673828125, |
| "loss/logits": 0.006249978672713041, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.163, |
| "grad_norm": 1.078125, |
| "grad_norm_var": 1.3585056940714517, |
| "learning_rate": 2e-05, |
| "loss": 0.0735, |
| "loss/crossentropy": 1.6816741824150085, |
| "loss/hidden": 0.067626953125, |
| "loss/logits": 0.005850112298503518, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 1.3581801732381185, |
| "learning_rate": 2e-05, |
| "loss": 0.0813, |
| "loss/crossentropy": 1.5440534353256226, |
| "loss/hidden": 0.0751953125, |
| "loss/logits": 0.006087863817811012, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.165, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 1.3347864151000977, |
| "learning_rate": 2e-05, |
| "loss": 0.0734, |
| "loss/crossentropy": 2.158173441886902, |
| "loss/hidden": 0.066650390625, |
| "loss/logits": 0.006765024736523628, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.166, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 1.3352777481079101, |
| "learning_rate": 2e-05, |
| "loss": 0.075, |
| "loss/crossentropy": 2.1744515895843506, |
| "loss/hidden": 0.0675048828125, |
| "loss/logits": 0.0074809337966144085, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.167, |
| "grad_norm": 1.421875, |
| "grad_norm_var": 1.32568302154541, |
| "learning_rate": 2e-05, |
| "loss": 0.0823, |
| "loss/crossentropy": 1.9644973278045654, |
| "loss/hidden": 0.074462890625, |
| "loss/logits": 0.00784232746809721, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 0.75390625, |
| "grad_norm_var": 1.3695658365885417, |
| "learning_rate": 2e-05, |
| "loss": 0.0672, |
| "loss/crossentropy": 2.5146409273147583, |
| "loss/hidden": 0.06103515625, |
| "loss/logits": 0.006188275059685111, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.169, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 1.3676737467447917, |
| "learning_rate": 2e-05, |
| "loss": 0.0702, |
| "loss/crossentropy": 2.2412500977516174, |
| "loss/hidden": 0.0638427734375, |
| "loss/logits": 0.0063555194064974785, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 1.3285420099894205, |
| "learning_rate": 2e-05, |
| "loss": 0.0753, |
| "loss/crossentropy": 1.9867605566978455, |
| "loss/hidden": 0.068603515625, |
| "loss/logits": 0.006648675538599491, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.171, |
| "grad_norm": 0.92578125, |
| "grad_norm_var": 1.351922607421875, |
| "learning_rate": 2e-05, |
| "loss": 0.0735, |
| "loss/crossentropy": 1.9463918209075928, |
| "loss/hidden": 0.0673828125, |
| "loss/logits": 0.0060692906845360994, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 0.80078125, |
| "grad_norm_var": 1.3754953384399413, |
| "learning_rate": 2e-05, |
| "loss": 0.0764, |
| "loss/crossentropy": 2.3115424513816833, |
| "loss/hidden": 0.06982421875, |
| "loss/logits": 0.006593452533707023, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.173, |
| "grad_norm": 1.265625, |
| "grad_norm_var": 1.3700098037719726, |
| "learning_rate": 2e-05, |
| "loss": 0.0878, |
| "loss/crossentropy": 1.5958466529846191, |
| "loss/hidden": 0.080322265625, |
| "loss/logits": 0.007451239973306656, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.174, |
| "grad_norm": 0.875, |
| "grad_norm_var": 1.360367774963379, |
| "learning_rate": 2e-05, |
| "loss": 0.079, |
| "loss/crossentropy": 2.529710054397583, |
| "loss/hidden": 0.0712890625, |
| "loss/logits": 0.0077202459797263145, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 1.15625, |
| "grad_norm_var": 1.3574360529581706, |
| "learning_rate": 2e-05, |
| "loss": 0.0857, |
| "loss/crossentropy": 2.347429037094116, |
| "loss/hidden": 0.077392578125, |
| "loss/logits": 0.008293167222291231, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 1.265625, |
| "grad_norm_var": 1.3557692845662435, |
| "learning_rate": 2e-05, |
| "loss": 0.0856, |
| "loss/crossentropy": 2.128890037536621, |
| "loss/hidden": 0.077880859375, |
| "loss/logits": 0.007751723285764456, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.177, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.06857649485270183, |
| "learning_rate": 2e-05, |
| "loss": 0.0862, |
| "loss/crossentropy": 1.599511444568634, |
| "loss/hidden": 0.07958984375, |
| "loss/logits": 0.006571376929059625, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.178, |
| "grad_norm": 0.9921875, |
| "grad_norm_var": 0.0697977066040039, |
| "learning_rate": 2e-05, |
| "loss": 0.0849, |
| "loss/crossentropy": 1.9755426049232483, |
| "loss/hidden": 0.07763671875, |
| "loss/logits": 0.007265463005751371, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.179, |
| "grad_norm": 2.5625, |
| "grad_norm_var": 0.18629601796468098, |
| "learning_rate": 2e-05, |
| "loss": 0.0939, |
| "loss/crossentropy": 1.5967216491699219, |
| "loss/hidden": 0.085205078125, |
| "loss/logits": 0.008647671202197671, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 8.625, |
| "grad_norm_var": 3.5702035903930662, |
| "learning_rate": 2e-05, |
| "loss": 0.1158, |
| "loss/crossentropy": 0.4676814042031765, |
| "loss/hidden": 0.113037109375, |
| "loss/logits": 0.002761453448329121, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.181, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 3.579422442118327, |
| "learning_rate": 2e-05, |
| "loss": 0.0892, |
| "loss/crossentropy": 2.035480260848999, |
| "loss/hidden": 0.080810546875, |
| "loss/logits": 0.008422555401921272, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.182, |
| "grad_norm": 2.0, |
| "grad_norm_var": 3.5481292088826497, |
| "learning_rate": 2e-05, |
| "loss": 0.089, |
| "loss/crossentropy": 2.231297492980957, |
| "loss/hidden": 0.08154296875, |
| "loss/logits": 0.007416091160848737, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.183, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 3.5431105931599935, |
| "learning_rate": 2e-05, |
| "loss": 0.0927, |
| "loss/crossentropy": 1.736401379108429, |
| "loss/hidden": 0.085693359375, |
| "loss/logits": 0.007044479949399829, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 1.2109375, |
| "grad_norm_var": 3.4895894368489584, |
| "learning_rate": 2e-05, |
| "loss": 0.0972, |
| "loss/crossentropy": 1.550825297832489, |
| "loss/hidden": 0.088623046875, |
| "loss/logits": 0.008623293600976467, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.185, |
| "grad_norm": 0.78125, |
| "grad_norm_var": 3.5594256083170572, |
| "learning_rate": 2e-05, |
| "loss": 0.0799, |
| "loss/crossentropy": 2.4604450464248657, |
| "loss/hidden": 0.072998046875, |
| "loss/logits": 0.006882155779749155, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.186, |
| "grad_norm": 1.1796875, |
| "grad_norm_var": 3.5757830301920572, |
| "learning_rate": 2e-05, |
| "loss": 0.0973, |
| "loss/crossentropy": 1.7816944122314453, |
| "loss/hidden": 0.089111328125, |
| "loss/logits": 0.008230784442275763, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.187, |
| "grad_norm": 1.875, |
| "grad_norm_var": 3.5198094050089517, |
| "learning_rate": 2e-05, |
| "loss": 0.0889, |
| "loss/crossentropy": 2.1426581144332886, |
| "loss/hidden": 0.080078125, |
| "loss/logits": 0.008809308055788279, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 1.1875, |
| "grad_norm_var": 3.4739054361979167, |
| "learning_rate": 2e-05, |
| "loss": 0.0835, |
| "loss/crossentropy": 2.351286768913269, |
| "loss/hidden": 0.075927734375, |
| "loss/logits": 0.00757948262616992, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.189, |
| "grad_norm": 1.03125, |
| "grad_norm_var": 3.497053019205729, |
| "learning_rate": 2e-05, |
| "loss": 0.0869, |
| "loss/crossentropy": 1.9852749109268188, |
| "loss/hidden": 0.079345703125, |
| "loss/logits": 0.007507144706323743, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 20.75, |
| "grad_norm_var": 25.517414347330728, |
| "learning_rate": 2e-05, |
| "loss": 0.1651, |
| "loss/crossentropy": 1.6293256282806396, |
| "loss/hidden": 0.155029296875, |
| "loss/logits": 0.010117939207702875, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.191, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 25.43632787068685, |
| "learning_rate": 2e-05, |
| "loss": 0.076, |
| "loss/crossentropy": 0.8999969586730003, |
| "loss/hidden": 0.07177734375, |
| "loss/logits": 0.004198798444122076, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 25.370794423421223, |
| "learning_rate": 2e-05, |
| "loss": 0.0947, |
| "loss/crossentropy": 2.1950655579566956, |
| "loss/hidden": 0.0869140625, |
| "loss/logits": 0.007782091153785586, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.193, |
| "grad_norm": 0.859375, |
| "grad_norm_var": 25.545149485270183, |
| "learning_rate": 2e-05, |
| "loss": 0.0902, |
| "loss/crossentropy": 1.8509384989738464, |
| "loss/hidden": 0.082763671875, |
| "loss/logits": 0.007444815244525671, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.194, |
| "grad_norm": 2.625, |
| "grad_norm_var": 25.248404947916665, |
| "learning_rate": 2e-05, |
| "loss": 0.1009, |
| "loss/crossentropy": 1.9304978251457214, |
| "loss/hidden": 0.093017578125, |
| "loss/logits": 0.007872100453823805, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.195, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 25.407342274983723, |
| "learning_rate": 2e-05, |
| "loss": 0.103, |
| "loss/crossentropy": 1.6395078301429749, |
| "loss/hidden": 0.094970703125, |
| "loss/logits": 0.007982299197465181, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 1.3984375, |
| "grad_norm_var": 23.403335571289062, |
| "learning_rate": 2e-05, |
| "loss": 0.0838, |
| "loss/crossentropy": 1.227138563990593, |
| "loss/hidden": 0.07861328125, |
| "loss/logits": 0.005217177560552955, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.197, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 23.394847615559897, |
| "learning_rate": 2e-05, |
| "loss": 0.1046, |
| "loss/crossentropy": 2.0176168084144592, |
| "loss/hidden": 0.095458984375, |
| "loss/logits": 0.009147892240434885, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.198, |
| "grad_norm": 1.0390625, |
| "grad_norm_var": 23.54477513631185, |
| "learning_rate": 2e-05, |
| "loss": 0.0865, |
| "loss/crossentropy": 2.086784780025482, |
| "loss/hidden": 0.0791015625, |
| "loss/logits": 0.007412708830088377, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.199, |
| "grad_norm": 0.89453125, |
| "grad_norm_var": 23.664772987365723, |
| "learning_rate": 2e-05, |
| "loss": 0.0913, |
| "loss/crossentropy": 2.081687569618225, |
| "loss/hidden": 0.083984375, |
| "loss/logits": 0.007345015881583095, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 23.736997413635255, |
| "learning_rate": 2e-05, |
| "loss": 0.1025, |
| "loss/crossentropy": 1.8505841493606567, |
| "loss/hidden": 0.09375, |
| "loss/logits": 0.008708036039024591, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.201, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 23.5523401260376, |
| "learning_rate": 2e-05, |
| "loss": 0.0948, |
| "loss/crossentropy": 1.9951569437980652, |
| "loss/hidden": 0.0869140625, |
| "loss/logits": 0.007902283919975162, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.202, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 23.499499956766766, |
| "learning_rate": 2e-05, |
| "loss": 0.1069, |
| "loss/crossentropy": 1.6185332536697388, |
| "loss/hidden": 0.099365234375, |
| "loss/logits": 0.007533603347837925, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.203, |
| "grad_norm": 4.84375, |
| "grad_norm_var": 23.645521481831867, |
| "learning_rate": 2e-05, |
| "loss": 0.1039, |
| "loss/crossentropy": 1.2041783891618252, |
| "loss/hidden": 0.098876953125, |
| "loss/logits": 0.0050107044517062604, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 23.611212094624836, |
| "learning_rate": 2e-05, |
| "loss": 0.0982, |
| "loss/crossentropy": 2.0442604422569275, |
| "loss/hidden": 0.0908203125, |
| "loss/logits": 0.007397865178063512, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.205, |
| "grad_norm": 1.0078125, |
| "grad_norm_var": 23.617686398824056, |
| "learning_rate": 2e-05, |
| "loss": 0.1027, |
| "loss/crossentropy": 2.0113831758499146, |
| "loss/hidden": 0.09423828125, |
| "loss/logits": 0.008448891807347536, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.206, |
| "grad_norm": 0.98046875, |
| "grad_norm_var": 1.495696767171224, |
| "learning_rate": 2e-05, |
| "loss": 0.1003, |
| "loss/crossentropy": 1.8787236213684082, |
| "loss/hidden": 0.092529296875, |
| "loss/logits": 0.007756400853395462, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.207, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 1.5086812337239583, |
| "learning_rate": 2e-05, |
| "loss": 0.0918, |
| "loss/crossentropy": 1.0508884936571121, |
| "loss/hidden": 0.08740234375, |
| "loss/logits": 0.004398926626890898, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 1.523536173502604, |
| "learning_rate": 2e-05, |
| "loss": 0.1075, |
| "loss/crossentropy": 1.9026619791984558, |
| "loss/hidden": 0.0986328125, |
| "loss/logits": 0.008909344673156738, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.209, |
| "grad_norm": 2.828125, |
| "grad_norm_var": 1.512256876627604, |
| "learning_rate": 2e-05, |
| "loss": 0.1075, |
| "loss/crossentropy": 2.3852533102035522, |
| "loss/hidden": 0.09716796875, |
| "loss/logits": 0.010357038583606482, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.21, |
| "grad_norm": 3.375, |
| "grad_norm_var": 1.615088907877604, |
| "learning_rate": 2e-05, |
| "loss": 0.1104, |
| "loss/crossentropy": 2.0014060735702515, |
| "loss/hidden": 0.1005859375, |
| "loss/logits": 0.00982728274539113, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.211, |
| "grad_norm": 1.09375, |
| "grad_norm_var": 1.6536516825358072, |
| "learning_rate": 2e-05, |
| "loss": 0.1074, |
| "loss/crossentropy": 1.8040838837623596, |
| "loss/hidden": 0.099365234375, |
| "loss/logits": 0.007996065076440573, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 1.515625, |
| "grad_norm_var": 1.645606486002604, |
| "learning_rate": 2e-05, |
| "loss": 0.1134, |
| "loss/crossentropy": 1.886792778968811, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.008902385365217924, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.213, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 1.6383768717447917, |
| "learning_rate": 2e-05, |
| "loss": 0.1066, |
| "loss/crossentropy": 2.0408843755722046, |
| "loss/hidden": 0.097900390625, |
| "loss/logits": 0.008735827170312405, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.214, |
| "grad_norm": 1.1484375, |
| "grad_norm_var": 1.6264495849609375, |
| "learning_rate": 2e-05, |
| "loss": 0.1105, |
| "loss/crossentropy": 1.7106547951698303, |
| "loss/hidden": 0.1025390625, |
| "loss/logits": 0.008001305861398578, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.215, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 1.5911773045857747, |
| "learning_rate": 2e-05, |
| "loss": 0.1127, |
| "loss/crossentropy": 1.833223044872284, |
| "loss/hidden": 0.1044921875, |
| "loss/logits": 0.008159627206623554, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 5.125, |
| "grad_norm_var": 1.8081023534138998, |
| "learning_rate": 2e-05, |
| "loss": 0.1086, |
| "loss/crossentropy": 2.2071104049682617, |
| "loss/hidden": 0.098876953125, |
| "loss/logits": 0.009761610999703407, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.217, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 1.8164082209269206, |
| "learning_rate": 2e-05, |
| "loss": 0.1258, |
| "loss/crossentropy": 1.903680682182312, |
| "loss/hidden": 0.11376953125, |
| "loss/logits": 0.012041741516441107, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.218, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 1.7940373102823892, |
| "learning_rate": 2e-05, |
| "loss": 0.1137, |
| "loss/crossentropy": 1.6936857104301453, |
| "loss/hidden": 0.105224609375, |
| "loss/logits": 0.00842969585210085, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.219, |
| "grad_norm": 1.2578125, |
| "grad_norm_var": 1.2626835505167644, |
| "learning_rate": 2e-05, |
| "loss": 0.1195, |
| "loss/crossentropy": 1.8624382019042969, |
| "loss/hidden": 0.1083984375, |
| "loss/logits": 0.0110846646130085, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 1.2498559951782227, |
| "learning_rate": 2e-05, |
| "loss": 0.1109, |
| "loss/crossentropy": 1.8767052292823792, |
| "loss/hidden": 0.101318359375, |
| "loss/logits": 0.009544559754431248, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.221, |
| "grad_norm": 34.5, |
| "grad_norm_var": 67.47106266021729, |
| "learning_rate": 2e-05, |
| "loss": 0.1706, |
| "loss/crossentropy": 1.5845009684562683, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.021197613328695297, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.222, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 66.9583241780599, |
| "learning_rate": 2e-05, |
| "loss": 0.1291, |
| "loss/crossentropy": 1.2468619346618652, |
| "loss/hidden": 0.1201171875, |
| "loss/logits": 0.0089989029802382, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.223, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 66.99088923136394, |
| "learning_rate": 2e-05, |
| "loss": 0.1207, |
| "loss/crossentropy": 2.0021961331367493, |
| "loss/hidden": 0.111328125, |
| "loss/logits": 0.009369145147502422, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 66.89891764322917, |
| "learning_rate": 2e-05, |
| "loss": 0.1261, |
| "loss/crossentropy": 2.015094578266144, |
| "loss/hidden": 0.115966796875, |
| "loss/logits": 0.010106082540005445, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 1.578125, |
| "grad_norm_var": 67.20751139322917, |
| "learning_rate": 2e-05, |
| "loss": 0.1121, |
| "loss/crossentropy": 2.212425947189331, |
| "loss/hidden": 0.103271484375, |
| "loss/logits": 0.00880527961999178, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.226, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 67.41955159505208, |
| "learning_rate": 2e-05, |
| "loss": 0.1287, |
| "loss/crossentropy": 1.8419872522354126, |
| "loss/hidden": 0.118896484375, |
| "loss/logits": 0.009808670729398727, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.227, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 67.05773518880208, |
| "learning_rate": 2e-05, |
| "loss": 0.1269, |
| "loss/crossentropy": 1.6536349058151245, |
| "loss/hidden": 0.1181640625, |
| "loss/logits": 0.008706679102033377, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 67.01467056274414, |
| "learning_rate": 2e-05, |
| "loss": 0.1451, |
| "loss/crossentropy": 1.9453503489494324, |
| "loss/hidden": 0.132080078125, |
| "loss/logits": 0.012995603960007429, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.229, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 66.94039611816406, |
| "learning_rate": 2e-05, |
| "loss": 0.1232, |
| "loss/crossentropy": 1.796463429927826, |
| "loss/hidden": 0.113037109375, |
| "loss/logits": 0.01019327249377966, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.23, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 66.81560770670573, |
| "learning_rate": 2e-05, |
| "loss": 0.1195, |
| "loss/crossentropy": 1.794281244277954, |
| "loss/hidden": 0.110107421875, |
| "loss/logits": 0.00934369070455432, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.231, |
| "grad_norm": 1.0078125, |
| "grad_norm_var": 66.88926493326822, |
| "learning_rate": 2e-05, |
| "loss": 0.1218, |
| "loss/crossentropy": 2.2390308380126953, |
| "loss/hidden": 0.1103515625, |
| "loss/logits": 0.011494161561131477, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 4.5625, |
| "grad_norm_var": 66.82781473795573, |
| "learning_rate": 2e-05, |
| "loss": 0.1348, |
| "loss/crossentropy": 0.861265268176794, |
| "loss/hidden": 0.129150390625, |
| "loss/logits": 0.005646681529469788, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.233, |
| "grad_norm": 1.1015625, |
| "grad_norm_var": 67.23394343058268, |
| "learning_rate": 2e-05, |
| "loss": 0.1179, |
| "loss/crossentropy": 2.1645957827568054, |
| "loss/hidden": 0.109130859375, |
| "loss/logits": 0.00873167160898447, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.234, |
| "grad_norm": 3.921875, |
| "grad_norm_var": 66.93128153483073, |
| "learning_rate": 2e-05, |
| "loss": 0.1323, |
| "loss/crossentropy": 1.7015637755393982, |
| "loss/hidden": 0.1240234375, |
| "loss/logits": 0.008305847179144621, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.235, |
| "grad_norm": 2.25, |
| "grad_norm_var": 66.62370986938477, |
| "learning_rate": 2e-05, |
| "loss": 0.1082, |
| "loss/crossentropy": 0.9229027386754751, |
| "loss/hidden": 0.103515625, |
| "loss/logits": 0.0047085680707823485, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 1.796875, |
| "grad_norm_var": 66.7213518778483, |
| "learning_rate": 2e-05, |
| "loss": 0.1256, |
| "loss/crossentropy": 2.2883217334747314, |
| "loss/hidden": 0.115478515625, |
| "loss/logits": 0.010137087665498257, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.237, |
| "grad_norm": 1.125, |
| "grad_norm_var": 1.0078020731608073, |
| "learning_rate": 2e-05, |
| "loss": 0.1318, |
| "loss/crossentropy": 1.9819908738136292, |
| "loss/hidden": 0.1201171875, |
| "loss/logits": 0.011642226949334145, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.238, |
| "grad_norm": 1.9765625, |
| "grad_norm_var": 0.9664225260416667, |
| "learning_rate": 2e-05, |
| "loss": 0.1484, |
| "loss/crossentropy": 1.6898375153541565, |
| "loss/hidden": 0.13671875, |
| "loss/logits": 0.011645910330116749, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.239, |
| "grad_norm": 2.515625, |
| "grad_norm_var": 0.941967519124349, |
| "learning_rate": 2e-05, |
| "loss": 0.1338, |
| "loss/crossentropy": 1.088462270796299, |
| "loss/hidden": 0.12646484375, |
| "loss/logits": 0.007348200713749975, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 1.296875, |
| "grad_norm_var": 0.9616689046223958, |
| "learning_rate": 2e-05, |
| "loss": 0.1269, |
| "loss/crossentropy": 1.7091381549835205, |
| "loss/hidden": 0.1171875, |
| "loss/logits": 0.009663785342127085, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.241, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 0.9512776692708333, |
| "learning_rate": 2e-05, |
| "loss": 0.1346, |
| "loss/crossentropy": 2.133821189403534, |
| "loss/hidden": 0.12451171875, |
| "loss/logits": 0.010053665610030293, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.242, |
| "grad_norm": 2.921875, |
| "grad_norm_var": 1.0006093343098958, |
| "learning_rate": 2e-05, |
| "loss": 0.1848, |
| "loss/crossentropy": 1.743838369846344, |
| "loss/hidden": 0.167724609375, |
| "loss/logits": 0.01710428297519684, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.243, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 1.0461748758951823, |
| "learning_rate": 2e-05, |
| "loss": 0.1427, |
| "loss/crossentropy": 1.751486599445343, |
| "loss/hidden": 0.1318359375, |
| "loss/logits": 0.010820105206221342, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 2.359375, |
| "grad_norm_var": 1.0426340738932292, |
| "learning_rate": 2e-05, |
| "loss": 0.1599, |
| "loss/crossentropy": 1.9516586065292358, |
| "loss/hidden": 0.14501953125, |
| "loss/logits": 0.014852947555482388, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.245, |
| "grad_norm": 1.9375, |
| "grad_norm_var": 1.0335955301920572, |
| "learning_rate": 2e-05, |
| "loss": 0.1764, |
| "loss/crossentropy": 1.5456209182739258, |
| "loss/hidden": 0.162109375, |
| "loss/logits": 0.01430825050920248, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.246, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 1.0238176981608074, |
| "learning_rate": 2e-05, |
| "loss": 0.1353, |
| "loss/crossentropy": 1.987372636795044, |
| "loss/hidden": 0.125244140625, |
| "loss/logits": 0.010050846263766289, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.247, |
| "grad_norm": 1.46875, |
| "grad_norm_var": 0.9706868489583333, |
| "learning_rate": 2e-05, |
| "loss": 0.1573, |
| "loss/crossentropy": 1.7526296973228455, |
| "loss/hidden": 0.14453125, |
| "loss/logits": 0.012786582577973604, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 0.6801096598307291, |
| "learning_rate": 2e-05, |
| "loss": 0.163, |
| "loss/crossentropy": 1.7866697311401367, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.01353980042040348, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.249, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.6205848693847656, |
| "learning_rate": 2e-05, |
| "loss": 0.1482, |
| "loss/crossentropy": 1.6994678974151611, |
| "loss/hidden": 0.13720703125, |
| "loss/logits": 0.010973525233566761, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 0.38313395182291665, |
| "learning_rate": 2e-05, |
| "loss": 0.1356, |
| "loss/crossentropy": 1.5037266612052917, |
| "loss/hidden": 0.126220703125, |
| "loss/logits": 0.00937253376469016, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.251, |
| "grad_norm": 1.234375, |
| "grad_norm_var": 0.40925191243489584, |
| "learning_rate": 2e-05, |
| "loss": 0.1432, |
| "loss/crossentropy": 2.2240084409713745, |
| "loss/hidden": 0.13037109375, |
| "loss/logits": 0.012790975160896778, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.252, |
| "grad_norm": 4.40625, |
| "grad_norm_var": 0.7977701822916666, |
| "learning_rate": 2e-05, |
| "loss": 0.1575, |
| "loss/crossentropy": 2.0401850938796997, |
| "loss/hidden": 0.1455078125, |
| "loss/logits": 0.011962223332375288, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.253, |
| "grad_norm": 7.1875, |
| "grad_norm_var": 2.3339192708333334, |
| "learning_rate": 2e-05, |
| "loss": 0.146, |
| "loss/crossentropy": 0.35353903006762266, |
| "loss/hidden": 0.143798828125, |
| "loss/logits": 0.0022313917579595, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.254, |
| "grad_norm": 2.5, |
| "grad_norm_var": 2.318328603108724, |
| "learning_rate": 2e-05, |
| "loss": 0.1481, |
| "loss/crossentropy": 2.2920628786087036, |
| "loss/hidden": 0.13525390625, |
| "loss/logits": 0.012886842247098684, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.255, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 2.3668230692545573, |
| "learning_rate": 2e-05, |
| "loss": 0.1543, |
| "loss/crossentropy": 2.8475781679153442, |
| "loss/hidden": 0.140625, |
| "loss/logits": 0.013668275438249111, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 2.2871355692545574, |
| "learning_rate": 2e-05, |
| "loss": 0.17, |
| "loss/crossentropy": 1.3631716966629028, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.011748222634196281, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.257, |
| "grad_norm": 5.65625, |
| "grad_norm_var": 2.873457590738932, |
| "learning_rate": 2e-05, |
| "loss": 0.1634, |
| "loss/crossentropy": 1.473731517791748, |
| "loss/hidden": 0.1513671875, |
| "loss/logits": 0.01205692021176219, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.258, |
| "grad_norm": 1.4296875, |
| "grad_norm_var": 2.970556640625, |
| "learning_rate": 2e-05, |
| "loss": 0.1436, |
| "loss/crossentropy": 1.6837199926376343, |
| "loss/hidden": 0.1337890625, |
| "loss/logits": 0.009849925991147757, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.259, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 2.9049415588378906, |
| "learning_rate": 2e-05, |
| "loss": 0.1372, |
| "loss/crossentropy": 2.395468235015869, |
| "loss/hidden": 0.12744140625, |
| "loss/logits": 0.009797783568501472, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 1.3515625, |
| "grad_norm_var": 3.0064117431640627, |
| "learning_rate": 2e-05, |
| "loss": 0.1464, |
| "loss/crossentropy": 2.267671525478363, |
| "loss/hidden": 0.13427734375, |
| "loss/logits": 0.012089194264262915, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.261, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 3.054709625244141, |
| "learning_rate": 2e-05, |
| "loss": 0.1548, |
| "loss/crossentropy": 1.5905962586402893, |
| "loss/hidden": 0.14013671875, |
| "loss/logits": 0.014669264666736126, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.262, |
| "grad_norm": 2.609375, |
| "grad_norm_var": 2.9945963541666667, |
| "learning_rate": 2e-05, |
| "loss": 0.162, |
| "loss/crossentropy": 2.0416821241378784, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.012546437792479992, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.263, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 3.004378255208333, |
| "learning_rate": 2e-05, |
| "loss": 0.1679, |
| "loss/crossentropy": 1.7455708384513855, |
| "loss/hidden": 0.154296875, |
| "loss/logits": 0.01356972474604845, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 2.99639892578125, |
| "learning_rate": 2e-05, |
| "loss": 0.1899, |
| "loss/crossentropy": 1.721125602722168, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.015131386462599039, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.265, |
| "grad_norm": 1.6640625, |
| "grad_norm_var": 3.015618642171224, |
| "learning_rate": 2e-05, |
| "loss": 0.1624, |
| "loss/crossentropy": 2.1271780133247375, |
| "loss/hidden": 0.1494140625, |
| "loss/logits": 0.01298526581376791, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.266, |
| "grad_norm": 2.65625, |
| "grad_norm_var": 2.99136962890625, |
| "learning_rate": 2e-05, |
| "loss": 0.1635, |
| "loss/crossentropy": 2.186352849006653, |
| "loss/hidden": 0.150390625, |
| "loss/logits": 0.013157612178474665, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.267, |
| "grad_norm": 4.75, |
| "grad_norm_var": 3.1513580322265624, |
| "learning_rate": 2e-05, |
| "loss": 0.1739, |
| "loss/crossentropy": 1.7296316027641296, |
| "loss/hidden": 0.16015625, |
| "loss/logits": 0.013769491575658321, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.268, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 2.9750935872395834, |
| "learning_rate": 2e-05, |
| "loss": 0.1812, |
| "loss/crossentropy": 1.0499393939971924, |
| "loss/hidden": 0.17138671875, |
| "loss/logits": 0.009774566628038883, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.269, |
| "grad_norm": 1.3046875, |
| "grad_norm_var": 1.554754384358724, |
| "learning_rate": 2e-05, |
| "loss": 0.1479, |
| "loss/crossentropy": 1.8199211955070496, |
| "loss/hidden": 0.13623046875, |
| "loss/logits": 0.011690349318087101, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 1.5819976806640625, |
| "learning_rate": 2e-05, |
| "loss": 0.1663, |
| "loss/crossentropy": 1.792409062385559, |
| "loss/hidden": 0.15380859375, |
| "loss/logits": 0.012502346187829971, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.271, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 1.5571034749348958, |
| "learning_rate": 2e-05, |
| "loss": 0.1633, |
| "loss/crossentropy": 1.1957984417676926, |
| "loss/hidden": 0.154296875, |
| "loss/logits": 0.009009127970784903, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 1.4765625, |
| "grad_norm_var": 1.5925636291503906, |
| "learning_rate": 2e-05, |
| "loss": 0.1714, |
| "loss/crossentropy": 2.025519847869873, |
| "loss/hidden": 0.158203125, |
| "loss/logits": 0.013240211643278599, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.273, |
| "grad_norm": 1.1640625, |
| "grad_norm_var": 0.7817179361979166, |
| "learning_rate": 2e-05, |
| "loss": 0.1741, |
| "loss/crossentropy": 2.028028964996338, |
| "loss/hidden": 0.15966796875, |
| "loss/logits": 0.014450239017605782, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.274, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 0.7689036051432292, |
| "learning_rate": 2e-05, |
| "loss": 0.183, |
| "loss/crossentropy": 1.8278576731681824, |
| "loss/hidden": 0.16796875, |
| "loss/logits": 0.01498580351471901, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 0.7609659830729166, |
| "learning_rate": 2e-05, |
| "loss": 0.1968, |
| "loss/crossentropy": 1.6091482639312744, |
| "loss/hidden": 0.1796875, |
| "loss/logits": 0.017123520374298096, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.276, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 0.746240234375, |
| "learning_rate": 2e-05, |
| "loss": 0.1621, |
| "loss/crossentropy": 2.029966711997986, |
| "loss/hidden": 0.15087890625, |
| "loss/logits": 0.011231188662350178, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.277, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 0.7426106770833333, |
| "learning_rate": 2e-05, |
| "loss": 0.1714, |
| "loss/crossentropy": 1.753463089466095, |
| "loss/hidden": 0.1591796875, |
| "loss/logits": 0.01222995389252901, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.278, |
| "grad_norm": 1.390625, |
| "grad_norm_var": 0.7326131184895833, |
| "learning_rate": 2e-05, |
| "loss": 0.1484, |
| "loss/crossentropy": 2.815610885620117, |
| "loss/hidden": 0.13623046875, |
| "loss/logits": 0.012166950851678848, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.279, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 0.7224568684895833, |
| "learning_rate": 2e-05, |
| "loss": 0.19, |
| "loss/crossentropy": 1.6269866228103638, |
| "loss/hidden": 0.17529296875, |
| "loss/logits": 0.014737497083842754, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 2.453125, |
| "grad_norm_var": 0.7385701497395833, |
| "learning_rate": 2e-05, |
| "loss": 0.1486, |
| "loss/crossentropy": 0.8505218476057053, |
| "loss/hidden": 0.141845703125, |
| "loss/logits": 0.006716415053233504, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.281, |
| "grad_norm": 3.78125, |
| "grad_norm_var": 0.9365740458170573, |
| "learning_rate": 2e-05, |
| "loss": 0.1775, |
| "loss/crossentropy": 2.1807326674461365, |
| "loss/hidden": 0.1611328125, |
| "loss/logits": 0.016383664682507515, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.282, |
| "grad_norm": 1.34375, |
| "grad_norm_var": 0.9446917215983073, |
| "learning_rate": 2e-05, |
| "loss": 0.1684, |
| "loss/crossentropy": 2.2373656034469604, |
| "loss/hidden": 0.15478515625, |
| "loss/logits": 0.013567784801125526, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.283, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.40909805297851565, |
| "learning_rate": 2e-05, |
| "loss": 0.1811, |
| "loss/crossentropy": 2.249536633491516, |
| "loss/hidden": 0.16455078125, |
| "loss/logits": 0.01651722937822342, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.284, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 0.40612157185872394, |
| "learning_rate": 2e-05, |
| "loss": 0.1828, |
| "loss/crossentropy": 1.6665422916412354, |
| "loss/hidden": 0.16943359375, |
| "loss/logits": 0.013365902472287416, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.285, |
| "grad_norm": 2.375, |
| "grad_norm_var": 0.4088043212890625, |
| "learning_rate": 2e-05, |
| "loss": 0.1882, |
| "loss/crossentropy": 1.3738424181938171, |
| "loss/hidden": 0.17578125, |
| "loss/logits": 0.012407675385475159, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.286, |
| "grad_norm": 2.5, |
| "grad_norm_var": 0.42530288696289065, |
| "learning_rate": 2e-05, |
| "loss": 0.1692, |
| "loss/crossentropy": 2.0839606523513794, |
| "loss/hidden": 0.15673828125, |
| "loss/logits": 0.01245969720184803, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.287, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 0.4140968322753906, |
| "learning_rate": 2e-05, |
| "loss": 0.2152, |
| "loss/crossentropy": 1.6726297736167908, |
| "loss/hidden": 0.197265625, |
| "loss/logits": 0.017983403988182545, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.412841796875, |
| "learning_rate": 2e-05, |
| "loss": 0.1656, |
| "loss/crossentropy": 2.1090754866600037, |
| "loss/hidden": 0.15185546875, |
| "loss/logits": 0.013696501962840557, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.289, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 0.37602920532226564, |
| "learning_rate": 2e-05, |
| "loss": 0.2104, |
| "loss/crossentropy": 2.181394100189209, |
| "loss/hidden": 0.19140625, |
| "loss/logits": 0.01902033807709813, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.29, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 0.3973426818847656, |
| "learning_rate": 2e-05, |
| "loss": 0.1687, |
| "loss/crossentropy": 1.9306071400642395, |
| "loss/hidden": 0.15625, |
| "loss/logits": 0.012489455286413431, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.291, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 0.40146865844726565, |
| "learning_rate": 2e-05, |
| "loss": 0.1893, |
| "loss/crossentropy": 1.672035276889801, |
| "loss/hidden": 0.17578125, |
| "loss/logits": 0.013549179770052433, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.292, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 0.3986968994140625, |
| "learning_rate": 2e-05, |
| "loss": 0.1994, |
| "loss/crossentropy": 1.8278534412384033, |
| "loss/hidden": 0.1845703125, |
| "loss/logits": 0.014865144155919552, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.293, |
| "grad_norm": 1.640625, |
| "grad_norm_var": 0.39588394165039065, |
| "learning_rate": 2e-05, |
| "loss": 0.1816, |
| "loss/crossentropy": 2.2214853763580322, |
| "loss/hidden": 0.16748046875, |
| "loss/logits": 0.014129704795777798, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.294, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 0.3811767578125, |
| "learning_rate": 2e-05, |
| "loss": 0.2136, |
| "loss/crossentropy": 1.7116456627845764, |
| "loss/hidden": 0.1982421875, |
| "loss/logits": 0.015314914286136627, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.295, |
| "grad_norm": 1.3359375, |
| "grad_norm_var": 0.3967445373535156, |
| "learning_rate": 2e-05, |
| "loss": 0.1888, |
| "loss/crossentropy": 1.9804767370224, |
| "loss/hidden": 0.17431640625, |
| "loss/logits": 0.014444439671933651, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 1.875, |
| "grad_norm_var": 0.37574234008789065, |
| "learning_rate": 2e-05, |
| "loss": 0.2019, |
| "loss/crossentropy": 1.738105595111847, |
| "loss/hidden": 0.18896484375, |
| "loss/logits": 0.012953449971973896, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.297, |
| "grad_norm": 1.859375, |
| "grad_norm_var": 0.11774063110351562, |
| "learning_rate": 2e-05, |
| "loss": 0.227, |
| "loss/crossentropy": 2.2144758701324463, |
| "loss/hidden": 0.2060546875, |
| "loss/logits": 0.02090621367096901, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.298, |
| "grad_norm": 1.1953125, |
| "grad_norm_var": 0.12722574869791667, |
| "learning_rate": 2e-05, |
| "loss": 0.1889, |
| "loss/crossentropy": 2.162367820739746, |
| "loss/hidden": 0.173828125, |
| "loss/logits": 0.015115509741008282, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.299, |
| "grad_norm": 1.40625, |
| "grad_norm_var": 0.1330963134765625, |
| "learning_rate": 2e-05, |
| "loss": 0.1827, |
| "loss/crossentropy": 1.5395055413246155, |
| "loss/hidden": 0.16943359375, |
| "loss/logits": 0.013315946329385042, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 0.1329241434733073, |
| "learning_rate": 2e-05, |
| "loss": 0.1786, |
| "loss/crossentropy": 2.3225117325782776, |
| "loss/hidden": 0.16455078125, |
| "loss/logits": 0.014059492386877537, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.301, |
| "grad_norm": 2.3125, |
| "grad_norm_var": 0.12767918904622397, |
| "learning_rate": 2e-05, |
| "loss": 0.2045, |
| "loss/crossentropy": 1.6509876251220703, |
| "loss/hidden": 0.18798828125, |
| "loss/logits": 0.016557442024350166, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.302, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 0.08436660766601563, |
| "learning_rate": 2e-05, |
| "loss": 0.2172, |
| "loss/crossentropy": 1.8931352496147156, |
| "loss/hidden": 0.19970703125, |
| "loss/logits": 0.017513110302388668, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.303, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 0.08661295572916666, |
| "learning_rate": 2e-05, |
| "loss": 0.1832, |
| "loss/crossentropy": 2.438356041908264, |
| "loss/hidden": 0.16943359375, |
| "loss/logits": 0.01381587702780962, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 2.53125, |
| "grad_norm_var": 0.13696695963541666, |
| "learning_rate": 2e-05, |
| "loss": 0.2107, |
| "loss/crossentropy": 1.7741122841835022, |
| "loss/hidden": 0.193359375, |
| "loss/logits": 0.017319072037935257, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.305, |
| "grad_norm": 1.90625, |
| "grad_norm_var": 0.13135477701822917, |
| "learning_rate": 2e-05, |
| "loss": 0.1921, |
| "loss/crossentropy": 2.2266836166381836, |
| "loss/hidden": 0.17578125, |
| "loss/logits": 0.016311957966536283, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.306, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 0.12749735514322916, |
| "learning_rate": 2e-05, |
| "loss": 0.1934, |
| "loss/crossentropy": 1.9566237926483154, |
| "loss/hidden": 0.17919921875, |
| "loss/logits": 0.014166621956974268, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.307, |
| "grad_norm": 5.46875, |
| "grad_norm_var": 1.0239664713541667, |
| "learning_rate": 2e-05, |
| "loss": 0.1923, |
| "loss/crossentropy": 0.48859013617038727, |
| "loss/hidden": 0.18603515625, |
| "loss/logits": 0.006227712146937847, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.308, |
| "grad_norm": 2.375, |
| "grad_norm_var": 1.0305165608723958, |
| "learning_rate": 2e-05, |
| "loss": 0.2001, |
| "loss/crossentropy": 1.4687460958957672, |
| "loss/hidden": 0.18994140625, |
| "loss/logits": 0.01019106456078589, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.309, |
| "grad_norm": 1.6328125, |
| "grad_norm_var": 1.0308570861816406, |
| "learning_rate": 2e-05, |
| "loss": 0.2086, |
| "loss/crossentropy": 2.159992814064026, |
| "loss/hidden": 0.19189453125, |
| "loss/logits": 0.016735559795051813, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.31, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 1.0341936747233074, |
| "learning_rate": 2e-05, |
| "loss": 0.2022, |
| "loss/crossentropy": 2.1935722827911377, |
| "loss/hidden": 0.18505859375, |
| "loss/logits": 0.01716968510299921, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.311, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 1.0107866923014324, |
| "learning_rate": 2e-05, |
| "loss": 0.1908, |
| "loss/crossentropy": 2.2257096767425537, |
| "loss/hidden": 0.1748046875, |
| "loss/logits": 0.01596822775900364, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 1.5625, |
| "grad_norm_var": 1.0214271545410156, |
| "learning_rate": 2e-05, |
| "loss": 0.2196, |
| "loss/crossentropy": 1.4189770817756653, |
| "loss/hidden": 0.20458984375, |
| "loss/logits": 0.01496713887900114, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.313, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 1.0413449605305989, |
| "learning_rate": 2e-05, |
| "loss": 0.2043, |
| "loss/crossentropy": 1.9225648045539856, |
| "loss/hidden": 0.18896484375, |
| "loss/logits": 0.015352770686149597, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.314, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 1.0115130106608072, |
| "learning_rate": 2e-05, |
| "loss": 0.2049, |
| "loss/crossentropy": 1.8363057971000671, |
| "loss/hidden": 0.1904296875, |
| "loss/logits": 0.01448015309870243, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.315, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.9967844645182292, |
| "learning_rate": 2e-05, |
| "loss": 0.2193, |
| "loss/crossentropy": 1.560690462589264, |
| "loss/hidden": 0.20458984375, |
| "loss/logits": 0.014727555215358734, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.316, |
| "grad_norm": 53.5, |
| "grad_norm_var": 166.26641006469725, |
| "learning_rate": 2e-05, |
| "loss": 0.2757, |
| "loss/crossentropy": 2.1398332118988037, |
| "loss/hidden": 0.25537109375, |
| "loss/logits": 0.020312640815973282, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.317, |
| "grad_norm": 1.265625, |
| "grad_norm_var": 166.75031712849935, |
| "learning_rate": 2e-05, |
| "loss": 0.197, |
| "loss/crossentropy": 2.4257794618606567, |
| "loss/hidden": 0.1806640625, |
| "loss/logits": 0.0163337429985404, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.318, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 166.5444325764974, |
| "learning_rate": 2e-05, |
| "loss": 0.2233, |
| "loss/crossentropy": 1.9029493927955627, |
| "loss/hidden": 0.20556640625, |
| "loss/logits": 0.017750290222465992, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.319, |
| "grad_norm": 3.703125, |
| "grad_norm_var": 165.65049006144207, |
| "learning_rate": 2e-05, |
| "loss": 0.1867, |
| "loss/crossentropy": 1.2511920928955078, |
| "loss/hidden": 0.17578125, |
| "loss/logits": 0.010939843952655792, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 166.22625503540038, |
| "learning_rate": 2e-05, |
| "loss": 0.1851, |
| "loss/crossentropy": 2.3942267894744873, |
| "loss/hidden": 0.17138671875, |
| "loss/logits": 0.013719380833208561, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.321, |
| "grad_norm": 1.2265625, |
| "grad_norm_var": 166.56466064453124, |
| "learning_rate": 2e-05, |
| "loss": 0.1952, |
| "loss/crossentropy": 1.8679406642913818, |
| "loss/hidden": 0.18212890625, |
| "loss/logits": 0.013037680182605982, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.322, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 166.50121663411457, |
| "learning_rate": 2e-05, |
| "loss": 0.2283, |
| "loss/crossentropy": 1.9437570571899414, |
| "loss/hidden": 0.2109375, |
| "loss/logits": 0.017378264106810093, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.323, |
| "grad_norm": 2.09375, |
| "grad_norm_var": 167.1313924153646, |
| "learning_rate": 2e-05, |
| "loss": 0.2546, |
| "loss/crossentropy": 1.438418209552765, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.019218457862734795, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.324, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 167.37066624959309, |
| "learning_rate": 2e-05, |
| "loss": 0.1924, |
| "loss/crossentropy": 1.98748379945755, |
| "loss/hidden": 0.17919921875, |
| "loss/logits": 0.013240392319858074, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 1.984375, |
| "grad_norm_var": 167.21874593098957, |
| "learning_rate": 2e-05, |
| "loss": 0.1905, |
| "loss/crossentropy": 1.5414408445358276, |
| "loss/hidden": 0.1787109375, |
| "loss/logits": 0.011756124440580606, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.326, |
| "grad_norm": 1.625, |
| "grad_norm_var": 167.2079709370931, |
| "learning_rate": 2e-05, |
| "loss": 0.2237, |
| "loss/crossentropy": 2.0893434286117554, |
| "loss/hidden": 0.20556640625, |
| "loss/logits": 0.018133269622921944, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.327, |
| "grad_norm": 2.84375, |
| "grad_norm_var": 166.79621175130208, |
| "learning_rate": 2e-05, |
| "loss": 0.1658, |
| "loss/crossentropy": 0.912944782525301, |
| "loss/hidden": 0.1591796875, |
| "loss/logits": 0.006613137491513044, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 166.37202962239584, |
| "learning_rate": 2e-05, |
| "loss": 0.2162, |
| "loss/crossentropy": 1.5049991756677628, |
| "loss/hidden": 0.20361328125, |
| "loss/logits": 0.012560161063447595, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.329, |
| "grad_norm": 2.796875, |
| "grad_norm_var": 166.28765462239582, |
| "learning_rate": 2e-05, |
| "loss": 0.2336, |
| "loss/crossentropy": 1.5983822345733643, |
| "loss/hidden": 0.21630859375, |
| "loss/logits": 0.0172890517860651, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 3.96875, |
| "grad_norm_var": 165.46708145141602, |
| "learning_rate": 2e-05, |
| "loss": 0.2399, |
| "loss/crossentropy": 2.047563672065735, |
| "loss/hidden": 0.220703125, |
| "loss/logits": 0.01918058656156063, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.331, |
| "grad_norm": 1.5703125, |
| "grad_norm_var": 165.49062881469726, |
| "learning_rate": 2e-05, |
| "loss": 0.2215, |
| "loss/crossentropy": 2.413437008857727, |
| "loss/hidden": 0.203125, |
| "loss/logits": 0.018364297226071358, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.332, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.7243405659993489, |
| "learning_rate": 2e-05, |
| "loss": 0.2291, |
| "loss/crossentropy": 1.7002423405647278, |
| "loss/hidden": 0.2138671875, |
| "loss/logits": 0.015211229212582111, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.333, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.6864031473795573, |
| "learning_rate": 2e-05, |
| "loss": 0.2687, |
| "loss/crossentropy": 1.7492387890815735, |
| "loss/hidden": 0.24755859375, |
| "loss/logits": 0.021109523251652718, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.334, |
| "grad_norm": 2.671875, |
| "grad_norm_var": 0.7023996988932292, |
| "learning_rate": 2e-05, |
| "loss": 0.267, |
| "loss/crossentropy": 1.8675777316093445, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.019932386465370655, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.335, |
| "grad_norm": 3.515625, |
| "grad_norm_var": 0.6664377848307291, |
| "learning_rate": 2e-05, |
| "loss": 0.2483, |
| "loss/crossentropy": 1.4301924109458923, |
| "loss/hidden": 0.23193359375, |
| "loss/logits": 0.016404244117438793, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 2.046875, |
| "grad_norm_var": 0.61285400390625, |
| "learning_rate": 2e-05, |
| "loss": 0.2359, |
| "loss/crossentropy": 1.8374149799346924, |
| "loss/hidden": 0.21826171875, |
| "loss/logits": 0.017619389109313488, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.337, |
| "grad_norm": 1.4140625, |
| "grad_norm_var": 0.59039306640625, |
| "learning_rate": 2e-05, |
| "loss": 0.2243, |
| "loss/crossentropy": 1.9097219705581665, |
| "loss/hidden": 0.20751953125, |
| "loss/logits": 0.016775669530034065, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.338, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 0.5918365478515625, |
| "learning_rate": 2e-05, |
| "loss": 0.2368, |
| "loss/crossentropy": 1.950836181640625, |
| "loss/hidden": 0.21826171875, |
| "loss/logits": 0.018574767746031284, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.339, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 0.6037961324055989, |
| "learning_rate": 2e-05, |
| "loss": 0.2568, |
| "loss/crossentropy": 2.2642165422439575, |
| "loss/hidden": 0.236328125, |
| "loss/logits": 0.020494097843766212, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 3.421875, |
| "grad_norm_var": 0.6790812174479167, |
| "learning_rate": 2e-05, |
| "loss": 0.2163, |
| "loss/crossentropy": 0.9126859158277512, |
| "loss/hidden": 0.208984375, |
| "loss/logits": 0.007284831663127989, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.341, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 0.6914713541666667, |
| "learning_rate": 2e-05, |
| "loss": 0.2768, |
| "loss/crossentropy": 2.073229432106018, |
| "loss/hidden": 0.25244140625, |
| "loss/logits": 0.02436121366918087, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.342, |
| "grad_norm": 1.984375, |
| "grad_norm_var": 0.6675364176432291, |
| "learning_rate": 2e-05, |
| "loss": 0.2537, |
| "loss/crossentropy": 2.194327235221863, |
| "loss/hidden": 0.23388671875, |
| "loss/logits": 0.019843836314976215, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.343, |
| "grad_norm": 2.125, |
| "grad_norm_var": 0.6491933186848958, |
| "learning_rate": 2e-05, |
| "loss": 0.2386, |
| "loss/crossentropy": 2.0116490721702576, |
| "loss/hidden": 0.21923828125, |
| "loss/logits": 0.01933881640434265, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 0.6516334533691406, |
| "learning_rate": 2e-05, |
| "loss": 0.2251, |
| "loss/crossentropy": 2.178938388824463, |
| "loss/hidden": 0.2080078125, |
| "loss/logits": 0.01710722129791975, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.345, |
| "grad_norm": 2.75, |
| "grad_norm_var": 0.6481910705566406, |
| "learning_rate": 2e-05, |
| "loss": 0.2717, |
| "loss/crossentropy": 1.5063744187355042, |
| "loss/hidden": 0.2529296875, |
| "loss/logits": 0.018804851919412613, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.346, |
| "grad_norm": 7.65625, |
| "grad_norm_var": 2.357260894775391, |
| "learning_rate": 2e-05, |
| "loss": 0.2667, |
| "loss/crossentropy": 2.1732876300811768, |
| "loss/hidden": 0.24365234375, |
| "loss/logits": 0.023004580289125443, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.347, |
| "grad_norm": 1.4453125, |
| "grad_norm_var": 2.3729265848795573, |
| "learning_rate": 2e-05, |
| "loss": 0.2344, |
| "loss/crossentropy": 2.7138930559158325, |
| "loss/hidden": 0.21533203125, |
| "loss/logits": 0.01906493306159973, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.348, |
| "grad_norm": 1.578125, |
| "grad_norm_var": 2.3552935282389322, |
| "learning_rate": 2e-05, |
| "loss": 0.2579, |
| "loss/crossentropy": 1.9490618705749512, |
| "loss/hidden": 0.23779296875, |
| "loss/logits": 0.02011325489729643, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.349, |
| "grad_norm": 2.1875, |
| "grad_norm_var": 2.3231585184733072, |
| "learning_rate": 2e-05, |
| "loss": 0.2559, |
| "loss/crossentropy": 2.595244288444519, |
| "loss/hidden": 0.234375, |
| "loss/logits": 0.021507341414690018, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 2.34324951171875, |
| "learning_rate": 2e-05, |
| "loss": 0.2372, |
| "loss/crossentropy": 1.9536077380180359, |
| "loss/hidden": 0.21875, |
| "loss/logits": 0.018428120762109756, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.351, |
| "grad_norm": 2.40625, |
| "grad_norm_var": 2.2598297119140627, |
| "learning_rate": 2e-05, |
| "loss": 0.2742, |
| "loss/crossentropy": 1.5974725484848022, |
| "loss/hidden": 0.2548828125, |
| "loss/logits": 0.019336224533617496, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 2.2843902587890623, |
| "learning_rate": 2e-05, |
| "loss": 0.2418, |
| "loss/crossentropy": 1.526045560836792, |
| "loss/hidden": 0.2265625, |
| "loss/logits": 0.015270682983100414, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.353, |
| "grad_norm": 1.84375, |
| "grad_norm_var": 2.2429461161295574, |
| "learning_rate": 2e-05, |
| "loss": 0.2647, |
| "loss/crossentropy": 1.7462327480316162, |
| "loss/hidden": 0.24609375, |
| "loss/logits": 0.018634210340678692, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.354, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 2.2447161356608074, |
| "learning_rate": 2e-05, |
| "loss": 0.2625, |
| "loss/crossentropy": 1.5919590592384338, |
| "loss/hidden": 0.244140625, |
| "loss/logits": 0.018357904627919197, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.355, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 2.2271484375, |
| "learning_rate": 2e-05, |
| "loss": 0.2872, |
| "loss/crossentropy": 1.8317471146583557, |
| "loss/hidden": 0.263671875, |
| "loss/logits": 0.023572669364511967, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.356, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 2.170776112874349, |
| "learning_rate": 2e-05, |
| "loss": 0.2404, |
| "loss/crossentropy": 2.284816026687622, |
| "loss/hidden": 0.22216796875, |
| "loss/logits": 0.01825767755508423, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.357, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 2.199914296468099, |
| "learning_rate": 2e-05, |
| "loss": 0.2427, |
| "loss/crossentropy": 2.321465492248535, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.019115036353468895, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.358, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 2.194817860921224, |
| "learning_rate": 2e-05, |
| "loss": 0.2768, |
| "loss/crossentropy": 1.8682809472084045, |
| "loss/hidden": 0.25732421875, |
| "loss/logits": 0.019482238218188286, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.359, |
| "grad_norm": 1.453125, |
| "grad_norm_var": 2.236197662353516, |
| "learning_rate": 2e-05, |
| "loss": 0.2433, |
| "loss/crossentropy": 1.6617138981819153, |
| "loss/hidden": 0.22802734375, |
| "loss/logits": 0.015255556907504797, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 2.258771769205729, |
| "learning_rate": 2e-05, |
| "loss": 0.2572, |
| "loss/crossentropy": 1.7371195554733276, |
| "loss/hidden": 0.2392578125, |
| "loss/logits": 0.01798265241086483, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.361, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 2.2585100809733074, |
| "learning_rate": 2e-05, |
| "loss": 0.2657, |
| "loss/crossentropy": 2.2058807611465454, |
| "loss/hidden": 0.24365234375, |
| "loss/logits": 0.022086018696427345, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.362, |
| "grad_norm": 1.6796875, |
| "grad_norm_var": 0.09375712076822916, |
| "learning_rate": 2e-05, |
| "loss": 0.2811, |
| "loss/crossentropy": 1.7656659483909607, |
| "loss/hidden": 0.2607421875, |
| "loss/logits": 0.02040653582662344, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.363, |
| "grad_norm": 1.59375, |
| "grad_norm_var": 0.08881403605143229, |
| "learning_rate": 2e-05, |
| "loss": 0.2609, |
| "loss/crossentropy": 1.8748412728309631, |
| "loss/hidden": 0.24169921875, |
| "loss/logits": 0.01923577580600977, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.364, |
| "grad_norm": 1.484375, |
| "grad_norm_var": 0.0918108622233073, |
| "learning_rate": 2e-05, |
| "loss": 0.2624, |
| "loss/crossentropy": 2.2202606201171875, |
| "loss/hidden": 0.2412109375, |
| "loss/logits": 0.021174591034650803, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.365, |
| "grad_norm": 2.875, |
| "grad_norm_var": 0.15979995727539062, |
| "learning_rate": 2e-05, |
| "loss": 0.2783, |
| "loss/crossentropy": 1.4792028665542603, |
| "loss/hidden": 0.2587890625, |
| "loss/logits": 0.01950626727193594, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.366, |
| "grad_norm": 1.28125, |
| "grad_norm_var": 0.1768707275390625, |
| "learning_rate": 2e-05, |
| "loss": 0.2551, |
| "loss/crossentropy": 2.0616153478622437, |
| "loss/hidden": 0.23681640625, |
| "loss/logits": 0.018247103318572044, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.367, |
| "grad_norm": 4.84375, |
| "grad_norm_var": 0.7535552978515625, |
| "learning_rate": 2e-05, |
| "loss": 0.2611, |
| "loss/crossentropy": 1.4847497940063477, |
| "loss/hidden": 0.24365234375, |
| "loss/logits": 0.017470326274633408, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 2.34375, |
| "grad_norm_var": 0.758935546875, |
| "learning_rate": 2e-05, |
| "loss": 0.3112, |
| "loss/crossentropy": 1.5849341750144958, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.01922945911064744, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.369, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 0.7582804361979166, |
| "learning_rate": 2e-05, |
| "loss": 0.2421, |
| "loss/crossentropy": 2.056998133659363, |
| "loss/hidden": 0.2236328125, |
| "loss/logits": 0.018427716568112373, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.37, |
| "grad_norm": 2.484375, |
| "grad_norm_var": 0.7560869852701823, |
| "learning_rate": 2e-05, |
| "loss": 0.2576, |
| "loss/crossentropy": 1.2348133325576782, |
| "loss/hidden": 0.24609375, |
| "loss/logits": 0.011468608397990465, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.371, |
| "grad_norm": 1.8203125, |
| "grad_norm_var": 0.7591379801432292, |
| "learning_rate": 2e-05, |
| "loss": 0.2614, |
| "loss/crossentropy": 1.803863763809204, |
| "loss/hidden": 0.2451171875, |
| "loss/logits": 0.0162671972066164, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.372, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 0.7553456624348959, |
| "learning_rate": 2e-05, |
| "loss": 0.2698, |
| "loss/crossentropy": 2.1798359155654907, |
| "loss/hidden": 0.25, |
| "loss/logits": 0.01984146609902382, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.373, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.7430213928222656, |
| "learning_rate": 2e-05, |
| "loss": 0.2549, |
| "loss/crossentropy": 2.1805367469787598, |
| "loss/hidden": 0.2353515625, |
| "loss/logits": 0.019528215751051903, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.374, |
| "grad_norm": 1.6015625, |
| "grad_norm_var": 0.7527252197265625, |
| "learning_rate": 2e-05, |
| "loss": 0.2466, |
| "loss/crossentropy": 2.310236632823944, |
| "loss/hidden": 0.22900390625, |
| "loss/logits": 0.017580187879502773, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 1.2734375, |
| "grad_norm_var": 0.7680559794108073, |
| "learning_rate": 2e-05, |
| "loss": 0.2513, |
| "loss/crossentropy": 2.1785320043563843, |
| "loss/hidden": 0.2314453125, |
| "loss/logits": 0.01982511207461357, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 4.4375, |
| "grad_norm_var": 1.1152544657389323, |
| "learning_rate": 2e-05, |
| "loss": 0.2596, |
| "loss/crossentropy": 0.8171134777367115, |
| "loss/hidden": 0.25048828125, |
| "loss/logits": 0.009078820585273206, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.377, |
| "grad_norm": 1.328125, |
| "grad_norm_var": 1.1409871419270834, |
| "learning_rate": 2e-05, |
| "loss": 0.2424, |
| "loss/crossentropy": 1.7614737749099731, |
| "loss/hidden": 0.2255859375, |
| "loss/logits": 0.0168464295566082, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.378, |
| "grad_norm": 2.59375, |
| "grad_norm_var": 1.134411366780599, |
| "learning_rate": 2e-05, |
| "loss": 0.274, |
| "loss/crossentropy": 2.0964131355285645, |
| "loss/hidden": 0.2529296875, |
| "loss/logits": 0.021060709841549397, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.379, |
| "grad_norm": 4.28125, |
| "grad_norm_var": 1.361694081624349, |
| "learning_rate": 2e-05, |
| "loss": 0.3697, |
| "loss/crossentropy": 2.450555920600891, |
| "loss/hidden": 0.333984375, |
| "loss/logits": 0.03570409305393696, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 2.75, |
| "grad_norm_var": 1.309454091389974, |
| "learning_rate": 2e-05, |
| "loss": 0.2725, |
| "loss/crossentropy": 1.5147664248943329, |
| "loss/hidden": 0.255859375, |
| "loss/logits": 0.016599873080849648, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.381, |
| "grad_norm": 1.3203125, |
| "grad_norm_var": 1.3758015950520834, |
| "learning_rate": 2e-05, |
| "loss": 0.2579, |
| "loss/crossentropy": 2.537486433982849, |
| "loss/hidden": 0.23486328125, |
| "loss/logits": 0.02300652489066124, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.382, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 1.319549306233724, |
| "learning_rate": 2e-05, |
| "loss": 0.2737, |
| "loss/crossentropy": 2.0867425203323364, |
| "loss/hidden": 0.25390625, |
| "loss/logits": 0.01982806995511055, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.383, |
| "grad_norm": 4.84375, |
| "grad_norm_var": 1.319549306233724, |
| "learning_rate": 2e-05, |
| "loss": 0.3155, |
| "loss/crossentropy": 0.7784290872514248, |
| "loss/hidden": 0.302734375, |
| "loss/logits": 0.012721415725536644, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 1.320037587483724, |
| "learning_rate": 2e-05, |
| "loss": 0.2877, |
| "loss/crossentropy": 1.9695168733596802, |
| "loss/hidden": 0.2646484375, |
| "loss/logits": 0.023098460398614407, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.385, |
| "grad_norm": 2.578125, |
| "grad_norm_var": 1.313659413655599, |
| "learning_rate": 2e-05, |
| "loss": 0.3314, |
| "loss/crossentropy": 2.1691209077835083, |
| "loss/hidden": 0.3037109375, |
| "loss/logits": 0.027717983350157738, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.386, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 1.3251790364583333, |
| "learning_rate": 2e-05, |
| "loss": 0.2891, |
| "loss/crossentropy": 2.215142011642456, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.021533265709877014, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.387, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 1.3295814514160156, |
| "learning_rate": 2e-05, |
| "loss": 0.2626, |
| "loss/crossentropy": 1.975950837135315, |
| "loss/hidden": 0.24267578125, |
| "loss/logits": 0.01991237886250019, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.388, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 1.3295814514160156, |
| "learning_rate": 2e-05, |
| "loss": 0.2888, |
| "loss/crossentropy": 1.7475430369377136, |
| "loss/hidden": 0.2685546875, |
| "loss/logits": 0.020239483565092087, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.389, |
| "grad_norm": 1.5546875, |
| "grad_norm_var": 1.3363075256347656, |
| "learning_rate": 2e-05, |
| "loss": 0.2896, |
| "loss/crossentropy": 2.2691071033477783, |
| "loss/hidden": 0.265625, |
| "loss/logits": 0.023962998762726784, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.39, |
| "grad_norm": 3.703125, |
| "grad_norm_var": 1.3911041259765624, |
| "learning_rate": 2e-05, |
| "loss": 0.2825, |
| "loss/crossentropy": 1.2959857881069183, |
| "loss/hidden": 0.26953125, |
| "loss/logits": 0.01301146112382412, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.391, |
| "grad_norm": 1.5234375, |
| "grad_norm_var": 1.3533762613932292, |
| "learning_rate": 2e-05, |
| "loss": 0.3079, |
| "loss/crossentropy": 1.6272442936897278, |
| "loss/hidden": 0.287109375, |
| "loss/logits": 0.02082507126033306, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 1.671875, |
| "grad_norm_var": 1.1310099283854167, |
| "learning_rate": 2e-05, |
| "loss": 0.3002, |
| "loss/crossentropy": 1.470352828502655, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.01891372073441744, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.393, |
| "grad_norm": 1.6484375, |
| "grad_norm_var": 1.0931292215983073, |
| "learning_rate": 2e-05, |
| "loss": 0.2669, |
| "loss/crossentropy": 2.162920594215393, |
| "loss/hidden": 0.2470703125, |
| "loss/logits": 0.019794010557234287, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.394, |
| "grad_norm": 1.8984375, |
| "grad_norm_var": 1.10401611328125, |
| "learning_rate": 2e-05, |
| "loss": 0.332, |
| "loss/crossentropy": 1.9784889817237854, |
| "loss/hidden": 0.3056640625, |
| "loss/logits": 0.02638086024671793, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.395, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.85587158203125, |
| "learning_rate": 2e-05, |
| "loss": 0.2741, |
| "loss/crossentropy": 2.3192607164382935, |
| "loss/hidden": 0.251953125, |
| "loss/logits": 0.022180434316396713, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.396, |
| "grad_norm": 5.15625, |
| "grad_norm_var": 1.4013509114583333, |
| "learning_rate": 2e-05, |
| "loss": 0.3156, |
| "loss/crossentropy": 1.164614673703909, |
| "loss/hidden": 0.3017578125, |
| "loss/logits": 0.013803320820443332, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.397, |
| "grad_norm": 2.96875, |
| "grad_norm_var": 1.349676259358724, |
| "learning_rate": 2e-05, |
| "loss": 0.3132, |
| "loss/crossentropy": 1.406236708164215, |
| "loss/hidden": 0.2919921875, |
| "loss/logits": 0.021233216859400272, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.398, |
| "grad_norm": 1.8671875, |
| "grad_norm_var": 1.3420041402180989, |
| "learning_rate": 2e-05, |
| "loss": 0.3031, |
| "loss/crossentropy": 2.00553822517395, |
| "loss/hidden": 0.28125, |
| "loss/logits": 0.021876277402043343, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.399, |
| "grad_norm": 1.9453125, |
| "grad_norm_var": 0.9369588216145833, |
| "learning_rate": 2e-05, |
| "loss": 0.3002, |
| "loss/crossentropy": 2.423048973083496, |
| "loss/hidden": 0.27587890625, |
| "loss/logits": 0.024367909878492355, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.9387532552083333, |
| "learning_rate": 2e-05, |
| "loss": 0.2802, |
| "loss/crossentropy": 2.0390727519989014, |
| "loss/hidden": 0.25732421875, |
| "loss/logits": 0.022871583700180054, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.401, |
| "grad_norm": 2.0, |
| "grad_norm_var": 0.9337473551432292, |
| "learning_rate": 2e-05, |
| "loss": 0.2949, |
| "loss/crossentropy": 1.7632352113723755, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.018549944274127483, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.402, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 0.9470110575358073, |
| "learning_rate": 2e-05, |
| "loss": 0.2974, |
| "loss/crossentropy": 2.082307666540146, |
| "loss/hidden": 0.2802734375, |
| "loss/logits": 0.017112099565565586, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.403, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 0.9346900939941406, |
| "learning_rate": 2e-05, |
| "loss": 0.3015, |
| "loss/crossentropy": 1.6929956078529358, |
| "loss/hidden": 0.2822265625, |
| "loss/logits": 0.019285614602267742, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.404, |
| "grad_norm": 1.7734375, |
| "grad_norm_var": 0.9402503967285156, |
| "learning_rate": 2e-05, |
| "loss": 0.2973, |
| "loss/crossentropy": 1.7842009663581848, |
| "loss/hidden": 0.2763671875, |
| "loss/logits": 0.020938260480761528, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.405, |
| "grad_norm": 1.53125, |
| "grad_norm_var": 0.9424957275390625, |
| "learning_rate": 2e-05, |
| "loss": 0.2892, |
| "loss/crossentropy": 2.237372636795044, |
| "loss/hidden": 0.2685546875, |
| "loss/logits": 0.02065686136484146, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.406, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.8275919596354167, |
| "learning_rate": 2e-05, |
| "loss": 0.3242, |
| "loss/crossentropy": 1.9259309768676758, |
| "loss/hidden": 0.298828125, |
| "loss/logits": 0.025404607877135277, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.407, |
| "grad_norm": 1.6171875, |
| "grad_norm_var": 0.8206949869791667, |
| "learning_rate": 2e-05, |
| "loss": 0.3138, |
| "loss/crossentropy": 1.921570897102356, |
| "loss/hidden": 0.2890625, |
| "loss/logits": 0.024776766076683998, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.408, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 0.8104695638020833, |
| "learning_rate": 2e-05, |
| "loss": 0.3062, |
| "loss/crossentropy": 2.0737192630767822, |
| "loss/hidden": 0.2822265625, |
| "loss/logits": 0.023924206383526325, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.409, |
| "grad_norm": 1.3984375, |
| "grad_norm_var": 0.8307169596354167, |
| "learning_rate": 2e-05, |
| "loss": 0.2937, |
| "loss/crossentropy": 1.8156836032867432, |
| "loss/hidden": 0.2724609375, |
| "loss/logits": 0.021192410960793495, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.41, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 0.906597646077474, |
| "learning_rate": 2e-05, |
| "loss": 0.3215, |
| "loss/crossentropy": 1.1065580025315285, |
| "loss/hidden": 0.30322265625, |
| "loss/logits": 0.018325693090446293, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.411, |
| "grad_norm": 1.625, |
| "grad_norm_var": 0.908959706624349, |
| "learning_rate": 2e-05, |
| "loss": 0.2949, |
| "loss/crossentropy": 2.533439874649048, |
| "loss/hidden": 0.271484375, |
| "loss/logits": 0.023434624075889587, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.412, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.2980547587076823, |
| "learning_rate": 2e-05, |
| "loss": 0.3031, |
| "loss/crossentropy": 1.6181366443634033, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.01792371179908514, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.413, |
| "grad_norm": 1.6953125, |
| "grad_norm_var": 0.23284912109375, |
| "learning_rate": 2e-05, |
| "loss": 0.3098, |
| "loss/crossentropy": 2.1245399713516235, |
| "loss/hidden": 0.2880859375, |
| "loss/logits": 0.021745222620666027, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.414, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 0.24203872680664062, |
| "learning_rate": 2e-05, |
| "loss": 0.3426, |
| "loss/crossentropy": 1.6368454098701477, |
| "loss/hidden": 0.3173828125, |
| "loss/logits": 0.025259889662265778, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.415, |
| "grad_norm": 1.8515625, |
| "grad_norm_var": 0.24245986938476563, |
| "learning_rate": 2e-05, |
| "loss": 0.335, |
| "loss/crossentropy": 2.158787727355957, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.02839304693043232, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 0.24166641235351563, |
| "learning_rate": 2e-05, |
| "loss": 0.2975, |
| "loss/crossentropy": 1.4076629281044006, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.020123483147472143, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.417, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 0.24203465779622396, |
| "learning_rate": 2e-05, |
| "loss": 0.3425, |
| "loss/crossentropy": 1.603783369064331, |
| "loss/hidden": 0.3173828125, |
| "loss/logits": 0.02515949122607708, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.418, |
| "grad_norm": 1.921875, |
| "grad_norm_var": 0.1959612528483073, |
| "learning_rate": 2e-05, |
| "loss": 0.2969, |
| "loss/crossentropy": 2.239418148994446, |
| "loss/hidden": 0.2734375, |
| "loss/logits": 0.023483557626605034, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.419, |
| "grad_norm": 4.96875, |
| "grad_norm_var": 0.7954302469889323, |
| "learning_rate": 2e-05, |
| "loss": 0.3047, |
| "loss/crossentropy": 1.924178957939148, |
| "loss/hidden": 0.283203125, |
| "loss/logits": 0.021516622975468636, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 2.375, |
| "grad_norm_var": 0.7949015299479166, |
| "learning_rate": 2e-05, |
| "loss": 0.3283, |
| "loss/crossentropy": 2.2020623683929443, |
| "loss/hidden": 0.2998046875, |
| "loss/logits": 0.028535975143313408, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.421, |
| "grad_norm": 2.703125, |
| "grad_norm_var": 0.7919260660807291, |
| "learning_rate": 2e-05, |
| "loss": 0.312, |
| "loss/crossentropy": 2.484855532646179, |
| "loss/hidden": 0.28515625, |
| "loss/logits": 0.02685546688735485, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.422, |
| "grad_norm": 1.4921875, |
| "grad_norm_var": 0.7867510477701823, |
| "learning_rate": 2e-05, |
| "loss": 0.287, |
| "loss/crossentropy": 2.021208882331848, |
| "loss/hidden": 0.267578125, |
| "loss/logits": 0.019387257285416126, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.423, |
| "grad_norm": 2.25, |
| "grad_norm_var": 0.7646067301432292, |
| "learning_rate": 2e-05, |
| "loss": 0.4152, |
| "loss/crossentropy": 1.692772924900055, |
| "loss/hidden": 0.3828125, |
| "loss/logits": 0.03240843303501606, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.424, |
| "grad_norm": 2.171875, |
| "grad_norm_var": 0.7573557535807292, |
| "learning_rate": 2e-05, |
| "loss": 0.3138, |
| "loss/crossentropy": 1.9183388948440552, |
| "loss/hidden": 0.2939453125, |
| "loss/logits": 0.019842680543661118, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 1.890625, |
| "grad_norm_var": 0.7177019755045573, |
| "learning_rate": 2e-05, |
| "loss": 0.3489, |
| "loss/crossentropy": 2.1215824484825134, |
| "loss/hidden": 0.3212890625, |
| "loss/logits": 0.02758181467652321, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.426, |
| "grad_norm": 2.328125, |
| "grad_norm_var": 0.6474505106608073, |
| "learning_rate": 2e-05, |
| "loss": 0.3388, |
| "loss/crossentropy": 1.9683921933174133, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.02529013529419899, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.427, |
| "grad_norm": 1.8359375, |
| "grad_norm_var": 0.6339029947916667, |
| "learning_rate": 2e-05, |
| "loss": 0.3422, |
| "loss/crossentropy": 2.143548607826233, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.029734219424426556, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.428, |
| "grad_norm": 1.578125, |
| "grad_norm_var": 0.6401438395182292, |
| "learning_rate": 2e-05, |
| "loss": 0.3222, |
| "loss/crossentropy": 2.3858566284179688, |
| "loss/hidden": 0.294921875, |
| "loss/logits": 0.02722975332289934, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.429, |
| "grad_norm": 5.6875, |
| "grad_norm_var": 1.3602190653483073, |
| "learning_rate": 2e-05, |
| "loss": 0.4125, |
| "loss/crossentropy": 1.6411058902740479, |
| "loss/hidden": 0.375, |
| "loss/logits": 0.037471650168299675, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.43, |
| "grad_norm": 2.5, |
| "grad_norm_var": 1.358288319905599, |
| "learning_rate": 2e-05, |
| "loss": 0.3505, |
| "loss/crossentropy": 1.9110031723976135, |
| "loss/hidden": 0.326171875, |
| "loss/logits": 0.024304441176354885, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.431, |
| "grad_norm": 2.953125, |
| "grad_norm_var": 1.3424031575520834, |
| "learning_rate": 2e-05, |
| "loss": 0.3132, |
| "loss/crossentropy": 2.3980900645256042, |
| "loss/hidden": 0.291015625, |
| "loss/logits": 0.022203332744538784, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 1.3248942057291666, |
| "learning_rate": 2e-05, |
| "loss": 0.3188, |
| "loss/crossentropy": 1.1947295665740967, |
| "loss/hidden": 0.3017578125, |
| "loss/logits": 0.017058685887604952, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.433, |
| "grad_norm": 4.65625, |
| "grad_norm_var": 1.5668863932291666, |
| "learning_rate": 2e-05, |
| "loss": 0.327, |
| "loss/crossentropy": 1.3946540653705597, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.01643618429079652, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.434, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 1.5572794596354167, |
| "learning_rate": 2e-05, |
| "loss": 0.354, |
| "loss/crossentropy": 1.5059205293655396, |
| "loss/hidden": 0.330078125, |
| "loss/logits": 0.023915931582450867, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.435, |
| "grad_norm": 1.8828125, |
| "grad_norm_var": 1.2355262756347656, |
| "learning_rate": 2e-05, |
| "loss": 0.3513, |
| "loss/crossentropy": 1.792906641960144, |
| "loss/hidden": 0.3271484375, |
| "loss/logits": 0.024162176996469498, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.436, |
| "grad_norm": 2.4375, |
| "grad_norm_var": 1.2343340555826823, |
| "learning_rate": 2e-05, |
| "loss": 0.3136, |
| "loss/crossentropy": 1.450164943933487, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.016714961268007755, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.437, |
| "grad_norm": 1.765625, |
| "grad_norm_var": 1.2702837626139323, |
| "learning_rate": 2e-05, |
| "loss": 0.3604, |
| "loss/crossentropy": 2.180246353149414, |
| "loss/hidden": 0.3330078125, |
| "loss/logits": 0.027343785390257835, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.438, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 1.2044962565104167, |
| "learning_rate": 2e-05, |
| "loss": 0.3166, |
| "loss/crossentropy": 2.12428879737854, |
| "loss/hidden": 0.2939453125, |
| "loss/logits": 0.022634671069681644, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.439, |
| "grad_norm": 9.6875, |
| "grad_norm_var": 4.37318115234375, |
| "learning_rate": 2e-05, |
| "loss": 0.3195, |
| "loss/crossentropy": 2.7157105207443237, |
| "loss/hidden": 0.2939453125, |
| "loss/logits": 0.025585060939192772, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 2.265625, |
| "grad_norm_var": 4.3633056640625, |
| "learning_rate": 2e-05, |
| "loss": 0.3597, |
| "loss/crossentropy": 2.1406047344207764, |
| "loss/hidden": 0.3310546875, |
| "loss/logits": 0.028656354174017906, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.441, |
| "grad_norm": 6.53125, |
| "grad_norm_var": 5.015591430664062, |
| "learning_rate": 2e-05, |
| "loss": 0.3547, |
| "loss/crossentropy": 1.6884450316429138, |
| "loss/hidden": 0.3359375, |
| "loss/logits": 0.018803046084940434, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.442, |
| "grad_norm": 2.015625, |
| "grad_norm_var": 5.062262980143229, |
| "learning_rate": 2e-05, |
| "loss": 0.3637, |
| "loss/crossentropy": 1.4371431171894073, |
| "loss/hidden": 0.3408203125, |
| "loss/logits": 0.022848802618682384, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.443, |
| "grad_norm": 2.28125, |
| "grad_norm_var": 4.988783518473308, |
| "learning_rate": 2e-05, |
| "loss": 0.3638, |
| "loss/crossentropy": 2.1007304191589355, |
| "loss/hidden": 0.33203125, |
| "loss/logits": 0.03172433376312256, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.444, |
| "grad_norm": 2.625, |
| "grad_norm_var": 4.815531158447266, |
| "learning_rate": 2e-05, |
| "loss": 0.3252, |
| "loss/crossentropy": 1.2444575875997543, |
| "loss/hidden": 0.310546875, |
| "loss/logits": 0.01463229674845934, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.445, |
| "grad_norm": 2.921875, |
| "grad_norm_var": 4.441019439697266, |
| "learning_rate": 2e-05, |
| "loss": 0.2883, |
| "loss/crossentropy": 0.8468852750957012, |
| "loss/hidden": 0.27734375, |
| "loss/logits": 0.010967775015160441, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.446, |
| "grad_norm": 2.78125, |
| "grad_norm_var": 4.41961441040039, |
| "learning_rate": 2e-05, |
| "loss": 0.3366, |
| "loss/crossentropy": 0.8375072181224823, |
| "loss/hidden": 0.32421875, |
| "loss/logits": 0.012416118755936623, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.447, |
| "grad_norm": 3.703125, |
| "grad_norm_var": 4.428061676025391, |
| "learning_rate": 2e-05, |
| "loss": 0.3169, |
| "loss/crossentropy": 1.0745504349470139, |
| "loss/hidden": 0.3017578125, |
| "loss/logits": 0.015155402477830648, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 3.296875, |
| "grad_norm_var": 4.379162343343099, |
| "learning_rate": 2e-05, |
| "loss": 0.3871, |
| "loss/crossentropy": 2.4705830812454224, |
| "loss/hidden": 0.3544921875, |
| "loss/logits": 0.032651511020958424, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.449, |
| "grad_norm": 2.765625, |
| "grad_norm_var": 4.265921783447266, |
| "learning_rate": 2e-05, |
| "loss": 0.3059, |
| "loss/crossentropy": 1.0752387568354607, |
| "loss/hidden": 0.29052734375, |
| "loss/logits": 0.015399906318634748, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 1.921875, |
| "grad_norm_var": 4.28130874633789, |
| "learning_rate": 2e-05, |
| "loss": 0.34, |
| "loss/crossentropy": 2.0603660941123962, |
| "loss/hidden": 0.3134765625, |
| "loss/logits": 0.026526711881160736, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.451, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 4.273233795166016, |
| "learning_rate": 2e-05, |
| "loss": 0.3604, |
| "loss/crossentropy": 1.7995396256446838, |
| "loss/hidden": 0.3349609375, |
| "loss/logits": 0.025428760796785355, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.452, |
| "grad_norm": 2.0, |
| "grad_norm_var": 4.329658762613932, |
| "learning_rate": 2e-05, |
| "loss": 0.3508, |
| "loss/crossentropy": 1.784224808216095, |
| "loss/hidden": 0.3271484375, |
| "loss/logits": 0.02365759387612343, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.453, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 4.334089152018229, |
| "learning_rate": 2e-05, |
| "loss": 0.3251, |
| "loss/crossentropy": 2.343531847000122, |
| "loss/hidden": 0.3017578125, |
| "loss/logits": 0.023351279087364674, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.454, |
| "grad_norm": 1.7265625, |
| "grad_norm_var": 4.417317454020182, |
| "learning_rate": 2e-05, |
| "loss": 0.374, |
| "loss/crossentropy": 1.5861871838569641, |
| "loss/hidden": 0.3466796875, |
| "loss/logits": 0.027331165969371796, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.455, |
| "grad_norm": 2.390625, |
| "grad_norm_var": 1.3722023010253905, |
| "learning_rate": 2e-05, |
| "loss": 0.377, |
| "loss/crossentropy": 1.8146731853485107, |
| "loss/hidden": 0.3505859375, |
| "loss/logits": 0.026416282169520855, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.456, |
| "grad_norm": 2.9375, |
| "grad_norm_var": 1.3631914774576823, |
| "learning_rate": 2e-05, |
| "loss": 0.4535, |
| "loss/crossentropy": 1.2616842985153198, |
| "loss/hidden": 0.4189453125, |
| "loss/logits": 0.03458691854029894, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.457, |
| "grad_norm": 1.5078125, |
| "grad_norm_var": 0.3897369384765625, |
| "learning_rate": 2e-05, |
| "loss": 0.3512, |
| "loss/crossentropy": 2.0038002133369446, |
| "loss/hidden": 0.3232421875, |
| "loss/logits": 0.027975574135780334, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.458, |
| "grad_norm": 1.65625, |
| "grad_norm_var": 0.4166666666666667, |
| "learning_rate": 2e-05, |
| "loss": 0.3401, |
| "loss/crossentropy": 2.4134583473205566, |
| "loss/hidden": 0.314453125, |
| "loss/logits": 0.025651385076344013, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.459, |
| "grad_norm": 1.5390625, |
| "grad_norm_var": 0.4615313212076823, |
| "learning_rate": 2e-05, |
| "loss": 0.3211, |
| "loss/crossentropy": 1.8191776871681213, |
| "loss/hidden": 0.296875, |
| "loss/logits": 0.024273362010717392, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 1.9921875, |
| "grad_norm_var": 0.46254069010416665, |
| "learning_rate": 2e-05, |
| "loss": 0.3388, |
| "loss/crossentropy": 1.9413084983825684, |
| "loss/hidden": 0.314453125, |
| "loss/logits": 0.024359911680221558, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.461, |
| "grad_norm": 1.4375, |
| "grad_norm_var": 0.47732645670572915, |
| "learning_rate": 2e-05, |
| "loss": 0.3526, |
| "loss/crossentropy": 1.9450668096542358, |
| "loss/hidden": 0.328125, |
| "loss/logits": 0.024524222128093243, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.462, |
| "grad_norm": 3.390625, |
| "grad_norm_var": 0.5471110026041667, |
| "learning_rate": 2e-05, |
| "loss": 0.3562, |
| "loss/crossentropy": 0.9243374168872833, |
| "loss/hidden": 0.3369140625, |
| "loss/logits": 0.01929214200936258, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.463, |
| "grad_norm": 3.0625, |
| "grad_norm_var": 0.4483062744140625, |
| "learning_rate": 2e-05, |
| "loss": 0.4748, |
| "loss/crossentropy": 1.9480538964271545, |
| "loss/hidden": 0.435546875, |
| "loss/logits": 0.0392797002568841, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 2.078125, |
| "grad_norm_var": 0.3638824462890625, |
| "learning_rate": 2e-05, |
| "loss": 0.363, |
| "loss/crossentropy": 2.4018349647521973, |
| "loss/hidden": 0.3310546875, |
| "loss/logits": 0.031900130212306976, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.465, |
| "grad_norm": 1.71875, |
| "grad_norm_var": 0.34364013671875, |
| "learning_rate": 2e-05, |
| "loss": 0.3927, |
| "loss/crossentropy": 1.9479365348815918, |
| "loss/hidden": 0.3642578125, |
| "loss/logits": 0.028458724729716778, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.466, |
| "grad_norm": 3.609375, |
| "grad_norm_var": 0.48953857421875, |
| "learning_rate": 2e-05, |
| "loss": 0.4255, |
| "loss/crossentropy": 2.2209248542785645, |
| "loss/hidden": 0.3896484375, |
| "loss/logits": 0.035828106105327606, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.467, |
| "grad_norm": 2.734375, |
| "grad_norm_var": 0.5042335510253906, |
| "learning_rate": 2e-05, |
| "loss": 0.3716, |
| "loss/crossentropy": 2.147653102874756, |
| "loss/hidden": 0.34375, |
| "loss/logits": 0.027827131561934948, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.468, |
| "grad_norm": 6.71875, |
| "grad_norm_var": 1.7573443094889323, |
| "learning_rate": 2e-05, |
| "loss": 0.4079, |
| "loss/crossentropy": 1.6784539818763733, |
| "loss/hidden": 0.37890625, |
| "loss/logits": 0.02900131233036518, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.469, |
| "grad_norm": 17.125, |
| "grad_norm_var": 14.96142578125, |
| "learning_rate": 2e-05, |
| "loss": 0.4166, |
| "loss/crossentropy": 1.8554924130439758, |
| "loss/hidden": 0.3857421875, |
| "loss/logits": 0.030880731530487537, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 3.25, |
| "grad_norm_var": 14.75101089477539, |
| "learning_rate": 2e-05, |
| "loss": 0.4192, |
| "loss/crossentropy": 1.2564690709114075, |
| "loss/hidden": 0.3974609375, |
| "loss/logits": 0.021707525476813316, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.471, |
| "grad_norm": 2.03125, |
| "grad_norm_var": 14.815679677327473, |
| "learning_rate": 2e-05, |
| "loss": 0.334, |
| "loss/crossentropy": 2.4818525314331055, |
| "loss/hidden": 0.306640625, |
| "loss/logits": 0.027309534139931202, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.472, |
| "grad_norm": 3.53125, |
| "grad_norm_var": 14.789277903238933, |
| "learning_rate": 2e-05, |
| "loss": 0.4143, |
| "loss/crossentropy": 1.932157814502716, |
| "loss/hidden": 0.380859375, |
| "loss/logits": 0.033446524292230606, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.473, |
| "grad_norm": 5.59375, |
| "grad_norm_var": 14.700297037760416, |
| "learning_rate": 2e-05, |
| "loss": 0.4902, |
| "loss/crossentropy": 1.8874651789665222, |
| "loss/hidden": 0.435546875, |
| "loss/logits": 0.05470091383904219, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.474, |
| "grad_norm": 1.9140625, |
| "grad_norm_var": 14.629323069254557, |
| "learning_rate": 2e-05, |
| "loss": 0.4083, |
| "loss/crossentropy": 2.1272743344306946, |
| "loss/hidden": 0.376953125, |
| "loss/logits": 0.03137173503637314, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 14.43091532389323, |
| "learning_rate": 2e-05, |
| "loss": 0.4207, |
| "loss/crossentropy": 1.7301099300384521, |
| "loss/hidden": 0.3896484375, |
| "loss/logits": 0.031028682366013527, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.476, |
| "grad_norm": 1.7578125, |
| "grad_norm_var": 14.494132486979167, |
| "learning_rate": 2e-05, |
| "loss": 0.3407, |
| "loss/crossentropy": 2.730847716331482, |
| "loss/hidden": 0.3125, |
| "loss/logits": 0.02817363850772381, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.477, |
| "grad_norm": 153.0, |
| "grad_norm_var": 1400.620027669271, |
| "learning_rate": 2e-05, |
| "loss": 2.013, |
| "loss/crossentropy": 2.05564546585083, |
| "loss/hidden": 1.8564453125, |
| "loss/logits": 0.15657314844429493, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.478, |
| "grad_norm": 19.75, |
| "grad_norm_var": 1395.5939931233725, |
| "learning_rate": 2e-05, |
| "loss": 0.454, |
| "loss/crossentropy": 1.1017511487007141, |
| "loss/hidden": 0.4306640625, |
| "loss/logits": 0.0233338032849133, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.479, |
| "grad_norm": 3.375, |
| "grad_norm_var": 1395.128294881185, |
| "learning_rate": 2e-05, |
| "loss": 0.3834, |
| "loss/crossentropy": 2.434400200843811, |
| "loss/hidden": 0.3525390625, |
| "loss/logits": 0.030910088680684566, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 1.7421875, |
| "grad_norm_var": 1395.687501780192, |
| "learning_rate": 2e-05, |
| "loss": 0.3356, |
| "loss/crossentropy": 2.0959779620170593, |
| "loss/hidden": 0.3115234375, |
| "loss/logits": 0.024123553186655045, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.481, |
| "grad_norm": 2.296875, |
| "grad_norm_var": 1394.732089996338, |
| "learning_rate": 2e-05, |
| "loss": 0.3781, |
| "loss/crossentropy": 2.1706955432891846, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.02846558205783367, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.482, |
| "grad_norm": 2.78125, |
| "grad_norm_var": 1395.9686703999837, |
| "learning_rate": 2e-05, |
| "loss": 0.4153, |
| "loss/crossentropy": 2.017691493034363, |
| "loss/hidden": 0.380859375, |
| "loss/logits": 0.03442922607064247, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.483, |
| "grad_norm": 2.625, |
| "grad_norm_var": 1396.139084625244, |
| "learning_rate": 2e-05, |
| "loss": 0.4602, |
| "loss/crossentropy": 0.9743039608001709, |
| "loss/hidden": 0.4384765625, |
| "loss/logits": 0.02175035234540701, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.484, |
| "grad_norm": 3.09375, |
| "grad_norm_var": 1400.654522450765, |
| "learning_rate": 2e-05, |
| "loss": 0.4012, |
| "loss/crossentropy": 1.763650119304657, |
| "loss/hidden": 0.3701171875, |
| "loss/logits": 0.031122790649533272, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.485, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 1408.7144772847494, |
| "learning_rate": 2e-05, |
| "loss": 0.4013, |
| "loss/crossentropy": 1.809104084968567, |
| "loss/hidden": 0.3720703125, |
| "loss/logits": 0.029215124435722828, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.486, |
| "grad_norm": 2.375, |
| "grad_norm_var": 1409.9230140686036, |
| "learning_rate": 2e-05, |
| "loss": 0.3948, |
| "loss/crossentropy": 2.004360318183899, |
| "loss/hidden": 0.365234375, |
| "loss/logits": 0.029561107978224754, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.487, |
| "grad_norm": 2.546875, |
| "grad_norm_var": 1409.175626373291, |
| "learning_rate": 2e-05, |
| "loss": 0.3458, |
| "loss/crossentropy": 1.6244131922721863, |
| "loss/hidden": 0.3232421875, |
| "loss/logits": 0.022567229345440865, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.488, |
| "grad_norm": 1.921875, |
| "grad_norm_var": 1411.4071673075357, |
| "learning_rate": 2e-05, |
| "loss": 0.376, |
| "loss/crossentropy": 2.131972551345825, |
| "loss/hidden": 0.349609375, |
| "loss/logits": 0.026385605335235596, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.489, |
| "grad_norm": 3.265625, |
| "grad_norm_var": 1414.0684445699055, |
| "learning_rate": 2e-05, |
| "loss": 0.3666, |
| "loss/crossentropy": 2.4170000553131104, |
| "loss/hidden": 0.337890625, |
| "loss/logits": 0.028684877790510654, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.49, |
| "grad_norm": 2.21875, |
| "grad_norm_var": 1413.6267171223958, |
| "learning_rate": 2e-05, |
| "loss": 0.395, |
| "loss/crossentropy": 2.346976161003113, |
| "loss/hidden": 0.3642578125, |
| "loss/logits": 0.030727216973900795, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.491, |
| "grad_norm": 1.8125, |
| "grad_norm_var": 1414.3293446858725, |
| "learning_rate": 2e-05, |
| "loss": 0.3558, |
| "loss/crossentropy": 2.088572919368744, |
| "loss/hidden": 0.3291015625, |
| "loss/logits": 0.026736538857221603, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.492, |
| "grad_norm": 2.140625, |
| "grad_norm_var": 1413.7688209533692, |
| "learning_rate": 2e-05, |
| "loss": 0.4049, |
| "loss/crossentropy": 2.0796549916267395, |
| "loss/hidden": 0.373046875, |
| "loss/logits": 0.03185986541211605, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.493, |
| "grad_norm": 1.546875, |
| "grad_norm_var": 19.118241119384766, |
| "learning_rate": 2e-05, |
| "loss": 0.3787, |
| "loss/crossentropy": 2.037910223007202, |
| "loss/hidden": 0.3505859375, |
| "loss/logits": 0.028140602633357048, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.494, |
| "grad_norm": 1.5, |
| "grad_norm_var": 0.33713760375976565, |
| "learning_rate": 2e-05, |
| "loss": 0.3671, |
| "loss/crossentropy": 2.094150185585022, |
| "loss/hidden": 0.3408203125, |
| "loss/logits": 0.026267005130648613, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.495, |
| "grad_norm": 4.21875, |
| "grad_norm_var": 0.4984718322753906, |
| "learning_rate": 2e-05, |
| "loss": 0.4406, |
| "loss/crossentropy": 2.0400118231773376, |
| "loss/hidden": 0.41015625, |
| "loss/logits": 0.030417216941714287, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 1.9296875, |
| "grad_norm_var": 0.4844947814941406, |
| "learning_rate": 2e-05, |
| "loss": 0.3762, |
| "loss/crossentropy": 2.0676961541175842, |
| "loss/hidden": 0.3486328125, |
| "loss/logits": 0.027518443763256073, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.497, |
| "grad_norm": 2.0625, |
| "grad_norm_var": 0.4911781311035156, |
| "learning_rate": 2e-05, |
| "loss": 0.3816, |
| "loss/crossentropy": 1.886461853981018, |
| "loss/hidden": 0.353515625, |
| "loss/logits": 0.028073432855308056, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.498, |
| "grad_norm": 1.75, |
| "grad_norm_var": 0.5033302307128906, |
| "learning_rate": 2e-05, |
| "loss": 0.3943, |
| "loss/crossentropy": 2.311934471130371, |
| "loss/hidden": 0.3642578125, |
| "loss/logits": 0.03002871386706829, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.499, |
| "grad_norm": 2.125, |
| "grad_norm_var": 0.49874038696289064, |
| "learning_rate": 2e-05, |
| "loss": 0.4378, |
| "loss/crossentropy": 1.8267198204994202, |
| "loss/hidden": 0.40234375, |
| "loss/logits": 0.03542067110538483, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 2.46875, |
| "grad_norm_var": 0.45621922810872395, |
| "learning_rate": 2e-05, |
| "loss": 0.4457, |
| "loss/crossentropy": 1.4779966473579407, |
| "loss/hidden": 0.4169921875, |
| "loss/logits": 0.028671161271631718, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.2202930782208e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|