qwen8btest / trainer_state.json
semran1's picture
Upload folder using huggingface_hub
ba65290 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.025,
"eval_steps": 1000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 2.5e-05,
"grad_norm": 11.25,
"learning_rate": 1.0000000000000002e-06,
"loss": 1607.1185,
"loss/crossentropy": 0.4998045265674591,
"loss/hidden": 0.2041015625,
"loss/logits": 0.007406285032629967,
"loss/reg": 1606.4072265625,
"loss/twn": 0.0,
"step": 1
},
{
"epoch": 5e-05,
"grad_norm": 25.625,
"learning_rate": 2.0000000000000003e-06,
"loss": 1604.4435,
"loss/crossentropy": 1.6524670124053955,
"loss/hidden": 0.1357421875,
"loss/logits": 0.006067799869924784,
"loss/reg": 1602.649169921875,
"loss/twn": 0.0,
"step": 2
},
{
"epoch": 7.5e-05,
"grad_norm": 9.9375,
"learning_rate": 3e-06,
"loss": 1547.7074,
"loss/crossentropy": 1.9341739416122437,
"loss/hidden": 0.15234375,
"loss/logits": 0.014470485970377922,
"loss/reg": 1545.6063232421875,
"loss/twn": 0.0,
"step": 3
},
{
"epoch": 0.0001,
"grad_norm": 14.375,
"learning_rate": 4.000000000000001e-06,
"loss": 1500.7928,
"loss/crossentropy": 2.407871723175049,
"loss/hidden": 0.1875,
"loss/logits": 0.01105956919491291,
"loss/reg": 1498.1864013671875,
"loss/twn": 0.0,
"step": 4
},
{
"epoch": 0.000125,
"grad_norm": 9.875,
"learning_rate": 5e-06,
"loss": 1421.8827,
"loss/crossentropy": 1.7022260427474976,
"loss/hidden": 0.10546875,
"loss/logits": 0.00693091843277216,
"loss/reg": 1420.0679931640625,
"loss/twn": 0.0,
"step": 5
},
{
"epoch": 0.00015,
"grad_norm": 470.0,
"learning_rate": 6e-06,
"loss": 1315.2301,
"loss/crossentropy": 1.3705801963806152,
"loss/hidden": 0.181640625,
"loss/logits": 0.002700040116906166,
"loss/reg": 1313.6751708984375,
"loss/twn": 0.0,
"step": 6
},
{
"epoch": 0.000175,
"grad_norm": 12.4375,
"learning_rate": 7.000000000000001e-06,
"loss": 1187.7322,
"loss/crossentropy": 1.8566981554031372,
"loss/hidden": 0.083984375,
"loss/logits": 0.004405488260090351,
"loss/reg": 1185.787109375,
"loss/twn": 0.0,
"step": 7
},
{
"epoch": 0.0002,
"grad_norm": 20.75,
"learning_rate": 8.000000000000001e-06,
"loss": 1041.5831,
"loss/crossentropy": 2.76304030418396,
"loss/hidden": 0.14453125,
"loss/logits": 0.009587295353412628,
"loss/reg": 1038.6658935546875,
"loss/twn": 0.0,
"step": 8
},
{
"epoch": 0.000225,
"grad_norm": 280.0,
"learning_rate": 9e-06,
"loss": 889.5813,
"loss/crossentropy": 2.0730843544006348,
"loss/hidden": 0.1767578125,
"loss/logits": 0.009802292101085186,
"loss/reg": 887.3216552734375,
"loss/twn": 0.0,
"step": 9
},
{
"epoch": 0.00025,
"grad_norm": 13.8125,
"learning_rate": 1e-05,
"loss": 739.1588,
"loss/crossentropy": 2.678976535797119,
"loss/hidden": 0.1357421875,
"loss/logits": 0.01222484465688467,
"loss/reg": 736.3318481445312,
"loss/twn": 0.0,
"step": 10
},
{
"epoch": 0.000275,
"grad_norm": 19.5,
"learning_rate": 1.1000000000000001e-05,
"loss": 602.8989,
"loss/crossentropy": 1.9476336240768433,
"loss/hidden": 0.1865234375,
"loss/logits": 0.007580972742289305,
"loss/reg": 600.7571411132812,
"loss/twn": 0.0,
"step": 11
},
{
"epoch": 0.0003,
"grad_norm": 9.875,
"learning_rate": 1.2e-05,
"loss": 486.0685,
"loss/crossentropy": 2.1175615787506104,
"loss/hidden": 0.1767578125,
"loss/logits": 0.008497287519276142,
"loss/reg": 483.76568603515625,
"loss/twn": 0.0,
"step": 12
},
{
"epoch": 0.000325,
"grad_norm": 14.0625,
"learning_rate": 1.3000000000000001e-05,
"loss": 389.9533,
"loss/crossentropy": 1.2835053205490112,
"loss/hidden": 0.10498046875,
"loss/logits": 0.008934162557125092,
"loss/reg": 388.5558776855469,
"loss/twn": 0.0,
"step": 13
},
{
"epoch": 0.00035,
"grad_norm": 19.5,
"learning_rate": 1.4000000000000001e-05,
"loss": 313.737,
"loss/crossentropy": 1.8903541564941406,
"loss/hidden": 0.1103515625,
"loss/logits": 0.008576408959925175,
"loss/reg": 311.7276916503906,
"loss/twn": 0.0,
"step": 14
},
{
"epoch": 0.000375,
"grad_norm": 32.25,
"learning_rate": 1.5e-05,
"loss": 253.3787,
"loss/crossentropy": 1.3272770643234253,
"loss/hidden": 0.2275390625,
"loss/logits": 0.009392762556672096,
"loss/reg": 251.8144989013672,
"loss/twn": 0.0,
"step": 15
},
{
"epoch": 0.0004,
"grad_norm": 10.6875,
"grad_norm_var": 16279.171077473959,
"learning_rate": 1.6000000000000003e-05,
"loss": 207.3934,
"loss/crossentropy": 1.8237905502319336,
"loss/hidden": 0.0986328125,
"loss/logits": 0.0111556276679039,
"loss/reg": 205.45985412597656,
"loss/twn": 0.0,
"step": 16
},
{
"epoch": 0.000425,
"grad_norm": 115.5,
"grad_norm_var": 16268.691780598958,
"learning_rate": 1.7000000000000003e-05,
"loss": 171.5853,
"loss/crossentropy": 1.6593583822250366,
"loss/hidden": 0.140625,
"loss/logits": 0.004231919534504414,
"loss/reg": 169.7810516357422,
"loss/twn": 0.0,
"step": 17
},
{
"epoch": 0.00045,
"grad_norm": 16.375,
"grad_norm_var": 16325.545556640625,
"learning_rate": 1.8e-05,
"loss": 143.6776,
"loss/crossentropy": 0.922300398349762,
"loss/hidden": 0.22265625,
"loss/logits": 0.012654997408390045,
"loss/reg": 142.52001953125,
"loss/twn": 0.0,
"step": 18
},
{
"epoch": 0.000475,
"grad_norm": 228.0,
"grad_norm_var": 17643.971875,
"learning_rate": 1.9e-05,
"loss": 123.6267,
"loss/crossentropy": 1.7576591968536377,
"loss/hidden": 0.12255859375,
"loss/logits": 0.0077894763089716434,
"loss/reg": 121.73872375488281,
"loss/twn": 0.0,
"step": 19
},
{
"epoch": 0.0005,
"grad_norm": 15.3125,
"grad_norm_var": 17635.768994140624,
"learning_rate": 2e-05,
"loss": 107.5298,
"loss/crossentropy": 1.702596664428711,
"loss/hidden": 0.193359375,
"loss/logits": 0.012834219262003899,
"loss/reg": 105.62105560302734,
"loss/twn": 0.0,
"step": 20
},
{
"epoch": 0.000525,
"grad_norm": 21.125,
"grad_norm_var": 17537.747509765624,
"learning_rate": 2.1e-05,
"loss": 96.0316,
"loss/crossentropy": 2.7474312782287598,
"loss/hidden": 0.16796875,
"loss/logits": 0.018948907032608986,
"loss/reg": 93.09722900390625,
"loss/twn": 0.0,
"step": 21
},
{
"epoch": 0.00055,
"grad_norm": 12.8125,
"grad_norm_var": 6900.875520833333,
"learning_rate": 2.2000000000000003e-05,
"loss": 85.865,
"loss/crossentropy": 2.7010557651519775,
"loss/hidden": 0.1005859375,
"loss/logits": 0.004374333191663027,
"loss/reg": 83.05902862548828,
"loss/twn": 0.0,
"step": 22
},
{
"epoch": 0.000575,
"grad_norm": 13.875,
"grad_norm_var": 6893.302067057291,
"learning_rate": 2.3000000000000003e-05,
"loss": 78.0325,
"loss/crossentropy": 2.801321029663086,
"loss/hidden": 0.146484375,
"loss/logits": 0.019743533805012703,
"loss/reg": 75.06490325927734,
"loss/twn": 0.0,
"step": 23
},
{
"epoch": 0.0006,
"grad_norm": 11.625,
"grad_norm_var": 6937.396728515625,
"learning_rate": 2.4e-05,
"loss": 70.1903,
"loss/crossentropy": 1.5617326498031616,
"loss/hidden": 0.134765625,
"loss/logits": 0.006116375792771578,
"loss/reg": 68.48768615722656,
"loss/twn": 0.0,
"step": 24
},
{
"epoch": 0.000625,
"grad_norm": 32.0,
"grad_norm_var": 3246.975895182292,
"learning_rate": 2.5e-05,
"loss": 64.4648,
"loss/crossentropy": 1.4777029752731323,
"loss/hidden": 0.158203125,
"loss/logits": 0.007067938335239887,
"loss/reg": 62.821861267089844,
"loss/twn": 0.0,
"step": 25
},
{
"epoch": 0.00065,
"grad_norm": 11.3125,
"grad_norm_var": 3254.977197265625,
"learning_rate": 2.6000000000000002e-05,
"loss": 60.7985,
"loss/crossentropy": 2.695087432861328,
"loss/hidden": 0.154296875,
"loss/logits": 0.009972814470529556,
"loss/reg": 57.93910217285156,
"loss/twn": 0.0,
"step": 26
},
{
"epoch": 0.000675,
"grad_norm": 9.9375,
"grad_norm_var": 3282.35234375,
"learning_rate": 2.7000000000000002e-05,
"loss": 54.9976,
"loss/crossentropy": 1.1553270816802979,
"loss/hidden": 0.13671875,
"loss/logits": 0.008951587602496147,
"loss/reg": 53.69655990600586,
"loss/twn": 0.0,
"step": 27
},
{
"epoch": 0.0007,
"grad_norm": 20.0,
"grad_norm_var": 3253.6384765625,
"learning_rate": 2.8000000000000003e-05,
"loss": 54.2425,
"loss/crossentropy": 4.120519638061523,
"loss/hidden": 0.1171875,
"loss/logits": 0.012582110241055489,
"loss/reg": 49.99223709106445,
"loss/twn": 0.0,
"step": 28
},
{
"epoch": 0.000725,
"grad_norm": 12.3125,
"grad_norm_var": 3259.070768229167,
"learning_rate": 2.9e-05,
"loss": 49.3116,
"loss/crossentropy": 2.52665638923645,
"loss/hidden": 0.11474609375,
"loss/logits": 0.009403377771377563,
"loss/reg": 46.660804748535156,
"loss/twn": 0.0,
"step": 29
},
{
"epoch": 0.00075,
"grad_norm": 10.375,
"grad_norm_var": 3284.8536458333333,
"learning_rate": 3e-05,
"loss": 44.966,
"loss/crossentropy": 1.1925100088119507,
"loss/hidden": 0.08837890625,
"loss/logits": 0.002639985177665949,
"loss/reg": 43.682464599609375,
"loss/twn": 0.0,
"step": 30
},
{
"epoch": 0.000775,
"grad_norm": 67.5,
"grad_norm_var": 3345.6231770833333,
"learning_rate": 3.1e-05,
"loss": 42.8695,
"loss/crossentropy": 1.6953678131103516,
"loss/hidden": 0.2060546875,
"loss/logits": 0.011261125095188618,
"loss/reg": 40.95684814453125,
"loss/twn": 0.0,
"step": 31
},
{
"epoch": 0.0008,
"grad_norm": 22.5,
"grad_norm_var": 3311.253108723958,
"learning_rate": 3.2000000000000005e-05,
"loss": 41.8117,
"loss/crossentropy": 3.090846538543701,
"loss/hidden": 0.2451171875,
"loss/logits": 0.015867076814174652,
"loss/reg": 38.459877014160156,
"loss/twn": 0.0,
"step": 32
},
{
"epoch": 0.000825,
"grad_norm": 14.8125,
"grad_norm_var": 2914.9796223958333,
"learning_rate": 3.3e-05,
"loss": 38.0881,
"loss/crossentropy": 1.7205989360809326,
"loss/hidden": 0.1650390625,
"loss/logits": 0.013207211159169674,
"loss/reg": 36.18929672241211,
"loss/twn": 0.0,
"step": 33
},
{
"epoch": 0.00085,
"grad_norm": 17.75,
"grad_norm_var": 2912.14296875,
"learning_rate": 3.4000000000000007e-05,
"loss": 35.2028,
"loss/crossentropy": 0.8660376667976379,
"loss/hidden": 0.1865234375,
"loss/logits": 0.012940528802573681,
"loss/reg": 34.13732147216797,
"loss/twn": 0.0,
"step": 34
},
{
"epoch": 0.000875,
"grad_norm": 12.5,
"grad_norm_var": 199.53671875,
"learning_rate": 3.5e-05,
"loss": 34.9671,
"loss/crossentropy": 2.679326057434082,
"loss/hidden": 0.044921875,
"loss/logits": 0.005477376747876406,
"loss/reg": 32.237335205078125,
"loss/twn": 0.0,
"step": 35
},
{
"epoch": 0.0009,
"grad_norm": 13.1875,
"grad_norm_var": 200.8947265625,
"learning_rate": 3.6e-05,
"loss": 32.3725,
"loss/crossentropy": 1.6193571090698242,
"loss/hidden": 0.2314453125,
"loss/logits": 0.016166094690561295,
"loss/reg": 30.505502700805664,
"loss/twn": 0.0,
"step": 36
},
{
"epoch": 0.000925,
"grad_norm": 13.5625,
"grad_norm_var": 202.30284830729167,
"learning_rate": 3.7e-05,
"loss": 31.5358,
"loss/crossentropy": 2.4860172271728516,
"loss/hidden": 0.1572265625,
"loss/logits": 0.010934200137853622,
"loss/reg": 28.881595611572266,
"loss/twn": 0.0,
"step": 37
},
{
"epoch": 0.00095,
"grad_norm": 41.5,
"grad_norm_var": 231.96901041666666,
"learning_rate": 3.8e-05,
"loss": 30.499,
"loss/crossentropy": 2.840606689453125,
"loss/hidden": 0.2138671875,
"loss/logits": 0.017023704946041107,
"loss/reg": 27.427488327026367,
"loss/twn": 0.0,
"step": 38
},
{
"epoch": 0.000975,
"grad_norm": 39.75,
"grad_norm_var": 251.65826822916668,
"learning_rate": 3.9000000000000006e-05,
"loss": 27.702,
"loss/crossentropy": 1.4538397789001465,
"loss/hidden": 0.19921875,
"loss/logits": 0.01595349609851837,
"loss/reg": 26.032983779907227,
"loss/twn": 0.0,
"step": 39
},
{
"epoch": 0.001,
"grad_norm": 9.5,
"grad_norm_var": 254.85572916666666,
"learning_rate": 4e-05,
"loss": 27.0967,
"loss/crossentropy": 2.216383695602417,
"loss/hidden": 0.10595703125,
"loss/logits": 0.006870034150779247,
"loss/reg": 24.76752471923828,
"loss/twn": 0.0,
"step": 40
},
{
"epoch": 0.001025,
"grad_norm": 11.625,
"grad_norm_var": 253.04108072916668,
"learning_rate": 4.1e-05,
"loss": 25.7473,
"loss/crossentropy": 2.1163833141326904,
"loss/hidden": 0.06689453125,
"loss/logits": 0.004073521587997675,
"loss/reg": 23.559967041015625,
"loss/twn": 0.0,
"step": 41
},
{
"epoch": 0.00105,
"grad_norm": 14.0,
"grad_norm_var": 250.197509765625,
"learning_rate": 4.2e-05,
"loss": 23.4299,
"loss/crossentropy": 0.7737110257148743,
"loss/hidden": 0.1748046875,
"loss/logits": 0.009298819117248058,
"loss/reg": 22.47205352783203,
"loss/twn": 0.0,
"step": 42
},
{
"epoch": 0.001075,
"grad_norm": 12.75,
"grad_norm_var": 246.6650390625,
"learning_rate": 4.3e-05,
"loss": 23.3947,
"loss/crossentropy": 1.8225781917572021,
"loss/hidden": 0.1123046875,
"loss/logits": 0.006940089166164398,
"loss/reg": 21.452856063842773,
"loss/twn": 0.0,
"step": 43
},
{
"epoch": 0.0011,
"grad_norm": 82.0,
"grad_norm_var": 479.8754557291667,
"learning_rate": 4.4000000000000006e-05,
"loss": 23.171,
"loss/crossentropy": 2.5216610431671143,
"loss/hidden": 0.1396484375,
"loss/logits": 0.01111547276377678,
"loss/reg": 20.498552322387695,
"loss/twn": 0.0,
"step": 44
},
{
"epoch": 0.001125,
"grad_norm": 10.75,
"grad_norm_var": 482.614306640625,
"learning_rate": 4.5e-05,
"loss": 22.58,
"loss/crossentropy": 2.7781012058258057,
"loss/hidden": 0.158203125,
"loss/logits": 0.008185407146811485,
"loss/reg": 19.63548469543457,
"loss/twn": 0.0,
"step": 45
},
{
"epoch": 0.00115,
"grad_norm": 27.125,
"grad_norm_var": 468.31573893229165,
"learning_rate": 4.600000000000001e-05,
"loss": 21.6325,
"loss/crossentropy": 2.6625006198883057,
"loss/hidden": 0.1494140625,
"loss/logits": 0.009840598329901695,
"loss/reg": 18.810749053955078,
"loss/twn": 0.0,
"step": 46
},
{
"epoch": 0.001175,
"grad_norm": 20.5,
"grad_norm_var": 344.27980143229166,
"learning_rate": 4.7e-05,
"loss": 20.5522,
"loss/crossentropy": 2.3305137157440186,
"loss/hidden": 0.1484375,
"loss/logits": 0.016644544899463654,
"loss/reg": 18.056581497192383,
"loss/twn": 0.0,
"step": 47
},
{
"epoch": 0.0012,
"grad_norm": 11.25,
"grad_norm_var": 352.54737955729166,
"learning_rate": 4.8e-05,
"loss": 18.7184,
"loss/crossentropy": 1.2184098958969116,
"loss/hidden": 0.1455078125,
"loss/logits": 0.0094651710242033,
"loss/reg": 17.344999313354492,
"loss/twn": 0.0,
"step": 48
},
{
"epoch": 0.001225,
"grad_norm": 9.5,
"grad_norm_var": 359.42734375,
"learning_rate": 4.9e-05,
"loss": 19.2773,
"loss/crossentropy": 2.487840414047241,
"loss/hidden": 0.0986328125,
"loss/logits": 0.004987399093806744,
"loss/reg": 16.685823440551758,
"loss/twn": 0.0,
"step": 49
},
{
"epoch": 0.00125,
"grad_norm": 15.1875,
"grad_norm_var": 361.1883951822917,
"learning_rate": 5e-05,
"loss": 18.9783,
"loss/crossentropy": 2.735170602798462,
"loss/hidden": 0.166015625,
"loss/logits": 0.011278904974460602,
"loss/reg": 16.065805435180664,
"loss/twn": 0.0,
"step": 50
},
{
"epoch": 0.001275,
"grad_norm": 15.75,
"grad_norm_var": 357.929931640625,
"learning_rate": 5.1000000000000006e-05,
"loss": 17.1984,
"loss/crossentropy": 1.4663747549057007,
"loss/hidden": 0.2353515625,
"loss/logits": 0.003095359541475773,
"loss/reg": 15.49356746673584,
"loss/twn": 0.0,
"step": 51
},
{
"epoch": 0.0013,
"grad_norm": 9.9375,
"grad_norm_var": 362.29881184895834,
"learning_rate": 5.2000000000000004e-05,
"loss": 16.953,
"loss/crossentropy": 1.918389916419983,
"loss/hidden": 0.0791015625,
"loss/logits": 0.0031922967173159122,
"loss/reg": 14.95230484008789,
"loss/twn": 0.0,
"step": 52
},
{
"epoch": 0.001325,
"grad_norm": 10.6875,
"grad_norm_var": 365.8745930989583,
"learning_rate": 5.300000000000001e-05,
"loss": 16.8909,
"loss/crossentropy": 2.3705666065216064,
"loss/hidden": 0.07275390625,
"loss/logits": 0.0030757079366594553,
"loss/reg": 14.444525718688965,
"loss/twn": 0.0,
"step": 53
},
{
"epoch": 0.00135,
"grad_norm": 16.75,
"grad_norm_var": 337.7085774739583,
"learning_rate": 5.4000000000000005e-05,
"loss": 16.1047,
"loss/crossentropy": 1.9884377717971802,
"loss/hidden": 0.12890625,
"loss/logits": 0.010179271921515465,
"loss/reg": 13.977179527282715,
"loss/twn": 0.0,
"step": 54
},
{
"epoch": 0.001375,
"grad_norm": 13.375,
"grad_norm_var": 311.08631184895836,
"learning_rate": 5.500000000000001e-05,
"loss": 16.2292,
"loss/crossentropy": 2.642868995666504,
"loss/hidden": 0.04736328125,
"loss/logits": 0.004405863583087921,
"loss/reg": 13.5346097946167,
"loss/twn": 0.0,
"step": 55
},
{
"epoch": 0.0014,
"grad_norm": 20.875,
"grad_norm_var": 306.026806640625,
"learning_rate": 5.6000000000000006e-05,
"loss": 14.8051,
"loss/crossentropy": 1.465383529663086,
"loss/hidden": 0.2080078125,
"loss/logits": 0.00864885188639164,
"loss/reg": 13.123102188110352,
"loss/twn": 0.0,
"step": 56
},
{
"epoch": 0.001425,
"grad_norm": 15.875,
"grad_norm_var": 303.045166015625,
"learning_rate": 5.6999999999999996e-05,
"loss": 14.4812,
"loss/crossentropy": 1.640454649925232,
"loss/hidden": 0.09765625,
"loss/logits": 0.007794347126036882,
"loss/reg": 12.735280990600586,
"loss/twn": 0.0,
"step": 57
},
{
"epoch": 0.00145,
"grad_norm": 16.5,
"grad_norm_var": 301.720947265625,
"learning_rate": 5.8e-05,
"loss": 14.5378,
"loss/crossentropy": 2.0751516819000244,
"loss/hidden": 0.07861328125,
"loss/logits": 0.006516133435070515,
"loss/reg": 12.377544403076172,
"loss/twn": 0.0,
"step": 58
},
{
"epoch": 0.001475,
"grad_norm": 20.875,
"grad_norm_var": 298.750244140625,
"learning_rate": 5.9e-05,
"loss": 14.1279,
"loss/crossentropy": 1.9119625091552734,
"loss/hidden": 0.1552734375,
"loss/logits": 0.021668870002031326,
"loss/reg": 12.038968086242676,
"loss/twn": 0.0,
"step": 59
},
{
"epoch": 0.0015,
"grad_norm": 13.5625,
"grad_norm_var": 23.984375,
"learning_rate": 6e-05,
"loss": 12.6629,
"loss/crossentropy": 0.7274801731109619,
"loss/hidden": 0.203125,
"loss/logits": 0.009123459458351135,
"loss/reg": 11.72317123413086,
"loss/twn": 0.0,
"step": 60
},
{
"epoch": 0.001525,
"grad_norm": 11.875,
"grad_norm_var": 23.3462890625,
"learning_rate": 6.1e-05,
"loss": 14.3076,
"loss/crossentropy": 2.738680601119995,
"loss/hidden": 0.1396484375,
"loss/logits": 0.010733511298894882,
"loss/reg": 11.418492317199707,
"loss/twn": 0.0,
"step": 61
},
{
"epoch": 0.00155,
"grad_norm": 41.25,
"grad_norm_var": 57.518489583333334,
"learning_rate": 6.2e-05,
"loss": 13.614,
"loss/crossentropy": 2.1940059661865234,
"loss/hidden": 0.26171875,
"loss/logits": 0.016079768538475037,
"loss/reg": 11.142221450805664,
"loss/twn": 0.0,
"step": 62
},
{
"epoch": 0.001575,
"grad_norm": 16.25,
"grad_norm_var": 56.371875,
"learning_rate": 6.3e-05,
"loss": 12.6424,
"loss/crossentropy": 1.5363647937774658,
"loss/hidden": 0.220703125,
"loss/logits": 0.009181533940136433,
"loss/reg": 10.876102447509766,
"loss/twn": 0.0,
"step": 63
},
{
"epoch": 0.0016,
"grad_norm": 8.25,
"grad_norm_var": 58.921875,
"learning_rate": 6.400000000000001e-05,
"loss": 12.6573,
"loss/crossentropy": 1.9360976219177246,
"loss/hidden": 0.08642578125,
"loss/logits": 0.00900467112660408,
"loss/reg": 10.625749588012695,
"loss/twn": 0.0,
"step": 64
},
{
"epoch": 0.001625,
"grad_norm": 25.0,
"grad_norm_var": 60.43958333333333,
"learning_rate": 6.500000000000001e-05,
"loss": 12.8598,
"loss/crossentropy": 2.2861106395721436,
"loss/hidden": 0.169921875,
"loss/logits": 0.008098036982119083,
"loss/reg": 10.395671844482422,
"loss/twn": 0.0,
"step": 65
},
{
"epoch": 0.00165,
"grad_norm": 10.5,
"grad_norm_var": 62.94568684895833,
"learning_rate": 6.6e-05,
"loss": 10.8882,
"loss/crossentropy": 0.5159875154495239,
"loss/hidden": 0.1884765625,
"loss/logits": 0.007731384597718716,
"loss/reg": 10.176012992858887,
"loss/twn": 0.0,
"step": 66
},
{
"epoch": 0.001675,
"grad_norm": 15.4375,
"grad_norm_var": 62.99166666666667,
"learning_rate": 6.7e-05,
"loss": 11.7439,
"loss/crossentropy": 1.6010075807571411,
"loss/hidden": 0.16796875,
"loss/logits": 0.0077722882851958275,
"loss/reg": 9.96713924407959,
"loss/twn": 0.0,
"step": 67
},
{
"epoch": 0.0017,
"grad_norm": 8.125,
"grad_norm_var": 64.82823893229167,
"learning_rate": 6.800000000000001e-05,
"loss": 11.7136,
"loss/crossentropy": 1.8642301559448242,
"loss/hidden": 0.07177734375,
"loss/logits": 0.003159617306664586,
"loss/reg": 9.774468421936035,
"loss/twn": 0.0,
"step": 68
},
{
"epoch": 0.001725,
"grad_norm": 12.75,
"grad_norm_var": 63.475260416666664,
"learning_rate": 6.9e-05,
"loss": 11.2146,
"loss/crossentropy": 1.5259939432144165,
"loss/hidden": 0.0927734375,
"loss/logits": 0.0040178182534873486,
"loss/reg": 9.591811180114746,
"loss/twn": 0.0,
"step": 69
},
{
"epoch": 0.00175,
"grad_norm": 13.25,
"grad_norm_var": 64.21901041666666,
"learning_rate": 7e-05,
"loss": 10.007,
"loss/crossentropy": 0.4211646616458893,
"loss/hidden": 0.1630859375,
"loss/logits": 0.006910983473062515,
"loss/reg": 9.415842056274414,
"loss/twn": 0.0,
"step": 70
},
{
"epoch": 0.001775,
"grad_norm": 15.1875,
"grad_norm_var": 63.672900390625,
"learning_rate": 7.1e-05,
"loss": 11.3191,
"loss/crossentropy": 1.91142737865448,
"loss/hidden": 0.1416015625,
"loss/logits": 0.012339383363723755,
"loss/reg": 9.253693580627441,
"loss/twn": 0.0,
"step": 71
},
{
"epoch": 0.0018,
"grad_norm": 25.125,
"grad_norm_var": 67.225634765625,
"learning_rate": 7.2e-05,
"loss": 9.6954,
"loss/crossentropy": 0.3831652104854584,
"loss/hidden": 0.2060546875,
"loss/logits": 0.007283635437488556,
"loss/reg": 9.098925590515137,
"loss/twn": 0.0,
"step": 72
},
{
"epoch": 0.001825,
"grad_norm": 12.375,
"grad_norm_var": 68.45245768229167,
"learning_rate": 7.3e-05,
"loss": 11.8289,
"loss/crossentropy": 2.7114861011505127,
"loss/hidden": 0.1572265625,
"loss/logits": 0.00806540995836258,
"loss/reg": 8.952132225036621,
"loss/twn": 0.0,
"step": 73
},
{
"epoch": 0.00185,
"grad_norm": 18.0,
"grad_norm_var": 68.56417643229166,
"learning_rate": 7.4e-05,
"loss": 9.9016,
"loss/crossentropy": 0.9610092043876648,
"loss/hidden": 0.12255859375,
"loss/logits": 0.005808320362120867,
"loss/reg": 8.812213897705078,
"loss/twn": 0.0,
"step": 74
},
{
"epoch": 0.001875,
"grad_norm": 11.75,
"grad_norm_var": 68.73527018229167,
"learning_rate": 7.500000000000001e-05,
"loss": 11.33,
"loss/crossentropy": 2.4811081886291504,
"loss/hidden": 0.154296875,
"loss/logits": 0.012214528396725655,
"loss/reg": 8.682340621948242,
"loss/twn": 0.0,
"step": 75
},
{
"epoch": 0.0019,
"grad_norm": 16.75,
"grad_norm_var": 68.26295572916666,
"learning_rate": 7.6e-05,
"loss": 10.7107,
"loss/crossentropy": 1.9060146808624268,
"loss/hidden": 0.2294921875,
"loss/logits": 0.016750231385231018,
"loss/reg": 8.558440208435059,
"loss/twn": 0.0,
"step": 76
},
{
"epoch": 0.001925,
"grad_norm": 11.5,
"grad_norm_var": 68.49635416666666,
"learning_rate": 7.7e-05,
"loss": 10.9686,
"loss/crossentropy": 2.370375394821167,
"loss/hidden": 0.1435546875,
"loss/logits": 0.014391288161277771,
"loss/reg": 8.440238952636719,
"loss/twn": 0.0,
"step": 77
},
{
"epoch": 0.00195,
"grad_norm": 30.0,
"grad_norm_var": 39.04713541666667,
"learning_rate": 7.800000000000001e-05,
"loss": 11.235,
"loss/crossentropy": 2.7426469326019287,
"loss/hidden": 0.1552734375,
"loss/logits": 0.012605298310518265,
"loss/reg": 8.32447624206543,
"loss/twn": 0.0,
"step": 78
},
{
"epoch": 0.001975,
"grad_norm": 12.8125,
"grad_norm_var": 39.50636393229167,
"learning_rate": 7.900000000000001e-05,
"loss": 10.4604,
"loss/crossentropy": 2.1269540786743164,
"loss/hidden": 0.10986328125,
"loss/logits": 0.0066815330646932125,
"loss/reg": 8.216917991638184,
"loss/twn": 0.0,
"step": 79
},
{
"epoch": 0.002,
"grad_norm": 13.625,
"grad_norm_var": 36.169384765625,
"learning_rate": 8e-05,
"loss": 10.9456,
"loss/crossentropy": 2.6664817333221436,
"loss/hidden": 0.15234375,
"loss/logits": 0.011289350688457489,
"loss/reg": 8.115513801574707,
"loss/twn": 0.0,
"step": 80
},
{
"epoch": 0.002025,
"grad_norm": 32.25,
"grad_norm_var": 48.38487955729167,
"learning_rate": 8.1e-05,
"loss": 10.8641,
"loss/crossentropy": 2.6699304580688477,
"loss/hidden": 0.1640625,
"loss/logits": 0.010942000895738602,
"loss/reg": 8.019161224365234,
"loss/twn": 0.0,
"step": 81
},
{
"epoch": 0.00205,
"grad_norm": 21.875,
"grad_norm_var": 47.804280598958336,
"learning_rate": 8.2e-05,
"loss": 10.4686,
"loss/crossentropy": 2.2571003437042236,
"loss/hidden": 0.267578125,
"loss/logits": 0.021330825984477997,
"loss/reg": 7.922557353973389,
"loss/twn": 0.0,
"step": 82
},
{
"epoch": 0.002075,
"grad_norm": 11.1875,
"grad_norm_var": 49.776546223958334,
"learning_rate": 8.3e-05,
"loss": 10.8712,
"loss/crossentropy": 2.8793890476226807,
"loss/hidden": 0.1435546875,
"loss/logits": 0.0122376699000597,
"loss/reg": 7.835977077484131,
"loss/twn": 0.0,
"step": 83
},
{
"epoch": 0.0021,
"grad_norm": 12.125,
"grad_norm_var": 46.224462890625,
"learning_rate": 8.4e-05,
"loss": 10.6407,
"loss/crossentropy": 2.8739991188049316,
"loss/hidden": 0.0140380859375,
"loss/logits": 0.0031395466066896915,
"loss/reg": 7.749497413635254,
"loss/twn": 0.0,
"step": 84
},
{
"epoch": 0.002125,
"grad_norm": 224.0,
"grad_norm_var": 2718.206884765625,
"learning_rate": 8.5e-05,
"loss": 9.9904,
"loss/crossentropy": 2.1338188648223877,
"loss/hidden": 0.1748046875,
"loss/logits": 0.013754406943917274,
"loss/reg": 7.668061256408691,
"loss/twn": 0.0,
"step": 85
},
{
"epoch": 0.00215,
"grad_norm": 9.0625,
"grad_norm_var": 2728.7181640625,
"learning_rate": 8.6e-05,
"loss": 10.4151,
"loss/crossentropy": 2.6691489219665527,
"loss/hidden": 0.1455078125,
"loss/logits": 0.011645539663732052,
"loss/reg": 7.588749885559082,
"loss/twn": 0.0,
"step": 86
},
{
"epoch": 0.002175,
"grad_norm": 23.375,
"grad_norm_var": 2716.899593098958,
"learning_rate": 8.7e-05,
"loss": 9.2735,
"loss/crossentropy": 1.5923405885696411,
"loss/hidden": 0.1591796875,
"loss/logits": 0.006691344082355499,
"loss/reg": 7.515244007110596,
"loss/twn": 0.0,
"step": 87
},
{
"epoch": 0.0022,
"grad_norm": 11.3125,
"grad_norm_var": 2738.4708333333333,
"learning_rate": 8.800000000000001e-05,
"loss": 10.0973,
"loss/crossentropy": 2.563422679901123,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006290389224886894,
"loss/reg": 7.441190242767334,
"loss/twn": 0.0,
"step": 88
},
{
"epoch": 0.002225,
"grad_norm": 16.75,
"grad_norm_var": 2729.6775390625,
"learning_rate": 8.900000000000001e-05,
"loss": 10.1917,
"loss/crossentropy": 2.605319023132324,
"loss/hidden": 0.1845703125,
"loss/logits": 0.029636088758707047,
"loss/reg": 7.372167587280273,
"loss/twn": 0.0,
"step": 89
},
{
"epoch": 0.00225,
"grad_norm": 17.0,
"grad_norm_var": 2731.3098307291666,
"learning_rate": 9e-05,
"loss": 9.2722,
"loss/crossentropy": 1.7703652381896973,
"loss/hidden": 0.1845703125,
"loss/logits": 0.011869278736412525,
"loss/reg": 7.30535364151001,
"loss/twn": 0.0,
"step": 90
},
{
"epoch": 0.002275,
"grad_norm": 26.625,
"grad_norm_var": 2709.51640625,
"learning_rate": 9.1e-05,
"loss": 10.0987,
"loss/crossentropy": 2.770080327987671,
"loss/hidden": 0.08154296875,
"loss/logits": 0.006538551300764084,
"loss/reg": 7.240530967712402,
"loss/twn": 0.0,
"step": 91
},
{
"epoch": 0.0023,
"grad_norm": 12.3125,
"grad_norm_var": 2718.9657389322915,
"learning_rate": 9.200000000000001e-05,
"loss": 9.1897,
"loss/crossentropy": 1.7330008745193481,
"loss/hidden": 0.265625,
"loss/logits": 0.012156343087553978,
"loss/reg": 7.1789398193359375,
"loss/twn": 0.0,
"step": 92
},
{
"epoch": 0.002325,
"grad_norm": 113.5,
"grad_norm_var": 3112.675113932292,
"learning_rate": 9.300000000000001e-05,
"loss": 8.4886,
"loss/crossentropy": 1.163967490196228,
"loss/hidden": 0.197265625,
"loss/logits": 0.009277150966227055,
"loss/reg": 7.11806058883667,
"loss/twn": 0.0,
"step": 93
},
{
"epoch": 0.00235,
"grad_norm": 37.25,
"grad_norm_var": 3109.446598307292,
"learning_rate": 9.4e-05,
"loss": 9.3185,
"loss/crossentropy": 2.135645627975464,
"loss/hidden": 0.11767578125,
"loss/logits": 0.0029190080240368843,
"loss/reg": 7.06224250793457,
"loss/twn": 0.0,
"step": 94
},
{
"epoch": 0.002375,
"grad_norm": 8.3125,
"grad_norm_var": 3125.339567057292,
"learning_rate": 9.5e-05,
"loss": 9.3855,
"loss/crossentropy": 2.309610605239868,
"loss/hidden": 0.06689453125,
"loss/logits": 0.0035296978894621134,
"loss/reg": 7.005456447601318,
"loss/twn": 0.0,
"step": 95
},
{
"epoch": 0.0024,
"grad_norm": 91.5,
"grad_norm_var": 3262.5942545572916,
"learning_rate": 9.6e-05,
"loss": 9.9531,
"loss/crossentropy": 2.7651376724243164,
"loss/hidden": 0.2197265625,
"loss/logits": 0.01677127555012703,
"loss/reg": 6.951422214508057,
"loss/twn": 0.0,
"step": 96
},
{
"epoch": 0.002425,
"grad_norm": 35.25,
"grad_norm_var": 3259.3458170572917,
"learning_rate": 9.7e-05,
"loss": 8.7807,
"loss/crossentropy": 1.7207653522491455,
"loss/hidden": 0.1494140625,
"loss/logits": 0.011507261544466019,
"loss/reg": 6.8990349769592285,
"loss/twn": 0.0,
"step": 97
},
{
"epoch": 0.00245,
"grad_norm": 13.75,
"grad_norm_var": 3285.235791015625,
"learning_rate": 9.8e-05,
"loss": 8.475,
"loss/crossentropy": 1.481154441833496,
"loss/hidden": 0.140625,
"loss/logits": 0.005128794349730015,
"loss/reg": 6.848050594329834,
"loss/twn": 0.0,
"step": 98
},
{
"epoch": 0.002475,
"grad_norm": 10.1875,
"grad_norm_var": 3289.334228515625,
"learning_rate": 9.900000000000001e-05,
"loss": 9.021,
"loss/crossentropy": 2.196463108062744,
"loss/hidden": 0.0234375,
"loss/logits": 0.0013116542249917984,
"loss/reg": 6.79979133605957,
"loss/twn": 0.0,
"step": 99
},
{
"epoch": 0.0025,
"grad_norm": 13.6875,
"grad_norm_var": 3283.3889973958335,
"learning_rate": 0.0001,
"loss": 9.1231,
"loss/crossentropy": 2.0860254764556885,
"loss/hidden": 0.265625,
"loss/logits": 0.0192781500518322,
"loss/reg": 6.75217342376709,
"loss/twn": 0.0,
"step": 100
},
{
"epoch": 0.002525,
"grad_norm": 65.5,
"grad_norm_var": 996.5311848958333,
"learning_rate": 0.0001,
"loss": 8.6978,
"loss/crossentropy": 1.8436778783798218,
"loss/hidden": 0.140625,
"loss/logits": 0.006662796251475811,
"loss/reg": 6.706822395324707,
"loss/twn": 0.0,
"step": 101
},
{
"epoch": 0.00255,
"grad_norm": 9.0625,
"grad_norm_var": 996.5311848958333,
"learning_rate": 0.0001,
"loss": 9.2667,
"loss/crossentropy": 2.4968836307525635,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007922045886516571,
"loss/reg": 6.663230895996094,
"loss/twn": 0.0,
"step": 102
},
{
"epoch": 0.002575,
"grad_norm": 11.1875,
"grad_norm_var": 1019.1574055989583,
"learning_rate": 0.0001,
"loss": 8.2507,
"loss/crossentropy": 1.475099802017212,
"loss/hidden": 0.1484375,
"loss/logits": 0.007549532223492861,
"loss/reg": 6.619617938995361,
"loss/twn": 0.0,
"step": 103
},
{
"epoch": 0.0026,
"grad_norm": 13.5625,
"grad_norm_var": 1013.6202962239583,
"learning_rate": 0.0001,
"loss": 9.2719,
"loss/crossentropy": 2.5519533157348633,
"loss/hidden": 0.1328125,
"loss/logits": 0.00820184126496315,
"loss/reg": 6.578925132751465,
"loss/twn": 0.0,
"step": 104
},
{
"epoch": 0.002625,
"grad_norm": 288.0,
"grad_norm_var": 5098.051936848959,
"learning_rate": 0.0001,
"loss": 7.9889,
"loss/crossentropy": 1.3079354763031006,
"loss/hidden": 0.1328125,
"loss/logits": 0.01036953553557396,
"loss/reg": 6.537764072418213,
"loss/twn": 0.0,
"step": 105
},
{
"epoch": 0.00265,
"grad_norm": 17.5,
"grad_norm_var": 5096.006363932292,
"learning_rate": 0.0001,
"loss": 9.3801,
"loss/crossentropy": 2.7196500301361084,
"loss/hidden": 0.1474609375,
"loss/logits": 0.013073693960905075,
"loss/reg": 6.49993371963501,
"loss/twn": 0.0,
"step": 106
},
{
"epoch": 0.002675,
"grad_norm": 772.0,
"grad_norm_var": 37700.72758789062,
"learning_rate": 0.0001,
"loss": 7.3614,
"loss/crossentropy": 0.6930418014526367,
"loss/hidden": 0.1982421875,
"loss/logits": 0.0074032871052622795,
"loss/reg": 6.462671279907227,
"loss/twn": 0.0,
"step": 107
},
{
"epoch": 0.0027,
"grad_norm": 10.125,
"grad_norm_var": 37725.00826822917,
"learning_rate": 0.0001,
"loss": 8.4394,
"loss/crossentropy": 1.9201096296310425,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006301195826381445,
"loss/reg": 6.426520347595215,
"loss/twn": 0.0,
"step": 108
},
{
"epoch": 0.002725,
"grad_norm": 11.1875,
"grad_norm_var": 38118.67159830729,
"learning_rate": 0.0001,
"loss": 9.0847,
"loss/crossentropy": 2.634326219558716,
"loss/hidden": 0.056640625,
"loss/logits": 0.003579255659133196,
"loss/reg": 6.3901753425598145,
"loss/twn": 0.0,
"step": 109
},
{
"epoch": 0.00275,
"grad_norm": 13.0,
"grad_norm_var": 38319.52980143229,
"learning_rate": 0.0001,
"loss": 9.0789,
"loss/crossentropy": 2.5669283866882324,
"loss/hidden": 0.1455078125,
"loss/logits": 0.010270677506923676,
"loss/reg": 6.356166839599609,
"loss/twn": 0.0,
"step": 110
},
{
"epoch": 0.002775,
"grad_norm": 14.4375,
"grad_norm_var": 38258.03097330729,
"learning_rate": 0.0001,
"loss": 8.9621,
"loss/crossentropy": 2.5042357444763184,
"loss/hidden": 0.125,
"loss/logits": 0.00943165272474289,
"loss/reg": 6.323448657989502,
"loss/twn": 0.0,
"step": 111
},
{
"epoch": 0.0028,
"grad_norm": 10.75,
"grad_norm_var": 38615.72823893229,
"learning_rate": 0.0001,
"loss": 8.0629,
"loss/crossentropy": 1.681036353111267,
"loss/hidden": 0.08642578125,
"loss/logits": 0.003971286583691835,
"loss/reg": 6.291506767272949,
"loss/twn": 0.0,
"step": 112
},
{
"epoch": 0.002825,
"grad_norm": 23.875,
"grad_norm_var": 38694.45271809896,
"learning_rate": 0.0001,
"loss": 7.101,
"loss/crossentropy": 0.6117576956748962,
"loss/hidden": 0.2158203125,
"loss/logits": 0.012755107134580612,
"loss/reg": 6.260617256164551,
"loss/twn": 0.0,
"step": 113
},
{
"epoch": 0.00285,
"grad_norm": 12.1875,
"grad_norm_var": 38708.639322916664,
"learning_rate": 0.0001,
"loss": 8.0025,
"loss/crossentropy": 1.5227508544921875,
"loss/hidden": 0.234375,
"loss/logits": 0.015468025580048561,
"loss/reg": 6.229867935180664,
"loss/twn": 0.0,
"step": 114
},
{
"epoch": 0.002875,
"grad_norm": 16.375,
"grad_norm_var": 38652.598942057295,
"learning_rate": 0.0001,
"loss": 9.088,
"loss/crossentropy": 2.7616689205169678,
"loss/hidden": 0.115234375,
"loss/logits": 0.009811250492930412,
"loss/reg": 6.201269626617432,
"loss/twn": 0.0,
"step": 115
},
{
"epoch": 0.0029,
"grad_norm": 12.375,
"grad_norm_var": 38664.55670572917,
"learning_rate": 0.0001,
"loss": 8.9623,
"loss/crossentropy": 2.647496461868286,
"loss/hidden": 0.1328125,
"loss/logits": 0.009309421293437481,
"loss/reg": 6.172722339630127,
"loss/twn": 0.0,
"step": 116
},
{
"epoch": 0.002925,
"grad_norm": 20.5,
"grad_norm_var": 38886.04108072917,
"learning_rate": 0.0001,
"loss": 9.1565,
"loss/crossentropy": 2.8847148418426514,
"loss/hidden": 0.11767578125,
"loss/logits": 0.008627700619399548,
"loss/reg": 6.145481586456299,
"loss/twn": 0.0,
"step": 117
},
{
"epoch": 0.00295,
"grad_norm": 16.375,
"grad_norm_var": 38821.67394205729,
"learning_rate": 0.0001,
"loss": 9.0896,
"loss/crossentropy": 2.7421655654907227,
"loss/hidden": 0.212890625,
"loss/logits": 0.016587935388088226,
"loss/reg": 6.117995262145996,
"loss/twn": 0.0,
"step": 118
},
{
"epoch": 0.002975,
"grad_norm": 8.5625,
"grad_norm_var": 38845.82667643229,
"learning_rate": 0.0001,
"loss": 8.091,
"loss/crossentropy": 1.8508156538009644,
"loss/hidden": 0.138671875,
"loss/logits": 0.008302265778183937,
"loss/reg": 6.093196868896484,
"loss/twn": 0.0,
"step": 119
},
{
"epoch": 0.003,
"grad_norm": 11.625,
"grad_norm_var": 38862.91451822917,
"learning_rate": 0.0001,
"loss": 8.6832,
"loss/crossentropy": 2.444472312927246,
"loss/hidden": 0.1552734375,
"loss/logits": 0.016056066378951073,
"loss/reg": 6.067349433898926,
"loss/twn": 0.0,
"step": 120
},
{
"epoch": 0.003025,
"grad_norm": 15.0625,
"grad_norm_var": 35901.329410807295,
"learning_rate": 0.0001,
"loss": 7.6991,
"loss/crossentropy": 1.443003535270691,
"loss/hidden": 0.203125,
"loss/logits": 0.009365499019622803,
"loss/reg": 6.043575763702393,
"loss/twn": 0.0,
"step": 121
},
{
"epoch": 0.00305,
"grad_norm": 10.125,
"grad_norm_var": 35948.114567057295,
"learning_rate": 0.0001,
"loss": 8.3654,
"loss/crossentropy": 2.175076961517334,
"loss/hidden": 0.1572265625,
"loss/logits": 0.012922637164592743,
"loss/reg": 6.0201544761657715,
"loss/twn": 0.0,
"step": 122
},
{
"epoch": 0.003075,
"grad_norm": 12.9375,
"grad_norm_var": 16.191145833333334,
"learning_rate": 0.0001,
"loss": 8.7509,
"loss/crossentropy": 2.659536123275757,
"loss/hidden": 0.08642578125,
"loss/logits": 0.007753277197480202,
"loss/reg": 5.997157096862793,
"loss/twn": 0.0,
"step": 123
},
{
"epoch": 0.0031,
"grad_norm": 11.9375,
"grad_norm_var": 15.527978515625,
"learning_rate": 0.0001,
"loss": 8.4483,
"loss/crossentropy": 2.3908164501190186,
"loss/hidden": 0.07666015625,
"loss/logits": 0.005580560304224491,
"loss/reg": 5.975290298461914,
"loss/twn": 0.0,
"step": 124
},
{
"epoch": 0.003125,
"grad_norm": 12.8125,
"grad_norm_var": 15.120035807291666,
"learning_rate": 0.0001,
"loss": 8.1928,
"loss/crossentropy": 2.0353291034698486,
"loss/hidden": 0.1923828125,
"loss/logits": 0.011610760353505611,
"loss/reg": 5.953509330749512,
"loss/twn": 0.0,
"step": 125
},
{
"epoch": 0.00315,
"grad_norm": 11.4375,
"grad_norm_var": 15.467122395833334,
"learning_rate": 0.0001,
"loss": 6.9926,
"loss/crossentropy": 0.923692524433136,
"loss/hidden": 0.12890625,
"loss/logits": 0.006808650679886341,
"loss/reg": 5.933147430419922,
"loss/twn": 0.0,
"step": 126
},
{
"epoch": 0.003175,
"grad_norm": 18.875,
"grad_norm_var": 17.053759765625,
"learning_rate": 0.0001,
"loss": 8.6838,
"loss/crossentropy": 2.7514772415161133,
"loss/hidden": 0.016357421875,
"loss/logits": 0.00333950063213706,
"loss/reg": 5.91263484954834,
"loss/twn": 0.0,
"step": 127
},
{
"epoch": 0.0032,
"grad_norm": 13.375,
"grad_norm_var": 16.307275390625,
"learning_rate": 0.0001,
"loss": 7.5016,
"loss/crossentropy": 1.4413155317306519,
"loss/hidden": 0.1552734375,
"loss/logits": 0.011735007166862488,
"loss/reg": 5.893232822418213,
"loss/twn": 0.0,
"step": 128
},
{
"epoch": 0.003225,
"grad_norm": 52.25,
"grad_norm_var": 102.939697265625,
"learning_rate": 0.0001,
"loss": 7.4808,
"loss/crossentropy": 1.5077205896377563,
"loss/hidden": 0.0947265625,
"loss/logits": 0.004158593248575926,
"loss/reg": 5.874199867248535,
"loss/twn": 0.0,
"step": 129
},
{
"epoch": 0.00325,
"grad_norm": 12.75,
"grad_norm_var": 102.6697265625,
"learning_rate": 0.0001,
"loss": 8.7541,
"loss/crossentropy": 2.7712345123291016,
"loss/hidden": 0.1201171875,
"loss/logits": 0.006270177662372589,
"loss/reg": 5.856495380401611,
"loss/twn": 0.0,
"step": 130
},
{
"epoch": 0.003275,
"grad_norm": 9.5625,
"grad_norm_var": 105.30779622395833,
"learning_rate": 0.0001,
"loss": 7.2955,
"loss/crossentropy": 1.3631829023361206,
"loss/hidden": 0.08837890625,
"loss/logits": 0.005783870816230774,
"loss/reg": 5.838170528411865,
"loss/twn": 0.0,
"step": 131
},
{
"epoch": 0.0033,
"grad_norm": 22.625,
"grad_norm_var": 107.38448893229166,
"learning_rate": 0.0001,
"loss": 8.8285,
"loss/crossentropy": 2.8225176334381104,
"loss/hidden": 0.1767578125,
"loss/logits": 0.007785791996866465,
"loss/reg": 5.82139778137207,
"loss/twn": 0.0,
"step": 132
},
{
"epoch": 0.003325,
"grad_norm": 23.5,
"grad_norm_var": 109.62667643229166,
"learning_rate": 0.0001,
"loss": 7.4036,
"loss/crossentropy": 1.5806615352630615,
"loss/hidden": 0.016357421875,
"loss/logits": 0.001918629975989461,
"loss/reg": 5.804649829864502,
"loss/twn": 0.0,
"step": 133
},
{
"epoch": 0.00335,
"grad_norm": 11.1875,
"grad_norm_var": 111.3869140625,
"learning_rate": 0.0001,
"loss": 8.0831,
"loss/crossentropy": 2.268815755844116,
"loss/hidden": 0.0233154296875,
"loss/logits": 0.0031364229507744312,
"loss/reg": 5.787786483764648,
"loss/twn": 0.0,
"step": 134
},
{
"epoch": 0.003375,
"grad_norm": 17.875,
"grad_norm_var": 107.36847330729167,
"learning_rate": 0.0001,
"loss": 8.4399,
"loss/crossentropy": 2.5208940505981445,
"loss/hidden": 0.1376953125,
"loss/logits": 0.009613238275051117,
"loss/reg": 5.771730422973633,
"loss/twn": 0.0,
"step": 135
},
{
"epoch": 0.0034,
"grad_norm": 12.1875,
"grad_norm_var": 107.00416666666666,
"learning_rate": 0.0001,
"loss": 7.3628,
"loss/crossentropy": 1.5146337747573853,
"loss/hidden": 0.08642578125,
"loss/logits": 0.004632354713976383,
"loss/reg": 5.757077693939209,
"loss/twn": 0.0,
"step": 136
},
{
"epoch": 0.003425,
"grad_norm": 74.5,
"grad_norm_var": 314.18409830729166,
"learning_rate": 0.0001,
"loss": 8.7141,
"loss/crossentropy": 2.663015127182007,
"loss/hidden": 0.298828125,
"loss/logits": 0.010531080886721611,
"loss/reg": 5.741701126098633,
"loss/twn": 0.0,
"step": 137
},
{
"epoch": 0.00345,
"grad_norm": 11.5625,
"grad_norm_var": 312.32545572916666,
"learning_rate": 0.0001,
"loss": 8.2802,
"loss/crossentropy": 2.3824350833892822,
"loss/hidden": 0.1591796875,
"loss/logits": 0.011414668522775173,
"loss/reg": 5.727158546447754,
"loss/twn": 0.0,
"step": 138
},
{
"epoch": 0.003475,
"grad_norm": 11.1875,
"grad_norm_var": 314.30149739583334,
"learning_rate": 0.0001,
"loss": 8.1258,
"loss/crossentropy": 2.285022497177124,
"loss/hidden": 0.11962890625,
"loss/logits": 0.008713757619261742,
"loss/reg": 5.712470054626465,
"loss/twn": 0.0,
"step": 139
},
{
"epoch": 0.0035,
"grad_norm": 16.375,
"grad_norm_var": 310.479931640625,
"learning_rate": 0.0001,
"loss": 8.1639,
"loss/crossentropy": 2.350821018218994,
"loss/hidden": 0.10107421875,
"loss/logits": 0.012461278587579727,
"loss/reg": 5.699510097503662,
"loss/twn": 0.0,
"step": 140
},
{
"epoch": 0.003525,
"grad_norm": 9.4375,
"grad_norm_var": 314.765478515625,
"learning_rate": 0.0001,
"loss": 7.8463,
"loss/crossentropy": 2.103158473968506,
"loss/hidden": 0.05224609375,
"loss/logits": 0.005224157590419054,
"loss/reg": 5.685665130615234,
"loss/twn": 0.0,
"step": 141
},
{
"epoch": 0.00355,
"grad_norm": 9.0625,
"grad_norm_var": 318.001416015625,
"learning_rate": 0.0001,
"loss": 8.1747,
"loss/crossentropy": 2.418196678161621,
"loss/hidden": 0.07666015625,
"loss/logits": 0.006400687620043755,
"loss/reg": 5.6734795570373535,
"loss/twn": 0.0,
"step": 142
},
{
"epoch": 0.003575,
"grad_norm": 15.4375,
"grad_norm_var": 319.43639322916664,
"learning_rate": 0.0001,
"loss": 6.5554,
"loss/crossentropy": 0.6986656785011292,
"loss/hidden": 0.1845703125,
"loss/logits": 0.012149279937148094,
"loss/reg": 5.660000324249268,
"loss/twn": 0.0,
"step": 143
},
{
"epoch": 0.0036,
"grad_norm": 15.875,
"grad_norm_var": 317.5587890625,
"learning_rate": 0.0001,
"loss": 8.5371,
"loss/crossentropy": 2.7418227195739746,
"loss/hidden": 0.13671875,
"loss/logits": 0.0111403688788414,
"loss/reg": 5.647412300109863,
"loss/twn": 0.0,
"step": 144
},
{
"epoch": 0.003625,
"grad_norm": 13.875,
"grad_norm_var": 246.30520833333333,
"learning_rate": 0.0001,
"loss": 6.2652,
"loss/crossentropy": 0.4583094120025635,
"loss/hidden": 0.158203125,
"loss/logits": 0.012293124571442604,
"loss/reg": 5.6363725662231445,
"loss/twn": 0.0,
"step": 145
},
{
"epoch": 0.00365,
"grad_norm": 22.75,
"grad_norm_var": 245.63854166666667,
"learning_rate": 0.0001,
"loss": 7.3134,
"loss/crossentropy": 1.468201994895935,
"loss/hidden": 0.2109375,
"loss/logits": 0.009673453867435455,
"loss/reg": 5.624554634094238,
"loss/twn": 0.0,
"step": 146
},
{
"epoch": 0.003675,
"grad_norm": 310.0,
"grad_norm_var": 5526.531754557292,
"learning_rate": 0.0001,
"loss": 7.0649,
"loss/crossentropy": 1.2726686000823975,
"loss/hidden": 0.1728515625,
"loss/logits": 0.005334332585334778,
"loss/reg": 5.614006042480469,
"loss/twn": 0.0,
"step": 147
},
{
"epoch": 0.0037,
"grad_norm": 9.25,
"grad_norm_var": 5563.953889973958,
"learning_rate": 0.0001,
"loss": 6.774,
"loss/crossentropy": 1.0251015424728394,
"loss/hidden": 0.138671875,
"loss/logits": 0.007210130337625742,
"loss/reg": 5.603022575378418,
"loss/twn": 0.0,
"step": 148
},
{
"epoch": 0.003725,
"grad_norm": 17.875,
"grad_norm_var": 5575.684358723958,
"learning_rate": 0.0001,
"loss": 8.7246,
"loss/crossentropy": 2.911123275756836,
"loss/hidden": 0.19921875,
"loss/logits": 0.02136034518480301,
"loss/reg": 5.592944145202637,
"loss/twn": 0.0,
"step": 149
},
{
"epoch": 0.00375,
"grad_norm": 9.875,
"grad_norm_var": 5580.160872395833,
"learning_rate": 0.0001,
"loss": 8.39,
"loss/crossentropy": 2.711456537246704,
"loss/hidden": 0.09033203125,
"loss/logits": 0.005852097645401955,
"loss/reg": 5.582311153411865,
"loss/twn": 0.0,
"step": 150
},
{
"epoch": 0.003775,
"grad_norm": 14.625,
"grad_norm_var": 5588.7056640625,
"learning_rate": 0.0001,
"loss": 7.2155,
"loss/crossentropy": 1.4148988723754883,
"loss/hidden": 0.2158203125,
"loss/logits": 0.012599754147231579,
"loss/reg": 5.5721516609191895,
"loss/twn": 0.0,
"step": 151
},
{
"epoch": 0.0038,
"grad_norm": 12.375,
"grad_norm_var": 5588.115869140625,
"learning_rate": 0.0001,
"loss": 8.013,
"loss/crossentropy": 2.3517696857452393,
"loss/hidden": 0.09130859375,
"loss/logits": 0.007323693484067917,
"loss/reg": 5.562623023986816,
"loss/twn": 0.0,
"step": 152
},
{
"epoch": 0.003825,
"grad_norm": 30.5,
"grad_norm_var": 5482.538785807292,
"learning_rate": 0.0001,
"loss": 7.8607,
"loss/crossentropy": 2.2278008460998535,
"loss/hidden": 0.07421875,
"loss/logits": 0.0050660185515880585,
"loss/reg": 5.553621292114258,
"loss/twn": 0.0,
"step": 153
},
{
"epoch": 0.00385,
"grad_norm": 13.0625,
"grad_norm_var": 5478.366129557292,
"learning_rate": 0.0001,
"loss": 8.277,
"loss/crossentropy": 2.206120252609253,
"loss/hidden": 0.5078125,
"loss/logits": 0.018887437880039215,
"loss/reg": 5.544199466705322,
"loss/twn": 0.0,
"step": 154
},
{
"epoch": 0.003875,
"grad_norm": 93.0,
"grad_norm_var": 5656.329622395833,
"learning_rate": 0.0001,
"loss": 8.3416,
"loss/crossentropy": 2.643498420715332,
"loss/hidden": 0.1474609375,
"loss/logits": 0.015275152400135994,
"loss/reg": 5.535386562347412,
"loss/twn": 0.0,
"step": 155
},
{
"epoch": 0.0039,
"grad_norm": 15.8125,
"grad_norm_var": 5657.996468098959,
"learning_rate": 0.0001,
"loss": 8.66,
"loss/crossentropy": 3.025573492050171,
"loss/hidden": 0.0986328125,
"loss/logits": 0.008579680696129799,
"loss/reg": 5.52721643447876,
"loss/twn": 0.0,
"step": 156
},
{
"epoch": 0.003925,
"grad_norm": 15.3125,
"grad_norm_var": 5637.544124348959,
"learning_rate": 0.0001,
"loss": 8.3922,
"loss/crossentropy": 2.7117786407470703,
"loss/hidden": 0.15234375,
"loss/logits": 0.008373694494366646,
"loss/reg": 5.519668102264404,
"loss/twn": 0.0,
"step": 157
},
{
"epoch": 0.00395,
"grad_norm": 24.75,
"grad_norm_var": 5591.000455729167,
"learning_rate": 0.0001,
"loss": 8.4122,
"loss/crossentropy": 2.7280266284942627,
"loss/hidden": 0.1611328125,
"loss/logits": 0.01191728375852108,
"loss/reg": 5.511092662811279,
"loss/twn": 0.0,
"step": 158
},
{
"epoch": 0.003975,
"grad_norm": 12.9375,
"grad_norm_var": 5599.461393229167,
"learning_rate": 0.0001,
"loss": 8.2413,
"loss/crossentropy": 2.5042476654052734,
"loss/hidden": 0.21484375,
"loss/logits": 0.018616054207086563,
"loss/reg": 5.503547668457031,
"loss/twn": 0.0,
"step": 159
},
{
"epoch": 0.004,
"grad_norm": 12.875,
"grad_norm_var": 5609.470768229166,
"learning_rate": 0.0001,
"loss": 8.4032,
"loss/crossentropy": 2.762385606765747,
"loss/hidden": 0.1337890625,
"loss/logits": 0.010888181626796722,
"loss/reg": 5.496166706085205,
"loss/twn": 0.0,
"step": 160
},
{
"epoch": 0.004025,
"grad_norm": 10.8125,
"grad_norm_var": 5620.440738932291,
"learning_rate": 0.0001,
"loss": 8.0804,
"loss/crossentropy": 2.5779385566711426,
"loss/hidden": 0.0093994140625,
"loss/logits": 0.004039571154862642,
"loss/reg": 5.489066123962402,
"loss/twn": 0.0,
"step": 161
},
{
"epoch": 0.00405,
"grad_norm": 334.0,
"grad_norm_var": 10996.149723307291,
"learning_rate": 0.0001,
"loss": 6.2302,
"loss/crossentropy": 0.5805911421775818,
"loss/hidden": 0.158203125,
"loss/logits": 0.008694609627127647,
"loss/reg": 5.482710361480713,
"loss/twn": 0.0,
"step": 162
},
{
"epoch": 0.004075,
"grad_norm": 131.0,
"grad_norm_var": 6997.830452473959,
"learning_rate": 0.0001,
"loss": 7.0482,
"loss/crossentropy": 1.3563833236694336,
"loss/hidden": 0.2060546875,
"loss/logits": 0.010020879097282887,
"loss/reg": 5.475753307342529,
"loss/twn": 0.0,
"step": 163
},
{
"epoch": 0.0041,
"grad_norm": 13.125,
"grad_norm_var": 6979.068994140625,
"learning_rate": 0.0001,
"loss": 8.1687,
"loss/crossentropy": 2.569106101989746,
"loss/hidden": 0.1171875,
"loss/logits": 0.012607071548700333,
"loss/reg": 5.469805717468262,
"loss/twn": 0.0,
"step": 164
},
{
"epoch": 0.004125,
"grad_norm": 14.1875,
"grad_norm_var": 6994.544010416666,
"learning_rate": 0.0001,
"loss": 7.2464,
"loss/crossentropy": 1.6330703496932983,
"loss/hidden": 0.142578125,
"loss/logits": 0.007347936742007732,
"loss/reg": 5.463380336761475,
"loss/twn": 0.0,
"step": 165
},
{
"epoch": 0.00415,
"grad_norm": 11.125,
"grad_norm_var": 6988.3890625,
"learning_rate": 0.0001,
"loss": 8.1928,
"loss/crossentropy": 2.5154733657836914,
"loss/hidden": 0.20703125,
"loss/logits": 0.013312840834259987,
"loss/reg": 5.45693826675415,
"loss/twn": 0.0,
"step": 166
},
{
"epoch": 0.004175,
"grad_norm": 13.875,
"grad_norm_var": 6991.70859375,
"learning_rate": 0.0001,
"loss": 7.1058,
"loss/crossentropy": 1.5941680669784546,
"loss/hidden": 0.05712890625,
"loss/logits": 0.003117609303444624,
"loss/reg": 5.451424598693848,
"loss/twn": 0.0,
"step": 167
},
{
"epoch": 0.0042,
"grad_norm": 25.5,
"grad_norm_var": 6941.1431640625,
"learning_rate": 0.0001,
"loss": 5.8236,
"loss/crossentropy": 0.22541135549545288,
"loss/hidden": 0.1494140625,
"loss/logits": 0.002781955059617758,
"loss/reg": 5.445990562438965,
"loss/twn": 0.0,
"step": 168
},
{
"epoch": 0.004225,
"grad_norm": 9.9375,
"grad_norm_var": 7016.212353515625,
"learning_rate": 0.0001,
"loss": 8.0762,
"loss/crossentropy": 2.55377197265625,
"loss/hidden": 0.07666015625,
"loss/logits": 0.005485657136887312,
"loss/reg": 5.4402995109558105,
"loss/twn": 0.0,
"step": 169
},
{
"epoch": 0.00425,
"grad_norm": 16.0,
"grad_norm_var": 7003.476302083333,
"learning_rate": 0.0001,
"loss": 8.043,
"loss/crossentropy": 2.316843032836914,
"loss/hidden": 0.26953125,
"loss/logits": 0.021316442638635635,
"loss/reg": 5.435269832611084,
"loss/twn": 0.0,
"step": 170
},
{
"epoch": 0.004275,
"grad_norm": 11.125,
"grad_norm_var": 6921.814518229166,
"learning_rate": 0.0001,
"loss": 8.12,
"loss/crossentropy": 2.539064645767212,
"loss/hidden": 0.134765625,
"loss/logits": 0.01590101048350334,
"loss/reg": 5.430272579193115,
"loss/twn": 0.0,
"step": 171
},
{
"epoch": 0.0043,
"grad_norm": 12.5625,
"grad_norm_var": 6933.832747395833,
"learning_rate": 0.0001,
"loss": 7.4549,
"loss/crossentropy": 1.8348197937011719,
"loss/hidden": 0.1826171875,
"loss/logits": 0.012229220010340214,
"loss/reg": 5.425241470336914,
"loss/twn": 0.0,
"step": 172
},
{
"epoch": 0.004325,
"grad_norm": 14.1875,
"grad_norm_var": 6937.888020833333,
"learning_rate": 0.0001,
"loss": 6.9049,
"loss/crossentropy": 1.270473599433899,
"loss/hidden": 0.205078125,
"loss/logits": 0.009345902130007744,
"loss/reg": 5.4199981689453125,
"loss/twn": 0.0,
"step": 173
},
{
"epoch": 0.00435,
"grad_norm": 10.5625,
"grad_norm_var": 6982.626676432292,
"learning_rate": 0.0001,
"loss": 7.0624,
"loss/crossentropy": 1.425657033920288,
"loss/hidden": 0.2099609375,
"loss/logits": 0.011879321187734604,
"loss/reg": 5.4148969650268555,
"loss/twn": 0.0,
"step": 174
},
{
"epoch": 0.004375,
"grad_norm": 8.875,
"grad_norm_var": 6998.784635416667,
"learning_rate": 0.0001,
"loss": 7.1708,
"loss/crossentropy": 1.4886845350265503,
"loss/hidden": 0.265625,
"loss/logits": 0.005756002385169268,
"loss/reg": 5.410771369934082,
"loss/twn": 0.0,
"step": 175
},
{
"epoch": 0.0044,
"grad_norm": 28.625,
"grad_norm_var": 6956.046354166667,
"learning_rate": 0.0001,
"loss": 6.1956,
"loss/crossentropy": 0.5712894201278687,
"loss/hidden": 0.2119140625,
"loss/logits": 0.006170031148940325,
"loss/reg": 5.406195163726807,
"loss/twn": 0.0,
"step": 176
},
{
"epoch": 0.004425,
"grad_norm": 8.75,
"grad_norm_var": 6964.777067057292,
"learning_rate": 0.0001,
"loss": 6.737,
"loss/crossentropy": 1.1914383172988892,
"loss/hidden": 0.13671875,
"loss/logits": 0.006925875786691904,
"loss/reg": 5.401881694793701,
"loss/twn": 0.0,
"step": 177
},
{
"epoch": 0.00445,
"grad_norm": 44.5,
"grad_norm_var": 911.0606608072917,
"learning_rate": 0.0001,
"loss": 8.1517,
"loss/crossentropy": 2.5778274536132812,
"loss/hidden": 0.1669921875,
"loss/logits": 0.009458218701183796,
"loss/reg": 5.397425174713135,
"loss/twn": 0.0,
"step": 178
},
{
"epoch": 0.004475,
"grad_norm": 16.625,
"grad_norm_var": 87.32237955729167,
"learning_rate": 0.0001,
"loss": 7.1888,
"loss/crossentropy": 1.5994395017623901,
"loss/hidden": 0.185546875,
"loss/logits": 0.01000029407441616,
"loss/reg": 5.393801212310791,
"loss/twn": 0.0,
"step": 179
},
{
"epoch": 0.0045,
"grad_norm": 12.1875,
"grad_norm_var": 87.76451822916667,
"learning_rate": 0.0001,
"loss": 8.0046,
"loss/crossentropy": 2.455324172973633,
"loss/hidden": 0.1474609375,
"loss/logits": 0.012162324041128159,
"loss/reg": 5.389675617218018,
"loss/twn": 0.0,
"step": 180
},
{
"epoch": 0.004525,
"grad_norm": 124.0,
"grad_norm_var": 812.4984212239583,
"learning_rate": 0.0001,
"loss": 6.3516,
"loss/crossentropy": 0.8374608755111694,
"loss/hidden": 0.125,
"loss/logits": 0.003441192675381899,
"loss/reg": 5.3856940269470215,
"loss/twn": 0.0,
"step": 181
},
{
"epoch": 0.00455,
"grad_norm": 10.9375,
"grad_norm_var": 812.7981770833334,
"learning_rate": 0.0001,
"loss": 8.1559,
"loss/crossentropy": 2.670118570327759,
"loss/hidden": 0.0986328125,
"loss/logits": 0.005220318678766489,
"loss/reg": 5.381902694702148,
"loss/twn": 0.0,
"step": 182
},
{
"epoch": 0.004575,
"grad_norm": 10.0,
"grad_norm_var": 818.4593098958334,
"learning_rate": 0.0001,
"loss": 7.6118,
"loss/crossentropy": 2.0260519981384277,
"loss/hidden": 0.1982421875,
"loss/logits": 0.009290603920817375,
"loss/reg": 5.378239631652832,
"loss/twn": 0.0,
"step": 183
},
{
"epoch": 0.0046,
"grad_norm": 11.3125,
"grad_norm_var": 825.881884765625,
"learning_rate": 0.0001,
"loss": 8.0393,
"loss/crossentropy": 2.524176597595215,
"loss/hidden": 0.130859375,
"loss/logits": 0.009289154782891273,
"loss/reg": 5.3750152587890625,
"loss/twn": 0.0,
"step": 184
},
{
"epoch": 0.004625,
"grad_norm": 17.375,
"grad_norm_var": 817.4895182291667,
"learning_rate": 0.0001,
"loss": 8.2323,
"loss/crossentropy": 2.7599668502807617,
"loss/hidden": 0.09130859375,
"loss/logits": 0.009666713885962963,
"loss/reg": 5.371392250061035,
"loss/twn": 0.0,
"step": 185
},
{
"epoch": 0.00465,
"grad_norm": 20.625,
"grad_norm_var": 814.9096354166667,
"learning_rate": 0.0001,
"loss": 7.9849,
"loss/crossentropy": 2.4787378311157227,
"loss/hidden": 0.126953125,
"loss/logits": 0.011332664638757706,
"loss/reg": 5.367901802062988,
"loss/twn": 0.0,
"step": 186
},
{
"epoch": 0.004675,
"grad_norm": 20.625,
"grad_norm_var": 805.9638020833333,
"learning_rate": 0.0001,
"loss": 8.1101,
"loss/crossentropy": 2.6036434173583984,
"loss/hidden": 0.1318359375,
"loss/logits": 0.009613338857889175,
"loss/reg": 5.364970684051514,
"loss/twn": 0.0,
"step": 187
},
{
"epoch": 0.0047,
"grad_norm": 14.5,
"grad_norm_var": 803.4415201822917,
"learning_rate": 0.0001,
"loss": 8.2869,
"loss/crossentropy": 2.7700679302215576,
"loss/hidden": 0.1435546875,
"loss/logits": 0.011366615071892738,
"loss/reg": 5.361906051635742,
"loss/twn": 0.0,
"step": 188
},
{
"epoch": 0.004725,
"grad_norm": 30.75,
"grad_norm_var": 800.3403645833333,
"learning_rate": 0.0001,
"loss": 8.1568,
"loss/crossentropy": 2.633868455886841,
"loss/hidden": 0.158203125,
"loss/logits": 0.006533905863761902,
"loss/reg": 5.358221054077148,
"loss/twn": 0.0,
"step": 189
},
{
"epoch": 0.00475,
"grad_norm": 33.0,
"grad_norm_var": 790.4363118489583,
"learning_rate": 0.0001,
"loss": 7.5575,
"loss/crossentropy": 1.9494565725326538,
"loss/hidden": 0.2412109375,
"loss/logits": 0.011269403621554375,
"loss/reg": 5.3555755615234375,
"loss/twn": 0.0,
"step": 190
},
{
"epoch": 0.004775,
"grad_norm": 9.625,
"grad_norm_var": 788.7796712239583,
"learning_rate": 0.0001,
"loss": 7.0502,
"loss/crossentropy": 1.607956051826477,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0032915128394961357,
"loss/reg": 5.35251522064209,
"loss/twn": 0.0,
"step": 191
},
{
"epoch": 0.0048,
"grad_norm": 100.5,
"grad_norm_var": 1138.346728515625,
"learning_rate": 0.0001,
"loss": 8.4206,
"loss/crossentropy": 2.9291961193084717,
"loss/hidden": 0.1337890625,
"loss/logits": 0.007461494766175747,
"loss/reg": 5.3501105308532715,
"loss/twn": 0.0,
"step": 192
},
{
"epoch": 0.004825,
"grad_norm": 14.875,
"grad_norm_var": 1123.0661295572916,
"learning_rate": 0.0001,
"loss": 7.4399,
"loss/crossentropy": 1.9533724784851074,
"loss/hidden": 0.134765625,
"loss/logits": 0.004339105449616909,
"loss/reg": 5.3473944664001465,
"loss/twn": 0.0,
"step": 193
},
{
"epoch": 0.00485,
"grad_norm": 64.5,
"grad_norm_var": 1184.8265462239583,
"learning_rate": 0.0001,
"loss": 7.542,
"loss/crossentropy": 2.0671801567077637,
"loss/hidden": 0.1171875,
"loss/logits": 0.012455419637262821,
"loss/reg": 5.345158100128174,
"loss/twn": 0.0,
"step": 194
},
{
"epoch": 0.004875,
"grad_norm": 16.125,
"grad_norm_var": 1185.8648274739583,
"learning_rate": 0.0001,
"loss": 7.938,
"loss/crossentropy": 2.456360101699829,
"loss/hidden": 0.130859375,
"loss/logits": 0.008208954706788063,
"loss/reg": 5.342526435852051,
"loss/twn": 0.0,
"step": 195
},
{
"epoch": 0.0049,
"grad_norm": 80.0,
"grad_norm_var": 1294.7356770833333,
"learning_rate": 0.0001,
"loss": 7.8581,
"loss/crossentropy": 2.487790107727051,
"loss/hidden": 0.025634765625,
"loss/logits": 0.0046631209552288055,
"loss/reg": 5.340009689331055,
"loss/twn": 0.0,
"step": 196
},
{
"epoch": 0.004925,
"grad_norm": 14.75,
"grad_norm_var": 761.3453125,
"learning_rate": 0.0001,
"loss": 6.8181,
"loss/crossentropy": 1.304626703262329,
"loss/hidden": 0.171875,
"loss/logits": 0.004350706003606319,
"loss/reg": 5.337262153625488,
"loss/twn": 0.0,
"step": 197
},
{
"epoch": 0.00495,
"grad_norm": 11.125,
"grad_norm_var": 760.887353515625,
"learning_rate": 0.0001,
"loss": 7.9797,
"loss/crossentropy": 2.4714841842651367,
"loss/hidden": 0.1611328125,
"loss/logits": 0.011602582409977913,
"loss/reg": 5.335472106933594,
"loss/twn": 0.0,
"step": 198
},
{
"epoch": 0.004975,
"grad_norm": 95.0,
"grad_norm_var": 993.0878743489583,
"learning_rate": 0.0001,
"loss": 7.04,
"loss/crossentropy": 1.4688469171524048,
"loss/hidden": 0.2275390625,
"loss/logits": 0.0106576569378376,
"loss/reg": 5.332970142364502,
"loss/twn": 0.0,
"step": 199
},
{
"epoch": 0.005,
"grad_norm": 9.9375,
"grad_norm_var": 997.4878743489584,
"learning_rate": 0.0001,
"loss": 6.7972,
"loss/crossentropy": 1.3856897354125977,
"loss/hidden": 0.080078125,
"loss/logits": 0.001242777332663536,
"loss/reg": 5.330203056335449,
"loss/twn": 0.0,
"step": 200
},
{
"epoch": 0.005025,
"grad_norm": 11.875,
"grad_norm_var": 1011.9969889322916,
"learning_rate": 0.0001,
"loss": 7.5639,
"loss/crossentropy": 2.073434591293335,
"loss/hidden": 0.1484375,
"loss/logits": 0.013964459300041199,
"loss/reg": 5.328036308288574,
"loss/twn": 0.0,
"step": 201
},
{
"epoch": 0.00505,
"grad_norm": 18.25,
"grad_norm_var": 1016.660400390625,
"learning_rate": 0.0001,
"loss": 8.68,
"loss/crossentropy": 3.2589170932769775,
"loss/hidden": 0.08642578125,
"loss/logits": 0.008732986636459827,
"loss/reg": 5.325946807861328,
"loss/twn": 0.0,
"step": 202
},
{
"epoch": 0.005075,
"grad_norm": 16.125,
"grad_norm_var": 1026.004931640625,
"learning_rate": 0.0001,
"loss": 8.6078,
"loss/crossentropy": 3.1099424362182617,
"loss/hidden": 0.1552734375,
"loss/logits": 0.018260516226291656,
"loss/reg": 5.324294090270996,
"loss/twn": 0.0,
"step": 203
},
{
"epoch": 0.0051,
"grad_norm": 46.75,
"grad_norm_var": 1007.981884765625,
"learning_rate": 0.0001,
"loss": 7.5539,
"loss/crossentropy": 2.091862678527832,
"loss/hidden": 0.130859375,
"loss/logits": 0.009298876859247684,
"loss/reg": 5.321921348571777,
"loss/twn": 0.0,
"step": 204
},
{
"epoch": 0.005125,
"grad_norm": 10.3125,
"grad_norm_var": 1047.91484375,
"learning_rate": 0.0001,
"loss": 8.124,
"loss/crossentropy": 2.732879400253296,
"loss/hidden": 0.064453125,
"loss/logits": 0.0066910069435834885,
"loss/reg": 5.320003986358643,
"loss/twn": 0.0,
"step": 205
},
{
"epoch": 0.00515,
"grad_norm": 11.0625,
"grad_norm_var": 1082.517822265625,
"learning_rate": 0.0001,
"loss": 6.9944,
"loss/crossentropy": 1.5260496139526367,
"loss/hidden": 0.142578125,
"loss/logits": 0.007849331945180893,
"loss/reg": 5.317881107330322,
"loss/twn": 0.0,
"step": 206
},
{
"epoch": 0.005175,
"grad_norm": 8.6875,
"grad_norm_var": 1085.5166015625,
"learning_rate": 0.0001,
"loss": 6.7833,
"loss/crossentropy": 1.3956589698791504,
"loss/hidden": 0.06689453125,
"loss/logits": 0.004923268221318722,
"loss/reg": 5.315812587738037,
"loss/twn": 0.0,
"step": 207
},
{
"epoch": 0.0052,
"grad_norm": 10.0625,
"grad_norm_var": 784.176025390625,
"learning_rate": 0.0001,
"loss": 6.6116,
"loss/crossentropy": 1.040010690689087,
"loss/hidden": 0.2451171875,
"loss/logits": 0.011602293699979782,
"loss/reg": 5.3148298263549805,
"loss/twn": 0.0,
"step": 208
},
{
"epoch": 0.005225,
"grad_norm": 21.875,
"grad_norm_var": 775.4880045572917,
"learning_rate": 0.0001,
"loss": 8.5451,
"loss/crossentropy": 3.072871685028076,
"loss/hidden": 0.1474609375,
"loss/logits": 0.011389853432774544,
"loss/reg": 5.313349723815918,
"loss/twn": 0.0,
"step": 209
},
{
"epoch": 0.00525,
"grad_norm": 16.25,
"grad_norm_var": 685.5469889322917,
"learning_rate": 0.0001,
"loss": 6.872,
"loss/crossentropy": 1.4240162372589111,
"loss/hidden": 0.130859375,
"loss/logits": 0.0054016802459955215,
"loss/reg": 5.311694145202637,
"loss/twn": 0.0,
"step": 210
},
{
"epoch": 0.005275,
"grad_norm": 10.8125,
"grad_norm_var": 693.5171223958333,
"learning_rate": 0.0001,
"loss": 6.665,
"loss/crossentropy": 1.2265129089355469,
"loss/hidden": 0.12451171875,
"loss/logits": 0.0037709574680775404,
"loss/reg": 5.31024169921875,
"loss/twn": 0.0,
"step": 211
},
{
"epoch": 0.0053,
"grad_norm": 11.4375,
"grad_norm_var": 480.45558268229166,
"learning_rate": 0.0001,
"loss": 8.1829,
"loss/crossentropy": 2.7488791942596436,
"loss/hidden": 0.11767578125,
"loss/logits": 0.007810299750417471,
"loss/reg": 5.3085198402404785,
"loss/twn": 0.0,
"step": 212
},
{
"epoch": 0.005325,
"grad_norm": 13.5,
"grad_norm_var": 481.47316080729166,
"learning_rate": 0.0001,
"loss": 8.1725,
"loss/crossentropy": 2.769019603729248,
"loss/hidden": 0.08642578125,
"loss/logits": 0.010608029551804066,
"loss/reg": 5.306417942047119,
"loss/twn": 0.0,
"step": 213
},
{
"epoch": 0.00535,
"grad_norm": 10.875,
"grad_norm_var": 481.77928059895834,
"learning_rate": 0.0001,
"loss": 7.1392,
"loss/crossentropy": 1.7968316078186035,
"loss/hidden": 0.03271484375,
"loss/logits": 0.004501561634242535,
"loss/reg": 5.305141925811768,
"loss/twn": 0.0,
"step": 214
},
{
"epoch": 0.005375,
"grad_norm": 11.1875,
"grad_norm_var": 84.65208333333334,
"learning_rate": 0.0001,
"loss": 7.577,
"loss/crossentropy": 2.1686487197875977,
"loss/hidden": 0.0986328125,
"loss/logits": 0.006175590679049492,
"loss/reg": 5.303523540496826,
"loss/twn": 0.0,
"step": 215
},
{
"epoch": 0.0054,
"grad_norm": 12.0625,
"grad_norm_var": 83.51764322916667,
"learning_rate": 0.0001,
"loss": 8.042,
"loss/crossentropy": 2.737717628479004,
"loss/hidden": 6.16908073425293e-06,
"loss/logits": 0.0018352700863033533,
"loss/reg": 5.302443027496338,
"loss/twn": 0.0,
"step": 216
},
{
"epoch": 0.005425,
"grad_norm": 12.8125,
"grad_norm_var": 83.17316080729167,
"learning_rate": 0.0001,
"loss": 8.3658,
"loss/crossentropy": 2.9845688343048096,
"loss/hidden": 0.07421875,
"loss/logits": 0.005686669610440731,
"loss/reg": 5.301285743713379,
"loss/twn": 0.0,
"step": 217
},
{
"epoch": 0.00545,
"grad_norm": 13.25,
"grad_norm_var": 82.654931640625,
"learning_rate": 0.0001,
"loss": 8.1609,
"loss/crossentropy": 2.7524123191833496,
"loss/hidden": 0.10107421875,
"loss/logits": 0.007715051528066397,
"loss/reg": 5.299709320068359,
"loss/twn": 0.0,
"step": 218
},
{
"epoch": 0.005475,
"grad_norm": 23.375,
"grad_norm_var": 87.20506184895834,
"learning_rate": 0.0001,
"loss": 7.156,
"loss/crossentropy": 1.624443531036377,
"loss/hidden": 0.2216796875,
"loss/logits": 0.011536870151758194,
"loss/reg": 5.298386573791504,
"loss/twn": 0.0,
"step": 219
},
{
"epoch": 0.0055,
"grad_norm": 18.625,
"grad_norm_var": 18.591780598958334,
"learning_rate": 0.0001,
"loss": 8.0654,
"loss/crossentropy": 2.5879249572753906,
"loss/hidden": 0.1630859375,
"loss/logits": 0.017003701999783516,
"loss/reg": 5.297426223754883,
"loss/twn": 0.0,
"step": 220
},
{
"epoch": 0.005525,
"grad_norm": 43.25,
"grad_norm_var": 72.34680989583333,
"learning_rate": 0.0001,
"loss": 7.8801,
"loss/crossentropy": 2.4491031169891357,
"loss/hidden": 0.125,
"loss/logits": 0.010065239854156971,
"loss/reg": 5.295965194702148,
"loss/twn": 0.0,
"step": 221
},
{
"epoch": 0.00555,
"grad_norm": 9.5,
"grad_norm_var": 73.438525390625,
"learning_rate": 0.0001,
"loss": 7.2681,
"loss/crossentropy": 1.8589441776275635,
"loss/hidden": 0.10595703125,
"loss/logits": 0.0087303277105093,
"loss/reg": 5.2944464683532715,
"loss/twn": 0.0,
"step": 222
},
{
"epoch": 0.005575,
"grad_norm": 10.3125,
"grad_norm_var": 72.133447265625,
"learning_rate": 0.0001,
"loss": 6.9474,
"loss/crossentropy": 1.452248215675354,
"loss/hidden": 0.1962890625,
"loss/logits": 0.005369896534830332,
"loss/reg": 5.293449401855469,
"loss/twn": 0.0,
"step": 223
},
{
"epoch": 0.0056,
"grad_norm": 15.25,
"grad_norm_var": 70.00305989583333,
"learning_rate": 0.0001,
"loss": 8.1935,
"loss/crossentropy": 2.740863800048828,
"loss/hidden": 0.14453125,
"loss/logits": 0.01583397574722767,
"loss/reg": 5.292267799377441,
"loss/twn": 0.0,
"step": 224
},
{
"epoch": 0.005625,
"grad_norm": 1056.0,
"grad_norm_var": 67732.47864583334,
"learning_rate": 0.0001,
"loss": 7.8539,
"loss/crossentropy": 2.403062105178833,
"loss/hidden": 0.1474609375,
"loss/logits": 0.012353872880339622,
"loss/reg": 5.29097318649292,
"loss/twn": 0.0,
"step": 225
},
{
"epoch": 0.00565,
"grad_norm": 10.3125,
"grad_norm_var": 67785.57133789062,
"learning_rate": 0.0001,
"loss": 7.194,
"loss/crossentropy": 1.7008212804794312,
"loss/hidden": 0.189453125,
"loss/logits": 0.013665840029716492,
"loss/reg": 5.290075778961182,
"loss/twn": 0.0,
"step": 226
},
{
"epoch": 0.005675,
"grad_norm": 12.5,
"grad_norm_var": 67770.14609375,
"learning_rate": 0.0001,
"loss": 6.5587,
"loss/crossentropy": 1.045196771621704,
"loss/hidden": 0.2080078125,
"loss/logits": 0.016789620742201805,
"loss/reg": 5.288687705993652,
"loss/twn": 0.0,
"step": 227
},
{
"epoch": 0.0057,
"grad_norm": 27.625,
"grad_norm_var": 67637.96925455729,
"learning_rate": 0.0001,
"loss": 7.1449,
"loss/crossentropy": 1.6837458610534668,
"loss/hidden": 0.158203125,
"loss/logits": 0.015014993026852608,
"loss/reg": 5.2879252433776855,
"loss/twn": 0.0,
"step": 228
},
{
"epoch": 0.005725,
"grad_norm": 10.6875,
"grad_norm_var": 67663.88014322917,
"learning_rate": 0.0001,
"loss": 7.0729,
"loss/crossentropy": 1.7119382619857788,
"loss/hidden": 0.06689453125,
"loss/logits": 0.007287868298590183,
"loss/reg": 5.286799907684326,
"loss/twn": 0.0,
"step": 229
},
{
"epoch": 0.00575,
"grad_norm": 53.0,
"grad_norm_var": 67380.34817708333,
"learning_rate": 0.0001,
"loss": 7.5607,
"loss/crossentropy": 2.0949935913085938,
"loss/hidden": 0.169921875,
"loss/logits": 0.009483925998210907,
"loss/reg": 5.286267280578613,
"loss/twn": 0.0,
"step": 230
},
{
"epoch": 0.005775,
"grad_norm": 19.375,
"grad_norm_var": 67305.34086914062,
"learning_rate": 0.0001,
"loss": 7.9328,
"loss/crossentropy": 2.538356304168701,
"loss/hidden": 0.10498046875,
"loss/logits": 0.005003707949072123,
"loss/reg": 5.284492492675781,
"loss/twn": 0.0,
"step": 231
},
{
"epoch": 0.0058,
"grad_norm": 24.5,
"grad_norm_var": 67195.30462239584,
"learning_rate": 0.0001,
"loss": 8.2162,
"loss/crossentropy": 2.749314308166504,
"loss/hidden": 0.169921875,
"loss/logits": 0.012603437528014183,
"loss/reg": 5.2843194007873535,
"loss/twn": 0.0,
"step": 232
},
{
"epoch": 0.005825,
"grad_norm": 14.6875,
"grad_norm_var": 67177.47161458334,
"learning_rate": 0.0001,
"loss": 7.1017,
"loss/crossentropy": 1.6476311683654785,
"loss/hidden": 0.1572265625,
"loss/logits": 0.01403855625540018,
"loss/reg": 5.282772064208984,
"loss/twn": 0.0,
"step": 233
},
{
"epoch": 0.00585,
"grad_norm": 10.875,
"grad_norm_var": 67200.58951822917,
"learning_rate": 0.0001,
"loss": 7.2783,
"loss/crossentropy": 1.8889567852020264,
"loss/hidden": 0.09814453125,
"loss/logits": 0.009974194690585136,
"loss/reg": 5.281259059906006,
"loss/twn": 0.0,
"step": 234
},
{
"epoch": 0.005875,
"grad_norm": 8.4375,
"grad_norm_var": 67337.25597330728,
"learning_rate": 0.0001,
"loss": 7.2088,
"loss/crossentropy": 1.7615716457366943,
"loss/hidden": 0.16015625,
"loss/logits": 0.0061751967296004295,
"loss/reg": 5.280921459197998,
"loss/twn": 0.0,
"step": 235
},
{
"epoch": 0.0059,
"grad_norm": 9.4375,
"grad_norm_var": 67422.68776041667,
"learning_rate": 0.0001,
"loss": 7.3424,
"loss/crossentropy": 1.9424008131027222,
"loss/hidden": 0.115234375,
"loss/logits": 0.004247123841196299,
"loss/reg": 5.280468940734863,
"loss/twn": 0.0,
"step": 236
},
{
"epoch": 0.005925,
"grad_norm": 10.3125,
"grad_norm_var": 67667.18865559896,
"learning_rate": 0.0001,
"loss": 8.127,
"loss/crossentropy": 2.7163503170013428,
"loss/hidden": 0.12255859375,
"loss/logits": 0.008166075684130192,
"loss/reg": 5.279946804046631,
"loss/twn": 0.0,
"step": 237
},
{
"epoch": 0.00595,
"grad_norm": 12.5,
"grad_norm_var": 67638.98084309897,
"learning_rate": 0.0001,
"loss": 7.0844,
"loss/crossentropy": 1.5295031070709229,
"loss/hidden": 0.26953125,
"loss/logits": 0.006746275350451469,
"loss/reg": 5.278590679168701,
"loss/twn": 0.0,
"step": 238
},
{
"epoch": 0.005975,
"grad_norm": 92.5,
"grad_norm_var": 67279.8171875,
"learning_rate": 0.0001,
"loss": 5.8036,
"loss/crossentropy": 0.4050528109073639,
"loss/hidden": 0.1142578125,
"loss/logits": 0.005355454981327057,
"loss/reg": 5.278897762298584,
"loss/twn": 0.0,
"step": 239
},
{
"epoch": 0.006,
"grad_norm": 28.875,
"grad_norm_var": 67161.52805989583,
"learning_rate": 0.0001,
"loss": 8.1647,
"loss/crossentropy": 2.643662929534912,
"loss/hidden": 0.2294921875,
"loss/logits": 0.013766671530902386,
"loss/reg": 5.277756214141846,
"loss/twn": 0.0,
"step": 240
},
{
"epoch": 0.006025,
"grad_norm": 13.1875,
"grad_norm_var": 479.914697265625,
"learning_rate": 0.0001,
"loss": 6.9306,
"loss/crossentropy": 1.5002285242080688,
"loss/hidden": 0.1435546875,
"loss/logits": 0.009923199191689491,
"loss/reg": 5.276930809020996,
"loss/twn": 0.0,
"step": 241
},
{
"epoch": 0.00605,
"grad_norm": 11.9375,
"grad_norm_var": 477.45519205729164,
"learning_rate": 0.0001,
"loss": 8.1267,
"loss/crossentropy": 2.7130773067474365,
"loss/hidden": 0.12890625,
"loss/logits": 0.00791969709098339,
"loss/reg": 5.276750087738037,
"loss/twn": 0.0,
"step": 242
},
{
"epoch": 0.006075,
"grad_norm": 10.75,
"grad_norm_var": 479.98631184895834,
"learning_rate": 0.0001,
"loss": 7.1488,
"loss/crossentropy": 1.727049469947815,
"loss/hidden": 0.1376953125,
"loss/logits": 0.00825846754014492,
"loss/reg": 5.2758002281188965,
"loss/twn": 0.0,
"step": 243
},
{
"epoch": 0.0061,
"grad_norm": 15.4375,
"grad_norm_var": 480.80833333333334,
"learning_rate": 0.0001,
"loss": 7.921,
"loss/crossentropy": 2.5109403133392334,
"loss/hidden": 0.12255859375,
"loss/logits": 0.012591829523444176,
"loss/reg": 5.274876594543457,
"loss/twn": 0.0,
"step": 244
},
{
"epoch": 0.006125,
"grad_norm": 14.6875,
"grad_norm_var": 475.9583333333333,
"learning_rate": 0.0001,
"loss": 7.9285,
"loss/crossentropy": 2.4956674575805664,
"loss/hidden": 0.146484375,
"loss/logits": 0.012429025955498219,
"loss/reg": 5.2739410400390625,
"loss/twn": 0.0,
"step": 245
},
{
"epoch": 0.00615,
"grad_norm": 11.5625,
"grad_norm_var": 411.4820149739583,
"learning_rate": 0.0001,
"loss": 7.9078,
"loss/crossentropy": 2.5263020992279053,
"loss/hidden": 0.10107421875,
"loss/logits": 0.007129446603357792,
"loss/reg": 5.273260116577148,
"loss/twn": 0.0,
"step": 246
},
{
"epoch": 0.006175,
"grad_norm": 17.5,
"grad_norm_var": 411.6870930989583,
"learning_rate": 0.0001,
"loss": 7.0028,
"loss/crossentropy": 1.6373541355133057,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0059446613304317,
"loss/reg": 5.273036479949951,
"loss/twn": 0.0,
"step": 247
},
{
"epoch": 0.0062,
"grad_norm": 11.375,
"grad_norm_var": 413.1773274739583,
"learning_rate": 0.0001,
"loss": 7.5298,
"loss/crossentropy": 2.1802141666412354,
"loss/hidden": 0.07568359375,
"loss/logits": 0.0017865689005702734,
"loss/reg": 5.272162914276123,
"loss/twn": 0.0,
"step": 248
},
{
"epoch": 0.006225,
"grad_norm": 8.625,
"grad_norm_var": 418.4583333333333,
"learning_rate": 0.0001,
"loss": 6.7983,
"loss/crossentropy": 1.4629161357879639,
"loss/hidden": 0.06201171875,
"loss/logits": 0.0020404397509992123,
"loss/reg": 5.271305084228516,
"loss/twn": 0.0,
"step": 249
},
{
"epoch": 0.00625,
"grad_norm": 10.625,
"grad_norm_var": 418.6997395833333,
"learning_rate": 0.0001,
"loss": 6.9642,
"loss/crossentropy": 1.5569666624069214,
"loss/hidden": 0.12451171875,
"loss/logits": 0.011692131869494915,
"loss/reg": 5.27101469039917,
"loss/twn": 0.0,
"step": 250
},
{
"epoch": 0.006275,
"grad_norm": 174.0,
"grad_norm_var": 1921.1363118489583,
"learning_rate": 0.0001,
"loss": 6.8711,
"loss/crossentropy": 1.442520260810852,
"loss/hidden": 0.1484375,
"loss/logits": 0.009676285088062286,
"loss/reg": 5.270504951477051,
"loss/twn": 0.0,
"step": 251
},
{
"epoch": 0.0063,
"grad_norm": 7.75,
"grad_norm_var": 1925.5655598958333,
"learning_rate": 0.0001,
"loss": 6.6066,
"loss/crossentropy": 1.229660153388977,
"loss/hidden": 0.10107421875,
"loss/logits": 0.006052733864635229,
"loss/reg": 5.269782543182373,
"loss/twn": 0.0,
"step": 252
},
{
"epoch": 0.006325,
"grad_norm": 10.75,
"grad_norm_var": 1924.5325358072917,
"learning_rate": 0.0001,
"loss": 7.977,
"loss/crossentropy": 2.645150899887085,
"loss/hidden": 0.05712890625,
"loss/logits": 0.005253541748970747,
"loss/reg": 5.269515514373779,
"loss/twn": 0.0,
"step": 253
},
{
"epoch": 0.00635,
"grad_norm": 15.3125,
"grad_norm_var": 1919.1192057291667,
"learning_rate": 0.0001,
"loss": 8.4118,
"loss/crossentropy": 2.997927188873291,
"loss/hidden": 0.1337890625,
"loss/logits": 0.010878477245569229,
"loss/reg": 5.269217014312744,
"loss/twn": 0.0,
"step": 254
},
{
"epoch": 0.006375,
"grad_norm": 10.5625,
"grad_norm_var": 1638.7606608072917,
"learning_rate": 0.0001,
"loss": 7.96,
"loss/crossentropy": 2.5343592166900635,
"loss/hidden": 0.1494140625,
"loss/logits": 0.00772454310208559,
"loss/reg": 5.268545627593994,
"loss/twn": 0.0,
"step": 255
},
{
"epoch": 0.0064,
"grad_norm": 11.125,
"grad_norm_var": 1645.2782389322917,
"learning_rate": 0.0001,
"loss": 8.081,
"loss/crossentropy": 2.7416863441467285,
"loss/hidden": 0.06689453125,
"loss/logits": 0.004296740982681513,
"loss/reg": 5.268085479736328,
"loss/twn": 0.0,
"step": 256
},
{
"epoch": 0.006425,
"grad_norm": 83.5,
"grad_norm_var": 1869.7838541666667,
"learning_rate": 0.0001,
"loss": 7.679,
"loss/crossentropy": 2.278621196746826,
"loss/hidden": 0.12255859375,
"loss/logits": 0.01035086065530777,
"loss/reg": 5.267502307891846,
"loss/twn": 0.0,
"step": 257
},
{
"epoch": 0.00645,
"grad_norm": 11.3125,
"grad_norm_var": 1871.0296223958333,
"learning_rate": 0.0001,
"loss": 7.0164,
"loss/crossentropy": 1.6138280630111694,
"loss/hidden": 0.125,
"loss/logits": 0.01041114330291748,
"loss/reg": 5.267125606536865,
"loss/twn": 0.0,
"step": 258
},
{
"epoch": 0.006475,
"grad_norm": 9.375,
"grad_norm_var": 1874.0453125,
"learning_rate": 0.0001,
"loss": 8.1242,
"loss/crossentropy": 2.724980354309082,
"loss/hidden": 0.1220703125,
"loss/logits": 0.010582932271063328,
"loss/reg": 5.266530990600586,
"loss/twn": 0.0,
"step": 259
},
{
"epoch": 0.0065,
"grad_norm": 7.09375,
"grad_norm_var": 1890.6687133789062,
"learning_rate": 0.0001,
"loss": 7.1719,
"loss/crossentropy": 1.8638581037521362,
"loss/hidden": 0.0400390625,
"loss/logits": 0.002111276611685753,
"loss/reg": 5.265857696533203,
"loss/twn": 0.0,
"step": 260
},
{
"epoch": 0.006525,
"grad_norm": 12.9375,
"grad_norm_var": 1893.4873982747397,
"learning_rate": 0.0001,
"loss": 7.2962,
"loss/crossentropy": 1.825720191001892,
"loss/hidden": 0.189453125,
"loss/logits": 0.015128025785088539,
"loss/reg": 5.26585054397583,
"loss/twn": 0.0,
"step": 261
},
{
"epoch": 0.00655,
"grad_norm": 116.5,
"grad_norm_var": 2381.9933227539063,
"learning_rate": 0.0001,
"loss": 5.8142,
"loss/crossentropy": 0.3675730228424072,
"loss/hidden": 0.173828125,
"loss/logits": 0.0077675022184848785,
"loss/reg": 5.265065670013428,
"loss/twn": 0.0,
"step": 262
},
{
"epoch": 0.006575,
"grad_norm": 10.875,
"grad_norm_var": 2397.895048014323,
"learning_rate": 0.0001,
"loss": 7.9558,
"loss/crossentropy": 2.6135733127593994,
"loss/hidden": 0.07177734375,
"loss/logits": 0.005567469634115696,
"loss/reg": 5.264838695526123,
"loss/twn": 0.0,
"step": 263
},
{
"epoch": 0.0066,
"grad_norm": 26.75,
"grad_norm_var": 2370.424247233073,
"learning_rate": 0.0001,
"loss": 7.1463,
"loss/crossentropy": 1.744168996810913,
"loss/hidden": 0.130859375,
"loss/logits": 0.006982623599469662,
"loss/reg": 5.264274597167969,
"loss/twn": 0.0,
"step": 264
},
{
"epoch": 0.006625,
"grad_norm": 12.9375,
"grad_norm_var": 2357.603544108073,
"learning_rate": 0.0001,
"loss": 6.835,
"loss/crossentropy": 1.3916579484939575,
"loss/hidden": 0.16796875,
"loss/logits": 0.011533312499523163,
"loss/reg": 5.263826847076416,
"loss/twn": 0.0,
"step": 265
},
{
"epoch": 0.00665,
"grad_norm": 86.0,
"grad_norm_var": 2485.682157389323,
"learning_rate": 0.0001,
"loss": 8.1388,
"loss/crossentropy": 2.743070363998413,
"loss/hidden": 0.11962890625,
"loss/logits": 0.012774601578712463,
"loss/reg": 5.263358116149902,
"loss/twn": 0.0,
"step": 266
},
{
"epoch": 0.006675,
"grad_norm": 20.125,
"grad_norm_var": 1173.6974243164063,
"learning_rate": 0.0001,
"loss": 7.547,
"loss/crossentropy": 2.15224289894104,
"loss/hidden": 0.126953125,
"loss/logits": 0.004906866233795881,
"loss/reg": 5.262901782989502,
"loss/twn": 0.0,
"step": 267
},
{
"epoch": 0.0067,
"grad_norm": 8.25,
"grad_norm_var": 1172.3426066080729,
"learning_rate": 0.0001,
"loss": 8.0178,
"loss/crossentropy": 2.7544662952423096,
"loss/hidden": 2.9206275939941406e-06,
"loss/logits": 0.0010420402977615595,
"loss/reg": 5.262295246124268,
"loss/twn": 0.0,
"step": 268
},
{
"epoch": 0.006725,
"grad_norm": 10.5625,
"grad_norm_var": 1172.7845011393229,
"learning_rate": 0.0001,
"loss": 7.2573,
"loss/crossentropy": 1.708762288093567,
"loss/hidden": 0.27734375,
"loss/logits": 0.008786465972661972,
"loss/reg": 5.262362480163574,
"loss/twn": 0.0,
"step": 269
},
{
"epoch": 0.00675,
"grad_norm": 8.6875,
"grad_norm_var": 1187.023075358073,
"learning_rate": 0.0001,
"loss": 7.3198,
"loss/crossentropy": 1.9545139074325562,
"loss/hidden": 0.09619140625,
"loss/logits": 0.0075501929968595505,
"loss/reg": 5.261580467224121,
"loss/twn": 0.0,
"step": 270
},
{
"epoch": 0.006775,
"grad_norm": 27.75,
"grad_norm_var": 1165.726688639323,
"learning_rate": 0.0001,
"loss": 7.9432,
"loss/crossentropy": 2.5177128314971924,
"loss/hidden": 0.1552734375,
"loss/logits": 0.008861662819981575,
"loss/reg": 5.261343955993652,
"loss/twn": 0.0,
"step": 271
},
{
"epoch": 0.0068,
"grad_norm": 12.75,
"grad_norm_var": 1162.0217732747396,
"learning_rate": 0.0001,
"loss": 7.2029,
"loss/crossentropy": 1.8247922658920288,
"loss/hidden": 0.1103515625,
"loss/logits": 0.00664330180734396,
"loss/reg": 5.261136054992676,
"loss/twn": 0.0,
"step": 272
},
{
"epoch": 0.006825,
"grad_norm": 24.625,
"grad_norm_var": 951.5283162434896,
"learning_rate": 0.0001,
"loss": 6.3479,
"loss/crossentropy": 0.8956549167633057,
"loss/hidden": 0.1845703125,
"loss/logits": 0.00696325721219182,
"loss/reg": 5.2607598304748535,
"loss/twn": 0.0,
"step": 273
},
{
"epoch": 0.00685,
"grad_norm": 15.3125,
"grad_norm_var": 945.0106079101563,
"learning_rate": 0.0001,
"loss": 7.1854,
"loss/crossentropy": 1.796543002128601,
"loss/hidden": 0.12255859375,
"loss/logits": 0.005927722901105881,
"loss/reg": 5.26037073135376,
"loss/twn": 0.0,
"step": 274
},
{
"epoch": 0.006875,
"grad_norm": 18.625,
"grad_norm_var": 930.2756469726562,
"learning_rate": 0.0001,
"loss": 8.1342,
"loss/crossentropy": 2.680659770965576,
"loss/hidden": 0.181640625,
"loss/logits": 0.011690370738506317,
"loss/reg": 5.260243892669678,
"loss/twn": 0.0,
"step": 275
},
{
"epoch": 0.0069,
"grad_norm": 14.4375,
"grad_norm_var": 914.9025390625,
"learning_rate": 0.0001,
"loss": 8.5123,
"loss/crossentropy": 3.0964365005493164,
"loss/hidden": 0.1455078125,
"loss/logits": 0.010575573891401291,
"loss/reg": 5.259780406951904,
"loss/twn": 0.0,
"step": 276
},
{
"epoch": 0.006925,
"grad_norm": 12.6875,
"grad_norm_var": 915.3650390625,
"learning_rate": 0.0001,
"loss": 8.2582,
"loss/crossentropy": 2.825162172317505,
"loss/hidden": 0.15625,
"loss/logits": 0.01738206297159195,
"loss/reg": 5.259433746337891,
"loss/twn": 0.0,
"step": 277
},
{
"epoch": 0.00695,
"grad_norm": 21.625,
"grad_norm_var": 341.7171875,
"learning_rate": 0.0001,
"loss": 7.1032,
"loss/crossentropy": 1.7048217058181763,
"loss/hidden": 0.1279296875,
"loss/logits": 0.011121492832899094,
"loss/reg": 5.259332656860352,
"loss/twn": 0.0,
"step": 278
},
{
"epoch": 0.006975,
"grad_norm": 9.3125,
"grad_norm_var": 343.92706705729165,
"learning_rate": 0.0001,
"loss": 7.4175,
"loss/crossentropy": 1.9668402671813965,
"loss/hidden": 0.1826171875,
"loss/logits": 0.009085997007787228,
"loss/reg": 5.258953094482422,
"loss/twn": 0.0,
"step": 279
},
{
"epoch": 0.007,
"grad_norm": 14.3125,
"grad_norm_var": 343.48333333333335,
"learning_rate": 0.0001,
"loss": 7.5905,
"loss/crossentropy": 2.2082672119140625,
"loss/hidden": 0.1181640625,
"loss/logits": 0.005382226780056953,
"loss/reg": 5.258672714233398,
"loss/twn": 0.0,
"step": 280
},
{
"epoch": 0.007025,
"grad_norm": 17.75,
"grad_norm_var": 340.4792805989583,
"learning_rate": 0.0001,
"loss": 6.2281,
"loss/crossentropy": 0.711824893951416,
"loss/hidden": 0.251953125,
"loss/logits": 0.005929501727223396,
"loss/reg": 5.258391380310059,
"loss/twn": 0.0,
"step": 281
},
{
"epoch": 0.00705,
"grad_norm": 48.5,
"grad_norm_var": 99.24881184895834,
"learning_rate": 0.0001,
"loss": 6.6114,
"loss/crossentropy": 1.0958250761032104,
"loss/hidden": 0.25,
"loss/logits": 0.007801922038197517,
"loss/reg": 5.257816314697266,
"loss/twn": 0.0,
"step": 282
},
{
"epoch": 0.007075,
"grad_norm": 16.0,
"grad_norm_var": 99.05115559895833,
"learning_rate": 0.0001,
"loss": 6.7629,
"loss/crossentropy": 1.2992652654647827,
"loss/hidden": 0.1953125,
"loss/logits": 0.01093169767409563,
"loss/reg": 5.257413864135742,
"loss/twn": 0.0,
"step": 283
},
{
"epoch": 0.0071,
"grad_norm": 9.5,
"grad_norm_var": 97.594775390625,
"learning_rate": 0.0001,
"loss": 7.7528,
"loss/crossentropy": 2.3623929023742676,
"loss/hidden": 0.1279296875,
"loss/logits": 0.00520264683291316,
"loss/reg": 5.257322788238525,
"loss/twn": 0.0,
"step": 284
},
{
"epoch": 0.007125,
"grad_norm": 16.75,
"grad_norm_var": 94.1384765625,
"learning_rate": 0.0001,
"loss": 7.282,
"loss/crossentropy": 1.8802975416183472,
"loss/hidden": 0.1357421875,
"loss/logits": 0.008890845812857151,
"loss/reg": 5.2571001052856445,
"loss/twn": 0.0,
"step": 285
},
{
"epoch": 0.00715,
"grad_norm": 9.875,
"grad_norm_var": 92.745947265625,
"learning_rate": 0.0001,
"loss": 7.0654,
"loss/crossentropy": 1.632087230682373,
"loss/hidden": 0.1669921875,
"loss/logits": 0.009606104344129562,
"loss/reg": 5.256716728210449,
"loss/twn": 0.0,
"step": 286
},
{
"epoch": 0.007175,
"grad_norm": 29.25,
"grad_norm_var": 94.813916015625,
"learning_rate": 0.0001,
"loss": 7.8745,
"loss/crossentropy": 2.4687438011169434,
"loss/hidden": 0.140625,
"loss/logits": 0.00846975389868021,
"loss/reg": 5.256651878356934,
"loss/twn": 0.0,
"step": 287
},
{
"epoch": 0.0072,
"grad_norm": 28.25,
"grad_norm_var": 98.55167643229167,
"learning_rate": 0.0001,
"loss": 8.246,
"loss/crossentropy": 2.8182952404022217,
"loss/hidden": 0.158203125,
"loss/logits": 0.013284040614962578,
"loss/reg": 5.256263256072998,
"loss/twn": 0.0,
"step": 288
},
{
"epoch": 0.007225,
"grad_norm": 10.625,
"grad_norm_var": 100.62980143229167,
"learning_rate": 0.0001,
"loss": 8.0743,
"loss/crossentropy": 2.664623737335205,
"loss/hidden": 0.1435546875,
"loss/logits": 0.009994969703257084,
"loss/reg": 5.256109237670898,
"loss/twn": 0.0,
"step": 289
},
{
"epoch": 0.00725,
"grad_norm": 134.0,
"grad_norm_var": 933.7604166666666,
"learning_rate": 0.0001,
"loss": 8.0955,
"loss/crossentropy": 2.6748669147491455,
"loss/hidden": 0.158203125,
"loss/logits": 0.006934846751391888,
"loss/reg": 5.25545597076416,
"loss/twn": 0.0,
"step": 290
},
{
"epoch": 0.007275,
"grad_norm": 11.0625,
"grad_norm_var": 944.487744140625,
"learning_rate": 0.0001,
"loss": 7.7484,
"loss/crossentropy": 2.490112781524658,
"loss/hidden": 9.655952453613281e-06,
"loss/logits": 0.0029325929936021566,
"loss/reg": 5.255389213562012,
"loss/twn": 0.0,
"step": 291
},
{
"epoch": 0.0073,
"grad_norm": 9.375,
"grad_norm_var": 953.3853515625,
"learning_rate": 0.0001,
"loss": 7.3465,
"loss/crossentropy": 1.976689338684082,
"loss/hidden": 0.10791015625,
"loss/logits": 0.006814016494899988,
"loss/reg": 5.255037307739258,
"loss/twn": 0.0,
"step": 292
},
{
"epoch": 0.007325,
"grad_norm": 20.125,
"grad_norm_var": 944.7024576822917,
"learning_rate": 0.0001,
"loss": 7.0865,
"loss/crossentropy": 1.7022920846939087,
"loss/hidden": 0.12451171875,
"loss/logits": 0.004657519515603781,
"loss/reg": 5.25502347946167,
"loss/twn": 0.0,
"step": 293
},
{
"epoch": 0.00735,
"grad_norm": 10.8125,
"grad_norm_var": 957.44375,
"learning_rate": 0.0001,
"loss": 5.8776,
"loss/crossentropy": 0.5808318853378296,
"loss/hidden": 0.0400390625,
"loss/logits": 0.001980610191822052,
"loss/reg": 5.254761219024658,
"loss/twn": 0.0,
"step": 294
},
{
"epoch": 0.007375,
"grad_norm": 10.0,
"grad_norm_var": 956.0610514322917,
"learning_rate": 0.0001,
"loss": 7.66,
"loss/crossentropy": 2.271899700164795,
"loss/hidden": 0.125,
"loss/logits": 0.008504629135131836,
"loss/reg": 5.254581451416016,
"loss/twn": 0.0,
"step": 295
},
{
"epoch": 0.0074,
"grad_norm": 8.6875,
"grad_norm_var": 965.8755045572917,
"learning_rate": 0.0001,
"loss": 6.0481,
"loss/crossentropy": 0.639009952545166,
"loss/hidden": 0.1533203125,
"loss/logits": 0.0019615632481873035,
"loss/reg": 5.253849983215332,
"loss/twn": 0.0,
"step": 296
},
{
"epoch": 0.007425,
"grad_norm": 11.1875,
"grad_norm_var": 974.3947916666667,
"learning_rate": 0.0001,
"loss": 7.7422,
"loss/crossentropy": 2.3546876907348633,
"loss/hidden": 0.12451171875,
"loss/logits": 0.00921722687780857,
"loss/reg": 5.253818035125732,
"loss/twn": 0.0,
"step": 297
},
{
"epoch": 0.00745,
"grad_norm": 17.625,
"grad_norm_var": 933.1155598958334,
"learning_rate": 0.0001,
"loss": 8.3911,
"loss/crossentropy": 3.0292787551879883,
"loss/hidden": 0.10107421875,
"loss/logits": 0.00718055572360754,
"loss/reg": 5.253612518310547,
"loss/twn": 0.0,
"step": 298
},
{
"epoch": 0.007475,
"grad_norm": 440.0,
"grad_norm_var": 11825.940559895833,
"learning_rate": 0.0001,
"loss": 7.0072,
"loss/crossentropy": 1.5630390644073486,
"loss/hidden": 0.18359375,
"loss/logits": 0.007107208017259836,
"loss/reg": 5.253422737121582,
"loss/twn": 0.0,
"step": 299
},
{
"epoch": 0.0075,
"grad_norm": 10.875,
"grad_norm_var": 11818.895833333334,
"learning_rate": 0.0001,
"loss": 6.9788,
"loss/crossentropy": 1.483949065208435,
"loss/hidden": 0.2314453125,
"loss/logits": 0.010475615039467812,
"loss/reg": 5.25289249420166,
"loss/twn": 0.0,
"step": 300
},
{
"epoch": 0.007525,
"grad_norm": 9.8125,
"grad_norm_var": 11851.417171223959,
"learning_rate": 0.0001,
"loss": 7.5546,
"loss/crossentropy": 2.208834648132324,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006687905173748732,
"loss/reg": 5.2526960372924805,
"loss/twn": 0.0,
"step": 301
},
{
"epoch": 0.00755,
"grad_norm": 11.25,
"grad_norm_var": 11844.504931640626,
"learning_rate": 0.0001,
"loss": 7.1337,
"loss/crossentropy": 1.752866506576538,
"loss/hidden": 0.1201171875,
"loss/logits": 0.008473502472043037,
"loss/reg": 5.2522807121276855,
"loss/twn": 0.0,
"step": 302
},
{
"epoch": 0.007575,
"grad_norm": 20.5,
"grad_norm_var": 11871.525113932292,
"learning_rate": 0.0001,
"loss": 5.9332,
"loss/crossentropy": 0.6589277386665344,
"loss/hidden": 0.02099609375,
"loss/logits": 0.0013422563206404448,
"loss/reg": 5.251956462860107,
"loss/twn": 0.0,
"step": 303
},
{
"epoch": 0.0076,
"grad_norm": 11.5625,
"grad_norm_var": 11932.343229166667,
"learning_rate": 0.0001,
"loss": 7.0088,
"loss/crossentropy": 1.5739790201187134,
"loss/hidden": 0.1748046875,
"loss/logits": 0.007969235070049763,
"loss/reg": 5.2520952224731445,
"loss/twn": 0.0,
"step": 304
},
{
"epoch": 0.007625,
"grad_norm": 8.9375,
"grad_norm_var": 11940.642301432292,
"learning_rate": 0.0001,
"loss": 8.1038,
"loss/crossentropy": 2.782174587249756,
"loss/hidden": 0.064453125,
"loss/logits": 0.005289027933031321,
"loss/reg": 5.251874923706055,
"loss/twn": 0.0,
"step": 305
},
{
"epoch": 0.00765,
"grad_norm": 19.75,
"grad_norm_var": 11425.267692057292,
"learning_rate": 0.0001,
"loss": 8.1897,
"loss/crossentropy": 2.776104688644409,
"loss/hidden": 0.150390625,
"loss/logits": 0.011526349931955338,
"loss/reg": 5.251704692840576,
"loss/twn": 0.0,
"step": 306
},
{
"epoch": 0.007675,
"grad_norm": 7.9375,
"grad_norm_var": 11437.715608723958,
"learning_rate": 0.0001,
"loss": 8.9456,
"loss/crossentropy": 3.692781686782837,
"loss/hidden": 5.245208740234375e-06,
"loss/logits": 0.0012750843307003379,
"loss/reg": 5.251509189605713,
"loss/twn": 0.0,
"step": 307
},
{
"epoch": 0.0077,
"grad_norm": 69.0,
"grad_norm_var": 11422.188264973958,
"learning_rate": 0.0001,
"loss": 8.1468,
"loss/crossentropy": 2.5686264038085938,
"loss/hidden": 0.30078125,
"loss/logits": 0.025831755250692368,
"loss/reg": 5.251588821411133,
"loss/twn": 0.0,
"step": 308
},
{
"epoch": 0.007725,
"grad_norm": 13.375,
"grad_norm_var": 11445.626936848957,
"learning_rate": 0.0001,
"loss": 7.1097,
"loss/crossentropy": 1.660717248916626,
"loss/hidden": 0.185546875,
"loss/logits": 0.01224461942911148,
"loss/reg": 5.25119161605835,
"loss/twn": 0.0,
"step": 309
},
{
"epoch": 0.00775,
"grad_norm": 17.875,
"grad_norm_var": 11418.828059895834,
"learning_rate": 0.0001,
"loss": 8.2796,
"loss/crossentropy": 2.8778281211853027,
"loss/hidden": 0.1376953125,
"loss/logits": 0.013323968276381493,
"loss/reg": 5.250760555267334,
"loss/twn": 0.0,
"step": 310
},
{
"epoch": 0.007775,
"grad_norm": 10.25,
"grad_norm_var": 11417.731184895832,
"learning_rate": 0.0001,
"loss": 7.1638,
"loss/crossentropy": 1.7495626211166382,
"loss/hidden": 0.1533203125,
"loss/logits": 0.010259518399834633,
"loss/reg": 5.250608921051025,
"loss/twn": 0.0,
"step": 311
},
{
"epoch": 0.0078,
"grad_norm": 21.0,
"grad_norm_var": 11370.812223307292,
"learning_rate": 0.0001,
"loss": 7.2684,
"loss/crossentropy": 1.8736885786056519,
"loss/hidden": 0.1357421875,
"loss/logits": 0.008427501656115055,
"loss/reg": 5.250565528869629,
"loss/twn": 0.0,
"step": 312
},
{
"epoch": 0.007825,
"grad_norm": 171.0,
"grad_norm_var": 12271.96328125,
"learning_rate": 0.0001,
"loss": 6.804,
"loss/crossentropy": 1.4117506742477417,
"loss/hidden": 0.13671875,
"loss/logits": 0.004921610467135906,
"loss/reg": 5.250616550445557,
"loss/twn": 0.0,
"step": 313
},
{
"epoch": 0.00785,
"grad_norm": 21.75,
"grad_norm_var": 12253.1322265625,
"learning_rate": 0.0001,
"loss": 7.013,
"loss/crossentropy": 1.5148544311523438,
"loss/hidden": 0.23828125,
"loss/logits": 0.009858010336756706,
"loss/reg": 5.24995756149292,
"loss/twn": 0.0,
"step": 314
},
{
"epoch": 0.007875,
"grad_norm": 127.0,
"grad_norm_var": 2269.4103515625,
"learning_rate": 0.0001,
"loss": 8.0458,
"loss/crossentropy": 2.6658105850219727,
"loss/hidden": 0.12255859375,
"loss/logits": 0.007101866416633129,
"loss/reg": 5.250338077545166,
"loss/twn": 0.0,
"step": 315
},
{
"epoch": 0.0079,
"grad_norm": 49.25,
"grad_norm_var": 2240.6091145833334,
"learning_rate": 0.0001,
"loss": 6.9261,
"loss/crossentropy": 1.5474319458007812,
"loss/hidden": 0.12451171875,
"loss/logits": 0.004424188286066055,
"loss/reg": 5.249726295471191,
"loss/twn": 0.0,
"step": 316
},
{
"epoch": 0.007925,
"grad_norm": 11.0625,
"grad_norm_var": 2236.19375,
"learning_rate": 0.0001,
"loss": 7.9458,
"loss/crossentropy": 2.6948788166046143,
"loss/hidden": 3.7550926208496094e-06,
"loss/logits": 0.0011138684349134564,
"loss/reg": 5.249834060668945,
"loss/twn": 0.0,
"step": 317
},
{
"epoch": 0.00795,
"grad_norm": 14.625,
"grad_norm_var": 2225.3322265625,
"learning_rate": 0.0001,
"loss": 7.097,
"loss/crossentropy": 1.674680233001709,
"loss/hidden": 0.1533203125,
"loss/logits": 0.018878858536481857,
"loss/reg": 5.250132083892822,
"loss/twn": 0.0,
"step": 318
},
{
"epoch": 0.007975,
"grad_norm": 280.0,
"grad_norm_var": 5856.9806640625,
"learning_rate": 0.0001,
"loss": 6.8544,
"loss/crossentropy": 1.4390206336975098,
"loss/hidden": 0.158203125,
"loss/logits": 0.007244464010000229,
"loss/reg": 5.249932765960693,
"loss/twn": 0.0,
"step": 319
},
{
"epoch": 0.008,
"grad_norm": 17.75,
"grad_norm_var": 5824.858837890625,
"learning_rate": 0.0001,
"loss": 8.0011,
"loss/crossentropy": 2.620617389678955,
"loss/hidden": 0.12255859375,
"loss/logits": 0.008633976802229881,
"loss/reg": 5.249265193939209,
"loss/twn": 0.0,
"step": 320
},
{
"epoch": 0.008025,
"grad_norm": 18.625,
"grad_norm_var": 5772.79609375,
"learning_rate": 0.0001,
"loss": 6.3629,
"loss/crossentropy": 0.973727822303772,
"loss/hidden": 0.1337890625,
"loss/logits": 0.005988460034132004,
"loss/reg": 5.249377250671387,
"loss/twn": 0.0,
"step": 321
},
{
"epoch": 0.00805,
"grad_norm": 7.71875,
"grad_norm_var": 5837.412365722656,
"learning_rate": 0.0001,
"loss": 6.802,
"loss/crossentropy": 1.4281165599822998,
"loss/hidden": 0.1171875,
"loss/logits": 0.007195750251412392,
"loss/reg": 5.249452114105225,
"loss/twn": 0.0,
"step": 322
},
{
"epoch": 0.008075,
"grad_norm": 10.375,
"grad_norm_var": 5822.930822753906,
"learning_rate": 0.0001,
"loss": 8.0202,
"loss/crossentropy": 2.667816162109375,
"loss/hidden": 0.09619140625,
"loss/logits": 0.00719710998237133,
"loss/reg": 5.249003887176514,
"loss/twn": 0.0,
"step": 323
},
{
"epoch": 0.0081,
"grad_norm": 30.5,
"grad_norm_var": 5837.498661295573,
"learning_rate": 0.0001,
"loss": 8.5258,
"loss/crossentropy": 3.144439697265625,
"loss/hidden": 0.1171875,
"loss/logits": 0.01504556369036436,
"loss/reg": 5.249162197113037,
"loss/twn": 0.0,
"step": 324
},
{
"epoch": 0.008125,
"grad_norm": 10.5625,
"grad_norm_var": 5852.246708170573,
"learning_rate": 0.0001,
"loss": 7.8167,
"loss/crossentropy": 2.3962008953094482,
"loss/hidden": 0.1650390625,
"loss/logits": 0.006836063228547573,
"loss/reg": 5.248575210571289,
"loss/twn": 0.0,
"step": 325
},
{
"epoch": 0.00815,
"grad_norm": 486.0,
"grad_norm_var": 17467.963993326823,
"learning_rate": 0.0001,
"loss": 5.9967,
"loss/crossentropy": 0.583743691444397,
"loss/hidden": 0.1572265625,
"loss/logits": 0.007230043411254883,
"loss/reg": 5.248484134674072,
"loss/twn": 0.0,
"step": 326
},
{
"epoch": 0.008175,
"grad_norm": 10.8125,
"grad_norm_var": 17462.717508951824,
"learning_rate": 0.0001,
"loss": 7.1253,
"loss/crossentropy": 1.6490944623947144,
"loss/hidden": 0.2138671875,
"loss/logits": 0.01414478849619627,
"loss/reg": 5.248198509216309,
"loss/twn": 0.0,
"step": 327
},
{
"epoch": 0.0082,
"grad_norm": 10.125,
"grad_norm_var": 17556.386942545574,
"learning_rate": 0.0001,
"loss": 7.8925,
"loss/crossentropy": 2.5412166118621826,
"loss/hidden": 0.09619140625,
"loss/logits": 0.006635190453380346,
"loss/reg": 5.248410701751709,
"loss/twn": 0.0,
"step": 328
},
{
"epoch": 0.008225,
"grad_norm": 12.6875,
"grad_norm_var": 17198.204911295572,
"learning_rate": 0.0001,
"loss": 6.9945,
"loss/crossentropy": 1.603257656097412,
"loss/hidden": 0.1328125,
"loss/logits": 0.010111295618116856,
"loss/reg": 5.248310565948486,
"loss/twn": 0.0,
"step": 329
},
{
"epoch": 0.00825,
"grad_norm": 12.5,
"grad_norm_var": 17262.97177327474,
"learning_rate": 0.0001,
"loss": 5.7919,
"loss/crossentropy": 0.3630053400993347,
"loss/hidden": 0.177734375,
"loss/logits": 0.002837226027622819,
"loss/reg": 5.248295783996582,
"loss/twn": 0.0,
"step": 330
},
{
"epoch": 0.008275,
"grad_norm": 22.0,
"grad_norm_var": 17144.92880452474,
"learning_rate": 0.0001,
"loss": 8.1295,
"loss/crossentropy": 2.8160948753356934,
"loss/hidden": 0.0595703125,
"loss/logits": 0.005728469230234623,
"loss/reg": 5.24811315536499,
"loss/twn": 0.0,
"step": 331
},
{
"epoch": 0.0083,
"grad_norm": 17.125,
"grad_norm_var": 17267.413732910158,
"learning_rate": 0.0001,
"loss": 6.9959,
"loss/crossentropy": 1.5546507835388184,
"loss/hidden": 0.18359375,
"loss/logits": 0.010080805979669094,
"loss/reg": 5.247556209564209,
"loss/twn": 0.0,
"step": 332
},
{
"epoch": 0.008325,
"grad_norm": 11.5625,
"grad_norm_var": 17264.114904785158,
"learning_rate": 0.0001,
"loss": 7.8893,
"loss/crossentropy": 2.5168728828430176,
"loss/hidden": 0.115234375,
"loss/logits": 0.009305896237492561,
"loss/reg": 5.247858047485352,
"loss/twn": 0.0,
"step": 333
},
{
"epoch": 0.00835,
"grad_norm": 13.4375,
"grad_norm_var": 17271.515751139323,
"learning_rate": 0.0001,
"loss": 8.1715,
"loss/crossentropy": 2.723308563232422,
"loss/hidden": 0.1884765625,
"loss/logits": 0.01238995511084795,
"loss/reg": 5.247326374053955,
"loss/twn": 0.0,
"step": 334
},
{
"epoch": 0.008375,
"grad_norm": 13.0,
"grad_norm_var": 13921.291532389323,
"learning_rate": 0.0001,
"loss": 8.3945,
"loss/crossentropy": 3.096419334411621,
"loss/hidden": 0.04736328125,
"loss/logits": 0.0033957725390791893,
"loss/reg": 5.247368812561035,
"loss/twn": 0.0,
"step": 335
},
{
"epoch": 0.0084,
"grad_norm": 17.75,
"grad_norm_var": 13921.291532389323,
"learning_rate": 0.0001,
"loss": 7.3812,
"loss/crossentropy": 1.8540621995925903,
"loss/hidden": 0.2734375,
"loss/logits": 0.006610853597521782,
"loss/reg": 5.247133731842041,
"loss/twn": 0.0,
"step": 336
},
{
"epoch": 0.008425,
"grad_norm": 13.3125,
"grad_norm_var": 13941.063993326823,
"learning_rate": 0.0001,
"loss": 8.1344,
"loss/crossentropy": 2.7475340366363525,
"loss/hidden": 0.1328125,
"loss/logits": 0.007097205147147179,
"loss/reg": 5.246928691864014,
"loss/twn": 0.0,
"step": 337
},
{
"epoch": 0.00845,
"grad_norm": 68.5,
"grad_norm_var": 13880.22734375,
"learning_rate": 0.0001,
"loss": 7.8648,
"loss/crossentropy": 2.4544427394866943,
"loss/hidden": 0.15234375,
"loss/logits": 0.010997762903571129,
"loss/reg": 5.246999263763428,
"loss/twn": 0.0,
"step": 338
},
{
"epoch": 0.008475,
"grad_norm": 18.75,
"grad_norm_var": 13843.137434895832,
"learning_rate": 0.0001,
"loss": 7.9608,
"loss/crossentropy": 2.603945255279541,
"loss/hidden": 0.099609375,
"loss/logits": 0.010800717398524284,
"loss/reg": 5.246415615081787,
"loss/twn": 0.0,
"step": 339
},
{
"epoch": 0.0085,
"grad_norm": 29.5,
"grad_norm_var": 13845.5384765625,
"learning_rate": 0.0001,
"loss": 6.9841,
"loss/crossentropy": 1.614331603050232,
"loss/hidden": 0.1171875,
"loss/logits": 0.005967713892459869,
"loss/reg": 5.246609687805176,
"loss/twn": 0.0,
"step": 340
},
{
"epoch": 0.008525,
"grad_norm": 11.0625,
"grad_norm_var": 13843.059830729168,
"learning_rate": 0.0001,
"loss": 7.04,
"loss/crossentropy": 1.6529104709625244,
"loss/hidden": 0.130859375,
"loss/logits": 0.009600062854588032,
"loss/reg": 5.246581554412842,
"loss/twn": 0.0,
"step": 341
},
{
"epoch": 0.00855,
"grad_norm": 13.5,
"grad_norm_var": 203.06764322916666,
"learning_rate": 0.0001,
"loss": 7.3224,
"loss/crossentropy": 1.903311848640442,
"loss/hidden": 0.162109375,
"loss/logits": 0.010554994456470013,
"loss/reg": 5.246466636657715,
"loss/twn": 0.0,
"step": 342
},
{
"epoch": 0.008575,
"grad_norm": 21.125,
"grad_norm_var": 199.17628580729166,
"learning_rate": 0.0001,
"loss": 7.0821,
"loss/crossentropy": 1.5746668577194214,
"loss/hidden": 0.24609375,
"loss/logits": 0.014962641522288322,
"loss/reg": 5.246390342712402,
"loss/twn": 0.0,
"step": 343
},
{
"epoch": 0.0086,
"grad_norm": 14.4375,
"grad_norm_var": 195.16588541666667,
"learning_rate": 0.0001,
"loss": 7.2347,
"loss/crossentropy": 1.8215773105621338,
"loss/hidden": 0.1552734375,
"loss/logits": 0.01197369396686554,
"loss/reg": 5.24590539932251,
"loss/twn": 0.0,
"step": 344
},
{
"epoch": 0.008625,
"grad_norm": 109.0,
"grad_norm_var": 688.8426920572916,
"learning_rate": 0.0001,
"loss": 7.5554,
"loss/crossentropy": 2.1289989948272705,
"loss/hidden": 0.1630859375,
"loss/logits": 0.017385877668857574,
"loss/reg": 5.245935440063477,
"loss/twn": 0.0,
"step": 345
},
{
"epoch": 0.00865,
"grad_norm": 238.0,
"grad_norm_var": 3478.8179524739585,
"learning_rate": 0.0001,
"loss": 7.9943,
"loss/crossentropy": 2.593379020690918,
"loss/hidden": 0.142578125,
"loss/logits": 0.012457353994250298,
"loss/reg": 5.245845317840576,
"loss/twn": 0.0,
"step": 346
},
{
"epoch": 0.008675,
"grad_norm": 43.5,
"grad_norm_var": 3457.530712890625,
"learning_rate": 0.0001,
"loss": 7.1267,
"loss/crossentropy": 1.6978092193603516,
"loss/hidden": 0.1728515625,
"loss/logits": 0.010361634194850922,
"loss/reg": 5.245694637298584,
"loss/twn": 0.0,
"step": 347
},
{
"epoch": 0.0087,
"grad_norm": 24.875,
"grad_norm_var": 3436.771207682292,
"learning_rate": 0.0001,
"loss": 6.7434,
"loss/crossentropy": 1.4021539688110352,
"loss/hidden": 0.08837890625,
"loss/logits": 0.007280138321220875,
"loss/reg": 5.245609283447266,
"loss/twn": 0.0,
"step": 348
},
{
"epoch": 0.008725,
"grad_norm": 26.25,
"grad_norm_var": 3391.9552083333333,
"learning_rate": 0.0001,
"loss": 8.1297,
"loss/crossentropy": 2.7253024578094482,
"loss/hidden": 0.146484375,
"loss/logits": 0.012663663364946842,
"loss/reg": 5.245262145996094,
"loss/twn": 0.0,
"step": 349
},
{
"epoch": 0.00875,
"grad_norm": 10.875,
"grad_norm_var": 3402.2098795572915,
"learning_rate": 0.0001,
"loss": 7.2976,
"loss/crossentropy": 1.8926178216934204,
"loss/hidden": 0.146484375,
"loss/logits": 0.013426396995782852,
"loss/reg": 5.245053291320801,
"loss/twn": 0.0,
"step": 350
},
{
"epoch": 0.008775,
"grad_norm": 13.0625,
"grad_norm_var": 3401.967708333333,
"learning_rate": 0.0001,
"loss": 7.3178,
"loss/crossentropy": 1.8622666597366333,
"loss/hidden": 0.1953125,
"loss/logits": 0.015104337595403194,
"loss/reg": 5.245081424713135,
"loss/twn": 0.0,
"step": 351
},
{
"epoch": 0.0088,
"grad_norm": 29.875,
"grad_norm_var": 3371.8004557291665,
"learning_rate": 0.0001,
"loss": 7.9433,
"loss/crossentropy": 2.5869476795196533,
"loss/hidden": 0.10546875,
"loss/logits": 0.0058593666180968285,
"loss/reg": 5.2449822425842285,
"loss/twn": 0.0,
"step": 352
},
{
"epoch": 0.008825,
"grad_norm": 28.5,
"grad_norm_var": 3326.400113932292,
"learning_rate": 0.0001,
"loss": 7.0321,
"loss/crossentropy": 1.740645408630371,
"loss/hidden": 0.04248046875,
"loss/logits": 0.004086637869477272,
"loss/reg": 5.244920253753662,
"loss/twn": 0.0,
"step": 353
},
{
"epoch": 0.00885,
"grad_norm": 11.75,
"grad_norm_var": 3340.7945149739585,
"learning_rate": 0.0001,
"loss": 7.3477,
"loss/crossentropy": 1.9114320278167725,
"loss/hidden": 0.1787109375,
"loss/logits": 0.012824185192584991,
"loss/reg": 5.244693279266357,
"loss/twn": 0.0,
"step": 354
},
{
"epoch": 0.008875,
"grad_norm": 9.6875,
"grad_norm_var": 3371.9114583333335,
"learning_rate": 0.0001,
"loss": 6.4325,
"loss/crossentropy": 1.1136820316314697,
"loss/hidden": 0.0732421875,
"loss/logits": 0.001058733556419611,
"loss/reg": 5.244504451751709,
"loss/twn": 0.0,
"step": 355
},
{
"epoch": 0.0089,
"grad_norm": 9.625,
"grad_norm_var": 3423.5968098958333,
"learning_rate": 0.0001,
"loss": 7.5243,
"loss/crossentropy": 2.1711511611938477,
"loss/hidden": 0.0986328125,
"loss/logits": 0.009911064058542252,
"loss/reg": 5.244630813598633,
"loss/twn": 0.0,
"step": 356
},
{
"epoch": 0.008925,
"grad_norm": 13.25,
"grad_norm_var": 3415.909228515625,
"learning_rate": 0.0001,
"loss": 7.3907,
"loss/crossentropy": 1.9722574949264526,
"loss/hidden": 0.1611328125,
"loss/logits": 0.012831033207476139,
"loss/reg": 5.244527816772461,
"loss/twn": 0.0,
"step": 357
},
{
"epoch": 0.00895,
"grad_norm": 258.0,
"grad_norm_var": 6334.500634765625,
"learning_rate": 0.0001,
"loss": 6.6838,
"loss/crossentropy": 1.2551920413970947,
"loss/hidden": 0.1796875,
"loss/logits": 0.004792730323970318,
"loss/reg": 5.244173526763916,
"loss/twn": 0.0,
"step": 358
},
{
"epoch": 0.008975,
"grad_norm": 9.375,
"grad_norm_var": 6394.419514973958,
"learning_rate": 0.0001,
"loss": 7.731,
"loss/crossentropy": 2.3687314987182617,
"loss/hidden": 0.10791015625,
"loss/logits": 0.010197984986007214,
"loss/reg": 5.244191646575928,
"loss/twn": 0.0,
"step": 359
},
{
"epoch": 0.009,
"grad_norm": 10.75,
"grad_norm_var": 6414.292643229167,
"learning_rate": 0.0001,
"loss": 6.3021,
"loss/crossentropy": 0.8574244379997253,
"loss/hidden": 0.19140625,
"loss/logits": 0.009014951065182686,
"loss/reg": 5.244270324707031,
"loss/twn": 0.0,
"step": 360
},
{
"epoch": 0.009025,
"grad_norm": 12.0,
"grad_norm_var": 6276.774934895833,
"learning_rate": 0.0001,
"loss": 7.8089,
"loss/crossentropy": 2.456528902053833,
"loss/hidden": 0.09619140625,
"loss/logits": 0.012111629359424114,
"loss/reg": 5.244028091430664,
"loss/twn": 0.0,
"step": 361
},
{
"epoch": 0.00905,
"grad_norm": 12.6875,
"grad_norm_var": 3706.745556640625,
"learning_rate": 0.0001,
"loss": 6.1318,
"loss/crossentropy": 0.654383659362793,
"loss/hidden": 0.2255859375,
"loss/logits": 0.00800924189388752,
"loss/reg": 5.243789196014404,
"loss/twn": 0.0,
"step": 362
},
{
"epoch": 0.009075,
"grad_norm": 60.0,
"grad_norm_var": 3747.402587890625,
"learning_rate": 0.0001,
"loss": 5.8096,
"loss/crossentropy": 0.38534435629844666,
"loss/hidden": 0.17578125,
"loss/logits": 0.004911348223686218,
"loss/reg": 5.243527412414551,
"loss/twn": 0.0,
"step": 363
},
{
"epoch": 0.0091,
"grad_norm": 8.625,
"grad_norm_var": 3783.2118326822915,
"learning_rate": 0.0001,
"loss": 7.0141,
"loss/crossentropy": 1.7209011316299438,
"loss/hidden": 0.04736328125,
"loss/logits": 0.002276923507452011,
"loss/reg": 5.243542671203613,
"loss/twn": 0.0,
"step": 364
},
{
"epoch": 0.009125,
"grad_norm": 700.0,
"grad_norm_var": 31568.732014973957,
"learning_rate": 0.0001,
"loss": 6.5463,
"loss/crossentropy": 1.1213988065719604,
"loss/hidden": 0.171875,
"loss/logits": 0.009275542572140694,
"loss/reg": 5.243773937225342,
"loss/twn": 0.0,
"step": 365
},
{
"epoch": 0.00915,
"grad_norm": 9.5,
"grad_norm_var": 31580.584228515625,
"learning_rate": 0.0001,
"loss": 7.2758,
"loss/crossentropy": 1.888658046722412,
"loss/hidden": 0.1328125,
"loss/logits": 0.011050861328840256,
"loss/reg": 5.243287086486816,
"loss/twn": 0.0,
"step": 366
},
{
"epoch": 0.009175,
"grad_norm": 7.6875,
"grad_norm_var": 31626.63006184896,
"learning_rate": 0.0001,
"loss": 6.2879,
"loss/crossentropy": 0.9124002456665039,
"loss/hidden": 0.1220703125,
"loss/logits": 0.00983446091413498,
"loss/reg": 5.243640899658203,
"loss/twn": 0.0,
"step": 367
},
{
"epoch": 0.0092,
"grad_norm": 17.75,
"grad_norm_var": 31707.892822265625,
"learning_rate": 0.0001,
"loss": 8.2629,
"loss/crossentropy": 2.844151735305786,
"loss/hidden": 0.16015625,
"loss/logits": 0.015287065878510475,
"loss/reg": 5.243282794952393,
"loss/twn": 0.0,
"step": 368
},
{
"epoch": 0.009225,
"grad_norm": 10.8125,
"grad_norm_var": 31834.040625,
"learning_rate": 0.0001,
"loss": 7.021,
"loss/crossentropy": 1.6278934478759766,
"loss/hidden": 0.142578125,
"loss/logits": 0.007217081263661385,
"loss/reg": 5.243272304534912,
"loss/twn": 0.0,
"step": 369
},
{
"epoch": 0.00925,
"grad_norm": 8.875,
"grad_norm_var": 31857.8806640625,
"learning_rate": 0.0001,
"loss": 7.2069,
"loss/crossentropy": 1.8656002283096313,
"loss/hidden": 0.09375,
"loss/logits": 0.004388316534459591,
"loss/reg": 5.243135452270508,
"loss/twn": 0.0,
"step": 370
},
{
"epoch": 0.009275,
"grad_norm": 14.0625,
"grad_norm_var": 31822.486458333333,
"learning_rate": 0.0001,
"loss": 6.0393,
"loss/crossentropy": 0.5794708132743835,
"loss/hidden": 0.2080078125,
"loss/logits": 0.008730066008865833,
"loss/reg": 5.243083953857422,
"loss/twn": 0.0,
"step": 371
},
{
"epoch": 0.0093,
"grad_norm": 10.625,
"grad_norm_var": 31814.140625,
"learning_rate": 0.0001,
"loss": 7.7146,
"loss/crossentropy": 2.3672714233398438,
"loss/hidden": 0.0986328125,
"loss/logits": 0.005637788213789463,
"loss/reg": 5.243073463439941,
"loss/twn": 0.0,
"step": 372
},
{
"epoch": 0.009325,
"grad_norm": 11.125,
"grad_norm_var": 31831.281184895834,
"learning_rate": 0.0001,
"loss": 7.1862,
"loss/crossentropy": 1.8489041328430176,
"loss/hidden": 0.08837890625,
"loss/logits": 0.00582461804151535,
"loss/reg": 5.243066787719727,
"loss/twn": 0.0,
"step": 373
},
{
"epoch": 0.00935,
"grad_norm": 10.3125,
"grad_norm_var": 29543.332405598958,
"learning_rate": 0.0001,
"loss": 8.1468,
"loss/crossentropy": 2.779754877090454,
"loss/hidden": 0.11279296875,
"loss/logits": 0.011424477212131023,
"loss/reg": 5.242800235748291,
"loss/twn": 0.0,
"step": 374
},
{
"epoch": 0.009375,
"grad_norm": 13.5625,
"grad_norm_var": 29517.761393229168,
"learning_rate": 0.0001,
"loss": 5.9118,
"loss/crossentropy": 0.5431471467018127,
"loss/hidden": 0.1171875,
"loss/logits": 0.008679039776325226,
"loss/reg": 5.242814064025879,
"loss/twn": 0.0,
"step": 375
},
{
"epoch": 0.0094,
"grad_norm": 13.3125,
"grad_norm_var": 29502.23357747396,
"learning_rate": 0.0001,
"loss": 7.1375,
"loss/crossentropy": 1.6137068271636963,
"loss/hidden": 0.267578125,
"loss/logits": 0.013521241024136543,
"loss/reg": 5.2426886558532715,
"loss/twn": 0.0,
"step": 376
},
{
"epoch": 0.009425,
"grad_norm": 20.25,
"grad_norm_var": 29456.37303059896,
"learning_rate": 0.0001,
"loss": 8.1117,
"loss/crossentropy": 2.5268547534942627,
"loss/hidden": 0.328125,
"loss/logits": 0.014234257861971855,
"loss/reg": 5.242476463317871,
"loss/twn": 0.0,
"step": 377
},
{
"epoch": 0.00945,
"grad_norm": 12.625,
"grad_norm_var": 29456.751497395835,
"learning_rate": 0.0001,
"loss": 7.0252,
"loss/crossentropy": 1.6614928245544434,
"loss/hidden": 0.1123046875,
"loss/logits": 0.009039688855409622,
"loss/reg": 5.242366313934326,
"loss/twn": 0.0,
"step": 378
},
{
"epoch": 0.009475,
"grad_norm": 12.375,
"grad_norm_var": 29586.256770833334,
"learning_rate": 0.0001,
"loss": 7.3434,
"loss/crossentropy": 1.9416403770446777,
"loss/hidden": 0.1513671875,
"loss/logits": 0.008125634863972664,
"loss/reg": 5.242313385009766,
"loss/twn": 0.0,
"step": 379
},
{
"epoch": 0.0095,
"grad_norm": 9.3125,
"grad_norm_var": 29582.02667643229,
"learning_rate": 0.0001,
"loss": 7.2748,
"loss/crossentropy": 1.8965728282928467,
"loss/hidden": 0.12890625,
"loss/logits": 0.0069448379799723625,
"loss/reg": 5.242362976074219,
"loss/twn": 0.0,
"step": 380
},
{
"epoch": 0.009525,
"grad_norm": 10.75,
"grad_norm_var": 10.688785807291667,
"learning_rate": 0.0001,
"loss": 7.591,
"loss/crossentropy": 2.19319224357605,
"loss/hidden": 0.146484375,
"loss/logits": 0.009041574783623219,
"loss/reg": 5.242304801940918,
"loss/twn": 0.0,
"step": 381
},
{
"epoch": 0.00955,
"grad_norm": 17.875,
"grad_norm_var": 12.215478515625,
"learning_rate": 0.0001,
"loss": 8.1746,
"loss/crossentropy": 2.799837827682495,
"loss/hidden": 0.1220703125,
"loss/logits": 0.010410355404019356,
"loss/reg": 5.242269992828369,
"loss/twn": 0.0,
"step": 382
},
{
"epoch": 0.009575,
"grad_norm": 142.0,
"grad_norm_var": 1052.0530598958333,
"learning_rate": 0.0001,
"loss": 5.5973,
"loss/crossentropy": 0.23081077635288239,
"loss/hidden": 0.119140625,
"loss/logits": 0.00518256239593029,
"loss/reg": 5.242154121398926,
"loss/twn": 0.0,
"step": 383
},
{
"epoch": 0.0096,
"grad_norm": 178.0,
"grad_norm_var": 2588.1160807291667,
"learning_rate": 0.0001,
"loss": 7.9228,
"loss/crossentropy": 2.5279970169067383,
"loss/hidden": 0.146484375,
"loss/logits": 0.006114904303103685,
"loss/reg": 5.242175102233887,
"loss/twn": 0.0,
"step": 384
},
{
"epoch": 0.009625,
"grad_norm": 9.6875,
"grad_norm_var": 2591.222135416667,
"learning_rate": 0.0001,
"loss": 8.0683,
"loss/crossentropy": 2.7547900676727295,
"loss/hidden": 0.064453125,
"loss/logits": 0.007344301789999008,
"loss/reg": 5.241701602935791,
"loss/twn": 0.0,
"step": 385
},
{
"epoch": 0.00965,
"grad_norm": 15.4375,
"grad_norm_var": 2574.622770182292,
"learning_rate": 0.0001,
"loss": 8.1417,
"loss/crossentropy": 2.6952314376831055,
"loss/hidden": 0.185546875,
"loss/logits": 0.019006717950105667,
"loss/reg": 5.241945743560791,
"loss/twn": 0.0,
"step": 386
},
{
"epoch": 0.009675,
"grad_norm": 10.5625,
"grad_norm_var": 2583.447509765625,
"learning_rate": 0.0001,
"loss": 7.02,
"loss/crossentropy": 1.5960569381713867,
"loss/hidden": 0.16796875,
"loss/logits": 0.014128390699625015,
"loss/reg": 5.241806507110596,
"loss/twn": 0.0,
"step": 387
},
{
"epoch": 0.0097,
"grad_norm": 9.875,
"grad_norm_var": 2585.531494140625,
"learning_rate": 0.0001,
"loss": 7.6158,
"loss/crossentropy": 2.310436487197876,
"loss/hidden": 0.05712890625,
"loss/logits": 0.006508246064186096,
"loss/reg": 5.241683006286621,
"loss/twn": 0.0,
"step": 388
},
{
"epoch": 0.009725,
"grad_norm": 13.625,
"grad_norm_var": 2579.2749837239585,
"learning_rate": 0.0001,
"loss": 8.1291,
"loss/crossentropy": 2.7542238235473633,
"loss/hidden": 0.12255859375,
"loss/logits": 0.010532179847359657,
"loss/reg": 5.241789817810059,
"loss/twn": 0.0,
"step": 389
},
{
"epoch": 0.00975,
"grad_norm": 8.0625,
"grad_norm_var": 2585.864436848958,
"learning_rate": 0.0001,
"loss": 7.6721,
"loss/crossentropy": 2.319371223449707,
"loss/hidden": 0.107421875,
"loss/logits": 0.003740239655598998,
"loss/reg": 5.241525650024414,
"loss/twn": 0.0,
"step": 390
},
{
"epoch": 0.009775,
"grad_norm": 10.4375,
"grad_norm_var": 2593.774593098958,
"learning_rate": 0.0001,
"loss": 8.1857,
"loss/crossentropy": 2.8464860916137695,
"loss/hidden": 0.09130859375,
"loss/logits": 0.006419371347874403,
"loss/reg": 5.241455078125,
"loss/twn": 0.0,
"step": 391
},
{
"epoch": 0.0098,
"grad_norm": 17.125,
"grad_norm_var": 2585.749479166667,
"learning_rate": 0.0001,
"loss": 8.0076,
"loss/crossentropy": 2.555150032043457,
"loss/hidden": 0.2001953125,
"loss/logits": 0.01086280308663845,
"loss/reg": 5.241420745849609,
"loss/twn": 0.0,
"step": 392
},
{
"epoch": 0.009825,
"grad_norm": 9.0,
"grad_norm_var": 2609.972135416667,
"learning_rate": 0.0001,
"loss": 6.3665,
"loss/crossentropy": 1.016921877861023,
"loss/hidden": 0.10205078125,
"loss/logits": 0.005952711217105389,
"loss/reg": 5.241562843322754,
"loss/twn": 0.0,
"step": 393
},
{
"epoch": 0.00985,
"grad_norm": 10.875,
"grad_norm_var": 2614.3161458333334,
"learning_rate": 0.0001,
"loss": 7.8429,
"loss/crossentropy": 2.5393762588500977,
"loss/hidden": 0.05712890625,
"loss/logits": 0.005034131929278374,
"loss/reg": 5.241337299346924,
"loss/twn": 0.0,
"step": 394
},
{
"epoch": 0.009875,
"grad_norm": 80.0,
"grad_norm_var": 2738.4009765625,
"learning_rate": 0.0001,
"loss": 8.0734,
"loss/crossentropy": 2.591184377670288,
"loss/hidden": 0.2294921875,
"loss/logits": 0.01196893397718668,
"loss/reg": 5.24077033996582,
"loss/twn": 0.0,
"step": 395
},
{
"epoch": 0.0099,
"grad_norm": 20.375,
"grad_norm_var": 2708.840478515625,
"learning_rate": 0.0001,
"loss": 8.3222,
"loss/crossentropy": 2.96229887008667,
"loss/hidden": 0.10791015625,
"loss/logits": 0.010710952803492546,
"loss/reg": 5.2412495613098145,
"loss/twn": 0.0,
"step": 396
},
{
"epoch": 0.009925,
"grad_norm": 14.0,
"grad_norm_var": 2698.892431640625,
"learning_rate": 0.0001,
"loss": 7.2185,
"loss/crossentropy": 1.843474268913269,
"loss/hidden": 0.125,
"loss/logits": 0.009277286008000374,
"loss/reg": 5.240739345550537,
"loss/twn": 0.0,
"step": 397
},
{
"epoch": 0.00995,
"grad_norm": 11.0,
"grad_norm_var": 2717.9419108072916,
"learning_rate": 0.0001,
"loss": 7.7714,
"loss/crossentropy": 2.3742737770080566,
"loss/hidden": 0.146484375,
"loss/logits": 0.009680403396487236,
"loss/reg": 5.240973472595215,
"loss/twn": 0.0,
"step": 398
},
{
"epoch": 0.009975,
"grad_norm": 12.0,
"grad_norm_var": 1919.5929524739583,
"learning_rate": 0.0001,
"loss": 8.0069,
"loss/crossentropy": 2.649110794067383,
"loss/hidden": 0.10791015625,
"loss/logits": 0.00901185255497694,
"loss/reg": 5.24085807800293,
"loss/twn": 0.0,
"step": 399
},
{
"epoch": 0.01,
"grad_norm": 9.5,
"grad_norm_var": 298.921337890625,
"learning_rate": 0.0001,
"loss": 7.3041,
"loss/crossentropy": 1.9669857025146484,
"loss/hidden": 0.0908203125,
"loss/logits": 0.005569307133555412,
"loss/reg": 5.240681171417236,
"loss/twn": 0.0,
"step": 400
},
{
"epoch": 0.010025,
"grad_norm": 169.0,
"grad_norm_var": 1743.7280598958334,
"learning_rate": 0.0001,
"loss": 5.7116,
"loss/crossentropy": 0.3063473105430603,
"loss/hidden": 0.1611328125,
"loss/logits": 0.003124656155705452,
"loss/reg": 5.2410149574279785,
"loss/twn": 0.0,
"step": 401
},
{
"epoch": 0.01005,
"grad_norm": 61.75,
"grad_norm_var": 1810.6761555989583,
"learning_rate": 0.0001,
"loss": 7.4546,
"loss/crossentropy": 2.0817315578460693,
"loss/hidden": 0.1279296875,
"loss/logits": 0.004020760301500559,
"loss/reg": 5.240888595581055,
"loss/twn": 0.0,
"step": 402
},
{
"epoch": 0.010075,
"grad_norm": 19.25,
"grad_norm_var": 1793.8056640625,
"learning_rate": 0.0001,
"loss": 6.645,
"loss/crossentropy": 1.261731743812561,
"loss/hidden": 0.13671875,
"loss/logits": 0.006218242458999157,
"loss/reg": 5.240310192108154,
"loss/twn": 0.0,
"step": 403
},
{
"epoch": 0.0101,
"grad_norm": 8.875,
"grad_norm_var": 1796.5171223958334,
"learning_rate": 0.0001,
"loss": 6.3703,
"loss/crossentropy": 0.9765498638153076,
"loss/hidden": 0.1455078125,
"loss/logits": 0.007485189475119114,
"loss/reg": 5.240753650665283,
"loss/twn": 0.0,
"step": 404
},
{
"epoch": 0.010125,
"grad_norm": 10.8125,
"grad_norm_var": 1803.0320149739584,
"learning_rate": 0.0001,
"loss": 7.0126,
"loss/crossentropy": 1.6325502395629883,
"loss/hidden": 0.1318359375,
"loss/logits": 0.007993818260729313,
"loss/reg": 5.240240097045898,
"loss/twn": 0.0,
"step": 405
},
{
"epoch": 0.01015,
"grad_norm": 41.25,
"grad_norm_var": 1776.9919270833334,
"learning_rate": 0.0001,
"loss": 6.1547,
"loss/crossentropy": 0.801928699016571,
"loss/hidden": 0.10986328125,
"loss/logits": 0.002597600221633911,
"loss/reg": 5.240261554718018,
"loss/twn": 0.0,
"step": 406
},
{
"epoch": 0.010175,
"grad_norm": 15.4375,
"grad_norm_var": 1764.4606770833334,
"learning_rate": 0.0001,
"loss": 7.3067,
"loss/crossentropy": 1.9361686706542969,
"loss/hidden": 0.11962890625,
"loss/logits": 0.01030397042632103,
"loss/reg": 5.240562438964844,
"loss/twn": 0.0,
"step": 407
},
{
"epoch": 0.0102,
"grad_norm": 9.375,
"grad_norm_var": 1783.4723958333334,
"learning_rate": 0.0001,
"loss": 8.0074,
"loss/crossentropy": 2.654067039489746,
"loss/hidden": 0.10546875,
"loss/logits": 0.0074330884963274,
"loss/reg": 5.240452766418457,
"loss/twn": 0.0,
"step": 408
},
{
"epoch": 0.010225,
"grad_norm": 8.0625,
"grad_norm_var": 1786.3281087239584,
"learning_rate": 0.0001,
"loss": 7.3042,
"loss/crossentropy": 1.928905963897705,
"loss/hidden": 0.125,
"loss/logits": 0.009886398911476135,
"loss/reg": 5.240396499633789,
"loss/twn": 0.0,
"step": 409
},
{
"epoch": 0.01025,
"grad_norm": 11.8125,
"grad_norm_var": 1783.8239583333334,
"learning_rate": 0.0001,
"loss": 7.1916,
"loss/crossentropy": 1.7467641830444336,
"loss/hidden": 0.1943359375,
"loss/logits": 0.010304899886250496,
"loss/reg": 5.2401838302612305,
"loss/twn": 0.0,
"step": 410
},
{
"epoch": 0.010275,
"grad_norm": 9.6875,
"grad_norm_var": 1637.2480305989584,
"learning_rate": 0.0001,
"loss": 7.8017,
"loss/crossentropy": 2.478278636932373,
"loss/hidden": 0.07666015625,
"loss/logits": 0.0064398422837257385,
"loss/reg": 5.2403669357299805,
"loss/twn": 0.0,
"step": 411
},
{
"epoch": 0.0103,
"grad_norm": 13.75,
"grad_norm_var": 1645.8536295572917,
"learning_rate": 0.0001,
"loss": 5.9725,
"loss/crossentropy": 0.5244819521903992,
"loss/hidden": 0.1982421875,
"loss/logits": 0.009226880967617035,
"loss/reg": 5.24050760269165,
"loss/twn": 0.0,
"step": 412
},
{
"epoch": 0.010325,
"grad_norm": 12.75,
"grad_norm_var": 1648.0508951822917,
"learning_rate": 0.0001,
"loss": 6.1233,
"loss/crossentropy": 0.6444658041000366,
"loss/hidden": 0.228515625,
"loss/logits": 0.009683252312242985,
"loss/reg": 5.240647792816162,
"loss/twn": 0.0,
"step": 413
},
{
"epoch": 0.01035,
"grad_norm": 30.375,
"grad_norm_var": 1631.4206868489584,
"learning_rate": 0.0001,
"loss": 7.3483,
"loss/crossentropy": 1.942959189414978,
"loss/hidden": 0.150390625,
"loss/logits": 0.014488045126199722,
"loss/reg": 5.2405009269714355,
"loss/twn": 0.0,
"step": 414
},
{
"epoch": 0.010375,
"grad_norm": 148.0,
"grad_norm_var": 2502.174853515625,
"learning_rate": 0.0001,
"loss": 5.7032,
"loss/crossentropy": 0.21244874596595764,
"loss/hidden": 0.2451171875,
"loss/logits": 0.005563709884881973,
"loss/reg": 5.240046501159668,
"loss/twn": 0.0,
"step": 415
},
{
"epoch": 0.0104,
"grad_norm": 11.3125,
"grad_norm_var": 2495.9203125,
"learning_rate": 0.0001,
"loss": 8.1335,
"loss/crossentropy": 2.8577675819396973,
"loss/hidden": 0.0302734375,
"loss/logits": 0.005365458317101002,
"loss/reg": 5.240131378173828,
"loss/twn": 0.0,
"step": 416
},
{
"epoch": 0.010425,
"grad_norm": 10.8125,
"grad_norm_var": 1261.934228515625,
"learning_rate": 0.0001,
"loss": 6.9774,
"loss/crossentropy": 1.5951570272445679,
"loss/hidden": 0.134765625,
"loss/logits": 0.007291505113244057,
"loss/reg": 5.240211486816406,
"loss/twn": 0.0,
"step": 417
},
{
"epoch": 0.01045,
"grad_norm": 9.875,
"grad_norm_var": 1186.0130045572917,
"learning_rate": 0.0001,
"loss": 7.6738,
"loss/crossentropy": 2.354665756225586,
"loss/hidden": 0.07421875,
"loss/logits": 0.004681308753788471,
"loss/reg": 5.240237236022949,
"loss/twn": 0.0,
"step": 418
},
{
"epoch": 0.010475,
"grad_norm": 18.0,
"grad_norm_var": 1186.7714680989584,
"learning_rate": 0.0001,
"loss": 7.9042,
"loss/crossentropy": 2.479489326477051,
"loss/hidden": 0.1748046875,
"loss/logits": 0.009896760806441307,
"loss/reg": 5.239961624145508,
"loss/twn": 0.0,
"step": 419
},
{
"epoch": 0.0105,
"grad_norm": 78.0,
"grad_norm_var": 1353.9675618489584,
"learning_rate": 0.0001,
"loss": 6.7,
"loss/crossentropy": 1.1504077911376953,
"loss/hidden": 0.306640625,
"loss/logits": 0.0026911741588264704,
"loss/reg": 5.240240097045898,
"loss/twn": 0.0,
"step": 420
},
{
"epoch": 0.010525,
"grad_norm": 14.5625,
"grad_norm_var": 1346.5242024739584,
"learning_rate": 0.0001,
"loss": 8.0741,
"loss/crossentropy": 2.7434136867523193,
"loss/hidden": 0.083984375,
"loss/logits": 0.006712112110108137,
"loss/reg": 5.2399516105651855,
"loss/twn": 0.0,
"step": 421
},
{
"epoch": 0.01055,
"grad_norm": 14.375,
"grad_norm_var": 1343.0808430989584,
"learning_rate": 0.0001,
"loss": 6.6885,
"loss/crossentropy": 1.297805666923523,
"loss/hidden": 0.1455078125,
"loss/logits": 0.005279569886624813,
"loss/reg": 5.2398576736450195,
"loss/twn": 0.0,
"step": 422
},
{
"epoch": 0.010575,
"grad_norm": 10.125,
"grad_norm_var": 1352.3348307291667,
"learning_rate": 0.0001,
"loss": 5.9427,
"loss/crossentropy": 0.5282614231109619,
"loss/hidden": 0.1669921875,
"loss/logits": 0.007687091361731291,
"loss/reg": 5.239724159240723,
"loss/twn": 0.0,
"step": 423
},
{
"epoch": 0.0106,
"grad_norm": 13.6875,
"grad_norm_var": 1344.1219889322917,
"learning_rate": 0.0001,
"loss": 7.3072,
"loss/crossentropy": 1.7837430238723755,
"loss/hidden": 0.271484375,
"loss/logits": 0.012293729931116104,
"loss/reg": 5.2396626472473145,
"loss/twn": 0.0,
"step": 424
},
{
"epoch": 0.010625,
"grad_norm": 53.75,
"grad_norm_var": 1365.6212890625,
"learning_rate": 0.0001,
"loss": 8.513,
"loss/crossentropy": 3.109501838684082,
"loss/hidden": 0.1572265625,
"loss/logits": 0.006876545026898384,
"loss/reg": 5.239365100860596,
"loss/twn": 0.0,
"step": 425
},
{
"epoch": 0.01065,
"grad_norm": 8.5625,
"grad_norm_var": 1373.6447265625,
"learning_rate": 0.0001,
"loss": 7.5404,
"loss/crossentropy": 2.174077272415161,
"loss/hidden": 0.115234375,
"loss/logits": 0.011358590796589851,
"loss/reg": 5.239738941192627,
"loss/twn": 0.0,
"step": 426
},
{
"epoch": 0.010675,
"grad_norm": 12.1875,
"grad_norm_var": 1367.7306640625,
"learning_rate": 0.0001,
"loss": 8.1571,
"loss/crossentropy": 2.827575922012329,
"loss/hidden": 0.083984375,
"loss/logits": 0.005951396189630032,
"loss/reg": 5.239595890045166,
"loss/twn": 0.0,
"step": 427
},
{
"epoch": 0.0107,
"grad_norm": 14.875,
"grad_norm_var": 1365.55859375,
"learning_rate": 0.0001,
"loss": 8.0294,
"loss/crossentropy": 2.697416305541992,
"loss/hidden": 0.087890625,
"loss/logits": 0.004499722272157669,
"loss/reg": 5.239617824554443,
"loss/twn": 0.0,
"step": 428
},
{
"epoch": 0.010725,
"grad_norm": 11.0,
"grad_norm_var": 1369.5015625,
"learning_rate": 0.0001,
"loss": 6.9497,
"loss/crossentropy": 1.5794799327850342,
"loss/hidden": 0.12255859375,
"loss/logits": 0.008041350170969963,
"loss/reg": 5.239623069763184,
"loss/twn": 0.0,
"step": 429
},
{
"epoch": 0.01075,
"grad_norm": 12.0,
"grad_norm_var": 1386.5462890625,
"learning_rate": 0.0001,
"loss": 6.8916,
"loss/crossentropy": 1.5268070697784424,
"loss/hidden": 0.11962890625,
"loss/logits": 0.0057592848315835,
"loss/reg": 5.23940372467041,
"loss/twn": 0.0,
"step": 430
},
{
"epoch": 0.010775,
"grad_norm": 11.5,
"grad_norm_var": 359.2416015625,
"learning_rate": 0.0001,
"loss": 8.0251,
"loss/crossentropy": 2.647442579269409,
"loss/hidden": 0.12890625,
"loss/logits": 0.009594411589205265,
"loss/reg": 5.239197254180908,
"loss/twn": 0.0,
"step": 431
},
{
"epoch": 0.0108,
"grad_norm": 18.75,
"grad_norm_var": 355.0367024739583,
"learning_rate": 0.0001,
"loss": 8.1387,
"loss/crossentropy": 2.7440555095672607,
"loss/hidden": 0.1376953125,
"loss/logits": 0.01775319315493107,
"loss/reg": 5.23914909362793,
"loss/twn": 0.0,
"step": 432
},
{
"epoch": 0.010825,
"grad_norm": 9.0,
"grad_norm_var": 357.3424479166667,
"learning_rate": 0.0001,
"loss": 7.2622,
"loss/crossentropy": 1.8811193704605103,
"loss/hidden": 0.1337890625,
"loss/logits": 0.007980940863490105,
"loss/reg": 5.239285469055176,
"loss/twn": 0.0,
"step": 433
},
{
"epoch": 0.01085,
"grad_norm": 10.625,
"grad_norm_var": 356.42604166666666,
"learning_rate": 0.0001,
"loss": 8.2447,
"loss/crossentropy": 2.8804891109466553,
"loss/hidden": 0.1123046875,
"loss/logits": 0.01278759352862835,
"loss/reg": 5.239134311676025,
"loss/twn": 0.0,
"step": 434
},
{
"epoch": 0.010875,
"grad_norm": 102.0,
"grad_norm_var": 781.3260416666667,
"learning_rate": 0.0001,
"loss": 6.8421,
"loss/crossentropy": 1.4239375591278076,
"loss/hidden": 0.1669921875,
"loss/logits": 0.011750075966119766,
"loss/reg": 5.239468574523926,
"loss/twn": 0.0,
"step": 435
},
{
"epoch": 0.0109,
"grad_norm": 14.25,
"grad_norm_var": 582.1736979166667,
"learning_rate": 0.0001,
"loss": 7.9746,
"loss/crossentropy": 2.5394973754882812,
"loss/hidden": 0.1875,
"loss/logits": 0.008459478616714478,
"loss/reg": 5.239116191864014,
"loss/twn": 0.0,
"step": 436
},
{
"epoch": 0.010925,
"grad_norm": 7.8125,
"grad_norm_var": 590.5479166666667,
"learning_rate": 0.0001,
"loss": 7.3172,
"loss/crossentropy": 1.975609540939331,
"loss/hidden": 0.0986328125,
"loss/logits": 0.0038147151935845613,
"loss/reg": 5.239123344421387,
"loss/twn": 0.0,
"step": 437
},
{
"epoch": 0.01095,
"grad_norm": 10.8125,
"grad_norm_var": 594.1465983072917,
"learning_rate": 0.0001,
"loss": 7.1303,
"loss/crossentropy": 1.7625492811203003,
"loss/hidden": 0.12255859375,
"loss/logits": 0.005926240235567093,
"loss/reg": 5.239302635192871,
"loss/twn": 0.0,
"step": 438
},
{
"epoch": 0.010975,
"grad_norm": 11.75,
"grad_norm_var": 592.1593587239583,
"learning_rate": 0.0001,
"loss": 8.0788,
"loss/crossentropy": 2.697333812713623,
"loss/hidden": 0.12890625,
"loss/logits": 0.013756821863353252,
"loss/reg": 5.2388529777526855,
"loss/twn": 0.0,
"step": 439
},
{
"epoch": 0.011,
"grad_norm": 57.0,
"grad_norm_var": 672.0280598958333,
"learning_rate": 0.0001,
"loss": 7.046,
"loss/crossentropy": 1.7012284994125366,
"loss/hidden": 0.10009765625,
"loss/logits": 0.005743634421378374,
"loss/reg": 5.238898754119873,
"loss/twn": 0.0,
"step": 440
},
{
"epoch": 0.011025,
"grad_norm": 16.375,
"grad_norm_var": 605.434375,
"learning_rate": 0.0001,
"loss": 7.7288,
"loss/crossentropy": 2.397491216659546,
"loss/hidden": 0.08642578125,
"loss/logits": 0.005718431435525417,
"loss/reg": 5.239189624786377,
"loss/twn": 0.0,
"step": 441
},
{
"epoch": 0.01105,
"grad_norm": 30.75,
"grad_norm_var": 600.7946451822917,
"learning_rate": 0.0001,
"loss": 6.7732,
"loss/crossentropy": 1.3775757551193237,
"loss/hidden": 0.150390625,
"loss/logits": 0.006438620388507843,
"loss/reg": 5.23883056640625,
"loss/twn": 0.0,
"step": 442
},
{
"epoch": 0.011075,
"grad_norm": 326.0,
"grad_norm_var": 6348.5484375,
"learning_rate": 0.0001,
"loss": 6.8298,
"loss/crossentropy": 1.457594394683838,
"loss/hidden": 0.12890625,
"loss/logits": 0.00396731635555625,
"loss/reg": 5.239315509796143,
"loss/twn": 0.0,
"step": 443
},
{
"epoch": 0.0111,
"grad_norm": 12.9375,
"grad_norm_var": 6355.669254557291,
"learning_rate": 0.0001,
"loss": 8.2584,
"loss/crossentropy": 2.878317356109619,
"loss/hidden": 0.130859375,
"loss/logits": 0.010759024880826473,
"loss/reg": 5.238509178161621,
"loss/twn": 0.0,
"step": 444
},
{
"epoch": 0.011125,
"grad_norm": 8.4375,
"grad_norm_var": 6366.469791666666,
"learning_rate": 0.0001,
"loss": 7.6916,
"loss/crossentropy": 2.346791982650757,
"loss/hidden": 0.10107421875,
"loss/logits": 0.004902126267552376,
"loss/reg": 5.2388715744018555,
"loss/twn": 0.0,
"step": 445
},
{
"epoch": 0.01115,
"grad_norm": 35.75,
"grad_norm_var": 6309.098697916666,
"learning_rate": 0.0001,
"loss": 6.5195,
"loss/crossentropy": 1.047242283821106,
"loss/hidden": 0.2265625,
"loss/logits": 0.006808393634855747,
"loss/reg": 5.2388739585876465,
"loss/twn": 0.0,
"step": 446
},
{
"epoch": 0.011175,
"grad_norm": 51.75,
"grad_norm_var": 6242.728125,
"learning_rate": 0.0001,
"loss": 8.1962,
"loss/crossentropy": 2.7516071796417236,
"loss/hidden": 0.1865234375,
"loss/logits": 0.0192459337413311,
"loss/reg": 5.238797187805176,
"loss/twn": 0.0,
"step": 447
},
{
"epoch": 0.0112,
"grad_norm": 12.6875,
"grad_norm_var": 6266.446077473958,
"learning_rate": 0.0001,
"loss": 8.1265,
"loss/crossentropy": 2.730348587036133,
"loss/hidden": 0.142578125,
"loss/logits": 0.015144633129239082,
"loss/reg": 5.238423824310303,
"loss/twn": 0.0,
"step": 448
},
{
"epoch": 0.011225,
"grad_norm": 48.25,
"grad_norm_var": 6175.005843098958,
"learning_rate": 0.0001,
"loss": 6.9994,
"loss/crossentropy": 1.4689970016479492,
"loss/hidden": 0.283203125,
"loss/logits": 0.008740945719182491,
"loss/reg": 5.238440036773682,
"loss/twn": 0.0,
"step": 449
},
{
"epoch": 0.01125,
"grad_norm": 7.8125,
"grad_norm_var": 6189.262434895833,
"learning_rate": 0.0001,
"loss": 7.8836,
"loss/crossentropy": 2.586432695388794,
"loss/hidden": 0.0546875,
"loss/logits": 0.004194296896457672,
"loss/reg": 5.238241672515869,
"loss/twn": 0.0,
"step": 450
},
{
"epoch": 0.011275,
"grad_norm": 12.5625,
"grad_norm_var": 6035.099202473958,
"learning_rate": 0.0001,
"loss": 7.5775,
"loss/crossentropy": 2.230092763900757,
"loss/hidden": 0.10107421875,
"loss/logits": 0.007766470313072205,
"loss/reg": 5.238610744476318,
"loss/twn": 0.0,
"step": 451
},
{
"epoch": 0.0113,
"grad_norm": 10.9375,
"grad_norm_var": 6047.8462890625,
"learning_rate": 0.0001,
"loss": 7.9835,
"loss/crossentropy": 2.615774154663086,
"loss/hidden": 0.1201171875,
"loss/logits": 0.009664995595812798,
"loss/reg": 5.237979412078857,
"loss/twn": 0.0,
"step": 452
},
{
"epoch": 0.011325,
"grad_norm": 76.0,
"grad_norm_var": 6033.516259765625,
"learning_rate": 0.0001,
"loss": 6.81,
"loss/crossentropy": 1.415939450263977,
"loss/hidden": 0.1435546875,
"loss/logits": 0.01219608448445797,
"loss/reg": 5.238288402557373,
"loss/twn": 0.0,
"step": 453
},
{
"epoch": 0.01135,
"grad_norm": 9.125,
"grad_norm_var": 6041.5244140625,
"learning_rate": 0.0001,
"loss": 7.0682,
"loss/crossentropy": 1.6937216520309448,
"loss/hidden": 0.1298828125,
"loss/logits": 0.006700664758682251,
"loss/reg": 5.237900733947754,
"loss/twn": 0.0,
"step": 454
},
{
"epoch": 0.011375,
"grad_norm": 19.125,
"grad_norm_var": 6011.728645833334,
"learning_rate": 0.0001,
"loss": 8.4153,
"loss/crossentropy": 2.93511700630188,
"loss/hidden": 0.2197265625,
"loss/logits": 0.022560518234968185,
"loss/reg": 5.237886905670166,
"loss/twn": 0.0,
"step": 455
},
{
"epoch": 0.0114,
"grad_norm": 15.3125,
"grad_norm_var": 6059.028759765625,
"learning_rate": 0.0001,
"loss": 7.0063,
"loss/crossentropy": 1.5995585918426514,
"loss/hidden": 0.1572265625,
"loss/logits": 0.011760546825826168,
"loss/reg": 5.237764835357666,
"loss/twn": 0.0,
"step": 456
},
{
"epoch": 0.011425,
"grad_norm": 10.875,
"grad_norm_var": 6080.710791015625,
"learning_rate": 0.0001,
"loss": 7.0912,
"loss/crossentropy": 1.6729381084442139,
"loss/hidden": 0.1669921875,
"loss/logits": 0.013158103451132774,
"loss/reg": 5.238087177276611,
"loss/twn": 0.0,
"step": 457
},
{
"epoch": 0.01145,
"grad_norm": 11.3125,
"grad_norm_var": 6136.1228515625,
"learning_rate": 0.0001,
"loss": 7.8938,
"loss/crossentropy": 2.558936834335327,
"loss/hidden": 0.08642578125,
"loss/logits": 0.010592980310320854,
"loss/reg": 5.23784065246582,
"loss/twn": 0.0,
"step": 458
},
{
"epoch": 0.011475,
"grad_norm": 14.3125,
"grad_norm_var": 397.25792643229164,
"learning_rate": 0.0001,
"loss": 8.1499,
"loss/crossentropy": 2.677492141723633,
"loss/hidden": 0.21875,
"loss/logits": 0.015965130180120468,
"loss/reg": 5.237676620483398,
"loss/twn": 0.0,
"step": 459
},
{
"epoch": 0.0115,
"grad_norm": 15.4375,
"grad_norm_var": 394.51964518229164,
"learning_rate": 0.0001,
"loss": 8.1308,
"loss/crossentropy": 2.706998348236084,
"loss/hidden": 0.1708984375,
"loss/logits": 0.015249890275299549,
"loss/reg": 5.237621784210205,
"loss/twn": 0.0,
"step": 460
},
{
"epoch": 0.011525,
"grad_norm": 10.6875,
"grad_norm_var": 390.62316080729164,
"learning_rate": 0.0001,
"loss": 7.1208,
"loss/crossentropy": 1.7324503660202026,
"loss/hidden": 0.1416015625,
"loss/logits": 0.008826036937534809,
"loss/reg": 5.237947940826416,
"loss/twn": 0.0,
"step": 461
},
{
"epoch": 0.01155,
"grad_norm": 88.0,
"grad_norm_var": 652.7167805989583,
"learning_rate": 0.0001,
"loss": 7.8363,
"loss/crossentropy": 2.4369447231292725,
"loss/hidden": 0.15234375,
"loss/logits": 0.00924272183328867,
"loss/reg": 5.237813949584961,
"loss/twn": 0.0,
"step": 462
},
{
"epoch": 0.011575,
"grad_norm": 8.875,
"grad_norm_var": 619.7566243489583,
"learning_rate": 0.0001,
"loss": 7.7379,
"loss/crossentropy": 2.4647915363311768,
"loss/hidden": 0.0302734375,
"loss/logits": 0.005017576273530722,
"loss/reg": 5.237803936004639,
"loss/twn": 0.0,
"step": 463
},
{
"epoch": 0.0116,
"grad_norm": 8.3125,
"grad_norm_var": 627.089306640625,
"learning_rate": 0.0001,
"loss": 7.7172,
"loss/crossentropy": 2.3781650066375732,
"loss/hidden": 0.09619140625,
"loss/logits": 0.0055891769006848335,
"loss/reg": 5.237229824066162,
"loss/twn": 0.0,
"step": 464
},
{
"epoch": 0.011625,
"grad_norm": 65.5,
"grad_norm_var": 703.914697265625,
"learning_rate": 0.0001,
"loss": 8.1072,
"loss/crossentropy": 2.73203182220459,
"loss/hidden": 0.126953125,
"loss/logits": 0.010718154720962048,
"loss/reg": 5.237488746643066,
"loss/twn": 0.0,
"step": 465
},
{
"epoch": 0.01165,
"grad_norm": 21.875,
"grad_norm_var": 685.90078125,
"learning_rate": 0.0001,
"loss": 8.3027,
"loss/crossentropy": 2.8547723293304443,
"loss/hidden": 0.1865234375,
"loss/logits": 0.023786598816514015,
"loss/reg": 5.2376484870910645,
"loss/twn": 0.0,
"step": 466
},
{
"epoch": 0.011675,
"grad_norm": 101.5,
"grad_norm_var": 1034.077197265625,
"learning_rate": 0.0001,
"loss": 7.6863,
"loss/crossentropy": 2.4159936904907227,
"loss/hidden": 0.0302734375,
"loss/logits": 0.0024244533851742744,
"loss/reg": 5.23759126663208,
"loss/twn": 0.0,
"step": 467
},
{
"epoch": 0.0117,
"grad_norm": 13.9375,
"grad_norm_var": 1026.835009765625,
"learning_rate": 0.0001,
"loss": 8.0093,
"loss/crossentropy": 2.580734968185425,
"loss/hidden": 0.17578125,
"loss/logits": 0.015159064903855324,
"loss/reg": 5.2376275062561035,
"loss/twn": 0.0,
"step": 468
},
{
"epoch": 0.011725,
"grad_norm": 9.8125,
"grad_norm_var": 900.303125,
"learning_rate": 0.0001,
"loss": 7.7953,
"loss/crossentropy": 2.4660680294036865,
"loss/hidden": 0.08642578125,
"loss/logits": 0.005589427426457405,
"loss/reg": 5.237224578857422,
"loss/twn": 0.0,
"step": 469
},
{
"epoch": 0.01175,
"grad_norm": 384.0,
"grad_norm_var": 8815.046809895834,
"learning_rate": 0.0001,
"loss": 6.1676,
"loss/crossentropy": 0.7747684121131897,
"loss/hidden": 0.1494140625,
"loss/logits": 0.005672769621014595,
"loss/reg": 5.23769998550415,
"loss/twn": 0.0,
"step": 470
},
{
"epoch": 0.011775,
"grad_norm": 14.75,
"grad_norm_var": 8834.2125,
"learning_rate": 0.0001,
"loss": 6.978,
"loss/crossentropy": 1.5184272527694702,
"loss/hidden": 0.2109375,
"loss/logits": 0.011126836761832237,
"loss/reg": 5.2375288009643555,
"loss/twn": 0.0,
"step": 471
},
{
"epoch": 0.0118,
"grad_norm": 10.4375,
"grad_norm_var": 8858.0212890625,
"learning_rate": 0.0001,
"loss": 7.7419,
"loss/crossentropy": 2.359570264816284,
"loss/hidden": 0.1318359375,
"loss/logits": 0.013412706553936005,
"loss/reg": 5.2371039390563965,
"loss/twn": 0.0,
"step": 472
},
{
"epoch": 0.011825,
"grad_norm": 13.9375,
"grad_norm_var": 8842.896207682292,
"learning_rate": 0.0001,
"loss": 8.1685,
"loss/crossentropy": 2.766843557357788,
"loss/hidden": 0.150390625,
"loss/logits": 0.01371270976960659,
"loss/reg": 5.237538814544678,
"loss/twn": 0.0,
"step": 473
},
{
"epoch": 0.01185,
"grad_norm": 15.3125,
"grad_norm_var": 8823.506624348958,
"learning_rate": 0.0001,
"loss": 7.5292,
"loss/crossentropy": 2.151641607284546,
"loss/hidden": 0.1318359375,
"loss/logits": 0.008503757417201996,
"loss/reg": 5.237189769744873,
"loss/twn": 0.0,
"step": 474
},
{
"epoch": 0.011875,
"grad_norm": 11.25,
"grad_norm_var": 8838.5806640625,
"learning_rate": 0.0001,
"loss": 8.0395,
"loss/crossentropy": 2.686025619506836,
"loss/hidden": 0.1103515625,
"loss/logits": 0.005928123835474253,
"loss/reg": 5.237187385559082,
"loss/twn": 0.0,
"step": 475
},
{
"epoch": 0.0119,
"grad_norm": 10.1875,
"grad_norm_var": 8864.2181640625,
"learning_rate": 0.0001,
"loss": 6.7249,
"loss/crossentropy": 1.3968653678894043,
"loss/hidden": 0.08642578125,
"loss/logits": 0.004261254798620939,
"loss/reg": 5.23736047744751,
"loss/twn": 0.0,
"step": 476
},
{
"epoch": 0.011925,
"grad_norm": 17.375,
"grad_norm_var": 8832.607535807292,
"learning_rate": 0.0001,
"loss": 7.1631,
"loss/crossentropy": 1.7119083404541016,
"loss/hidden": 0.203125,
"loss/logits": 0.010743262246251106,
"loss/reg": 5.237338542938232,
"loss/twn": 0.0,
"step": 477
},
{
"epoch": 0.01195,
"grad_norm": 584.0,
"grad_norm_var": 26742.08253580729,
"learning_rate": 0.0001,
"loss": 6.4806,
"loss/crossentropy": 1.1264278888702393,
"loss/hidden": 0.11376953125,
"loss/logits": 0.003235449083149433,
"loss/reg": 5.237181663513184,
"loss/twn": 0.0,
"step": 478
},
{
"epoch": 0.011975,
"grad_norm": 11.375,
"grad_norm_var": 26718.53435872396,
"learning_rate": 0.0001,
"loss": 5.9934,
"loss/crossentropy": 0.5191141963005066,
"loss/hidden": 0.2275390625,
"loss/logits": 0.009647047147154808,
"loss/reg": 5.237125396728516,
"loss/twn": 0.0,
"step": 479
},
{
"epoch": 0.012,
"grad_norm": 10.375,
"grad_norm_var": 26698.853059895835,
"learning_rate": 0.0001,
"loss": 8.1124,
"loss/crossentropy": 2.780978202819824,
"loss/hidden": 0.08642578125,
"loss/logits": 0.007745138369500637,
"loss/reg": 5.23725700378418,
"loss/twn": 0.0,
"step": 480
},
{
"epoch": 0.012025,
"grad_norm": 34.5,
"grad_norm_var": 26822.8853515625,
"learning_rate": 0.0001,
"loss": 7.868,
"loss/crossentropy": 2.5087831020355225,
"loss/hidden": 0.11474609375,
"loss/logits": 0.007656463421881199,
"loss/reg": 5.236792087554932,
"loss/twn": 0.0,
"step": 481
},
{
"epoch": 0.01205,
"grad_norm": 8.6875,
"grad_norm_var": 26934.268212890624,
"learning_rate": 0.0001,
"loss": 7.2881,
"loss/crossentropy": 1.942946434020996,
"loss/hidden": 0.10107421875,
"loss/logits": 0.007265533320605755,
"loss/reg": 5.236773490905762,
"loss/twn": 0.0,
"step": 482
},
{
"epoch": 0.012075,
"grad_norm": 10.375,
"grad_norm_var": 27170.338916015626,
"learning_rate": 0.0001,
"loss": 7.3296,
"loss/crossentropy": 1.9258122444152832,
"loss/hidden": 0.1552734375,
"loss/logits": 0.011823762208223343,
"loss/reg": 5.2366719245910645,
"loss/twn": 0.0,
"step": 483
},
{
"epoch": 0.0121,
"grad_norm": 9.4375,
"grad_norm_var": 27206.753759765626,
"learning_rate": 0.0001,
"loss": 7.756,
"loss/crossentropy": 2.4538094997406006,
"loss/hidden": 0.0595703125,
"loss/logits": 0.005918778479099274,
"loss/reg": 5.2366943359375,
"loss/twn": 0.0,
"step": 484
},
{
"epoch": 0.012125,
"grad_norm": 13.0625,
"grad_norm_var": 27180.362744140624,
"learning_rate": 0.0001,
"loss": 8.1167,
"loss/crossentropy": 2.796402931213379,
"loss/hidden": 0.0791015625,
"loss/logits": 0.004426885861903429,
"loss/reg": 5.236767768859863,
"loss/twn": 0.0,
"step": 485
},
{
"epoch": 0.01215,
"grad_norm": 10.0625,
"grad_norm_var": 20385.898893229165,
"learning_rate": 0.0001,
"loss": 7.9673,
"loss/crossentropy": 2.5698435306549072,
"loss/hidden": 0.1494140625,
"loss/logits": 0.011329087428748608,
"loss/reg": 5.236757278442383,
"loss/twn": 0.0,
"step": 486
},
{
"epoch": 0.012175,
"grad_norm": 10.625,
"grad_norm_var": 20405.838541666668,
"learning_rate": 0.0001,
"loss": 7.2025,
"loss/crossentropy": 1.8346238136291504,
"loss/hidden": 0.126953125,
"loss/logits": 0.004264689050614834,
"loss/reg": 5.236656188964844,
"loss/twn": 0.0,
"step": 487
},
{
"epoch": 0.0122,
"grad_norm": 14.9375,
"grad_norm_var": 20384.079166666666,
"learning_rate": 0.0001,
"loss": 7.7223,
"loss/crossentropy": 2.3055596351623535,
"loss/hidden": 0.1650390625,
"loss/logits": 0.015009969472885132,
"loss/reg": 5.236688137054443,
"loss/twn": 0.0,
"step": 488
},
{
"epoch": 0.012225,
"grad_norm": 16.25,
"grad_norm_var": 20373.57355143229,
"learning_rate": 0.0001,
"loss": 7.9159,
"loss/crossentropy": 2.573246955871582,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007458841428160667,
"loss/reg": 5.236563682556152,
"loss/twn": 0.0,
"step": 489
},
{
"epoch": 0.01225,
"grad_norm": 12.3125,
"grad_norm_var": 20387.70636393229,
"learning_rate": 0.0001,
"loss": 7.9834,
"loss/crossentropy": 2.5690972805023193,
"loss/hidden": 0.1650390625,
"loss/logits": 0.012899991124868393,
"loss/reg": 5.236404895782471,
"loss/twn": 0.0,
"step": 490
},
{
"epoch": 0.012275,
"grad_norm": 10.0625,
"grad_norm_var": 20393.779622395832,
"learning_rate": 0.0001,
"loss": 8.1415,
"loss/crossentropy": 2.8210272789001465,
"loss/hidden": 0.07666015625,
"loss/logits": 0.006865846458822489,
"loss/reg": 5.23691463470459,
"loss/twn": 0.0,
"step": 491
},
{
"epoch": 0.0123,
"grad_norm": 9.8125,
"grad_norm_var": 20395.727864583332,
"learning_rate": 0.0001,
"loss": 7.8399,
"loss/crossentropy": 2.5191946029663086,
"loss/hidden": 0.0791015625,
"loss/logits": 0.00524523202329874,
"loss/reg": 5.2364020347595215,
"loss/twn": 0.0,
"step": 492
},
{
"epoch": 0.012325,
"grad_norm": 19.625,
"grad_norm_var": 20386.570833333335,
"learning_rate": 0.0001,
"loss": 8.46,
"loss/crossentropy": 3.0929410457611084,
"loss/hidden": 0.12060546875,
"loss/logits": 0.010085565969347954,
"loss/reg": 5.236414909362793,
"loss/twn": 0.0,
"step": 493
},
{
"epoch": 0.01235,
"grad_norm": 13.375,
"grad_norm_var": 39.9259765625,
"learning_rate": 0.0001,
"loss": 8.1608,
"loss/crossentropy": 2.7697112560272217,
"loss/hidden": 0.1396484375,
"loss/logits": 0.015202455222606659,
"loss/reg": 5.236268043518066,
"loss/twn": 0.0,
"step": 494
},
{
"epoch": 0.012375,
"grad_norm": 24.125,
"grad_norm_var": 46.5931640625,
"learning_rate": 0.0001,
"loss": 7.9654,
"loss/crossentropy": 2.5814311504364014,
"loss/hidden": 0.140625,
"loss/logits": 0.006853965111076832,
"loss/reg": 5.236512184143066,
"loss/twn": 0.0,
"step": 495
},
{
"epoch": 0.0124,
"grad_norm": 10.3125,
"grad_norm_var": 46.62550455729167,
"learning_rate": 0.0001,
"loss": 6.2441,
"loss/crossentropy": 0.8653862476348877,
"loss/hidden": 0.13671875,
"loss/logits": 0.005771493539214134,
"loss/reg": 5.236272811889648,
"loss/twn": 0.0,
"step": 496
},
{
"epoch": 0.012425,
"grad_norm": 19.0,
"grad_norm_var": 19.734619140625,
"learning_rate": 0.0001,
"loss": 7.8094,
"loss/crossentropy": 2.4122979640960693,
"loss/hidden": 0.1533203125,
"loss/logits": 0.007559158839285374,
"loss/reg": 5.236222743988037,
"loss/twn": 0.0,
"step": 497
},
{
"epoch": 0.01245,
"grad_norm": 23.375,
"grad_norm_var": 24.274739583333332,
"learning_rate": 0.0001,
"loss": 7.2016,
"loss/crossentropy": 1.773500680923462,
"loss/hidden": 0.177734375,
"loss/logits": 0.01412028819322586,
"loss/reg": 5.236289024353027,
"loss/twn": 0.0,
"step": 498
},
{
"epoch": 0.012475,
"grad_norm": 12.25,
"grad_norm_var": 23.545247395833332,
"learning_rate": 0.0001,
"loss": 7.306,
"loss/crossentropy": 1.899260401725769,
"loss/hidden": 0.16015625,
"loss/logits": 0.010113149881362915,
"loss/reg": 5.236475467681885,
"loss/twn": 0.0,
"step": 499
},
{
"epoch": 0.0125,
"grad_norm": 10.0625,
"grad_norm_var": 23.165364583333332,
"learning_rate": 0.0001,
"loss": 7.6044,
"loss/crossentropy": 2.234079360961914,
"loss/hidden": 0.126953125,
"loss/logits": 0.0073781562969088554,
"loss/reg": 5.235958576202393,
"loss/twn": 0.0,
"step": 500
},
{
"epoch": 0.012525,
"grad_norm": 11.8125,
"grad_norm_var": 23.473958333333332,
"learning_rate": 0.0001,
"loss": 7.5774,
"loss/crossentropy": 2.250281572341919,
"loss/hidden": 0.08642578125,
"loss/logits": 0.004540358670055866,
"loss/reg": 5.236131191253662,
"loss/twn": 0.0,
"step": 501
},
{
"epoch": 0.01255,
"grad_norm": 10.1875,
"grad_norm_var": 23.405143229166665,
"learning_rate": 0.0001,
"loss": 6.5639,
"loss/crossentropy": 1.1574146747589111,
"loss/hidden": 0.1591796875,
"loss/logits": 0.011335412040352821,
"loss/reg": 5.236012935638428,
"loss/twn": 0.0,
"step": 502
},
{
"epoch": 0.012575,
"grad_norm": 12.0,
"grad_norm_var": 22.857291666666665,
"learning_rate": 0.0001,
"loss": 7.8073,
"loss/crossentropy": 2.4466099739074707,
"loss/hidden": 0.115234375,
"loss/logits": 0.009260098449885845,
"loss/reg": 5.236217021942139,
"loss/twn": 0.0,
"step": 503
},
{
"epoch": 0.0126,
"grad_norm": 15.0,
"grad_norm_var": 22.862483723958334,
"learning_rate": 0.0001,
"loss": 6.7732,
"loss/crossentropy": 1.3012182712554932,
"loss/hidden": 0.2265625,
"loss/logits": 0.00941612757742405,
"loss/reg": 5.235978126525879,
"loss/twn": 0.0,
"step": 504
},
{
"epoch": 0.012625,
"grad_norm": 141.0,
"grad_norm_var": 1027.1649576822917,
"learning_rate": 0.0001,
"loss": 7.359,
"loss/crossentropy": 1.941611886024475,
"loss/hidden": 0.173828125,
"loss/logits": 0.007238644640892744,
"loss/reg": 5.236276626586914,
"loss/twn": 0.0,
"step": 505
},
{
"epoch": 0.01265,
"grad_norm": 11.875,
"grad_norm_var": 1027.7504557291666,
"learning_rate": 0.0001,
"loss": 8.2093,
"loss/crossentropy": 2.877250909805298,
"loss/hidden": 0.08642578125,
"loss/logits": 0.009630267508327961,
"loss/reg": 5.235978603363037,
"loss/twn": 0.0,
"step": 506
},
{
"epoch": 0.012675,
"grad_norm": 136.0,
"grad_norm_var": 1816.5980305989583,
"learning_rate": 0.0001,
"loss": 8.2749,
"loss/crossentropy": 2.9017584323883057,
"loss/hidden": 0.1279296875,
"loss/logits": 0.00875360518693924,
"loss/reg": 5.236504077911377,
"loss/twn": 0.0,
"step": 507
},
{
"epoch": 0.0127,
"grad_norm": 124.5,
"grad_norm_var": 2330.153125,
"learning_rate": 0.0001,
"loss": 5.9319,
"loss/crossentropy": 0.569814920425415,
"loss/hidden": 0.11962890625,
"loss/logits": 0.006820861250162125,
"loss/reg": 5.235653400421143,
"loss/twn": 0.0,
"step": 508
},
{
"epoch": 0.012725,
"grad_norm": 12.6875,
"grad_norm_var": 2349.377587890625,
"learning_rate": 0.0001,
"loss": 7.4016,
"loss/crossentropy": 2.056201934814453,
"loss/hidden": 0.0986328125,
"loss/logits": 0.010679306462407112,
"loss/reg": 5.236123085021973,
"loss/twn": 0.0,
"step": 509
},
{
"epoch": 0.01275,
"grad_norm": 10.9375,
"grad_norm_var": 2357.3369140625,
"learning_rate": 0.0001,
"loss": 7.799,
"loss/crossentropy": 2.4681053161621094,
"loss/hidden": 0.08642578125,
"loss/logits": 0.008713052608072758,
"loss/reg": 5.235713481903076,
"loss/twn": 0.0,
"step": 510
},
{
"epoch": 0.012775,
"grad_norm": 11.8125,
"grad_norm_var": 2387.242822265625,
"learning_rate": 0.0001,
"loss": 7.0821,
"loss/crossentropy": 1.63010573387146,
"loss/hidden": 0.2041015625,
"loss/logits": 0.011883174069225788,
"loss/reg": 5.235997200012207,
"loss/twn": 0.0,
"step": 511
},
{
"epoch": 0.0128,
"grad_norm": 9.3125,
"grad_norm_var": 2390.703759765625,
"learning_rate": 0.0001,
"loss": 7.1311,
"loss/crossentropy": 1.7556850910186768,
"loss/hidden": 0.1328125,
"loss/logits": 0.0065896175801754,
"loss/reg": 5.236062049865723,
"loss/twn": 0.0,
"step": 512
},
{
"epoch": 0.012825,
"grad_norm": 7.15625,
"grad_norm_var": 2425.903446451823,
"learning_rate": 0.0001,
"loss": 6.1515,
"loss/crossentropy": 0.7963519096374512,
"loss/hidden": 0.11279296875,
"loss/logits": 0.006251027341932058,
"loss/reg": 5.236119270324707,
"loss/twn": 0.0,
"step": 513
},
{
"epoch": 0.01285,
"grad_norm": 14.3125,
"grad_norm_var": 2445.081018066406,
"learning_rate": 0.0001,
"loss": 8.0629,
"loss/crossentropy": 2.6161036491394043,
"loss/hidden": 0.197265625,
"loss/logits": 0.013607255183160305,
"loss/reg": 5.235938549041748,
"loss/twn": 0.0,
"step": 514
},
{
"epoch": 0.012875,
"grad_norm": 9.25,
"grad_norm_var": 2454.5161743164062,
"learning_rate": 0.0001,
"loss": 8.0433,
"loss/crossentropy": 2.7305965423583984,
"loss/hidden": 0.0693359375,
"loss/logits": 0.007636295165866613,
"loss/reg": 5.235754489898682,
"loss/twn": 0.0,
"step": 515
},
{
"epoch": 0.0129,
"grad_norm": 17.125,
"grad_norm_var": 2434.8625610351564,
"learning_rate": 0.0001,
"loss": 7.871,
"loss/crossentropy": 2.3984246253967285,
"loss/hidden": 0.220703125,
"loss/logits": 0.01586098223924637,
"loss/reg": 5.2359771728515625,
"loss/twn": 0.0,
"step": 516
},
{
"epoch": 0.012925,
"grad_norm": 16.75,
"grad_norm_var": 2421.328153483073,
"learning_rate": 0.0001,
"loss": 8.2615,
"loss/crossentropy": 2.9614031314849854,
"loss/hidden": 0.0595703125,
"loss/logits": 0.0044908830896019936,
"loss/reg": 5.23606014251709,
"loss/twn": 0.0,
"step": 517
},
{
"epoch": 0.01295,
"grad_norm": 13.375,
"grad_norm_var": 2411.420340983073,
"learning_rate": 0.0001,
"loss": 7.3563,
"loss/crossentropy": 2.021721363067627,
"loss/hidden": 0.09375,
"loss/logits": 0.004891795106232166,
"loss/reg": 5.235958099365234,
"loss/twn": 0.0,
"step": 518
},
{
"epoch": 0.012975,
"grad_norm": 14.6875,
"grad_norm_var": 2403.5608032226564,
"learning_rate": 0.0001,
"loss": 7.2795,
"loss/crossentropy": 1.8727322816848755,
"loss/hidden": 0.16015625,
"loss/logits": 0.010636523365974426,
"loss/reg": 5.235999584197998,
"loss/twn": 0.0,
"step": 519
},
{
"epoch": 0.013,
"grad_norm": 10.1875,
"grad_norm_var": 2418.0734985351564,
"learning_rate": 0.0001,
"loss": 7.2173,
"loss/crossentropy": 1.8416680097579956,
"loss/hidden": 0.1318359375,
"loss/logits": 0.00781365018337965,
"loss/reg": 5.235951900482178,
"loss/twn": 0.0,
"step": 520
},
{
"epoch": 0.013025,
"grad_norm": 9.8125,
"grad_norm_var": 1640.6509073893228,
"learning_rate": 0.0001,
"loss": 7.9581,
"loss/crossentropy": 2.697462797164917,
"loss/hidden": 0.02099609375,
"loss/logits": 0.004056986421346664,
"loss/reg": 5.235566139221191,
"loss/twn": 0.0,
"step": 521
},
{
"epoch": 0.01305,
"grad_norm": 444.0,
"grad_norm_var": 12447.939611816406,
"learning_rate": 0.0001,
"loss": 6.4374,
"loss/crossentropy": 1.0375036001205444,
"loss/hidden": 0.1611328125,
"loss/logits": 0.003013317007571459,
"loss/reg": 5.235776901245117,
"loss/twn": 0.0,
"step": 522
},
{
"epoch": 0.013075,
"grad_norm": 14.5625,
"grad_norm_var": 12039.795764160157,
"learning_rate": 0.0001,
"loss": 7.6035,
"loss/crossentropy": 2.2387633323669434,
"loss/hidden": 0.11767578125,
"loss/logits": 0.011812473647296429,
"loss/reg": 5.235291957855225,
"loss/twn": 0.0,
"step": 523
},
{
"epoch": 0.0131,
"grad_norm": 11.75,
"grad_norm_var": 11658.413016764323,
"learning_rate": 0.0001,
"loss": 8.2169,
"loss/crossentropy": 2.89178729057312,
"loss/hidden": 0.083984375,
"loss/logits": 0.005315279122442007,
"loss/reg": 5.235769748687744,
"loss/twn": 0.0,
"step": 524
},
{
"epoch": 0.013125,
"grad_norm": 25.0,
"grad_norm_var": 11624.30995686849,
"learning_rate": 0.0001,
"loss": 6.9365,
"loss/crossentropy": 1.5732824802398682,
"loss/hidden": 0.1201171875,
"loss/logits": 0.007601576391607523,
"loss/reg": 5.235477924346924,
"loss/twn": 0.0,
"step": 525
},
{
"epoch": 0.01315,
"grad_norm": 10.875,
"grad_norm_var": 11624.552404785156,
"learning_rate": 0.0001,
"loss": 8.1942,
"loss/crossentropy": 2.9256229400634766,
"loss/hidden": 0.0302734375,
"loss/logits": 0.002373297931626439,
"loss/reg": 5.235915184020996,
"loss/twn": 0.0,
"step": 526
},
{
"epoch": 0.013175,
"grad_norm": 8.1875,
"grad_norm_var": 11638.996708170573,
"learning_rate": 0.0001,
"loss": 6.944,
"loss/crossentropy": 1.6115573644638062,
"loss/hidden": 0.09375,
"loss/logits": 0.002878053579479456,
"loss/reg": 5.235781669616699,
"loss/twn": 0.0,
"step": 527
},
{
"epoch": 0.0132,
"grad_norm": 33.75,
"grad_norm_var": 11577.075646972657,
"learning_rate": 0.0001,
"loss": 5.7046,
"loss/crossentropy": 0.35535818338394165,
"loss/hidden": 0.11328125,
"loss/logits": 0.0003594207810238004,
"loss/reg": 5.235568046569824,
"loss/twn": 0.0,
"step": 528
},
{
"epoch": 0.013225,
"grad_norm": 17.0,
"grad_norm_var": 11538.3197265625,
"learning_rate": 0.0001,
"loss": 7.8718,
"loss/crossentropy": 2.430708646774292,
"loss/hidden": 0.197265625,
"loss/logits": 0.008184842765331268,
"loss/reg": 5.235634803771973,
"loss/twn": 0.0,
"step": 529
},
{
"epoch": 0.01325,
"grad_norm": 12.6875,
"grad_norm_var": 11544.465104166668,
"learning_rate": 0.0001,
"loss": 7.1415,
"loss/crossentropy": 1.7498486042022705,
"loss/hidden": 0.146484375,
"loss/logits": 0.009613174945116043,
"loss/reg": 5.235511302947998,
"loss/twn": 0.0,
"step": 530
},
{
"epoch": 0.013275,
"grad_norm": 10.5,
"grad_norm_var": 11539.135677083334,
"learning_rate": 0.0001,
"loss": 7.9653,
"loss/crossentropy": 2.616609573364258,
"loss/hidden": 0.10107421875,
"loss/logits": 0.011807188391685486,
"loss/reg": 5.235820770263672,
"loss/twn": 0.0,
"step": 531
},
{
"epoch": 0.0133,
"grad_norm": 15.5625,
"grad_norm_var": 11544.447770182293,
"learning_rate": 0.0001,
"loss": 7.821,
"loss/crossentropy": 2.5074052810668945,
"loss/hidden": 0.07177734375,
"loss/logits": 0.006313705816864967,
"loss/reg": 5.235459804534912,
"loss/twn": 0.0,
"step": 532
},
{
"epoch": 0.013325,
"grad_norm": 55.0,
"grad_norm_var": 11508.170035807292,
"learning_rate": 0.0001,
"loss": 6.9534,
"loss/crossentropy": 1.6006724834442139,
"loss/hidden": 0.10986328125,
"loss/logits": 0.007289988920092583,
"loss/reg": 5.235603332519531,
"loss/twn": 0.0,
"step": 533
},
{
"epoch": 0.01335,
"grad_norm": 12.625,
"grad_norm_var": 11511.286051432291,
"learning_rate": 0.0001,
"loss": 8.027,
"loss/crossentropy": 2.671614646911621,
"loss/hidden": 0.11279296875,
"loss/logits": 0.0071898894384503365,
"loss/reg": 5.235414028167725,
"loss/twn": 0.0,
"step": 534
},
{
"epoch": 0.013375,
"grad_norm": 372.0,
"grad_norm_var": 18087.790104166666,
"learning_rate": 0.0001,
"loss": 8.0188,
"loss/crossentropy": 2.639939308166504,
"loss/hidden": 0.134765625,
"loss/logits": 0.008616717532277107,
"loss/reg": 5.235448360443115,
"loss/twn": 0.0,
"step": 535
},
{
"epoch": 0.0134,
"grad_norm": 9.0625,
"grad_norm_var": 18096.311393229167,
"learning_rate": 0.0001,
"loss": 7.087,
"loss/crossentropy": 1.6764798164367676,
"loss/hidden": 0.1630859375,
"loss/logits": 0.012176426127552986,
"loss/reg": 5.235233783721924,
"loss/twn": 0.0,
"step": 536
},
{
"epoch": 0.013425,
"grad_norm": 9.3125,
"grad_norm_var": 18100.0994140625,
"learning_rate": 0.0001,
"loss": 6.986,
"loss/crossentropy": 1.620530605316162,
"loss/hidden": 0.12255859375,
"loss/logits": 0.007392015308141708,
"loss/reg": 5.235503673553467,
"loss/twn": 0.0,
"step": 537
},
{
"epoch": 0.01345,
"grad_norm": 16.125,
"grad_norm_var": 7998.4609375,
"learning_rate": 0.0001,
"loss": 8.4295,
"loss/crossentropy": 3.019792079925537,
"loss/hidden": 0.1591796875,
"loss/logits": 0.015434058383107185,
"loss/reg": 5.235138893127441,
"loss/twn": 0.0,
"step": 538
},
{
"epoch": 0.013475,
"grad_norm": 18.5,
"grad_norm_var": 7986.272119140625,
"learning_rate": 0.0001,
"loss": 6.5863,
"loss/crossentropy": 0.9472768902778625,
"loss/hidden": 0.3984375,
"loss/logits": 0.005067166872322559,
"loss/reg": 5.235495090484619,
"loss/twn": 0.0,
"step": 539
},
{
"epoch": 0.0135,
"grad_norm": 68.0,
"grad_norm_var": 7973.117822265625,
"learning_rate": 0.0001,
"loss": 7.0399,
"loss/crossentropy": 1.5947017669677734,
"loss/hidden": 0.201171875,
"loss/logits": 0.008832491934299469,
"loss/reg": 5.235156059265137,
"loss/twn": 0.0,
"step": 540
},
{
"epoch": 0.013525,
"grad_norm": 696.0,
"grad_norm_var": 34468.181884765625,
"learning_rate": 0.0001,
"loss": 8.1707,
"loss/crossentropy": 2.8003129959106445,
"loss/hidden": 0.123046875,
"loss/logits": 0.012298551388084888,
"loss/reg": 5.235071659088135,
"loss/twn": 0.0,
"step": 541
},
{
"epoch": 0.01355,
"grad_norm": 11.5,
"grad_norm_var": 34462.002197265625,
"learning_rate": 0.0001,
"loss": 7.5471,
"loss/crossentropy": 2.217853546142578,
"loss/hidden": 0.08642578125,
"loss/logits": 0.007477154955267906,
"loss/reg": 5.235381126403809,
"loss/twn": 0.0,
"step": 542
},
{
"epoch": 0.013575,
"grad_norm": 12.6875,
"grad_norm_var": 34416.96235351563,
"learning_rate": 0.0001,
"loss": 8.3214,
"loss/crossentropy": 2.926238775253296,
"loss/hidden": 0.14453125,
"loss/logits": 0.015387848019599915,
"loss/reg": 5.235249042510986,
"loss/twn": 0.0,
"step": 543
},
{
"epoch": 0.0136,
"grad_norm": 78.0,
"grad_norm_var": 34233.16352539063,
"learning_rate": 0.0001,
"loss": 8.227,
"loss/crossentropy": 2.8347182273864746,
"loss/hidden": 0.1416015625,
"loss/logits": 0.015542502515017986,
"loss/reg": 5.235121250152588,
"loss/twn": 0.0,
"step": 544
},
{
"epoch": 0.013625,
"grad_norm": 9.625,
"grad_norm_var": 34306.78292643229,
"learning_rate": 0.0001,
"loss": 7.7463,
"loss/crossentropy": 2.451880693435669,
"loss/hidden": 0.0546875,
"loss/logits": 0.004437028430402279,
"loss/reg": 5.2353057861328125,
"loss/twn": 0.0,
"step": 545
},
{
"epoch": 0.01365,
"grad_norm": 11.3125,
"grad_norm_var": 34320.69907226563,
"learning_rate": 0.0001,
"loss": 6.514,
"loss/crossentropy": 1.0950896739959717,
"loss/hidden": 0.1787109375,
"loss/logits": 0.004933930933475494,
"loss/reg": 5.235309600830078,
"loss/twn": 0.0,
"step": 546
},
{
"epoch": 0.013675,
"grad_norm": 19.375,
"grad_norm_var": 34234.075374348955,
"learning_rate": 0.0001,
"loss": 7.917,
"loss/crossentropy": 2.5059616565704346,
"loss/hidden": 0.1650390625,
"loss/logits": 0.010931363329291344,
"loss/reg": 5.23504114151001,
"loss/twn": 0.0,
"step": 547
},
{
"epoch": 0.0137,
"grad_norm": 13.0625,
"grad_norm_var": 34258.751155598955,
"learning_rate": 0.0001,
"loss": 8.0647,
"loss/crossentropy": 2.657578706741333,
"loss/hidden": 0.1640625,
"loss/logits": 0.007689584046602249,
"loss/reg": 5.235403060913086,
"loss/twn": 0.0,
"step": 548
},
{
"epoch": 0.013725,
"grad_norm": 8.6875,
"grad_norm_var": 34598.19524739583,
"learning_rate": 0.0001,
"loss": 7.7748,
"loss/crossentropy": 2.4471874237060547,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006248220801353455,
"loss/reg": 5.234949588775635,
"loss/twn": 0.0,
"step": 549
},
{
"epoch": 0.01375,
"grad_norm": 20.125,
"grad_norm_var": 34528.96868489583,
"learning_rate": 0.0001,
"loss": 8.0594,
"loss/crossentropy": 2.625808000564575,
"loss/hidden": 0.1826171875,
"loss/logits": 0.015631355345249176,
"loss/reg": 5.235373020172119,
"loss/twn": 0.0,
"step": 550
},
{
"epoch": 0.013775,
"grad_norm": 13.875,
"grad_norm_var": 28880.479427083334,
"learning_rate": 0.0001,
"loss": 5.896,
"loss/crossentropy": 0.3797203600406647,
"loss/hidden": 0.27734375,
"loss/logits": 0.003970766440033913,
"loss/reg": 5.234944820404053,
"loss/twn": 0.0,
"step": 551
},
{
"epoch": 0.0138,
"grad_norm": 37.0,
"grad_norm_var": 28726.655843098957,
"learning_rate": 0.0001,
"loss": 8.1785,
"loss/crossentropy": 2.7710931301116943,
"loss/hidden": 0.1650390625,
"loss/logits": 0.007251654751598835,
"loss/reg": 5.235079765319824,
"loss/twn": 0.0,
"step": 552
},
{
"epoch": 0.013825,
"grad_norm": 10.4375,
"grad_norm_var": 28718.351936848958,
"learning_rate": 0.0001,
"loss": 7.4216,
"loss/crossentropy": 2.1164746284484863,
"loss/hidden": 0.06689453125,
"loss/logits": 0.003332372521981597,
"loss/reg": 5.234862327575684,
"loss/twn": 0.0,
"step": 553
},
{
"epoch": 0.01385,
"grad_norm": 8.625,
"grad_norm_var": 28771.012093098958,
"learning_rate": 0.0001,
"loss": 6.9919,
"loss/crossentropy": 1.6055660247802734,
"loss/hidden": 0.140625,
"loss/logits": 0.010463319718837738,
"loss/reg": 5.235208034515381,
"loss/twn": 0.0,
"step": 554
},
{
"epoch": 0.013875,
"grad_norm": 19.375,
"grad_norm_var": 28765.65818684896,
"learning_rate": 0.0001,
"loss": 7.6993,
"loss/crossentropy": 2.3250389099121094,
"loss/hidden": 0.1328125,
"loss/logits": 0.006799938622862101,
"loss/reg": 5.234645843505859,
"loss/twn": 0.0,
"step": 555
},
{
"epoch": 0.0139,
"grad_norm": 28.0,
"grad_norm_var": 28848.887353515624,
"learning_rate": 0.0001,
"loss": 6.6932,
"loss/crossentropy": 1.2838351726531982,
"loss/hidden": 0.16796875,
"loss/logits": 0.00634372141212225,
"loss/reg": 5.235021591186523,
"loss/twn": 0.0,
"step": 556
},
{
"epoch": 0.013925,
"grad_norm": 209.0,
"grad_norm_var": 2527.298291015625,
"learning_rate": 0.0001,
"loss": 8.2303,
"loss/crossentropy": 2.8924214839935303,
"loss/hidden": 0.095703125,
"loss/logits": 0.007457260973751545,
"loss/reg": 5.234717845916748,
"loss/twn": 0.0,
"step": 557
},
{
"epoch": 0.01395,
"grad_norm": 14.25,
"grad_norm_var": 2520.284358723958,
"learning_rate": 0.0001,
"loss": 7.1526,
"loss/crossentropy": 1.7737421989440918,
"loss/hidden": 0.134765625,
"loss/logits": 0.009325024671852589,
"loss/reg": 5.2347235679626465,
"loss/twn": 0.0,
"step": 558
},
{
"epoch": 0.013975,
"grad_norm": 31.75,
"grad_norm_var": 2493.68125,
"learning_rate": 0.0001,
"loss": 7.8427,
"loss/crossentropy": 2.5061957836151123,
"loss/hidden": 0.09326171875,
"loss/logits": 0.008374359458684921,
"loss/reg": 5.234871864318848,
"loss/twn": 0.0,
"step": 559
},
{
"epoch": 0.014,
"grad_norm": 9.5625,
"grad_norm_var": 2378.353369140625,
"learning_rate": 0.0001,
"loss": 8.2168,
"loss/crossentropy": 2.899165391921997,
"loss/hidden": 0.0791015625,
"loss/logits": 0.0035689827054739,
"loss/reg": 5.234927654266357,
"loss/twn": 0.0,
"step": 560
},
{
"epoch": 0.014025,
"grad_norm": 53.0,
"grad_norm_var": 2383.8656087239583,
"learning_rate": 0.0001,
"loss": 5.8024,
"loss/crossentropy": 0.35774412751197815,
"loss/hidden": 0.2021484375,
"loss/logits": 0.00793472956866026,
"loss/reg": 5.234549045562744,
"loss/twn": 0.0,
"step": 561
},
{
"epoch": 0.01405,
"grad_norm": 64.5,
"grad_norm_var": 2415.985872395833,
"learning_rate": 0.0001,
"loss": 7.1944,
"loss/crossentropy": 1.7934857606887817,
"loss/hidden": 0.15625,
"loss/logits": 0.01002482883632183,
"loss/reg": 5.234671115875244,
"loss/twn": 0.0,
"step": 562
},
{
"epoch": 0.014075,
"grad_norm": 7.78125,
"grad_norm_var": 2448.6008422851564,
"learning_rate": 0.0001,
"loss": 7.6461,
"loss/crossentropy": 2.308584451675415,
"loss/hidden": 0.09619140625,
"loss/logits": 0.006569989956915379,
"loss/reg": 5.23472261428833,
"loss/twn": 0.0,
"step": 563
},
{
"epoch": 0.0141,
"grad_norm": 55.25,
"grad_norm_var": 2440.2951782226564,
"learning_rate": 0.0001,
"loss": 7.8559,
"loss/crossentropy": 2.439448118209839,
"loss/hidden": 0.16796875,
"loss/logits": 0.01364111714065075,
"loss/reg": 5.234842300415039,
"loss/twn": 0.0,
"step": 564
},
{
"epoch": 0.014125,
"grad_norm": 16.25,
"grad_norm_var": 2415.370438639323,
"learning_rate": 0.0001,
"loss": 8.3004,
"loss/crossentropy": 2.8731632232666016,
"loss/hidden": 0.1787109375,
"loss/logits": 0.014065857976675034,
"loss/reg": 5.23447847366333,
"loss/twn": 0.0,
"step": 565
},
{
"epoch": 0.01415,
"grad_norm": 11.8125,
"grad_norm_var": 2438.8619099934895,
"learning_rate": 0.0001,
"loss": 7.1178,
"loss/crossentropy": 1.780933141708374,
"loss/hidden": 0.09375,
"loss/logits": 0.008497532457113266,
"loss/reg": 5.234607696533203,
"loss/twn": 0.0,
"step": 566
},
{
"epoch": 0.014175,
"grad_norm": 14.375,
"grad_norm_var": 2437.342248535156,
"learning_rate": 0.0001,
"loss": 8.2505,
"loss/crossentropy": 2.875947952270508,
"loss/hidden": 0.125,
"loss/logits": 0.0144406259059906,
"loss/reg": 5.235077857971191,
"loss/twn": 0.0,
"step": 567
},
{
"epoch": 0.0142,
"grad_norm": 22.75,
"grad_norm_var": 2449.9111938476562,
"learning_rate": 0.0001,
"loss": 7.0607,
"loss/crossentropy": 1.6746110916137695,
"loss/hidden": 0.146484375,
"loss/logits": 0.0051438165828585625,
"loss/reg": 5.234502792358398,
"loss/twn": 0.0,
"step": 568
},
{
"epoch": 0.014225,
"grad_norm": 19.625,
"grad_norm_var": 2423.817736816406,
"learning_rate": 0.0001,
"loss": 7.989,
"loss/crossentropy": 2.5603058338165283,
"loss/hidden": 0.1826171875,
"loss/logits": 0.011639740318059921,
"loss/reg": 5.234450340270996,
"loss/twn": 0.0,
"step": 569
},
{
"epoch": 0.01425,
"grad_norm": 12.5625,
"grad_norm_var": 2410.089807128906,
"learning_rate": 0.0001,
"loss": 6.1451,
"loss/crossentropy": 0.8017870187759399,
"loss/hidden": 0.10498046875,
"loss/logits": 0.004113970324397087,
"loss/reg": 5.2342610359191895,
"loss/twn": 0.0,
"step": 570
},
{
"epoch": 0.014275,
"grad_norm": 7.96875,
"grad_norm_var": 2444.820947265625,
"learning_rate": 0.0001,
"loss": 6.3145,
"loss/crossentropy": 0.9622921943664551,
"loss/hidden": 0.10791015625,
"loss/logits": 0.00957135483622551,
"loss/reg": 5.234708786010742,
"loss/twn": 0.0,
"step": 571
},
{
"epoch": 0.0143,
"grad_norm": 9.5,
"grad_norm_var": 2486.3206868489583,
"learning_rate": 0.0001,
"loss": 7.6128,
"loss/crossentropy": 2.2709686756134033,
"loss/hidden": 0.0986328125,
"loss/logits": 0.008640400134027004,
"loss/reg": 5.2345733642578125,
"loss/twn": 0.0,
"step": 572
},
{
"epoch": 0.014325,
"grad_norm": 7.6875,
"grad_norm_var": 348.6860026041667,
"learning_rate": 0.0001,
"loss": 7.5522,
"loss/crossentropy": 2.1828360557556152,
"loss/hidden": 0.126953125,
"loss/logits": 0.008249370381236076,
"loss/reg": 5.234208583831787,
"loss/twn": 0.0,
"step": 573
},
{
"epoch": 0.01435,
"grad_norm": 12.4375,
"grad_norm_var": 350.864306640625,
"learning_rate": 0.0001,
"loss": 8.2355,
"loss/crossentropy": 2.8409531116485596,
"loss/hidden": 0.1494140625,
"loss/logits": 0.01071943435817957,
"loss/reg": 5.234445095062256,
"loss/twn": 0.0,
"step": 574
},
{
"epoch": 0.014375,
"grad_norm": 9.8125,
"grad_norm_var": 353.3037109375,
"learning_rate": 0.0001,
"loss": 6.8632,
"loss/crossentropy": 1.4696354866027832,
"loss/hidden": 0.1513671875,
"loss/logits": 0.007870590314269066,
"loss/reg": 5.234313011169434,
"loss/twn": 0.0,
"step": 575
},
{
"epoch": 0.0144,
"grad_norm": 54.5,
"grad_norm_var": 411.406494140625,
"learning_rate": 0.0001,
"loss": 7.1332,
"loss/crossentropy": 1.8072994947433472,
"loss/hidden": 0.08642578125,
"loss/logits": 0.004960792139172554,
"loss/reg": 5.234533309936523,
"loss/twn": 0.0,
"step": 576
},
{
"epoch": 0.014425,
"grad_norm": 14.9375,
"grad_norm_var": 353.450390625,
"learning_rate": 0.0001,
"loss": 8.0425,
"loss/crossentropy": 2.6438798904418945,
"loss/hidden": 0.1494140625,
"loss/logits": 0.014983810484409332,
"loss/reg": 5.234223365783691,
"loss/twn": 0.0,
"step": 577
},
{
"epoch": 0.01445,
"grad_norm": 8.6875,
"grad_norm_var": 227.10193684895833,
"learning_rate": 0.0001,
"loss": 6.7681,
"loss/crossentropy": 1.3993828296661377,
"loss/hidden": 0.12890625,
"loss/logits": 0.005370709113776684,
"loss/reg": 5.234410285949707,
"loss/twn": 0.0,
"step": 578
},
{
"epoch": 0.014475,
"grad_norm": 30.875,
"grad_norm_var": 229.36620686848957,
"learning_rate": 0.0001,
"loss": 8.3016,
"loss/crossentropy": 2.88641095161438,
"loss/hidden": 0.166015625,
"loss/logits": 0.014844672754406929,
"loss/reg": 5.234356880187988,
"loss/twn": 0.0,
"step": 579
},
{
"epoch": 0.0145,
"grad_norm": 10.625,
"grad_norm_var": 140.0116170247396,
"learning_rate": 0.0001,
"loss": 6.7513,
"loss/crossentropy": 1.4178798198699951,
"loss/hidden": 0.09375,
"loss/logits": 0.005413350649178028,
"loss/reg": 5.234261989593506,
"loss/twn": 0.0,
"step": 580
},
{
"epoch": 0.014525,
"grad_norm": 17.875,
"grad_norm_var": 140.1169881184896,
"learning_rate": 0.0001,
"loss": 7.069,
"loss/crossentropy": 1.6242541074752808,
"loss/hidden": 0.19921875,
"loss/logits": 0.011282745748758316,
"loss/reg": 5.234253883361816,
"loss/twn": 0.0,
"step": 581
},
{
"epoch": 0.01455,
"grad_norm": 13.5,
"grad_norm_var": 139.2117146809896,
"learning_rate": 0.0001,
"loss": 6.743,
"loss/crossentropy": 1.388974666595459,
"loss/hidden": 0.1103515625,
"loss/logits": 0.009235072880983353,
"loss/reg": 5.234424591064453,
"loss/twn": 0.0,
"step": 582
},
{
"epoch": 0.014575,
"grad_norm": 13.125,
"grad_norm_var": 139.70227457682293,
"learning_rate": 0.0001,
"loss": 7.0183,
"loss/crossentropy": 1.626574993133545,
"loss/hidden": 0.15234375,
"loss/logits": 0.005102044437080622,
"loss/reg": 5.234281539916992,
"loss/twn": 0.0,
"step": 583
},
{
"epoch": 0.0146,
"grad_norm": 11.4375,
"grad_norm_var": 138.50621337890624,
"learning_rate": 0.0001,
"loss": 8.1816,
"loss/crossentropy": 2.712043285369873,
"loss/hidden": 0.21484375,
"loss/logits": 0.020563386380672455,
"loss/reg": 5.234140396118164,
"loss/twn": 0.0,
"step": 584
},
{
"epoch": 0.014625,
"grad_norm": 135.0,
"grad_norm_var": 1027.0439412434896,
"learning_rate": 0.0001,
"loss": 7.6221,
"loss/crossentropy": 2.309222936630249,
"loss/hidden": 0.07177734375,
"loss/logits": 0.00689616659656167,
"loss/reg": 5.234216213226318,
"loss/twn": 0.0,
"step": 585
},
{
"epoch": 0.01465,
"grad_norm": 12.3125,
"grad_norm_var": 1027.4010375976563,
"learning_rate": 0.0001,
"loss": 8.0345,
"loss/crossentropy": 2.648197650909424,
"loss/hidden": 0.14453125,
"loss/logits": 0.0074767498299479485,
"loss/reg": 5.2342705726623535,
"loss/twn": 0.0,
"step": 586
},
{
"epoch": 0.014675,
"grad_norm": 11.3125,
"grad_norm_var": 1021.3348307291667,
"learning_rate": 0.0001,
"loss": 6.9087,
"loss/crossentropy": 1.557716965675354,
"loss/hidden": 0.1103515625,
"loss/logits": 0.0062755015678703785,
"loss/reg": 5.2343974113464355,
"loss/twn": 0.0,
"step": 587
},
{
"epoch": 0.0147,
"grad_norm": 14.0,
"grad_norm_var": 1014.2895182291667,
"learning_rate": 0.0001,
"loss": 7.9829,
"loss/crossentropy": 2.599196195602417,
"loss/hidden": 0.1416015625,
"loss/logits": 0.008030779659748077,
"loss/reg": 5.23403787612915,
"loss/twn": 0.0,
"step": 588
},
{
"epoch": 0.014725,
"grad_norm": 10.75,
"grad_norm_var": 1008.364697265625,
"learning_rate": 0.0001,
"loss": 6.2994,
"loss/crossentropy": 0.9181722402572632,
"loss/hidden": 0.1455078125,
"loss/logits": 0.0018882363801822066,
"loss/reg": 5.2338151931762695,
"loss/twn": 0.0,
"step": 589
},
{
"epoch": 0.01475,
"grad_norm": 10.75,
"grad_norm_var": 1011.1046875,
"learning_rate": 0.0001,
"loss": 7.8917,
"loss/crossentropy": 2.6068289279937744,
"loss/hidden": 0.04736328125,
"loss/logits": 0.003562201978638768,
"loss/reg": 5.233980178833008,
"loss/twn": 0.0,
"step": 590
},
{
"epoch": 0.014775,
"grad_norm": 10.4375,
"grad_norm_var": 1009.9702473958333,
"learning_rate": 0.0001,
"loss": 6.9904,
"loss/crossentropy": 1.5700491666793823,
"loss/hidden": 0.1787109375,
"loss/logits": 0.0076020704582333565,
"loss/reg": 5.234048366546631,
"loss/twn": 0.0,
"step": 591
},
{
"epoch": 0.0148,
"grad_norm": 13.75,
"grad_norm_var": 946.7228515625,
"learning_rate": 0.0001,
"loss": 8.3853,
"loss/crossentropy": 2.9748241901397705,
"loss/hidden": 0.162109375,
"loss/logits": 0.014261037111282349,
"loss/reg": 5.234062194824219,
"loss/twn": 0.0,
"step": 592
},
{
"epoch": 0.014825,
"grad_norm": 20.625,
"grad_norm_var": 943.9872233072916,
"learning_rate": 0.0001,
"loss": 7.1549,
"loss/crossentropy": 1.6911969184875488,
"loss/hidden": 0.2158203125,
"loss/logits": 0.013989459723234177,
"loss/reg": 5.233931541442871,
"loss/twn": 0.0,
"step": 593
},
{
"epoch": 0.01485,
"grad_norm": 12.1875,
"grad_norm_var": 938.7426920572917,
"learning_rate": 0.0001,
"loss": 6.7309,
"loss/crossentropy": 1.2908926010131836,
"loss/hidden": 0.1943359375,
"loss/logits": 0.011645066551864147,
"loss/reg": 5.234016418457031,
"loss/twn": 0.0,
"step": 594
},
{
"epoch": 0.014875,
"grad_norm": 11.1875,
"grad_norm_var": 939.1067057291667,
"learning_rate": 0.0001,
"loss": 8.2017,
"loss/crossentropy": 2.890516757965088,
"loss/hidden": 0.07177734375,
"loss/logits": 0.005793450400233269,
"loss/reg": 5.233628273010254,
"loss/twn": 0.0,
"step": 595
},
{
"epoch": 0.0149,
"grad_norm": 9.75,
"grad_norm_var": 940.3130208333333,
"learning_rate": 0.0001,
"loss": 6.5188,
"loss/crossentropy": 1.189386010169983,
"loss/hidden": 0.09130859375,
"loss/logits": 0.004200034309178591,
"loss/reg": 5.233954906463623,
"loss/twn": 0.0,
"step": 596
},
{
"epoch": 0.014925,
"grad_norm": 7.09375,
"grad_norm_var": 951.3511678059896,
"learning_rate": 0.0001,
"loss": 6.7014,
"loss/crossentropy": 1.3759723901748657,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0052226390689611435,
"loss/reg": 5.233729362487793,
"loss/twn": 0.0,
"step": 597
},
{
"epoch": 0.01495,
"grad_norm": 8.875,
"grad_norm_var": 956.5892211914063,
"learning_rate": 0.0001,
"loss": 7.4921,
"loss/crossentropy": 2.140756368637085,
"loss/hidden": 0.10986328125,
"loss/logits": 0.007792431861162186,
"loss/reg": 5.233695983886719,
"loss/twn": 0.0,
"step": 598
},
{
"epoch": 0.014975,
"grad_norm": 12.5,
"grad_norm_var": 957.1479777018229,
"learning_rate": 0.0001,
"loss": 8.0241,
"loss/crossentropy": 2.6345574855804443,
"loss/hidden": 0.1435546875,
"loss/logits": 0.012438047677278519,
"loss/reg": 5.233548164367676,
"loss/twn": 0.0,
"step": 599
},
{
"epoch": 0.015,
"grad_norm": 10.0,
"grad_norm_var": 958.8220662434895,
"learning_rate": 0.0001,
"loss": 6.329,
"loss/crossentropy": 0.9624335765838623,
"loss/hidden": 0.125,
"loss/logits": 0.007683398202061653,
"loss/reg": 5.233921051025391,
"loss/twn": 0.0,
"step": 600
},
{
"epoch": 0.015025,
"grad_norm": 78.5,
"grad_norm_var": 287.54615478515626,
"learning_rate": 0.0001,
"loss": 8.289,
"loss/crossentropy": 2.931908369064331,
"loss/hidden": 0.11279296875,
"loss/logits": 0.010676998645067215,
"loss/reg": 5.233586311340332,
"loss/twn": 0.0,
"step": 601
},
{
"epoch": 0.01505,
"grad_norm": 11.125,
"grad_norm_var": 288.1986612955729,
"learning_rate": 0.0001,
"loss": 7.1402,
"loss/crossentropy": 1.7836761474609375,
"loss/hidden": 0.115234375,
"loss/logits": 0.007275612559169531,
"loss/reg": 5.234038352966309,
"loss/twn": 0.0,
"step": 602
},
{
"epoch": 0.015075,
"grad_norm": 214.0,
"grad_norm_var": 2734.4889933268228,
"learning_rate": 0.0001,
"loss": 8.0975,
"loss/crossentropy": 2.6364102363586426,
"loss/hidden": 0.220703125,
"loss/logits": 0.006685142405331135,
"loss/reg": 5.233693599700928,
"loss/twn": 0.0,
"step": 603
},
{
"epoch": 0.0151,
"grad_norm": 14.125,
"grad_norm_var": 2734.2487915039064,
"learning_rate": 0.0001,
"loss": 7.7466,
"loss/crossentropy": 2.511186122894287,
"loss/hidden": 6.556510925292969e-06,
"loss/logits": 0.0016271582571789622,
"loss/reg": 5.233736515045166,
"loss/twn": 0.0,
"step": 604
},
{
"epoch": 0.015125,
"grad_norm": 11.3125,
"grad_norm_var": 2732.938928222656,
"learning_rate": 0.0001,
"loss": 7.2908,
"loss/crossentropy": 1.9334180355072021,
"loss/hidden": 0.11279296875,
"loss/logits": 0.011062689125537872,
"loss/reg": 5.233515739440918,
"loss/twn": 0.0,
"step": 605
},
{
"epoch": 0.01515,
"grad_norm": 38.25,
"grad_norm_var": 2715.0710896809896,
"learning_rate": 0.0001,
"loss": 7.1654,
"loss/crossentropy": 1.7573686838150024,
"loss/hidden": 0.1630859375,
"loss/logits": 0.01111831609159708,
"loss/reg": 5.233856201171875,
"loss/twn": 0.0,
"step": 606
},
{
"epoch": 0.015175,
"grad_norm": 10.9375,
"grad_norm_var": 2713.767053222656,
"learning_rate": 0.0001,
"loss": 8.0656,
"loss/crossentropy": 2.725510835647583,
"loss/hidden": 0.09619140625,
"loss/logits": 0.010500291362404823,
"loss/reg": 5.233447074890137,
"loss/twn": 0.0,
"step": 607
},
{
"epoch": 0.0152,
"grad_norm": 15.4375,
"grad_norm_var": 2710.229455566406,
"learning_rate": 0.0001,
"loss": 8.219,
"loss/crossentropy": 2.8715081214904785,
"loss/hidden": 0.103515625,
"loss/logits": 0.010522611439228058,
"loss/reg": 5.233500003814697,
"loss/twn": 0.0,
"step": 608
},
{
"epoch": 0.015225,
"grad_norm": 12.75,
"grad_norm_var": 2724.336779785156,
"learning_rate": 0.0001,
"loss": 7.4159,
"loss/crossentropy": 2.050481081008911,
"loss/hidden": 0.1201171875,
"loss/logits": 0.012026194483041763,
"loss/reg": 5.233264446258545,
"loss/twn": 0.0,
"step": 609
},
{
"epoch": 0.01525,
"grad_norm": 102.5,
"grad_norm_var": 3021.098010253906,
"learning_rate": 0.0001,
"loss": 6.1157,
"loss/crossentropy": 0.7093434929847717,
"loss/hidden": 0.162109375,
"loss/logits": 0.010594572871923447,
"loss/reg": 5.233696460723877,
"loss/twn": 0.0,
"step": 610
},
{
"epoch": 0.015275,
"grad_norm": 11.0,
"grad_norm_var": 3021.7085571289062,
"learning_rate": 0.0001,
"loss": 7.9153,
"loss/crossentropy": 2.5407986640930176,
"loss/hidden": 0.130859375,
"loss/logits": 0.010352734476327896,
"loss/reg": 5.233257293701172,
"loss/twn": 0.0,
"step": 611
},
{
"epoch": 0.0153,
"grad_norm": 9.3125,
"grad_norm_var": 3023.223173014323,
"learning_rate": 0.0001,
"loss": 7.4287,
"loss/crossentropy": 2.056551694869995,
"loss/hidden": 0.1298828125,
"loss/logits": 0.008765427395701408,
"loss/reg": 5.233468055725098,
"loss/twn": 0.0,
"step": 612
},
{
"epoch": 0.015325,
"grad_norm": 39.25,
"grad_norm_var": 2966.133268229167,
"learning_rate": 0.0001,
"loss": 7.0537,
"loss/crossentropy": 1.7130509614944458,
"loss/hidden": 0.10205078125,
"loss/logits": 0.005178738851100206,
"loss/reg": 5.233448028564453,
"loss/twn": 0.0,
"step": 613
},
{
"epoch": 0.01535,
"grad_norm": 15.6875,
"grad_norm_var": 2943.0399576822915,
"learning_rate": 0.0001,
"loss": 7.4671,
"loss/crossentropy": 2.072944402694702,
"loss/hidden": 0.138671875,
"loss/logits": 0.021782729774713516,
"loss/reg": 5.233670234680176,
"loss/twn": 0.0,
"step": 614
},
{
"epoch": 0.015375,
"grad_norm": 40.5,
"grad_norm_var": 2897.146207682292,
"learning_rate": 0.0001,
"loss": 7.2931,
"loss/crossentropy": 1.9488314390182495,
"loss/hidden": 0.10693359375,
"loss/logits": 0.004016375169157982,
"loss/reg": 5.233325481414795,
"loss/twn": 0.0,
"step": 615
},
{
"epoch": 0.0154,
"grad_norm": 17.875,
"grad_norm_var": 2869.870817057292,
"learning_rate": 0.0001,
"loss": 7.5828,
"loss/crossentropy": 2.250316858291626,
"loss/hidden": 0.09375,
"loss/logits": 0.005294739734381437,
"loss/reg": 5.233473300933838,
"loss/twn": 0.0,
"step": 616
},
{
"epoch": 0.015425,
"grad_norm": 13.6875,
"grad_norm_var": 2801.0919270833333,
"learning_rate": 0.0001,
"loss": 7.946,
"loss/crossentropy": 2.605081796646118,
"loss/hidden": 0.09619140625,
"loss/logits": 0.01103608775883913,
"loss/reg": 5.233642578125,
"loss/twn": 0.0,
"step": 617
},
{
"epoch": 0.01545,
"grad_norm": 10.8125,
"grad_norm_var": 2802.1390462239583,
"learning_rate": 0.0001,
"loss": 7.9096,
"loss/crossentropy": 2.6560287475585938,
"loss/hidden": 0.016357421875,
"loss/logits": 0.0036827209405601025,
"loss/reg": 5.233491897583008,
"loss/twn": 0.0,
"step": 618
},
{
"epoch": 0.015475,
"grad_norm": 10.5625,
"grad_norm_var": 563.0020833333333,
"learning_rate": 0.0001,
"loss": 7.9054,
"loss/crossentropy": 2.542119026184082,
"loss/hidden": 0.12255859375,
"loss/logits": 0.007251254748553038,
"loss/reg": 5.2334303855896,
"loss/twn": 0.0,
"step": 619
},
{
"epoch": 0.0155,
"grad_norm": 17.875,
"grad_norm_var": 559.2559895833333,
"learning_rate": 0.0001,
"loss": 8.014,
"loss/crossentropy": 2.6021788120269775,
"loss/hidden": 0.1708984375,
"loss/logits": 0.007565245497971773,
"loss/reg": 5.23338508605957,
"loss/twn": 0.0,
"step": 620
},
{
"epoch": 0.015525,
"grad_norm": 21.625,
"grad_norm_var": 548.9945149739583,
"learning_rate": 0.0001,
"loss": 8.1369,
"loss/crossentropy": 2.7238121032714844,
"loss/hidden": 0.1650390625,
"loss/logits": 0.0146188298240304,
"loss/reg": 5.233391284942627,
"loss/twn": 0.0,
"step": 621
},
{
"epoch": 0.01555,
"grad_norm": 26.875,
"grad_norm_var": 535.8540201822917,
"learning_rate": 0.0001,
"loss": 6.8563,
"loss/crossentropy": 1.4800411462783813,
"loss/hidden": 0.134765625,
"loss/logits": 0.008334355428814888,
"loss/reg": 5.233196258544922,
"loss/twn": 0.0,
"step": 622
},
{
"epoch": 0.015575,
"grad_norm": 53.0,
"grad_norm_var": 575.73671875,
"learning_rate": 0.0001,
"loss": 8.042,
"loss/crossentropy": 2.7313811779022217,
"loss/hidden": 0.07275390625,
"loss/logits": 0.004363874904811382,
"loss/reg": 5.23349666595459,
"loss/twn": 0.0,
"step": 623
},
{
"epoch": 0.0156,
"grad_norm": 11.9375,
"grad_norm_var": 581.51171875,
"learning_rate": 0.0001,
"loss": 7.9383,
"loss/crossentropy": 2.561565637588501,
"loss/hidden": 0.12890625,
"loss/logits": 0.014568326994776726,
"loss/reg": 5.233224868774414,
"loss/twn": 0.0,
"step": 624
},
{
"epoch": 0.015625,
"grad_norm": 17.875,
"grad_norm_var": 574.1311848958334,
"learning_rate": 0.0001,
"loss": 7.8326,
"loss/crossentropy": 2.5030391216278076,
"loss/hidden": 0.08642578125,
"loss/logits": 0.009584227576851845,
"loss/reg": 5.233513355255127,
"loss/twn": 0.0,
"step": 625
},
{
"epoch": 0.01565,
"grad_norm": 14.8125,
"grad_norm_var": 163.484228515625,
"learning_rate": 0.0001,
"loss": 6.0231,
"loss/crossentropy": 0.6415687203407288,
"loss/hidden": 0.1376953125,
"loss/logits": 0.010496280156075954,
"loss/reg": 5.233306884765625,
"loss/twn": 0.0,
"step": 626
},
{
"epoch": 0.015675,
"grad_norm": 20.125,
"grad_norm_var": 156.77355143229167,
"learning_rate": 0.0001,
"loss": 6.8528,
"loss/crossentropy": 1.486595630645752,
"loss/hidden": 0.126953125,
"loss/logits": 0.005804477259516716,
"loss/reg": 5.2334794998168945,
"loss/twn": 0.0,
"step": 627
},
{
"epoch": 0.0157,
"grad_norm": 18.0,
"grad_norm_var": 147.53177083333333,
"learning_rate": 0.0001,
"loss": 6.2299,
"loss/crossentropy": 0.6993483304977417,
"loss/hidden": 0.287109375,
"loss/logits": 0.010279776528477669,
"loss/reg": 5.233189582824707,
"loss/twn": 0.0,
"step": 628
},
{
"epoch": 0.015725,
"grad_norm": 9.0625,
"grad_norm_var": 134.67849934895833,
"learning_rate": 0.0001,
"loss": 7.3681,
"loss/crossentropy": 2.038987874984741,
"loss/hidden": 0.08642578125,
"loss/logits": 0.009393742308020592,
"loss/reg": 5.2332634925842285,
"loss/twn": 0.0,
"step": 629
},
{
"epoch": 0.01575,
"grad_norm": 13.625,
"grad_norm_var": 136.13567708333332,
"learning_rate": 0.0001,
"loss": 7.7675,
"loss/crossentropy": 2.3605449199676514,
"loss/hidden": 0.16796875,
"loss/logits": 0.006129886955022812,
"loss/reg": 5.232873916625977,
"loss/twn": 0.0,
"step": 630
},
{
"epoch": 0.015775,
"grad_norm": 14.6875,
"grad_norm_var": 106.847900390625,
"learning_rate": 0.0001,
"loss": 8.0393,
"loss/crossentropy": 2.6303060054779053,
"loss/hidden": 0.1689453125,
"loss/logits": 0.007038387469947338,
"loss/reg": 5.23299503326416,
"loss/twn": 0.0,
"step": 631
},
{
"epoch": 0.0158,
"grad_norm": 72.0,
"grad_norm_var": 287.03904622395834,
"learning_rate": 0.0001,
"loss": 8.0743,
"loss/crossentropy": 2.7000534534454346,
"loss/hidden": 0.130859375,
"loss/logits": 0.010081219486892223,
"loss/reg": 5.233316421508789,
"loss/twn": 0.0,
"step": 632
},
{
"epoch": 0.015825,
"grad_norm": 39.0,
"grad_norm_var": 300.17649739583334,
"learning_rate": 0.0001,
"loss": 7.4968,
"loss/crossentropy": 2.066685676574707,
"loss/hidden": 0.1875,
"loss/logits": 0.009645121172070503,
"loss/reg": 5.233018398284912,
"loss/twn": 0.0,
"step": 633
},
{
"epoch": 0.01585,
"grad_norm": 11.125,
"grad_norm_var": 299.664697265625,
"learning_rate": 0.0001,
"loss": 7.4915,
"loss/crossentropy": 2.1979663372039795,
"loss/hidden": 0.0546875,
"loss/logits": 0.0055001177825033665,
"loss/reg": 5.23332405090332,
"loss/twn": 0.0,
"step": 634
},
{
"epoch": 0.015875,
"grad_norm": 13.5625,
"grad_norm_var": 295.147509765625,
"learning_rate": 0.0001,
"loss": 8.1414,
"loss/crossentropy": 2.8441083431243896,
"loss/hidden": 0.05712890625,
"loss/logits": 0.007112159393727779,
"loss/reg": 5.233099460601807,
"loss/twn": 0.0,
"step": 635
},
{
"epoch": 0.0159,
"grad_norm": 14.4375,
"grad_norm_var": 298.4408854166667,
"learning_rate": 0.0001,
"loss": 7.8434,
"loss/crossentropy": 2.419523239135742,
"loss/hidden": 0.1787109375,
"loss/logits": 0.011547038331627846,
"loss/reg": 5.233601093292236,
"loss/twn": 0.0,
"step": 636
},
{
"epoch": 0.015925,
"grad_norm": 135.0,
"grad_norm_var": 1077.4806640625,
"learning_rate": 0.0001,
"loss": 7.9874,
"loss/crossentropy": 2.4825010299682617,
"loss/hidden": 0.25390625,
"loss/logits": 0.017681429162621498,
"loss/reg": 5.233287811279297,
"loss/twn": 0.0,
"step": 637
},
{
"epoch": 0.01595,
"grad_norm": 79.5,
"grad_norm_var": 1226.39296875,
"learning_rate": 0.0001,
"loss": 5.6235,
"loss/crossentropy": 0.20704708993434906,
"loss/hidden": 0.1806640625,
"loss/logits": 0.0022514096926897764,
"loss/reg": 5.233528137207031,
"loss/twn": 0.0,
"step": 638
},
{
"epoch": 0.015975,
"grad_norm": 12.3125,
"grad_norm_var": 1224.665869140625,
"learning_rate": 0.0001,
"loss": 7.9267,
"loss/crossentropy": 2.5261454582214355,
"loss/hidden": 0.1572265625,
"loss/logits": 0.010081654414534569,
"loss/reg": 5.233248233795166,
"loss/twn": 0.0,
"step": 639
},
{
"epoch": 0.016,
"grad_norm": 384.0,
"grad_norm_var": 8927.619205729166,
"learning_rate": 0.0001,
"loss": 7.0656,
"loss/crossentropy": 1.7351843118667603,
"loss/hidden": 0.0927734375,
"loss/logits": 0.004287827759981155,
"loss/reg": 5.233373641967773,
"loss/twn": 0.0,
"step": 640
},
{
"epoch": 0.016025,
"grad_norm": 18.125,
"grad_norm_var": 8926.408268229166,
"learning_rate": 0.0001,
"loss": 7.0317,
"loss/crossentropy": 1.592779278755188,
"loss/hidden": 0.193359375,
"loss/logits": 0.012589013203978539,
"loss/reg": 5.232937335968018,
"loss/twn": 0.0,
"step": 641
},
{
"epoch": 0.01605,
"grad_norm": 9.125,
"grad_norm_var": 8958.401936848959,
"learning_rate": 0.0001,
"loss": 8.2427,
"loss/crossentropy": 2.891831874847412,
"loss/hidden": 0.1123046875,
"loss/logits": 0.005527087952941656,
"loss/reg": 5.233004093170166,
"loss/twn": 0.0,
"step": 642
},
{
"epoch": 0.016075,
"grad_norm": 93.0,
"grad_norm_var": 8961.362483723959,
"learning_rate": 0.0001,
"loss": 6.9493,
"loss/crossentropy": 1.5797322988510132,
"loss/hidden": 0.1259765625,
"loss/logits": 0.010628938674926758,
"loss/reg": 5.232941627502441,
"loss/twn": 0.0,
"step": 643
},
{
"epoch": 0.0161,
"grad_norm": 9.875,
"grad_norm_var": 9009.401546223959,
"learning_rate": 0.0001,
"loss": 8.1281,
"loss/crossentropy": 2.7545292377471924,
"loss/hidden": 0.1328125,
"loss/logits": 0.007755103521049023,
"loss/reg": 5.232993125915527,
"loss/twn": 0.0,
"step": 644
},
{
"epoch": 0.016125,
"grad_norm": 28.75,
"grad_norm_var": 8905.093684895834,
"learning_rate": 0.0001,
"loss": 7.9328,
"loss/crossentropy": 2.643615245819092,
"loss/hidden": 0.054443359375,
"loss/logits": 0.0016623031115159392,
"loss/reg": 5.2330780029296875,
"loss/twn": 0.0,
"step": 645
},
{
"epoch": 0.01615,
"grad_norm": 20.625,
"grad_norm_var": 8865.565559895833,
"learning_rate": 0.0001,
"loss": 6.9237,
"loss/crossentropy": 1.4672698974609375,
"loss/hidden": 0.2138671875,
"loss/logits": 0.009410521015524864,
"loss/reg": 5.233164310455322,
"loss/twn": 0.0,
"step": 646
},
{
"epoch": 0.016175,
"grad_norm": 15.25,
"grad_norm_var": 8862.209749348958,
"learning_rate": 0.0001,
"loss": 7.9973,
"loss/crossentropy": 2.6482272148132324,
"loss/hidden": 0.10791015625,
"loss/logits": 0.00825223047286272,
"loss/reg": 5.232880592346191,
"loss/twn": 0.0,
"step": 647
},
{
"epoch": 0.0162,
"grad_norm": 13.6875,
"grad_norm_var": 8979.335872395834,
"learning_rate": 0.0001,
"loss": 6.1538,
"loss/crossentropy": 0.7194666862487793,
"loss/hidden": 0.1884765625,
"loss/logits": 0.012791863642632961,
"loss/reg": 5.233090877532959,
"loss/twn": 0.0,
"step": 648
},
{
"epoch": 0.016225,
"grad_norm": 24.875,
"grad_norm_var": 9023.984114583332,
"learning_rate": 0.0001,
"loss": 7.1305,
"loss/crossentropy": 1.7185572385787964,
"loss/hidden": 0.1669921875,
"loss/logits": 0.011910820379853249,
"loss/reg": 5.233007431030273,
"loss/twn": 0.0,
"step": 649
},
{
"epoch": 0.01625,
"grad_norm": 12.4375,
"grad_norm_var": 9016.378108723959,
"learning_rate": 0.0001,
"loss": 7.4389,
"loss/crossentropy": 2.082362174987793,
"loss/hidden": 0.11279296875,
"loss/logits": 0.010908122174441814,
"loss/reg": 5.232817649841309,
"loss/twn": 0.0,
"step": 650
},
{
"epoch": 0.016275,
"grad_norm": 8.9375,
"grad_norm_var": 9043.443994140625,
"learning_rate": 0.0001,
"loss": 7.4177,
"loss/crossentropy": 2.062870502471924,
"loss/hidden": 0.11328125,
"loss/logits": 0.008752668276429176,
"loss/reg": 5.232777118682861,
"loss/twn": 0.0,
"step": 651
},
{
"epoch": 0.0163,
"grad_norm": 12.0625,
"grad_norm_var": 9056.640087890624,
"learning_rate": 0.0001,
"loss": 7.9823,
"loss/crossentropy": 2.6078176498413086,
"loss/hidden": 0.1328125,
"loss/logits": 0.008596043102443218,
"loss/reg": 5.23306941986084,
"loss/twn": 0.0,
"step": 652
},
{
"epoch": 0.016325,
"grad_norm": 122.5,
"grad_norm_var": 8932.818473307292,
"learning_rate": 0.0001,
"loss": 7.9264,
"loss/crossentropy": 2.5046682357788086,
"loss/hidden": 0.1748046875,
"loss/logits": 0.014288893900811672,
"loss/reg": 5.232659339904785,
"loss/twn": 0.0,
"step": 653
},
{
"epoch": 0.01635,
"grad_norm": 21.5,
"grad_norm_var": 8946.382014973959,
"learning_rate": 0.0001,
"loss": 7.2315,
"loss/crossentropy": 1.8828779458999634,
"loss/hidden": 0.10986328125,
"loss/logits": 0.005740518681704998,
"loss/reg": 5.23300838470459,
"loss/twn": 0.0,
"step": 654
},
{
"epoch": 0.016375,
"grad_norm": 352.0,
"grad_norm_var": 14431.18515625,
"learning_rate": 0.0001,
"loss": 6.8525,
"loss/crossentropy": 1.471374273300171,
"loss/hidden": 0.138671875,
"loss/logits": 0.009916655719280243,
"loss/reg": 5.232522010803223,
"loss/twn": 0.0,
"step": 655
},
{
"epoch": 0.0164,
"grad_norm": 33.75,
"grad_norm_var": 7512.653125,
"learning_rate": 0.0001,
"loss": 6.2595,
"loss/crossentropy": 0.7713863253593445,
"loss/hidden": 0.244140625,
"loss/logits": 0.01109264511615038,
"loss/reg": 5.232911109924316,
"loss/twn": 0.0,
"step": 656
},
{
"epoch": 0.016425,
"grad_norm": 15.0625,
"grad_norm_var": 7526.165608723958,
"learning_rate": 0.0001,
"loss": 8.093,
"loss/crossentropy": 2.709456205368042,
"loss/hidden": 0.1337890625,
"loss/logits": 0.01727338880300522,
"loss/reg": 5.23248815536499,
"loss/twn": 0.0,
"step": 657
},
{
"epoch": 0.01645,
"grad_norm": 9.5625,
"grad_norm_var": 7523.817122395833,
"learning_rate": 0.0001,
"loss": 6.334,
"loss/crossentropy": 0.9124006032943726,
"loss/hidden": 0.1806640625,
"loss/logits": 0.007904157042503357,
"loss/reg": 5.2329912185668945,
"loss/twn": 0.0,
"step": 658
},
{
"epoch": 0.016475,
"grad_norm": 9.625,
"grad_norm_var": 7476.006770833334,
"learning_rate": 0.0001,
"loss": 7.0775,
"loss/crossentropy": 1.669623613357544,
"loss/hidden": 0.1640625,
"loss/logits": 0.011253604665398598,
"loss/reg": 5.232557773590088,
"loss/twn": 0.0,
"step": 659
},
{
"epoch": 0.0165,
"grad_norm": 9.8125,
"grad_norm_var": 7476.294775390625,
"learning_rate": 0.0001,
"loss": 8.0574,
"loss/crossentropy": 2.7882609367370605,
"loss/hidden": 0.03515625,
"loss/logits": 0.0012600821210071445,
"loss/reg": 5.23272705078125,
"loss/twn": 0.0,
"step": 660
},
{
"epoch": 0.016525,
"grad_norm": 64.5,
"grad_norm_var": 7481.564176432292,
"learning_rate": 0.0001,
"loss": 7.1712,
"loss/crossentropy": 1.5283738374710083,
"loss/hidden": 0.40234375,
"loss/logits": 0.008047623559832573,
"loss/reg": 5.2324652671813965,
"loss/twn": 0.0,
"step": 661
},
{
"epoch": 0.01655,
"grad_norm": 14.0625,
"grad_norm_var": 7507.016080729167,
"learning_rate": 0.0001,
"loss": 6.9527,
"loss/crossentropy": 1.5561813116073608,
"loss/hidden": 0.150390625,
"loss/logits": 0.013308672234416008,
"loss/reg": 5.232776641845703,
"loss/twn": 0.0,
"step": 662
},
{
"epoch": 0.016575,
"grad_norm": 8.5625,
"grad_norm_var": 7537.432014973959,
"learning_rate": 0.0001,
"loss": 6.883,
"loss/crossentropy": 1.4900498390197754,
"loss/hidden": 0.15234375,
"loss/logits": 0.008140160702168941,
"loss/reg": 5.232505798339844,
"loss/twn": 0.0,
"step": 663
},
{
"epoch": 0.0166,
"grad_norm": 10.4375,
"grad_norm_var": 7552.011311848958,
"learning_rate": 0.0001,
"loss": 7.9879,
"loss/crossentropy": 2.649501085281372,
"loss/hidden": 0.0986328125,
"loss/logits": 0.006926264148205519,
"loss/reg": 5.2328362464904785,
"loss/twn": 0.0,
"step": 664
},
{
"epoch": 0.016625,
"grad_norm": 20.875,
"grad_norm_var": 7564.067561848959,
"learning_rate": 0.0001,
"loss": 7.9074,
"loss/crossentropy": 2.5822925567626953,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006065651774406433,
"loss/reg": 5.232655048370361,
"loss/twn": 0.0,
"step": 665
},
{
"epoch": 0.01665,
"grad_norm": 10.125,
"grad_norm_var": 7574.551497395833,
"learning_rate": 0.0001,
"loss": 8.0297,
"loss/crossentropy": 2.7020585536956787,
"loss/hidden": 0.08642578125,
"loss/logits": 0.008298511616885662,
"loss/reg": 5.232966899871826,
"loss/twn": 0.0,
"step": 666
},
{
"epoch": 0.016675,
"grad_norm": 11.3125,
"grad_norm_var": 7563.417447916667,
"learning_rate": 0.0001,
"loss": 6.2821,
"loss/crossentropy": 0.8871417045593262,
"loss/hidden": 0.1572265625,
"loss/logits": 0.005114687606692314,
"loss/reg": 5.232606410980225,
"loss/twn": 0.0,
"step": 667
},
{
"epoch": 0.0167,
"grad_norm": 16.75,
"grad_norm_var": 7543.980192057292,
"learning_rate": 0.0001,
"loss": 7.9399,
"loss/crossentropy": 2.610417604446411,
"loss/hidden": 0.08642578125,
"loss/logits": 0.010183998383581638,
"loss/reg": 5.2328290939331055,
"loss/twn": 0.0,
"step": 668
},
{
"epoch": 0.016725,
"grad_norm": 7.71875,
"grad_norm_var": 7191.31181233724,
"learning_rate": 0.0001,
"loss": 7.23,
"loss/crossentropy": 1.891376256942749,
"loss/hidden": 0.09765625,
"loss/logits": 0.008510958403348923,
"loss/reg": 5.232450485229492,
"loss/twn": 0.0,
"step": 669
},
{
"epoch": 0.01675,
"grad_norm": 16.5,
"grad_norm_var": 7204.193322753907,
"learning_rate": 0.0001,
"loss": 6.8222,
"loss/crossentropy": 1.4736872911453247,
"loss/hidden": 0.10546875,
"loss/logits": 0.010109667666256428,
"loss/reg": 5.232895374298096,
"loss/twn": 0.0,
"step": 670
},
{
"epoch": 0.016775,
"grad_norm": 17.0,
"grad_norm_var": 200.33785400390624,
"learning_rate": 0.0001,
"loss": 8.133,
"loss/crossentropy": 2.7790119647979736,
"loss/hidden": 0.11279296875,
"loss/logits": 0.008631115779280663,
"loss/reg": 5.232613563537598,
"loss/twn": 0.0,
"step": 671
},
{
"epoch": 0.0168,
"grad_norm": 21.875,
"grad_norm_var": 182.99231363932293,
"learning_rate": 0.0001,
"loss": 8.1379,
"loss/crossentropy": 2.766389846801758,
"loss/hidden": 0.12890625,
"loss/logits": 0.009614645503461361,
"loss/reg": 5.232971668243408,
"loss/twn": 0.0,
"step": 672
},
{
"epoch": 0.016825,
"grad_norm": 10.1875,
"grad_norm_var": 185.4031534830729,
"learning_rate": 0.0001,
"loss": 7.7498,
"loss/crossentropy": 2.4996728897094727,
"loss/hidden": 0.0140380859375,
"loss/logits": 0.0034008692018687725,
"loss/reg": 5.2326860427856445,
"loss/twn": 0.0,
"step": 673
},
{
"epoch": 0.01685,
"grad_norm": 11.6875,
"grad_norm_var": 183.8099568684896,
"learning_rate": 0.0001,
"loss": 6.7223,
"loss/crossentropy": 1.3949775695800781,
"loss/hidden": 0.0927734375,
"loss/logits": 0.0020429021678864956,
"loss/reg": 5.23252010345459,
"loss/twn": 0.0,
"step": 674
},
{
"epoch": 0.016875,
"grad_norm": 11.0,
"grad_norm_var": 182.70172119140625,
"learning_rate": 0.0001,
"loss": 7.9354,
"loss/crossentropy": 2.6207685470581055,
"loss/hidden": 0.07666015625,
"loss/logits": 0.005197848193347454,
"loss/reg": 5.232755184173584,
"loss/twn": 0.0,
"step": 675
},
{
"epoch": 0.0169,
"grad_norm": 20.5,
"grad_norm_var": 180.45289306640626,
"learning_rate": 0.0001,
"loss": 8.3505,
"loss/crossentropy": 2.9047460556030273,
"loss/hidden": 0.193359375,
"loss/logits": 0.019472159445285797,
"loss/reg": 5.232929706573486,
"loss/twn": 0.0,
"step": 676
},
{
"epoch": 0.016925,
"grad_norm": 8.5,
"grad_norm_var": 22.29664306640625,
"learning_rate": 0.0001,
"loss": 7.8368,
"loss/crossentropy": 2.4469549655914307,
"loss/hidden": 0.1484375,
"loss/logits": 0.008610617369413376,
"loss/reg": 5.232751369476318,
"loss/twn": 0.0,
"step": 677
},
{
"epoch": 0.01695,
"grad_norm": 14.9375,
"grad_norm_var": 22.402144368489584,
"learning_rate": 0.0001,
"loss": 7.847,
"loss/crossentropy": 2.5082645416259766,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007502686232328415,
"loss/reg": 5.2326483726501465,
"loss/twn": 0.0,
"step": 678
},
{
"epoch": 0.016975,
"grad_norm": 179.0,
"grad_norm_var": 1722.9600545247397,
"learning_rate": 0.0001,
"loss": 6.7476,
"loss/crossentropy": 1.1506078243255615,
"loss/hidden": 0.35546875,
"loss/logits": 0.008823427371680737,
"loss/reg": 5.2327399253845215,
"loss/twn": 0.0,
"step": 679
},
{
"epoch": 0.017,
"grad_norm": 10.0625,
"grad_norm_var": 1723.6607381184897,
"learning_rate": 0.0001,
"loss": 7.9045,
"loss/crossentropy": 2.541614055633545,
"loss/hidden": 0.12109375,
"loss/logits": 0.008851654827594757,
"loss/reg": 5.232950687408447,
"loss/twn": 0.0,
"step": 680
},
{
"epoch": 0.017025,
"grad_norm": 38.5,
"grad_norm_var": 1735.1399373372396,
"learning_rate": 0.0001,
"loss": 6.9999,
"loss/crossentropy": 1.6033979654312134,
"loss/hidden": 0.1572265625,
"loss/logits": 0.006385164335370064,
"loss/reg": 5.232907772064209,
"loss/twn": 0.0,
"step": 681
},
{
"epoch": 0.01705,
"grad_norm": 17.625,
"grad_norm_var": 1723.4270467122396,
"learning_rate": 0.0001,
"loss": 7.998,
"loss/crossentropy": 2.6382601261138916,
"loss/hidden": 0.11279296875,
"loss/logits": 0.014375717379152775,
"loss/reg": 5.232568264007568,
"loss/twn": 0.0,
"step": 682
},
{
"epoch": 0.017075,
"grad_norm": 10.6875,
"grad_norm_var": 1724.6606079101562,
"learning_rate": 0.0001,
"loss": 6.5706,
"loss/crossentropy": 1.2485917806625366,
"loss/hidden": 0.08642578125,
"loss/logits": 0.002942750696092844,
"loss/reg": 5.232631206512451,
"loss/twn": 0.0,
"step": 683
},
{
"epoch": 0.0171,
"grad_norm": 55.25,
"grad_norm_var": 1770.930790201823,
"learning_rate": 0.0001,
"loss": 7.0946,
"loss/crossentropy": 1.641523003578186,
"loss/hidden": 0.212890625,
"loss/logits": 0.007442857138812542,
"loss/reg": 5.2327880859375,
"loss/twn": 0.0,
"step": 684
},
{
"epoch": 0.017125,
"grad_norm": 12.0,
"grad_norm_var": 1760.3909993489583,
"learning_rate": 0.0001,
"loss": 6.5653,
"loss/crossentropy": 1.1899452209472656,
"loss/hidden": 0.130859375,
"loss/logits": 0.0118794534355402,
"loss/reg": 5.232624530792236,
"loss/twn": 0.0,
"step": 685
},
{
"epoch": 0.01715,
"grad_norm": 11.0625,
"grad_norm_var": 1770.9077473958334,
"learning_rate": 0.0001,
"loss": 7.3521,
"loss/crossentropy": 2.0265913009643555,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006208708509802818,
"loss/reg": 5.2328410148620605,
"loss/twn": 0.0,
"step": 686
},
{
"epoch": 0.017175,
"grad_norm": 17.75,
"grad_norm_var": 1769.8311848958333,
"learning_rate": 0.0001,
"loss": 6.2951,
"loss/crossentropy": 0.9038959741592407,
"loss/hidden": 0.1513671875,
"loss/logits": 0.007222220301628113,
"loss/reg": 5.2325825691223145,
"loss/twn": 0.0,
"step": 687
},
{
"epoch": 0.0172,
"grad_norm": 137.0,
"grad_norm_var": 2501.6544270833333,
"learning_rate": 0.0001,
"loss": 7.9399,
"loss/crossentropy": 2.5502278804779053,
"loss/hidden": 0.1455078125,
"loss/logits": 0.011638427153229713,
"loss/reg": 5.232552528381348,
"loss/twn": 0.0,
"step": 688
},
{
"epoch": 0.017225,
"grad_norm": 16.25,
"grad_norm_var": 2483.604280598958,
"learning_rate": 0.0001,
"loss": 8.1215,
"loss/crossentropy": 2.7514421939849854,
"loss/hidden": 0.126953125,
"loss/logits": 0.010679876431822777,
"loss/reg": 5.232418537139893,
"loss/twn": 0.0,
"step": 689
},
{
"epoch": 0.01725,
"grad_norm": 9.75,
"grad_norm_var": 2490.0520182291666,
"learning_rate": 0.0001,
"loss": 7.1755,
"loss/crossentropy": 1.8485499620437622,
"loss/hidden": 0.08837890625,
"loss/logits": 0.005838857963681221,
"loss/reg": 5.2327117919921875,
"loss/twn": 0.0,
"step": 690
},
{
"epoch": 0.017275,
"grad_norm": 12.5,
"grad_norm_var": 2485.2692057291665,
"learning_rate": 0.0001,
"loss": 7.1405,
"loss/crossentropy": 1.8488080501556396,
"loss/hidden": 0.0546875,
"loss/logits": 0.0044571696780622005,
"loss/reg": 5.232502460479736,
"loss/twn": 0.0,
"step": 691
},
{
"epoch": 0.0173,
"grad_norm": 1448.0,
"grad_norm_var": 126949.88639322917,
"learning_rate": 0.0001,
"loss": 6.8951,
"loss/crossentropy": 1.4486842155456543,
"loss/hidden": 0.2001953125,
"loss/logits": 0.013465155847370625,
"loss/reg": 5.23276424407959,
"loss/twn": 0.0,
"step": 692
},
{
"epoch": 0.017325,
"grad_norm": 192.0,
"grad_norm_var": 126205.7556640625,
"learning_rate": 0.0001,
"loss": 6.7837,
"loss/crossentropy": 1.3860431909561157,
"loss/hidden": 0.1591796875,
"loss/logits": 0.006227361969649792,
"loss/reg": 5.232283592224121,
"loss/twn": 0.0,
"step": 693
},
{
"epoch": 0.01735,
"grad_norm": 8.8125,
"grad_norm_var": 126307.29348958333,
"learning_rate": 0.0001,
"loss": 7.7481,
"loss/crossentropy": 2.4704062938690186,
"loss/hidden": 0.0400390625,
"loss/logits": 0.004876245744526386,
"loss/reg": 5.232789993286133,
"loss/twn": 0.0,
"step": 694
},
{
"epoch": 0.017375,
"grad_norm": 13.9375,
"grad_norm_var": 127064.13084309896,
"learning_rate": 0.0001,
"loss": 7.4,
"loss/crossentropy": 2.0642364025115967,
"loss/hidden": 0.09375,
"loss/logits": 0.009522231295704842,
"loss/reg": 5.2324628829956055,
"loss/twn": 0.0,
"step": 695
},
{
"epoch": 0.0174,
"grad_norm": 9.3125,
"grad_norm_var": 127075.72967122396,
"learning_rate": 0.0001,
"loss": 7.1133,
"loss/crossentropy": 1.8082016706466675,
"loss/hidden": 0.0693359375,
"loss/logits": 0.0033433041535317898,
"loss/reg": 5.232382297515869,
"loss/twn": 0.0,
"step": 696
},
{
"epoch": 0.017425,
"grad_norm": 15.5,
"grad_norm_var": 127376.05935872396,
"learning_rate": 0.0001,
"loss": 8.0679,
"loss/crossentropy": 2.6998050212860107,
"loss/hidden": 0.125,
"loss/logits": 0.011031926609575748,
"loss/reg": 5.232105731964111,
"loss/twn": 0.0,
"step": 697
},
{
"epoch": 0.01745,
"grad_norm": 16.5,
"grad_norm_var": 127392.12693684896,
"learning_rate": 0.0001,
"loss": 7.1902,
"loss/crossentropy": 1.788904070854187,
"loss/hidden": 0.158203125,
"loss/logits": 0.010435621254146099,
"loss/reg": 5.232694149017334,
"loss/twn": 0.0,
"step": 698
},
{
"epoch": 0.017475,
"grad_norm": 26.25,
"grad_norm_var": 127171.84055989583,
"learning_rate": 0.0001,
"loss": 8.0089,
"loss/crossentropy": 2.5942537784576416,
"loss/hidden": 0.1728515625,
"loss/logits": 0.009584179148077965,
"loss/reg": 5.232229232788086,
"loss/twn": 0.0,
"step": 699
},
{
"epoch": 0.0175,
"grad_norm": 7.375,
"grad_norm_var": 127761.07708333334,
"learning_rate": 0.0001,
"loss": 7.1149,
"loss/crossentropy": 1.7783420085906982,
"loss/hidden": 0.0986328125,
"loss/logits": 0.0052945781499147415,
"loss/reg": 5.232630252838135,
"loss/twn": 0.0,
"step": 700
},
{
"epoch": 0.017525,
"grad_norm": 14.875,
"grad_norm_var": 127719.3791015625,
"learning_rate": 0.0001,
"loss": 8.0908,
"loss/crossentropy": 2.7603297233581543,
"loss/hidden": 0.08642578125,
"loss/logits": 0.011579321697354317,
"loss/reg": 5.232461929321289,
"loss/twn": 0.0,
"step": 701
},
{
"epoch": 0.01755,
"grad_norm": 25.75,
"grad_norm_var": 127515.01248372396,
"learning_rate": 0.0001,
"loss": 7.8012,
"loss/crossentropy": 2.4607961177825928,
"loss/hidden": 0.0986328125,
"loss/logits": 0.009348167106509209,
"loss/reg": 5.232429504394531,
"loss/twn": 0.0,
"step": 702
},
{
"epoch": 0.017575,
"grad_norm": 36.25,
"grad_norm_var": 127276.2372233073,
"learning_rate": 0.0001,
"loss": 7.7355,
"loss/crossentropy": 2.2735610008239746,
"loss/hidden": 0.216796875,
"loss/logits": 0.012781517580151558,
"loss/reg": 5.232335090637207,
"loss/twn": 0.0,
"step": 703
},
{
"epoch": 0.0176,
"grad_norm": 12.8125,
"grad_norm_var": 128031.16139322917,
"learning_rate": 0.0001,
"loss": 7.7608,
"loss/crossentropy": 2.4146182537078857,
"loss/hidden": 0.10595703125,
"loss/logits": 0.007775201462209225,
"loss/reg": 5.232488632202148,
"loss/twn": 0.0,
"step": 704
},
{
"epoch": 0.017625,
"grad_norm": 12.875,
"grad_norm_var": 128077.03854166667,
"learning_rate": 0.0001,
"loss": 7.6663,
"loss/crossentropy": 2.3123011589050293,
"loss/hidden": 0.11279296875,
"loss/logits": 0.008896533399820328,
"loss/reg": 5.232283115386963,
"loss/twn": 0.0,
"step": 705
},
{
"epoch": 0.01765,
"grad_norm": 9.8125,
"grad_norm_var": 128076.14998372395,
"learning_rate": 0.0001,
"loss": 6.7063,
"loss/crossentropy": 1.328172206878662,
"loss/hidden": 0.138671875,
"loss/logits": 0.006915399804711342,
"loss/reg": 5.232503414154053,
"loss/twn": 0.0,
"step": 706
},
{
"epoch": 0.017675,
"grad_norm": 10.25,
"grad_norm_var": 128107.63943684896,
"learning_rate": 0.0001,
"loss": 5.6211,
"loss/crossentropy": 0.28784415125846863,
"loss/hidden": 0.09521484375,
"loss/logits": 0.005815575830638409,
"loss/reg": 5.232184410095215,
"loss/twn": 0.0,
"step": 707
},
{
"epoch": 0.0177,
"grad_norm": 9.875,
"grad_norm_var": 2011.0417805989584,
"learning_rate": 0.0001,
"loss": 7.6542,
"loss/crossentropy": 2.2898013591766357,
"loss/hidden": 0.126953125,
"loss/logits": 0.005081703420728445,
"loss/reg": 5.232325553894043,
"loss/twn": 0.0,
"step": 708
},
{
"epoch": 0.017725,
"grad_norm": 9.6875,
"grad_norm_var": 62.6244140625,
"learning_rate": 0.0001,
"loss": 6.8514,
"loss/crossentropy": 1.4818062782287598,
"loss/hidden": 0.1328125,
"loss/logits": 0.0045208255760371685,
"loss/reg": 5.232298851013184,
"loss/twn": 0.0,
"step": 709
},
{
"epoch": 0.01775,
"grad_norm": 14.9375,
"grad_norm_var": 59.92239583333333,
"learning_rate": 0.0001,
"loss": 7.9023,
"loss/crossentropy": 2.559528112411499,
"loss/hidden": 0.103515625,
"loss/logits": 0.006969613488763571,
"loss/reg": 5.232254505157471,
"loss/twn": 0.0,
"step": 710
},
{
"epoch": 0.017775,
"grad_norm": 11.375,
"grad_norm_var": 60.82394205729167,
"learning_rate": 0.0001,
"loss": 6.8892,
"loss/crossentropy": 1.5907115936279297,
"loss/hidden": 0.064453125,
"loss/logits": 0.0018022289732471108,
"loss/reg": 5.232196807861328,
"loss/twn": 0.0,
"step": 711
},
{
"epoch": 0.0178,
"grad_norm": 12.5,
"grad_norm_var": 58.95045572916667,
"learning_rate": 0.0001,
"loss": 8.0051,
"loss/crossentropy": 2.6705007553100586,
"loss/hidden": 0.09619140625,
"loss/logits": 0.005942562595009804,
"loss/reg": 5.232419967651367,
"loss/twn": 0.0,
"step": 712
},
{
"epoch": 0.017825,
"grad_norm": 7.6875,
"grad_norm_var": 62.675634765625,
"learning_rate": 0.0001,
"loss": 7.4538,
"loss/crossentropy": 2.1238763332366943,
"loss/hidden": 0.09130859375,
"loss/logits": 0.006380223203450441,
"loss/reg": 5.232213497161865,
"loss/twn": 0.0,
"step": 713
},
{
"epoch": 0.01785,
"grad_norm": 11.5,
"grad_norm_var": 63.188655598958334,
"learning_rate": 0.0001,
"loss": 7.0007,
"loss/crossentropy": 1.551921010017395,
"loss/hidden": 0.2099609375,
"loss/logits": 0.006598391104489565,
"loss/reg": 5.2322468757629395,
"loss/twn": 0.0,
"step": 714
},
{
"epoch": 0.017875,
"grad_norm": 12.5625,
"grad_norm_var": 53.66087239583333,
"learning_rate": 0.0001,
"loss": 8.5447,
"loss/crossentropy": 3.1880903244018555,
"loss/hidden": 0.11474609375,
"loss/logits": 0.009867793880403042,
"loss/reg": 5.2319817543029785,
"loss/twn": 0.0,
"step": 715
},
{
"epoch": 0.0179,
"grad_norm": 8.8125,
"grad_norm_var": 52.566650390625,
"learning_rate": 0.0001,
"loss": 7.3708,
"loss/crossentropy": 2.0305135250091553,
"loss/hidden": 0.10107421875,
"loss/logits": 0.00682512391358614,
"loss/reg": 5.232370853424072,
"loss/twn": 0.0,
"step": 716
},
{
"epoch": 0.017925,
"grad_norm": 9.5,
"grad_norm_var": 53.636051432291666,
"learning_rate": 0.0001,
"loss": 6.9861,
"loss/crossentropy": 1.6283918619155884,
"loss/hidden": 0.1201171875,
"loss/logits": 0.0056338622234761715,
"loss/reg": 5.231950759887695,
"loss/twn": 0.0,
"step": 717
},
{
"epoch": 0.01795,
"grad_norm": 16.375,
"grad_norm_var": 43.831363932291666,
"learning_rate": 0.0001,
"loss": 7.5541,
"loss/crossentropy": 2.1077566146850586,
"loss/hidden": 0.2041015625,
"loss/logits": 0.009965687990188599,
"loss/reg": 5.232308387756348,
"loss/twn": 0.0,
"step": 718
},
{
"epoch": 0.017975,
"grad_norm": 12.8125,
"grad_norm_var": 5.275455729166667,
"learning_rate": 0.0001,
"loss": 8.3363,
"loss/crossentropy": 2.959925413131714,
"loss/hidden": 0.1298828125,
"loss/logits": 0.014515706337988377,
"loss/reg": 5.231976509094238,
"loss/twn": 0.0,
"step": 719
},
{
"epoch": 0.018,
"grad_norm": 13.1875,
"grad_norm_var": 5.351822916666666,
"learning_rate": 0.0001,
"loss": 8.3041,
"loss/crossentropy": 2.966278553009033,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007250492461025715,
"loss/reg": 5.2319159507751465,
"loss/twn": 0.0,
"step": 720
},
{
"epoch": 0.018025,
"grad_norm": 16.25,
"grad_norm_var": 6.689518229166667,
"learning_rate": 0.0001,
"loss": 8.1075,
"loss/crossentropy": 2.7332916259765625,
"loss/hidden": 0.12890625,
"loss/logits": 0.0132482023909688,
"loss/reg": 5.232076644897461,
"loss/twn": 0.0,
"step": 721
},
{
"epoch": 0.01805,
"grad_norm": 12.0,
"grad_norm_var": 6.439436848958334,
"learning_rate": 0.0001,
"loss": 6.6983,
"loss/crossentropy": 1.3738974332809448,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006151068024337292,
"loss/reg": 5.231801986694336,
"loss/twn": 0.0,
"step": 722
},
{
"epoch": 0.018075,
"grad_norm": 8.8125,
"grad_norm_var": 6.871809895833334,
"learning_rate": 0.0001,
"loss": 7.4411,
"loss/crossentropy": 2.184509515762329,
"loss/hidden": 0.02099609375,
"loss/logits": 0.00361478328704834,
"loss/reg": 5.23202657699585,
"loss/twn": 0.0,
"step": 723
},
{
"epoch": 0.0181,
"grad_norm": 13.75,
"grad_norm_var": 6.845572916666667,
"learning_rate": 0.0001,
"loss": 7.2513,
"loss/crossentropy": 1.84177827835083,
"loss/hidden": 0.16796875,
"loss/logits": 0.009550071321427822,
"loss/reg": 5.231957912445068,
"loss/twn": 0.0,
"step": 724
},
{
"epoch": 0.018125,
"grad_norm": 11.625,
"grad_norm_var": 6.486832682291666,
"learning_rate": 0.0001,
"loss": 8.1392,
"loss/crossentropy": 2.7632906436920166,
"loss/hidden": 0.1318359375,
"loss/logits": 0.012371431104838848,
"loss/reg": 5.231691360473633,
"loss/twn": 0.0,
"step": 725
},
{
"epoch": 0.01815,
"grad_norm": 13.625,
"grad_norm_var": 6.098893229166666,
"learning_rate": 0.0001,
"loss": 8.1336,
"loss/crossentropy": 2.7336745262145996,
"loss/hidden": 0.158203125,
"loss/logits": 0.00991059560328722,
"loss/reg": 5.231861114501953,
"loss/twn": 0.0,
"step": 726
},
{
"epoch": 0.018175,
"grad_norm": 9.3125,
"grad_norm_var": 6.543082682291667,
"learning_rate": 0.0001,
"loss": 8.1011,
"loss/crossentropy": 2.7824230194091797,
"loss/hidden": 0.08154296875,
"loss/logits": 0.005494968965649605,
"loss/reg": 5.231605052947998,
"loss/twn": 0.0,
"step": 727
},
{
"epoch": 0.0182,
"grad_norm": 214.0,
"grad_norm_var": 2560.450634765625,
"learning_rate": 0.0001,
"loss": 7.8675,
"loss/crossentropy": 2.6006758213043213,
"loss/hidden": 0.03271484375,
"loss/logits": 0.0022825347259640694,
"loss/reg": 5.231857776641846,
"loss/twn": 0.0,
"step": 728
},
{
"epoch": 0.018225,
"grad_norm": 10.25,
"grad_norm_var": 2555.1207682291665,
"learning_rate": 0.0001,
"loss": 7.9332,
"loss/crossentropy": 2.5436322689056396,
"loss/hidden": 0.150390625,
"loss/logits": 0.007534053176641464,
"loss/reg": 5.231611251831055,
"loss/twn": 0.0,
"step": 729
},
{
"epoch": 0.01825,
"grad_norm": 7.46875,
"grad_norm_var": 2563.203739420573,
"learning_rate": 0.0001,
"loss": 6.4295,
"loss/crossentropy": 1.0562663078308105,
"loss/hidden": 0.1357421875,
"loss/logits": 0.005792177282273769,
"loss/reg": 5.231747150421143,
"loss/twn": 0.0,
"step": 730
},
{
"epoch": 0.018275,
"grad_norm": 9.5,
"grad_norm_var": 2568.6221313476562,
"learning_rate": 0.0001,
"loss": 6.1,
"loss/crossentropy": 0.7717524766921997,
"loss/hidden": 0.0908203125,
"loss/logits": 0.005497816018760204,
"loss/reg": 5.231908321380615,
"loss/twn": 0.0,
"step": 731
},
{
"epoch": 0.0183,
"grad_norm": 15.25,
"grad_norm_var": 2558.0002563476564,
"learning_rate": 0.0001,
"loss": 8.2472,
"loss/crossentropy": 2.8519446849823,
"loss/hidden": 0.1484375,
"loss/logits": 0.015055367723107338,
"loss/reg": 5.231762886047363,
"loss/twn": 0.0,
"step": 732
},
{
"epoch": 0.018325,
"grad_norm": 18.625,
"grad_norm_var": 2544.823661295573,
"learning_rate": 0.0001,
"loss": 8.0862,
"loss/crossentropy": 2.667858600616455,
"loss/hidden": 0.1708984375,
"loss/logits": 0.015612177550792694,
"loss/reg": 5.231838703155518,
"loss/twn": 0.0,
"step": 733
},
{
"epoch": 0.01835,
"grad_norm": 11.0,
"grad_norm_var": 2552.937951660156,
"learning_rate": 0.0001,
"loss": 8.1949,
"loss/crossentropy": 2.86808705329895,
"loss/hidden": 0.08642578125,
"loss/logits": 0.00871000811457634,
"loss/reg": 5.231663703918457,
"loss/twn": 0.0,
"step": 734
},
{
"epoch": 0.018375,
"grad_norm": 10.375,
"grad_norm_var": 2557.2188110351562,
"learning_rate": 0.0001,
"loss": 5.7664,
"loss/crossentropy": 0.43561050295829773,
"loss/hidden": 0.0947265625,
"loss/logits": 0.004361970815807581,
"loss/reg": 5.231712818145752,
"loss/twn": 0.0,
"step": 735
},
{
"epoch": 0.0184,
"grad_norm": 10.375,
"grad_norm_var": 2562.0264282226562,
"learning_rate": 0.0001,
"loss": 5.8451,
"loss/crossentropy": 0.3734654188156128,
"loss/hidden": 0.232421875,
"loss/logits": 0.0075777387246489525,
"loss/reg": 5.231621265411377,
"loss/twn": 0.0,
"step": 736
},
{
"epoch": 0.018425,
"grad_norm": 14.625,
"grad_norm_var": 2563.9819295247394,
"learning_rate": 0.0001,
"loss": 6.356,
"loss/crossentropy": 0.9930484294891357,
"loss/hidden": 0.12451171875,
"loss/logits": 0.006776281166821718,
"loss/reg": 5.231681823730469,
"loss/twn": 0.0,
"step": 737
},
{
"epoch": 0.01845,
"grad_norm": 7.75,
"grad_norm_var": 2572.144364420573,
"learning_rate": 0.0001,
"loss": 6.7646,
"loss/crossentropy": 1.4421303272247314,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0040518054738640785,
"loss/reg": 5.231963157653809,
"loss/twn": 0.0,
"step": 738
},
{
"epoch": 0.018475,
"grad_norm": 13.5,
"grad_norm_var": 2563.933915201823,
"learning_rate": 0.0001,
"loss": 8.2343,
"loss/crossentropy": 2.907639741897583,
"loss/hidden": 0.08642578125,
"loss/logits": 0.008740050718188286,
"loss/reg": 5.23153829574585,
"loss/twn": 0.0,
"step": 739
},
{
"epoch": 0.0185,
"grad_norm": 13.875,
"grad_norm_var": 2563.7567342122397,
"learning_rate": 0.0001,
"loss": 8.0317,
"loss/crossentropy": 2.705251693725586,
"loss/hidden": 0.08642578125,
"loss/logits": 0.008279213681817055,
"loss/reg": 5.231719493865967,
"loss/twn": 0.0,
"step": 740
},
{
"epoch": 0.018525,
"grad_norm": 13.6875,
"grad_norm_var": 2560.4964803059897,
"learning_rate": 0.0001,
"loss": 6.2045,
"loss/crossentropy": 0.7274767160415649,
"loss/hidden": 0.236328125,
"loss/logits": 0.009199721738696098,
"loss/reg": 5.231486797332764,
"loss/twn": 0.0,
"step": 741
},
{
"epoch": 0.01855,
"grad_norm": 17.5,
"grad_norm_var": 2555.7768513997394,
"learning_rate": 0.0001,
"loss": 7.1638,
"loss/crossentropy": 1.6126173734664917,
"loss/hidden": 0.30078125,
"loss/logits": 0.018833626061677933,
"loss/reg": 5.231540203094482,
"loss/twn": 0.0,
"step": 742
},
{
"epoch": 0.018575,
"grad_norm": 63.25,
"grad_norm_var": 2626.092248535156,
"learning_rate": 0.0001,
"loss": 7.4974,
"loss/crossentropy": 1.9825630187988281,
"loss/hidden": 0.263671875,
"loss/logits": 0.019452253356575966,
"loss/reg": 5.231677055358887,
"loss/twn": 0.0,
"step": 743
},
{
"epoch": 0.0186,
"grad_norm": 11.5625,
"grad_norm_var": 172.0647420247396,
"learning_rate": 0.0001,
"loss": 7.2546,
"loss/crossentropy": 1.7497669458389282,
"loss/hidden": 0.259765625,
"loss/logits": 0.013276930898427963,
"loss/reg": 5.231839179992676,
"loss/twn": 0.0,
"step": 744
},
{
"epoch": 0.018625,
"grad_norm": 9.5,
"grad_norm_var": 172.62860921223958,
"learning_rate": 0.0001,
"loss": 7.8067,
"loss/crossentropy": 2.420611619949341,
"loss/hidden": 0.1455078125,
"loss/logits": 0.009184225462377071,
"loss/reg": 5.231430530548096,
"loss/twn": 0.0,
"step": 745
},
{
"epoch": 0.01865,
"grad_norm": 13.875,
"grad_norm_var": 168.34192708333333,
"learning_rate": 0.0001,
"loss": 6.583,
"loss/crossentropy": 1.1484166383743286,
"loss/hidden": 0.1943359375,
"loss/logits": 0.008578259497880936,
"loss/reg": 5.231657028198242,
"loss/twn": 0.0,
"step": 746
},
{
"epoch": 0.018675,
"grad_norm": 118.0,
"grad_norm_var": 811.6565104166667,
"learning_rate": 0.0001,
"loss": 6.9941,
"loss/crossentropy": 1.6018927097320557,
"loss/hidden": 0.154296875,
"loss/logits": 0.0065146745182573795,
"loss/reg": 5.231416702270508,
"loss/twn": 0.0,
"step": 747
},
{
"epoch": 0.0187,
"grad_norm": 17.625,
"grad_norm_var": 809.6587890625,
"learning_rate": 0.0001,
"loss": 7.9705,
"loss/crossentropy": 2.5781874656677246,
"loss/hidden": 0.150390625,
"loss/logits": 0.010151069611310959,
"loss/reg": 5.2317585945129395,
"loss/twn": 0.0,
"step": 748
},
{
"epoch": 0.018725,
"grad_norm": 15.75,
"grad_norm_var": 811.78359375,
"learning_rate": 0.0001,
"loss": 7.9057,
"loss/crossentropy": 2.5473275184631348,
"loss/hidden": 0.11767578125,
"loss/logits": 0.009290559217333794,
"loss/reg": 5.231451034545898,
"loss/twn": 0.0,
"step": 749
},
{
"epoch": 0.01875,
"grad_norm": 57.0,
"grad_norm_var": 872.6377604166667,
"learning_rate": 0.0001,
"loss": 5.8577,
"loss/crossentropy": 0.43267643451690674,
"loss/hidden": 0.1845703125,
"loss/logits": 0.008671510964632034,
"loss/reg": 5.231771469116211,
"loss/twn": 0.0,
"step": 750
},
{
"epoch": 0.018775,
"grad_norm": 12.875,
"grad_norm_var": 867.9815104166667,
"learning_rate": 0.0001,
"loss": 8.1731,
"loss/crossentropy": 2.8458542823791504,
"loss/hidden": 0.0888671875,
"loss/logits": 0.00722483079880476,
"loss/reg": 5.231186389923096,
"loss/twn": 0.0,
"step": 751
},
{
"epoch": 0.0188,
"grad_norm": 31.25,
"grad_norm_var": 852.6405598958333,
"learning_rate": 0.0001,
"loss": 6.5314,
"loss/crossentropy": 1.1035902500152588,
"loss/hidden": 0.19140625,
"loss/logits": 0.0045470790937542915,
"loss/reg": 5.231815814971924,
"loss/twn": 0.0,
"step": 752
},
{
"epoch": 0.018825,
"grad_norm": 9.6875,
"grad_norm_var": 862.2956868489583,
"learning_rate": 0.0001,
"loss": 5.7019,
"loss/crossentropy": 0.3551396429538727,
"loss/hidden": 0.1123046875,
"loss/logits": 0.003157853614538908,
"loss/reg": 5.2312774658203125,
"loss/twn": 0.0,
"step": 753
},
{
"epoch": 0.01885,
"grad_norm": 9.6875,
"grad_norm_var": 857.6431640625,
"learning_rate": 0.0001,
"loss": 7.8963,
"loss/crossentropy": 2.603400945663452,
"loss/hidden": 0.05712890625,
"loss/logits": 0.004422674421221018,
"loss/reg": 5.231386661529541,
"loss/twn": 0.0,
"step": 754
},
{
"epoch": 0.018875,
"grad_norm": 14.125,
"grad_norm_var": 856.56015625,
"learning_rate": 0.0001,
"loss": 7.4962,
"loss/crossentropy": 2.134359359741211,
"loss/hidden": 0.1201171875,
"loss/logits": 0.010161285288631916,
"loss/reg": 5.231522083282471,
"loss/twn": 0.0,
"step": 755
},
{
"epoch": 0.0189,
"grad_norm": 16.125,
"grad_norm_var": 852.990625,
"learning_rate": 0.0001,
"loss": 8.2635,
"loss/crossentropy": 2.91304874420166,
"loss/hidden": 0.10595703125,
"loss/logits": 0.01325392909348011,
"loss/reg": 5.231236934661865,
"loss/twn": 0.0,
"step": 756
},
{
"epoch": 0.018925,
"grad_norm": 8.75,
"grad_norm_var": 863.2577962239583,
"learning_rate": 0.0001,
"loss": 6.6673,
"loss/crossentropy": 1.2952691316604614,
"loss/hidden": 0.1337890625,
"loss/logits": 0.006863090209662914,
"loss/reg": 5.231356143951416,
"loss/twn": 0.0,
"step": 757
},
{
"epoch": 0.01895,
"grad_norm": 9.4375,
"grad_norm_var": 877.1677083333333,
"learning_rate": 0.0001,
"loss": 6.9557,
"loss/crossentropy": 1.5827580690383911,
"loss/hidden": 0.1328125,
"loss/logits": 0.008454329334199429,
"loss/reg": 5.231715202331543,
"loss/twn": 0.0,
"step": 758
},
{
"epoch": 0.018975,
"grad_norm": 16.0,
"grad_norm_var": 783.0122395833333,
"learning_rate": 0.0001,
"loss": 7.1366,
"loss/crossentropy": 1.7788175344467163,
"loss/hidden": 0.1201171875,
"loss/logits": 0.006595224142074585,
"loss/reg": 5.231091499328613,
"loss/twn": 0.0,
"step": 759
},
{
"epoch": 0.019,
"grad_norm": 9.75,
"grad_norm_var": 786.030712890625,
"learning_rate": 0.0001,
"loss": 8.1245,
"loss/crossentropy": 2.800469160079956,
"loss/hidden": 0.08642578125,
"loss/logits": 0.006068210117518902,
"loss/reg": 5.231574535369873,
"loss/twn": 0.0,
"step": 760
},
{
"epoch": 0.019025,
"grad_norm": 19.25,
"grad_norm_var": 774.305322265625,
"learning_rate": 0.0001,
"loss": 6.9579,
"loss/crossentropy": 1.6193712949752808,
"loss/hidden": 0.10107421875,
"loss/logits": 0.006502064410597086,
"loss/reg": 5.2309889793396,
"loss/twn": 0.0,
"step": 761
},
{
"epoch": 0.01905,
"grad_norm": 20.625,
"grad_norm_var": 768.311181640625,
"learning_rate": 0.0001,
"loss": 8.3539,
"loss/crossentropy": 2.9865291118621826,
"loss/hidden": 0.11962890625,
"loss/logits": 0.016289401799440384,
"loss/reg": 5.231488227844238,
"loss/twn": 0.0,
"step": 762
},
{
"epoch": 0.019075,
"grad_norm": 31.625,
"grad_norm_var": 153.429150390625,
"learning_rate": 0.0001,
"loss": 7.0021,
"loss/crossentropy": 1.6152843236923218,
"loss/hidden": 0.1494140625,
"loss/logits": 0.006050444208085537,
"loss/reg": 5.2313923835754395,
"loss/twn": 0.0,
"step": 763
},
{
"epoch": 0.0191,
"grad_norm": 39.25,
"grad_norm_var": 179.49178059895834,
"learning_rate": 0.0001,
"loss": 6.1086,
"loss/crossentropy": 0.6142204403877258,
"loss/hidden": 0.2578125,
"loss/logits": 0.005147262010723352,
"loss/reg": 5.231447219848633,
"loss/twn": 0.0,
"step": 764
},
{
"epoch": 0.019125,
"grad_norm": 28.0,
"grad_norm_var": 181.80779622395832,
"learning_rate": 0.0001,
"loss": 7.0546,
"loss/crossentropy": 1.6059232950210571,
"loss/hidden": 0.212890625,
"loss/logits": 0.00476275198161602,
"loss/reg": 5.231032371520996,
"loss/twn": 0.0,
"step": 765
},
{
"epoch": 0.01915,
"grad_norm": 11.875,
"grad_norm_var": 91.510791015625,
"learning_rate": 0.0001,
"loss": 7.9657,
"loss/crossentropy": 2.631168842315674,
"loss/hidden": 0.09619140625,
"loss/logits": 0.007133619859814644,
"loss/reg": 5.231229305267334,
"loss/twn": 0.0,
"step": 766
},
{
"epoch": 0.019175,
"grad_norm": 43.75,
"grad_norm_var": 129.911572265625,
"learning_rate": 0.0001,
"loss": 6.3914,
"loss/crossentropy": 1.015294075012207,
"loss/hidden": 0.13671875,
"loss/logits": 0.00839436985552311,
"loss/reg": 5.231020450592041,
"loss/twn": 0.0,
"step": 767
},
{
"epoch": 0.0192,
"grad_norm": 13.625,
"grad_norm_var": 122.769775390625,
"learning_rate": 0.0001,
"loss": 6.8334,
"loss/crossentropy": 1.33900785446167,
"loss/hidden": 0.251953125,
"loss/logits": 0.011178944259881973,
"loss/reg": 5.2312493324279785,
"loss/twn": 0.0,
"step": 768
},
{
"epoch": 0.019225,
"grad_norm": 11.5625,
"grad_norm_var": 120.699462890625,
"learning_rate": 0.0001,
"loss": 6.9225,
"loss/crossentropy": 1.5955766439437866,
"loss/hidden": 0.0908203125,
"loss/logits": 0.005241988226771355,
"loss/reg": 5.230888366699219,
"loss/twn": 0.0,
"step": 769
},
{
"epoch": 0.01925,
"grad_norm": 17.125,
"grad_norm_var": 114.95670572916667,
"learning_rate": 0.0001,
"loss": 8.1154,
"loss/crossentropy": 2.7243542671203613,
"loss/hidden": 0.142578125,
"loss/logits": 0.017005622386932373,
"loss/reg": 5.231443881988525,
"loss/twn": 0.0,
"step": 770
},
{
"epoch": 0.019275,
"grad_norm": 306.0,
"grad_norm_var": 5232.954427083333,
"learning_rate": 0.0001,
"loss": 7.935,
"loss/crossentropy": 2.5942704677581787,
"loss/hidden": 0.10205078125,
"loss/logits": 0.007505115121603012,
"loss/reg": 5.231191635131836,
"loss/twn": 0.0,
"step": 771
},
{
"epoch": 0.0193,
"grad_norm": 13.0,
"grad_norm_var": 5242.542643229167,
"learning_rate": 0.0001,
"loss": 7.9428,
"loss/crossentropy": 2.644366979598999,
"loss/hidden": 0.0595703125,
"loss/logits": 0.007467132993042469,
"loss/reg": 5.231389045715332,
"loss/twn": 0.0,
"step": 772
},
{
"epoch": 0.019325,
"grad_norm": 49.5,
"grad_norm_var": 5190.246809895833,
"learning_rate": 0.0001,
"loss": 6.8313,
"loss/crossentropy": 1.4516693353652954,
"loss/hidden": 0.1416015625,
"loss/logits": 0.006859183311462402,
"loss/reg": 5.231204032897949,
"loss/twn": 0.0,
"step": 773
},
{
"epoch": 0.01935,
"grad_norm": 10.25,
"grad_norm_var": 5186.974593098958,
"learning_rate": 0.0001,
"loss": 8.2807,
"loss/crossentropy": 3.0418143272399902,
"loss/hidden": 0.00469970703125,
"loss/logits": 0.002671225229278207,
"loss/reg": 5.23149299621582,
"loss/twn": 0.0,
"step": 774
},
{
"epoch": 0.019375,
"grad_norm": 12.8125,
"grad_norm_var": 5197.841145833333,
"learning_rate": 0.0001,
"loss": 7.9916,
"loss/crossentropy": 2.7271320819854736,
"loss/hidden": 0.0302734375,
"loss/logits": 0.00295096542686224,
"loss/reg": 5.231270790100098,
"loss/twn": 0.0,
"step": 775
},
{
"epoch": 0.0194,
"grad_norm": 23.0,
"grad_norm_var": 5155.59296875,
"learning_rate": 0.0001,
"loss": 8.1296,
"loss/crossentropy": 2.7584662437438965,
"loss/hidden": 0.126953125,
"loss/logits": 0.012552576139569283,
"loss/reg": 5.231604099273682,
"loss/twn": 0.0,
"step": 776
},
{
"epoch": 0.019425,
"grad_norm": 14.1875,
"grad_norm_var": 5171.675634765625,
"learning_rate": 0.0001,
"loss": 5.5239,
"loss/crossentropy": 0.21823735535144806,
"loss/hidden": 0.0693359375,
"loss/logits": 0.005251707974821329,
"loss/reg": 5.231090545654297,
"loss/twn": 0.0,
"step": 777
},
{
"epoch": 0.01945,
"grad_norm": 11.875,
"grad_norm_var": 5199.516129557292,
"learning_rate": 0.0001,
"loss": 7.0206,
"loss/crossentropy": 1.654329538345337,
"loss/hidden": 0.12451171875,
"loss/logits": 0.01058058813214302,
"loss/reg": 5.231167316436768,
"loss/twn": 0.0,
"step": 778
},
{
"epoch": 0.019475,
"grad_norm": 31.75,
"grad_norm_var": 5199.380192057291,
"learning_rate": 0.0001,
"loss": 7.4021,
"loss/crossentropy": 2.093484878540039,
"loss/hidden": 0.07568359375,
"loss/logits": 0.0018800008110702038,
"loss/reg": 5.231100082397461,
"loss/twn": 0.0,
"step": 779
},
{
"epoch": 0.0195,
"grad_norm": 230.0,
"grad_norm_var": 7458.277457682291,
"learning_rate": 0.0001,
"loss": 7.047,
"loss/crossentropy": 1.6409082412719727,
"loss/hidden": 0.1650390625,
"loss/logits": 0.00975135900080204,
"loss/reg": 5.231270790100098,
"loss/twn": 0.0,
"step": 780
},
{
"epoch": 0.019525,
"grad_norm": 17.875,
"grad_norm_var": 7496.773551432291,
"learning_rate": 0.0001,
"loss": 8.1639,
"loss/crossentropy": 2.775987148284912,
"loss/hidden": 0.150390625,
"loss/logits": 0.006807660683989525,
"loss/reg": 5.230672836303711,
"loss/twn": 0.0,
"step": 781
},
{
"epoch": 0.01955,
"grad_norm": 13.6875,
"grad_norm_var": 7487.490625,
"learning_rate": 0.0001,
"loss": 6.7803,
"loss/crossentropy": 1.4760520458221436,
"loss/hidden": 0.0693359375,
"loss/logits": 0.003954825457185507,
"loss/reg": 5.230950355529785,
"loss/twn": 0.0,
"step": 782
},
{
"epoch": 0.019575,
"grad_norm": 15.0625,
"grad_norm_var": 7567.613916015625,
"learning_rate": 0.0001,
"loss": 8.3797,
"loss/crossentropy": 3.015629291534424,
"loss/hidden": 0.11962890625,
"loss/logits": 0.013525455258786678,
"loss/reg": 5.230944633483887,
"loss/twn": 0.0,
"step": 783
},
{
"epoch": 0.0196,
"grad_norm": 13.125,
"grad_norm_var": 7570.018343098958,
"learning_rate": 0.0001,
"loss": 8.0947,
"loss/crossentropy": 2.698845863342285,
"loss/hidden": 0.1474609375,
"loss/logits": 0.017367932945489883,
"loss/reg": 5.231037616729736,
"loss/twn": 0.0,
"step": 784
},
{
"epoch": 0.019625,
"grad_norm": 98.0,
"grad_norm_var": 7600.609114583333,
"learning_rate": 0.0001,
"loss": 8.2002,
"loss/crossentropy": 2.826629161834717,
"loss/hidden": 0.1318359375,
"loss/logits": 0.0108743105083704,
"loss/reg": 5.230876445770264,
"loss/twn": 0.0,
"step": 785
},
{
"epoch": 0.01965,
"grad_norm": 10.125,
"grad_norm_var": 7638.861197916666,
"learning_rate": 0.0001,
"loss": 7.1464,
"loss/crossentropy": 1.752783179283142,
"loss/hidden": 0.15625,
"loss/logits": 0.006541845388710499,
"loss/reg": 5.230816841125488,
"loss/twn": 0.0,
"step": 786
},
{
"epoch": 0.019675,
"grad_norm": 12.75,
"grad_norm_var": 3175.657291666667,
"learning_rate": 0.0001,
"loss": 7.4269,
"loss/crossentropy": 2.056227207183838,
"loss/hidden": 0.125,
"loss/logits": 0.014510264620184898,
"loss/reg": 5.231206893920898,
"loss/twn": 0.0,
"step": 787
},
{
"epoch": 0.0197,
"grad_norm": 10.3125,
"grad_norm_var": 3184.372770182292,
"learning_rate": 0.0001,
"loss": 6.9013,
"loss/crossentropy": 1.5427310466766357,
"loss/hidden": 0.1201171875,
"loss/logits": 0.007622131146490574,
"loss/reg": 5.230830192565918,
"loss/twn": 0.0,
"step": 788
},
{
"epoch": 0.019725,
"grad_norm": 9.375,
"grad_norm_var": 3212.2094889322916,
"learning_rate": 0.0001,
"loss": 8.0128,
"loss/crossentropy": 2.7377614974975586,
"loss/hidden": 0.0400390625,
"loss/logits": 0.004178863950073719,
"loss/reg": 5.230835914611816,
"loss/twn": 0.0,
"step": 789
},
{
"epoch": 0.01975,
"grad_norm": 9.625,
"grad_norm_var": 3214.161962890625,
"learning_rate": 0.0001,
"loss": 6.8224,
"loss/crossentropy": 1.4642736911773682,
"loss/hidden": 0.11767578125,
"loss/logits": 0.009586405009031296,
"loss/reg": 5.230891704559326,
"loss/twn": 0.0,
"step": 790
},
{
"epoch": 0.019775,
"grad_norm": 9.125,
"grad_norm_var": 3225.1082682291667,
"learning_rate": 0.0001,
"loss": 7.9752,
"loss/crossentropy": 2.6582653522491455,
"loss/hidden": 0.0791015625,
"loss/logits": 0.007132208906114101,
"loss/reg": 5.230742931365967,
"loss/twn": 0.0,
"step": 791
},
{
"epoch": 0.0198,
"grad_norm": 14.0625,
"grad_norm_var": 3242.157014973958,
"learning_rate": 0.0001,
"loss": 7.6474,
"loss/crossentropy": 2.298769235610962,
"loss/hidden": 0.11279296875,
"loss/logits": 0.004840749781578779,
"loss/reg": 5.2309794425964355,
"loss/twn": 0.0,
"step": 792
},
{
"epoch": 0.019825,
"grad_norm": 17.375,
"grad_norm_var": 3234.984309895833,
"learning_rate": 0.0001,
"loss": 7.5051,
"loss/crossentropy": 2.1144237518310547,
"loss/hidden": 0.1484375,
"loss/logits": 0.011511989869177341,
"loss/reg": 5.230709552764893,
"loss/twn": 0.0,
"step": 793
},
{
"epoch": 0.01985,
"grad_norm": 12.6875,
"grad_norm_var": 3232.7632649739585,
"learning_rate": 0.0001,
"loss": 6.3903,
"loss/crossentropy": 0.9461896419525146,
"loss/hidden": 0.2080078125,
"loss/logits": 0.005450299009680748,
"loss/reg": 5.230684757232666,
"loss/twn": 0.0,
"step": 794
},
{
"epoch": 0.019875,
"grad_norm": 13.9375,
"grad_norm_var": 3255.1077473958335,
"learning_rate": 0.0001,
"loss": 7.4702,
"loss/crossentropy": 2.1320881843566895,
"loss/hidden": 0.0986328125,
"loss/logits": 0.008736222982406616,
"loss/reg": 5.230694770812988,
"loss/twn": 0.0,
"step": 795
},
{
"epoch": 0.0199,
"grad_norm": 9.8125,
"grad_norm_var": 463.37107747395834,
"learning_rate": 0.0001,
"loss": 5.7159,
"loss/crossentropy": 0.34444770216941833,
"loss/hidden": 0.1357421875,
"loss/logits": 0.005040573887526989,
"loss/reg": 5.230666160583496,
"loss/twn": 0.0,
"step": 796
},
{
"epoch": 0.019925,
"grad_norm": 7.125,
"grad_norm_var": 470.67771809895834,
"learning_rate": 0.0001,
"loss": 6.3741,
"loss/crossentropy": 0.9926528334617615,
"loss/hidden": 0.142578125,
"loss/logits": 0.007962658070027828,
"loss/reg": 5.23092794418335,
"loss/twn": 0.0,
"step": 797
},
{
"epoch": 0.01995,
"grad_norm": 9.0,
"grad_norm_var": 474.28489583333334,
"learning_rate": 0.0001,
"loss": 7.4732,
"loss/crossentropy": 2.123459815979004,
"loss/hidden": 0.1123046875,
"loss/logits": 0.0064601292833685875,
"loss/reg": 5.2309346199035645,
"loss/twn": 0.0,
"step": 798
},
{
"epoch": 0.019975,
"grad_norm": 17.25,
"grad_norm_var": 474.027978515625,
"learning_rate": 0.0001,
"loss": 7.4841,
"loss/crossentropy": 2.1429977416992188,
"loss/hidden": 0.10107421875,
"loss/logits": 0.009501198306679726,
"loss/reg": 5.230530738830566,
"loss/twn": 0.0,
"step": 799
},
{
"epoch": 0.02,
"grad_norm": 13.3125,
"grad_norm_var": 473.9306640625,
"learning_rate": 0.0001,
"loss": 7.9787,
"loss/crossentropy": 2.5355384349823,
"loss/hidden": 0.203125,
"loss/logits": 0.009105566889047623,
"loss/reg": 5.230905055999756,
"loss/twn": 0.0,
"step": 800
},
{
"epoch": 0.020025,
"grad_norm": 15.5625,
"grad_norm_var": 9.640087890625,
"learning_rate": 0.0001,
"loss": 7.6371,
"loss/crossentropy": 2.3353328704833984,
"loss/hidden": 0.064453125,
"loss/logits": 0.00665889261290431,
"loss/reg": 5.230616569519043,
"loss/twn": 0.0,
"step": 801
},
{
"epoch": 0.02005,
"grad_norm": 64.0,
"grad_norm_var": 177.831103515625,
"learning_rate": 0.0001,
"loss": 8.2014,
"loss/crossentropy": 2.8111629486083984,
"loss/hidden": 0.1513671875,
"loss/logits": 0.008202088996767998,
"loss/reg": 5.230618476867676,
"loss/twn": 0.0,
"step": 802
},
{
"epoch": 0.020075,
"grad_norm": 12.0625,
"grad_norm_var": 178.09733072916666,
"learning_rate": 0.0001,
"loss": 6.0497,
"loss/crossentropy": 0.6109923124313354,
"loss/hidden": 0.2021484375,
"loss/logits": 0.005807585082948208,
"loss/reg": 5.230772495269775,
"loss/twn": 0.0,
"step": 803
},
{
"epoch": 0.0201,
"grad_norm": 8.8125,
"grad_norm_var": 179.23326822916667,
"learning_rate": 0.0001,
"loss": 7.8262,
"loss/crossentropy": 2.455592393875122,
"loss/hidden": 0.1328125,
"loss/logits": 0.0069784787483513355,
"loss/reg": 5.230773448944092,
"loss/twn": 0.0,
"step": 804
},
{
"epoch": 0.020125,
"grad_norm": 73.5,
"grad_norm_var": 386.47057291666664,
"learning_rate": 0.0001,
"loss": 7.1917,
"loss/crossentropy": 1.826701283454895,
"loss/hidden": 0.125,
"loss/logits": 0.0094426479190588,
"loss/reg": 5.230578422546387,
"loss/twn": 0.0,
"step": 805
},
{
"epoch": 0.02015,
"grad_norm": 10.25,
"grad_norm_var": 385.6968098958333,
"learning_rate": 0.0001,
"loss": 7.9012,
"loss/crossentropy": 2.539992094039917,
"loss/hidden": 0.12158203125,
"loss/logits": 0.008753599599003792,
"loss/reg": 5.230825424194336,
"loss/twn": 0.0,
"step": 806
},
{
"epoch": 0.020175,
"grad_norm": 19.875,
"grad_norm_var": 378.4181640625,
"learning_rate": 0.0001,
"loss": 6.9229,
"loss/crossentropy": 1.5474615097045898,
"loss/hidden": 0.1376953125,
"loss/logits": 0.00709810946136713,
"loss/reg": 5.230693817138672,
"loss/twn": 0.0,
"step": 807
},
{
"epoch": 0.0202,
"grad_norm": 10.25,
"grad_norm_var": 382.30115559895836,
"learning_rate": 0.0001,
"loss": 7.2085,
"loss/crossentropy": 1.7812546491622925,
"loss/hidden": 0.189453125,
"loss/logits": 0.007315409369766712,
"loss/reg": 5.230427265167236,
"loss/twn": 0.0,
"step": 808
},
{
"epoch": 0.020225,
"grad_norm": 9.5,
"grad_norm_var": 388.59295247395835,
"learning_rate": 0.0001,
"loss": 7.8477,
"loss/crossentropy": 2.541975498199463,
"loss/hidden": 0.07177734375,
"loss/logits": 0.003062914125621319,
"loss/reg": 5.230909824371338,
"loss/twn": 0.0,
"step": 809
},
{
"epoch": 0.02025,
"grad_norm": 15.5625,
"grad_norm_var": 386.619384765625,
"learning_rate": 0.0001,
"loss": 7.3771,
"loss/crossentropy": 2.025947332382202,
"loss/hidden": 0.11083984375,
"loss/logits": 0.009859994053840637,
"loss/reg": 5.230443954467773,
"loss/twn": 0.0,
"step": 810
},
{
"epoch": 0.020275,
"grad_norm": 8.6875,
"grad_norm_var": 392.140087890625,
"learning_rate": 0.0001,
"loss": 7.2273,
"loss/crossentropy": 1.893878698348999,
"loss/hidden": 0.09375,
"loss/logits": 0.009128249250352383,
"loss/reg": 5.2305426597595215,
"loss/twn": 0.0,
"step": 811
},
{
"epoch": 0.0203,
"grad_norm": 33.75,
"grad_norm_var": 398.5171875,
"learning_rate": 0.0001,
"loss": 8.1449,
"loss/crossentropy": 2.717454195022583,
"loss/hidden": 0.1796875,
"loss/logits": 0.016887273639440536,
"loss/reg": 5.230856895446777,
"loss/twn": 0.0,
"step": 812
},
{
"epoch": 0.020325,
"grad_norm": 12.875,
"grad_norm_var": 390.30546875,
"learning_rate": 0.0001,
"loss": 8.3305,
"loss/crossentropy": 3.0012078285217285,
"loss/hidden": 0.09326171875,
"loss/logits": 0.005363960284739733,
"loss/reg": 5.230653285980225,
"loss/twn": 0.0,
"step": 813
},
{
"epoch": 0.02035,
"grad_norm": 34.0,
"grad_norm_var": 389.73255208333336,
"learning_rate": 0.0001,
"loss": 6.834,
"loss/crossentropy": 1.4066799879074097,
"loss/hidden": 0.193359375,
"loss/logits": 0.003266718937084079,
"loss/reg": 5.230650901794434,
"loss/twn": 0.0,
"step": 814
},
{
"epoch": 0.020375,
"grad_norm": 326.0,
"grad_norm_var": 6133.447395833334,
"learning_rate": 0.0001,
"loss": 8.109,
"loss/crossentropy": 2.785377025604248,
"loss/hidden": 0.087890625,
"loss/logits": 0.005352129694074392,
"loss/reg": 5.230373859405518,
"loss/twn": 0.0,
"step": 815
},
{
"epoch": 0.0204,
"grad_norm": 61.75,
"grad_norm_var": 6096.425504557292,
"learning_rate": 0.0001,
"loss": 6.3327,
"loss/crossentropy": 0.9138974547386169,
"loss/hidden": 0.17578125,
"loss/logits": 0.012529873289167881,
"loss/reg": 5.230535984039307,
"loss/twn": 0.0,
"step": 816
},
{
"epoch": 0.020425,
"grad_norm": 15.8125,
"grad_norm_var": 6095.455582682292,
"learning_rate": 0.0001,
"loss": 8.4489,
"loss/crossentropy": 3.057579517364502,
"loss/hidden": 0.142578125,
"loss/logits": 0.018565690144896507,
"loss/reg": 5.230161666870117,
"loss/twn": 0.0,
"step": 817
},
{
"epoch": 0.02045,
"grad_norm": 18.125,
"grad_norm_var": 6109.504801432292,
"learning_rate": 0.0001,
"loss": 7.9928,
"loss/crossentropy": 2.607743740081787,
"loss/hidden": 0.1494140625,
"loss/logits": 0.004932316951453686,
"loss/reg": 5.230733394622803,
"loss/twn": 0.0,
"step": 818
},
{
"epoch": 0.020475,
"grad_norm": 21.25,
"grad_norm_var": 6078.197916666667,
"learning_rate": 0.0001,
"loss": 8.2796,
"loss/crossentropy": 2.8803536891937256,
"loss/hidden": 0.15234375,
"loss/logits": 0.016475437209010124,
"loss/reg": 5.230462551116943,
"loss/twn": 0.0,
"step": 819
},
{
"epoch": 0.0205,
"grad_norm": 9.6875,
"grad_norm_var": 6074.315559895834,
"learning_rate": 0.0001,
"loss": 7.3712,
"loss/crossentropy": 1.9426707029342651,
"loss/hidden": 0.1923828125,
"loss/logits": 0.005748513620346785,
"loss/reg": 5.230403900146484,
"loss/twn": 0.0,
"step": 820
},
{
"epoch": 0.020525,
"grad_norm": 10.0,
"grad_norm_var": 6064.3275390625,
"learning_rate": 0.0001,
"loss": 8.233,
"loss/crossentropy": 2.895735025405884,
"loss/hidden": 0.0986328125,
"loss/logits": 0.008359922096133232,
"loss/reg": 5.230307102203369,
"loss/twn": 0.0,
"step": 821
},
{
"epoch": 0.02055,
"grad_norm": 11.8125,
"grad_norm_var": 6058.576806640625,
"learning_rate": 0.0001,
"loss": 7.0902,
"loss/crossentropy": 1.73037588596344,
"loss/hidden": 0.11962890625,
"loss/logits": 0.009449999779462814,
"loss/reg": 5.230698585510254,
"loss/twn": 0.0,
"step": 822
},
{
"epoch": 0.020575,
"grad_norm": 11.125,
"grad_norm_var": 6085.305322265625,
"learning_rate": 0.0001,
"loss": 7.9687,
"loss/crossentropy": 2.613971471786499,
"loss/hidden": 0.11767578125,
"loss/logits": 0.0068368250504136086,
"loss/reg": 5.230223178863525,
"loss/twn": 0.0,
"step": 823
},
{
"epoch": 0.0206,
"grad_norm": 13.625,
"grad_norm_var": 6073.468212890625,
"learning_rate": 0.0001,
"loss": 7.6646,
"loss/crossentropy": 2.2981951236724854,
"loss/hidden": 0.12451171875,
"loss/logits": 0.01143670454621315,
"loss/reg": 5.230466842651367,
"loss/twn": 0.0,
"step": 824
},
{
"epoch": 0.020625,
"grad_norm": 9.1875,
"grad_norm_var": 6074.676302083333,
"learning_rate": 0.0001,
"loss": 7.9187,
"loss/crossentropy": 2.578167676925659,
"loss/hidden": 0.10546875,
"loss/logits": 0.005032903980463743,
"loss/reg": 5.2300333976745605,
"loss/twn": 0.0,
"step": 825
},
{
"epoch": 0.02065,
"grad_norm": 12.3125,
"grad_norm_var": 6085.2015625,
"learning_rate": 0.0001,
"loss": 7.4645,
"loss/crossentropy": 2.171114444732666,
"loss/hidden": 0.0595703125,
"loss/logits": 0.003213082440197468,
"loss/reg": 5.23060417175293,
"loss/twn": 0.0,
"step": 826
},
{
"epoch": 0.020675,
"grad_norm": 7.75,
"grad_norm_var": 6088.936181640625,
"learning_rate": 0.0001,
"loss": 7.2651,
"loss/crossentropy": 1.9576009511947632,
"loss/hidden": 0.0693359375,
"loss/logits": 0.008241134695708752,
"loss/reg": 5.229961395263672,
"loss/twn": 0.0,
"step": 827
},
{
"epoch": 0.0207,
"grad_norm": 9.1875,
"grad_norm_var": 6140.7796875,
"learning_rate": 0.0001,
"loss": 6.9102,
"loss/crossentropy": 1.5859615802764893,
"loss/hidden": 0.09130859375,
"loss/logits": 0.0022871571127325296,
"loss/reg": 5.230637550354004,
"loss/twn": 0.0,
"step": 828
},
{
"epoch": 0.020725,
"grad_norm": 8.875,
"grad_norm_var": 6154.396354166666,
"learning_rate": 0.0001,
"loss": 6.8867,
"loss/crossentropy": 1.5643407106399536,
"loss/hidden": 0.08642578125,
"loss/logits": 0.005755975842475891,
"loss/reg": 5.230214595794678,
"loss/twn": 0.0,
"step": 829
},
{
"epoch": 0.02075,
"grad_norm": 79.5,
"grad_norm_var": 6269.947395833334,
"learning_rate": 0.0001,
"loss": 6.9087,
"loss/crossentropy": 1.5483061075210571,
"loss/hidden": 0.12451171875,
"loss/logits": 0.005769835785031319,
"loss/reg": 5.230134963989258,
"loss/twn": 0.0,
"step": 830
},
{
"epoch": 0.020775,
"grad_norm": 10.25,
"grad_norm_var": 423.63880208333336,
"learning_rate": 0.0001,
"loss": 8.0232,
"loss/crossentropy": 2.6613450050354004,
"loss/hidden": 0.126953125,
"loss/logits": 0.004578165709972382,
"loss/reg": 5.2303147315979,
"loss/twn": 0.0,
"step": 831
},
{
"epoch": 0.0208,
"grad_norm": 11.9375,
"grad_norm_var": 297.382275390625,
"learning_rate": 0.0001,
"loss": 7.1745,
"loss/crossentropy": 1.7624868154525757,
"loss/hidden": 0.1708984375,
"loss/logits": 0.0105954110622406,
"loss/reg": 5.2305684089660645,
"loss/twn": 0.0,
"step": 832
},
{
"epoch": 0.020825,
"grad_norm": 11.8125,
"grad_norm_var": 298.63019205729165,
"learning_rate": 0.0001,
"loss": 8.0084,
"loss/crossentropy": 2.670598030090332,
"loss/hidden": 0.0986328125,
"loss/logits": 0.009010246023535728,
"loss/reg": 5.230124473571777,
"loss/twn": 0.0,
"step": 833
},
{
"epoch": 0.02085,
"grad_norm": 11.6875,
"grad_norm_var": 299.41979166666664,
"learning_rate": 0.0001,
"loss": 7.9989,
"loss/crossentropy": 2.6066150665283203,
"loss/hidden": 0.1533203125,
"loss/logits": 0.008532057516276836,
"loss/reg": 5.230454444885254,
"loss/twn": 0.0,
"step": 834
},
{
"epoch": 0.020875,
"grad_norm": 21.375,
"grad_norm_var": 299.51451822916664,
"learning_rate": 0.0001,
"loss": 8.5087,
"loss/crossentropy": 3.1679840087890625,
"loss/hidden": 0.10107421875,
"loss/logits": 0.009620252065360546,
"loss/reg": 5.230065822601318,
"loss/twn": 0.0,
"step": 835
},
{
"epoch": 0.0209,
"grad_norm": 44.0,
"grad_norm_var": 345.89894205729166,
"learning_rate": 0.0001,
"loss": 7.9201,
"loss/crossentropy": 2.6046769618988037,
"loss/hidden": 0.080078125,
"loss/logits": 0.004920288920402527,
"loss/reg": 5.230381488800049,
"loss/twn": 0.0,
"step": 836
},
{
"epoch": 0.020925,
"grad_norm": 8.875,
"grad_norm_var": 347.14464518229164,
"learning_rate": 0.0001,
"loss": 7.3371,
"loss/crossentropy": 1.9817231893539429,
"loss/hidden": 0.11767578125,
"loss/logits": 0.007638626731932163,
"loss/reg": 5.230074405670166,
"loss/twn": 0.0,
"step": 837
},
{
"epoch": 0.02095,
"grad_norm": 42.25,
"grad_norm_var": 381.12526041666666,
"learning_rate": 0.0001,
"loss": 8.104,
"loss/crossentropy": 2.708381414413452,
"loss/hidden": 0.1513671875,
"loss/logits": 0.013956461101770401,
"loss/reg": 5.230310440063477,
"loss/twn": 0.0,
"step": 838
},
{
"epoch": 0.020975,
"grad_norm": 15.75,
"grad_norm_var": 377.2301432291667,
"learning_rate": 0.0001,
"loss": 7.4726,
"loss/crossentropy": 2.0920002460479736,
"loss/hidden": 0.142578125,
"loss/logits": 0.00805431604385376,
"loss/reg": 5.230012893676758,
"loss/twn": 0.0,
"step": 839
},
{
"epoch": 0.021,
"grad_norm": 15.5,
"grad_norm_var": 375.8815104166667,
"learning_rate": 0.0001,
"loss": 8.1933,
"loss/crossentropy": 2.8643245697021484,
"loss/hidden": 0.08642578125,
"loss/logits": 0.012173913419246674,
"loss/reg": 5.230340480804443,
"loss/twn": 0.0,
"step": 840
},
{
"epoch": 0.021025,
"grad_norm": 15.8125,
"grad_norm_var": 369.05983072916666,
"learning_rate": 0.0001,
"loss": 7.1166,
"loss/crossentropy": 1.7462717294692993,
"loss/hidden": 0.126953125,
"loss/logits": 0.0133826844394207,
"loss/reg": 5.2299885749816895,
"loss/twn": 0.0,
"step": 841
},
{
"epoch": 0.02105,
"grad_norm": 11.25,
"grad_norm_var": 370.280322265625,
"learning_rate": 0.0001,
"loss": 8.2839,
"loss/crossentropy": 2.8843467235565186,
"loss/hidden": 0.1591796875,
"loss/logits": 0.01038344856351614,
"loss/reg": 5.229991912841797,
"loss/twn": 0.0,
"step": 842
},
{
"epoch": 0.021075,
"grad_norm": 10.5,
"grad_norm_var": 366.12810872395835,
"learning_rate": 0.0001,
"loss": 7.9547,
"loss/crossentropy": 2.5944228172302246,
"loss/hidden": 0.1201171875,
"loss/logits": 0.009886080399155617,
"loss/reg": 5.230307579040527,
"loss/twn": 0.0,
"step": 843
},
{
"epoch": 0.0211,
"grad_norm": 9.125,
"grad_norm_var": 366.22291666666666,
"learning_rate": 0.0001,
"loss": 7.8067,
"loss/crossentropy": 2.4823381900787354,
"loss/hidden": 0.08642578125,
"loss/logits": 0.007714688777923584,
"loss/reg": 5.2301812171936035,
"loss/twn": 0.0,
"step": 844
},
{
"epoch": 0.021125,
"grad_norm": 10.125,
"grad_norm_var": 364.37786458333335,
"learning_rate": 0.0001,
"loss": 7.77,
"loss/crossentropy": 2.477754592895508,
"loss/hidden": 0.058837890625,
"loss/logits": 0.0033183712512254715,
"loss/reg": 5.2300519943237305,
"loss/twn": 0.0,
"step": 845
},
{
"epoch": 0.02115,
"grad_norm": 8.125,
"grad_norm_var": 122.33483072916667,
"learning_rate": 0.0001,
"loss": 6.1579,
"loss/crossentropy": 0.859074056148529,
"loss/hidden": 0.06689453125,
"loss/logits": 0.0018144365167245269,
"loss/reg": 5.230114936828613,
"loss/twn": 0.0,
"step": 846
},
{
"epoch": 0.021175,
"grad_norm": 11.375,
"grad_norm_var": 121.52916666666667,
"learning_rate": 0.0001,
"loss": 7.8874,
"loss/crossentropy": 2.5228824615478516,
"loss/hidden": 0.1279296875,
"loss/logits": 0.006642586551606655,
"loss/reg": 5.229929447174072,
"loss/twn": 0.0,
"step": 847
},
{
"epoch": 0.0212,
"grad_norm": 7.84375,
"grad_norm_var": 124.91343994140625,
"learning_rate": 0.0001,
"loss": 7.0446,
"loss/crossentropy": 1.6815242767333984,
"loss/hidden": 0.123046875,
"loss/logits": 0.009667545557022095,
"loss/reg": 5.230370044708252,
"loss/twn": 0.0,
"step": 848
},
{
"epoch": 0.021225,
"grad_norm": 10.875,
"grad_norm_var": 125.48717041015625,
"learning_rate": 0.0001,
"loss": 7.8503,
"loss/crossentropy": 2.5361642837524414,
"loss/hidden": 0.0791015625,
"loss/logits": 0.004664687905460596,
"loss/reg": 5.230417728424072,
"loss/twn": 0.0,
"step": 849
},
{
"epoch": 0.02125,
"grad_norm": 8.5625,
"grad_norm_var": 127.85452067057291,
"learning_rate": 0.0001,
"loss": 7.3983,
"loss/crossentropy": 2.035668134689331,
"loss/hidden": 0.125,
"loss/logits": 0.00722795445472002,
"loss/reg": 5.230389595031738,
"loss/twn": 0.0,
"step": 850
},
{
"epoch": 0.021275,
"grad_norm": 12.5,
"grad_norm_var": 126.07258707682291,
"learning_rate": 0.0001,
"loss": 7.8854,
"loss/crossentropy": 2.4939327239990234,
"loss/hidden": 0.1494140625,
"loss/logits": 0.012096907943487167,
"loss/reg": 5.229991912841797,
"loss/twn": 0.0,
"step": 851
},
{
"epoch": 0.0213,
"grad_norm": 11.5,
"grad_norm_var": 67.09016520182291,
"learning_rate": 0.0001,
"loss": 6.8451,
"loss/crossentropy": 1.4572025537490845,
"loss/hidden": 0.14453125,
"loss/logits": 0.013090159744024277,
"loss/reg": 5.230262756347656,
"loss/twn": 0.0,
"step": 852
},
{
"epoch": 0.021325,
"grad_norm": 12.0625,
"grad_norm_var": 65.91975504557291,
"learning_rate": 0.0001,
"loss": 7.7312,
"loss/crossentropy": 2.376828193664551,
"loss/hidden": 0.115234375,
"loss/logits": 0.009080484509468079,
"loss/reg": 5.230012893676758,
"loss/twn": 0.0,
"step": 853
},
{
"epoch": 0.02135,
"grad_norm": 11.75,
"grad_norm_var": 6.420926920572916,
"learning_rate": 0.0001,
"loss": 7.0291,
"loss/crossentropy": 1.6736173629760742,
"loss/hidden": 0.12060546875,
"loss/logits": 0.004477534908801317,
"loss/reg": 5.230405807495117,
"loss/twn": 0.0,
"step": 854
},
{
"epoch": 0.021375,
"grad_norm": 12.0,
"grad_norm_var": 5.132840983072916,
"learning_rate": 0.0001,
"loss": 6.8586,
"loss/crossentropy": 1.5102664232254028,
"loss/hidden": 0.1103515625,
"loss/logits": 0.008198726922273636,
"loss/reg": 5.229762077331543,
"loss/twn": 0.0,
"step": 855
},
{
"epoch": 0.0214,
"grad_norm": 38.0,
"grad_norm_var": 49.72854410807292,
"learning_rate": 0.0001,
"loss": 7.6098,
"loss/crossentropy": 2.24526047706604,
"loss/hidden": 0.12255859375,
"loss/logits": 0.011424753814935684,
"loss/reg": 5.23051118850708,
"loss/twn": 0.0,
"step": 856
},
{
"epoch": 0.021425,
"grad_norm": 164.0,
"grad_norm_var": 1485.9123982747396,
"learning_rate": 0.0001,
"loss": 7.3561,
"loss/crossentropy": 1.972996711730957,
"loss/hidden": 0.14453125,
"loss/logits": 0.008586418814957142,
"loss/reg": 5.230007648468018,
"loss/twn": 0.0,
"step": 857
},
{
"epoch": 0.02145,
"grad_norm": 24.25,
"grad_norm_var": 1478.1022420247396,
"learning_rate": 0.0001,
"loss": 7.1992,
"loss/crossentropy": 1.7982319593429565,
"loss/hidden": 0.1611328125,
"loss/logits": 0.009712583385407925,
"loss/reg": 5.230077743530273,
"loss/twn": 0.0,
"step": 858
},
{
"epoch": 0.021475,
"grad_norm": 55.0,
"grad_norm_var": 1529.7060180664062,
"learning_rate": 0.0001,
"loss": 6.8691,
"loss/crossentropy": 1.487571358680725,
"loss/hidden": 0.1416015625,
"loss/logits": 0.009719829075038433,
"loss/reg": 5.230188369750977,
"loss/twn": 0.0,
"step": 859
},
{
"epoch": 0.0215,
"grad_norm": 37.75,
"grad_norm_var": 1518.6361938476562,
"learning_rate": 0.0001,
"loss": 6.2527,
"loss/crossentropy": 0.7488301396369934,
"loss/hidden": 0.263671875,
"loss/logits": 0.010243739932775497,
"loss/reg": 5.229933261871338,
"loss/twn": 0.0,
"step": 860
},
{
"epoch": 0.021525,
"grad_norm": 22.75,
"grad_norm_var": 1499.8006144205729,
"learning_rate": 0.0001,
"loss": 6.8342,
"loss/crossentropy": 1.4506139755249023,
"loss/hidden": 0.150390625,
"loss/logits": 0.002946457825601101,
"loss/reg": 5.23027229309082,
"loss/twn": 0.0,
"step": 861
},
{
"epoch": 0.02155,
"grad_norm": 26.125,
"grad_norm_var": 1472.299051920573,
"learning_rate": 0.0001,
"loss": 8.298,
"loss/crossentropy": 2.793138265609741,
"loss/hidden": 0.2578125,
"loss/logits": 0.016960376873612404,
"loss/reg": 5.230113506317139,
"loss/twn": 0.0,
"step": 862
},
{
"epoch": 0.021575,
"grad_norm": 12.0,
"grad_norm_var": 1470.842508951823,
"learning_rate": 0.0001,
"loss": 8.426,
"loss/crossentropy": 3.0826714038848877,
"loss/hidden": 0.10302734375,
"loss/logits": 0.009999147616326809,
"loss/reg": 5.230251789093018,
"loss/twn": 0.0,
"step": 863
},
{
"epoch": 0.0216,
"grad_norm": 170.0,
"grad_norm_var": 2652.8306640625,
"learning_rate": 0.0001,
"loss": 8.011,
"loss/crossentropy": 2.557135820388794,
"loss/hidden": 0.205078125,
"loss/logits": 0.018819302320480347,
"loss/reg": 5.2300004959106445,
"loss/twn": 0.0,
"step": 864
},
{
"epoch": 0.021625,
"grad_norm": 9.8125,
"grad_norm_var": 2656.930973307292,
"learning_rate": 0.0001,
"loss": 6.4711,
"loss/crossentropy": 1.1575171947479248,
"loss/hidden": 0.0791015625,
"loss/logits": 0.004193156957626343,
"loss/reg": 5.230248928070068,
"loss/twn": 0.0,
"step": 865
},
{
"epoch": 0.02165,
"grad_norm": 10.125,
"grad_norm_var": 2650.689518229167,
"learning_rate": 0.0001,
"loss": 7.0724,
"loss/crossentropy": 1.6960041522979736,
"loss/hidden": 0.13671875,
"loss/logits": 0.009182040579617023,
"loss/reg": 5.230460166931152,
"loss/twn": 0.0,
"step": 866
},
{
"epoch": 0.021675,
"grad_norm": 7.875,
"grad_norm_var": 2668.584895833333,
"learning_rate": 0.0001,
"loss": 7.1586,
"loss/crossentropy": 1.821337103843689,
"loss/hidden": 0.10107421875,
"loss/logits": 0.00650972593575716,
"loss/reg": 5.229717254638672,
"loss/twn": 0.0,
"step": 867
},
{
"epoch": 0.0217,
"grad_norm": 14.6875,
"grad_norm_var": 2657.5058430989584,
"learning_rate": 0.0001,
"loss": 8.054,
"loss/crossentropy": 2.651723623275757,
"loss/hidden": 0.1552734375,
"loss/logits": 0.01694151759147644,
"loss/reg": 5.230074405670166,
"loss/twn": 0.0,
"step": 868
},
{
"epoch": 0.021725,
"grad_norm": 17.5,
"grad_norm_var": 2639.634309895833,
"learning_rate": 0.0001,
"loss": 7.8281,
"loss/crossentropy": 2.49051570892334,
"loss/hidden": 0.0986328125,
"loss/logits": 0.009045520797371864,
"loss/reg": 5.229867935180664,
"loss/twn": 0.0,
"step": 869
},
{
"epoch": 0.02175,
"grad_norm": 13.4375,
"grad_norm_var": 2633.545686848958,
"learning_rate": 0.0001,
"loss": 7.0012,
"loss/crossentropy": 1.546895146369934,
"loss/hidden": 0.21484375,
"loss/logits": 0.009267905727028847,
"loss/reg": 5.230212211608887,
"loss/twn": 0.0,
"step": 870
},
{
"epoch": 0.021775,
"grad_norm": 14.9375,
"grad_norm_var": 2623.2330729166665,
"learning_rate": 0.0001,
"loss": 7.582,
"loss/crossentropy": 2.2645156383514404,
"loss/hidden": 0.08154296875,
"loss/logits": 0.006237420719116926,
"loss/reg": 5.229740619659424,
"loss/twn": 0.0,
"step": 871
},
{
"epoch": 0.0218,
"grad_norm": 12.6875,
"grad_norm_var": 2669.6590983072915,
"learning_rate": 0.0001,
"loss": 6.8665,
"loss/crossentropy": 1.5306649208068848,
"loss/hidden": 0.1005859375,
"loss/logits": 0.004954389296472073,
"loss/reg": 5.230282306671143,
"loss/twn": 0.0,
"step": 872
},
{
"epoch": 0.021825,
"grad_norm": 35.0,
"grad_norm_var": 1547.8294108072917,
"learning_rate": 0.0001,
"loss": 8.1648,
"loss/crossentropy": 2.7598485946655273,
"loss/hidden": 0.1611328125,
"loss/logits": 0.014029700309038162,
"loss/reg": 5.229771137237549,
"loss/twn": 0.0,
"step": 873
},
{
"epoch": 0.02185,
"grad_norm": 9.375,
"grad_norm_var": 1573.5507649739584,
"learning_rate": 0.0001,
"loss": 6.9286,
"loss/crossentropy": 1.5998884439468384,
"loss/hidden": 0.09521484375,
"loss/logits": 0.003966475836932659,
"loss/reg": 5.2295050621032715,
"loss/twn": 0.0,
"step": 874
},
{
"epoch": 0.021875,
"grad_norm": 13.375,
"grad_norm_var": 1539.2968587239584,
"learning_rate": 0.0001,
"loss": 8.1337,
"loss/crossentropy": 2.756443977355957,
"loss/hidden": 0.1357421875,
"loss/logits": 0.011736356653273106,
"loss/reg": 5.229776859283447,
"loss/twn": 0.0,
"step": 875
},
{
"epoch": 0.0219,
"grad_norm": 14.1875,
"grad_norm_var": 1539.3275390625,
"learning_rate": 0.0001,
"loss": 8.4196,
"loss/crossentropy": 3.001187324523926,
"loss/hidden": 0.171875,
"loss/logits": 0.01653527095913887,
"loss/reg": 5.230004787445068,
"loss/twn": 0.0,
"step": 876
},
{
"epoch": 0.021925,
"grad_norm": 10.0625,
"grad_norm_var": 1553.6042805989584,
"learning_rate": 0.0001,
"loss": 7.6758,
"loss/crossentropy": 2.371596097946167,
"loss/hidden": 0.0693359375,
"loss/logits": 0.004934161901473999,
"loss/reg": 5.229954242706299,
"loss/twn": 0.0,
"step": 877
},
{
"epoch": 0.02195,
"grad_norm": 15.25,
"grad_norm_var": 1558.5659993489583,
"learning_rate": 0.0001,
"loss": 7.0978,
"loss/crossentropy": 1.6408387422561646,
"loss/hidden": 0.208984375,
"loss/logits": 0.017855621874332428,
"loss/reg": 5.230149745941162,
"loss/twn": 0.0,
"step": 878
},
{
"epoch": 0.021975,
"grad_norm": 11.6875,
"grad_norm_var": 1559.0625,
"learning_rate": 0.0001,
"loss": 7.826,
"loss/crossentropy": 2.489450454711914,
"loss/hidden": 0.10107421875,
"loss/logits": 0.005990723147988319,
"loss/reg": 5.22952938079834,
"loss/twn": 0.0,
"step": 879
},
{
"epoch": 0.022,
"grad_norm": 142.0,
"grad_norm_var": 1062.0625,
"learning_rate": 0.0001,
"loss": 6.2769,
"loss/crossentropy": 0.8815757632255554,
"loss/hidden": 0.1513671875,
"loss/logits": 0.01392771303653717,
"loss/reg": 5.230021953582764,
"loss/twn": 0.0,
"step": 880
},
{
"epoch": 0.022025,
"grad_norm": 9.75,
"grad_norm_var": 1062.164306640625,
"learning_rate": 0.0001,
"loss": 7.2034,
"loss/crossentropy": 1.8355108499526978,
"loss/hidden": 0.1318359375,
"loss/logits": 0.00601241085678339,
"loss/reg": 5.230048656463623,
"loss/twn": 0.0,
"step": 881
},
{
"epoch": 0.02205,
"grad_norm": 10.5625,
"grad_norm_var": 1061.4837890625,
"learning_rate": 0.0001,
"loss": 6.8518,
"loss/crossentropy": 1.397723913192749,
"loss/hidden": 0.2158203125,
"loss/logits": 0.00827928725630045,
"loss/reg": 5.229991912841797,
"loss/twn": 0.0,
"step": 882
},
{
"epoch": 0.022075,
"grad_norm": 10.6875,
"grad_norm_var": 1056.672509765625,
"learning_rate": 0.0001,
"loss": 8.2004,
"loss/crossentropy": 2.9133853912353516,
"loss/hidden": 0.05224609375,
"loss/logits": 0.005087848752737045,
"loss/reg": 5.2297210693359375,
"loss/twn": 0.0,
"step": 883
},
{
"epoch": 0.0221,
"grad_norm": 37.75,
"grad_norm_var": 1066.81640625,
"learning_rate": 0.0001,
"loss": 6.094,
"loss/crossentropy": 0.7456091046333313,
"loss/hidden": 0.1162109375,
"loss/logits": 0.002344908192753792,
"loss/reg": 5.229843616485596,
"loss/twn": 0.0,
"step": 884
},
{
"epoch": 0.022125,
"grad_norm": 14.3125,
"grad_norm_var": 1070.061181640625,
"learning_rate": 0.0001,
"loss": 7.1178,
"loss/crossentropy": 1.7599776983261108,
"loss/hidden": 0.12109375,
"loss/logits": 0.0070150988176465034,
"loss/reg": 5.229736804962158,
"loss/twn": 0.0,
"step": 885
},
{
"epoch": 0.02215,
"grad_norm": 18.0,
"grad_norm_var": 1065.2764973958333,
"learning_rate": 0.0001,
"loss": 6.9191,
"loss/crossentropy": 1.5698696374893188,
"loss/hidden": 0.10986328125,
"loss/logits": 0.009311170317232609,
"loss/reg": 5.230006217956543,
"loss/twn": 0.0,
"step": 886
},
{
"epoch": 0.022175,
"grad_norm": 12.875,
"grad_norm_var": 1067.9593587239583,
"learning_rate": 0.0001,
"loss": 7.0143,
"loss/crossentropy": 1.6162168979644775,
"loss/hidden": 0.1552734375,
"loss/logits": 0.013361955992877483,
"loss/reg": 5.229493618011475,
"loss/twn": 0.0,
"step": 887
},
{
"epoch": 0.0222,
"grad_norm": 13.4375,
"grad_norm_var": 1066.9034993489583,
"learning_rate": 0.0001,
"loss": 8.1799,
"loss/crossentropy": 2.848146915435791,
"loss/hidden": 0.09375,
"loss/logits": 0.008583602495491505,
"loss/reg": 5.229381561279297,
"loss/twn": 0.0,
"step": 888
},
{
"epoch": 0.022225,
"grad_norm": 18.75,
"grad_norm_var": 1058.8038899739583,
"learning_rate": 0.0001,
"loss": 8.1476,
"loss/crossentropy": 2.7504210472106934,
"loss/hidden": 0.158203125,
"loss/logits": 0.00982433557510376,
"loss/reg": 5.229184627532959,
"loss/twn": 0.0,
"step": 889
},
{
"epoch": 0.02225,
"grad_norm": 57.25,
"grad_norm_var": 1117.4507649739583,
"learning_rate": 0.0001,
"loss": 6.9922,
"loss/crossentropy": 1.6320585012435913,
"loss/hidden": 0.1171875,
"loss/logits": 0.01302691176533699,
"loss/reg": 5.229902267456055,
"loss/twn": 0.0,
"step": 890
},
{
"epoch": 0.022275,
"grad_norm": 11.125,
"grad_norm_var": 1121.4409993489583,
"learning_rate": 0.0001,
"loss": 7.9641,
"loss/crossentropy": 2.6222574710845947,
"loss/hidden": 0.10107421875,
"loss/logits": 0.011231745593249798,
"loss/reg": 5.229504108428955,
"loss/twn": 0.0,
"step": 891
},
{
"epoch": 0.0223,
"grad_norm": 7.65625,
"grad_norm_var": 1133.9413696289062,
"learning_rate": 0.0001,
"loss": 7.5977,
"loss/crossentropy": 2.262641191482544,
"loss/hidden": 0.09619140625,
"loss/logits": 0.008943114429712296,
"loss/reg": 5.229957103729248,
"loss/twn": 0.0,
"step": 892
},
{
"epoch": 0.022325,
"grad_norm": 11.3125,
"grad_norm_var": 1131.5373982747396,
"learning_rate": 0.0001,
"loss": 7.885,
"loss/crossentropy": 2.514071226119995,
"loss/hidden": 0.134765625,
"loss/logits": 0.006755891256034374,
"loss/reg": 5.229411602020264,
"loss/twn": 0.0,
"step": 893
},
{
"epoch": 0.02235,
"grad_norm": 31.875,
"grad_norm_var": 1126.865946451823,
"learning_rate": 0.0001,
"loss": 6.8167,
"loss/crossentropy": 1.4057066440582275,
"loss/hidden": 0.17578125,
"loss/logits": 0.005477352067828178,
"loss/reg": 5.229723930358887,
"loss/twn": 0.0,
"step": 894
},
{
"epoch": 0.022375,
"grad_norm": 10.375,
"grad_norm_var": 1129.511454264323,
"learning_rate": 0.0001,
"loss": 7.7274,
"loss/crossentropy": 2.414677381515503,
"loss/hidden": 0.0791015625,
"loss/logits": 0.004009230528026819,
"loss/reg": 5.229562282562256,
"loss/twn": 0.0,
"step": 895
},
{
"epoch": 0.0224,
"grad_norm": 11.125,
"grad_norm_var": 177.70256754557292,
"learning_rate": 0.0001,
"loss": 6.6999,
"loss/crossentropy": 1.3421200513839722,
"loss/hidden": 0.1220703125,
"loss/logits": 0.006061128340661526,
"loss/reg": 5.229669094085693,
"loss/twn": 0.0,
"step": 896
},
{
"epoch": 0.022425,
"grad_norm": 20.125,
"grad_norm_var": 173.1175740559896,
"learning_rate": 0.0001,
"loss": 7.0618,
"loss/crossentropy": 1.6401444673538208,
"loss/hidden": 0.1806640625,
"loss/logits": 0.011477080173790455,
"loss/reg": 5.229546546936035,
"loss/twn": 0.0,
"step": 897
},
{
"epoch": 0.02245,
"grad_norm": 9.5,
"grad_norm_var": 174.32340087890626,
"learning_rate": 0.0001,
"loss": 7.0679,
"loss/crossentropy": 1.6865355968475342,
"loss/hidden": 0.1435546875,
"loss/logits": 0.007968233898282051,
"loss/reg": 5.229793071746826,
"loss/twn": 0.0,
"step": 898
},
{
"epoch": 0.022475,
"grad_norm": 9.5625,
"grad_norm_var": 175.57584228515626,
"learning_rate": 0.0001,
"loss": 6.9239,
"loss/crossentropy": 1.5547279119491577,
"loss/hidden": 0.1328125,
"loss/logits": 0.006813929416239262,
"loss/reg": 5.229519844055176,
"loss/twn": 0.0,
"step": 899
},
{
"epoch": 0.0225,
"grad_norm": 15.1875,
"grad_norm_var": 149.29993082682293,
"learning_rate": 0.0001,
"loss": 7.6572,
"loss/crossentropy": 2.416555166244507,
"loss/hidden": 0.0093994140625,
"loss/logits": 0.001800237107090652,
"loss/reg": 5.229480266571045,
"loss/twn": 0.0,
"step": 900
},
{
"epoch": 0.022525,
"grad_norm": 21.5,
"grad_norm_var": 149.9250935872396,
"learning_rate": 0.0001,
"loss": 8.2505,
"loss/crossentropy": 2.8510868549346924,
"loss/hidden": 0.1591796875,
"loss/logits": 0.01060121227055788,
"loss/reg": 5.229599952697754,
"loss/twn": 0.0,
"step": 901
},
{
"epoch": 0.02255,
"grad_norm": 9.6875,
"grad_norm_var": 153.66571858723958,
"learning_rate": 0.0001,
"loss": 7.8264,
"loss/crossentropy": 2.532890796661377,
"loss/hidden": 0.0595703125,
"loss/logits": 0.004101074766367674,
"loss/reg": 5.229794979095459,
"loss/twn": 0.0,
"step": 902
},
{
"epoch": 0.022575,
"grad_norm": 41.75,
"grad_norm_var": 190.05273030598957,
"learning_rate": 0.0001,
"loss": 7.0146,
"loss/crossentropy": 1.6412537097930908,
"loss/hidden": 0.134765625,
"loss/logits": 0.008777379989624023,
"loss/reg": 5.229798793792725,
"loss/twn": 0.0,
"step": 903
},
{
"epoch": 0.0226,
"grad_norm": 15.625,
"grad_norm_var": 188.79833577473957,
"learning_rate": 0.0001,
"loss": 8.2633,
"loss/crossentropy": 2.9667930603027344,
"loss/hidden": 0.06201171875,
"loss/logits": 0.005043432116508484,
"loss/reg": 5.229448318481445,
"loss/twn": 0.0,
"step": 904
},
{
"epoch": 0.022625,
"grad_norm": 12.4375,
"grad_norm_var": 191.41539306640624,
"learning_rate": 0.0001,
"loss": 8.0256,
"loss/crossentropy": 2.680783987045288,
"loss/hidden": 0.10986328125,
"loss/logits": 0.005312731955200434,
"loss/reg": 5.229615211486816,
"loss/twn": 0.0,
"step": 905
},
{
"epoch": 0.02265,
"grad_norm": 16.625,
"grad_norm_var": 84.70071207682291,
"learning_rate": 0.0001,
"loss": 7.5648,
"loss/crossentropy": 2.088520050048828,
"loss/hidden": 0.234375,
"loss/logits": 0.012642334215342999,
"loss/reg": 5.229223251342773,
"loss/twn": 0.0,
"step": 906
},
{
"epoch": 0.022675,
"grad_norm": 24.125,
"grad_norm_var": 86.87076416015626,
"learning_rate": 0.0001,
"loss": 5.9125,
"loss/crossentropy": 0.48404479026794434,
"loss/hidden": 0.1904296875,
"loss/logits": 0.008365976624190807,
"loss/reg": 5.2296600341796875,
"loss/twn": 0.0,
"step": 907
},
{
"epoch": 0.0227,
"grad_norm": 11.0,
"grad_norm_var": 83.502197265625,
"learning_rate": 0.0001,
"loss": 6.8349,
"loss/crossentropy": 1.4288034439086914,
"loss/hidden": 0.1669921875,
"loss/logits": 0.009537655860185623,
"loss/reg": 5.229605674743652,
"loss/twn": 0.0,
"step": 908
},
{
"epoch": 0.022725,
"grad_norm": 16.75,
"grad_norm_var": 81.23515625,
"learning_rate": 0.0001,
"loss": 7.9611,
"loss/crossentropy": 2.5989267826080322,
"loss/hidden": 0.1201171875,
"loss/logits": 0.012637370266020298,
"loss/reg": 5.22941255569458,
"loss/twn": 0.0,
"step": 909
},
{
"epoch": 0.02275,
"grad_norm": 15.8125,
"grad_norm_var": 66.20584309895834,
"learning_rate": 0.0001,
"loss": 7.4412,
"loss/crossentropy": 2.0751564502716064,
"loss/hidden": 0.125,
"loss/logits": 0.011572781018912792,
"loss/reg": 5.229443550109863,
"loss/twn": 0.0,
"step": 910
},
{
"epoch": 0.022775,
"grad_norm": 13.0,
"grad_norm_var": 64.55428059895833,
"learning_rate": 0.0001,
"loss": 6.7192,
"loss/crossentropy": 1.3851910829544067,
"loss/hidden": 0.09716796875,
"loss/logits": 0.007330389227718115,
"loss/reg": 5.229530334472656,
"loss/twn": 0.0,
"step": 911
},
{
"epoch": 0.0228,
"grad_norm": 12.125,
"grad_norm_var": 63.901676432291666,
"learning_rate": 0.0001,
"loss": 7.152,
"loss/crossentropy": 1.8044177293777466,
"loss/hidden": 0.1123046875,
"loss/logits": 0.005986911244690418,
"loss/reg": 5.22929048538208,
"loss/twn": 0.0,
"step": 912
},
{
"epoch": 0.022825,
"grad_norm": 8.8125,
"grad_norm_var": 66.50885416666667,
"learning_rate": 0.0001,
"loss": 8.4326,
"loss/crossentropy": 3.201655864715576,
"loss/hidden": 4.380941390991211e-06,
"loss/logits": 0.0015021440340206027,
"loss/reg": 5.229450702667236,
"loss/twn": 0.0,
"step": 913
},
{
"epoch": 0.02285,
"grad_norm": 10.3125,
"grad_norm_var": 65.86287434895833,
"learning_rate": 0.0001,
"loss": 6.5774,
"loss/crossentropy": 1.2178183794021606,
"loss/hidden": 0.12255859375,
"loss/logits": 0.007649564184248447,
"loss/reg": 5.22938346862793,
"loss/twn": 0.0,
"step": 914
},
{
"epoch": 0.022875,
"grad_norm": 13.0,
"grad_norm_var": 63.69921875,
"learning_rate": 0.0001,
"loss": 7.8085,
"loss/crossentropy": 2.4297561645507812,
"loss/hidden": 0.1376953125,
"loss/logits": 0.011574456468224525,
"loss/reg": 5.2294511795043945,
"loss/twn": 0.0,
"step": 915
},
{
"epoch": 0.0229,
"grad_norm": 13.25,
"grad_norm_var": 64.17198893229167,
"learning_rate": 0.0001,
"loss": 7.9386,
"loss/crossentropy": 2.616173028945923,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0062749385833740234,
"loss/reg": 5.229771137237549,
"loss/twn": 0.0,
"step": 916
},
{
"epoch": 0.022925,
"grad_norm": 10.6875,
"grad_norm_var": 63.5328125,
"learning_rate": 0.0001,
"loss": 8.1081,
"loss/crossentropy": 2.7487690448760986,
"loss/hidden": 0.1201171875,
"loss/logits": 0.009604476392269135,
"loss/reg": 5.2296576499938965,
"loss/twn": 0.0,
"step": 917
},
{
"epoch": 0.02295,
"grad_norm": 21.625,
"grad_norm_var": 63.486181640625,
"learning_rate": 0.0001,
"loss": 7.0533,
"loss/crossentropy": 1.7067608833312988,
"loss/hidden": 0.1123046875,
"loss/logits": 0.004787761718034744,
"loss/reg": 5.2294087409973145,
"loss/twn": 0.0,
"step": 918
},
{
"epoch": 0.022975,
"grad_norm": 14.6875,
"grad_norm_var": 16.556705729166666,
"learning_rate": 0.0001,
"loss": 6.862,
"loss/crossentropy": 1.5273187160491943,
"loss/hidden": 0.0986328125,
"loss/logits": 0.006548475474119186,
"loss/reg": 5.229493618011475,
"loss/twn": 0.0,
"step": 919
},
{
"epoch": 0.023,
"grad_norm": 10.25,
"grad_norm_var": 17.4609375,
"learning_rate": 0.0001,
"loss": 8.096,
"loss/crossentropy": 2.765378952026367,
"loss/hidden": 0.09375,
"loss/logits": 0.007458665873855352,
"loss/reg": 5.229459285736084,
"loss/twn": 0.0,
"step": 920
},
{
"epoch": 0.023025,
"grad_norm": 15.875,
"grad_norm_var": 17.468994140625,
"learning_rate": 0.0001,
"loss": 6.2566,
"loss/crossentropy": 0.8696529865264893,
"loss/hidden": 0.1484375,
"loss/logits": 0.009295967407524586,
"loss/reg": 5.229222297668457,
"loss/twn": 0.0,
"step": 921
},
{
"epoch": 0.02305,
"grad_norm": 14.5,
"grad_norm_var": 17.077197265625,
"learning_rate": 0.0001,
"loss": 7.8853,
"loss/crossentropy": 2.496795892715454,
"loss/hidden": 0.1494140625,
"loss/logits": 0.009545085951685905,
"loss/reg": 5.229542255401611,
"loss/twn": 0.0,
"step": 922
},
{
"epoch": 0.023075,
"grad_norm": 23.25,
"grad_norm_var": 15.957014973958334,
"learning_rate": 0.0001,
"loss": 7.049,
"loss/crossentropy": 1.6003493070602417,
"loss/hidden": 0.2060546875,
"loss/logits": 0.013520617038011551,
"loss/reg": 5.229060173034668,
"loss/twn": 0.0,
"step": 923
},
{
"epoch": 0.0231,
"grad_norm": 9.75,
"grad_norm_var": 16.564436848958334,
"learning_rate": 0.0001,
"loss": 6.8671,
"loss/crossentropy": 1.5159342288970947,
"loss/hidden": 0.10986328125,
"loss/logits": 0.011563955806195736,
"loss/reg": 5.2297444343566895,
"loss/twn": 0.0,
"step": 924
},
{
"epoch": 0.023125,
"grad_norm": 9.125,
"grad_norm_var": 17.382535807291667,
"learning_rate": 0.0001,
"loss": 7.8368,
"loss/crossentropy": 2.537524938583374,
"loss/hidden": 0.064453125,
"loss/logits": 0.005966213531792164,
"loss/reg": 5.228902339935303,
"loss/twn": 0.0,
"step": 925
},
{
"epoch": 0.02315,
"grad_norm": 10.6875,
"grad_norm_var": 17.446598307291666,
"learning_rate": 0.0001,
"loss": 7.097,
"loss/crossentropy": 1.695339560508728,
"loss/hidden": 0.1630859375,
"loss/logits": 0.009303221479058266,
"loss/reg": 5.229240894317627,
"loss/twn": 0.0,
"step": 926
},
{
"epoch": 0.023175,
"grad_norm": 10.375,
"grad_norm_var": 17.941520182291665,
"learning_rate": 0.0001,
"loss": 8.0381,
"loss/crossentropy": 2.6705760955810547,
"loss/hidden": 0.12890625,
"loss/logits": 0.00920666940510273,
"loss/reg": 5.229386329650879,
"loss/twn": 0.0,
"step": 927
},
{
"epoch": 0.0232,
"grad_norm": 8.75,
"grad_norm_var": 19.055973307291666,
"learning_rate": 0.0001,
"loss": 7.8173,
"loss/crossentropy": 2.529360055923462,
"loss/hidden": 0.0546875,
"loss/logits": 0.003768081543967128,
"loss/reg": 5.229437351226807,
"loss/twn": 0.0,
"step": 928
},
{
"epoch": 0.023225,
"grad_norm": 74.5,
"grad_norm_var": 253.73483072916667,
"learning_rate": 0.0001,
"loss": 6.3033,
"loss/crossentropy": 0.9277183413505554,
"loss/hidden": 0.138671875,
"loss/logits": 0.007658226415514946,
"loss/reg": 5.229298114776611,
"loss/twn": 0.0,
"step": 929
},
{
"epoch": 0.02325,
"grad_norm": 19.875,
"grad_norm_var": 251.03292643229167,
"learning_rate": 0.0001,
"loss": 8.3524,
"loss/crossentropy": 2.9925942420959473,
"loss/hidden": 0.1201171875,
"loss/logits": 0.010561013594269753,
"loss/reg": 5.229130268096924,
"loss/twn": 0.0,
"step": 930
},
{
"epoch": 0.023275,
"grad_norm": 25.5,
"grad_norm_var": 253.27902018229167,
"learning_rate": 0.0001,
"loss": 7.8689,
"loss/crossentropy": 2.5310251712799072,
"loss/hidden": 0.0986328125,
"loss/logits": 0.010016044601798058,
"loss/reg": 5.229218482971191,
"loss/twn": 0.0,
"step": 931
},
{
"epoch": 0.0233,
"grad_norm": 138.0,
"grad_norm_var": 1142.0577962239583,
"learning_rate": 0.0001,
"loss": 6.7784,
"loss/crossentropy": 1.2963286638259888,
"loss/hidden": 0.2431640625,
"loss/logits": 0.00944933295249939,
"loss/reg": 5.229430198669434,
"loss/twn": 0.0,
"step": 932
},
{
"epoch": 0.023325,
"grad_norm": 12.0625,
"grad_norm_var": 1139.352197265625,
"learning_rate": 0.0001,
"loss": 8.0218,
"loss/crossentropy": 2.6391210556030273,
"loss/hidden": 0.142578125,
"loss/logits": 0.011118912138044834,
"loss/reg": 5.229004383087158,
"loss/twn": 0.0,
"step": 933
},
{
"epoch": 0.02335,
"grad_norm": 10.75,
"grad_norm_var": 1153.342431640625,
"learning_rate": 0.0001,
"loss": 7.3416,
"loss/crossentropy": 2.0831658840179443,
"loss/hidden": 0.0279541015625,
"loss/logits": 0.0012758576776832342,
"loss/reg": 5.229192733764648,
"loss/twn": 0.0,
"step": 934
},
{
"epoch": 0.023375,
"grad_norm": 29.125,
"grad_norm_var": 1145.5634765625,
"learning_rate": 0.0001,
"loss": 6.8365,
"loss/crossentropy": 1.4616435766220093,
"loss/hidden": 0.138671875,
"loss/logits": 0.007113803178071976,
"loss/reg": 5.229030132293701,
"loss/twn": 0.0,
"step": 935
},
{
"epoch": 0.0234,
"grad_norm": 13.625,
"grad_norm_var": 1139.00859375,
"learning_rate": 0.0001,
"loss": 6.1257,
"loss/crossentropy": 0.7412286400794983,
"loss/hidden": 0.1484375,
"loss/logits": 0.006910163909196854,
"loss/reg": 5.229117393493652,
"loss/twn": 0.0,
"step": 936
},
{
"epoch": 0.023425,
"grad_norm": 98.5,
"grad_norm_var": 1447.4322265625,
"learning_rate": 0.0001,
"loss": 7.7219,
"loss/crossentropy": 2.3482983112335205,
"loss/hidden": 0.130859375,
"loss/logits": 0.013444026932120323,
"loss/reg": 5.229300498962402,
"loss/twn": 0.0,
"step": 937
},
{
"epoch": 0.02345,
"grad_norm": 6.3125,
"grad_norm_var": 1470.478759765625,
"learning_rate": 0.0001,
"loss": 6.4927,
"loss/crossentropy": 1.20167076587677,
"loss/hidden": 0.0595703125,
"loss/logits": 0.0022249873727560043,
"loss/reg": 5.229248523712158,
"loss/twn": 0.0,
"step": 938
},
{
"epoch": 0.023475,
"grad_norm": 9.3125,
"grad_norm_var": 1497.5080729166666,
"learning_rate": 0.0001,
"loss": 7.1103,
"loss/crossentropy": 1.7726614475250244,
"loss/hidden": 0.0986328125,
"loss/logits": 0.009686892852187157,
"loss/reg": 5.2292704582214355,
"loss/twn": 0.0,
"step": 939
},
{
"epoch": 0.0235,
"grad_norm": 62.75,
"grad_norm_var": 1527.21015625,
"learning_rate": 0.0001,
"loss": 6.9551,
"loss/crossentropy": 1.6360843181610107,
"loss/hidden": 0.08642578125,
"loss/logits": 0.0034621984232217073,
"loss/reg": 5.229147434234619,
"loss/twn": 0.0,
"step": 940
},
{
"epoch": 0.023525,
"grad_norm": 25.0,
"grad_norm_var": 1490.9374348958333,
"learning_rate": 0.0001,
"loss": 6.9106,
"loss/crossentropy": 1.543732762336731,
"loss/hidden": 0.12890625,
"loss/logits": 0.008542709052562714,
"loss/reg": 5.229380130767822,
"loss/twn": 0.0,
"step": 941
},
{
"epoch": 0.02355,
"grad_norm": 15.0,
"grad_norm_var": 1478.2952962239583,
"learning_rate": 0.0001,
"loss": 8.1479,
"loss/crossentropy": 2.7828969955444336,
"loss/hidden": 0.12255859375,
"loss/logits": 0.013092401437461376,
"loss/reg": 5.229334831237793,
"loss/twn": 0.0,
"step": 942
},
{
"epoch": 0.023575,
"grad_norm": 10.375,
"grad_norm_var": 1478.2952962239583,
"learning_rate": 0.0001,
"loss": 6.8721,
"loss/crossentropy": 1.47151517868042,
"loss/hidden": 0.1611328125,
"loss/logits": 0.010217259638011456,
"loss/reg": 5.229234218597412,
"loss/twn": 0.0,
"step": 943
},
{
"epoch": 0.0236,
"grad_norm": 10.5,
"grad_norm_var": 1472.3699055989584,
"learning_rate": 0.0001,
"loss": 6.1565,
"loss/crossentropy": 0.7564952373504639,
"loss/hidden": 0.1630859375,
"loss/logits": 0.007633813191205263,
"loss/reg": 5.229316234588623,
"loss/twn": 0.0,
"step": 944
},
{
"epoch": 0.023625,
"grad_norm": 12.0,
"grad_norm_var": 1387.962353515625,
"learning_rate": 0.0001,
"loss": 8.4192,
"loss/crossentropy": 3.100782871246338,
"loss/hidden": 0.08154296875,
"loss/logits": 0.00750060984864831,
"loss/reg": 5.22934627532959,
"loss/twn": 0.0,
"step": 945
},
{
"epoch": 0.02365,
"grad_norm": 10.5,
"grad_norm_var": 1407.571728515625,
"learning_rate": 0.0001,
"loss": 7.9888,
"loss/crossentropy": 2.6322426795959473,
"loss/hidden": 0.11962890625,
"loss/logits": 0.008000584319233894,
"loss/reg": 5.2289581298828125,
"loss/twn": 0.0,
"step": 946
},
{
"epoch": 0.023675,
"grad_norm": 15.5625,
"grad_norm_var": 1420.4775390625,
"learning_rate": 0.0001,
"loss": 8.039,
"loss/crossentropy": 2.6621694564819336,
"loss/hidden": 0.138671875,
"loss/logits": 0.008840564638376236,
"loss/reg": 5.229346752166748,
"loss/twn": 0.0,
"step": 947
},
{
"epoch": 0.0237,
"grad_norm": 9.1875,
"grad_norm_var": 601.947900390625,
"learning_rate": 0.0001,
"loss": 6.9809,
"loss/crossentropy": 1.6236486434936523,
"loss/hidden": 0.1201171875,
"loss/logits": 0.0081618158146739,
"loss/reg": 5.229002475738525,
"loss/twn": 0.0,
"step": 948
},
{
"epoch": 0.023725,
"grad_norm": 52.0,
"grad_norm_var": 649.196875,
"learning_rate": 0.0001,
"loss": 7.1623,
"loss/crossentropy": 1.6345115900039673,
"loss/hidden": 0.294921875,
"loss/logits": 0.00377917499281466,
"loss/reg": 5.2290778160095215,
"loss/twn": 0.0,
"step": 949
},
{
"epoch": 0.02375,
"grad_norm": 11.8125,
"grad_norm_var": 647.3327962239583,
"learning_rate": 0.0001,
"loss": 6.9575,
"loss/crossentropy": 1.564661979675293,
"loss/hidden": 0.1572265625,
"loss/logits": 0.006771073676645756,
"loss/reg": 5.228812217712402,
"loss/twn": 0.0,
"step": 950
},
{
"epoch": 0.023775,
"grad_norm": 9.5,
"grad_norm_var": 659.2304524739583,
"learning_rate": 0.0001,
"loss": 7.8587,
"loss/crossentropy": 2.5682647228240967,
"loss/hidden": 0.05712890625,
"loss/logits": 0.003940091468393803,
"loss/reg": 5.229336261749268,
"loss/twn": 0.0,
"step": 951
},
{
"epoch": 0.0238,
"grad_norm": 12.25,
"grad_norm_var": 661.1124837239583,
"learning_rate": 0.0001,
"loss": 7.4498,
"loss/crossentropy": 2.0796985626220703,
"loss/hidden": 0.1328125,
"loss/logits": 0.007796227466315031,
"loss/reg": 5.229506015777588,
"loss/twn": 0.0,
"step": 952
},
{
"epoch": 0.023825,
"grad_norm": 12.9375,
"grad_norm_var": 259.1692708333333,
"learning_rate": 0.0001,
"loss": 7.7385,
"loss/crossentropy": 2.351168155670166,
"loss/hidden": 0.1484375,
"loss/logits": 0.010073304176330566,
"loss/reg": 5.228799819946289,
"loss/twn": 0.0,
"step": 953
},
{
"epoch": 0.02385,
"grad_norm": 7.3125,
"grad_norm_var": 257.6984375,
"learning_rate": 0.0001,
"loss": 6.6797,
"loss/crossentropy": 1.3460360765457153,
"loss/hidden": 0.0986328125,
"loss/logits": 0.005743211135268211,
"loss/reg": 5.229253768920898,
"loss/twn": 0.0,
"step": 954
},
{
"epoch": 0.023875,
"grad_norm": 50.5,
"grad_norm_var": 316.70167643229166,
"learning_rate": 0.0001,
"loss": 7.4466,
"loss/crossentropy": 2.0729310512542725,
"loss/hidden": 0.134765625,
"loss/logits": 0.009920709766447544,
"loss/reg": 5.228950023651123,
"loss/twn": 0.0,
"step": 955
},
{
"epoch": 0.0239,
"grad_norm": 32.0,
"grad_norm_var": 202.36612955729166,
"learning_rate": 0.0001,
"loss": 6.7081,
"loss/crossentropy": 1.379569172859192,
"loss/hidden": 0.09521484375,
"loss/logits": 0.00438508577644825,
"loss/reg": 5.228931903839111,
"loss/twn": 0.0,
"step": 956
},
{
"epoch": 0.023925,
"grad_norm": 8.875,
"grad_norm_var": 204.70089518229167,
"learning_rate": 0.0001,
"loss": 6.1735,
"loss/crossentropy": 0.8404383063316345,
"loss/hidden": 0.0986328125,
"loss/logits": 0.0052786958403885365,
"loss/reg": 5.229166030883789,
"loss/twn": 0.0,
"step": 957
},
{
"epoch": 0.02395,
"grad_norm": 11.0625,
"grad_norm_var": 206.99264322916667,
"learning_rate": 0.0001,
"loss": 8.0646,
"loss/crossentropy": 2.8019535541534424,
"loss/hidden": 0.0302734375,
"loss/logits": 0.003258619224652648,
"loss/reg": 5.229119777679443,
"loss/twn": 0.0,
"step": 958
},
{
"epoch": 0.023975,
"grad_norm": 62.25,
"grad_norm_var": 327.46692708333336,
"learning_rate": 0.0001,
"loss": 6.2366,
"loss/crossentropy": 0.899376630783081,
"loss/hidden": 0.1044921875,
"loss/logits": 0.0037081395275890827,
"loss/reg": 5.229043960571289,
"loss/twn": 0.0,
"step": 959
},
{
"epoch": 0.024,
"grad_norm": 19.75,
"grad_norm_var": 320.46197916666665,
"learning_rate": 0.0001,
"loss": 8.3454,
"loss/crossentropy": 2.9434502124786377,
"loss/hidden": 0.1572265625,
"loss/logits": 0.015512878075242043,
"loss/reg": 5.2291717529296875,
"loss/twn": 0.0,
"step": 960
},
{
"epoch": 0.024025,
"grad_norm": 12.8125,
"grad_norm_var": 319.51808268229166,
"learning_rate": 0.0001,
"loss": 6.8387,
"loss/crossentropy": 1.3557770252227783,
"loss/hidden": 0.2470703125,
"loss/logits": 0.006780410185456276,
"loss/reg": 5.229072093963623,
"loss/twn": 0.0,
"step": 961
},
{
"epoch": 0.02405,
"grad_norm": 69.0,
"grad_norm_var": 450.38136393229166,
"learning_rate": 0.0001,
"loss": 6.6024,
"loss/crossentropy": 1.2391810417175293,
"loss/hidden": 0.12255859375,
"loss/logits": 0.011559647507965565,
"loss/reg": 5.229081630706787,
"loss/twn": 0.0,
"step": 962
},
{
"epoch": 0.024075,
"grad_norm": 11.1875,
"grad_norm_var": 456.966650390625,
"learning_rate": 0.0001,
"loss": 6.8096,
"loss/crossentropy": 1.4291319847106934,
"loss/hidden": 0.150390625,
"loss/logits": 0.0013875500299036503,
"loss/reg": 5.228731632232666,
"loss/twn": 0.0,
"step": 963
},
{
"epoch": 0.0241,
"grad_norm": 13.8125,
"grad_norm_var": 448.843994140625,
"learning_rate": 0.0001,
"loss": 6.9963,
"loss/crossentropy": 1.6549384593963623,
"loss/hidden": 0.1025390625,
"loss/logits": 0.009464550763368607,
"loss/reg": 5.22934103012085,
"loss/twn": 0.0,
"step": 964
},
{
"epoch": 0.024125,
"grad_norm": 160.0,
"grad_norm_var": 1569.287744140625,
"learning_rate": 0.0001,
"loss": 5.8326,
"loss/crossentropy": 0.4071745276451111,
"loss/hidden": 0.19140625,
"loss/logits": 0.005319996736943722,
"loss/reg": 5.228703498840332,
"loss/twn": 0.0,
"step": 965
},
{
"epoch": 0.02415,
"grad_norm": 9.625,
"grad_norm_var": 1575.3483723958334,
"learning_rate": 0.0001,
"loss": 7.4917,
"loss/crossentropy": 2.1543774604797363,
"loss/hidden": 0.0986328125,
"loss/logits": 0.00969572365283966,
"loss/reg": 5.229022026062012,
"loss/twn": 0.0,
"step": 966
},
{
"epoch": 0.024175,
"grad_norm": 22.5,
"grad_norm_var": 1547.8994140625,
"learning_rate": 0.0001,
"loss": 8.001,
"loss/crossentropy": 2.677924156188965,
"loss/hidden": 0.08642578125,
"loss/logits": 0.007645599078387022,
"loss/reg": 5.228973388671875,
"loss/twn": 0.0,
"step": 967
},
{
"epoch": 0.0242,
"grad_norm": 14.1875,
"grad_norm_var": 1542.969384765625,
"learning_rate": 0.0001,
"loss": 7.5317,
"loss/crossentropy": 2.1442387104034424,
"loss/hidden": 0.146484375,
"loss/logits": 0.012024961411952972,
"loss/reg": 5.228928089141846,
"loss/twn": 0.0,
"step": 968
},
{
"epoch": 0.024225,
"grad_norm": 12.5,
"grad_norm_var": 1544.1145182291666,
"learning_rate": 0.0001,
"loss": 6.3484,
"loss/crossentropy": 0.9374382495880127,
"loss/hidden": 0.177734375,
"loss/logits": 0.004194112028926611,
"loss/reg": 5.229069232940674,
"loss/twn": 0.0,
"step": 969
},
{
"epoch": 0.02425,
"grad_norm": 17.75,
"grad_norm_var": 1516.099072265625,
"learning_rate": 0.0001,
"loss": 6.8346,
"loss/crossentropy": 1.4595236778259277,
"loss/hidden": 0.142578125,
"loss/logits": 0.0036797509528696537,
"loss/reg": 5.228834629058838,
"loss/twn": 0.0,
"step": 970
},
{
"epoch": 0.024275,
"grad_norm": 12.3125,
"grad_norm_var": 1518.0780598958333,
"learning_rate": 0.0001,
"loss": 6.2077,
"loss/crossentropy": 0.8395573496818542,
"loss/hidden": 0.1328125,
"loss/logits": 0.0061765448190271854,
"loss/reg": 5.22913122177124,
"loss/twn": 0.0,
"step": 971
},
{
"epoch": 0.0243,
"grad_norm": 8.9375,
"grad_norm_var": 1547.0202962239584,
"learning_rate": 0.0001,
"loss": 6.998,
"loss/crossentropy": 1.55341374874115,
"loss/hidden": 0.2080078125,
"loss/logits": 0.008093073032796383,
"loss/reg": 5.228493690490723,
"loss/twn": 0.0,
"step": 972
},
{
"epoch": 0.024325,
"grad_norm": 8.5625,
"grad_norm_var": 1547.8716145833334,
"learning_rate": 0.0001,
"loss": 7.1173,
"loss/crossentropy": 1.7166783809661865,
"loss/hidden": 0.1640625,
"loss/logits": 0.007523189298808575,
"loss/reg": 5.229001045227051,
"loss/twn": 0.0,
"step": 973
},
{
"epoch": 0.02435,
"grad_norm": 224.0,
"grad_norm_var": 3868.502197265625,
"learning_rate": 0.0001,
"loss": 6.3737,
"loss/crossentropy": 0.9948546886444092,
"loss/hidden": 0.1435546875,
"loss/logits": 0.00641383184120059,
"loss/reg": 5.228926658630371,
"loss/twn": 0.0,
"step": 974
},
{
"epoch": 0.024375,
"grad_norm": 15.375,
"grad_norm_var": 3882.076416015625,
"learning_rate": 0.0001,
"loss": 8.2234,
"loss/crossentropy": 2.891629219055176,
"loss/hidden": 0.09375,
"loss/logits": 0.00904359295964241,
"loss/reg": 5.228950500488281,
"loss/twn": 0.0,
"step": 975
},
{
"epoch": 0.0244,
"grad_norm": 740.0,
"grad_norm_var": 34406.04633789063,
"learning_rate": 0.0001,
"loss": 6.273,
"loss/crossentropy": 0.9103600978851318,
"loss/hidden": 0.1298828125,
"loss/logits": 0.003978141117841005,
"loss/reg": 5.228822231292725,
"loss/twn": 0.0,
"step": 976
},
{
"epoch": 0.024425,
"grad_norm": 12.75,
"grad_norm_var": 34406.644270833334,
"learning_rate": 0.0001,
"loss": 7.4745,
"loss/crossentropy": 2.0163183212280273,
"loss/hidden": 0.2197265625,
"loss/logits": 0.009444335475564003,
"loss/reg": 5.229053497314453,
"loss/twn": 0.0,
"step": 977
},
{
"epoch": 0.02445,
"grad_norm": 12.4375,
"grad_norm_var": 34723.73305664062,
"learning_rate": 0.0001,
"loss": 7.8159,
"loss/crossentropy": 2.453317642211914,
"loss/hidden": 0.125,
"loss/logits": 0.008999479934573174,
"loss/reg": 5.228621959686279,
"loss/twn": 0.0,
"step": 978
},
{
"epoch": 0.024475,
"grad_norm": 15.0,
"grad_norm_var": 34689.15546875,
"learning_rate": 0.0001,
"loss": 8.3269,
"loss/crossentropy": 2.9546549320220947,
"loss/hidden": 0.130859375,
"loss/logits": 0.012454254552721977,
"loss/reg": 5.2289299964904785,
"loss/twn": 0.0,
"step": 979
},
{
"epoch": 0.0245,
"grad_norm": 23.625,
"grad_norm_var": 34606.96300455729,
"learning_rate": 0.0001,
"loss": 5.9559,
"loss/crossentropy": 0.5534784197807312,
"loss/hidden": 0.166015625,
"loss/logits": 0.007569343317300081,
"loss/reg": 5.2288498878479,
"loss/twn": 0.0,
"step": 980
},
{
"epoch": 0.024525,
"grad_norm": 12.125,
"grad_norm_var": 34432.74777018229,
"learning_rate": 0.0001,
"loss": 8.0158,
"loss/crossentropy": 2.6883370876312256,
"loss/hidden": 0.08642578125,
"loss/logits": 0.012242003343999386,
"loss/reg": 5.228771686553955,
"loss/twn": 0.0,
"step": 981
},
{
"epoch": 0.02455,
"grad_norm": 17.0,
"grad_norm_var": 34374.21638997396,
"learning_rate": 0.0001,
"loss": 8.0208,
"loss/crossentropy": 2.660618782043457,
"loss/hidden": 0.11767578125,
"loss/logits": 0.013429110869765282,
"loss/reg": 5.229035377502441,
"loss/twn": 0.0,
"step": 982
},
{
"epoch": 0.024575,
"grad_norm": 26.875,
"grad_norm_var": 34345.91560872396,
"learning_rate": 0.0001,
"loss": 7.0489,
"loss/crossentropy": 1.6950386762619019,
"loss/hidden": 0.119140625,
"loss/logits": 0.006002393085509539,
"loss/reg": 5.2287116050720215,
"loss/twn": 0.0,
"step": 983
},
{
"epoch": 0.0246,
"grad_norm": 44.0,
"grad_norm_var": 34166.33411458333,
"learning_rate": 0.0001,
"loss": 7.4035,
"loss/crossentropy": 2.058884859085083,
"loss/hidden": 0.107421875,
"loss/logits": 0.008141661062836647,
"loss/reg": 5.22901725769043,
"loss/twn": 0.0,
"step": 984
},
{
"epoch": 0.024625,
"grad_norm": 196.0,
"grad_norm_var": 34736.71328125,
"learning_rate": 0.0001,
"loss": 8.0381,
"loss/crossentropy": 2.5864806175231934,
"loss/hidden": 0.2119140625,
"loss/logits": 0.01062602736055851,
"loss/reg": 5.229076862335205,
"loss/twn": 0.0,
"step": 985
},
{
"epoch": 0.02465,
"grad_norm": 11.8125,
"grad_norm_var": 34793.479801432295,
"learning_rate": 0.0001,
"loss": 7.9697,
"loss/crossentropy": 2.6345629692077637,
"loss/hidden": 0.0986328125,
"loss/logits": 0.007676620967686176,
"loss/reg": 5.228846073150635,
"loss/twn": 0.0,
"step": 986
},
{
"epoch": 0.024675,
"grad_norm": 11.8125,
"grad_norm_var": 34798.427978515625,
"learning_rate": 0.0001,
"loss": 7.7632,
"loss/crossentropy": 2.396425724029541,
"loss/hidden": 0.1240234375,
"loss/logits": 0.013703764416277409,
"loss/reg": 5.229032516479492,
"loss/twn": 0.0,
"step": 987
},
{
"epoch": 0.0247,
"grad_norm": 10.125,
"grad_norm_var": 34786.271875,
"learning_rate": 0.0001,
"loss": 8.1079,
"loss/crossentropy": 2.747255325317383,
"loss/hidden": 0.1240234375,
"loss/logits": 0.007296864874660969,
"loss/reg": 5.2292985916137695,
"loss/twn": 0.0,
"step": 988
},
{
"epoch": 0.024725,
"grad_norm": 15.0,
"grad_norm_var": 34722.09972330729,
"learning_rate": 0.0001,
"loss": 6.9097,
"loss/crossentropy": 1.5659387111663818,
"loss/hidden": 0.10986328125,
"loss/logits": 0.005029057152569294,
"loss/reg": 5.228851795196533,
"loss/twn": 0.0,
"step": 989
},
{
"epoch": 0.02475,
"grad_norm": 15.75,
"grad_norm_var": 33621.52016601562,
"learning_rate": 0.0001,
"loss": 7.1496,
"loss/crossentropy": 1.7617563009262085,
"loss/hidden": 0.1513671875,
"loss/logits": 0.007417085114866495,
"loss/reg": 5.229080677032471,
"loss/twn": 0.0,
"step": 990
},
{
"epoch": 0.024775,
"grad_norm": 14.75,
"grad_norm_var": 33626.407535807295,
"learning_rate": 0.0001,
"loss": 7.4938,
"loss/crossentropy": 2.093966245651245,
"loss/hidden": 0.162109375,
"loss/logits": 0.008837287314236164,
"loss/reg": 5.228903770446777,
"loss/twn": 0.0,
"step": 991
},
{
"epoch": 0.0248,
"grad_norm": 21.375,
"grad_norm_var": 2059.307275390625,
"learning_rate": 0.0001,
"loss": 5.6135,
"loss/crossentropy": 0.2334800660610199,
"loss/hidden": 0.14453125,
"loss/logits": 0.00651364354416728,
"loss/reg": 5.228927135467529,
"loss/twn": 0.0,
"step": 992
},
{
"epoch": 0.024825,
"grad_norm": 11.875,
"grad_norm_var": 2061.2249837239583,
"learning_rate": 0.0001,
"loss": 7.9624,
"loss/crossentropy": 2.5844943523406982,
"loss/hidden": 0.1396484375,
"loss/logits": 0.009160241112112999,
"loss/reg": 5.229069709777832,
"loss/twn": 0.0,
"step": 993
},
{
"epoch": 0.02485,
"grad_norm": 16.75,
"grad_norm_var": 2053.023372395833,
"learning_rate": 0.0001,
"loss": 7.9445,
"loss/crossentropy": 2.5577454566955566,
"loss/hidden": 0.146484375,
"loss/logits": 0.011256640776991844,
"loss/reg": 5.229057312011719,
"loss/twn": 0.0,
"step": 994
},
{
"epoch": 0.024875,
"grad_norm": 15.875,
"grad_norm_var": 2051.4388020833335,
"learning_rate": 0.0001,
"loss": 8.5701,
"loss/crossentropy": 3.218754529953003,
"loss/hidden": 0.115234375,
"loss/logits": 0.0072383033111691475,
"loss/reg": 5.228893280029297,
"loss/twn": 0.0,
"step": 995
},
{
"epoch": 0.0249,
"grad_norm": 11.6875,
"grad_norm_var": 2068.9751139322916,
"learning_rate": 0.0001,
"loss": 6.8167,
"loss/crossentropy": 1.4376857280731201,
"loss/hidden": 0.140625,
"loss/logits": 0.009291324764490128,
"loss/reg": 5.22910737991333,
"loss/twn": 0.0,
"step": 996
},
{
"epoch": 0.024925,
"grad_norm": 23.875,
"grad_norm_var": 2052.261962890625,
"learning_rate": 0.0001,
"loss": 7.6128,
"loss/crossentropy": 2.2152881622314453,
"loss/hidden": 0.1630859375,
"loss/logits": 0.0057820603251457214,
"loss/reg": 5.228606700897217,
"loss/twn": 0.0,
"step": 997
},
{
"epoch": 0.02495,
"grad_norm": 84.5,
"grad_norm_var": 2228.711181640625,
"learning_rate": 0.0001,
"loss": 8.1583,
"loss/crossentropy": 2.797482490539551,
"loss/hidden": 0.1220703125,
"loss/logits": 0.009698813781142235,
"loss/reg": 5.229077339172363,
"loss/twn": 0.0,
"step": 998
},
{
"epoch": 0.024975,
"grad_norm": 10.125,
"grad_norm_var": 2260.4925618489583,
"learning_rate": 0.0001,
"loss": 7.9002,
"loss/crossentropy": 2.5686769485473633,
"loss/hidden": 0.09619140625,
"loss/logits": 0.006712072994560003,
"loss/reg": 5.228668212890625,
"loss/twn": 0.0,
"step": 999
},
{
"epoch": 0.025,
"grad_norm": 71.0,
"grad_norm_var": 2348.5097493489584,
"learning_rate": 0.0001,
"loss": 7.9685,
"loss/crossentropy": 2.6334779262542725,
"loss/hidden": 0.099609375,
"loss/logits": 0.0062838364392519,
"loss/reg": 5.229094505310059,
"loss/twn": 0.0,
"step": 1000
}
],
"logging_steps": 1,
"max_steps": 40000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0457034088448e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}