| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.025, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2.5e-05, |
| "grad_norm": 1860.177978515625, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 346.4354, |
| "loss/crossentropy": 2.979090690612793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.38270998001098633, |
| "loss/reg": 343.0735778808594, |
| "step": 1 |
| }, |
| { |
| "epoch": 5e-05, |
| "grad_norm": 19.24078941345215, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 346.2272, |
| "loss/crossentropy": 2.8327953815460205, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3208012580871582, |
| "loss/reg": 343.0735778808594, |
| "step": 2 |
| }, |
| { |
| "epoch": 7.5e-05, |
| "grad_norm": 12.568899154663086, |
| "learning_rate": 3e-06, |
| "loss": 346.5434, |
| "loss/crossentropy": 3.0858442783355713, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.38866907358169556, |
| "loss/reg": 343.0688781738281, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0001, |
| "grad_norm": 493.76806640625, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 346.1103, |
| "loss/crossentropy": 2.736830711364746, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3126556873321533, |
| "loss/reg": 343.0608215332031, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.000125, |
| "grad_norm": 17.433624267578125, |
| "learning_rate": 5e-06, |
| "loss": 346.3604, |
| "loss/crossentropy": 2.9568891525268555, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3535460829734802, |
| "loss/reg": 343.0499267578125, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00015, |
| "grad_norm": 43.95212936401367, |
| "learning_rate": 6e-06, |
| "loss": 346.2806, |
| "loss/crossentropy": 2.8500430583953857, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3941458761692047, |
| "loss/reg": 343.03643798828125, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.000175, |
| "grad_norm": 14.617745399475098, |
| "learning_rate": 7.000000000000001e-06, |
| "loss": 346.1611, |
| "loss/crossentropy": 2.815033435821533, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3255086839199066, |
| "loss/reg": 343.0205993652344, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0002, |
| "grad_norm": 94.23649597167969, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 346.2723, |
| "loss/crossentropy": 2.938474178314209, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.33254432678222656, |
| "loss/reg": 343.00128173828125, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.000225, |
| "grad_norm": 13.357231140136719, |
| "learning_rate": 9e-06, |
| "loss": 345.8072, |
| "loss/crossentropy": 2.565714120864868, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26191234588623047, |
| "loss/reg": 342.9796142578125, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00025, |
| "grad_norm": 7.0878190994262695, |
| "learning_rate": 1e-05, |
| "loss": 346.0762, |
| "loss/crossentropy": 2.8687756061553955, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25318092107772827, |
| "loss/reg": 342.9542236328125, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.000275, |
| "grad_norm": 33.097476959228516, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 345.8165, |
| "loss/crossentropy": 2.6577072143554688, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23302724957466125, |
| "loss/reg": 342.92578125, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0003, |
| "grad_norm": 17.795812606811523, |
| "learning_rate": 1.2e-05, |
| "loss": 346.2238, |
| "loss/crossentropy": 3.0676422119140625, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2625121474266052, |
| "loss/reg": 342.8936462402344, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.000325, |
| "grad_norm": 5.172369003295898, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 345.8532, |
| "loss/crossentropy": 2.8076300621032715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18656937777996063, |
| "loss/reg": 342.8590087890625, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.00035, |
| "grad_norm": 5.153316497802734, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 345.4301, |
| "loss/crossentropy": 2.4606783390045166, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15036556124687195, |
| "loss/reg": 342.8190612792969, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.000375, |
| "grad_norm": 5.007053852081299, |
| "learning_rate": 1.5e-05, |
| "loss": 345.9839, |
| "loss/crossentropy": 3.0140442848205566, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19039079546928406, |
| "loss/reg": 342.7794189453125, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0004, |
| "grad_norm": 9.89411449432373, |
| "grad_norm_var": 218532.61416541884, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 345.7775, |
| "loss/crossentropy": 2.900439977645874, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14180609583854675, |
| "loss/reg": 342.7352600097656, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.000425, |
| "grad_norm": 4.438236713409424, |
| "grad_norm_var": 14521.267629174466, |
| "learning_rate": 1.7000000000000003e-05, |
| "loss": 345.5058, |
| "loss/crossentropy": 2.6860878467559814, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13272428512573242, |
| "loss/reg": 342.68695068359375, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.00045, |
| "grad_norm": 6.959152698516846, |
| "grad_norm_var": 14580.739492472594, |
| "learning_rate": 1.8e-05, |
| "loss": 345.8498, |
| "loss/crossentropy": 3.0659737586975098, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14944659173488617, |
| "loss/reg": 342.6343688964844, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.000475, |
| "grad_norm": 4.815605640411377, |
| "grad_norm_var": 14622.192919380279, |
| "learning_rate": 1.9e-05, |
| "loss": 345.6652, |
| "loss/crossentropy": 2.9661877155303955, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11781711876392365, |
| "loss/reg": 342.5811462402344, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0005, |
| "grad_norm": 20.490373611450195, |
| "grad_norm_var": 526.7206949274603, |
| "learning_rate": 2e-05, |
| "loss": 345.5417, |
| "loss/crossentropy": 2.880610942840576, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1311033070087433, |
| "loss/reg": 342.530029296875, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.000525, |
| "grad_norm": 8.06376838684082, |
| "grad_norm_var": 534.1263548023838, |
| "learning_rate": 2.1e-05, |
| "loss": 345.2038, |
| "loss/crossentropy": 2.6139538288116455, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1176835298538208, |
| "loss/reg": 342.4721984863281, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.00055, |
| "grad_norm": 2.9717931747436523, |
| "grad_norm_var": 499.3808875231212, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 345.0013, |
| "loss/crossentropy": 2.48826265335083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09986373782157898, |
| "loss/reg": 342.4132080078125, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.000575, |
| "grad_norm": 3.0796430110931396, |
| "grad_norm_var": 509.5546291852752, |
| "learning_rate": 2.3000000000000003e-05, |
| "loss": 345.5402, |
| "loss/crossentropy": 3.083524227142334, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11023637652397156, |
| "loss/reg": 342.3464050292969, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0006, |
| "grad_norm": 4.465290069580078, |
| "grad_norm_var": 66.02530253493347, |
| "learning_rate": 2.4e-05, |
| "loss": 345.118, |
| "loss/crossentropy": 2.7357635498046875, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10327694565057755, |
| "loss/reg": 342.2790222167969, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.000625, |
| "grad_norm": 2.7485175132751465, |
| "grad_norm_var": 67.58997383241844, |
| "learning_rate": 2.5e-05, |
| "loss": 345.032, |
| "loss/crossentropy": 2.7314038276672363, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09067989885807037, |
| "loss/reg": 342.2099609375, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.00065, |
| "grad_norm": 6.201633930206299, |
| "grad_norm_var": 67.84461638262088, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 345.2013, |
| "loss/crossentropy": 2.9543004035949707, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10873008519411087, |
| "loss/reg": 342.1382141113281, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.000675, |
| "grad_norm": 2.800253391265869, |
| "grad_norm_var": 26.949349170164666, |
| "learning_rate": 2.7000000000000002e-05, |
| "loss": 345.0995, |
| "loss/crossentropy": 2.9452881813049316, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09716818481683731, |
| "loss/reg": 342.0570983886719, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0007, |
| "grad_norm": 2.699833631515503, |
| "grad_norm_var": 19.218166857921933, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 344.7669, |
| "loss/crossentropy": 2.708958864212036, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08778566122055054, |
| "loss/reg": 341.9701843261719, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.000725, |
| "grad_norm": 2.547497272491455, |
| "grad_norm_var": 19.91571754218688, |
| "learning_rate": 2.9e-05, |
| "loss": 344.7711, |
| "loss/crossentropy": 2.8091719150543213, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08377528190612793, |
| "loss/reg": 341.8781433105469, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.00075, |
| "grad_norm": 2.952777862548828, |
| "grad_norm_var": 20.399598744815588, |
| "learning_rate": 3e-05, |
| "loss": 344.7112, |
| "loss/crossentropy": 2.8418242931365967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08888664841651917, |
| "loss/reg": 341.7804260253906, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.000775, |
| "grad_norm": 2.7115345001220703, |
| "grad_norm_var": 20.92066401501108, |
| "learning_rate": 3.1e-05, |
| "loss": 344.3383, |
| "loss/crossentropy": 2.567899703979492, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08625812828540802, |
| "loss/reg": 341.68408203125, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0008, |
| "grad_norm": 2.8718879222869873, |
| "grad_norm_var": 19.879086013782704, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 344.435, |
| "loss/crossentropy": 2.772453546524048, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08201509714126587, |
| "loss/reg": 341.58056640625, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.000825, |
| "grad_norm": 3.096867084503174, |
| "grad_norm_var": 20.10115293189847, |
| "learning_rate": 3.3e-05, |
| "loss": 344.5786, |
| "loss/crossentropy": 3.0346808433532715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07674358785152435, |
| "loss/reg": 341.4671936035156, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.00085, |
| "grad_norm": 2.8663108348846436, |
| "grad_norm_var": 20.061121544886294, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 344.4023, |
| "loss/crossentropy": 2.9602417945861816, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09072191268205643, |
| "loss/reg": 341.3513488769531, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.000875, |
| "grad_norm": 2.5331344604492188, |
| "grad_norm_var": 20.355035956744192, |
| "learning_rate": 3.5e-05, |
| "loss": 343.8108, |
| "loss/crossentropy": 2.4948618412017822, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07910144329071045, |
| "loss/reg": 341.2367858886719, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0009, |
| "grad_norm": 3.133824110031128, |
| "grad_norm_var": 2.3373841825731096, |
| "learning_rate": 3.6e-05, |
| "loss": 344.111, |
| "loss/crossentropy": 2.913165807723999, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08120056986808777, |
| "loss/reg": 341.11663818359375, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.000925, |
| "grad_norm": 2.7178568840026855, |
| "grad_norm_var": 0.8591776946022585, |
| "learning_rate": 3.7e-05, |
| "loss": 343.527, |
| "loss/crossentropy": 2.4706835746765137, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.06964104622602463, |
| "loss/reg": 340.9867248535156, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.00095, |
| "grad_norm": 2.5725350379943848, |
| "grad_norm_var": 0.8786228996593496, |
| "learning_rate": 3.8e-05, |
| "loss": 343.4507, |
| "loss/crossentropy": 2.5263352394104004, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07618534564971924, |
| "loss/reg": 340.84814453125, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.000975, |
| "grad_norm": 2.9023842811584473, |
| "grad_norm_var": 0.8816577904134154, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 343.5782, |
| "loss/crossentropy": 2.794910192489624, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07984155416488647, |
| "loss/reg": 340.7034606933594, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.001, |
| "grad_norm": 2.4993066787719727, |
| "grad_norm_var": 0.7689802883673867, |
| "learning_rate": 4e-05, |
| "loss": 343.2404, |
| "loss/crossentropy": 2.6109530925750732, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.06857939064502716, |
| "loss/reg": 340.5608215332031, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.001025, |
| "grad_norm": 2.9259226322174072, |
| "grad_norm_var": 0.7652114100620696, |
| "learning_rate": 4.1e-05, |
| "loss": 343.4892, |
| "loss/crossentropy": 2.9959700107574463, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.0883413702249527, |
| "loss/reg": 340.40484619140625, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00105, |
| "grad_norm": 2.8494505882263184, |
| "grad_norm_var": 0.03747455037091238, |
| "learning_rate": 4.2e-05, |
| "loss": 342.9757, |
| "loss/crossentropy": 2.6481616497039795, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07580338418483734, |
| "loss/reg": 340.2517395019531, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.001075, |
| "grad_norm": 2.8318848609924316, |
| "grad_norm_var": 0.03756942187646525, |
| "learning_rate": 4.3e-05, |
| "loss": 343.0756, |
| "loss/crossentropy": 2.914668083190918, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07487093657255173, |
| "loss/reg": 340.0860290527344, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.0011, |
| "grad_norm": 2.5573673248291016, |
| "grad_norm_var": 0.04063739560772698, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 342.7438, |
| "loss/crossentropy": 2.758633613586426, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07187295705080032, |
| "loss/reg": 339.913330078125, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.001125, |
| "grad_norm": 2.6494765281677246, |
| "grad_norm_var": 0.038049041798770604, |
| "learning_rate": 4.5e-05, |
| "loss": 342.6349, |
| "loss/crossentropy": 2.820467710494995, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07692387700080872, |
| "loss/reg": 339.7375183105469, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.00115, |
| "grad_norm": 2.503408432006836, |
| "grad_norm_var": 0.04103864613901654, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 342.367, |
| "loss/crossentropy": 2.7334773540496826, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.0775354877114296, |
| "loss/reg": 339.5559387207031, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.001175, |
| "grad_norm": 2.55014967918396, |
| "grad_norm_var": 0.04379427355291886, |
| "learning_rate": 4.7e-05, |
| "loss": 342.1882, |
| "loss/crossentropy": 2.7380266189575195, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07468418776988983, |
| "loss/reg": 339.37548828125, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.0012, |
| "grad_norm": 2.682318687438965, |
| "grad_norm_var": 0.04305705435034118, |
| "learning_rate": 4.8e-05, |
| "loss": 341.9997, |
| "loss/crossentropy": 2.7375612258911133, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.06463472545146942, |
| "loss/reg": 339.1974792480469, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.001225, |
| "grad_norm": 2.775233268737793, |
| "grad_norm_var": 0.03430480419531913, |
| "learning_rate": 4.9e-05, |
| "loss": 342.0445, |
| "loss/crossentropy": 2.9467415809631348, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07443998754024506, |
| "loss/reg": 339.0233459472656, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.00125, |
| "grad_norm": 2.5482699871063232, |
| "grad_norm_var": 0.034503307506799766, |
| "learning_rate": 5e-05, |
| "loss": 341.5835, |
| "loss/crossentropy": 2.6657731533050537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07320375740528107, |
| "loss/reg": 338.8445129394531, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.001275, |
| "grad_norm": 2.7958645820617676, |
| "grad_norm_var": 0.032900881109643856, |
| "learning_rate": 5.1000000000000006e-05, |
| "loss": 341.7335, |
| "loss/crossentropy": 3.0148534774780273, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08110088109970093, |
| "loss/reg": 338.6375427246094, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0013, |
| "grad_norm": 2.803079605102539, |
| "grad_norm_var": 0.02142033029363475, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 341.2834, |
| "loss/crossentropy": 2.762022018432617, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07052050530910492, |
| "loss/reg": 338.4508056640625, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.001325, |
| "grad_norm": 3.264613389968872, |
| "grad_norm_var": 0.041567737457307886, |
| "learning_rate": 5.300000000000001e-05, |
| "loss": 341.1463, |
| "loss/crossentropy": 2.8193376064300537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09100329875946045, |
| "loss/reg": 338.2359924316406, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.00135, |
| "grad_norm": 2.7328436374664307, |
| "grad_norm_var": 0.039766415905754825, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 340.6042, |
| "loss/crossentropy": 2.502927780151367, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07811737805604935, |
| "loss/reg": 338.0231628417969, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.001375, |
| "grad_norm": 3.4147236347198486, |
| "grad_norm_var": 0.06713011702883306, |
| "learning_rate": 5.500000000000001e-05, |
| "loss": 340.9179, |
| "loss/crossentropy": 3.0260379314422607, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09565207362174988, |
| "loss/reg": 337.7962341308594, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0014, |
| "grad_norm": 2.4889872074127197, |
| "grad_norm_var": 0.0675147239578789, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 340.2508, |
| "loss/crossentropy": 2.5990207195281982, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07676048576831818, |
| "loss/reg": 337.57501220703125, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.001425, |
| "grad_norm": 2.6865806579589844, |
| "grad_norm_var": 0.06622606037023218, |
| "learning_rate": 5.6999999999999996e-05, |
| "loss": 340.0853, |
| "loss/crossentropy": 2.6511263847351074, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07812380790710449, |
| "loss/reg": 337.35601806640625, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.00145, |
| "grad_norm": 2.625458002090454, |
| "grad_norm_var": 0.06664228909772092, |
| "learning_rate": 5.8e-05, |
| "loss": 339.9197, |
| "loss/crossentropy": 2.7347943782806396, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08070610463619232, |
| "loss/reg": 337.10418701171875, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.001475, |
| "grad_norm": 3.2201623916625977, |
| "grad_norm_var": 0.08059432957815199, |
| "learning_rate": 5.9e-05, |
| "loss": 339.55, |
| "loss/crossentropy": 2.606354236602783, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08992001414299011, |
| "loss/reg": 336.853759765625, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0015, |
| "grad_norm": 2.9708340167999268, |
| "grad_norm_var": 0.07963074673126963, |
| "learning_rate": 6e-05, |
| "loss": 339.8324, |
| "loss/crossentropy": 3.144277572631836, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09014974534511566, |
| "loss/reg": 336.59796142578125, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.001525, |
| "grad_norm": 3.067251443862915, |
| "grad_norm_var": 0.08246093717145656, |
| "learning_rate": 6.1e-05, |
| "loss": 339.0275, |
| "loss/crossentropy": 2.5903847217559814, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10276832431554794, |
| "loss/reg": 336.3343200683594, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.00155, |
| "grad_norm": 3.0696287155151367, |
| "grad_norm_var": 0.07855122581927745, |
| "learning_rate": 6.2e-05, |
| "loss": 339.1453, |
| "loss/crossentropy": 2.9826090335845947, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09507931768894196, |
| "loss/reg": 336.0675964355469, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.001575, |
| "grad_norm": 4.388347148895264, |
| "grad_norm_var": 0.2147750922195494, |
| "learning_rate": 6.3e-05, |
| "loss": 338.7891, |
| "loss/crossentropy": 2.883762836456299, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.12124631553888321, |
| "loss/reg": 335.7840576171875, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0016, |
| "grad_norm": 2.8880841732025146, |
| "grad_norm_var": 0.20950431287563612, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 338.2764, |
| "loss/crossentropy": 2.6835389137268066, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09364684671163559, |
| "loss/reg": 335.49920654296875, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.001625, |
| "grad_norm": 2.813340425491333, |
| "grad_norm_var": 0.2085356207302605, |
| "learning_rate": 6.500000000000001e-05, |
| "loss": 338.2492, |
| "loss/crossentropy": 2.953268527984619, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09223559498786926, |
| "loss/reg": 335.2037353515625, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.00165, |
| "grad_norm": 2.741157054901123, |
| "grad_norm_var": 0.19959997265595816, |
| "learning_rate": 6.6e-05, |
| "loss": 337.3503, |
| "loss/crossentropy": 2.3697705268859863, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.07957518845796585, |
| "loss/reg": 334.90087890625, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.001675, |
| "grad_norm": 3.6085925102233887, |
| "grad_norm_var": 0.21895872310333644, |
| "learning_rate": 6.7e-05, |
| "loss": 337.5594, |
| "loss/crossentropy": 2.8747456073760986, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09977666288614273, |
| "loss/reg": 334.5848388671875, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0017, |
| "grad_norm": 3.3652493953704834, |
| "grad_norm_var": 0.22027918073739935, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 337.4217, |
| "loss/crossentropy": 3.0556256771087646, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10333959013223648, |
| "loss/reg": 334.2626953125, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.001725, |
| "grad_norm": 3.1251964569091797, |
| "grad_norm_var": 0.2181387434263814, |
| "learning_rate": 6.9e-05, |
| "loss": 336.7281, |
| "loss/crossentropy": 2.6668808460235596, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1156100332736969, |
| "loss/reg": 333.9456481933594, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.00175, |
| "grad_norm": 2.704983711242676, |
| "grad_norm_var": 0.219459742286683, |
| "learning_rate": 7e-05, |
| "loss": 336.4565, |
| "loss/crossentropy": 2.7357680797576904, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09776544570922852, |
| "loss/reg": 333.6229553222656, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.001775, |
| "grad_norm": 2.7894554138183594, |
| "grad_norm_var": 0.21546068539459284, |
| "learning_rate": 7.1e-05, |
| "loss": 336.4183, |
| "loss/crossentropy": 3.014235258102417, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10774749517440796, |
| "loss/reg": 333.2962951660156, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0018, |
| "grad_norm": 2.5992395877838135, |
| "grad_norm_var": 0.2081999960008865, |
| "learning_rate": 7.2e-05, |
| "loss": 335.7705, |
| "loss/crossentropy": 2.7298364639282227, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.08281661570072174, |
| "loss/reg": 332.9578552246094, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.001825, |
| "grad_norm": 2.682684898376465, |
| "grad_norm_var": 0.20838528770162246, |
| "learning_rate": 7.3e-05, |
| "loss": 335.5829, |
| "loss/crossentropy": 2.8684210777282715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.0963786244392395, |
| "loss/reg": 332.6180725097656, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.00185, |
| "grad_norm": 2.8167314529418945, |
| "grad_norm_var": 0.20006842089726434, |
| "learning_rate": 7.4e-05, |
| "loss": 335.2414, |
| "loss/crossentropy": 2.8619723320007324, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09637870639562607, |
| "loss/reg": 332.2830810546875, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.001875, |
| "grad_norm": 2.792503595352173, |
| "grad_norm_var": 0.20197785150176603, |
| "learning_rate": 7.500000000000001e-05, |
| "loss": 334.7866, |
| "loss/crossentropy": 2.752199411392212, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10928215831518173, |
| "loss/reg": 331.9251403808594, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.0019, |
| "grad_norm": 3.629366636276245, |
| "grad_norm_var": 0.22419816294486522, |
| "learning_rate": 7.6e-05, |
| "loss": 334.6898, |
| "loss/crossentropy": 3.008653402328491, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11774040013551712, |
| "loss/reg": 331.5634460449219, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.001925, |
| "grad_norm": 2.7170186042785645, |
| "grad_norm_var": 0.2318815003938539, |
| "learning_rate": 7.7e-05, |
| "loss": 334.0274, |
| "loss/crossentropy": 2.7513821125030518, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.0983215868473053, |
| "loss/reg": 331.177734375, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.00195, |
| "grad_norm": 3.206778049468994, |
| "grad_norm_var": 0.2334942618955277, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 334.0803, |
| "loss/crossentropy": 3.193337917327881, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10869012027978897, |
| "loss/reg": 330.7782897949219, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.001975, |
| "grad_norm": 2.8564422130584717, |
| "grad_norm_var": 0.10767969782056426, |
| "learning_rate": 7.900000000000001e-05, |
| "loss": 333.2658, |
| "loss/crossentropy": 2.764251947402954, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.09988151490688324, |
| "loss/reg": 330.4017028808594, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.002, |
| "grad_norm": 2.5950961112976074, |
| "grad_norm_var": 0.11579763493383836, |
| "learning_rate": 8e-05, |
| "loss": 332.7263, |
| "loss/crossentropy": 2.626741409301758, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.088912233710289, |
| "loss/reg": 330.01068115234375, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.002025, |
| "grad_norm": 2.9864420890808105, |
| "grad_norm_var": 0.11474153182901219, |
| "learning_rate": 8.1e-05, |
| "loss": 332.4809, |
| "loss/crossentropy": 2.768440008163452, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11446887254714966, |
| "loss/reg": 329.5980224609375, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.00205, |
| "grad_norm": 2.9913902282714844, |
| "grad_norm_var": 0.11165182755541589, |
| "learning_rate": 8.2e-05, |
| "loss": 332.2192, |
| "loss/crossentropy": 2.921024799346924, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.10505437850952148, |
| "loss/reg": 329.193115234375, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.002075, |
| "grad_norm": 3.1324057579040527, |
| "grad_norm_var": 0.08506906493201592, |
| "learning_rate": 8.3e-05, |
| "loss": 331.8305, |
| "loss/crossentropy": 2.9233036041259766, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11960877478122711, |
| "loss/reg": 328.78759765625, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.0021, |
| "grad_norm": 4.698559761047363, |
| "grad_norm_var": 0.27231954898200667, |
| "learning_rate": 8.4e-05, |
| "loss": 331.6368, |
| "loss/crossentropy": 3.0924150943756104, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17615610361099243, |
| "loss/reg": 328.3682861328125, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.002125, |
| "grad_norm": 3.177739143371582, |
| "grad_norm_var": 0.27322718837700205, |
| "learning_rate": 8.5e-05, |
| "loss": 330.8532, |
| "loss/crossentropy": 2.7933664321899414, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11796407401561737, |
| "loss/reg": 327.9419250488281, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.00215, |
| "grad_norm": 3.030458688735962, |
| "grad_norm_var": 0.26602324580689635, |
| "learning_rate": 8.6e-05, |
| "loss": 330.4252, |
| "loss/crossentropy": 2.8065192699432373, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11260520666837692, |
| "loss/reg": 327.506103515625, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.002175, |
| "grad_norm": 2.590994119644165, |
| "grad_norm_var": 0.2752177678969853, |
| "learning_rate": 8.7e-05, |
| "loss": 329.8946, |
| "loss/crossentropy": 2.706664562225342, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11704559624195099, |
| "loss/reg": 327.0708923339844, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.0022, |
| "grad_norm": 2.971489429473877, |
| "grad_norm_var": 0.2624243358406903, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 329.6451, |
| "loss/crossentropy": 2.8904268741607666, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11559218168258667, |
| "loss/reg": 326.6390380859375, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.002225, |
| "grad_norm": 2.968080759048462, |
| "grad_norm_var": 0.25335665900247384, |
| "learning_rate": 8.900000000000001e-05, |
| "loss": 329.3453, |
| "loss/crossentropy": 3.0395331382751465, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.12317082285881042, |
| "loss/reg": 326.1826477050781, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.00225, |
| "grad_norm": 2.7764782905578613, |
| "grad_norm_var": 0.2548311632382782, |
| "learning_rate": 9e-05, |
| "loss": 328.5304, |
| "loss/crossentropy": 2.6738674640655518, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1210193932056427, |
| "loss/reg": 325.73553466796875, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.002275, |
| "grad_norm": 2.9657976627349854, |
| "grad_norm_var": 0.25029449720438685, |
| "learning_rate": 9.1e-05, |
| "loss": 327.8257, |
| "loss/crossentropy": 2.4202983379364014, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13400676846504211, |
| "loss/reg": 325.27142333984375, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.0023, |
| "grad_norm": 2.650662422180176, |
| "grad_norm_var": 0.23859044240333346, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 327.4639, |
| "loss/crossentropy": 2.5483720302581787, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11590175330638885, |
| "loss/reg": 324.7995910644531, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.002325, |
| "grad_norm": 3.443924903869629, |
| "grad_norm_var": 0.24227501888755848, |
| "learning_rate": 9.300000000000001e-05, |
| "loss": 327.2633, |
| "loss/crossentropy": 2.815500497817993, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1212223544716835, |
| "loss/reg": 324.32659912109375, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.00235, |
| "grad_norm": 3.134320020675659, |
| "grad_norm_var": 0.24123508076126352, |
| "learning_rate": 9.4e-05, |
| "loss": 326.7527, |
| "loss/crossentropy": 2.785496473312378, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13180789351463318, |
| "loss/reg": 323.8353576660156, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.002375, |
| "grad_norm": 2.610710382461548, |
| "grad_norm_var": 0.2516995603676444, |
| "learning_rate": 9.5e-05, |
| "loss": 326.0419, |
| "loss/crossentropy": 2.5883114337921143, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.11013612151145935, |
| "loss/reg": 323.34344482421875, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.0024, |
| "grad_norm": 3.3038907051086426, |
| "grad_norm_var": 0.24055345506962927, |
| "learning_rate": 9.6e-05, |
| "loss": 326.3564, |
| "loss/crossentropy": 3.360260009765625, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.12902778387069702, |
| "loss/reg": 322.8670959472656, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.002425, |
| "grad_norm": 3.0808234214782715, |
| "grad_norm_var": 0.23981223839680202, |
| "learning_rate": 9.7e-05, |
| "loss": 325.4279, |
| "loss/crossentropy": 2.9151549339294434, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1561610996723175, |
| "loss/reg": 322.3565979003906, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.00245, |
| "grad_norm": 3.7542147636413574, |
| "grad_norm_var": 0.2655938131607021, |
| "learning_rate": 9.8e-05, |
| "loss": 324.8318, |
| "loss/crossentropy": 2.83591628074646, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13835349678993225, |
| "loss/reg": 321.8575134277344, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.002475, |
| "grad_norm": 4.176578044891357, |
| "grad_norm_var": 0.3322401459220373, |
| "learning_rate": 9.900000000000001e-05, |
| "loss": 324.9787, |
| "loss/crossentropy": 3.4443886280059814, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20227839052677155, |
| "loss/reg": 321.3320007324219, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.0025, |
| "grad_norm": 3.2133636474609375, |
| "grad_norm_var": 0.17501650801164412, |
| "learning_rate": 0.0001, |
| "loss": 323.56, |
| "loss/crossentropy": 2.631444215774536, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13826847076416016, |
| "loss/reg": 320.7903137207031, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.002525, |
| "grad_norm": 3.3536131381988525, |
| "grad_norm_var": 0.17840699933572804, |
| "learning_rate": 0.0001, |
| "loss": 323.2537, |
| "loss/crossentropy": 2.823065757751465, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19433599710464478, |
| "loss/reg": 320.23626708984375, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.00255, |
| "grad_norm": 2.846142053604126, |
| "grad_norm_var": 0.18289270736203184, |
| "learning_rate": 0.0001, |
| "loss": 322.8064, |
| "loss/crossentropy": 2.977403163909912, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14205776154994965, |
| "loss/reg": 319.68695068359375, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.002575, |
| "grad_norm": 2.762763500213623, |
| "grad_norm_var": 0.1727341123234955, |
| "learning_rate": 0.0001, |
| "loss": 322.0067, |
| "loss/crossentropy": 2.7616305351257324, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13712918758392334, |
| "loss/reg": 319.10791015625, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.0026, |
| "grad_norm": 2.5470879077911377, |
| "grad_norm_var": 0.192723543133589, |
| "learning_rate": 0.0001, |
| "loss": 321.2095, |
| "loss/crossentropy": 2.548597812652588, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.12305562198162079, |
| "loss/reg": 318.5378723144531, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.002625, |
| "grad_norm": 4.06820821762085, |
| "grad_norm_var": 0.2491215448449329, |
| "learning_rate": 0.0001, |
| "loss": 320.8897, |
| "loss/crossentropy": 2.7753727436065674, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16259068250656128, |
| "loss/reg": 317.9517517089844, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.00265, |
| "grad_norm": 2.7462382316589355, |
| "grad_norm_var": 0.25075746320380765, |
| "learning_rate": 0.0001, |
| "loss": 320.5133, |
| "loss/crossentropy": 3.033684492111206, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1321936696767807, |
| "loss/reg": 317.347412109375, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.002675, |
| "grad_norm": 2.9147019386291504, |
| "grad_norm_var": 0.2522855635872415, |
| "learning_rate": 0.0001, |
| "loss": 319.5785, |
| "loss/crossentropy": 2.6685800552368164, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14506930112838745, |
| "loss/reg": 316.7648620605469, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.0027, |
| "grad_norm": 3.1959245204925537, |
| "grad_norm_var": 0.2336231557989919, |
| "learning_rate": 0.0001, |
| "loss": 319.3165, |
| "loss/crossentropy": 2.9722442626953125, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14754945039749146, |
| "loss/reg": 316.1966857910156, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.002725, |
| "grad_norm": 3.798722267150879, |
| "grad_norm_var": 0.2531703418887588, |
| "learning_rate": 0.0001, |
| "loss": 319.04, |
| "loss/crossentropy": 3.2570812702178955, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16630737483501434, |
| "loss/reg": 315.6166076660156, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.00275, |
| "grad_norm": 2.6525461673736572, |
| "grad_norm_var": 0.27312977627090806, |
| "learning_rate": 0.0001, |
| "loss": 317.8362, |
| "loss/crossentropy": 2.6261277198791504, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14820465445518494, |
| "loss/reg": 315.0619201660156, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.002775, |
| "grad_norm": 3.1293742656707764, |
| "grad_norm_var": 0.2499447068577022, |
| "learning_rate": 0.0001, |
| "loss": 317.2627, |
| "loss/crossentropy": 2.623718023300171, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14719776809215546, |
| "loss/reg": 314.4918212890625, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.0028, |
| "grad_norm": 15.958823204040527, |
| "grad_norm_var": 10.398151501641397, |
| "learning_rate": 0.0001, |
| "loss": 316.8877, |
| "loss/crossentropy": 2.8262336254119873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.129408597946167, |
| "loss/reg": 313.9320373535156, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.002825, |
| "grad_norm": 3.9413416385650635, |
| "grad_norm_var": 10.337541876862074, |
| "learning_rate": 0.0001, |
| "loss": 316.3959, |
| "loss/crossentropy": 2.8791425228118896, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15744474530220032, |
| "loss/reg": 313.3592834472656, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.00285, |
| "grad_norm": 2.8014914989471436, |
| "grad_norm_var": 10.433906911606512, |
| "learning_rate": 0.0001, |
| "loss": 315.6562, |
| "loss/crossentropy": 2.7462589740753174, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14004458487033844, |
| "loss/reg": 312.7699279785156, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.002875, |
| "grad_norm": 2.7303318977355957, |
| "grad_norm_var": 10.531872222449612, |
| "learning_rate": 0.0001, |
| "loss": 315.0281, |
| "loss/crossentropy": 2.6918511390686035, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13197588920593262, |
| "loss/reg": 312.2042541503906, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.0029, |
| "grad_norm": 3.1385786533355713, |
| "grad_norm_var": 10.5392309058265, |
| "learning_rate": 0.0001, |
| "loss": 314.5278, |
| "loss/crossentropy": 2.748079299926758, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15540769696235657, |
| "loss/reg": 311.6242980957031, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.002925, |
| "grad_norm": 2.856105327606201, |
| "grad_norm_var": 10.591715440353928, |
| "learning_rate": 0.0001, |
| "loss": 313.9369, |
| "loss/crossentropy": 2.706995725631714, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1622162163257599, |
| "loss/reg": 311.0676574707031, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.00295, |
| "grad_norm": 3.0335209369659424, |
| "grad_norm_var": 10.568067027910804, |
| "learning_rate": 0.0001, |
| "loss": 313.4418, |
| "loss/crossentropy": 2.7954583168029785, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14605656266212463, |
| "loss/reg": 310.50030517578125, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.002975, |
| "grad_norm": 17.778278350830078, |
| "grad_norm_var": 22.39839291836262, |
| "learning_rate": 0.0001, |
| "loss": 313.1016, |
| "loss/crossentropy": 3.015498399734497, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14635121822357178, |
| "loss/reg": 309.9397277832031, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.003, |
| "grad_norm": 2.5111374855041504, |
| "grad_norm_var": 22.409419960433475, |
| "learning_rate": 0.0001, |
| "loss": 312.4484, |
| "loss/crossentropy": 2.9234232902526855, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15030893683433533, |
| "loss/reg": 309.3746337890625, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.003025, |
| "grad_norm": 3.0075595378875732, |
| "grad_norm_var": 22.58724529715737, |
| "learning_rate": 0.0001, |
| "loss": 311.6718, |
| "loss/crossentropy": 2.7128963470458984, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15696708858013153, |
| "loss/reg": 308.8019714355469, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.00305, |
| "grad_norm": 2.8516271114349365, |
| "grad_norm_var": 22.55961193976418, |
| "learning_rate": 0.0001, |
| "loss": 311.5919, |
| "loss/crossentropy": 3.201800584793091, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1552673727273941, |
| "loss/reg": 308.2347717285156, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.003075, |
| "grad_norm": 2.7313201427459717, |
| "grad_norm_var": 22.60704699907139, |
| "learning_rate": 0.0001, |
| "loss": 310.4214, |
| "loss/crossentropy": 2.6282145977020264, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13782621920108795, |
| "loss/reg": 307.6553955078125, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.0031, |
| "grad_norm": 3.00699520111084, |
| "grad_norm_var": 22.64860965541018, |
| "learning_rate": 0.0001, |
| "loss": 309.9948, |
| "loss/crossentropy": 2.773193597793579, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15227621793746948, |
| "loss/reg": 307.0693359375, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.003125, |
| "grad_norm": 2.9907143115997314, |
| "grad_norm_var": 22.791413325771067, |
| "learning_rate": 0.0001, |
| "loss": 309.6375, |
| "loss/crossentropy": 3.001366138458252, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14761751890182495, |
| "loss/reg": 306.488525390625, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.00315, |
| "grad_norm": 2.7048888206481934, |
| "grad_norm_var": 22.777330344225966, |
| "learning_rate": 0.0001, |
| "loss": 308.8778, |
| "loss/crossentropy": 2.8160345554351807, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.144578754901886, |
| "loss/reg": 305.9171447753906, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.003175, |
| "grad_norm": 3.0401437282562256, |
| "grad_norm_var": 22.796493590486268, |
| "learning_rate": 0.0001, |
| "loss": 308.0832, |
| "loss/crossentropy": 2.607736825942993, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13729776442050934, |
| "loss/reg": 305.3381652832031, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.0032, |
| "grad_norm": 2.584563732147217, |
| "grad_norm_var": 13.885740820076437, |
| "learning_rate": 0.0001, |
| "loss": 307.6585, |
| "loss/crossentropy": 2.750492572784424, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13798421621322632, |
| "loss/reg": 304.77001953125, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.003225, |
| "grad_norm": 2.501339912414551, |
| "grad_norm_var": 13.999106697222624, |
| "learning_rate": 0.0001, |
| "loss": 306.8389, |
| "loss/crossentropy": 2.499746084213257, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14136828482151031, |
| "loss/reg": 304.19781494140625, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.00325, |
| "grad_norm": 3.1528725624084473, |
| "grad_norm_var": 13.961598599715407, |
| "learning_rate": 0.0001, |
| "loss": 306.6842, |
| "loss/crossentropy": 2.9020321369171143, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15830577909946442, |
| "loss/reg": 303.62384033203125, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.003275, |
| "grad_norm": 2.893075704574585, |
| "grad_norm_var": 13.94028717778272, |
| "learning_rate": 0.0001, |
| "loss": 305.9274, |
| "loss/crossentropy": 2.743729591369629, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14728468656539917, |
| "loss/reg": 303.0364074707031, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.0033, |
| "grad_norm": 3.0060131549835205, |
| "grad_norm_var": 13.953057327464618, |
| "learning_rate": 0.0001, |
| "loss": 305.4517, |
| "loss/crossentropy": 2.834550142288208, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16237227618694305, |
| "loss/reg": 302.4547424316406, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.003325, |
| "grad_norm": 3.0694620609283447, |
| "grad_norm_var": 13.929317309938057, |
| "learning_rate": 0.0001, |
| "loss": 304.5747, |
| "loss/crossentropy": 2.5429139137268066, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1559884250164032, |
| "loss/reg": 301.8758544921875, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.00335, |
| "grad_norm": 2.8518848419189453, |
| "grad_norm_var": 13.950038126308876, |
| "learning_rate": 0.0001, |
| "loss": 304.1976, |
| "loss/crossentropy": 2.765056610107422, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14854753017425537, |
| "loss/reg": 301.28399658203125, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.003375, |
| "grad_norm": 3.1176578998565674, |
| "grad_norm_var": 0.044937653308535865, |
| "learning_rate": 0.0001, |
| "loss": 303.7528, |
| "loss/crossentropy": 2.9260571002960205, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1468181163072586, |
| "loss/reg": 300.6799011230469, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.0034, |
| "grad_norm": 2.880009889602661, |
| "grad_norm_var": 0.035480645828153946, |
| "learning_rate": 0.0001, |
| "loss": 302.9741, |
| "loss/crossentropy": 2.736276149749756, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1469912827014923, |
| "loss/reg": 300.0908508300781, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.003425, |
| "grad_norm": 3.2396042346954346, |
| "grad_norm_var": 0.04219284656746065, |
| "learning_rate": 0.0001, |
| "loss": 302.0787, |
| "loss/crossentropy": 2.436084747314453, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15764698386192322, |
| "loss/reg": 299.4849548339844, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.00345, |
| "grad_norm": 2.818772315979004, |
| "grad_norm_var": 0.04253304441969187, |
| "learning_rate": 0.0001, |
| "loss": 301.9017, |
| "loss/crossentropy": 2.8690006732940674, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13555486500263214, |
| "loss/reg": 298.8971252441406, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.003475, |
| "grad_norm": 3.1128132343292236, |
| "grad_norm_var": 0.042447214695410435, |
| "learning_rate": 0.0001, |
| "loss": 301.0966, |
| "loss/crossentropy": 2.6258418560028076, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17426033318042755, |
| "loss/reg": 298.2965393066406, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0035, |
| "grad_norm": 3.2976646423339844, |
| "grad_norm_var": 0.05049181223860704, |
| "learning_rate": 0.0001, |
| "loss": 301.2447, |
| "loss/crossentropy": 3.3375208377838135, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19012662768363953, |
| "loss/reg": 297.7170104980469, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.003525, |
| "grad_norm": 2.7214531898498535, |
| "grad_norm_var": 0.05369940885392074, |
| "learning_rate": 0.0001, |
| "loss": 299.7301, |
| "loss/crossentropy": 2.44716477394104, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14688129723072052, |
| "loss/reg": 297.13604736328125, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.00355, |
| "grad_norm": 3.3967669010162354, |
| "grad_norm_var": 0.0622042440975612, |
| "learning_rate": 0.0001, |
| "loss": 299.2944, |
| "loss/crossentropy": 2.5750997066497803, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17346450686454773, |
| "loss/reg": 296.5458068847656, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.003575, |
| "grad_norm": 2.718705654144287, |
| "grad_norm_var": 0.06609520888261215, |
| "learning_rate": 0.0001, |
| "loss": 298.7035, |
| "loss/crossentropy": 2.576488733291626, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1555417776107788, |
| "loss/reg": 295.9714660644531, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.0036, |
| "grad_norm": 3.459041118621826, |
| "grad_norm_var": 0.07009550701137926, |
| "learning_rate": 0.0001, |
| "loss": 298.3419, |
| "loss/crossentropy": 2.7831904888153076, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16926178336143494, |
| "loss/reg": 295.3894958496094, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.003625, |
| "grad_norm": 3.599822998046875, |
| "grad_norm_var": 0.07030535777098426, |
| "learning_rate": 0.0001, |
| "loss": 298.2544, |
| "loss/crossentropy": 3.2596793174743652, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18854370713233948, |
| "loss/reg": 294.80621337890625, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.00365, |
| "grad_norm": 3.6367578506469727, |
| "grad_norm_var": 0.08941673085661629, |
| "learning_rate": 0.0001, |
| "loss": 297.6298, |
| "loss/crossentropy": 3.228911876678467, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15578624606132507, |
| "loss/reg": 294.24505615234375, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.003675, |
| "grad_norm": 3.3731987476348877, |
| "grad_norm_var": 0.08969931689454427, |
| "learning_rate": 0.0001, |
| "loss": 296.5468, |
| "loss/crossentropy": 2.6783909797668457, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18989479541778564, |
| "loss/reg": 293.678466796875, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.0037, |
| "grad_norm": 3.004645347595215, |
| "grad_norm_var": 0.08972454925701427, |
| "learning_rate": 0.0001, |
| "loss": 296.139, |
| "loss/crossentropy": 2.887582302093506, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15772318840026855, |
| "loss/reg": 293.09368896484375, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.003725, |
| "grad_norm": 3.8851213455200195, |
| "grad_norm_var": 0.1232384713331688, |
| "learning_rate": 0.0001, |
| "loss": 295.1292, |
| "loss/crossentropy": 2.43943452835083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17688119411468506, |
| "loss/reg": 292.5129089355469, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.00375, |
| "grad_norm": 2.969780206680298, |
| "grad_norm_var": 0.11871959357580385, |
| "learning_rate": 0.0001, |
| "loss": 294.82, |
| "loss/crossentropy": 2.720794439315796, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14547483623027802, |
| "loss/reg": 291.9537658691406, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.003775, |
| "grad_norm": 3.3458027839660645, |
| "grad_norm_var": 0.1194074455570662, |
| "learning_rate": 0.0001, |
| "loss": 294.4663, |
| "loss/crossentropy": 2.914644718170166, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19076432287693024, |
| "loss/reg": 291.3608703613281, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.0038, |
| "grad_norm": 2.9391496181488037, |
| "grad_norm_var": 0.11697470608086556, |
| "learning_rate": 0.0001, |
| "loss": 294.1329, |
| "loss/crossentropy": 3.177144765853882, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17206624150276184, |
| "loss/reg": 290.7837219238281, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.003825, |
| "grad_norm": 3.448251485824585, |
| "grad_norm_var": 0.12024250794295975, |
| "learning_rate": 0.0001, |
| "loss": 293.2957, |
| "loss/crossentropy": 2.916529893875122, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18904531002044678, |
| "loss/reg": 290.1900939941406, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.00385, |
| "grad_norm": 2.7693159580230713, |
| "grad_norm_var": 0.12312676691166922, |
| "learning_rate": 0.0001, |
| "loss": 292.1465, |
| "loss/crossentropy": 2.3761188983917236, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16432620584964752, |
| "loss/reg": 289.6059875488281, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.003875, |
| "grad_norm": 2.8227667808532715, |
| "grad_norm_var": 0.13291251787620695, |
| "learning_rate": 0.0001, |
| "loss": 291.6444, |
| "loss/crossentropy": 2.4790537357330322, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15044492483139038, |
| "loss/reg": 289.0148620605469, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.0039, |
| "grad_norm": 2.769251823425293, |
| "grad_norm_var": 0.14431173200531475, |
| "learning_rate": 0.0001, |
| "loss": 291.3051, |
| "loss/crossentropy": 2.724376916885376, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15646930038928986, |
| "loss/reg": 288.42431640625, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.003925, |
| "grad_norm": 2.9450161457061768, |
| "grad_norm_var": 0.1338045365052719, |
| "learning_rate": 0.0001, |
| "loss": 290.7092, |
| "loss/crossentropy": 2.7232112884521484, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17326687276363373, |
| "loss/reg": 287.81268310546875, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.00395, |
| "grad_norm": 2.7544944286346436, |
| "grad_norm_var": 0.1421121590872642, |
| "learning_rate": 0.0001, |
| "loss": 289.9852, |
| "loss/crossentropy": 2.628100633621216, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15836934745311737, |
| "loss/reg": 287.1986999511719, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.003975, |
| "grad_norm": 3.282729148864746, |
| "grad_norm_var": 0.12936684677523522, |
| "learning_rate": 0.0001, |
| "loss": 289.6616, |
| "loss/crossentropy": 2.853585720062256, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20318502187728882, |
| "loss/reg": 286.6048583984375, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.004, |
| "grad_norm": 2.7608420848846436, |
| "grad_norm_var": 0.1345857719286831, |
| "learning_rate": 0.0001, |
| "loss": 288.7971, |
| "loss/crossentropy": 2.639310836791992, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15059445798397064, |
| "loss/reg": 286.0071716308594, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.004025, |
| "grad_norm": 2.615666151046753, |
| "grad_norm_var": 0.1353317229456477, |
| "learning_rate": 0.0001, |
| "loss": 288.2175, |
| "loss/crossentropy": 2.6622371673583984, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1381928026676178, |
| "loss/reg": 285.41705322265625, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.00405, |
| "grad_norm": 2.699389934539795, |
| "grad_norm_var": 0.12099720896805645, |
| "learning_rate": 0.0001, |
| "loss": 287.8594, |
| "loss/crossentropy": 2.9092233180999756, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14686957001686096, |
| "loss/reg": 284.8033142089844, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.004075, |
| "grad_norm": 2.52980375289917, |
| "grad_norm_var": 0.12619606783641663, |
| "learning_rate": 0.0001, |
| "loss": 287.099, |
| "loss/crossentropy": 2.7334139347076416, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1602194458246231, |
| "loss/reg": 284.2053527832031, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.0041, |
| "grad_norm": 2.9028260707855225, |
| "grad_norm_var": 0.1263923635326608, |
| "learning_rate": 0.0001, |
| "loss": 286.6346, |
| "loss/crossentropy": 2.8802831172943115, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1618650257587433, |
| "loss/reg": 283.59246826171875, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.004125, |
| "grad_norm": 3.010094404220581, |
| "grad_norm_var": 0.06689759190688846, |
| "learning_rate": 0.0001, |
| "loss": 286.137, |
| "loss/crossentropy": 2.9807565212249756, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17984265089035034, |
| "loss/reg": 282.9764404296875, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.00415, |
| "grad_norm": 2.747976779937744, |
| "grad_norm_var": 0.06821403807519436, |
| "learning_rate": 0.0001, |
| "loss": 285.1411, |
| "loss/crossentropy": 2.6191065311431885, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17148274183273315, |
| "loss/reg": 282.3504943847656, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.004175, |
| "grad_norm": 2.72310471534729, |
| "grad_norm_var": 0.05514136557764762, |
| "learning_rate": 0.0001, |
| "loss": 284.5961, |
| "loss/crossentropy": 2.7177231311798096, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15355822443962097, |
| "loss/reg": 281.72479248046875, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.0042, |
| "grad_norm": 2.5135159492492676, |
| "grad_norm_var": 0.06183281601049633, |
| "learning_rate": 0.0001, |
| "loss": 283.7144, |
| "loss/crossentropy": 2.4525184631347656, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.13518790900707245, |
| "loss/reg": 281.1266784667969, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.004225, |
| "grad_norm": 2.6755948066711426, |
| "grad_norm_var": 0.03554926963080618, |
| "learning_rate": 0.0001, |
| "loss": 283.5094, |
| "loss/crossentropy": 2.8291640281677246, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17008362710475922, |
| "loss/reg": 280.5101318359375, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.00425, |
| "grad_norm": 3.0268986225128174, |
| "grad_norm_var": 0.03923814612245735, |
| "learning_rate": 0.0001, |
| "loss": 283.1964, |
| "loss/crossentropy": 3.105539083480835, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18000483512878418, |
| "loss/reg": 279.9109191894531, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.004275, |
| "grad_norm": 2.5846283435821533, |
| "grad_norm_var": 0.04201988364738796, |
| "learning_rate": 0.0001, |
| "loss": 282.1588, |
| "loss/crossentropy": 2.6842048168182373, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16073930263519287, |
| "loss/reg": 279.3138732910156, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.0043, |
| "grad_norm": 2.8193604946136475, |
| "grad_norm_var": 0.0420791835209684, |
| "learning_rate": 0.0001, |
| "loss": 281.6269, |
| "loss/crossentropy": 2.7535877227783203, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17149388790130615, |
| "loss/reg": 278.7017822265625, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.004325, |
| "grad_norm": 2.695707321166992, |
| "grad_norm_var": 0.04071110069445254, |
| "learning_rate": 0.0001, |
| "loss": 280.9163, |
| "loss/crossentropy": 2.694293260574341, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15487328171730042, |
| "loss/reg": 278.0671081542969, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.00435, |
| "grad_norm": 3.0142641067504883, |
| "grad_norm_var": 0.04434257349895461, |
| "learning_rate": 0.0001, |
| "loss": 280.5464, |
| "loss/crossentropy": 2.912869691848755, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1794903725385666, |
| "loss/reg": 277.45404052734375, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.004375, |
| "grad_norm": 3.4833621978759766, |
| "grad_norm_var": 0.060102318830361885, |
| "learning_rate": 0.0001, |
| "loss": 280.0757, |
| "loss/crossentropy": 3.031020164489746, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1958397924900055, |
| "loss/reg": 276.84881591796875, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.0044, |
| "grad_norm": 2.6899683475494385, |
| "grad_norm_var": 0.060788090400682414, |
| "learning_rate": 0.0001, |
| "loss": 279.0604, |
| "loss/crossentropy": 2.689755439758301, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1408371478319168, |
| "loss/reg": 276.22979736328125, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.004425, |
| "grad_norm": 2.686345338821411, |
| "grad_norm_var": 0.059403126018499584, |
| "learning_rate": 0.0001, |
| "loss": 278.8184, |
| "loss/crossentropy": 3.0344579219818115, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1603156328201294, |
| "loss/reg": 275.6236267089844, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.00445, |
| "grad_norm": 2.9537172317504883, |
| "grad_norm_var": 0.060028034669986144, |
| "learning_rate": 0.0001, |
| "loss": 277.8445, |
| "loss/crossentropy": 2.6497297286987305, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1772124469280243, |
| "loss/reg": 275.01751708984375, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.004475, |
| "grad_norm": 2.6830756664276123, |
| "grad_norm_var": 0.055646030147755474, |
| "learning_rate": 0.0001, |
| "loss": 277.3781, |
| "loss/crossentropy": 2.7995009422302246, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15789446234703064, |
| "loss/reg": 274.420654296875, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.0045, |
| "grad_norm": 2.8642189502716064, |
| "grad_norm_var": 0.05534192722217289, |
| "learning_rate": 0.0001, |
| "loss": 276.8572, |
| "loss/crossentropy": 2.8247008323669434, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19499436020851135, |
| "loss/reg": 273.8375244140625, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.004525, |
| "grad_norm": 2.3742892742156982, |
| "grad_norm_var": 0.06476700330631478, |
| "learning_rate": 0.0001, |
| "loss": 276.0537, |
| "loss/crossentropy": 2.6782188415527344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14065401256084442, |
| "loss/reg": 273.2348327636719, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.00455, |
| "grad_norm": 2.8511719703674316, |
| "grad_norm_var": 0.06494378033136528, |
| "learning_rate": 0.0001, |
| "loss": 275.8217, |
| "loss/crossentropy": 3.0019702911376953, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17122197151184082, |
| "loss/reg": 272.6484375, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.004575, |
| "grad_norm": 2.4934184551239014, |
| "grad_norm_var": 0.07028818692612214, |
| "learning_rate": 0.0001, |
| "loss": 275.005, |
| "loss/crossentropy": 2.765932559967041, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1687726229429245, |
| "loss/reg": 272.0703125, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.0046, |
| "grad_norm": 3.0428080558776855, |
| "grad_norm_var": 0.06930197860402411, |
| "learning_rate": 0.0001, |
| "loss": 274.319, |
| "loss/crossentropy": 2.6140480041503906, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20197075605392456, |
| "loss/reg": 271.50299072265625, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.004625, |
| "grad_norm": 2.7042367458343506, |
| "grad_norm_var": 0.06884502087402401, |
| "learning_rate": 0.0001, |
| "loss": 273.7751, |
| "loss/crossentropy": 2.7085049152374268, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16187983751296997, |
| "loss/reg": 270.90478515625, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.00465, |
| "grad_norm": 2.4192299842834473, |
| "grad_norm_var": 0.07438801189002385, |
| "learning_rate": 0.0001, |
| "loss": 272.9952, |
| "loss/crossentropy": 2.509551525115967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15606564283370972, |
| "loss/reg": 270.3295593261719, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.004675, |
| "grad_norm": 2.6057181358337402, |
| "grad_norm_var": 0.07388755541776296, |
| "learning_rate": 0.0001, |
| "loss": 272.7285, |
| "loss/crossentropy": 2.832514762878418, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1625734269618988, |
| "loss/reg": 269.7333984375, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.0047, |
| "grad_norm": 2.5072972774505615, |
| "grad_norm_var": 0.07807856050180723, |
| "learning_rate": 0.0001, |
| "loss": 272.1894, |
| "loss/crossentropy": 2.8929169178009033, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15297307074069977, |
| "loss/reg": 269.1435241699219, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.004725, |
| "grad_norm": 2.668285369873047, |
| "grad_norm_var": 0.07833979493371525, |
| "learning_rate": 0.0001, |
| "loss": 271.5161, |
| "loss/crossentropy": 2.798143148422241, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1715821623802185, |
| "loss/reg": 268.5463562011719, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.00475, |
| "grad_norm": 2.869302272796631, |
| "grad_norm_var": 0.07459542490979629, |
| "learning_rate": 0.0001, |
| "loss": 271.0226, |
| "loss/crossentropy": 2.913928270339966, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16282354295253754, |
| "loss/reg": 267.94586181640625, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.004775, |
| "grad_norm": 2.4999892711639404, |
| "grad_norm_var": 0.03802990774269475, |
| "learning_rate": 0.0001, |
| "loss": 270.1609, |
| "loss/crossentropy": 2.6663661003112793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1597089171409607, |
| "loss/reg": 267.33489990234375, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.0048, |
| "grad_norm": 2.7010915279388428, |
| "grad_norm_var": 0.03804935894705916, |
| "learning_rate": 0.0001, |
| "loss": 269.6252, |
| "loss/crossentropy": 2.7287211418151855, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1760062277317047, |
| "loss/reg": 266.720458984375, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.004825, |
| "grad_norm": 2.635450839996338, |
| "grad_norm_var": 0.03818693477124479, |
| "learning_rate": 0.0001, |
| "loss": 269.0576, |
| "loss/crossentropy": 2.7622997760772705, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1753700077533722, |
| "loss/reg": 266.11993408203125, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.00485, |
| "grad_norm": 2.54504656791687, |
| "grad_norm_var": 0.03368765909132942, |
| "learning_rate": 0.0001, |
| "loss": 268.4639, |
| "loss/crossentropy": 2.774796962738037, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16008035838603973, |
| "loss/reg": 265.5290222167969, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.004875, |
| "grad_norm": 2.8884477615356445, |
| "grad_norm_var": 0.03711886375720705, |
| "learning_rate": 0.0001, |
| "loss": 267.7791, |
| "loss/crossentropy": 2.678165912628174, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16350409388542175, |
| "loss/reg": 264.9374694824219, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.0049, |
| "grad_norm": 3.0534660816192627, |
| "grad_norm_var": 0.044336834869120406, |
| "learning_rate": 0.0001, |
| "loss": 267.5437, |
| "loss/crossentropy": 2.990952491760254, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20335815846920013, |
| "loss/reg": 264.3493957519531, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.004925, |
| "grad_norm": 2.8830363750457764, |
| "grad_norm_var": 0.03986402384874263, |
| "learning_rate": 0.0001, |
| "loss": 266.6284, |
| "loss/crossentropy": 2.674790859222412, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18334966897964478, |
| "loss/reg": 263.77020263671875, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.00495, |
| "grad_norm": 3.1203083992004395, |
| "grad_norm_var": 0.04943917591481958, |
| "learning_rate": 0.0001, |
| "loss": 265.9243, |
| "loss/crossentropy": 2.5693366527557373, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16054046154022217, |
| "loss/reg": 263.1944580078125, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.004975, |
| "grad_norm": 2.7858941555023193, |
| "grad_norm_var": 0.04566411871986986, |
| "learning_rate": 0.0001, |
| "loss": 265.842, |
| "loss/crossentropy": 3.063640832901001, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1918787658214569, |
| "loss/reg": 262.5865478515625, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.005, |
| "grad_norm": 4.905181407928467, |
| "grad_norm_var": 0.3362427866457504, |
| "learning_rate": 0.0001, |
| "loss": 265.0334, |
| "loss/crossentropy": 2.75864839553833, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.28906920552253723, |
| "loss/reg": 261.9857177734375, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.005025, |
| "grad_norm": 2.969449281692505, |
| "grad_norm_var": 0.33506015925643545, |
| "learning_rate": 0.0001, |
| "loss": 264.667, |
| "loss/crossentropy": 3.093539237976074, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19913670420646667, |
| "loss/reg": 261.37432861328125, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.00505, |
| "grad_norm": 3.2088537216186523, |
| "grad_norm_var": 0.32566799990582446, |
| "learning_rate": 0.0001, |
| "loss": 264.2178, |
| "loss/crossentropy": 3.255889415740967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20012566447257996, |
| "loss/reg": 260.76177978515625, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.005075, |
| "grad_norm": 3.11022686958313, |
| "grad_norm_var": 0.3199018318121316, |
| "learning_rate": 0.0001, |
| "loss": 263.1096, |
| "loss/crossentropy": 2.753197193145752, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19567176699638367, |
| "loss/reg": 260.16070556640625, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.0051, |
| "grad_norm": 2.5052154064178467, |
| "grad_norm_var": 0.32002761472599534, |
| "learning_rate": 0.0001, |
| "loss": 262.7169, |
| "loss/crossentropy": 2.981990098953247, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17743879556655884, |
| "loss/reg": 259.55743408203125, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.005125, |
| "grad_norm": 3.437074899673462, |
| "grad_norm_var": 0.3271340622124995, |
| "learning_rate": 0.0001, |
| "loss": 262.1845, |
| "loss/crossentropy": 3.0462844371795654, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17453166842460632, |
| "loss/reg": 258.96368408203125, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.00515, |
| "grad_norm": 3.3540172576904297, |
| "grad_norm_var": 0.33289475403727997, |
| "learning_rate": 0.0001, |
| "loss": 261.2446, |
| "loss/crossentropy": 2.684351682662964, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18615120649337769, |
| "loss/reg": 258.37408447265625, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.005175, |
| "grad_norm": 2.6485707759857178, |
| "grad_norm_var": 0.32362257450873433, |
| "learning_rate": 0.0001, |
| "loss": 260.6506, |
| "loss/crossentropy": 2.712137222290039, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16978460550308228, |
| "loss/reg": 257.7686767578125, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.0052, |
| "grad_norm": 2.638559103012085, |
| "grad_norm_var": 0.326750686147299, |
| "learning_rate": 0.0001, |
| "loss": 259.9714, |
| "loss/crossentropy": 2.6442058086395264, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16394683718681335, |
| "loss/reg": 257.1632385253906, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.005225, |
| "grad_norm": 2.62022066116333, |
| "grad_norm_var": 0.32759289115149387, |
| "learning_rate": 0.0001, |
| "loss": 259.6152, |
| "loss/crossentropy": 2.86051607131958, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1837988793849945, |
| "loss/reg": 256.5708923339844, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.00525, |
| "grad_norm": 3.0118000507354736, |
| "grad_norm_var": 0.31027566591164846, |
| "learning_rate": 0.0001, |
| "loss": 258.8864, |
| "loss/crossentropy": 2.7158284187316895, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18835628032684326, |
| "loss/reg": 255.98226928710938, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.005275, |
| "grad_norm": 2.738693952560425, |
| "grad_norm_var": 0.31532774991742346, |
| "learning_rate": 0.0001, |
| "loss": 258.2583, |
| "loss/crossentropy": 2.6809372901916504, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1898835152387619, |
| "loss/reg": 255.387451171875, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.0053, |
| "grad_norm": 2.739337205886841, |
| "grad_norm_var": 0.32184874512720374, |
| "learning_rate": 0.0001, |
| "loss": 257.8907, |
| "loss/crossentropy": 2.906891107559204, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1842600256204605, |
| "loss/reg": 254.79959106445312, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.005325, |
| "grad_norm": 5.883702754974365, |
| "grad_norm_var": 0.820888078604249, |
| "learning_rate": 0.0001, |
| "loss": 257.1401, |
| "loss/crossentropy": 2.7662065029144287, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17315037548542023, |
| "loss/reg": 254.2007293701172, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.00535, |
| "grad_norm": 3.2972004413604736, |
| "grad_norm_var": 0.8202608766175215, |
| "learning_rate": 0.0001, |
| "loss": 256.5279, |
| "loss/crossentropy": 2.737523317337036, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1854221224784851, |
| "loss/reg": 253.6049346923828, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.005375, |
| "grad_norm": 3.02815580368042, |
| "grad_norm_var": 0.8092324619971265, |
| "learning_rate": 0.0001, |
| "loss": 256.0627, |
| "loss/crossentropy": 2.8625571727752686, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19949959218502045, |
| "loss/reg": 253.0006866455078, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.0054, |
| "grad_norm": 3.2634494304656982, |
| "grad_norm_var": 0.6166894091876105, |
| "learning_rate": 0.0001, |
| "loss": 255.4964, |
| "loss/crossentropy": 2.924158811569214, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16927270591259003, |
| "loss/reg": 252.40298461914062, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.005425, |
| "grad_norm": 3.190610647201538, |
| "grad_norm_var": 0.614321823565594, |
| "learning_rate": 0.0001, |
| "loss": 254.9168, |
| "loss/crossentropy": 2.959676742553711, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1694745123386383, |
| "loss/reg": 251.78762817382812, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.00545, |
| "grad_norm": 2.910163402557373, |
| "grad_norm_var": 0.6182401597326245, |
| "learning_rate": 0.0001, |
| "loss": 254.3778, |
| "loss/crossentropy": 3.0126121044158936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19637390971183777, |
| "loss/reg": 251.16880798339844, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.005475, |
| "grad_norm": 2.5100033283233643, |
| "grad_norm_var": 0.6438249148085596, |
| "learning_rate": 0.0001, |
| "loss": 253.619, |
| "loss/crossentropy": 2.8760910034179688, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17724955081939697, |
| "loss/reg": 250.565673828125, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.0055, |
| "grad_norm": 2.6412065029144287, |
| "grad_norm_var": 0.6339957102239564, |
| "learning_rate": 0.0001, |
| "loss": 253.0932, |
| "loss/crossentropy": 2.9451711177825928, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1847916692495346, |
| "loss/reg": 249.96322631835938, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.005525, |
| "grad_norm": 2.4829912185668945, |
| "grad_norm_var": 0.6504949610281291, |
| "learning_rate": 0.0001, |
| "loss": 251.9582, |
| "loss/crossentropy": 2.4317898750305176, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15323440730571747, |
| "loss/reg": 249.37318420410156, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.00555, |
| "grad_norm": 2.709721565246582, |
| "grad_norm_var": 0.6511748224192303, |
| "learning_rate": 0.0001, |
| "loss": 251.9418, |
| "loss/crossentropy": 2.9580609798431396, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19112282991409302, |
| "loss/reg": 248.7926788330078, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.005575, |
| "grad_norm": 2.7896413803100586, |
| "grad_norm_var": 0.6454388623794479, |
| "learning_rate": 0.0001, |
| "loss": 251.1011, |
| "loss/crossentropy": 2.6989221572875977, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18577811121940613, |
| "loss/reg": 248.21641540527344, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.0056, |
| "grad_norm": 2.763256549835205, |
| "grad_norm_var": 0.6399279824277821, |
| "learning_rate": 0.0001, |
| "loss": 250.6324, |
| "loss/crossentropy": 2.8038721084594727, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19551604986190796, |
| "loss/reg": 247.633056640625, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.005625, |
| "grad_norm": 2.6225526332855225, |
| "grad_norm_var": 0.6397989634558596, |
| "learning_rate": 0.0001, |
| "loss": 250.0981, |
| "loss/crossentropy": 2.873964548110962, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18002215027809143, |
| "loss/reg": 247.0440673828125, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.00565, |
| "grad_norm": 2.7640974521636963, |
| "grad_norm_var": 0.6444463916989489, |
| "learning_rate": 0.0001, |
| "loss": 249.6578, |
| "loss/crossentropy": 3.0052733421325684, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19472980499267578, |
| "loss/reg": 246.45779418945312, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.005675, |
| "grad_norm": 4.11354398727417, |
| "grad_norm_var": 0.7108481451659288, |
| "learning_rate": 0.0001, |
| "loss": 249.1703, |
| "loss/crossentropy": 3.090904951095581, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20938721299171448, |
| "loss/reg": 245.87001037597656, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.0057, |
| "grad_norm": 2.749560594558716, |
| "grad_norm_var": 0.7103537111133307, |
| "learning_rate": 0.0001, |
| "loss": 247.8932, |
| "loss/crossentropy": 2.4375710487365723, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16894645988941193, |
| "loss/reg": 245.28668212890625, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.005725, |
| "grad_norm": 2.942262649536133, |
| "grad_norm_var": 0.16230004133669051, |
| "learning_rate": 0.0001, |
| "loss": 247.8172, |
| "loss/crossentropy": 2.9423258304595947, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1861574649810791, |
| "loss/reg": 244.6887664794922, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.00575, |
| "grad_norm": 2.7158396244049072, |
| "grad_norm_var": 0.1544682228743985, |
| "learning_rate": 0.0001, |
| "loss": 246.9261, |
| "loss/crossentropy": 2.6655476093292236, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1642124503850937, |
| "loss/reg": 244.0963592529297, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.005775, |
| "grad_norm": 3.031425714492798, |
| "grad_norm_var": 0.15453029560278514, |
| "learning_rate": 0.0001, |
| "loss": 246.3331, |
| "loss/crossentropy": 2.641087532043457, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1917201280593872, |
| "loss/reg": 243.5002899169922, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.0058, |
| "grad_norm": 2.781378984451294, |
| "grad_norm_var": 0.1448915546965355, |
| "learning_rate": 0.0001, |
| "loss": 245.8157, |
| "loss/crossentropy": 2.7308170795440674, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.178788423538208, |
| "loss/reg": 242.90609741210938, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.005825, |
| "grad_norm": 2.6632707118988037, |
| "grad_norm_var": 0.13884268173051903, |
| "learning_rate": 0.0001, |
| "loss": 245.2104, |
| "loss/crossentropy": 2.7034757137298584, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19913603365421295, |
| "loss/reg": 242.30780029296875, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.00585, |
| "grad_norm": 2.6279611587524414, |
| "grad_norm_var": 0.14059426093737837, |
| "learning_rate": 0.0001, |
| "loss": 244.448, |
| "loss/crossentropy": 2.5541391372680664, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1769060492515564, |
| "loss/reg": 241.71693420410156, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.005875, |
| "grad_norm": 2.669217586517334, |
| "grad_norm_var": 0.13587813046198485, |
| "learning_rate": 0.0001, |
| "loss": 243.9826, |
| "loss/crossentropy": 2.677557945251465, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1696961522102356, |
| "loss/reg": 241.13534545898438, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.0059, |
| "grad_norm": 4.654603958129883, |
| "grad_norm_var": 0.34211473789542096, |
| "learning_rate": 0.0001, |
| "loss": 243.4455, |
| "loss/crossentropy": 2.6129984855651855, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24794022738933563, |
| "loss/reg": 240.58453369140625, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.005925, |
| "grad_norm": 2.7966091632843018, |
| "grad_norm_var": 0.32904384319109853, |
| "learning_rate": 0.0001, |
| "loss": 243.0811, |
| "loss/crossentropy": 2.8610379695892334, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18956679105758667, |
| "loss/reg": 240.03045654296875, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.00595, |
| "grad_norm": 2.4518496990203857, |
| "grad_norm_var": 0.3418803558489948, |
| "learning_rate": 0.0001, |
| "loss": 242.0064, |
| "loss/crossentropy": 2.412545680999756, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.14585809409618378, |
| "loss/reg": 239.44796752929688, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.005975, |
| "grad_norm": 14.31179141998291, |
| "grad_norm_var": 8.399062121815462, |
| "learning_rate": 0.0001, |
| "loss": 241.8318, |
| "loss/crossentropy": 2.7919762134552, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17732976377010345, |
| "loss/reg": 238.86245727539062, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.006, |
| "grad_norm": 2.6747586727142334, |
| "grad_norm_var": 8.410206107231419, |
| "learning_rate": 0.0001, |
| "loss": 241.3024, |
| "loss/crossentropy": 2.8322153091430664, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19286081194877625, |
| "loss/reg": 238.27737426757812, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.006025, |
| "grad_norm": 2.5404036045074463, |
| "grad_norm_var": 8.421998600034392, |
| "learning_rate": 0.0001, |
| "loss": 240.354, |
| "loss/crossentropy": 2.509608507156372, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15739041566848755, |
| "loss/reg": 237.68695068359375, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.00605, |
| "grad_norm": 2.88310170173645, |
| "grad_norm_var": 8.408739063367712, |
| "learning_rate": 0.0001, |
| "loss": 239.9893, |
| "loss/crossentropy": 2.714906692504883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17041458189487457, |
| "loss/reg": 237.10398864746094, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.006075, |
| "grad_norm": 2.5657036304473877, |
| "grad_norm_var": 8.465489057845808, |
| "learning_rate": 0.0001, |
| "loss": 239.4222, |
| "loss/crossentropy": 2.726501226425171, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17326998710632324, |
| "loss/reg": 236.5224151611328, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.0061, |
| "grad_norm": 3.74009370803833, |
| "grad_norm_var": 8.41895240057901, |
| "learning_rate": 0.0001, |
| "loss": 238.9056, |
| "loss/crossentropy": 2.748196840286255, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21651685237884521, |
| "loss/reg": 235.94091796875, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.006125, |
| "grad_norm": 2.7053475379943848, |
| "grad_norm_var": 8.444126473060866, |
| "learning_rate": 0.0001, |
| "loss": 238.032, |
| "loss/crossentropy": 2.4857230186462402, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1798083782196045, |
| "loss/reg": 235.36642456054688, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.00615, |
| "grad_norm": 3.132035255432129, |
| "grad_norm_var": 8.405148171874606, |
| "learning_rate": 0.0001, |
| "loss": 237.9786, |
| "loss/crossentropy": 2.9837818145751953, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2059348225593567, |
| "loss/reg": 234.78887939453125, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.006175, |
| "grad_norm": 2.4898736476898193, |
| "grad_norm_var": 8.467374226379958, |
| "learning_rate": 0.0001, |
| "loss": 237.2757, |
| "loss/crossentropy": 2.880187749862671, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17742741107940674, |
| "loss/reg": 234.21804809570312, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.0062, |
| "grad_norm": 2.655679702758789, |
| "grad_norm_var": 8.482173935471064, |
| "learning_rate": 0.0001, |
| "loss": 236.6548, |
| "loss/crossentropy": 2.8338496685028076, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1855936199426651, |
| "loss/reg": 233.63536071777344, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.006225, |
| "grad_norm": 2.869889259338379, |
| "grad_norm_var": 8.459100961664873, |
| "learning_rate": 0.0001, |
| "loss": 236.2722, |
| "loss/crossentropy": 3.00722074508667, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19864147901535034, |
| "loss/reg": 233.06637573242188, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.00625, |
| "grad_norm": 2.8619699478149414, |
| "grad_norm_var": 8.431865311995418, |
| "learning_rate": 0.0001, |
| "loss": 235.1938, |
| "loss/crossentropy": 2.527130365371704, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18343941867351532, |
| "loss/reg": 232.48324584960938, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.006275, |
| "grad_norm": 2.570348024368286, |
| "grad_norm_var": 8.445078379379984, |
| "learning_rate": 0.0001, |
| "loss": 234.9153, |
| "loss/crossentropy": 2.8627750873565674, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16326285898685455, |
| "loss/reg": 231.88925170898438, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.0063, |
| "grad_norm": 3.70983624458313, |
| "grad_norm_var": 8.370411445912024, |
| "learning_rate": 0.0001, |
| "loss": 234.453, |
| "loss/crossentropy": 2.9494543075561523, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1932690143585205, |
| "loss/reg": 231.31028747558594, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.006325, |
| "grad_norm": 2.636204957962036, |
| "grad_norm_var": 8.38834540620384, |
| "learning_rate": 0.0001, |
| "loss": 233.79, |
| "loss/crossentropy": 2.870368003845215, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18195605278015137, |
| "loss/reg": 230.73770141601562, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.00635, |
| "grad_norm": 2.7411065101623535, |
| "grad_norm_var": 8.35122443905512, |
| "learning_rate": 0.0001, |
| "loss": 233.4192, |
| "loss/crossentropy": 3.0693013668060303, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17932993173599243, |
| "loss/reg": 230.17056274414062, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.006375, |
| "grad_norm": 2.631800889968872, |
| "grad_norm_var": 0.1459736729753594, |
| "learning_rate": 0.0001, |
| "loss": 232.6018, |
| "loss/crossentropy": 2.812204360961914, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1809658408164978, |
| "loss/reg": 229.60862731933594, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.0064, |
| "grad_norm": 2.6473164558410645, |
| "grad_norm_var": 0.14661806909008704, |
| "learning_rate": 0.0001, |
| "loss": 232.0037, |
| "loss/crossentropy": 2.8154137134552, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16016216576099396, |
| "loss/reg": 229.02810668945312, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.006425, |
| "grad_norm": 2.8096730709075928, |
| "grad_norm_var": 0.14052644786456767, |
| "learning_rate": 0.0001, |
| "loss": 231.4434, |
| "loss/crossentropy": 2.8410181999206543, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.15286508202552795, |
| "loss/reg": 228.4495391845703, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.00645, |
| "grad_norm": 2.902038335800171, |
| "grad_norm_var": 0.14062455090234588, |
| "learning_rate": 0.0001, |
| "loss": 230.797, |
| "loss/crossentropy": 2.71943998336792, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1998460292816162, |
| "loss/reg": 227.87771606445312, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.006475, |
| "grad_norm": 2.605620861053467, |
| "grad_norm_var": 0.1391881027932695, |
| "learning_rate": 0.0001, |
| "loss": 230.5228, |
| "loss/crossentropy": 3.017008066177368, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18362455070018768, |
| "loss/reg": 227.32220458984375, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.0065, |
| "grad_norm": 3.0760796070098877, |
| "grad_norm_var": 0.08854286659723035, |
| "learning_rate": 0.0001, |
| "loss": 229.5804, |
| "loss/crossentropy": 2.6169934272766113, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20932170748710632, |
| "loss/reg": 226.75404357910156, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.006525, |
| "grad_norm": 2.7260305881500244, |
| "grad_norm_var": 0.08826637957723141, |
| "learning_rate": 0.0001, |
| "loss": 229.1744, |
| "loss/crossentropy": 2.800166368484497, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16581743955612183, |
| "loss/reg": 226.20838928222656, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.00655, |
| "grad_norm": 2.520145893096924, |
| "grad_norm_var": 0.08593156013952026, |
| "learning_rate": 0.0001, |
| "loss": 228.4811, |
| "loss/crossentropy": 2.66050124168396, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17131495475769043, |
| "loss/reg": 225.64930725097656, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.006575, |
| "grad_norm": 2.666048288345337, |
| "grad_norm_var": 0.08109508789745874, |
| "learning_rate": 0.0001, |
| "loss": 228.21, |
| "loss/crossentropy": 2.948183298110962, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1878964900970459, |
| "loss/reg": 225.0739288330078, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.0066, |
| "grad_norm": 2.762319326400757, |
| "grad_norm_var": 0.07990506415366098, |
| "learning_rate": 0.0001, |
| "loss": 227.5642, |
| "loss/crossentropy": 2.8666961193084717, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19062572717666626, |
| "loss/reg": 224.5068817138672, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.006625, |
| "grad_norm": 3.026460647583008, |
| "grad_norm_var": 0.08297919370627369, |
| "learning_rate": 0.0001, |
| "loss": 227.1483, |
| "loss/crossentropy": 3.004094362258911, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1900184154510498, |
| "loss/reg": 223.9541473388672, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.00665, |
| "grad_norm": 2.7408447265625, |
| "grad_norm_var": 0.08298920636462223, |
| "learning_rate": 0.0001, |
| "loss": 226.3396, |
| "loss/crossentropy": 2.745393753051758, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20416490733623505, |
| "loss/reg": 223.39002990722656, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.006675, |
| "grad_norm": 2.686131238937378, |
| "grad_norm_var": 0.0803088906266936, |
| "learning_rate": 0.0001, |
| "loss": 225.7382, |
| "loss/crossentropy": 2.730952262878418, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19333088397979736, |
| "loss/reg": 222.8138885498047, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.0067, |
| "grad_norm": 3.0181736946105957, |
| "grad_norm_var": 0.026807333288596878, |
| "learning_rate": 0.0001, |
| "loss": 225.3613, |
| "loss/crossentropy": 2.9188590049743652, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2012787163257599, |
| "loss/reg": 222.24114990234375, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.006725, |
| "grad_norm": 2.6647708415985107, |
| "grad_norm_var": 0.026378256663085863, |
| "learning_rate": 0.0001, |
| "loss": 224.5769, |
| "loss/crossentropy": 2.7387688159942627, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18105106055736542, |
| "loss/reg": 221.6570587158203, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.00675, |
| "grad_norm": 3.005382776260376, |
| "grad_norm_var": 0.029935448640787853, |
| "learning_rate": 0.0001, |
| "loss": 223.7069, |
| "loss/crossentropy": 2.4513492584228516, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17486891150474548, |
| "loss/reg": 221.08065795898438, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.006775, |
| "grad_norm": 2.534144163131714, |
| "grad_norm_var": 0.03246837804460086, |
| "learning_rate": 0.0001, |
| "loss": 223.3926, |
| "loss/crossentropy": 2.698563575744629, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1854040026664734, |
| "loss/reg": 220.50863647460938, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.0068, |
| "grad_norm": 2.7375473976135254, |
| "grad_norm_var": 0.031447726867909864, |
| "learning_rate": 0.0001, |
| "loss": 223.1195, |
| "loss/crossentropy": 2.990828037261963, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19145527482032776, |
| "loss/reg": 219.937255859375, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.006825, |
| "grad_norm": 2.594477415084839, |
| "grad_norm_var": 0.03349317904539077, |
| "learning_rate": 0.0001, |
| "loss": 222.1966, |
| "loss/crossentropy": 2.6549155712127686, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18063394725322723, |
| "loss/reg": 219.36102294921875, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.00685, |
| "grad_norm": 3.0771312713623047, |
| "grad_norm_var": 0.03857028263729087, |
| "learning_rate": 0.0001, |
| "loss": 221.9057, |
| "loss/crossentropy": 2.904212713241577, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19918082654476166, |
| "loss/reg": 218.80227661132812, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.006875, |
| "grad_norm": 2.548748731613159, |
| "grad_norm_var": 0.040076406355323495, |
| "learning_rate": 0.0001, |
| "loss": 221.1609, |
| "loss/crossentropy": 2.7445976734161377, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17896625399589539, |
| "loss/reg": 218.23736572265625, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.0069, |
| "grad_norm": 3.4485270977020264, |
| "grad_norm_var": 0.0637460442930499, |
| "learning_rate": 0.0001, |
| "loss": 220.8704, |
| "loss/crossentropy": 2.9816956520080566, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19691282510757446, |
| "loss/reg": 217.6918182373047, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.006925, |
| "grad_norm": 2.622490644454956, |
| "grad_norm_var": 0.06540004680390439, |
| "learning_rate": 0.0001, |
| "loss": 219.9821, |
| "loss/crossentropy": 2.6642203330993652, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1839863508939743, |
| "loss/reg": 217.13385009765625, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.00695, |
| "grad_norm": 2.6010499000549316, |
| "grad_norm_var": 0.06288917084829108, |
| "learning_rate": 0.0001, |
| "loss": 219.4899, |
| "loss/crossentropy": 2.710230827331543, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19618386030197144, |
| "loss/reg": 216.58343505859375, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.006975, |
| "grad_norm": 2.880260705947876, |
| "grad_norm_var": 0.06204859700921664, |
| "learning_rate": 0.0001, |
| "loss": 219.1407, |
| "loss/crossentropy": 2.898496150970459, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2143867015838623, |
| "loss/reg": 216.02783203125, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.007, |
| "grad_norm": 3.233513593673706, |
| "grad_norm_var": 0.07297482892120115, |
| "learning_rate": 0.0001, |
| "loss": 218.5671, |
| "loss/crossentropy": 2.876899003982544, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20586279034614563, |
| "loss/reg": 215.48439025878906, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.007025, |
| "grad_norm": 2.415728807449341, |
| "grad_norm_var": 0.08099970934078797, |
| "learning_rate": 0.0001, |
| "loss": 217.7508, |
| "loss/crossentropy": 2.639965057373047, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1753409504890442, |
| "loss/reg": 214.93548583984375, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.00705, |
| "grad_norm": 2.7094812393188477, |
| "grad_norm_var": 0.08131089617161227, |
| "learning_rate": 0.0001, |
| "loss": 217.2597, |
| "loss/crossentropy": 2.687687873840332, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18249084055423737, |
| "loss/reg": 214.38949584960938, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.007075, |
| "grad_norm": 2.616089105606079, |
| "grad_norm_var": 0.08266783158203429, |
| "learning_rate": 0.0001, |
| "loss": 216.6144, |
| "loss/crossentropy": 2.6135380268096924, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1692454218864441, |
| "loss/reg": 213.83157348632812, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.0071, |
| "grad_norm": 2.598839521408081, |
| "grad_norm_var": 0.08113636926467128, |
| "learning_rate": 0.0001, |
| "loss": 216.4407, |
| "loss/crossentropy": 2.969308853149414, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18686597049236298, |
| "loss/reg": 213.28448486328125, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.007125, |
| "grad_norm": 2.68222713470459, |
| "grad_norm_var": 0.0809151212890202, |
| "learning_rate": 0.0001, |
| "loss": 215.8076, |
| "loss/crossentropy": 2.8860280513763428, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17992877960205078, |
| "loss/reg": 212.74166870117188, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.00715, |
| "grad_norm": 2.798988103866577, |
| "grad_norm_var": 0.07707527762625711, |
| "learning_rate": 0.0001, |
| "loss": 215.3757, |
| "loss/crossentropy": 2.9692792892456055, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21567609906196594, |
| "loss/reg": 212.19068908691406, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.007175, |
| "grad_norm": 2.5780839920043945, |
| "grad_norm_var": 0.07589498443210232, |
| "learning_rate": 0.0001, |
| "loss": 214.3753, |
| "loss/crossentropy": 2.5596561431884766, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.177793949842453, |
| "loss/reg": 211.63784790039062, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.0072, |
| "grad_norm": 2.6318469047546387, |
| "grad_norm_var": 0.07689489347470727, |
| "learning_rate": 0.0001, |
| "loss": 214.0618, |
| "loss/crossentropy": 2.7902228832244873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18190298974514008, |
| "loss/reg": 211.0896759033203, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.007225, |
| "grad_norm": 3.0802104473114014, |
| "grad_norm_var": 0.08141687457795115, |
| "learning_rate": 0.0001, |
| "loss": 213.4452, |
| "loss/crossentropy": 2.7214579582214355, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18961231410503387, |
| "loss/reg": 210.5341796875, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.00725, |
| "grad_norm": 3.087554693222046, |
| "grad_norm_var": 0.08183286107987228, |
| "learning_rate": 0.0001, |
| "loss": 213.064, |
| "loss/crossentropy": 2.886679172515869, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17951728403568268, |
| "loss/reg": 209.9978485107422, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.007275, |
| "grad_norm": 2.7192165851593018, |
| "grad_norm_var": 0.07831674565225223, |
| "learning_rate": 0.0001, |
| "loss": 212.4935, |
| "loss/crossentropy": 2.835261821746826, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19229570031166077, |
| "loss/reg": 209.4658966064453, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.0073, |
| "grad_norm": 2.9835152626037598, |
| "grad_norm_var": 0.051250203523905684, |
| "learning_rate": 0.0001, |
| "loss": 211.8015, |
| "loss/crossentropy": 2.691941499710083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18396204710006714, |
| "loss/reg": 208.92559814453125, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.007325, |
| "grad_norm": 3.2173898220062256, |
| "grad_norm_var": 0.062069919928694615, |
| "learning_rate": 0.0001, |
| "loss": 211.2997, |
| "loss/crossentropy": 2.6862618923187256, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22537732124328613, |
| "loss/reg": 208.3881072998047, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.00735, |
| "grad_norm": 3.5398621559143066, |
| "grad_norm_var": 0.09198591178203515, |
| "learning_rate": 0.0001, |
| "loss": 210.6651, |
| "loss/crossentropy": 2.612825393676758, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19290409982204437, |
| "loss/reg": 207.85940551757812, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.007375, |
| "grad_norm": 2.561976671218872, |
| "grad_norm_var": 0.09749160768811103, |
| "learning_rate": 0.0001, |
| "loss": 210.1769, |
| "loss/crossentropy": 2.6648244857788086, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20216211676597595, |
| "loss/reg": 207.30992126464844, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.0074, |
| "grad_norm": 7.271378040313721, |
| "grad_norm_var": 1.3278853272864029, |
| "learning_rate": 0.0001, |
| "loss": 209.9314, |
| "loss/crossentropy": 2.9801483154296875, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19208793342113495, |
| "loss/reg": 206.75917053222656, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.007425, |
| "grad_norm": 3.091381788253784, |
| "grad_norm_var": 1.295378929230882, |
| "learning_rate": 0.0001, |
| "loss": 209.1877, |
| "loss/crossentropy": 2.7932381629943848, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18018564581871033, |
| "loss/reg": 206.21424865722656, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.00745, |
| "grad_norm": 2.8066790103912354, |
| "grad_norm_var": 1.2904482820319183, |
| "learning_rate": 0.0001, |
| "loss": 208.8806, |
| "loss/crossentropy": 3.017073392868042, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19379809498786926, |
| "loss/reg": 205.66973876953125, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.007475, |
| "grad_norm": 2.856220006942749, |
| "grad_norm_var": 1.2772274114279487, |
| "learning_rate": 0.0001, |
| "loss": 208.1113, |
| "loss/crossentropy": 2.81599497795105, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18530045449733734, |
| "loss/reg": 205.11001586914062, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.0075, |
| "grad_norm": 2.9903173446655273, |
| "grad_norm_var": 1.2576931439437788, |
| "learning_rate": 0.0001, |
| "loss": 207.5814, |
| "loss/crossentropy": 2.826772451400757, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19662591814994812, |
| "loss/reg": 204.55799865722656, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.007525, |
| "grad_norm": 2.5510308742523193, |
| "grad_norm_var": 1.2674948009951321, |
| "learning_rate": 0.0001, |
| "loss": 206.9224, |
| "loss/crossentropy": 2.712320327758789, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19106364250183105, |
| "loss/reg": 204.01902770996094, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.00755, |
| "grad_norm": 2.508366107940674, |
| "grad_norm_var": 1.2872607464713248, |
| "learning_rate": 0.0001, |
| "loss": 206.1158, |
| "loss/crossentropy": 2.4741883277893066, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17461839318275452, |
| "loss/reg": 203.46702575683594, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.007575, |
| "grad_norm": 2.976346969604492, |
| "grad_norm_var": 1.266555341337554, |
| "learning_rate": 0.0001, |
| "loss": 205.8845, |
| "loss/crossentropy": 2.732536554336548, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2265537679195404, |
| "loss/reg": 202.92544555664062, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.0076, |
| "grad_norm": 2.8182311058044434, |
| "grad_norm_var": 1.2551146741564971, |
| "learning_rate": 0.0001, |
| "loss": 205.3437, |
| "loss/crossentropy": 2.774559736251831, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19560466706752777, |
| "loss/reg": 202.37356567382812, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.007625, |
| "grad_norm": 2.6641106605529785, |
| "grad_norm_var": 1.2720952145177449, |
| "learning_rate": 0.0001, |
| "loss": 204.813, |
| "loss/crossentropy": 2.7902915477752686, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18885600566864014, |
| "loss/reg": 201.83387756347656, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.00765, |
| "grad_norm": 2.7573277950286865, |
| "grad_norm_var": 1.2823306075296226, |
| "learning_rate": 0.0001, |
| "loss": 204.3929, |
| "loss/crossentropy": 2.89617919921875, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20617598295211792, |
| "loss/reg": 201.29051208496094, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.007675, |
| "grad_norm": 2.9883832931518555, |
| "grad_norm_var": 1.2715927971824363, |
| "learning_rate": 0.0001, |
| "loss": 203.6661, |
| "loss/crossentropy": 2.736328363418579, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17194727063179016, |
| "loss/reg": 200.75784301757812, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.0077, |
| "grad_norm": 2.671340227127075, |
| "grad_norm_var": 1.2850880861748617, |
| "learning_rate": 0.0001, |
| "loss": 203.2853, |
| "loss/crossentropy": 2.837782859802246, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20544278621673584, |
| "loss/reg": 200.2420654296875, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.007725, |
| "grad_norm": 2.5738258361816406, |
| "grad_norm_var": 1.304496016414119, |
| "learning_rate": 0.0001, |
| "loss": 202.6491, |
| "loss/crossentropy": 2.764209270477295, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17526961863040924, |
| "loss/reg": 199.7096710205078, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.00775, |
| "grad_norm": 2.727646827697754, |
| "grad_norm_var": 1.2982730821691781, |
| "learning_rate": 0.0001, |
| "loss": 201.8776, |
| "loss/crossentropy": 2.507113456726074, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18204720318317413, |
| "loss/reg": 199.1884765625, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.007775, |
| "grad_norm": 3.019986629486084, |
| "grad_norm_var": 1.2815257147172248, |
| "learning_rate": 0.0001, |
| "loss": 201.4175, |
| "loss/crossentropy": 2.580369234085083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18372410535812378, |
| "loss/reg": 198.6533660888672, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.0078, |
| "grad_norm": 4.663088321685791, |
| "grad_norm_var": 0.24891895975728176, |
| "learning_rate": 0.0001, |
| "loss": 201.1794, |
| "loss/crossentropy": 2.814059019088745, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23769119381904602, |
| "loss/reg": 198.1276092529297, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.007825, |
| "grad_norm": 2.9102060794830322, |
| "grad_norm_var": 0.24674634897498038, |
| "learning_rate": 0.0001, |
| "loss": 200.4867, |
| "loss/crossentropy": 2.6932103633880615, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1972815990447998, |
| "loss/reg": 197.5962371826172, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.00785, |
| "grad_norm": 2.906273603439331, |
| "grad_norm_var": 0.24605808105580232, |
| "learning_rate": 0.0001, |
| "loss": 200.3487, |
| "loss/crossentropy": 3.0405361652374268, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22804442048072815, |
| "loss/reg": 197.0801544189453, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.007875, |
| "grad_norm": 2.5689239501953125, |
| "grad_norm_var": 0.2533312249281851, |
| "learning_rate": 0.0001, |
| "loss": 199.4335, |
| "loss/crossentropy": 2.7003679275512695, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1923004686832428, |
| "loss/reg": 196.54086303710938, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.0079, |
| "grad_norm": 2.8070003986358643, |
| "grad_norm_var": 0.2530642011976994, |
| "learning_rate": 0.0001, |
| "loss": 199.2196, |
| "loss/crossentropy": 3.0146713256835938, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1969759613275528, |
| "loss/reg": 196.0079803466797, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.007925, |
| "grad_norm": 2.6455047130584717, |
| "grad_norm_var": 0.2494529065085686, |
| "learning_rate": 0.0001, |
| "loss": 198.6955, |
| "loss/crossentropy": 3.0136356353759766, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20378613471984863, |
| "loss/reg": 195.4780731201172, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.00795, |
| "grad_norm": 2.6797735691070557, |
| "grad_norm_var": 0.2426149646107582, |
| "learning_rate": 0.0001, |
| "loss": 197.5625, |
| "loss/crossentropy": 2.4390504360198975, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17309829592704773, |
| "loss/reg": 194.95030212402344, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.007975, |
| "grad_norm": 2.668614149093628, |
| "grad_norm_var": 0.2453445922218293, |
| "learning_rate": 0.0001, |
| "loss": 197.1961, |
| "loss/crossentropy": 2.5966238975524902, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1794152557849884, |
| "loss/reg": 194.42005920410156, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 2.552947521209717, |
| "grad_norm_var": 0.2519063072997559, |
| "learning_rate": 0.0001, |
| "loss": 196.5726, |
| "loss/crossentropy": 2.5113775730133057, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16936884820461273, |
| "loss/reg": 193.891845703125, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.008025, |
| "grad_norm": 2.836556911468506, |
| "grad_norm_var": 0.2491962625539448, |
| "learning_rate": 0.0001, |
| "loss": 196.1308, |
| "loss/crossentropy": 2.5704989433288574, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19436271488666534, |
| "loss/reg": 193.3658905029297, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.00805, |
| "grad_norm": 2.6601991653442383, |
| "grad_norm_var": 0.25129150502635655, |
| "learning_rate": 0.0001, |
| "loss": 196.0067, |
| "loss/crossentropy": 2.9537734985351562, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21296489238739014, |
| "loss/reg": 192.83993530273438, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.008075, |
| "grad_norm": 2.7744781970977783, |
| "grad_norm_var": 0.25070402667003294, |
| "learning_rate": 0.0001, |
| "loss": 195.5274, |
| "loss/crossentropy": 3.016360282897949, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19973087310791016, |
| "loss/reg": 192.3113250732422, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.0081, |
| "grad_norm": 3.1214191913604736, |
| "grad_norm_var": 0.252394334847159, |
| "learning_rate": 0.0001, |
| "loss": 195.3513, |
| "loss/crossentropy": 3.329318046569824, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23599132895469666, |
| "loss/reg": 191.78594970703125, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.008125, |
| "grad_norm": 2.6547012329101562, |
| "grad_norm_var": 0.2494769798717158, |
| "learning_rate": 0.0001, |
| "loss": 194.3078, |
| "loss/crossentropy": 2.844395399093628, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20515291392803192, |
| "loss/reg": 191.2582550048828, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.00815, |
| "grad_norm": 2.800093650817871, |
| "grad_norm_var": 0.24826251752535028, |
| "learning_rate": 0.0001, |
| "loss": 193.7962, |
| "loss/crossentropy": 2.856586217880249, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1940813660621643, |
| "loss/reg": 190.74557495117188, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.008175, |
| "grad_norm": 2.558241844177246, |
| "grad_norm_var": 0.2536998205740133, |
| "learning_rate": 0.0001, |
| "loss": 192.9891, |
| "loss/crossentropy": 2.562021493911743, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18647362291812897, |
| "loss/reg": 190.24061584472656, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.0082, |
| "grad_norm": 2.952324390411377, |
| "grad_norm_var": 0.026016228267095206, |
| "learning_rate": 0.0001, |
| "loss": 192.9934, |
| "loss/crossentropy": 3.0561769008636475, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20818649232387543, |
| "loss/reg": 189.72903442382812, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.008225, |
| "grad_norm": 3.3000900745391846, |
| "grad_norm_var": 0.043529065715877856, |
| "learning_rate": 0.0001, |
| "loss": 192.3072, |
| "loss/crossentropy": 2.8769357204437256, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21247068047523499, |
| "loss/reg": 189.21775817871094, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.00825, |
| "grad_norm": 2.693880081176758, |
| "grad_norm_var": 0.04278518629817896, |
| "learning_rate": 0.0001, |
| "loss": 191.9053, |
| "loss/crossentropy": 2.990717649459839, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20985952019691467, |
| "loss/reg": 188.7047119140625, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.008275, |
| "grad_norm": 2.718574285507202, |
| "grad_norm_var": 0.04022917313372621, |
| "learning_rate": 0.0001, |
| "loss": 191.1251, |
| "loss/crossentropy": 2.72544527053833, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18823279440402985, |
| "loss/reg": 188.21141052246094, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.0083, |
| "grad_norm": 2.4027438163757324, |
| "grad_norm_var": 0.04880048181735764, |
| "learning_rate": 0.0001, |
| "loss": 190.5635, |
| "loss/crossentropy": 2.6736438274383545, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16881534457206726, |
| "loss/reg": 187.72108459472656, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.008325, |
| "grad_norm": 4.033091068267822, |
| "grad_norm_var": 0.14957197834728883, |
| "learning_rate": 0.0001, |
| "loss": 190.4425, |
| "loss/crossentropy": 2.9758944511413574, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23931945860385895, |
| "loss/reg": 187.22726440429688, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.00835, |
| "grad_norm": 2.87990140914917, |
| "grad_norm_var": 0.14785355856326585, |
| "learning_rate": 0.0001, |
| "loss": 189.6929, |
| "loss/crossentropy": 2.762582302093506, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1953504979610443, |
| "loss/reg": 186.7349853515625, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.008375, |
| "grad_norm": 2.569755792617798, |
| "grad_norm_var": 0.15086170983481276, |
| "learning_rate": 0.0001, |
| "loss": 189.2558, |
| "loss/crossentropy": 2.8176991939544678, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2055705040693283, |
| "loss/reg": 186.23252868652344, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.0084, |
| "grad_norm": 2.7810733318328857, |
| "grad_norm_var": 0.14525191609682603, |
| "learning_rate": 0.0001, |
| "loss": 188.7248, |
| "loss/crossentropy": 2.7931947708129883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20690809190273285, |
| "loss/reg": 185.72470092773438, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.008425, |
| "grad_norm": 2.921773910522461, |
| "grad_norm_var": 0.1454556654203382, |
| "learning_rate": 0.0001, |
| "loss": 188.15, |
| "loss/crossentropy": 2.723609447479248, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2181951254606247, |
| "loss/reg": 185.20816040039062, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.00845, |
| "grad_norm": 3.092437505722046, |
| "grad_norm_var": 0.14539310884848244, |
| "learning_rate": 0.0001, |
| "loss": 187.8704, |
| "loss/crossentropy": 2.962968111038208, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2084237039089203, |
| "loss/reg": 184.69903564453125, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.008475, |
| "grad_norm": 3.1183600425720215, |
| "grad_norm_var": 0.14744546795223623, |
| "learning_rate": 0.0001, |
| "loss": 187.327, |
| "loss/crossentropy": 2.9327218532562256, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21510222554206848, |
| "loss/reg": 184.17919921875, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.0085, |
| "grad_norm": 3.118776559829712, |
| "grad_norm_var": 0.14737225768415765, |
| "learning_rate": 0.0001, |
| "loss": 186.937, |
| "loss/crossentropy": 3.054023265838623, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21871572732925415, |
| "loss/reg": 183.66429138183594, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.008525, |
| "grad_norm": 3.009570360183716, |
| "grad_norm_var": 0.14305740155455335, |
| "learning_rate": 0.0001, |
| "loss": 186.0073, |
| "loss/crossentropy": 2.6760642528533936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1764220893383026, |
| "loss/reg": 183.15478515625, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.00855, |
| "grad_norm": 2.6750733852386475, |
| "grad_norm_var": 0.14627338296657608, |
| "learning_rate": 0.0001, |
| "loss": 185.687, |
| "loss/crossentropy": 2.8624348640441895, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1818752884864807, |
| "loss/reg": 182.64266967773438, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.008575, |
| "grad_norm": 2.536036252975464, |
| "grad_norm_var": 0.14739482829090184, |
| "learning_rate": 0.0001, |
| "loss": 185.0605, |
| "loss/crossentropy": 2.755174398422241, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1642666757106781, |
| "loss/reg": 182.1410369873047, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.0086, |
| "grad_norm": 2.7478532791137695, |
| "grad_norm_var": 0.14926881515059046, |
| "learning_rate": 0.0001, |
| "loss": 184.4167, |
| "loss/crossentropy": 2.5819690227508545, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19197826087474823, |
| "loss/reg": 181.64279174804688, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.008625, |
| "grad_norm": 2.914630651473999, |
| "grad_norm_var": 0.1386317271370025, |
| "learning_rate": 0.0001, |
| "loss": 184.1337, |
| "loss/crossentropy": 2.7838339805603027, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19958782196044922, |
| "loss/reg": 181.15025329589844, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.00865, |
| "grad_norm": 2.6812732219696045, |
| "grad_norm_var": 0.13896854058942837, |
| "learning_rate": 0.0001, |
| "loss": 183.8571, |
| "loss/crossentropy": 3.0270869731903076, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17617946863174438, |
| "loss/reg": 180.65380859375, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.008675, |
| "grad_norm": 2.7997968196868896, |
| "grad_norm_var": 0.13755082338921434, |
| "learning_rate": 0.0001, |
| "loss": 183.1842, |
| "loss/crossentropy": 2.830026865005493, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2012362778186798, |
| "loss/reg": 180.15289306640625, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.0087, |
| "grad_norm": 2.632431745529175, |
| "grad_norm_var": 0.1258451860116639, |
| "learning_rate": 0.0001, |
| "loss": 182.56, |
| "loss/crossentropy": 2.704882860183716, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1936817169189453, |
| "loss/reg": 179.66139221191406, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.008725, |
| "grad_norm": 2.971341371536255, |
| "grad_norm_var": 0.036883841878242646, |
| "learning_rate": 0.0001, |
| "loss": 182.6145, |
| "loss/crossentropy": 3.2539191246032715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19264021515846252, |
| "loss/reg": 179.1678924560547, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.00875, |
| "grad_norm": 3.221050500869751, |
| "grad_norm_var": 0.04594406550980518, |
| "learning_rate": 0.0001, |
| "loss": 181.849, |
| "loss/crossentropy": 2.9532337188720703, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2272038757801056, |
| "loss/reg": 178.6685333251953, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.008775, |
| "grad_norm": 2.788480520248413, |
| "grad_norm_var": 0.04041268740233453, |
| "learning_rate": 0.0001, |
| "loss": 181.0548, |
| "loss/crossentropy": 2.6735174655914307, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20720481872558594, |
| "loss/reg": 178.17410278320312, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.0088, |
| "grad_norm": 2.8164358139038086, |
| "grad_norm_var": 0.040045045030925056, |
| "learning_rate": 0.0001, |
| "loss": 180.8875, |
| "loss/crossentropy": 3.0074617862701416, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20497429370880127, |
| "loss/reg": 177.67507934570312, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.008825, |
| "grad_norm": 2.459709882736206, |
| "grad_norm_var": 0.05068183434945368, |
| "learning_rate": 0.0001, |
| "loss": 180.1328, |
| "loss/crossentropy": 2.7839858531951904, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16963033378124237, |
| "loss/reg": 177.17913818359375, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.00885, |
| "grad_norm": 2.582552909851074, |
| "grad_norm_var": 0.05037757045170774, |
| "learning_rate": 0.0001, |
| "loss": 179.5129, |
| "loss/crossentropy": 2.63332200050354, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18901947140693665, |
| "loss/reg": 176.6905975341797, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.008875, |
| "grad_norm": 2.6625685691833496, |
| "grad_norm_var": 0.04505259166354089, |
| "learning_rate": 0.0001, |
| "loss": 179.0017, |
| "loss/crossentropy": 2.6259818077087402, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1885865181684494, |
| "loss/reg": 176.18716430664062, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.0089, |
| "grad_norm": 2.9031405448913574, |
| "grad_norm_var": 0.03846567871036252, |
| "learning_rate": 0.0001, |
| "loss": 178.6655, |
| "loss/crossentropy": 2.7855353355407715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19966638088226318, |
| "loss/reg": 175.6802520751953, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.008925, |
| "grad_norm": 2.6948978900909424, |
| "grad_norm_var": 0.03481774262957226, |
| "learning_rate": 0.0001, |
| "loss": 178.1122, |
| "loss/crossentropy": 2.7389872074127197, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20075306296348572, |
| "loss/reg": 175.1724090576172, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.00895, |
| "grad_norm": 2.755830764770508, |
| "grad_norm_var": 0.03435983560266701, |
| "learning_rate": 0.0001, |
| "loss": 177.4502, |
| "loss/crossentropy": 2.594728946685791, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19329878687858582, |
| "loss/reg": 174.6621551513672, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.008975, |
| "grad_norm": 2.4237000942230225, |
| "grad_norm_var": 0.038510630346210446, |
| "learning_rate": 0.0001, |
| "loss": 176.9384, |
| "loss/crossentropy": 2.599057197570801, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1731238067150116, |
| "loss/reg": 174.16619873046875, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.009, |
| "grad_norm": 2.7944130897521973, |
| "grad_norm_var": 0.03861118264588394, |
| "learning_rate": 0.0001, |
| "loss": 176.9084, |
| "loss/crossentropy": 2.9929091930389404, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24037709832191467, |
| "loss/reg": 173.67515563964844, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.009025, |
| "grad_norm": 2.8927605152130127, |
| "grad_norm_var": 0.03817964658272786, |
| "learning_rate": 0.0001, |
| "loss": 176.0866, |
| "loss/crossentropy": 2.695925712585449, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2011643946170807, |
| "loss/reg": 173.1894989013672, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.00905, |
| "grad_norm": 2.654205799102783, |
| "grad_norm_var": 0.038491602775841474, |
| "learning_rate": 0.0001, |
| "loss": 175.8716, |
| "loss/crossentropy": 2.9696269035339355, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19813314080238342, |
| "loss/reg": 172.70388793945312, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.009075, |
| "grad_norm": 2.500924587249756, |
| "grad_norm_var": 0.0422227970984134, |
| "learning_rate": 0.0001, |
| "loss": 175.0925, |
| "loss/crossentropy": 2.694688558578491, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19237878918647766, |
| "loss/reg": 172.20538330078125, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.0091, |
| "grad_norm": 2.856943130493164, |
| "grad_norm_var": 0.04231316052960944, |
| "learning_rate": 0.0001, |
| "loss": 174.687, |
| "loss/crossentropy": 2.7701804637908936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19237452745437622, |
| "loss/reg": 171.72442626953125, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.009125, |
| "grad_norm": 3.0055325031280518, |
| "grad_norm_var": 0.043401276039503264, |
| "learning_rate": 0.0001, |
| "loss": 174.6459, |
| "loss/crossentropy": 3.1644747257232666, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24399538338184357, |
| "loss/reg": 171.23745727539062, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.00915, |
| "grad_norm": 2.485426664352417, |
| "grad_norm_var": 0.031101142054486056, |
| "learning_rate": 0.0001, |
| "loss": 173.739, |
| "loss/crossentropy": 2.804879903793335, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1790430247783661, |
| "loss/reg": 170.75502014160156, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.009175, |
| "grad_norm": 2.922252655029297, |
| "grad_norm_var": 0.033711321846642286, |
| "learning_rate": 0.0001, |
| "loss": 173.451, |
| "loss/crossentropy": 2.9688096046447754, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20513179898262024, |
| "loss/reg": 170.2770538330078, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.0092, |
| "grad_norm": 2.730825662612915, |
| "grad_norm_var": 0.032991054055673645, |
| "learning_rate": 0.0001, |
| "loss": 172.7964, |
| "loss/crossentropy": 2.7933456897735596, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20646029710769653, |
| "loss/reg": 169.796630859375, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.009225, |
| "grad_norm": 2.482426881790161, |
| "grad_norm_var": 0.03227169195139723, |
| "learning_rate": 0.0001, |
| "loss": 172.0632, |
| "loss/crossentropy": 2.5588743686676025, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18388806283473969, |
| "loss/reg": 169.3204803466797, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.00925, |
| "grad_norm": 2.739718198776245, |
| "grad_norm_var": 0.03115998847033552, |
| "learning_rate": 0.0001, |
| "loss": 171.5819, |
| "loss/crossentropy": 2.54241681098938, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19010567665100098, |
| "loss/reg": 168.84933471679688, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.009275, |
| "grad_norm": 2.8765339851379395, |
| "grad_norm_var": 0.03240860179928783, |
| "learning_rate": 0.0001, |
| "loss": 171.0873, |
| "loss/crossentropy": 2.5227410793304443, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18850305676460266, |
| "loss/reg": 168.37608337402344, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.0093, |
| "grad_norm": 2.648451089859009, |
| "grad_norm_var": 0.030667067483867782, |
| "learning_rate": 0.0001, |
| "loss": 170.7962, |
| "loss/crossentropy": 2.701418161392212, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18235236406326294, |
| "loss/reg": 167.9124755859375, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.009325, |
| "grad_norm": 2.3994297981262207, |
| "grad_norm_var": 0.03697651271872549, |
| "learning_rate": 0.0001, |
| "loss": 170.5056, |
| "loss/crossentropy": 2.8883211612701416, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17240813374519348, |
| "loss/reg": 167.44482421875, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.00935, |
| "grad_norm": 2.760050058364868, |
| "grad_norm_var": 0.03701011100701498, |
| "learning_rate": 0.0001, |
| "loss": 170.0168, |
| "loss/crossentropy": 2.828566551208496, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20667554438114166, |
| "loss/reg": 166.98158264160156, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.009375, |
| "grad_norm": 2.598747968673706, |
| "grad_norm_var": 0.032514977652635696, |
| "learning_rate": 0.0001, |
| "loss": 169.915, |
| "loss/crossentropy": 3.2055323123931885, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18947294354438782, |
| "loss/reg": 166.51995849609375, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.0094, |
| "grad_norm": 2.5448288917541504, |
| "grad_norm_var": 0.03357553294952117, |
| "learning_rate": 0.0001, |
| "loss": 169.237, |
| "loss/crossentropy": 2.9864261150360107, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19935306906700134, |
| "loss/reg": 166.05117797851562, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.009425, |
| "grad_norm": 2.9218602180480957, |
| "grad_norm_var": 0.034400838745598135, |
| "learning_rate": 0.0001, |
| "loss": 169.3156, |
| "loss/crossentropy": 3.513453722000122, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21727558970451355, |
| "loss/reg": 165.5848846435547, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.00945, |
| "grad_norm": 2.6745686531066895, |
| "grad_norm_var": 0.03431461157821616, |
| "learning_rate": 0.0001, |
| "loss": 167.9062, |
| "loss/crossentropy": 2.607114791870117, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17556855082511902, |
| "loss/reg": 165.1234893798828, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.009475, |
| "grad_norm": 2.8298020362854004, |
| "grad_norm_var": 0.03248619908528383, |
| "learning_rate": 0.0001, |
| "loss": 167.8138, |
| "loss/crossentropy": 2.9435441493988037, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20177403092384338, |
| "loss/reg": 164.66851806640625, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.0095, |
| "grad_norm": 2.4895458221435547, |
| "grad_norm_var": 0.03408372867624981, |
| "learning_rate": 0.0001, |
| "loss": 167.1126, |
| "loss/crossentropy": 2.7143633365631104, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1809697151184082, |
| "loss/reg": 164.2172393798828, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.009525, |
| "grad_norm": 2.7371509075164795, |
| "grad_norm_var": 0.027450997371564274, |
| "learning_rate": 0.0001, |
| "loss": 166.7423, |
| "loss/crossentropy": 2.7732856273651123, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1965845823287964, |
| "loss/reg": 163.7724609375, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.00955, |
| "grad_norm": 2.873455047607422, |
| "grad_norm_var": 0.026918816484265782, |
| "learning_rate": 0.0001, |
| "loss": 166.429, |
| "loss/crossentropy": 2.9108941555023193, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20937275886535645, |
| "loss/reg": 163.30874633789062, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.009575, |
| "grad_norm": 2.4213316440582275, |
| "grad_norm_var": 0.02788105642846309, |
| "learning_rate": 0.0001, |
| "loss": 165.6412, |
| "loss/crossentropy": 2.609408140182495, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18198969960212708, |
| "loss/reg": 162.84983825683594, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.0096, |
| "grad_norm": 2.3272271156311035, |
| "grad_norm_var": 0.03481792449697399, |
| "learning_rate": 0.0001, |
| "loss": 165.1343, |
| "loss/crossentropy": 2.555964469909668, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19197088479995728, |
| "loss/reg": 162.3863983154297, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.009625, |
| "grad_norm": 2.934056282043457, |
| "grad_norm_var": 0.03775698672306949, |
| "learning_rate": 0.0001, |
| "loss": 165.2302, |
| "loss/crossentropy": 3.0558810234069824, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2526497542858124, |
| "loss/reg": 161.92169189453125, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.00965, |
| "grad_norm": 2.6826000213623047, |
| "grad_norm_var": 0.03745695106176162, |
| "learning_rate": 0.0001, |
| "loss": 163.9906, |
| "loss/crossentropy": 2.3432304859161377, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18046601116657257, |
| "loss/reg": 161.46690368652344, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.009675, |
| "grad_norm": 2.6484644412994385, |
| "grad_norm_var": 0.03442670004298994, |
| "learning_rate": 0.0001, |
| "loss": 163.9132, |
| "loss/crossentropy": 2.702923536300659, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2004016935825348, |
| "loss/reg": 161.00990295410156, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.0097, |
| "grad_norm": 2.6372387409210205, |
| "grad_norm_var": 0.03444542888671454, |
| "learning_rate": 0.0001, |
| "loss": 163.1478, |
| "loss/crossentropy": 2.4251551628112793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.16877132654190063, |
| "loss/reg": 160.5538787841797, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.009725, |
| "grad_norm": 2.702305793762207, |
| "grad_norm_var": 0.029857082413892993, |
| "learning_rate": 0.0001, |
| "loss": 163.1074, |
| "loss/crossentropy": 2.8232805728912354, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.185012549161911, |
| "loss/reg": 160.09906005859375, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.00975, |
| "grad_norm": 2.8600754737854004, |
| "grad_norm_var": 0.031630664651837746, |
| "learning_rate": 0.0001, |
| "loss": 162.6523, |
| "loss/crossentropy": 2.8138694763183594, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19447121024131775, |
| "loss/reg": 159.64395141601562, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.009775, |
| "grad_norm": 3.1134562492370605, |
| "grad_norm_var": 0.0425983283858803, |
| "learning_rate": 0.0001, |
| "loss": 162.2286, |
| "loss/crossentropy": 2.8298885822296143, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19982922077178955, |
| "loss/reg": 159.1988983154297, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.0098, |
| "grad_norm": 2.7824184894561768, |
| "grad_norm_var": 0.04081881578391986, |
| "learning_rate": 0.0001, |
| "loss": 161.8162, |
| "loss/crossentropy": 2.8324615955352783, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23998713493347168, |
| "loss/reg": 158.74378967285156, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.009825, |
| "grad_norm": 2.5551321506500244, |
| "grad_norm_var": 0.039707183410192214, |
| "learning_rate": 0.0001, |
| "loss": 161.2205, |
| "loss/crossentropy": 2.7324817180633545, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20197457075119019, |
| "loss/reg": 158.28604125976562, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.00985, |
| "grad_norm": 2.9523658752441406, |
| "grad_norm_var": 0.04342908454165884, |
| "learning_rate": 0.0001, |
| "loss": 161.0285, |
| "loss/crossentropy": 2.9625911712646484, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23311267793178558, |
| "loss/reg": 157.83279418945312, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.009875, |
| "grad_norm": 2.851621627807617, |
| "grad_norm_var": 0.043773443776307396, |
| "learning_rate": 0.0001, |
| "loss": 160.4359, |
| "loss/crossentropy": 2.845726728439331, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20467609167099, |
| "loss/reg": 157.38543701171875, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.0099, |
| "grad_norm": 2.7198143005371094, |
| "grad_norm_var": 0.03991894337190445, |
| "learning_rate": 0.0001, |
| "loss": 159.8738, |
| "loss/crossentropy": 2.746978998184204, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19763833284378052, |
| "loss/reg": 156.92922973632812, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.009925, |
| "grad_norm": 3.205587148666382, |
| "grad_norm_var": 0.05361669114409532, |
| "learning_rate": 0.0001, |
| "loss": 159.7989, |
| "loss/crossentropy": 3.081794261932373, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2381635308265686, |
| "loss/reg": 156.47898864746094, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.00995, |
| "grad_norm": 2.616609573364258, |
| "grad_norm_var": 0.05408374472743371, |
| "learning_rate": 0.0001, |
| "loss": 159.1617, |
| "loss/crossentropy": 2.927096128463745, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21107865869998932, |
| "loss/reg": 156.02352905273438, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.009975, |
| "grad_norm": 2.7694790363311768, |
| "grad_norm_var": 0.04637258989533374, |
| "learning_rate": 0.0001, |
| "loss": 158.5532, |
| "loss/crossentropy": 2.7599570751190186, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22352541983127594, |
| "loss/reg": 155.56968688964844, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.01, |
| "grad_norm": 2.700110912322998, |
| "grad_norm_var": 0.032929538641367155, |
| "learning_rate": 0.0001, |
| "loss": 157.9857, |
| "loss/crossentropy": 2.6631603240966797, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20680838823318481, |
| "loss/reg": 155.11572265625, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.010025, |
| "grad_norm": 2.723098039627075, |
| "grad_norm_var": 0.031819586107117694, |
| "learning_rate": 0.0001, |
| "loss": 157.8913, |
| "loss/crossentropy": 3.0035312175750732, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21532365679740906, |
| "loss/reg": 154.67247009277344, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.01005, |
| "grad_norm": 3.2790751457214355, |
| "grad_norm_var": 0.046109071751097255, |
| "learning_rate": 0.0001, |
| "loss": 157.6402, |
| "loss/crossentropy": 3.1786797046661377, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2247261106967926, |
| "loss/reg": 154.2367706298828, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.010075, |
| "grad_norm": 3.035478353500366, |
| "grad_norm_var": 0.04662890368927582, |
| "learning_rate": 0.0001, |
| "loss": 156.9016, |
| "loss/crossentropy": 2.859160900115967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23791690170764923, |
| "loss/reg": 153.80447387695312, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.0101, |
| "grad_norm": 3.228116750717163, |
| "grad_norm_var": 0.05216118625241369, |
| "learning_rate": 0.0001, |
| "loss": 156.2435, |
| "loss/crossentropy": 2.6365485191345215, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23830467462539673, |
| "loss/reg": 153.36862182617188, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.010125, |
| "grad_norm": 2.873171091079712, |
| "grad_norm_var": 0.049916639205365716, |
| "learning_rate": 0.0001, |
| "loss": 156.137, |
| "loss/crossentropy": 2.9804184436798096, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2179238498210907, |
| "loss/reg": 152.9386444091797, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.01015, |
| "grad_norm": 2.98555326461792, |
| "grad_norm_var": 0.05037325371620132, |
| "learning_rate": 0.0001, |
| "loss": 155.7607, |
| "loss/crossentropy": 3.0471410751342773, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20413793623447418, |
| "loss/reg": 152.50946044921875, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.010175, |
| "grad_norm": 7.3229899406433105, |
| "grad_norm_var": 1.2780035865068755, |
| "learning_rate": 0.0001, |
| "loss": 155.6208, |
| "loss/crossentropy": 3.179004192352295, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3593263626098633, |
| "loss/reg": 152.0824432373047, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.0102, |
| "grad_norm": 2.5819976329803467, |
| "grad_norm_var": 1.2906719922528245, |
| "learning_rate": 0.0001, |
| "loss": 154.8975, |
| "loss/crossentropy": 3.030137538909912, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20869307219982147, |
| "loss/reg": 151.65867614746094, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.010225, |
| "grad_norm": 2.881765604019165, |
| "grad_norm_var": 1.2714323685464743, |
| "learning_rate": 0.0001, |
| "loss": 154.2088, |
| "loss/crossentropy": 2.7575833797454834, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2238922119140625, |
| "loss/reg": 151.2273712158203, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.01025, |
| "grad_norm": 3.032489061355591, |
| "grad_norm_var": 1.269504032877872, |
| "learning_rate": 0.0001, |
| "loss": 154.2042, |
| "loss/crossentropy": 3.1616742610931396, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24033766984939575, |
| "loss/reg": 150.8022003173828, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.010275, |
| "grad_norm": 2.8271679878234863, |
| "grad_norm_var": 1.270597194896881, |
| "learning_rate": 0.0001, |
| "loss": 153.2748, |
| "loss/crossentropy": 2.699521780014038, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20507997274398804, |
| "loss/reg": 150.3701934814453, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.0103, |
| "grad_norm": 2.8864877223968506, |
| "grad_norm_var": 1.2622421001984312, |
| "learning_rate": 0.0001, |
| "loss": 152.7086, |
| "loss/crossentropy": 2.557340383529663, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20991156995296478, |
| "loss/reg": 149.94137573242188, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.010325, |
| "grad_norm": 2.665037155151367, |
| "grad_norm_var": 1.278971707345721, |
| "learning_rate": 0.0001, |
| "loss": 152.6803, |
| "loss/crossentropy": 2.9646856784820557, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19997093081474304, |
| "loss/reg": 149.51559448242188, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.01035, |
| "grad_norm": 2.9376747608184814, |
| "grad_norm_var": 1.2625575568111116, |
| "learning_rate": 0.0001, |
| "loss": 151.8695, |
| "loss/crossentropy": 2.5759167671203613, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2004423588514328, |
| "loss/reg": 149.09315490722656, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.010375, |
| "grad_norm": 2.8804266452789307, |
| "grad_norm_var": 1.2573930188301232, |
| "learning_rate": 0.0001, |
| "loss": 151.6834, |
| "loss/crossentropy": 2.805604934692383, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19938749074935913, |
| "loss/reg": 148.67840576171875, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.0104, |
| "grad_norm": 2.844179153442383, |
| "grad_norm_var": 1.2495192648568405, |
| "learning_rate": 0.0001, |
| "loss": 151.2533, |
| "loss/crossentropy": 2.804511547088623, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19970285892486572, |
| "loss/reg": 148.2490692138672, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.010425, |
| "grad_norm": 2.5592336654663086, |
| "grad_norm_var": 1.2613231291949638, |
| "learning_rate": 0.0001, |
| "loss": 150.7822, |
| "loss/crossentropy": 2.7462940216064453, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20548762381076813, |
| "loss/reg": 147.8304443359375, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.01045, |
| "grad_norm": 2.6821486949920654, |
| "grad_norm_var": 1.2754135282406718, |
| "learning_rate": 0.0001, |
| "loss": 150.2964, |
| "loss/crossentropy": 2.685582399368286, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20678475499153137, |
| "loss/reg": 147.40402221679688, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.010475, |
| "grad_norm": 2.9765594005584717, |
| "grad_norm_var": 1.2764437045171377, |
| "learning_rate": 0.0001, |
| "loss": 150.4573, |
| "loss/crossentropy": 3.2364401817321777, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2370205670595169, |
| "loss/reg": 146.98388671875, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.0105, |
| "grad_norm": 2.849670886993408, |
| "grad_norm_var": 1.2807121780616888, |
| "learning_rate": 0.0001, |
| "loss": 149.4418, |
| "loss/crossentropy": 2.6446595191955566, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23380397260189056, |
| "loss/reg": 146.56333923339844, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.010525, |
| "grad_norm": 2.668278694152832, |
| "grad_norm_var": 1.289851246662234, |
| "learning_rate": 0.0001, |
| "loss": 149.259, |
| "loss/crossentropy": 2.9065048694610596, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21474584937095642, |
| "loss/reg": 146.1377716064453, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.01055, |
| "grad_norm": 2.594972848892212, |
| "grad_norm_var": 1.3052862072893956, |
| "learning_rate": 0.0001, |
| "loss": 148.7854, |
| "loss/crossentropy": 2.8732926845550537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19787928462028503, |
| "loss/reg": 145.71426391601562, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.010575, |
| "grad_norm": 3.493597984313965, |
| "grad_norm_var": 0.05255425549001392, |
| "learning_rate": 0.0001, |
| "loss": 148.5448, |
| "loss/crossentropy": 2.979468584060669, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2681131958961487, |
| "loss/reg": 145.29721069335938, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.0106, |
| "grad_norm": 2.545316219329834, |
| "grad_norm_var": 0.05387626475652117, |
| "learning_rate": 0.0001, |
| "loss": 147.7063, |
| "loss/crossentropy": 2.627589225769043, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1998068243265152, |
| "loss/reg": 144.87893676757812, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.010625, |
| "grad_norm": 2.801800489425659, |
| "grad_norm_var": 0.05375398155694313, |
| "learning_rate": 0.0001, |
| "loss": 147.7385, |
| "loss/crossentropy": 3.051680088043213, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23187664151191711, |
| "loss/reg": 144.4549560546875, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.01065, |
| "grad_norm": 2.5229389667510986, |
| "grad_norm_var": 0.056076010247692425, |
| "learning_rate": 0.0001, |
| "loss": 146.9956, |
| "loss/crossentropy": 2.7526652812957764, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20328941941261292, |
| "loss/reg": 144.0396270751953, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.010675, |
| "grad_norm": 2.885693311691284, |
| "grad_norm_var": 0.05653354974819133, |
| "learning_rate": 0.0001, |
| "loss": 146.6748, |
| "loss/crossentropy": 2.816437244415283, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22479213774204254, |
| "loss/reg": 143.633544921875, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.0107, |
| "grad_norm": 2.888580083847046, |
| "grad_norm_var": 0.05655805617718836, |
| "learning_rate": 0.0001, |
| "loss": 146.5772, |
| "loss/crossentropy": 3.1261234283447266, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2150314301252365, |
| "loss/reg": 143.23602294921875, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.010725, |
| "grad_norm": 2.679644823074341, |
| "grad_norm_var": 0.05630900067725785, |
| "learning_rate": 0.0001, |
| "loss": 145.6467, |
| "loss/crossentropy": 2.6079299449920654, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21127401292324066, |
| "loss/reg": 142.82745361328125, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.01075, |
| "grad_norm": 2.5422425270080566, |
| "grad_norm_var": 0.05885842547493757, |
| "learning_rate": 0.0001, |
| "loss": 145.2591, |
| "loss/crossentropy": 2.6437766551971436, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19190073013305664, |
| "loss/reg": 142.42344665527344, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.010775, |
| "grad_norm": 2.6776621341705322, |
| "grad_norm_var": 0.058603604392913886, |
| "learning_rate": 0.0001, |
| "loss": 145.2115, |
| "loss/crossentropy": 2.9833149909973145, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2141813039779663, |
| "loss/reg": 142.0140380859375, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.0108, |
| "grad_norm": 2.8127005100250244, |
| "grad_norm_var": 0.058326000336826195, |
| "learning_rate": 0.0001, |
| "loss": 145.2914, |
| "loss/crossentropy": 3.4850382804870605, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1998693197965622, |
| "loss/reg": 141.60650634765625, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.010825, |
| "grad_norm": 2.7787222862243652, |
| "grad_norm_var": 0.05542301102962029, |
| "learning_rate": 0.0001, |
| "loss": 144.1324, |
| "loss/crossentropy": 2.7224390506744385, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21212972700595856, |
| "loss/reg": 141.1977996826172, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.01085, |
| "grad_norm": 2.582594156265259, |
| "grad_norm_var": 0.057275397262241276, |
| "learning_rate": 0.0001, |
| "loss": 143.8046, |
| "loss/crossentropy": 2.824321985244751, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19551266729831696, |
| "loss/reg": 140.7847900390625, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.010875, |
| "grad_norm": 2.746096134185791, |
| "grad_norm_var": 0.05421119495302354, |
| "learning_rate": 0.0001, |
| "loss": 143.3764, |
| "loss/crossentropy": 2.780687093734741, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21173939108848572, |
| "loss/reg": 140.38397216796875, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.0109, |
| "grad_norm": 3.0453367233276367, |
| "grad_norm_var": 0.05908933324654957, |
| "learning_rate": 0.0001, |
| "loss": 143.2668, |
| "loss/crossentropy": 3.0895285606384277, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2058122158050537, |
| "loss/reg": 139.97145080566406, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.010925, |
| "grad_norm": 2.8102662563323975, |
| "grad_norm_var": 0.058487291701452664, |
| "learning_rate": 0.0001, |
| "loss": 142.5939, |
| "loss/crossentropy": 2.8098881244659424, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21639777719974518, |
| "loss/reg": 139.56761169433594, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.01095, |
| "grad_norm": 2.786220073699951, |
| "grad_norm_var": 0.05616962127030523, |
| "learning_rate": 0.0001, |
| "loss": 142.2782, |
| "loss/crossentropy": 2.889941453933716, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22128039598464966, |
| "loss/reg": 139.16693115234375, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.010975, |
| "grad_norm": 2.496029853820801, |
| "grad_norm_var": 0.024443678618327407, |
| "learning_rate": 0.0001, |
| "loss": 141.7307, |
| "loss/crossentropy": 2.751573085784912, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19620397686958313, |
| "loss/reg": 138.782958984375, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.011, |
| "grad_norm": 4.073153495788574, |
| "grad_norm_var": 0.13370943824087528, |
| "learning_rate": 0.0001, |
| "loss": 141.1084, |
| "loss/crossentropy": 2.547888994216919, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1862032264471054, |
| "loss/reg": 138.37429809570312, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.011025, |
| "grad_norm": 2.8439884185791016, |
| "grad_norm_var": 0.13371489998645056, |
| "learning_rate": 0.0001, |
| "loss": 141.1169, |
| "loss/crossentropy": 2.9181034564971924, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22650295495986938, |
| "loss/reg": 137.97227478027344, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.01105, |
| "grad_norm": 3.0714433193206787, |
| "grad_norm_var": 0.13055613818799353, |
| "learning_rate": 0.0001, |
| "loss": 140.6507, |
| "loss/crossentropy": 2.857541084289551, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21894115209579468, |
| "loss/reg": 137.57418823242188, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.011075, |
| "grad_norm": 2.4183578491210938, |
| "grad_norm_var": 0.14245098271452789, |
| "learning_rate": 0.0001, |
| "loss": 140.0377, |
| "loss/crossentropy": 2.6772847175598145, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18452061712741852, |
| "loss/reg": 137.17588806152344, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.0111, |
| "grad_norm": 3.0140979290008545, |
| "grad_norm_var": 0.14444423385113864, |
| "learning_rate": 0.0001, |
| "loss": 139.7851, |
| "loss/crossentropy": 2.8040971755981445, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19091713428497314, |
| "loss/reg": 136.79010009765625, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.011125, |
| "grad_norm": 2.9585142135620117, |
| "grad_norm_var": 0.14348511163545558, |
| "learning_rate": 0.0001, |
| "loss": 139.3333, |
| "loss/crossentropy": 2.7211837768554688, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2089160978794098, |
| "loss/reg": 136.40313720703125, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.01115, |
| "grad_norm": 2.8655238151550293, |
| "grad_norm_var": 0.1365966991771605, |
| "learning_rate": 0.0001, |
| "loss": 139.1563, |
| "loss/crossentropy": 2.9270730018615723, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21874740719795227, |
| "loss/reg": 136.01043701171875, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.011175, |
| "grad_norm": 3.5568222999572754, |
| "grad_norm_var": 0.16191349512506553, |
| "learning_rate": 0.0001, |
| "loss": 138.9605, |
| "loss/crossentropy": 3.0990545749664307, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2374783754348755, |
| "loss/reg": 135.62399291992188, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.0112, |
| "grad_norm": 3.069288730621338, |
| "grad_norm_var": 0.16205836586185096, |
| "learning_rate": 0.0001, |
| "loss": 138.2294, |
| "loss/crossentropy": 2.8115599155426025, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19097256660461426, |
| "loss/reg": 135.22686767578125, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.011225, |
| "grad_norm": 2.829019784927368, |
| "grad_norm_var": 0.16110285265393878, |
| "learning_rate": 0.0001, |
| "loss": 137.3381, |
| "loss/crossentropy": 2.2983205318450928, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20224307477474213, |
| "loss/reg": 134.8375701904297, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.01125, |
| "grad_norm": 2.959508180618286, |
| "grad_norm_var": 0.1516222356653503, |
| "learning_rate": 0.0001, |
| "loss": 137.696, |
| "loss/crossentropy": 3.0192501544952393, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22333860397338867, |
| "loss/reg": 134.4534149169922, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.011275, |
| "grad_norm": 2.742422342300415, |
| "grad_norm_var": 0.15173348060539713, |
| "learning_rate": 0.0001, |
| "loss": 137.0164, |
| "loss/crossentropy": 2.738880157470703, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2160961925983429, |
| "loss/reg": 134.0614776611328, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.0113, |
| "grad_norm": 2.8377208709716797, |
| "grad_norm_var": 0.1523766132789395, |
| "learning_rate": 0.0001, |
| "loss": 136.7287, |
| "loss/crossentropy": 2.8547892570495605, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21014469861984253, |
| "loss/reg": 133.6637420654297, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.011325, |
| "grad_norm": 2.859851121902466, |
| "grad_norm_var": 0.15155175629658876, |
| "learning_rate": 0.0001, |
| "loss": 136.1562, |
| "loss/crossentropy": 2.66414213180542, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22573256492614746, |
| "loss/reg": 133.26629638671875, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.01135, |
| "grad_norm": 2.6915457248687744, |
| "grad_norm_var": 0.15432295238523253, |
| "learning_rate": 0.0001, |
| "loss": 135.9197, |
| "loss/crossentropy": 2.8435401916503906, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19813670217990875, |
| "loss/reg": 132.87806701660156, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.011375, |
| "grad_norm": 2.7335429191589355, |
| "grad_norm_var": 0.14329945186020698, |
| "learning_rate": 0.0001, |
| "loss": 135.273, |
| "loss/crossentropy": 2.5880157947540283, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19232623279094696, |
| "loss/reg": 132.49264526367188, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.0114, |
| "grad_norm": 2.695430040359497, |
| "grad_norm_var": 0.059341799627403206, |
| "learning_rate": 0.0001, |
| "loss": 135.0541, |
| "loss/crossentropy": 2.746819257736206, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20626962184906006, |
| "loss/reg": 132.10101318359375, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.011425, |
| "grad_norm": 2.8293468952178955, |
| "grad_norm_var": 0.05943368425061877, |
| "learning_rate": 0.0001, |
| "loss": 134.5696, |
| "loss/crossentropy": 2.6393778324127197, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21043707430362701, |
| "loss/reg": 131.71983337402344, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.01145, |
| "grad_norm": 2.8794851303100586, |
| "grad_norm_var": 0.05692067856874985, |
| "learning_rate": 0.0001, |
| "loss": 134.1734, |
| "loss/crossentropy": 2.650709390640259, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19179099798202515, |
| "loss/reg": 131.3309326171875, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.011475, |
| "grad_norm": 2.8426058292388916, |
| "grad_norm_var": 0.042549658611572026, |
| "learning_rate": 0.0001, |
| "loss": 133.6375, |
| "loss/crossentropy": 2.513119697570801, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17655596137046814, |
| "loss/reg": 130.9477996826172, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.0115, |
| "grad_norm": 3.155630588531494, |
| "grad_norm_var": 0.045996375141740486, |
| "learning_rate": 0.0001, |
| "loss": 133.6152, |
| "loss/crossentropy": 2.833134651184082, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21260111033916473, |
| "loss/reg": 130.5695037841797, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.011525, |
| "grad_norm": 2.494323492050171, |
| "grad_norm_var": 0.05625290696184659, |
| "learning_rate": 0.0001, |
| "loss": 133.0751, |
| "loss/crossentropy": 2.6999471187591553, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18340814113616943, |
| "loss/reg": 130.19174194335938, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.01155, |
| "grad_norm": 2.6319777965545654, |
| "grad_norm_var": 0.06003884724692152, |
| "learning_rate": 0.0001, |
| "loss": 132.8793, |
| "loss/crossentropy": 2.860808849334717, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20245122909545898, |
| "loss/reg": 129.8160400390625, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.011575, |
| "grad_norm": 2.514202117919922, |
| "grad_norm_var": 0.03153201551004561, |
| "learning_rate": 0.0001, |
| "loss": 132.3611, |
| "loss/crossentropy": 2.730762481689453, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18745794892311096, |
| "loss/reg": 129.44293212890625, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.0116, |
| "grad_norm": 2.5370423793792725, |
| "grad_norm_var": 0.029975769359208235, |
| "learning_rate": 0.0001, |
| "loss": 131.8961, |
| "loss/crossentropy": 2.640779972076416, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18950456380844116, |
| "loss/reg": 129.0658416748047, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.011625, |
| "grad_norm": 4.233380317687988, |
| "grad_norm_var": 0.16530188527350068, |
| "learning_rate": 0.0001, |
| "loss": 131.9097, |
| "loss/crossentropy": 2.9330646991729736, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2857138216495514, |
| "loss/reg": 128.69088745117188, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.01165, |
| "grad_norm": 2.779022216796875, |
| "grad_norm_var": 0.16475971985575838, |
| "learning_rate": 0.0001, |
| "loss": 131.55, |
| "loss/crossentropy": 3.005692481994629, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22528377175331116, |
| "loss/reg": 128.3190460205078, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.011675, |
| "grad_norm": 2.610145092010498, |
| "grad_norm_var": 0.16759359645252517, |
| "learning_rate": 0.0001, |
| "loss": 130.7001, |
| "loss/crossentropy": 2.568135976791382, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17949801683425903, |
| "loss/reg": 127.9524917602539, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.0117, |
| "grad_norm": 2.7065391540527344, |
| "grad_norm_var": 0.1685835608909353, |
| "learning_rate": 0.0001, |
| "loss": 130.8896, |
| "loss/crossentropy": 3.0850789546966553, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21300601959228516, |
| "loss/reg": 127.591552734375, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.011725, |
| "grad_norm": 2.577502965927124, |
| "grad_norm_var": 0.17224012017982868, |
| "learning_rate": 0.0001, |
| "loss": 130.1112, |
| "loss/crossentropy": 2.673410177230835, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21455711126327515, |
| "loss/reg": 127.2232666015625, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.01175, |
| "grad_norm": 2.620901346206665, |
| "grad_norm_var": 0.17363936391112228, |
| "learning_rate": 0.0001, |
| "loss": 130.2284, |
| "loss/crossentropy": 3.125420093536377, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23448872566223145, |
| "loss/reg": 126.86849975585938, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.011775, |
| "grad_norm": 2.6767570972442627, |
| "grad_norm_var": 0.1743635181200638, |
| "learning_rate": 0.0001, |
| "loss": 129.5788, |
| "loss/crossentropy": 2.8606624603271484, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.214532732963562, |
| "loss/reg": 126.50355529785156, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.0118, |
| "grad_norm": 2.6148667335510254, |
| "grad_norm_var": 0.17588189249079184, |
| "learning_rate": 0.0001, |
| "loss": 129.3386, |
| "loss/crossentropy": 2.9896390438079834, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20493671298027039, |
| "loss/reg": 126.14399719238281, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.011825, |
| "grad_norm": 2.431541681289673, |
| "grad_norm_var": 0.1838967324892555, |
| "learning_rate": 0.0001, |
| "loss": 128.7304, |
| "loss/crossentropy": 2.748108386993408, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20910008251667023, |
| "loss/reg": 125.7732162475586, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.01185, |
| "grad_norm": 2.770341157913208, |
| "grad_norm_var": 0.18303516965960565, |
| "learning_rate": 0.0001, |
| "loss": 128.8167, |
| "loss/crossentropy": 3.200181245803833, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20696839690208435, |
| "loss/reg": 125.40956115722656, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.011875, |
| "grad_norm": 2.664780378341675, |
| "grad_norm_var": 0.1831074521196359, |
| "learning_rate": 0.0001, |
| "loss": 128.242, |
| "loss/crossentropy": 2.969964027404785, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22718313336372375, |
| "loss/reg": 125.04483032226562, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.0119, |
| "grad_norm": 2.510708808898926, |
| "grad_norm_var": 0.17432457651303418, |
| "learning_rate": 0.0001, |
| "loss": 127.4987, |
| "loss/crossentropy": 2.6007182598114014, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21277430653572083, |
| "loss/reg": 124.68523406982422, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.011925, |
| "grad_norm": 2.9702818393707275, |
| "grad_norm_var": 0.17474036873515666, |
| "learning_rate": 0.0001, |
| "loss": 127.41, |
| "loss/crossentropy": 2.86385178565979, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2170620560646057, |
| "loss/reg": 124.32904052734375, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.01195, |
| "grad_norm": 2.6219518184661865, |
| "grad_norm_var": 0.17489188976909203, |
| "learning_rate": 0.0001, |
| "loss": 126.9219, |
| "loss/crossentropy": 2.7511582374572754, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2034904807806015, |
| "loss/reg": 123.96720886230469, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.011975, |
| "grad_norm": 2.599919080734253, |
| "grad_norm_var": 0.17277049923038704, |
| "learning_rate": 0.0001, |
| "loss": 126.5413, |
| "loss/crossentropy": 2.7556185722351074, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1808776557445526, |
| "loss/reg": 123.60482025146484, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 2.5657715797424316, |
| "grad_norm_var": 0.17202413016778048, |
| "learning_rate": 0.0001, |
| "loss": 126.311, |
| "loss/crossentropy": 2.8689675331115723, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20293821394443512, |
| "loss/reg": 123.23907470703125, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.012025, |
| "grad_norm": 3.2952754497528076, |
| "grad_norm_var": 0.04112811192426212, |
| "learning_rate": 0.0001, |
| "loss": 125.9246, |
| "loss/crossentropy": 2.818702220916748, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2267230898141861, |
| "loss/reg": 122.87913513183594, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.01205, |
| "grad_norm": 2.615551233291626, |
| "grad_norm_var": 0.040825667865743515, |
| "learning_rate": 0.0001, |
| "loss": 125.7597, |
| "loss/crossentropy": 3.017942428588867, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21883933246135712, |
| "loss/reg": 122.52295684814453, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.012075, |
| "grad_norm": 3.4193038940429688, |
| "grad_norm_var": 0.07439346615546445, |
| "learning_rate": 0.0001, |
| "loss": 125.1558, |
| "loss/crossentropy": 2.7724950313568115, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21510450541973114, |
| "loss/reg": 122.16816711425781, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.0121, |
| "grad_norm": 2.505560874938965, |
| "grad_norm_var": 0.07751650924940326, |
| "learning_rate": 0.0001, |
| "loss": 124.927, |
| "loss/crossentropy": 2.9188194274902344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19483289122581482, |
| "loss/reg": 121.81333923339844, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.012125, |
| "grad_norm": 2.7361960411071777, |
| "grad_norm_var": 0.07615337485008998, |
| "learning_rate": 0.0001, |
| "loss": 124.5378, |
| "loss/crossentropy": 2.8456363677978516, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22739849984645844, |
| "loss/reg": 121.46476745605469, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.01215, |
| "grad_norm": 2.5417439937591553, |
| "grad_norm_var": 0.07765668354199894, |
| "learning_rate": 0.0001, |
| "loss": 124.0698, |
| "loss/crossentropy": 2.7495946884155273, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20217089354991913, |
| "loss/reg": 121.11808013916016, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.012175, |
| "grad_norm": 2.67401385307312, |
| "grad_norm_var": 0.07767344047614415, |
| "learning_rate": 0.0001, |
| "loss": 123.7152, |
| "loss/crossentropy": 2.7279324531555176, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22579890489578247, |
| "loss/reg": 120.76143646240234, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.0122, |
| "grad_norm": 2.488034248352051, |
| "grad_norm_var": 0.08047557627683753, |
| "learning_rate": 0.0001, |
| "loss": 123.451, |
| "loss/crossentropy": 2.8445310592651367, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19405901432037354, |
| "loss/reg": 120.41239929199219, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.012225, |
| "grad_norm": 2.5098259449005127, |
| "grad_norm_var": 0.07791882719567696, |
| "learning_rate": 0.0001, |
| "loss": 123.1306, |
| "loss/crossentropy": 2.8643126487731934, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19319729506969452, |
| "loss/reg": 120.0730972290039, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.01225, |
| "grad_norm": 2.768183469772339, |
| "grad_norm_var": 0.07790408271164324, |
| "learning_rate": 0.0001, |
| "loss": 123.1131, |
| "loss/crossentropy": 3.1591904163360596, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23094454407691956, |
| "loss/reg": 119.72293853759766, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.012275, |
| "grad_norm": 2.5662379264831543, |
| "grad_norm_var": 0.0792095113967413, |
| "learning_rate": 0.0001, |
| "loss": 122.4756, |
| "loss/crossentropy": 2.9084203243255615, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19523289799690247, |
| "loss/reg": 119.37194061279297, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.0123, |
| "grad_norm": 2.7122464179992676, |
| "grad_norm_var": 0.07634484398728673, |
| "learning_rate": 0.0001, |
| "loss": 122.0971, |
| "loss/crossentropy": 2.8523452281951904, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22050230205059052, |
| "loss/reg": 119.02421569824219, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.012325, |
| "grad_norm": 2.6066133975982666, |
| "grad_norm_var": 0.0726872533289639, |
| "learning_rate": 0.0001, |
| "loss": 121.826, |
| "loss/crossentropy": 2.943951368331909, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2006131410598755, |
| "loss/reg": 118.68144989013672, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.01235, |
| "grad_norm": 2.752058506011963, |
| "grad_norm_var": 0.07236263717393664, |
| "learning_rate": 0.0001, |
| "loss": 121.3462, |
| "loss/crossentropy": 2.8118808269500732, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19854308664798737, |
| "loss/reg": 118.33576965332031, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.012375, |
| "grad_norm": 2.920879364013672, |
| "grad_norm_var": 0.07409949102501111, |
| "learning_rate": 0.0001, |
| "loss": 120.8552, |
| "loss/crossentropy": 2.6617374420166016, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19966919720172882, |
| "loss/reg": 117.99381256103516, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.0124, |
| "grad_norm": 2.7351551055908203, |
| "grad_norm_var": 0.07218718704733244, |
| "learning_rate": 0.0001, |
| "loss": 121.014, |
| "loss/crossentropy": 3.1586992740631104, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20681920647621155, |
| "loss/reg": 117.6484603881836, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.012425, |
| "grad_norm": 2.35911226272583, |
| "grad_norm_var": 0.05770549480844901, |
| "learning_rate": 0.0001, |
| "loss": 120.1416, |
| "loss/crossentropy": 2.6408603191375732, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19257068634033203, |
| "loss/reg": 117.30819702148438, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.01245, |
| "grad_norm": 2.738931655883789, |
| "grad_norm_var": 0.057565104717087694, |
| "learning_rate": 0.0001, |
| "loss": 119.9577, |
| "loss/crossentropy": 2.8233439922332764, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.17061106860637665, |
| "loss/reg": 116.96371459960938, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.012475, |
| "grad_norm": 3.1137208938598633, |
| "grad_norm_var": 0.03367133349540812, |
| "learning_rate": 0.0001, |
| "loss": 119.923, |
| "loss/crossentropy": 3.0651209354400635, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23511168360710144, |
| "loss/reg": 116.62273406982422, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.0125, |
| "grad_norm": 2.931697130203247, |
| "grad_norm_var": 0.03564747630760602, |
| "learning_rate": 0.0001, |
| "loss": 119.9291, |
| "loss/crossentropy": 3.406992197036743, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23726436495780945, |
| "loss/reg": 116.28487396240234, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.012525, |
| "grad_norm": 2.700242280960083, |
| "grad_norm_var": 0.035541163062665034, |
| "learning_rate": 0.0001, |
| "loss": 119.0308, |
| "loss/crossentropy": 2.8376407623291016, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24376104772090912, |
| "loss/reg": 115.94940948486328, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.01255, |
| "grad_norm": 2.5305213928222656, |
| "grad_norm_var": 0.03577823695906375, |
| "learning_rate": 0.0001, |
| "loss": 118.5299, |
| "loss/crossentropy": 2.7097437381744385, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21053358912467957, |
| "loss/reg": 115.60960388183594, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.012575, |
| "grad_norm": 2.849966526031494, |
| "grad_norm_var": 0.03723922016397386, |
| "learning_rate": 0.0001, |
| "loss": 118.188, |
| "loss/crossentropy": 2.718074083328247, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20146432518959045, |
| "loss/reg": 115.26842498779297, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.0126, |
| "grad_norm": 6.95852518081665, |
| "grad_norm_var": 1.1568663516812001, |
| "learning_rate": 0.0001, |
| "loss": 118.1133, |
| "loss/crossentropy": 2.935373306274414, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23765507340431213, |
| "loss/reg": 114.94031524658203, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.012625, |
| "grad_norm": 2.5411369800567627, |
| "grad_norm_var": 1.1549454537059127, |
| "learning_rate": 0.0001, |
| "loss": 117.2537, |
| "loss/crossentropy": 2.4439382553100586, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19584742188453674, |
| "loss/reg": 114.61389923095703, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.01265, |
| "grad_norm": 2.5562095642089844, |
| "grad_norm_var": 1.1639262533207444, |
| "learning_rate": 0.0001, |
| "loss": 117.1436, |
| "loss/crossentropy": 2.656878709793091, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20227234065532684, |
| "loss/reg": 114.28450012207031, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.012675, |
| "grad_norm": 2.452930212020874, |
| "grad_norm_var": 1.1708788671982782, |
| "learning_rate": 0.0001, |
| "loss": 116.7552, |
| "loss/crossentropy": 2.6194908618927, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18559934198856354, |
| "loss/reg": 113.95010375976562, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.0127, |
| "grad_norm": 2.600403308868408, |
| "grad_norm_var": 1.1754484294589225, |
| "learning_rate": 0.0001, |
| "loss": 116.878, |
| "loss/crossentropy": 3.047492504119873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21343189477920532, |
| "loss/reg": 113.61707305908203, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.012725, |
| "grad_norm": 2.8134710788726807, |
| "grad_norm_var": 1.1683965532079448, |
| "learning_rate": 0.0001, |
| "loss": 116.1436, |
| "loss/crossentropy": 2.6284308433532715, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22599661350250244, |
| "loss/reg": 113.28915405273438, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.01275, |
| "grad_norm": 3.0693750381469727, |
| "grad_norm_var": 1.1653763573131297, |
| "learning_rate": 0.0001, |
| "loss": 116.0373, |
| "loss/crossentropy": 2.8555266857147217, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21752798557281494, |
| "loss/reg": 112.96427154541016, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.012775, |
| "grad_norm": 3.6787264347076416, |
| "grad_norm_var": 1.1940838877816609, |
| "learning_rate": 0.0001, |
| "loss": 115.4654, |
| "loss/crossentropy": 2.5687825679779053, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2617979049682617, |
| "loss/reg": 112.63484191894531, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.0128, |
| "grad_norm": 2.7873494625091553, |
| "grad_norm_var": 1.192136957506784, |
| "learning_rate": 0.0001, |
| "loss": 115.1653, |
| "loss/crossentropy": 2.6631510257720947, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.191485196352005, |
| "loss/reg": 112.31067657470703, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.012825, |
| "grad_norm": 2.6256515979766846, |
| "grad_norm_var": 1.1722853783887095, |
| "learning_rate": 0.0001, |
| "loss": 114.9084, |
| "loss/crossentropy": 2.7325098514556885, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19808584451675415, |
| "loss/reg": 111.97781372070312, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.01285, |
| "grad_norm": 2.967942237854004, |
| "grad_norm_var": 1.1657807662503947, |
| "learning_rate": 0.0001, |
| "loss": 114.5064, |
| "loss/crossentropy": 2.6529791355133057, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20329859852790833, |
| "loss/reg": 111.650146484375, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.012875, |
| "grad_norm": 2.934296131134033, |
| "grad_norm_var": 1.166833422533542, |
| "learning_rate": 0.0001, |
| "loss": 114.5007, |
| "loss/crossentropy": 2.951188087463379, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22600007057189941, |
| "loss/reg": 111.32347869873047, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.0129, |
| "grad_norm": 2.6087300777435303, |
| "grad_norm_var": 1.178981137512823, |
| "learning_rate": 0.0001, |
| "loss": 114.303, |
| "loss/crossentropy": 3.0892579555511475, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.219014972448349, |
| "loss/reg": 110.99468994140625, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.012925, |
| "grad_norm": 2.6592049598693848, |
| "grad_norm_var": 1.1809575567663138, |
| "learning_rate": 0.0001, |
| "loss": 113.656, |
| "loss/crossentropy": 2.759434461593628, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23085039854049683, |
| "loss/reg": 110.66567993164062, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.01295, |
| "grad_norm": 2.7460856437683105, |
| "grad_norm_var": 1.169228407645687, |
| "learning_rate": 0.0001, |
| "loss": 113.58, |
| "loss/crossentropy": 3.019660472869873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22078245878219604, |
| "loss/reg": 110.3395767211914, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.012975, |
| "grad_norm": 2.9937329292297363, |
| "grad_norm_var": 1.1666258859232923, |
| "learning_rate": 0.0001, |
| "loss": 112.7048, |
| "loss/crossentropy": 2.477316379547119, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21349506080150604, |
| "loss/reg": 110.01399993896484, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.013, |
| "grad_norm": 2.8256266117095947, |
| "grad_norm_var": 0.0870475905341967, |
| "learning_rate": 0.0001, |
| "loss": 112.8148, |
| "loss/crossentropy": 2.904346466064453, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22682815790176392, |
| "loss/reg": 109.68363952636719, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.013025, |
| "grad_norm": 2.6826443672180176, |
| "grad_norm_var": 0.08334319224761823, |
| "learning_rate": 0.0001, |
| "loss": 112.4453, |
| "loss/crossentropy": 2.8740198612213135, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21089796721935272, |
| "loss/reg": 109.36034393310547, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.01305, |
| "grad_norm": 2.876612424850464, |
| "grad_norm_var": 0.07880413790800654, |
| "learning_rate": 0.0001, |
| "loss": 112.014, |
| "loss/crossentropy": 2.752030372619629, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22804582118988037, |
| "loss/reg": 109.03395080566406, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.013075, |
| "grad_norm": 2.6571719646453857, |
| "grad_norm_var": 0.07107003720365886, |
| "learning_rate": 0.0001, |
| "loss": 112.0514, |
| "loss/crossentropy": 3.134460926055908, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20280694961547852, |
| "loss/reg": 108.7141342163086, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.0131, |
| "grad_norm": 2.6310062408447266, |
| "grad_norm_var": 0.07012872943871476, |
| "learning_rate": 0.0001, |
| "loss": 111.5231, |
| "loss/crossentropy": 2.899681329727173, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2234225869178772, |
| "loss/reg": 108.39997863769531, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.013125, |
| "grad_norm": 2.8861429691314697, |
| "grad_norm_var": 0.07013051549444356, |
| "learning_rate": 0.0001, |
| "loss": 111.3517, |
| "loss/crossentropy": 3.016164779663086, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2486555576324463, |
| "loss/reg": 108.08689880371094, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.01315, |
| "grad_norm": 11.108626365661621, |
| "grad_norm_var": 4.342596426812534, |
| "learning_rate": 0.0001, |
| "loss": 111.424, |
| "loss/crossentropy": 3.3841826915740967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27035826444625854, |
| "loss/reg": 107.76946258544922, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.013175, |
| "grad_norm": 2.765962600708008, |
| "grad_norm_var": 4.355189952794281, |
| "learning_rate": 0.0001, |
| "loss": 110.3478, |
| "loss/crossentropy": 2.66182279586792, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2297273576259613, |
| "loss/reg": 107.45628356933594, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.0132, |
| "grad_norm": 3.090799570083618, |
| "grad_norm_var": 4.340312503643294, |
| "learning_rate": 0.0001, |
| "loss": 110.4536, |
| "loss/crossentropy": 3.0606637001037598, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2523239552974701, |
| "loss/reg": 107.14060974121094, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.013225, |
| "grad_norm": 2.63191819190979, |
| "grad_norm_var": 4.339737919036126, |
| "learning_rate": 0.0001, |
| "loss": 109.6669, |
| "loss/crossentropy": 2.6498641967773438, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1968306005001068, |
| "loss/reg": 106.8201904296875, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.01325, |
| "grad_norm": 2.5872347354888916, |
| "grad_norm_var": 4.366497639190141, |
| "learning_rate": 0.0001, |
| "loss": 109.3358, |
| "loss/crossentropy": 2.615196466445923, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21458838880062103, |
| "loss/reg": 106.50599670410156, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.013275, |
| "grad_norm": 3.557237148284912, |
| "grad_norm_var": 4.360969037365885, |
| "learning_rate": 0.0001, |
| "loss": 109.2467, |
| "loss/crossentropy": 2.8124053478240967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2458515763282776, |
| "loss/reg": 106.18840789794922, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.0133, |
| "grad_norm": 9.031005859375, |
| "grad_norm_var": 6.319656798143334, |
| "learning_rate": 0.0001, |
| "loss": 108.9254, |
| "loss/crossentropy": 2.764629602432251, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2858869433403015, |
| "loss/reg": 105.87490844726562, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.013325, |
| "grad_norm": 3.397695302963257, |
| "grad_norm_var": 6.247992121947124, |
| "learning_rate": 0.0001, |
| "loss": 108.913, |
| "loss/crossentropy": 3.053879737854004, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2985405921936035, |
| "loss/reg": 105.5605697631836, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.01335, |
| "grad_norm": 5.6299591064453125, |
| "grad_norm_var": 6.370482684906529, |
| "learning_rate": 0.0001, |
| "loss": 108.2695, |
| "loss/crossentropy": 2.793619155883789, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2356250137090683, |
| "loss/reg": 105.24028778076172, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.013375, |
| "grad_norm": 3.0469837188720703, |
| "grad_norm_var": 6.363802254153521, |
| "learning_rate": 0.0001, |
| "loss": 108.5512, |
| "loss/crossentropy": 3.3861045837402344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23634299635887146, |
| "loss/reg": 104.92872619628906, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.0134, |
| "grad_norm": 2.819078207015991, |
| "grad_norm_var": 6.364797923503401, |
| "learning_rate": 0.0001, |
| "loss": 107.8459, |
| "loss/crossentropy": 2.994354724884033, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23442190885543823, |
| "loss/reg": 104.6171646118164, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.013425, |
| "grad_norm": 5.84004545211792, |
| "grad_norm_var": 6.449067359728785, |
| "learning_rate": 0.0001, |
| "loss": 107.6525, |
| "loss/crossentropy": 3.0567657947540283, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2871782183647156, |
| "loss/reg": 104.30860137939453, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.01345, |
| "grad_norm": 2.5338470935821533, |
| "grad_norm_var": 6.515056601417896, |
| "learning_rate": 0.0001, |
| "loss": 106.7596, |
| "loss/crossentropy": 2.5571489334106445, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20038628578186035, |
| "loss/reg": 104.00208282470703, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.013475, |
| "grad_norm": 2.7171146869659424, |
| "grad_norm_var": 6.503442502818018, |
| "learning_rate": 0.0001, |
| "loss": 106.7376, |
| "loss/crossentropy": 2.8135974407196045, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22917550802230835, |
| "loss/reg": 103.69479370117188, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.0135, |
| "grad_norm": 6.479078769683838, |
| "grad_norm_var": 6.653581035332929, |
| "learning_rate": 0.0001, |
| "loss": 106.5785, |
| "loss/crossentropy": 2.8556628227233887, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.33066555857658386, |
| "loss/reg": 103.3922119140625, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.013525, |
| "grad_norm": 2.6559464931488037, |
| "grad_norm_var": 6.702825655017004, |
| "learning_rate": 0.0001, |
| "loss": 106.0303, |
| "loss/crossentropy": 2.7272911071777344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21066486835479736, |
| "loss/reg": 103.09235382080078, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.01355, |
| "grad_norm": 2.68630313873291, |
| "grad_norm_var": 3.5670498293655744, |
| "learning_rate": 0.0001, |
| "loss": 105.6865, |
| "loss/crossentropy": 2.6875898838043213, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20091715455055237, |
| "loss/reg": 102.79798126220703, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.013575, |
| "grad_norm": 2.6095407009124756, |
| "grad_norm_var": 3.59101884290791, |
| "learning_rate": 0.0001, |
| "loss": 105.3207, |
| "loss/crossentropy": 2.607323408126831, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20690517127513885, |
| "loss/reg": 102.50646209716797, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.0136, |
| "grad_norm": 2.6883742809295654, |
| "grad_norm_var": 3.6409168446953246, |
| "learning_rate": 0.0001, |
| "loss": 105.2911, |
| "loss/crossentropy": 2.8654534816741943, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21081852912902832, |
| "loss/reg": 102.21478271484375, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.013625, |
| "grad_norm": 2.5874383449554443, |
| "grad_norm_var": 3.648009256619466, |
| "learning_rate": 0.0001, |
| "loss": 104.9956, |
| "loss/crossentropy": 2.858520984649658, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21754832565784454, |
| "loss/reg": 101.91951751708984, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.01365, |
| "grad_norm": 2.54441499710083, |
| "grad_norm_var": 3.6550717570432996, |
| "learning_rate": 0.0001, |
| "loss": 104.3815, |
| "loss/crossentropy": 2.5437026023864746, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2128550112247467, |
| "loss/reg": 101.62492370605469, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.013675, |
| "grad_norm": 2.422410488128662, |
| "grad_norm_var": 3.7725212936238335, |
| "learning_rate": 0.0001, |
| "loss": 104.2026, |
| "loss/crossentropy": 2.678537607192993, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19898445904254913, |
| "loss/reg": 101.32511138916016, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.0137, |
| "grad_norm": 2.816225051879883, |
| "grad_norm_var": 1.7943565080708708, |
| "learning_rate": 0.0001, |
| "loss": 104.3567, |
| "loss/crossentropy": 3.10886549949646, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22006894648075104, |
| "loss/reg": 101.02771759033203, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.013725, |
| "grad_norm": 2.9122800827026367, |
| "grad_norm_var": 1.805488475198342, |
| "learning_rate": 0.0001, |
| "loss": 103.6283, |
| "loss/crossentropy": 2.6595304012298584, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23327180743217468, |
| "loss/reg": 100.73548126220703, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.01375, |
| "grad_norm": 2.715149402618408, |
| "grad_norm_var": 1.4355691908428239, |
| "learning_rate": 0.0001, |
| "loss": 103.5837, |
| "loss/crossentropy": 2.9111499786376953, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2255546748638153, |
| "loss/reg": 100.4470443725586, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.013775, |
| "grad_norm": 2.6600630283355713, |
| "grad_norm_var": 1.4491900778787985, |
| "learning_rate": 0.0001, |
| "loss": 102.9596, |
| "loss/crossentropy": 2.591811180114746, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2164371758699417, |
| "loss/reg": 100.15138244628906, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.0138, |
| "grad_norm": 2.811176061630249, |
| "grad_norm_var": 1.4494957147530347, |
| "learning_rate": 0.0001, |
| "loss": 102.7969, |
| "loss/crossentropy": 2.7154479026794434, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22633354365825653, |
| "loss/reg": 99.85511779785156, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.013825, |
| "grad_norm": 2.5404551029205322, |
| "grad_norm_var": 0.9266648578686081, |
| "learning_rate": 0.0001, |
| "loss": 102.5948, |
| "loss/crossentropy": 2.83707857131958, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19505436718463898, |
| "loss/reg": 99.56267547607422, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.01385, |
| "grad_norm": 3.074483871459961, |
| "grad_norm_var": 0.9186296960512579, |
| "learning_rate": 0.0001, |
| "loss": 102.4509, |
| "loss/crossentropy": 2.912346363067627, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2622039020061493, |
| "loss/reg": 99.27632904052734, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.013875, |
| "grad_norm": 2.686178207397461, |
| "grad_norm_var": 0.9195780649456933, |
| "learning_rate": 0.0001, |
| "loss": 102.0401, |
| "loss/crossentropy": 2.8341047763824463, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21929171681404114, |
| "loss/reg": 98.98668670654297, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.0139, |
| "grad_norm": 2.656536340713501, |
| "grad_norm_var": 0.024253446700591517, |
| "learning_rate": 0.0001, |
| "loss": 101.5869, |
| "loss/crossentropy": 2.6577670574188232, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23103003203868866, |
| "loss/reg": 98.69813537597656, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.013925, |
| "grad_norm": 2.5510590076446533, |
| "grad_norm_var": 0.025440849818042465, |
| "learning_rate": 0.0001, |
| "loss": 100.9329, |
| "loss/crossentropy": 2.326843738555908, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18946564197540283, |
| "loss/reg": 98.41661071777344, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.01395, |
| "grad_norm": 2.6039011478424072, |
| "grad_norm_var": 0.02585234669650897, |
| "learning_rate": 0.0001, |
| "loss": 101.0391, |
| "loss/crossentropy": 2.6925387382507324, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2089564949274063, |
| "loss/reg": 98.13763427734375, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.013975, |
| "grad_norm": 2.4973230361938477, |
| "grad_norm_var": 0.027693340503882762, |
| "learning_rate": 0.0001, |
| "loss": 100.7989, |
| "loss/crossentropy": 2.747386932373047, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2009141445159912, |
| "loss/reg": 97.85063934326172, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.014, |
| "grad_norm": 2.5563371181488037, |
| "grad_norm_var": 0.028511705384668556, |
| "learning_rate": 0.0001, |
| "loss": 100.641, |
| "loss/crossentropy": 2.8731026649475098, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20369890332221985, |
| "loss/reg": 97.56423950195312, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.014025, |
| "grad_norm": 2.81292724609375, |
| "grad_norm_var": 0.029366212464935634, |
| "learning_rate": 0.0001, |
| "loss": 100.1638, |
| "loss/crossentropy": 2.6780025959014893, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20065699517726898, |
| "loss/reg": 97.28515625, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.01405, |
| "grad_norm": 2.867063522338867, |
| "grad_norm_var": 0.03009105233082489, |
| "learning_rate": 0.0001, |
| "loss": 99.9774, |
| "loss/crossentropy": 2.7445759773254395, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2291974425315857, |
| "loss/reg": 97.00367736816406, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.014075, |
| "grad_norm": 2.4345862865448, |
| "grad_norm_var": 0.02965133530149539, |
| "learning_rate": 0.0001, |
| "loss": 99.6903, |
| "loss/crossentropy": 2.768172264099121, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19444233179092407, |
| "loss/reg": 96.72771453857422, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.0141, |
| "grad_norm": 2.8722984790802, |
| "grad_norm_var": 0.0307187897240811, |
| "learning_rate": 0.0001, |
| "loss": 99.6226, |
| "loss/crossentropy": 2.9429454803466797, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2194785326719284, |
| "loss/reg": 96.46016693115234, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.014125, |
| "grad_norm": 2.9882256984710693, |
| "grad_norm_var": 0.03319604425916699, |
| "learning_rate": 0.0001, |
| "loss": 99.6137, |
| "loss/crossentropy": 3.175783395767212, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24568451941013336, |
| "loss/reg": 96.19220733642578, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.01415, |
| "grad_norm": 2.822582721710205, |
| "grad_norm_var": 0.03402003702614896, |
| "learning_rate": 0.0001, |
| "loss": 99.1781, |
| "loss/crossentropy": 3.020627021789551, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2289256751537323, |
| "loss/reg": 95.92849731445312, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.014175, |
| "grad_norm": 2.7644307613372803, |
| "grad_norm_var": 0.03394051714071698, |
| "learning_rate": 0.0001, |
| "loss": 98.6117, |
| "loss/crossentropy": 2.714923858642578, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2449963539838791, |
| "loss/reg": 95.65177154541016, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.0142, |
| "grad_norm": 2.699716091156006, |
| "grad_norm_var": 0.03338014972604976, |
| "learning_rate": 0.0001, |
| "loss": 98.4869, |
| "loss/crossentropy": 2.897766351699829, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2104054093360901, |
| "loss/reg": 95.37874603271484, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.014225, |
| "grad_norm": 2.8579864501953125, |
| "grad_norm_var": 0.03232346391338676, |
| "learning_rate": 0.0001, |
| "loss": 98.2656, |
| "loss/crossentropy": 2.9202752113342285, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23916882276535034, |
| "loss/reg": 95.10614013671875, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.01425, |
| "grad_norm": 2.753019332885742, |
| "grad_norm_var": 0.024192763356738093, |
| "learning_rate": 0.0001, |
| "loss": 98.0069, |
| "loss/crossentropy": 2.944240093231201, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22491341829299927, |
| "loss/reg": 94.8377914428711, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.014275, |
| "grad_norm": 2.60459566116333, |
| "grad_norm_var": 0.02491149826441017, |
| "learning_rate": 0.0001, |
| "loss": 97.608, |
| "loss/crossentropy": 2.819348096847534, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22620517015457153, |
| "loss/reg": 94.56243133544922, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.0143, |
| "grad_norm": 2.870256185531616, |
| "grad_norm_var": 0.02627376883378929, |
| "learning_rate": 0.0001, |
| "loss": 97.2411, |
| "loss/crossentropy": 2.742431879043579, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21404124796390533, |
| "loss/reg": 94.28459167480469, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.014325, |
| "grad_norm": 2.3389241695404053, |
| "grad_norm_var": 0.033928965438431644, |
| "learning_rate": 0.0001, |
| "loss": 96.7127, |
| "loss/crossentropy": 2.517883777618408, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19041433930397034, |
| "loss/reg": 94.0043716430664, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.01435, |
| "grad_norm": 2.59843111038208, |
| "grad_norm_var": 0.034007496068778426, |
| "learning_rate": 0.0001, |
| "loss": 96.8607, |
| "loss/crossentropy": 2.9151978492736816, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21700671315193176, |
| "loss/reg": 93.72846221923828, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.014375, |
| "grad_norm": 3.3285093307495117, |
| "grad_norm_var": 0.05376453050990312, |
| "learning_rate": 0.0001, |
| "loss": 96.8123, |
| "loss/crossentropy": 3.1045849323272705, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2494984120130539, |
| "loss/reg": 93.45817565917969, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.0144, |
| "grad_norm": 2.6628782749176025, |
| "grad_norm_var": 0.0515720576900262, |
| "learning_rate": 0.0001, |
| "loss": 96.3792, |
| "loss/crossentropy": 2.955598831176758, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2415456622838974, |
| "loss/reg": 93.18202209472656, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.014425, |
| "grad_norm": 4.186471939086914, |
| "grad_norm_var": 0.1778464831949935, |
| "learning_rate": 0.0001, |
| "loss": 96.081, |
| "loss/crossentropy": 2.8772761821746826, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2912898063659668, |
| "loss/reg": 92.9124526977539, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.01445, |
| "grad_norm": 2.668438196182251, |
| "grad_norm_var": 0.17994305561740684, |
| "learning_rate": 0.0001, |
| "loss": 95.7919, |
| "loss/crossentropy": 2.9419538974761963, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21084845066070557, |
| "loss/reg": 92.63912200927734, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.014475, |
| "grad_norm": 2.549457311630249, |
| "grad_norm_var": 0.17454752775228427, |
| "learning_rate": 0.0001, |
| "loss": 95.1398, |
| "loss/crossentropy": 2.5472424030303955, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2210979163646698, |
| "loss/reg": 92.37150573730469, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.0145, |
| "grad_norm": 3.0102109909057617, |
| "grad_norm_var": 0.17618512136317932, |
| "learning_rate": 0.0001, |
| "loss": 95.1484, |
| "loss/crossentropy": 2.8119232654571533, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2312673032283783, |
| "loss/reg": 92.10520935058594, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.014525, |
| "grad_norm": 2.673635721206665, |
| "grad_norm_var": 0.17684562367797388, |
| "learning_rate": 0.0001, |
| "loss": 94.916, |
| "loss/crossentropy": 2.840714693069458, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24130380153656006, |
| "loss/reg": 91.83395385742188, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.01455, |
| "grad_norm": 2.600407361984253, |
| "grad_norm_var": 0.18035328363555816, |
| "learning_rate": 0.0001, |
| "loss": 94.4715, |
| "loss/crossentropy": 2.7048165798187256, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.202452152967453, |
| "loss/reg": 91.56422424316406, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.014575, |
| "grad_norm": 3.064331531524658, |
| "grad_norm_var": 0.18363414575108336, |
| "learning_rate": 0.0001, |
| "loss": 94.3897, |
| "loss/crossentropy": 2.871046304702759, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21794039011001587, |
| "loss/reg": 91.30074310302734, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.0146, |
| "grad_norm": 2.4271910190582275, |
| "grad_norm_var": 0.19343539696492276, |
| "learning_rate": 0.0001, |
| "loss": 94.3164, |
| "loss/crossentropy": 3.068648099899292, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20914003252983093, |
| "loss/reg": 91.03858947753906, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.014625, |
| "grad_norm": 2.773268699645996, |
| "grad_norm_var": 0.1935076502725669, |
| "learning_rate": 0.0001, |
| "loss": 93.7491, |
| "loss/crossentropy": 2.722926378250122, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24941931664943695, |
| "loss/reg": 90.77677154541016, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.01465, |
| "grad_norm": 2.473400592803955, |
| "grad_norm_var": 0.2008682828648034, |
| "learning_rate": 0.0001, |
| "loss": 93.6203, |
| "loss/crossentropy": 2.8903560638427734, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21103140711784363, |
| "loss/reg": 90.51893615722656, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.014675, |
| "grad_norm": 2.6214702129364014, |
| "grad_norm_var": 0.20044215566161913, |
| "learning_rate": 0.0001, |
| "loss": 93.2864, |
| "loss/crossentropy": 2.8173437118530273, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21141797304153442, |
| "loss/reg": 90.25767517089844, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.0147, |
| "grad_norm": 2.776019811630249, |
| "grad_norm_var": 0.20015155933538128, |
| "learning_rate": 0.0001, |
| "loss": 92.9381, |
| "loss/crossentropy": 2.7188355922698975, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21627959609031677, |
| "loss/reg": 90.00298309326172, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.014725, |
| "grad_norm": 2.63840389251709, |
| "grad_norm_var": 0.18746319834137826, |
| "learning_rate": 0.0001, |
| "loss": 92.7113, |
| "loss/crossentropy": 2.7562100887298584, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2082541435956955, |
| "loss/reg": 89.74687957763672, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.01475, |
| "grad_norm": 2.835721015930176, |
| "grad_norm_var": 0.1841056372587597, |
| "learning_rate": 0.0001, |
| "loss": 92.324, |
| "loss/crossentropy": 2.6325843334198, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20466585457324982, |
| "loss/reg": 89.4867935180664, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.014775, |
| "grad_norm": 2.687760829925537, |
| "grad_norm_var": 0.16722875087179906, |
| "learning_rate": 0.0001, |
| "loss": 92.2685, |
| "loss/crossentropy": 2.82861328125, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2116677463054657, |
| "loss/reg": 89.22821044921875, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.0148, |
| "grad_norm": 3.145275354385376, |
| "grad_norm_var": 0.17356006417378372, |
| "learning_rate": 0.0001, |
| "loss": 91.9296, |
| "loss/crossentropy": 2.7166497707366943, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24187356233596802, |
| "loss/reg": 88.9710693359375, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.014825, |
| "grad_norm": 2.6644723415374756, |
| "grad_norm_var": 0.04118301322724444, |
| "learning_rate": 0.0001, |
| "loss": 91.8205, |
| "loss/crossentropy": 2.891160249710083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2127176821231842, |
| "loss/reg": 88.71666717529297, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.01485, |
| "grad_norm": 2.5222256183624268, |
| "grad_norm_var": 0.043633350924598586, |
| "learning_rate": 0.0001, |
| "loss": 91.2693, |
| "loss/crossentropy": 2.588534355163574, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2142462432384491, |
| "loss/reg": 88.46653747558594, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.014875, |
| "grad_norm": 2.6689634323120117, |
| "grad_norm_var": 0.04186501943967471, |
| "learning_rate": 0.0001, |
| "loss": 91.1498, |
| "loss/crossentropy": 2.731534957885742, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20442616939544678, |
| "loss/reg": 88.2138442993164, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.0149, |
| "grad_norm": 2.733809471130371, |
| "grad_norm_var": 0.03608913512671942, |
| "learning_rate": 0.0001, |
| "loss": 91.2427, |
| "loss/crossentropy": 3.0743908882141113, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21326132118701935, |
| "loss/reg": 87.95504760742188, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.014925, |
| "grad_norm": 2.6387839317321777, |
| "grad_norm_var": 0.03631845228885571, |
| "learning_rate": 0.0001, |
| "loss": 90.743, |
| "loss/crossentropy": 2.8121533393859863, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22556063532829285, |
| "loss/reg": 87.70523834228516, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.01495, |
| "grad_norm": 2.6532163619995117, |
| "grad_norm_var": 0.035760032396463734, |
| "learning_rate": 0.0001, |
| "loss": 90.4519, |
| "loss/crossentropy": 2.7932353019714355, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20535191893577576, |
| "loss/reg": 87.45335388183594, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.014975, |
| "grad_norm": 2.998858690261841, |
| "grad_norm_var": 0.03291526795530686, |
| "learning_rate": 0.0001, |
| "loss": 90.3378, |
| "loss/crossentropy": 2.908860206604004, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22954756021499634, |
| "loss/reg": 87.19943237304688, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.015, |
| "grad_norm": 13.023781776428223, |
| "grad_norm_var": 6.660256756747305, |
| "learning_rate": 0.0001, |
| "loss": 90.0128, |
| "loss/crossentropy": 2.8209877014160156, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2434820979833603, |
| "loss/reg": 86.94837188720703, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.015025, |
| "grad_norm": 3.2308173179626465, |
| "grad_norm_var": 6.637182891814268, |
| "learning_rate": 0.0001, |
| "loss": 89.5445, |
| "loss/crossentropy": 2.5709166526794434, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27399659156799316, |
| "loss/reg": 86.69955444335938, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.01505, |
| "grad_norm": 2.7746541500091553, |
| "grad_norm_var": 6.605854606820831, |
| "learning_rate": 0.0001, |
| "loss": 89.6479, |
| "loss/crossentropy": 2.955522060394287, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2408815622329712, |
| "loss/reg": 86.45153045654297, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.015075, |
| "grad_norm": 2.534477472305298, |
| "grad_norm_var": 6.615513089198629, |
| "learning_rate": 0.0001, |
| "loss": 89.5647, |
| "loss/crossentropy": 3.1383554935455322, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22149553894996643, |
| "loss/reg": 86.2048110961914, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.0151, |
| "grad_norm": 2.6757757663726807, |
| "grad_norm_var": 6.624587476581763, |
| "learning_rate": 0.0001, |
| "loss": 88.9872, |
| "loss/crossentropy": 2.8011903762817383, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22577175498008728, |
| "loss/reg": 85.960205078125, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.015125, |
| "grad_norm": 2.5399699211120605, |
| "grad_norm_var": 6.635210790627498, |
| "learning_rate": 0.0001, |
| "loss": 88.6121, |
| "loss/crossentropy": 2.686098337173462, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21132361888885498, |
| "loss/reg": 85.71470642089844, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.01515, |
| "grad_norm": 3.094851016998291, |
| "grad_norm_var": 6.620065609075882, |
| "learning_rate": 0.0001, |
| "loss": 88.5017, |
| "loss/crossentropy": 2.7985479831695557, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23451220989227295, |
| "loss/reg": 85.46868896484375, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.015175, |
| "grad_norm": 2.8335752487182617, |
| "grad_norm_var": 6.607319105523461, |
| "learning_rate": 0.0001, |
| "loss": 88.5627, |
| "loss/crossentropy": 3.1135289669036865, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22703945636749268, |
| "loss/reg": 85.22212219238281, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.0152, |
| "grad_norm": 2.805544137954712, |
| "grad_norm_var": 6.627015267189299, |
| "learning_rate": 0.0001, |
| "loss": 88.0827, |
| "loss/crossentropy": 2.8752803802490234, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22529396414756775, |
| "loss/reg": 84.98210144042969, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.015225, |
| "grad_norm": 2.7001230716705322, |
| "grad_norm_var": 6.623600272248156, |
| "learning_rate": 0.0001, |
| "loss": 87.6396, |
| "loss/crossentropy": 2.6898162364959717, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21284155547618866, |
| "loss/reg": 84.73692321777344, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.01525, |
| "grad_norm": 3.2781596183776855, |
| "grad_norm_var": 6.570657725923031, |
| "learning_rate": 0.0001, |
| "loss": 87.7335, |
| "loss/crossentropy": 2.9826276302337646, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25785699486732483, |
| "loss/reg": 84.4930191040039, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.015275, |
| "grad_norm": 2.449453830718994, |
| "grad_norm_var": 6.596501814133948, |
| "learning_rate": 0.0001, |
| "loss": 87.1477, |
| "loss/crossentropy": 2.6783015727996826, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22048676013946533, |
| "loss/reg": 84.2489242553711, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.0153, |
| "grad_norm": 2.459174394607544, |
| "grad_norm_var": 6.62690543519524, |
| "learning_rate": 0.0001, |
| "loss": 86.776, |
| "loss/crossentropy": 2.574934244155884, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19860099256038666, |
| "loss/reg": 84.0024185180664, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.015325, |
| "grad_norm": 2.941655158996582, |
| "grad_norm_var": 6.601163552477004, |
| "learning_rate": 0.0001, |
| "loss": 86.9776, |
| "loss/crossentropy": 2.986813545227051, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23044732213020325, |
| "loss/reg": 83.76029968261719, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.01535, |
| "grad_norm": 2.6428897380828857, |
| "grad_norm_var": 6.602249575617103, |
| "learning_rate": 0.0001, |
| "loss": 86.4506, |
| "loss/crossentropy": 2.6974997520446777, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2369295060634613, |
| "loss/reg": 83.51612091064453, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.015375, |
| "grad_norm": 2.6571388244628906, |
| "grad_norm_var": 6.629487272361301, |
| "learning_rate": 0.0001, |
| "loss": 86.3883, |
| "loss/crossentropy": 2.886151075363159, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22736769914627075, |
| "loss/reg": 83.27473449707031, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.0154, |
| "grad_norm": 2.609266996383667, |
| "grad_norm_var": 0.06577351547558692, |
| "learning_rate": 0.0001, |
| "loss": 86.0519, |
| "loss/crossentropy": 2.794356346130371, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22279015183448792, |
| "loss/reg": 83.0347900390625, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.015425, |
| "grad_norm": 3.1104328632354736, |
| "grad_norm_var": 0.05918982306040803, |
| "learning_rate": 0.0001, |
| "loss": 85.9367, |
| "loss/crossentropy": 2.882596015930176, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25845298171043396, |
| "loss/reg": 82.79560852050781, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.01545, |
| "grad_norm": 2.470853090286255, |
| "grad_norm_var": 0.06423085419138465, |
| "learning_rate": 0.0001, |
| "loss": 85.5665, |
| "loss/crossentropy": 2.796099901199341, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.214504212141037, |
| "loss/reg": 82.55593872070312, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.015475, |
| "grad_norm": 2.714553117752075, |
| "grad_norm_var": 0.06137795437797967, |
| "learning_rate": 0.0001, |
| "loss": 85.3188, |
| "loss/crossentropy": 2.7812206745147705, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2175397127866745, |
| "loss/reg": 82.32003021240234, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.0155, |
| "grad_norm": 2.775290012359619, |
| "grad_norm_var": 0.061025800256583315, |
| "learning_rate": 0.0001, |
| "loss": 85.1917, |
| "loss/crossentropy": 2.8803231716156006, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22989241778850555, |
| "loss/reg": 82.0815200805664, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.015525, |
| "grad_norm": 3.064432382583618, |
| "grad_norm_var": 0.06316760074340261, |
| "learning_rate": 0.0001, |
| "loss": 84.9975, |
| "loss/crossentropy": 2.9176337718963623, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2374771535396576, |
| "loss/reg": 81.8424072265625, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.01555, |
| "grad_norm": 2.5047502517700195, |
| "grad_norm_var": 0.0607852310360857, |
| "learning_rate": 0.0001, |
| "loss": 84.4023, |
| "loss/crossentropy": 2.5925066471099854, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20711900293827057, |
| "loss/reg": 81.6026611328125, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.015575, |
| "grad_norm": 2.9647669792175293, |
| "grad_norm_var": 0.06330394741956387, |
| "learning_rate": 0.0001, |
| "loss": 84.5351, |
| "loss/crossentropy": 2.934492826461792, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23662039637565613, |
| "loss/reg": 81.36399841308594, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.0156, |
| "grad_norm": 2.6432931423187256, |
| "grad_norm_var": 0.06394843640099997, |
| "learning_rate": 0.0001, |
| "loss": 84.4146, |
| "loss/crossentropy": 3.0514731407165527, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22952324151992798, |
| "loss/reg": 81.13361358642578, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.015625, |
| "grad_norm": 2.54736065864563, |
| "grad_norm_var": 0.06640534283560531, |
| "learning_rate": 0.0001, |
| "loss": 83.7896, |
| "loss/crossentropy": 2.670376777648926, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21750324964523315, |
| "loss/reg": 80.90174865722656, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.01565, |
| "grad_norm": 2.8746328353881836, |
| "grad_norm_var": 0.047605595082000934, |
| "learning_rate": 0.0001, |
| "loss": 83.6016, |
| "loss/crossentropy": 2.705821990966797, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22585441172122955, |
| "loss/reg": 80.66989135742188, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.015675, |
| "grad_norm": 4.1663737297058105, |
| "grad_norm_var": 0.17119830661165428, |
| "learning_rate": 0.0001, |
| "loss": 83.7693, |
| "loss/crossentropy": 3.0719547271728516, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25689268112182617, |
| "loss/reg": 80.44044494628906, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.0157, |
| "grad_norm": 2.9214797019958496, |
| "grad_norm_var": 0.16221115285843457, |
| "learning_rate": 0.0001, |
| "loss": 83.4368, |
| "loss/crossentropy": 2.9782841205596924, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24059516191482544, |
| "loss/reg": 80.21793365478516, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.015725, |
| "grad_norm": 2.3547921180725098, |
| "grad_norm_var": 0.17660964070513122, |
| "learning_rate": 0.0001, |
| "loss": 82.9563, |
| "loss/crossentropy": 2.759288787841797, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2049524188041687, |
| "loss/reg": 79.99205780029297, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.01575, |
| "grad_norm": 2.5864202976226807, |
| "grad_norm_var": 0.17809647704259438, |
| "learning_rate": 0.0001, |
| "loss": 82.856, |
| "loss/crossentropy": 2.891166925430298, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19927763938903809, |
| "loss/reg": 79.76554870605469, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.015775, |
| "grad_norm": 2.579041004180908, |
| "grad_norm_var": 0.1800732301463995, |
| "learning_rate": 0.0001, |
| "loss": 82.3371, |
| "loss/crossentropy": 2.578899621963501, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21475175023078918, |
| "loss/reg": 79.54348754882812, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.0158, |
| "grad_norm": 2.8605313301086426, |
| "grad_norm_var": 0.17744545594942238, |
| "learning_rate": 0.0001, |
| "loss": 82.2226, |
| "loss/crossentropy": 2.6955201625823975, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20546796917915344, |
| "loss/reg": 79.32160186767578, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.015825, |
| "grad_norm": 2.458681344985962, |
| "grad_norm_var": 0.1788587470198704, |
| "learning_rate": 0.0001, |
| "loss": 81.9029, |
| "loss/crossentropy": 2.6130025386810303, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19218957424163818, |
| "loss/reg": 79.09767150878906, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.01585, |
| "grad_norm": 2.6389007568359375, |
| "grad_norm_var": 0.1736867369098557, |
| "learning_rate": 0.0001, |
| "loss": 82.1122, |
| "loss/crossentropy": 2.994948148727417, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24412985146045685, |
| "loss/reg": 78.87307739257812, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.015875, |
| "grad_norm": 2.7426064014434814, |
| "grad_norm_var": 0.17345014249303006, |
| "learning_rate": 0.0001, |
| "loss": 81.7675, |
| "loss/crossentropy": 2.9031403064727783, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2145020067691803, |
| "loss/reg": 78.64982604980469, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.0159, |
| "grad_norm": 2.5872960090637207, |
| "grad_norm_var": 0.176095637618936, |
| "learning_rate": 0.0001, |
| "loss": 81.2096, |
| "loss/crossentropy": 2.5721232891082764, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20824509859085083, |
| "loss/reg": 78.42925262451172, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.015925, |
| "grad_norm": 2.639803886413574, |
| "grad_norm_var": 0.17131557533067104, |
| "learning_rate": 0.0001, |
| "loss": 81.7399, |
| "loss/crossentropy": 3.311671018600464, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2240554839372635, |
| "loss/reg": 78.20418548583984, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.01595, |
| "grad_norm": 2.7064106464385986, |
| "grad_norm_var": 0.1671441066181302, |
| "learning_rate": 0.0001, |
| "loss": 81.0741, |
| "loss/crossentropy": 2.8790698051452637, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21729126572608948, |
| "loss/reg": 77.97772979736328, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.015975, |
| "grad_norm": 2.6912827491760254, |
| "grad_norm_var": 0.16460811219505161, |
| "learning_rate": 0.0001, |
| "loss": 81.0138, |
| "loss/crossentropy": 3.024972915649414, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2377013862133026, |
| "loss/reg": 77.75112915039062, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 2.8207814693450928, |
| "grad_norm_var": 0.16405338147699144, |
| "learning_rate": 0.0001, |
| "loss": 80.6592, |
| "loss/crossentropy": 2.9033145904541016, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23379957675933838, |
| "loss/reg": 77.52207946777344, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.016025, |
| "grad_norm": 2.738354444503784, |
| "grad_norm_var": 0.16089216214350396, |
| "learning_rate": 0.0001, |
| "loss": 80.3146, |
| "loss/crossentropy": 2.778174877166748, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24535414576530457, |
| "loss/reg": 77.29109954833984, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.01605, |
| "grad_norm": 3.2551162242889404, |
| "grad_norm_var": 0.17509802330359298, |
| "learning_rate": 0.0001, |
| "loss": 80.2895, |
| "loss/crossentropy": 2.9776339530944824, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24247264862060547, |
| "loss/reg": 77.0693588256836, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.016075, |
| "grad_norm": 2.881504535675049, |
| "grad_norm_var": 0.043638895164599255, |
| "learning_rate": 0.0001, |
| "loss": 80.1407, |
| "loss/crossentropy": 3.0473055839538574, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24725341796875, |
| "loss/reg": 76.84616088867188, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.0161, |
| "grad_norm": 2.992424249649048, |
| "grad_norm_var": 0.045893014160188275, |
| "learning_rate": 0.0001, |
| "loss": 79.5319, |
| "loss/crossentropy": 2.674531936645508, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23270609974861145, |
| "loss/reg": 76.62467956542969, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.016125, |
| "grad_norm": 2.4216911792755127, |
| "grad_norm_var": 0.04290734773771661, |
| "learning_rate": 0.0001, |
| "loss": 79.4997, |
| "loss/crossentropy": 2.8802490234375, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21543535590171814, |
| "loss/reg": 76.40399932861328, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.01615, |
| "grad_norm": 2.5457189083099365, |
| "grad_norm_var": 0.043763224077495264, |
| "learning_rate": 0.0001, |
| "loss": 79.235, |
| "loss/crossentropy": 2.8406364917755127, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20543472468852997, |
| "loss/reg": 76.18891143798828, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.016175, |
| "grad_norm": 2.762531280517578, |
| "grad_norm_var": 0.04235751730274515, |
| "learning_rate": 0.0001, |
| "loss": 78.9388, |
| "loss/crossentropy": 2.7533273696899414, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21493667364120483, |
| "loss/reg": 75.97050476074219, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.0162, |
| "grad_norm": 2.390070676803589, |
| "grad_norm_var": 0.048252346296710394, |
| "learning_rate": 0.0001, |
| "loss": 78.7404, |
| "loss/crossentropy": 2.7817790508270264, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21161305904388428, |
| "loss/reg": 75.74702453613281, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.016225, |
| "grad_norm": 2.534242868423462, |
| "grad_norm_var": 0.046131862120249896, |
| "learning_rate": 0.0001, |
| "loss": 78.2552, |
| "loss/crossentropy": 2.5109705924987793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20796047151088715, |
| "loss/reg": 75.53626251220703, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.01625, |
| "grad_norm": 2.9576361179351807, |
| "grad_norm_var": 0.04948971532782949, |
| "learning_rate": 0.0001, |
| "loss": 78.448, |
| "loss/crossentropy": 2.9173483848571777, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20767123997211456, |
| "loss/reg": 75.32296752929688, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.016275, |
| "grad_norm": 3.188835620880127, |
| "grad_norm_var": 0.06273138119426373, |
| "learning_rate": 0.0001, |
| "loss": 78.3685, |
| "loss/crossentropy": 3.024512529373169, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23480483889579773, |
| "loss/reg": 75.10919189453125, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.0163, |
| "grad_norm": 2.9349732398986816, |
| "grad_norm_var": 0.06241445749091478, |
| "learning_rate": 0.0001, |
| "loss": 78.1327, |
| "loss/crossentropy": 3.004549264907837, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2360680103302002, |
| "loss/reg": 74.8920669555664, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.016325, |
| "grad_norm": 2.927232265472412, |
| "grad_norm_var": 0.06224965786214606, |
| "learning_rate": 0.0001, |
| "loss": 77.718, |
| "loss/crossentropy": 2.8078725337982178, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22627294063568115, |
| "loss/reg": 74.68383026123047, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.01635, |
| "grad_norm": 2.6294076442718506, |
| "grad_norm_var": 0.06354828695962549, |
| "learning_rate": 0.0001, |
| "loss": 77.6217, |
| "loss/crossentropy": 2.906186580657959, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24207134544849396, |
| "loss/reg": 74.47340393066406, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.016375, |
| "grad_norm": 2.4185891151428223, |
| "grad_norm_var": 0.07185744774492757, |
| "learning_rate": 0.0001, |
| "loss": 77.1087, |
| "loss/crossentropy": 2.6219069957733154, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21973133087158203, |
| "loss/reg": 74.26707458496094, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.0164, |
| "grad_norm": 3.0168731212615967, |
| "grad_norm_var": 0.07545913020925733, |
| "learning_rate": 0.0001, |
| "loss": 77.1465, |
| "loss/crossentropy": 2.838118076324463, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24065163731575012, |
| "loss/reg": 74.0677719116211, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.016425, |
| "grad_norm": 2.7818074226379395, |
| "grad_norm_var": 0.07529414177001831, |
| "learning_rate": 0.0001, |
| "loss": 76.9287, |
| "loss/crossentropy": 2.8338921070098877, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22610336542129517, |
| "loss/reg": 73.86873626708984, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.01645, |
| "grad_norm": 2.854801654815674, |
| "grad_norm_var": 0.06047968099176065, |
| "learning_rate": 0.0001, |
| "loss": 76.6613, |
| "loss/crossentropy": 2.7867751121520996, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.209943026304245, |
| "loss/reg": 73.6645736694336, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.016475, |
| "grad_norm": 2.9934234619140625, |
| "grad_norm_var": 0.063002636345387, |
| "learning_rate": 0.0001, |
| "loss": 76.8956, |
| "loss/crossentropy": 3.177863359451294, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24757607281208038, |
| "loss/reg": 73.47013854980469, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.0165, |
| "grad_norm": 2.6304805278778076, |
| "grad_norm_var": 0.06054759846060639, |
| "learning_rate": 0.0001, |
| "loss": 76.1319, |
| "loss/crossentropy": 2.6409659385681152, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21888282895088196, |
| "loss/reg": 73.27206420898438, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.016525, |
| "grad_norm": 2.7408792972564697, |
| "grad_norm_var": 0.05297394175892407, |
| "learning_rate": 0.0001, |
| "loss": 76.4855, |
| "loss/crossentropy": 3.1915743350982666, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2194949984550476, |
| "loss/reg": 73.07437896728516, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.01655, |
| "grad_norm": 2.7700512409210205, |
| "grad_norm_var": 0.04943414917226316, |
| "learning_rate": 0.0001, |
| "loss": 75.886, |
| "loss/crossentropy": 2.7812137603759766, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2344488501548767, |
| "loss/reg": 72.8703384399414, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.016575, |
| "grad_norm": 2.6418073177337646, |
| "grad_norm_var": 0.050678375391551594, |
| "learning_rate": 0.0001, |
| "loss": 75.6395, |
| "loss/crossentropy": 2.7417471408843994, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2233429104089737, |
| "loss/reg": 72.67445373535156, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.0166, |
| "grad_norm": 3.158125162124634, |
| "grad_norm_var": 0.04805692783096613, |
| "learning_rate": 0.0001, |
| "loss": 75.6306, |
| "loss/crossentropy": 2.9241199493408203, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22319327294826508, |
| "loss/reg": 72.48330688476562, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.016625, |
| "grad_norm": 2.736354351043701, |
| "grad_norm_var": 0.042809702674100926, |
| "learning_rate": 0.0001, |
| "loss": 75.3328, |
| "loss/crossentropy": 2.8228261470794678, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22843888401985168, |
| "loss/reg": 72.28150939941406, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.01665, |
| "grad_norm": 2.7677245140075684, |
| "grad_norm_var": 0.04199219130631846, |
| "learning_rate": 0.0001, |
| "loss": 74.9261, |
| "loss/crossentropy": 2.627197027206421, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22012290358543396, |
| "loss/reg": 72.07875061035156, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.016675, |
| "grad_norm": 2.5004570484161377, |
| "grad_norm_var": 0.03816500903700728, |
| "learning_rate": 0.0001, |
| "loss": 74.777, |
| "loss/crossentropy": 2.672111988067627, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22588777542114258, |
| "loss/reg": 71.87902069091797, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.0167, |
| "grad_norm": 2.466562509536743, |
| "grad_norm_var": 0.042288959656074356, |
| "learning_rate": 0.0001, |
| "loss": 74.7091, |
| "loss/crossentropy": 2.8123865127563477, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21958181262016296, |
| "loss/reg": 71.6771469116211, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.016725, |
| "grad_norm": 2.751133918762207, |
| "grad_norm_var": 0.04011649012775607, |
| "learning_rate": 0.0001, |
| "loss": 74.4675, |
| "loss/crossentropy": 2.754987955093384, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23784713447093964, |
| "loss/reg": 71.4747085571289, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.01675, |
| "grad_norm": 2.6867825984954834, |
| "grad_norm_var": 0.03946736718336652, |
| "learning_rate": 0.0001, |
| "loss": 74.5895, |
| "loss/crossentropy": 3.101982593536377, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21628157794475555, |
| "loss/reg": 71.27123260498047, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.016775, |
| "grad_norm": 2.500910997390747, |
| "grad_norm_var": 0.03631099988891246, |
| "learning_rate": 0.0001, |
| "loss": 73.8414, |
| "loss/crossentropy": 2.5617518424987793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20989301800727844, |
| "loss/reg": 71.06979370117188, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.0168, |
| "grad_norm": 2.473396062850952, |
| "grad_norm_var": 0.03542460303709258, |
| "learning_rate": 0.0001, |
| "loss": 73.9086, |
| "loss/crossentropy": 2.824228286743164, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2157234251499176, |
| "loss/reg": 70.86860656738281, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.016825, |
| "grad_norm": 2.8463189601898193, |
| "grad_norm_var": 0.036251456664882414, |
| "learning_rate": 0.0001, |
| "loss": 73.6464, |
| "loss/crossentropy": 2.7426350116729736, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23804283142089844, |
| "loss/reg": 70.66572570800781, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.01685, |
| "grad_norm": 2.4676475524902344, |
| "grad_norm_var": 0.03865839021441365, |
| "learning_rate": 0.0001, |
| "loss": 73.4992, |
| "loss/crossentropy": 2.813382863998413, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21776646375656128, |
| "loss/reg": 70.46805572509766, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.016875, |
| "grad_norm": 3.2696657180786133, |
| "grad_norm_var": 0.054391622405608804, |
| "learning_rate": 0.0001, |
| "loss": 73.4994, |
| "loss/crossentropy": 2.9931130409240723, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2343236804008484, |
| "loss/reg": 70.27191925048828, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.0169, |
| "grad_norm": 2.960261344909668, |
| "grad_norm_var": 0.05755957057574411, |
| "learning_rate": 0.0001, |
| "loss": 73.1637, |
| "loss/crossentropy": 2.845167398452759, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24311169981956482, |
| "loss/reg": 70.0754165649414, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.016925, |
| "grad_norm": 2.8058366775512695, |
| "grad_norm_var": 0.057886073712972795, |
| "learning_rate": 0.0001, |
| "loss": 72.9262, |
| "loss/crossentropy": 2.7900030612945557, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2522013783454895, |
| "loss/reg": 69.8840103149414, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.01695, |
| "grad_norm": 2.6820642948150635, |
| "grad_norm_var": 0.05799027827774973, |
| "learning_rate": 0.0001, |
| "loss": 72.9552, |
| "loss/crossentropy": 3.0160844326019287, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25032639503479004, |
| "loss/reg": 69.68882751464844, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.016975, |
| "grad_norm": 2.756159782409668, |
| "grad_norm_var": 0.05742948572952381, |
| "learning_rate": 0.0001, |
| "loss": 72.4603, |
| "loss/crossentropy": 2.7341785430908203, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23339924216270447, |
| "loss/reg": 69.49270629882812, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.017, |
| "grad_norm": 2.85196852684021, |
| "grad_norm_var": 0.04619244950055569, |
| "learning_rate": 0.0001, |
| "loss": 72.4025, |
| "loss/crossentropy": 2.8541834354400635, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2513972520828247, |
| "loss/reg": 69.29695129394531, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.017025, |
| "grad_norm": 2.745920419692993, |
| "grad_norm_var": 0.046218769763096884, |
| "learning_rate": 0.0001, |
| "loss": 72.2013, |
| "loss/crossentropy": 2.8387668132781982, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2571982443332672, |
| "loss/reg": 69.1053237915039, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.01705, |
| "grad_norm": 2.7895922660827637, |
| "grad_norm_var": 0.046385473001735275, |
| "learning_rate": 0.0001, |
| "loss": 72.1027, |
| "loss/crossentropy": 2.9553239345550537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23462611436843872, |
| "loss/reg": 68.9127426147461, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.017075, |
| "grad_norm": 3.0583336353302, |
| "grad_norm_var": 0.049345512699435545, |
| "learning_rate": 0.0001, |
| "loss": 71.6065, |
| "loss/crossentropy": 2.6561527252197266, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22944068908691406, |
| "loss/reg": 68.72093963623047, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.0171, |
| "grad_norm": 10.506353378295898, |
| "grad_norm_var": 3.7778572455504134, |
| "learning_rate": 0.0001, |
| "loss": 71.4093, |
| "loss/crossentropy": 2.677732467651367, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20256610214710236, |
| "loss/reg": 68.52899169921875, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.017125, |
| "grad_norm": 2.7688822746276855, |
| "grad_norm_var": 3.77667386049867, |
| "learning_rate": 0.0001, |
| "loss": 71.4833, |
| "loss/crossentropy": 2.910447359085083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23493064939975739, |
| "loss/reg": 68.33790588378906, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.01715, |
| "grad_norm": 2.6812407970428467, |
| "grad_norm_var": 3.77709980042232, |
| "learning_rate": 0.0001, |
| "loss": 70.9918, |
| "loss/crossentropy": 2.609196424484253, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23388031125068665, |
| "loss/reg": 68.14875030517578, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.017175, |
| "grad_norm": 2.8212389945983887, |
| "grad_norm_var": 3.7510797794332498, |
| "learning_rate": 0.0001, |
| "loss": 71.0687, |
| "loss/crossentropy": 2.8833634853363037, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22528675198554993, |
| "loss/reg": 67.9600830078125, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.0172, |
| "grad_norm": 2.6118838787078857, |
| "grad_norm_var": 3.737378850831258, |
| "learning_rate": 0.0001, |
| "loss": 70.4259, |
| "loss/crossentropy": 2.438934564590454, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21688072383403778, |
| "loss/reg": 67.7700424194336, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.017225, |
| "grad_norm": 2.8803842067718506, |
| "grad_norm_var": 3.7354408858260792, |
| "learning_rate": 0.0001, |
| "loss": 70.6865, |
| "loss/crossentropy": 2.8623788356781006, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23739641904830933, |
| "loss/reg": 67.58670043945312, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.01725, |
| "grad_norm": 2.9390385150909424, |
| "grad_norm_var": 3.697573889963882, |
| "learning_rate": 0.0001, |
| "loss": 70.1731, |
| "loss/crossentropy": 2.5417747497558594, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2275683581829071, |
| "loss/reg": 67.40377044677734, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.017275, |
| "grad_norm": 3.4491701126098633, |
| "grad_norm_var": 3.6983698569325107, |
| "learning_rate": 0.0001, |
| "loss": 70.0581, |
| "loss/crossentropy": 2.6056199073791504, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23559321463108063, |
| "loss/reg": 67.2168960571289, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.0173, |
| "grad_norm": 3.0338120460510254, |
| "grad_norm_var": 3.695064661679089, |
| "learning_rate": 0.0001, |
| "loss": 70.0615, |
| "loss/crossentropy": 2.7806742191314697, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24799373745918274, |
| "loss/reg": 67.03279876708984, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.017325, |
| "grad_norm": 2.7937796115875244, |
| "grad_norm_var": 3.6959266334784027, |
| "learning_rate": 0.0001, |
| "loss": 69.9194, |
| "loss/crossentropy": 2.838820219039917, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22827139496803284, |
| "loss/reg": 66.85226440429688, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.01735, |
| "grad_norm": 2.4988961219787598, |
| "grad_norm_var": 3.713984810158103, |
| "learning_rate": 0.0001, |
| "loss": 69.5952, |
| "loss/crossentropy": 2.718376398086548, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20215165615081787, |
| "loss/reg": 66.67467498779297, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.017375, |
| "grad_norm": 2.665208339691162, |
| "grad_norm_var": 3.721389950709211, |
| "learning_rate": 0.0001, |
| "loss": 69.5769, |
| "loss/crossentropy": 2.8547093868255615, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2223222255706787, |
| "loss/reg": 66.49986267089844, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.0174, |
| "grad_norm": 2.7634010314941406, |
| "grad_norm_var": 3.7273892640509603, |
| "learning_rate": 0.0001, |
| "loss": 69.3502, |
| "loss/crossentropy": 2.8282363414764404, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2077331691980362, |
| "loss/reg": 66.31417846679688, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.017425, |
| "grad_norm": 3.8694913387298584, |
| "grad_norm_var": 3.7213441994990677, |
| "learning_rate": 0.0001, |
| "loss": 70.1241, |
| "loss/crossentropy": 3.720386266708374, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2679758071899414, |
| "loss/reg": 66.13578796386719, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.01745, |
| "grad_norm": 5.2169013023376465, |
| "grad_norm_var": 3.897477580961625, |
| "learning_rate": 0.0001, |
| "loss": 69.5964, |
| "loss/crossentropy": 3.217229127883911, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.42148905992507935, |
| "loss/reg": 65.95770263671875, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.017475, |
| "grad_norm": 3.6213738918304443, |
| "grad_norm_var": 3.8815159738632965, |
| "learning_rate": 0.0001, |
| "loss": 68.5239, |
| "loss/crossentropy": 2.502047061920166, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24625252187252045, |
| "loss/reg": 65.7756118774414, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.0175, |
| "grad_norm": 2.716905355453491, |
| "grad_norm_var": 0.46975474422527747, |
| "learning_rate": 0.0001, |
| "loss": 68.5619, |
| "loss/crossentropy": 2.764882802963257, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20099103450775146, |
| "loss/reg": 65.5960693359375, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.017525, |
| "grad_norm": 2.6961023807525635, |
| "grad_norm_var": 0.47313618338585167, |
| "learning_rate": 0.0001, |
| "loss": 68.6203, |
| "loss/crossentropy": 2.978670358657837, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2190450131893158, |
| "loss/reg": 65.42259216308594, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.01755, |
| "grad_norm": 2.7581896781921387, |
| "grad_norm_var": 0.46942862049069445, |
| "learning_rate": 0.0001, |
| "loss": 68.2458, |
| "loss/crossentropy": 2.773253917694092, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2290889024734497, |
| "loss/reg": 65.24346160888672, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.017575, |
| "grad_norm": 2.856942653656006, |
| "grad_norm_var": 0.468259868516004, |
| "learning_rate": 0.0001, |
| "loss": 68.2141, |
| "loss/crossentropy": 2.897153854370117, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24900856614112854, |
| "loss/reg": 65.06795501708984, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.0176, |
| "grad_norm": 2.5926945209503174, |
| "grad_norm_var": 0.469495224772921, |
| "learning_rate": 0.0001, |
| "loss": 67.9397, |
| "loss/crossentropy": 2.8245160579681396, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2201787531375885, |
| "loss/reg": 64.8949966430664, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.017625, |
| "grad_norm": 2.5850887298583984, |
| "grad_norm_var": 0.48298250086362465, |
| "learning_rate": 0.0001, |
| "loss": 67.705, |
| "loss/crossentropy": 2.749037265777588, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2325897514820099, |
| "loss/reg": 64.72339630126953, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.01765, |
| "grad_norm": 2.7953805923461914, |
| "grad_norm_var": 0.486705412463463, |
| "learning_rate": 0.0001, |
| "loss": 68.0719, |
| "loss/crossentropy": 3.281177282333374, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23989194631576538, |
| "loss/reg": 64.55084991455078, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.017675, |
| "grad_norm": 2.8562774658203125, |
| "grad_norm_var": 0.4776801572940296, |
| "learning_rate": 0.0001, |
| "loss": 67.7388, |
| "loss/crossentropy": 3.113478899002075, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2437664419412613, |
| "loss/reg": 64.38153839111328, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.0177, |
| "grad_norm": 2.6234002113342285, |
| "grad_norm_var": 0.4874532296205454, |
| "learning_rate": 0.0001, |
| "loss": 67.1132, |
| "loss/crossentropy": 2.668200731277466, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2298266887664795, |
| "loss/reg": 64.21515655517578, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.017725, |
| "grad_norm": 3.07820725440979, |
| "grad_norm_var": 0.48490202715237274, |
| "learning_rate": 0.0001, |
| "loss": 67.0545, |
| "loss/crossentropy": 2.7720115184783936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24164924025535583, |
| "loss/reg": 64.04081726074219, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.01775, |
| "grad_norm": 3.3483076095581055, |
| "grad_norm_var": 0.4718668398271717, |
| "learning_rate": 0.0001, |
| "loss": 67.0299, |
| "loss/crossentropy": 2.9127249717712402, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24779383838176727, |
| "loss/reg": 63.869354248046875, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.017775, |
| "grad_norm": 2.7884206771850586, |
| "grad_norm_var": 0.46624379181974973, |
| "learning_rate": 0.0001, |
| "loss": 66.733, |
| "loss/crossentropy": 2.7825210094451904, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2500211000442505, |
| "loss/reg": 63.70044708251953, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.0178, |
| "grad_norm": 2.7660884857177734, |
| "grad_norm_var": 0.4661333259783419, |
| "learning_rate": 0.0001, |
| "loss": 66.8389, |
| "loss/crossentropy": 3.069040298461914, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24203762412071228, |
| "loss/reg": 63.52777862548828, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.017825, |
| "grad_norm": 2.770434617996216, |
| "grad_norm_var": 0.42492635693953673, |
| "learning_rate": 0.0001, |
| "loss": 66.559, |
| "loss/crossentropy": 2.9704928398132324, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23337113857269287, |
| "loss/reg": 63.3551139831543, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.01785, |
| "grad_norm": 2.4780144691467285, |
| "grad_norm_var": 0.08580528270122821, |
| "learning_rate": 0.0001, |
| "loss": 66.2605, |
| "loss/crossentropy": 2.8575258255004883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2185363471508026, |
| "loss/reg": 63.184391021728516, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.017875, |
| "grad_norm": 2.620495080947876, |
| "grad_norm_var": 0.043238218869502514, |
| "learning_rate": 0.0001, |
| "loss": 65.9609, |
| "loss/crossentropy": 2.7202653884887695, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22694343328475952, |
| "loss/reg": 63.01369094848633, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.0179, |
| "grad_norm": 2.542977809906006, |
| "grad_norm_var": 0.04637604671543893, |
| "learning_rate": 0.0001, |
| "loss": 65.9499, |
| "loss/crossentropy": 2.8846218585968018, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21847984194755554, |
| "loss/reg": 62.84674835205078, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.017925, |
| "grad_norm": 2.3846898078918457, |
| "grad_norm_var": 0.05508256728689957, |
| "learning_rate": 0.0001, |
| "loss": 65.4993, |
| "loss/crossentropy": 2.6143085956573486, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20310330390930176, |
| "loss/reg": 62.681915283203125, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.01795, |
| "grad_norm": 2.73012375831604, |
| "grad_norm_var": 0.05506504227467772, |
| "learning_rate": 0.0001, |
| "loss": 65.5976, |
| "loss/crossentropy": 2.8506314754486084, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22879712283611298, |
| "loss/reg": 62.518165588378906, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.017975, |
| "grad_norm": 2.4760758876800537, |
| "grad_norm_var": 0.05812137756337279, |
| "learning_rate": 0.0001, |
| "loss": 65.5302, |
| "loss/crossentropy": 2.9573020935058594, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2235342562198639, |
| "loss/reg": 62.34933090209961, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.018, |
| "grad_norm": 2.9984333515167236, |
| "grad_norm_var": 0.06180506886074681, |
| "learning_rate": 0.0001, |
| "loss": 65.2478, |
| "loss/crossentropy": 2.825148344039917, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24277126789093018, |
| "loss/reg": 62.17986297607422, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.018025, |
| "grad_norm": 2.449723243713379, |
| "grad_norm_var": 0.06574898435085288, |
| "learning_rate": 0.0001, |
| "loss": 64.8758, |
| "loss/crossentropy": 2.6417102813720703, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22177964448928833, |
| "loss/reg": 62.012306213378906, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.01805, |
| "grad_norm": 2.6764326095581055, |
| "grad_norm_var": 0.06562316783263214, |
| "learning_rate": 0.0001, |
| "loss": 64.9029, |
| "loss/crossentropy": 2.828677177429199, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23033303022384644, |
| "loss/reg": 61.84387969970703, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.018075, |
| "grad_norm": 2.8159947395324707, |
| "grad_norm_var": 0.06501549731222649, |
| "learning_rate": 0.0001, |
| "loss": 65.0769, |
| "loss/crossentropy": 3.174980878829956, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22877538204193115, |
| "loss/reg": 61.67316818237305, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.0181, |
| "grad_norm": 3.2005398273468018, |
| "grad_norm_var": 0.07826629049727458, |
| "learning_rate": 0.0001, |
| "loss": 64.9549, |
| "loss/crossentropy": 3.203920364379883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24600070714950562, |
| "loss/reg": 61.50501251220703, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.018125, |
| "grad_norm": 2.9479753971099854, |
| "grad_norm_var": 0.07376285343432466, |
| "learning_rate": 0.0001, |
| "loss": 64.4933, |
| "loss/crossentropy": 2.9181010723114014, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23694497346878052, |
| "loss/reg": 61.338253021240234, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.01815, |
| "grad_norm": 3.087327718734741, |
| "grad_norm_var": 0.05718879220165339, |
| "learning_rate": 0.0001, |
| "loss": 64.0707, |
| "loss/crossentropy": 2.6616029739379883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2374861240386963, |
| "loss/reg": 61.17159652709961, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.018175, |
| "grad_norm": 2.491910934448242, |
| "grad_norm_var": 0.06050683436232778, |
| "learning_rate": 0.0001, |
| "loss": 63.8745, |
| "loss/crossentropy": 2.6548359394073486, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2142171561717987, |
| "loss/reg": 61.00545120239258, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.0182, |
| "grad_norm": 2.7890079021453857, |
| "grad_norm_var": 0.060696315605900726, |
| "learning_rate": 0.0001, |
| "loss": 63.9746, |
| "loss/crossentropy": 2.92938232421875, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20511876046657562, |
| "loss/reg": 60.84012985229492, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.018225, |
| "grad_norm": 2.7196593284606934, |
| "grad_norm_var": 0.06049068327400467, |
| "learning_rate": 0.0001, |
| "loss": 63.5493, |
| "loss/crossentropy": 2.639850378036499, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23349660634994507, |
| "loss/reg": 60.675933837890625, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.01825, |
| "grad_norm": 2.650775671005249, |
| "grad_norm_var": 0.05694124485670666, |
| "learning_rate": 0.0001, |
| "loss": 63.5801, |
| "loss/crossentropy": 2.8353641033172607, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23342850804328918, |
| "loss/reg": 60.51133346557617, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.018275, |
| "grad_norm": 2.598869800567627, |
| "grad_norm_var": 0.05726858156747217, |
| "learning_rate": 0.0001, |
| "loss": 63.436, |
| "loss/crossentropy": 2.8569741249084473, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23304705321788788, |
| "loss/reg": 60.345970153808594, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.0183, |
| "grad_norm": 2.6495063304901123, |
| "grad_norm_var": 0.055427494487368514, |
| "learning_rate": 0.0001, |
| "loss": 63.132, |
| "loss/crossentropy": 2.7324588298797607, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21595758199691772, |
| "loss/reg": 60.183570861816406, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.018325, |
| "grad_norm": 2.603133201599121, |
| "grad_norm_var": 0.04837598895656067, |
| "learning_rate": 0.0001, |
| "loss": 62.695, |
| "loss/crossentropy": 2.4560225009918213, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21853625774383545, |
| "loss/reg": 60.02042770385742, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.01835, |
| "grad_norm": 3.1586227416992188, |
| "grad_norm_var": 0.05912500309994281, |
| "learning_rate": 0.0001, |
| "loss": 63.1673, |
| "loss/crossentropy": 3.0193772315979004, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.28715232014656067, |
| "loss/reg": 59.86080551147461, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.018375, |
| "grad_norm": 2.6718556880950928, |
| "grad_norm_var": 0.053857831483746094, |
| "learning_rate": 0.0001, |
| "loss": 62.6891, |
| "loss/crossentropy": 2.7639520168304443, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22497007250785828, |
| "loss/reg": 59.70021057128906, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.0184, |
| "grad_norm": 2.7182719707489014, |
| "grad_norm_var": 0.05067343602402825, |
| "learning_rate": 0.0001, |
| "loss": 62.8571, |
| "loss/crossentropy": 3.092790365219116, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22454728186130524, |
| "loss/reg": 59.53980255126953, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.018425, |
| "grad_norm": 2.63033127784729, |
| "grad_norm_var": 0.04513557987957005, |
| "learning_rate": 0.0001, |
| "loss": 62.4324, |
| "loss/crossentropy": 2.814718008041382, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23878329992294312, |
| "loss/reg": 59.37885284423828, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.01845, |
| "grad_norm": 2.9615981578826904, |
| "grad_norm_var": 0.046446030177343306, |
| "learning_rate": 0.0001, |
| "loss": 62.5695, |
| "loss/crossentropy": 3.1102991104125977, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2397473156452179, |
| "loss/reg": 59.219425201416016, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.018475, |
| "grad_norm": 2.7155027389526367, |
| "grad_norm_var": 0.04677527116378618, |
| "learning_rate": 0.0001, |
| "loss": 62.0607, |
| "loss/crossentropy": 2.7697606086730957, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23005826771259308, |
| "loss/reg": 59.06086349487305, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.0185, |
| "grad_norm": 2.794189214706421, |
| "grad_norm_var": 0.03469948264410713, |
| "learning_rate": 0.0001, |
| "loss": 61.9033, |
| "loss/crossentropy": 2.7485909461975098, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24854815006256104, |
| "loss/reg": 58.9061279296875, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.018525, |
| "grad_norm": 2.6141436100006104, |
| "grad_norm_var": 0.03337714746116092, |
| "learning_rate": 0.0001, |
| "loss": 61.7766, |
| "loss/crossentropy": 2.809553384780884, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21490439772605896, |
| "loss/reg": 58.752098083496094, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.01855, |
| "grad_norm": 2.6113595962524414, |
| "grad_norm_var": 0.025552325556027947, |
| "learning_rate": 0.0001, |
| "loss": 61.7345, |
| "loss/crossentropy": 2.905996322631836, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22810769081115723, |
| "loss/reg": 58.60040283203125, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.018575, |
| "grad_norm": 2.6710562705993652, |
| "grad_norm_var": 0.022320882287599632, |
| "learning_rate": 0.0001, |
| "loss": 61.7047, |
| "loss/crossentropy": 3.0256869792938232, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23153835535049438, |
| "loss/reg": 58.4475212097168, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.0186, |
| "grad_norm": 2.6590538024902344, |
| "grad_norm_var": 0.022221697868613433, |
| "learning_rate": 0.0001, |
| "loss": 61.3096, |
| "loss/crossentropy": 2.7802655696868896, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23257814347743988, |
| "loss/reg": 58.29676818847656, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.018625, |
| "grad_norm": 2.6750831604003906, |
| "grad_norm_var": 0.022313711031313233, |
| "learning_rate": 0.0001, |
| "loss": 60.9886, |
| "loss/crossentropy": 2.6292662620544434, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21325910091400146, |
| "loss/reg": 58.146026611328125, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.01865, |
| "grad_norm": 2.910602569580078, |
| "grad_norm_var": 0.02443077895978618, |
| "learning_rate": 0.0001, |
| "loss": 61.2949, |
| "loss/crossentropy": 3.0462257862091064, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25037214159965515, |
| "loss/reg": 57.99826431274414, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.018675, |
| "grad_norm": 2.7559032440185547, |
| "grad_norm_var": 0.023274603878721933, |
| "learning_rate": 0.0001, |
| "loss": 61.0211, |
| "loss/crossentropy": 2.9264190196990967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24024531245231628, |
| "loss/reg": 57.85441970825195, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.0187, |
| "grad_norm": 11.09821891784668, |
| "grad_norm_var": 4.385431661130277, |
| "learning_rate": 0.0001, |
| "loss": 60.7765, |
| "loss/crossentropy": 2.7616488933563232, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.3043861985206604, |
| "loss/reg": 57.71049118041992, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.018725, |
| "grad_norm": 3.4629063606262207, |
| "grad_norm_var": 4.355694283900243, |
| "learning_rate": 0.0001, |
| "loss": 60.7808, |
| "loss/crossentropy": 2.9968600273132324, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22214075922966003, |
| "loss/reg": 57.56179428100586, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.01875, |
| "grad_norm": 2.9865288734436035, |
| "grad_norm_var": 4.361232034357083, |
| "learning_rate": 0.0001, |
| "loss": 60.3824, |
| "loss/crossentropy": 2.712460994720459, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25824302434921265, |
| "loss/reg": 57.4117431640625, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.018775, |
| "grad_norm": 2.966188430786133, |
| "grad_norm_var": 4.341660332338729, |
| "learning_rate": 0.0001, |
| "loss": 60.2275, |
| "loss/crossentropy": 2.749971389770508, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2149219810962677, |
| "loss/reg": 57.26259231567383, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.0188, |
| "grad_norm": 3.0538909435272217, |
| "grad_norm_var": 4.32146321783135, |
| "learning_rate": 0.0001, |
| "loss": 60.1386, |
| "loss/crossentropy": 2.7705466747283936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25299209356307983, |
| "loss/reg": 57.11503219604492, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.018825, |
| "grad_norm": 3.7716422080993652, |
| "grad_norm_var": 4.293677767872144, |
| "learning_rate": 0.0001, |
| "loss": 60.374, |
| "loss/crossentropy": 3.1197762489318848, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2896609902381897, |
| "loss/reg": 56.964542388916016, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.01885, |
| "grad_norm": 2.8715505599975586, |
| "grad_norm_var": 4.29967918105209, |
| "learning_rate": 0.0001, |
| "loss": 59.6617, |
| "loss/crossentropy": 2.6158196926116943, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23370970785617828, |
| "loss/reg": 56.81219482421875, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.018875, |
| "grad_norm": 2.87619686126709, |
| "grad_norm_var": 4.286335448113463, |
| "learning_rate": 0.0001, |
| "loss": 59.5822, |
| "loss/crossentropy": 2.6575210094451904, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26069819927215576, |
| "loss/reg": 56.66400146484375, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.0189, |
| "grad_norm": 2.656797409057617, |
| "grad_norm_var": 4.299046394966134, |
| "learning_rate": 0.0001, |
| "loss": 59.3342, |
| "loss/crossentropy": 2.594022035598755, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22433573007583618, |
| "loss/reg": 56.515830993652344, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.018925, |
| "grad_norm": 2.692230224609375, |
| "grad_norm_var": 4.291088604047604, |
| "learning_rate": 0.0001, |
| "loss": 59.2204, |
| "loss/crossentropy": 2.6446311473846436, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2099241316318512, |
| "loss/reg": 56.36582565307617, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.01895, |
| "grad_norm": 2.687993288040161, |
| "grad_norm_var": 4.283193607489185, |
| "learning_rate": 0.0001, |
| "loss": 59.2018, |
| "loss/crossentropy": 2.7599661350250244, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22358274459838867, |
| "loss/reg": 56.21822738647461, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.018975, |
| "grad_norm": 2.611598253250122, |
| "grad_norm_var": 4.289389567894268, |
| "learning_rate": 0.0001, |
| "loss": 59.0686, |
| "loss/crossentropy": 2.787243604660034, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21356835961341858, |
| "loss/reg": 56.06779098510742, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.019, |
| "grad_norm": 2.4653584957122803, |
| "grad_norm_var": 4.31141311016122, |
| "learning_rate": 0.0001, |
| "loss": 59.0029, |
| "loss/crossentropy": 2.8415911197662354, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24345867335796356, |
| "loss/reg": 55.9178466796875, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.019025, |
| "grad_norm": 2.8678863048553467, |
| "grad_norm_var": 4.2948716677726795, |
| "learning_rate": 0.0001, |
| "loss": 58.6391, |
| "loss/crossentropy": 2.6471433639526367, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21930553019046783, |
| "loss/reg": 55.772621154785156, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.01905, |
| "grad_norm": 2.790867805480957, |
| "grad_norm_var": 4.303915496486924, |
| "learning_rate": 0.0001, |
| "loss": 58.5621, |
| "loss/crossentropy": 2.691446542739868, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2425747513771057, |
| "loss/reg": 55.628082275390625, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.019075, |
| "grad_norm": 2.7855591773986816, |
| "grad_norm_var": 4.301370303985943, |
| "learning_rate": 0.0001, |
| "loss": 58.4557, |
| "loss/crossentropy": 2.732412815093994, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23795194923877716, |
| "loss/reg": 55.48536682128906, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.0191, |
| "grad_norm": 2.5391337871551514, |
| "grad_norm_var": 0.11220097224740945, |
| "learning_rate": 0.0001, |
| "loss": 58.1866, |
| "loss/crossentropy": 2.628573179244995, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21366839110851288, |
| "loss/reg": 55.344329833984375, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.019125, |
| "grad_norm": 2.4486775398254395, |
| "grad_norm_var": 0.09771899643028424, |
| "learning_rate": 0.0001, |
| "loss": 58.3236, |
| "loss/crossentropy": 2.8930563926696777, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22997558116912842, |
| "loss/reg": 55.20055389404297, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.01915, |
| "grad_norm": 2.766298532485962, |
| "grad_norm_var": 0.09577246439944981, |
| "learning_rate": 0.0001, |
| "loss": 58.3324, |
| "loss/crossentropy": 3.025320291519165, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24967965483665466, |
| "loss/reg": 55.057376861572266, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.019175, |
| "grad_norm": 3.201991558074951, |
| "grad_norm_var": 0.1043707670856025, |
| "learning_rate": 0.0001, |
| "loss": 58.3737, |
| "loss/crossentropy": 3.2072536945343018, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24961410462856293, |
| "loss/reg": 54.916839599609375, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.0192, |
| "grad_norm": 2.3997886180877686, |
| "grad_norm_var": 0.11053669150879818, |
| "learning_rate": 0.0001, |
| "loss": 57.8128, |
| "loss/crossentropy": 2.814892053604126, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.221253901720047, |
| "loss/reg": 54.776668548583984, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.019225, |
| "grad_norm": 2.467320680618286, |
| "grad_norm_var": 0.04390441270999475, |
| "learning_rate": 0.0001, |
| "loss": 57.848, |
| "loss/crossentropy": 2.9831323623657227, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22602063417434692, |
| "loss/reg": 54.63887405395508, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.01925, |
| "grad_norm": 2.8920133113861084, |
| "grad_norm_var": 0.04441070048479005, |
| "learning_rate": 0.0001, |
| "loss": 57.6435, |
| "loss/crossentropy": 2.9186885356903076, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2257421761751175, |
| "loss/reg": 54.49909210205078, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.019275, |
| "grad_norm": 2.7764594554901123, |
| "grad_norm_var": 0.04264750323779935, |
| "learning_rate": 0.0001, |
| "loss": 56.9818, |
| "loss/crossentropy": 2.4131717681884766, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20915502309799194, |
| "loss/reg": 54.35948181152344, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.0193, |
| "grad_norm": 2.9066452980041504, |
| "grad_norm_var": 0.04542215413656076, |
| "learning_rate": 0.0001, |
| "loss": 57.4967, |
| "loss/crossentropy": 3.0247690677642822, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2509494423866272, |
| "loss/reg": 54.22095489501953, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.019325, |
| "grad_norm": 2.6702942848205566, |
| "grad_norm_var": 0.04549320067607141, |
| "learning_rate": 0.0001, |
| "loss": 57.237, |
| "loss/crossentropy": 2.915168523788452, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23676593601703644, |
| "loss/reg": 54.08510208129883, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.01935, |
| "grad_norm": 2.5919129848480225, |
| "grad_norm_var": 0.046286340421070805, |
| "learning_rate": 0.0001, |
| "loss": 56.9803, |
| "loss/crossentropy": 2.8107874393463135, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2236163318157196, |
| "loss/reg": 53.94586181640625, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.019375, |
| "grad_norm": 2.6954429149627686, |
| "grad_norm_var": 0.045750154457365015, |
| "learning_rate": 0.0001, |
| "loss": 57.0791, |
| "loss/crossentropy": 3.017504930496216, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2500165104866028, |
| "loss/reg": 53.81157684326172, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.0194, |
| "grad_norm": 2.492471218109131, |
| "grad_norm_var": 0.044933029105976394, |
| "learning_rate": 0.0001, |
| "loss": 56.6907, |
| "loss/crossentropy": 2.7895073890686035, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2253413051366806, |
| "loss/reg": 53.67582702636719, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.019425, |
| "grad_norm": 2.369546413421631, |
| "grad_norm_var": 0.04968441666320113, |
| "learning_rate": 0.0001, |
| "loss": 56.6367, |
| "loss/crossentropy": 2.9010658264160156, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.1958373337984085, |
| "loss/reg": 53.53976058959961, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.01945, |
| "grad_norm": 3.021898031234741, |
| "grad_norm_var": 0.05660028336942536, |
| "learning_rate": 0.0001, |
| "loss": 56.5493, |
| "loss/crossentropy": 2.893462896347046, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2516905665397644, |
| "loss/reg": 53.404178619384766, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.019475, |
| "grad_norm": 2.691014528274536, |
| "grad_norm_var": 0.055942876476975316, |
| "learning_rate": 0.0001, |
| "loss": 55.9807, |
| "loss/crossentropy": 2.465963840484619, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24134577810764313, |
| "loss/reg": 53.27339553833008, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.0195, |
| "grad_norm": 2.3977248668670654, |
| "grad_norm_var": 0.05990861359730009, |
| "learning_rate": 0.0001, |
| "loss": 56.0824, |
| "loss/crossentropy": 2.7291431427001953, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21200142800807953, |
| "loss/reg": 53.141212463378906, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.019525, |
| "grad_norm": 2.8265068531036377, |
| "grad_norm_var": 0.057462358496513606, |
| "learning_rate": 0.0001, |
| "loss": 56.3862, |
| "loss/crossentropy": 3.13495135307312, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24010971188545227, |
| "loss/reg": 53.011112213134766, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.01955, |
| "grad_norm": 2.549685001373291, |
| "grad_norm_var": 0.05842115228572311, |
| "learning_rate": 0.0001, |
| "loss": 55.8148, |
| "loss/crossentropy": 2.680000066757202, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25199833512306213, |
| "loss/reg": 52.88276672363281, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.019575, |
| "grad_norm": 2.601576328277588, |
| "grad_norm_var": 0.03951790591236583, |
| "learning_rate": 0.0001, |
| "loss": 55.6334, |
| "loss/crossentropy": 2.6754848957061768, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2111392617225647, |
| "loss/reg": 52.746726989746094, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.0196, |
| "grad_norm": 8.212234497070312, |
| "grad_norm_var": 1.9595461171198734, |
| "learning_rate": 0.0001, |
| "loss": 55.6696, |
| "loss/crossentropy": 2.8070499897003174, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25141945481300354, |
| "loss/reg": 52.61111068725586, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.019625, |
| "grad_norm": 2.7455339431762695, |
| "grad_norm_var": 1.9442466683120376, |
| "learning_rate": 0.0001, |
| "loss": 55.6931, |
| "loss/crossentropy": 2.996835947036743, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21819031238555908, |
| "loss/reg": 52.47810363769531, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.01965, |
| "grad_norm": 2.630183696746826, |
| "grad_norm_var": 1.9532633581534922, |
| "learning_rate": 0.0001, |
| "loss": 55.4071, |
| "loss/crossentropy": 2.804309844970703, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25376999378204346, |
| "loss/reg": 52.3490104675293, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.019675, |
| "grad_norm": 3.049198865890503, |
| "grad_norm_var": 1.9493762909636376, |
| "learning_rate": 0.0001, |
| "loss": 55.0084, |
| "loss/crossentropy": 2.542832374572754, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24326905608177185, |
| "loss/reg": 52.222328186035156, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.0197, |
| "grad_norm": 4.536539554595947, |
| "grad_norm_var": 2.0889857251666357, |
| "learning_rate": 0.0001, |
| "loss": 55.1162, |
| "loss/crossentropy": 2.8076493740081787, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21975521743297577, |
| "loss/reg": 52.08875274658203, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.019725, |
| "grad_norm": 4.922837734222412, |
| "grad_norm_var": 2.2680069995824983, |
| "learning_rate": 0.0001, |
| "loss": 55.0792, |
| "loss/crossentropy": 2.878237724304199, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2420842945575714, |
| "loss/reg": 51.95884323120117, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.01975, |
| "grad_norm": 2.30218243598938, |
| "grad_norm_var": 2.2994830337610286, |
| "learning_rate": 0.0001, |
| "loss": 54.8028, |
| "loss/crossentropy": 2.758139133453369, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2122175395488739, |
| "loss/reg": 51.83245086669922, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.019775, |
| "grad_norm": 2.6652355194091797, |
| "grad_norm_var": 2.3017848488088544, |
| "learning_rate": 0.0001, |
| "loss": 54.6171, |
| "loss/crossentropy": 2.6950416564941406, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2227291464805603, |
| "loss/reg": 51.699317932128906, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.0198, |
| "grad_norm": 2.5915586948394775, |
| "grad_norm_var": 2.29237841360286, |
| "learning_rate": 0.0001, |
| "loss": 54.58, |
| "loss/crossentropy": 2.798182487487793, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21353884041309357, |
| "loss/reg": 51.568267822265625, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.019825, |
| "grad_norm": 2.6132395267486572, |
| "grad_norm_var": 2.2672515903509027, |
| "learning_rate": 0.0001, |
| "loss": 54.4863, |
| "loss/crossentropy": 2.8284451961517334, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22251488268375397, |
| "loss/reg": 51.43537902832031, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.01985, |
| "grad_norm": 2.9160501956939697, |
| "grad_norm_var": 2.2714860685983003, |
| "learning_rate": 0.0001, |
| "loss": 54.4318, |
| "loss/crossentropy": 2.8874566555023193, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24243222177028656, |
| "loss/reg": 51.301944732666016, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.019875, |
| "grad_norm": 2.767122983932495, |
| "grad_norm_var": 2.266016244984087, |
| "learning_rate": 0.0001, |
| "loss": 53.9738, |
| "loss/crossentropy": 2.5667829513549805, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2378443330526352, |
| "loss/reg": 51.16920852661133, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.0199, |
| "grad_norm": 2.9074504375457764, |
| "grad_norm_var": 2.222940734299857, |
| "learning_rate": 0.0001, |
| "loss": 53.9377, |
| "loss/crossentropy": 2.659839391708374, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23761096596717834, |
| "loss/reg": 51.040260314941406, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.019925, |
| "grad_norm": 2.6332762241363525, |
| "grad_norm_var": 2.237533280064099, |
| "learning_rate": 0.0001, |
| "loss": 53.8857, |
| "loss/crossentropy": 2.7650234699249268, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21053263545036316, |
| "loss/reg": 50.91017532348633, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.01995, |
| "grad_norm": 2.6881556510925293, |
| "grad_norm_var": 2.225058902631474, |
| "learning_rate": 0.0001, |
| "loss": 53.6563, |
| "loss/crossentropy": 2.6306800842285156, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24274012446403503, |
| "loss/reg": 50.78291320800781, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.019975, |
| "grad_norm": 2.7099571228027344, |
| "grad_norm_var": 2.215716208024157, |
| "learning_rate": 0.0001, |
| "loss": 53.4381, |
| "loss/crossentropy": 2.52323842048645, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.259515643119812, |
| "loss/reg": 50.65534591674805, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 2.960604429244995, |
| "grad_norm_var": 0.5037824266901583, |
| "learning_rate": 0.0001, |
| "loss": 53.5623, |
| "loss/crossentropy": 2.7882955074310303, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24377551674842834, |
| "loss/reg": 50.53020095825195, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.020025, |
| "grad_norm": 2.7040579319000244, |
| "grad_norm_var": 0.505172444748995, |
| "learning_rate": 0.0001, |
| "loss": 53.4338, |
| "loss/crossentropy": 2.8009700775146484, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2242647111415863, |
| "loss/reg": 50.408573150634766, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.02005, |
| "grad_norm": 2.7675962448120117, |
| "grad_norm_var": 0.5000376610377392, |
| "learning_rate": 0.0001, |
| "loss": 53.1763, |
| "loss/crossentropy": 2.6563568115234375, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23715972900390625, |
| "loss/reg": 50.28279113769531, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.020075, |
| "grad_norm": 2.9913876056671143, |
| "grad_norm_var": 0.49973967585988155, |
| "learning_rate": 0.0001, |
| "loss": 53.2435, |
| "loss/crossentropy": 2.856863498687744, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22863171994686127, |
| "loss/reg": 50.158050537109375, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.0201, |
| "grad_norm": 2.579055070877075, |
| "grad_norm_var": 0.3329253447165852, |
| "learning_rate": 0.0001, |
| "loss": 53.0455, |
| "loss/crossentropy": 2.7912509441375732, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21997642517089844, |
| "loss/reg": 50.0342903137207, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.020125, |
| "grad_norm": 2.906846046447754, |
| "grad_norm_var": 0.03177485529399533, |
| "learning_rate": 0.0001, |
| "loss": 52.9991, |
| "loss/crossentropy": 2.8479995727539062, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24032440781593323, |
| "loss/reg": 49.91073989868164, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.02015, |
| "grad_norm": 3.0859761238098145, |
| "grad_norm_var": 0.025305915418744006, |
| "learning_rate": 0.0001, |
| "loss": 52.9709, |
| "loss/crossentropy": 2.920821189880371, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26482486724853516, |
| "loss/reg": 49.78524398803711, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.020175, |
| "grad_norm": 2.7434208393096924, |
| "grad_norm_var": 0.02448665601768667, |
| "learning_rate": 0.0001, |
| "loss": 52.9071, |
| "loss/crossentropy": 3.0034008026123047, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24188174307346344, |
| "loss/reg": 49.66181945800781, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.0202, |
| "grad_norm": 2.88177227973938, |
| "grad_norm_var": 0.02225149356486682, |
| "learning_rate": 0.0001, |
| "loss": 52.4653, |
| "loss/crossentropy": 2.6840085983276367, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.245322123169899, |
| "loss/reg": 49.53594207763672, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.020225, |
| "grad_norm": 3.1109049320220947, |
| "grad_norm_var": 0.02510624438714686, |
| "learning_rate": 0.0001, |
| "loss": 52.547, |
| "loss/crossentropy": 2.8825902938842773, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2539767920970917, |
| "loss/reg": 49.41044616699219, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.02025, |
| "grad_norm": 2.4495651721954346, |
| "grad_norm_var": 0.033640854815441185, |
| "learning_rate": 0.0001, |
| "loss": 51.7535, |
| "loss/crossentropy": 2.28265380859375, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.18538084626197815, |
| "loss/reg": 49.28546905517578, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.020275, |
| "grad_norm": 2.864192247390747, |
| "grad_norm_var": 0.03373374858250576, |
| "learning_rate": 0.0001, |
| "loss": 52.0966, |
| "loss/crossentropy": 2.682021379470825, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2524881362915039, |
| "loss/reg": 49.162086486816406, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.0203, |
| "grad_norm": 2.7251250743865967, |
| "grad_norm_var": 0.03347917919778235, |
| "learning_rate": 0.0001, |
| "loss": 52.1571, |
| "loss/crossentropy": 2.893673896789551, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2223658412694931, |
| "loss/reg": 49.04104232788086, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.020325, |
| "grad_norm": 2.496249198913574, |
| "grad_norm_var": 0.037700954552336914, |
| "learning_rate": 0.0001, |
| "loss": 51.914, |
| "loss/crossentropy": 2.755688190460205, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23637332022190094, |
| "loss/reg": 48.92195510864258, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.02035, |
| "grad_norm": 2.5698087215423584, |
| "grad_norm_var": 0.04020791484434175, |
| "learning_rate": 0.0001, |
| "loss": 51.7827, |
| "loss/crossentropy": 2.7431833744049072, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23644611239433289, |
| "loss/reg": 48.80311584472656, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.020375, |
| "grad_norm": 2.952713966369629, |
| "grad_norm_var": 0.04148941052159193, |
| "learning_rate": 0.0001, |
| "loss": 51.7364, |
| "loss/crossentropy": 2.824826717376709, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23083820939064026, |
| "loss/reg": 48.68075942993164, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.0204, |
| "grad_norm": 2.635850191116333, |
| "grad_norm_var": 0.0410977076632508, |
| "learning_rate": 0.0001, |
| "loss": 51.7439, |
| "loss/crossentropy": 2.93487548828125, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24603211879730225, |
| "loss/reg": 48.56298828125, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.020425, |
| "grad_norm": 2.40620493888855, |
| "grad_norm_var": 0.049620007024852225, |
| "learning_rate": 0.0001, |
| "loss": 51.304, |
| "loss/crossentropy": 2.656850814819336, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2041395902633667, |
| "loss/reg": 48.44300079345703, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.02045, |
| "grad_norm": 2.646141290664673, |
| "grad_norm_var": 0.05042569960910441, |
| "learning_rate": 0.0001, |
| "loss": 51.235, |
| "loss/crossentropy": 2.664440155029297, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2458214908838272, |
| "loss/reg": 48.3246955871582, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.020475, |
| "grad_norm": 2.8259634971618652, |
| "grad_norm_var": 0.04687417195843082, |
| "learning_rate": 0.0001, |
| "loss": 51.6837, |
| "loss/crossentropy": 3.2239608764648438, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2503930926322937, |
| "loss/reg": 48.2093505859375, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.0205, |
| "grad_norm": 2.993224620819092, |
| "grad_norm_var": 0.04857006914218402, |
| "learning_rate": 0.0001, |
| "loss": 51.27, |
| "loss/crossentropy": 2.8996071815490723, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27634817361831665, |
| "loss/reg": 48.094078063964844, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.020525, |
| "grad_norm": 3.131995916366577, |
| "grad_norm_var": 0.05589532321223274, |
| "learning_rate": 0.0001, |
| "loss": 51.3641, |
| "loss/crossentropy": 3.1158416271209717, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.272818386554718, |
| "loss/reg": 47.975460052490234, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.02055, |
| "grad_norm": 2.6322062015533447, |
| "grad_norm_var": 0.05040003879316449, |
| "learning_rate": 0.0001, |
| "loss": 50.9655, |
| "loss/crossentropy": 2.874999761581421, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2292400300502777, |
| "loss/reg": 47.86125564575195, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.020575, |
| "grad_norm": 2.631922960281372, |
| "grad_norm_var": 0.05133554293392611, |
| "learning_rate": 0.0001, |
| "loss": 50.457, |
| "loss/crossentropy": 2.5026562213897705, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20776653289794922, |
| "loss/reg": 47.74656677246094, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.0206, |
| "grad_norm": 2.881382942199707, |
| "grad_norm_var": 0.051328562127300144, |
| "learning_rate": 0.0001, |
| "loss": 50.542, |
| "loss/crossentropy": 2.688274621963501, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22304531931877136, |
| "loss/reg": 47.63066482543945, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.020625, |
| "grad_norm": 2.683779239654541, |
| "grad_norm_var": 0.04201158273686285, |
| "learning_rate": 0.0001, |
| "loss": 50.5567, |
| "loss/crossentropy": 2.7942123413085938, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2437441498041153, |
| "loss/reg": 47.518707275390625, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.02065, |
| "grad_norm": 2.7014639377593994, |
| "grad_norm_var": 0.03688115494732453, |
| "learning_rate": 0.0001, |
| "loss": 50.5527, |
| "loss/crossentropy": 2.8857181072235107, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26020658016204834, |
| "loss/reg": 47.40679168701172, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.020675, |
| "grad_norm": 13.778948783874512, |
| "grad_norm_var": 7.668981462187241, |
| "learning_rate": 0.0001, |
| "loss": 50.3418, |
| "loss/crossentropy": 2.836272716522217, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.213333398103714, |
| "loss/reg": 47.29219436645508, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.0207, |
| "grad_norm": 2.6730599403381348, |
| "grad_norm_var": 7.673962997287033, |
| "learning_rate": 0.0001, |
| "loss": 50.3743, |
| "loss/crossentropy": 2.9492764472961426, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24545426666736603, |
| "loss/reg": 47.17961120605469, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.020725, |
| "grad_norm": 2.44629168510437, |
| "grad_norm_var": 7.680239164125849, |
| "learning_rate": 0.0001, |
| "loss": 49.7865, |
| "loss/crossentropy": 2.5040016174316406, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21603502333164215, |
| "loss/reg": 47.06645965576172, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.02075, |
| "grad_norm": 2.927917003631592, |
| "grad_norm_var": 7.6480446113119305, |
| "learning_rate": 0.0001, |
| "loss": 50.4814, |
| "loss/crossentropy": 3.270429849624634, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25596797466278076, |
| "loss/reg": 46.9549674987793, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.020775, |
| "grad_norm": 3.077221155166626, |
| "grad_norm_var": 7.641018421905691, |
| "learning_rate": 0.0001, |
| "loss": 50.196, |
| "loss/crossentropy": 3.097292423248291, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2553516924381256, |
| "loss/reg": 46.843326568603516, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.0208, |
| "grad_norm": 3.5145583152770996, |
| "grad_norm_var": 7.5948155070728856, |
| "learning_rate": 0.0001, |
| "loss": 49.5672, |
| "loss/crossentropy": 2.601334571838379, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2364489734172821, |
| "loss/reg": 46.729393005371094, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.020825, |
| "grad_norm": 3.11055326461792, |
| "grad_norm_var": 7.523380552917458, |
| "learning_rate": 0.0001, |
| "loss": 49.711, |
| "loss/crossentropy": 2.8619778156280518, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2326866090297699, |
| "loss/reg": 46.61635971069336, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.02085, |
| "grad_norm": 3.700695276260376, |
| "grad_norm_var": 7.467056690865655, |
| "learning_rate": 0.0001, |
| "loss": 49.5927, |
| "loss/crossentropy": 2.8446598052978516, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2441275715827942, |
| "loss/reg": 46.50389099121094, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.020875, |
| "grad_norm": 2.789363145828247, |
| "grad_norm_var": 7.470951661037448, |
| "learning_rate": 0.0001, |
| "loss": 49.4515, |
| "loss/crossentropy": 2.822880268096924, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23577329516410828, |
| "loss/reg": 46.39282989501953, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.0209, |
| "grad_norm": 2.612974166870117, |
| "grad_norm_var": 7.510988449534897, |
| "learning_rate": 0.0001, |
| "loss": 49.3286, |
| "loss/crossentropy": 2.8413946628570557, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.20617251098155975, |
| "loss/reg": 46.281070709228516, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.020925, |
| "grad_norm": 2.69990611076355, |
| "grad_norm_var": 7.548519312088183, |
| "learning_rate": 0.0001, |
| "loss": 49.2338, |
| "loss/crossentropy": 2.829338550567627, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23464342951774597, |
| "loss/reg": 46.16982650756836, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.02095, |
| "grad_norm": 3.1259517669677734, |
| "grad_norm_var": 7.503078866049228, |
| "learning_rate": 0.0001, |
| "loss": 48.9702, |
| "loss/crossentropy": 2.642885208129883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2709121108055115, |
| "loss/reg": 46.05638885498047, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.020975, |
| "grad_norm": 2.6453773975372314, |
| "grad_norm_var": 7.50138088085608, |
| "learning_rate": 0.0001, |
| "loss": 49.101, |
| "loss/crossentropy": 2.90779972076416, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2505376935005188, |
| "loss/reg": 45.94264602661133, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.021, |
| "grad_norm": 2.7109532356262207, |
| "grad_norm_var": 7.519198653159886, |
| "learning_rate": 0.0001, |
| "loss": 48.8333, |
| "loss/crossentropy": 2.7746942043304443, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23057350516319275, |
| "loss/reg": 45.8280029296875, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.021025, |
| "grad_norm": 2.9908559322357178, |
| "grad_norm_var": 7.488604931370973, |
| "learning_rate": 0.0001, |
| "loss": 48.7675, |
| "loss/crossentropy": 2.7737483978271484, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27381837368011475, |
| "loss/reg": 45.71989822387695, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.02105, |
| "grad_norm": 2.8714518547058105, |
| "grad_norm_var": 7.47017858293949, |
| "learning_rate": 0.0001, |
| "loss": 48.9529, |
| "loss/crossentropy": 3.073143243789673, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26654019951820374, |
| "loss/reg": 45.61322021484375, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.021075, |
| "grad_norm": 3.0701894760131836, |
| "grad_norm_var": 0.11045821533903463, |
| "learning_rate": 0.0001, |
| "loss": 48.3711, |
| "loss/crossentropy": 2.606783390045166, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26009494066238403, |
| "loss/reg": 45.504241943359375, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.0211, |
| "grad_norm": 2.5917296409606934, |
| "grad_norm_var": 0.11371707836486636, |
| "learning_rate": 0.0001, |
| "loss": 48.4286, |
| "loss/crossentropy": 2.8071954250335693, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2240188866853714, |
| "loss/reg": 45.39739227294922, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.021125, |
| "grad_norm": 2.990241289138794, |
| "grad_norm_var": 0.09710077586091354, |
| "learning_rate": 0.0001, |
| "loss": 48.4181, |
| "loss/crossentropy": 2.8722949028015137, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25466662645339966, |
| "loss/reg": 45.29114532470703, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.02115, |
| "grad_norm": 2.7038068771362305, |
| "grad_norm_var": 0.10132916045419596, |
| "learning_rate": 0.0001, |
| "loss": 48.5483, |
| "loss/crossentropy": 3.1085779666900635, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25524720549583435, |
| "loss/reg": 45.18445587158203, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.021175, |
| "grad_norm": 2.6508195400238037, |
| "grad_norm_var": 0.10548054452361626, |
| "learning_rate": 0.0001, |
| "loss": 48.4558, |
| "loss/crossentropy": 3.1368627548217773, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23976629972457886, |
| "loss/reg": 45.079139709472656, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.0212, |
| "grad_norm": 2.988497734069824, |
| "grad_norm_var": 0.0813341385370144, |
| "learning_rate": 0.0001, |
| "loss": 48.1072, |
| "loss/crossentropy": 2.8759114742279053, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25515347719192505, |
| "loss/reg": 44.97610855102539, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.021225, |
| "grad_norm": 2.769603729248047, |
| "grad_norm_var": 0.0786111905047143, |
| "learning_rate": 0.0001, |
| "loss": 48.2049, |
| "loss/crossentropy": 3.070866346359253, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.258023738861084, |
| "loss/reg": 44.8759765625, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.02125, |
| "grad_norm": 2.505613327026367, |
| "grad_norm_var": 0.03543295310148456, |
| "learning_rate": 0.0001, |
| "loss": 47.7345, |
| "loss/crossentropy": 2.7244465351104736, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2349836379289627, |
| "loss/reg": 44.77507019042969, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.021275, |
| "grad_norm": 2.5850486755371094, |
| "grad_norm_var": 0.038190999955643436, |
| "learning_rate": 0.0001, |
| "loss": 47.6796, |
| "loss/crossentropy": 2.797024726867676, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21167223155498505, |
| "loss/reg": 44.670875549316406, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.0213, |
| "grad_norm": 3.048123359680176, |
| "grad_norm_var": 0.04021511405729043, |
| "learning_rate": 0.0001, |
| "loss": 47.6982, |
| "loss/crossentropy": 2.882145881652832, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25044912099838257, |
| "loss/reg": 44.56555938720703, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.021325, |
| "grad_norm": 2.6726913452148438, |
| "grad_norm_var": 0.04065821192390615, |
| "learning_rate": 0.0001, |
| "loss": 47.6222, |
| "loss/crossentropy": 2.9055726528167725, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2560499608516693, |
| "loss/reg": 44.460567474365234, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.02135, |
| "grad_norm": 2.616454601287842, |
| "grad_norm_var": 0.03525310786582168, |
| "learning_rate": 0.0001, |
| "loss": 47.3723, |
| "loss/crossentropy": 2.7777538299560547, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23695862293243408, |
| "loss/reg": 44.357627868652344, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.021375, |
| "grad_norm": 2.6965930461883545, |
| "grad_norm_var": 0.03452699702972097, |
| "learning_rate": 0.0001, |
| "loss": 47.1636, |
| "loss/crossentropy": 2.6849050521850586, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22069215774536133, |
| "loss/reg": 44.257965087890625, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.0214, |
| "grad_norm": 2.5102341175079346, |
| "grad_norm_var": 0.0388638936666986, |
| "learning_rate": 0.0001, |
| "loss": 47.262, |
| "loss/crossentropy": 2.860992908477783, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24046388268470764, |
| "loss/reg": 44.160499572753906, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.021425, |
| "grad_norm": 2.9143855571746826, |
| "grad_norm_var": 0.036940528281403574, |
| "learning_rate": 0.0001, |
| "loss": 47.0549, |
| "loss/crossentropy": 2.7531962394714355, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23869255185127258, |
| "loss/reg": 44.063053131103516, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.02145, |
| "grad_norm": 2.7312750816345215, |
| "grad_norm_var": 0.03611533132415896, |
| "learning_rate": 0.0001, |
| "loss": 47.2825, |
| "loss/crossentropy": 3.063075304031372, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2570626139640808, |
| "loss/reg": 43.962337493896484, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.021475, |
| "grad_norm": 2.6700384616851807, |
| "grad_norm_var": 0.029190745995408444, |
| "learning_rate": 0.0001, |
| "loss": 46.8626, |
| "loss/crossentropy": 2.765472173690796, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23290973901748657, |
| "loss/reg": 43.86418533325195, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.0215, |
| "grad_norm": 2.5979349613189697, |
| "grad_norm_var": 0.029080552835625658, |
| "learning_rate": 0.0001, |
| "loss": 46.8266, |
| "loss/crossentropy": 2.843446969985962, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22045856714248657, |
| "loss/reg": 43.76265335083008, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.021525, |
| "grad_norm": 2.8044228553771973, |
| "grad_norm_var": 0.0247465536201607, |
| "learning_rate": 0.0001, |
| "loss": 46.8942, |
| "loss/crossentropy": 2.9794631004333496, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25055602192878723, |
| "loss/reg": 43.664146423339844, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.02155, |
| "grad_norm": 3.0274248123168945, |
| "grad_norm_var": 0.03074023090764418, |
| "learning_rate": 0.0001, |
| "loss": 46.4618, |
| "loss/crossentropy": 2.6372880935668945, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26163622736930847, |
| "loss/reg": 43.56292724609375, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.021575, |
| "grad_norm": 2.771188974380493, |
| "grad_norm_var": 0.030265496058056365, |
| "learning_rate": 0.0001, |
| "loss": 46.687, |
| "loss/crossentropy": 2.977721691131592, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24795976281166077, |
| "loss/reg": 43.46126937866211, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.0216, |
| "grad_norm": 3.0198278427124023, |
| "grad_norm_var": 0.031346752653106096, |
| "learning_rate": 0.0001, |
| "loss": 46.4047, |
| "loss/crossentropy": 2.805306911468506, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23715007305145264, |
| "loss/reg": 43.36224365234375, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.021625, |
| "grad_norm": 2.8970999717712402, |
| "grad_norm_var": 0.032758795573165383, |
| "learning_rate": 0.0001, |
| "loss": 46.3235, |
| "loss/crossentropy": 2.81503963470459, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2449081391096115, |
| "loss/reg": 43.263526916503906, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.02165, |
| "grad_norm": 2.6958978176116943, |
| "grad_norm_var": 0.028713014180188404, |
| "learning_rate": 0.0001, |
| "loss": 46.4609, |
| "loss/crossentropy": 3.0649611949920654, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23137107491493225, |
| "loss/reg": 43.164615631103516, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.021675, |
| "grad_norm": 2.738786458969116, |
| "grad_norm_var": 0.026477629275204133, |
| "learning_rate": 0.0001, |
| "loss": 46.2192, |
| "loss/crossentropy": 2.900941848754883, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25012895464897156, |
| "loss/reg": 43.06816482543945, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.0217, |
| "grad_norm": 2.6869115829467773, |
| "grad_norm_var": 0.021515463936426234, |
| "learning_rate": 0.0001, |
| "loss": 46.0673, |
| "loss/crossentropy": 2.8564064502716064, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23792710900306702, |
| "loss/reg": 42.972923278808594, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.021725, |
| "grad_norm": 2.827066421508789, |
| "grad_norm_var": 0.02134784747631405, |
| "learning_rate": 0.0001, |
| "loss": 45.8876, |
| "loss/crossentropy": 2.7518844604492188, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2576528787612915, |
| "loss/reg": 42.87803268432617, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.02175, |
| "grad_norm": 2.626676321029663, |
| "grad_norm_var": 0.02115486089260775, |
| "learning_rate": 0.0001, |
| "loss": 45.7523, |
| "loss/crossentropy": 2.7327208518981934, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23792991042137146, |
| "loss/reg": 42.78163528442383, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.021775, |
| "grad_norm": 2.7007803916931152, |
| "grad_norm_var": 0.021118609979900307, |
| "learning_rate": 0.0001, |
| "loss": 45.7201, |
| "loss/crossentropy": 2.7678470611572266, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26777875423431396, |
| "loss/reg": 42.68449020385742, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.0218, |
| "grad_norm": 2.867694616317749, |
| "grad_norm_var": 0.01702195773885459, |
| "learning_rate": 0.0001, |
| "loss": 45.5534, |
| "loss/crossentropy": 2.7220120429992676, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24349060654640198, |
| "loss/reg": 42.58792495727539, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.021825, |
| "grad_norm": 2.7054524421691895, |
| "grad_norm_var": 0.016176199233639704, |
| "learning_rate": 0.0001, |
| "loss": 45.5576, |
| "loss/crossentropy": 2.821178913116455, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24668864905834198, |
| "loss/reg": 42.489776611328125, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.02185, |
| "grad_norm": 2.7255043983459473, |
| "grad_norm_var": 0.01621040773991519, |
| "learning_rate": 0.0001, |
| "loss": 45.2197, |
| "loss/crossentropy": 2.5901880264282227, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24032941460609436, |
| "loss/reg": 42.389225006103516, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.021875, |
| "grad_norm": 2.680649757385254, |
| "grad_norm_var": 0.016072239099798935, |
| "learning_rate": 0.0001, |
| "loss": 45.4495, |
| "loss/crossentropy": 2.920884132385254, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24124376475811005, |
| "loss/reg": 42.28740692138672, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.0219, |
| "grad_norm": 2.7948641777038574, |
| "grad_norm_var": 0.013890606167317543, |
| "learning_rate": 0.0001, |
| "loss": 45.2185, |
| "loss/crossentropy": 2.7859625816345215, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24146077036857605, |
| "loss/reg": 42.191123962402344, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.021925, |
| "grad_norm": 2.503937005996704, |
| "grad_norm_var": 0.018781331446677995, |
| "learning_rate": 0.0001, |
| "loss": 45.2798, |
| "loss/crossentropy": 2.9733173847198486, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21078920364379883, |
| "loss/reg": 42.09564971923828, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.02195, |
| "grad_norm": 2.723689556121826, |
| "grad_norm_var": 0.013994920468044824, |
| "learning_rate": 0.0001, |
| "loss": 45.1466, |
| "loss/crossentropy": 2.9063451290130615, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24084623157978058, |
| "loss/reg": 41.999412536621094, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.021975, |
| "grad_norm": 2.604905366897583, |
| "grad_norm_var": 0.015206201216361136, |
| "learning_rate": 0.0001, |
| "loss": 44.9635, |
| "loss/crossentropy": 2.8358519077301025, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22244781255722046, |
| "loss/reg": 41.905216217041016, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.022, |
| "grad_norm": 2.6830010414123535, |
| "grad_norm_var": 0.009616840170906471, |
| "learning_rate": 0.0001, |
| "loss": 44.8298, |
| "loss/crossentropy": 2.7782702445983887, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2416587471961975, |
| "loss/reg": 41.80987548828125, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.022025, |
| "grad_norm": 2.701080322265625, |
| "grad_norm_var": 0.007296400643871455, |
| "learning_rate": 0.0001, |
| "loss": 44.458, |
| "loss/crossentropy": 2.5311830043792725, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21052095293998718, |
| "loss/reg": 41.71625900268555, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.02205, |
| "grad_norm": 2.5038342475891113, |
| "grad_norm_var": 0.009814048940694183, |
| "learning_rate": 0.0001, |
| "loss": 44.6839, |
| "loss/crossentropy": 2.8363115787506104, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22428202629089355, |
| "loss/reg": 41.623291015625, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.022075, |
| "grad_norm": 2.757178783416748, |
| "grad_norm_var": 0.009949491806351792, |
| "learning_rate": 0.0001, |
| "loss": 44.6845, |
| "loss/crossentropy": 2.905855417251587, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2473723292350769, |
| "loss/reg": 41.531280517578125, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.0221, |
| "grad_norm": 2.660141706466675, |
| "grad_norm_var": 0.0100171783636083, |
| "learning_rate": 0.0001, |
| "loss": 44.3141, |
| "loss/crossentropy": 2.6371467113494873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23851189017295837, |
| "loss/reg": 41.43846893310547, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.022125, |
| "grad_norm": 4.251846790313721, |
| "grad_norm_var": 0.1626166050996685, |
| "learning_rate": 0.0001, |
| "loss": 44.3927, |
| "loss/crossentropy": 2.787137746810913, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25860458612442017, |
| "loss/reg": 41.346946716308594, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.02215, |
| "grad_norm": 2.9530351161956787, |
| "grad_norm_var": 0.16257111931878943, |
| "learning_rate": 0.0001, |
| "loss": 44.171, |
| "loss/crossentropy": 2.694733142852783, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22275252640247345, |
| "loss/reg": 41.25349426269531, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.022175, |
| "grad_norm": 2.991832971572876, |
| "grad_norm_var": 0.1639725008568424, |
| "learning_rate": 0.0001, |
| "loss": 44.5867, |
| "loss/crossentropy": 3.156198501586914, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2699030637741089, |
| "loss/reg": 41.16058349609375, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.0222, |
| "grad_norm": 3.1930086612701416, |
| "grad_norm_var": 0.1726863652995225, |
| "learning_rate": 0.0001, |
| "loss": 43.9933, |
| "loss/crossentropy": 2.6939613819122314, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22894969582557678, |
| "loss/reg": 41.07038879394531, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.022225, |
| "grad_norm": 2.8903820514678955, |
| "grad_norm_var": 0.17151552786031227, |
| "learning_rate": 0.0001, |
| "loss": 44.0888, |
| "loss/crossentropy": 2.862319231033325, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24869811534881592, |
| "loss/reg": 40.97781753540039, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.02225, |
| "grad_norm": 2.881136655807495, |
| "grad_norm_var": 0.17042145948309156, |
| "learning_rate": 0.0001, |
| "loss": 44.3592, |
| "loss/crossentropy": 3.2091801166534424, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2633001506328583, |
| "loss/reg": 40.88669967651367, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.022275, |
| "grad_norm": 2.62827467918396, |
| "grad_norm_var": 0.17185170990455442, |
| "learning_rate": 0.0001, |
| "loss": 43.7441, |
| "loss/crossentropy": 2.69647479057312, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25092294812202454, |
| "loss/reg": 40.79666519165039, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.0223, |
| "grad_norm": 2.436519145965576, |
| "grad_norm_var": 0.18287652337390886, |
| "learning_rate": 0.0001, |
| "loss": 43.7102, |
| "loss/crossentropy": 2.7679152488708496, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23737739026546478, |
| "loss/reg": 40.704925537109375, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.022325, |
| "grad_norm": 2.700011968612671, |
| "grad_norm_var": 0.17661805600996155, |
| "learning_rate": 0.0001, |
| "loss": 43.6811, |
| "loss/crossentropy": 2.8262124061584473, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23837611079216003, |
| "loss/reg": 40.61653137207031, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.02235, |
| "grad_norm": 2.5235769748687744, |
| "grad_norm_var": 0.18242413999566054, |
| "learning_rate": 0.0001, |
| "loss": 43.4445, |
| "loss/crossentropy": 2.6917617321014404, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22554537653923035, |
| "loss/reg": 40.527217864990234, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.022375, |
| "grad_norm": 2.7341766357421875, |
| "grad_norm_var": 0.17950288283988294, |
| "learning_rate": 0.0001, |
| "loss": 43.7484, |
| "loss/crossentropy": 3.029411792755127, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.28348681330680847, |
| "loss/reg": 40.43548583984375, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.0224, |
| "grad_norm": 2.659266233444214, |
| "grad_norm_var": 0.18004463619057087, |
| "learning_rate": 0.0001, |
| "loss": 43.1614, |
| "loss/crossentropy": 2.5794694423675537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23672150075435638, |
| "loss/reg": 40.34519577026367, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.022425, |
| "grad_norm": 3.1245830059051514, |
| "grad_norm_var": 0.1833206141640287, |
| "learning_rate": 0.0001, |
| "loss": 43.5047, |
| "loss/crossentropy": 3.009214162826538, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24004200100898743, |
| "loss/reg": 40.25539779663086, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.02245, |
| "grad_norm": 3.283797264099121, |
| "grad_norm_var": 0.1834653295253453, |
| "learning_rate": 0.0001, |
| "loss": 42.9276, |
| "loss/crossentropy": 2.5301198959350586, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23050135374069214, |
| "loss/reg": 40.16700744628906, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.022475, |
| "grad_norm": 2.4279603958129883, |
| "grad_norm_var": 0.19724598877930527, |
| "learning_rate": 0.0001, |
| "loss": 43.0253, |
| "loss/crossentropy": 2.7136290073394775, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23277553915977478, |
| "loss/reg": 40.078914642333984, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.0225, |
| "grad_norm": 2.5088298320770264, |
| "grad_norm_var": 0.2034398420188303, |
| "learning_rate": 0.0001, |
| "loss": 43.1582, |
| "loss/crossentropy": 2.948103189468384, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21640148758888245, |
| "loss/reg": 39.99374008178711, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.022525, |
| "grad_norm": 2.6329402923583984, |
| "grad_norm_var": 0.07258482335469481, |
| "learning_rate": 0.0001, |
| "loss": 43.0828, |
| "loss/crossentropy": 2.9187188148498535, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25746428966522217, |
| "loss/reg": 39.90663528442383, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.02255, |
| "grad_norm": 2.9308879375457764, |
| "grad_norm_var": 0.07212100124237182, |
| "learning_rate": 0.0001, |
| "loss": 42.8767, |
| "loss/crossentropy": 2.779430866241455, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2763053774833679, |
| "loss/reg": 39.82095718383789, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.022575, |
| "grad_norm": 2.6100316047668457, |
| "grad_norm_var": 0.07066177999980709, |
| "learning_rate": 0.0001, |
| "loss": 42.7168, |
| "loss/crossentropy": 2.7670066356658936, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21381208300590515, |
| "loss/reg": 39.73594665527344, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.0226, |
| "grad_norm": 2.6517529487609863, |
| "grad_norm_var": 0.057746798972484, |
| "learning_rate": 0.0001, |
| "loss": 42.6053, |
| "loss/crossentropy": 2.7067525386810303, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2483087182044983, |
| "loss/reg": 39.65021896362305, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.022625, |
| "grad_norm": 2.896390914916992, |
| "grad_norm_var": 0.057880348550103614, |
| "learning_rate": 0.0001, |
| "loss": 42.4123, |
| "loss/crossentropy": 2.6049654483795166, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24248483777046204, |
| "loss/reg": 39.56484603881836, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.02265, |
| "grad_norm": 2.7035367488861084, |
| "grad_norm_var": 0.05619899439359794, |
| "learning_rate": 0.0001, |
| "loss": 42.5853, |
| "loss/crossentropy": 2.8769829273223877, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2273760437965393, |
| "loss/reg": 39.48089599609375, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.022675, |
| "grad_norm": 2.7665393352508545, |
| "grad_norm_var": 0.05578056314836128, |
| "learning_rate": 0.0001, |
| "loss": 42.4535, |
| "loss/crossentropy": 2.8200128078460693, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2363729327917099, |
| "loss/reg": 39.39712905883789, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.0227, |
| "grad_norm": 2.480107545852661, |
| "grad_norm_var": 0.05422606208670284, |
| "learning_rate": 0.0001, |
| "loss": 42.4059, |
| "loss/crossentropy": 2.8648123741149902, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22969950735569, |
| "loss/reg": 39.31142044067383, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.022725, |
| "grad_norm": 2.7900424003601074, |
| "grad_norm_var": 0.05440689578418396, |
| "learning_rate": 0.0001, |
| "loss": 42.4378, |
| "loss/crossentropy": 2.9586613178253174, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25124895572662354, |
| "loss/reg": 39.2279052734375, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.02275, |
| "grad_norm": 2.8045692443847656, |
| "grad_norm_var": 0.051503902709208425, |
| "learning_rate": 0.0001, |
| "loss": 42.0578, |
| "loss/crossentropy": 2.6795575618743896, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23502734303474426, |
| "loss/reg": 39.14320373535156, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.022775, |
| "grad_norm": 3.0532169342041016, |
| "grad_norm_var": 0.05717807714263259, |
| "learning_rate": 0.0001, |
| "loss": 42.4633, |
| "loss/crossentropy": 3.152160882949829, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25327080488204956, |
| "loss/reg": 39.05790710449219, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.0228, |
| "grad_norm": 2.6680328845977783, |
| "grad_norm_var": 0.05705311999868907, |
| "learning_rate": 0.0001, |
| "loss": 42.2034, |
| "loss/crossentropy": 2.99041748046875, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23903341591358185, |
| "loss/reg": 38.97396469116211, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.022825, |
| "grad_norm": 3.3302087783813477, |
| "grad_norm_var": 0.0693946111033938, |
| "learning_rate": 0.0001, |
| "loss": 42.0845, |
| "loss/crossentropy": 2.930839776992798, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2614016532897949, |
| "loss/reg": 38.892234802246094, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.02285, |
| "grad_norm": 2.6165523529052734, |
| "grad_norm_var": 0.05272697596004718, |
| "learning_rate": 0.0001, |
| "loss": 41.7748, |
| "loss/crossentropy": 2.7452762126922607, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2193608283996582, |
| "loss/reg": 38.81016540527344, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.022875, |
| "grad_norm": 2.933302402496338, |
| "grad_norm_var": 0.04752966494528721, |
| "learning_rate": 0.0001, |
| "loss": 41.9394, |
| "loss/crossentropy": 2.9566924571990967, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25487080216407776, |
| "loss/reg": 38.72779846191406, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.0229, |
| "grad_norm": 2.6887619495391846, |
| "grad_norm_var": 0.043202036673411236, |
| "learning_rate": 0.0001, |
| "loss": 41.6663, |
| "loss/crossentropy": 2.798774242401123, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2221754789352417, |
| "loss/reg": 38.645347595214844, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.022925, |
| "grad_norm": 2.8121073246002197, |
| "grad_norm_var": 0.04158046028161924, |
| "learning_rate": 0.0001, |
| "loss": 41.463, |
| "loss/crossentropy": 2.6423110961914062, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2565896213054657, |
| "loss/reg": 38.56413269042969, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.02295, |
| "grad_norm": 2.509282350540161, |
| "grad_norm_var": 0.045107458768999464, |
| "learning_rate": 0.0001, |
| "loss": 41.6478, |
| "loss/crossentropy": 2.930504322052002, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2339283674955368, |
| "loss/reg": 38.48335647583008, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.022975, |
| "grad_norm": 2.6414058208465576, |
| "grad_norm_var": 0.04450125049984308, |
| "learning_rate": 0.0001, |
| "loss": 41.2222, |
| "loss/crossentropy": 2.609210252761841, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2125830054283142, |
| "loss/reg": 38.40039825439453, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.023, |
| "grad_norm": 2.715888261795044, |
| "grad_norm_var": 0.0437333648592715, |
| "learning_rate": 0.0001, |
| "loss": 41.2045, |
| "loss/crossentropy": 2.648813009262085, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2370688021183014, |
| "loss/reg": 38.318572998046875, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.023025, |
| "grad_norm": 2.711751937866211, |
| "grad_norm_var": 0.04289092327507144, |
| "learning_rate": 0.0001, |
| "loss": 41.3706, |
| "loss/crossentropy": 2.885514736175537, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24639633297920227, |
| "loss/reg": 38.238643646240234, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.02305, |
| "grad_norm": 2.6456990242004395, |
| "grad_norm_var": 0.04356690227298922, |
| "learning_rate": 0.0001, |
| "loss": 41.3587, |
| "loss/crossentropy": 2.941075563430786, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2592896819114685, |
| "loss/reg": 38.15837478637695, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.023075, |
| "grad_norm": 2.8968873023986816, |
| "grad_norm_var": 0.044734353597355184, |
| "learning_rate": 0.0001, |
| "loss": 41.2417, |
| "loss/crossentropy": 2.9061999320983887, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25584694743156433, |
| "loss/reg": 38.07965850830078, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.0231, |
| "grad_norm": 2.6788389682769775, |
| "grad_norm_var": 0.039558045732947515, |
| "learning_rate": 0.0001, |
| "loss": 40.9876, |
| "loss/crossentropy": 2.746546506881714, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23900818824768066, |
| "loss/reg": 38.002079010009766, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.023125, |
| "grad_norm": 3.128474712371826, |
| "grad_norm_var": 0.047123059326641466, |
| "learning_rate": 0.0001, |
| "loss": 41.2668, |
| "loss/crossentropy": 3.087491035461426, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25750723481178284, |
| "loss/reg": 37.921783447265625, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.02315, |
| "grad_norm": 2.8882362842559814, |
| "grad_norm_var": 0.04758715374512785, |
| "learning_rate": 0.0001, |
| "loss": 40.9121, |
| "loss/crossentropy": 2.817408800125122, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2543855309486389, |
| "loss/reg": 37.84029006958008, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.023175, |
| "grad_norm": 2.8361799716949463, |
| "grad_norm_var": 0.043418151431792985, |
| "learning_rate": 0.0001, |
| "loss": 40.832, |
| "loss/crossentropy": 2.8375983238220215, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23513871431350708, |
| "loss/reg": 37.75931167602539, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.0232, |
| "grad_norm": 2.909255266189575, |
| "grad_norm_var": 0.043008241613055276, |
| "learning_rate": 0.0001, |
| "loss": 41.1667, |
| "loss/crossentropy": 3.246528387069702, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24166589975357056, |
| "loss/reg": 37.67852783203125, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.023225, |
| "grad_norm": 2.552696704864502, |
| "grad_norm_var": 0.026750676712286037, |
| "learning_rate": 0.0001, |
| "loss": 40.818, |
| "loss/crossentropy": 2.9691615104675293, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24937693774700165, |
| "loss/reg": 37.59941101074219, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.02325, |
| "grad_norm": 2.887279748916626, |
| "grad_norm_var": 0.026141477988341915, |
| "learning_rate": 0.0001, |
| "loss": 40.7307, |
| "loss/crossentropy": 2.956307888031006, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2544440031051636, |
| "loss/reg": 37.51991653442383, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.023275, |
| "grad_norm": 2.615774154663086, |
| "grad_norm_var": 0.02583631071739016, |
| "learning_rate": 0.0001, |
| "loss": 40.504, |
| "loss/crossentropy": 2.836076259613037, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22688478231430054, |
| "loss/reg": 37.4410285949707, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.0233, |
| "grad_norm": 3.346874475479126, |
| "grad_norm_var": 0.046882285076200486, |
| "learning_rate": 0.0001, |
| "loss": 40.2933, |
| "loss/crossentropy": 2.698227643966675, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23271670937538147, |
| "loss/reg": 37.36236572265625, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.023325, |
| "grad_norm": 3.105316162109375, |
| "grad_norm_var": 0.052785925698018585, |
| "learning_rate": 0.0001, |
| "loss": 40.2366, |
| "loss/crossentropy": 2.690444231033325, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2598923146724701, |
| "loss/reg": 37.28624725341797, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.02335, |
| "grad_norm": 2.8327033519744873, |
| "grad_norm_var": 0.046059668983682654, |
| "learning_rate": 0.0001, |
| "loss": 40.4552, |
| "loss/crossentropy": 3.0131001472473145, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23372459411621094, |
| "loss/reg": 37.20838165283203, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.023375, |
| "grad_norm": 2.720252513885498, |
| "grad_norm_var": 0.04439112918428994, |
| "learning_rate": 0.0001, |
| "loss": 40.1157, |
| "loss/crossentropy": 2.748929977416992, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23494702577590942, |
| "loss/reg": 37.13186264038086, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.0234, |
| "grad_norm": 2.510647773742676, |
| "grad_norm_var": 0.05047514191979848, |
| "learning_rate": 0.0001, |
| "loss": 39.8904, |
| "loss/crossentropy": 2.602600336074829, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23265476524829865, |
| "loss/reg": 37.05512237548828, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.023425, |
| "grad_norm": 2.9770853519439697, |
| "grad_norm_var": 0.05072093631072979, |
| "learning_rate": 0.0001, |
| "loss": 40.3788, |
| "loss/crossentropy": 3.1354868412017822, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2637995779514313, |
| "loss/reg": 36.97951889038086, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.02345, |
| "grad_norm": 2.5489563941955566, |
| "grad_norm_var": 0.053886506006241085, |
| "learning_rate": 0.0001, |
| "loss": 39.9812, |
| "loss/crossentropy": 2.8318233489990234, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24620817601680756, |
| "loss/reg": 36.90315246582031, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.023475, |
| "grad_norm": 2.9009687900543213, |
| "grad_norm_var": 0.05391865958705443, |
| "learning_rate": 0.0001, |
| "loss": 39.9177, |
| "loss/crossentropy": 2.8244364261627197, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2657071053981781, |
| "loss/reg": 36.82755661010742, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.0235, |
| "grad_norm": 2.9793155193328857, |
| "grad_norm_var": 0.05310601324464391, |
| "learning_rate": 0.0001, |
| "loss": 40.0234, |
| "loss/crossentropy": 3.0105204582214355, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2618894875049591, |
| "loss/reg": 36.750953674316406, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.023525, |
| "grad_norm": 2.8539981842041016, |
| "grad_norm_var": 0.0479435574644467, |
| "learning_rate": 0.0001, |
| "loss": 39.8117, |
| "loss/crossentropy": 2.8921923637390137, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2426648885011673, |
| "loss/reg": 36.67680358886719, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.02355, |
| "grad_norm": 2.6014649868011475, |
| "grad_norm_var": 0.05130008365882167, |
| "learning_rate": 0.0001, |
| "loss": 39.7408, |
| "loss/crossentropy": 2.896888017654419, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2410784810781479, |
| "loss/reg": 36.60279083251953, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.023575, |
| "grad_norm": 2.8801045417785645, |
| "grad_norm_var": 0.05149391710301548, |
| "learning_rate": 0.0001, |
| "loss": 39.7228, |
| "loss/crossentropy": 2.9397897720336914, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25510135293006897, |
| "loss/reg": 36.52793502807617, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.0236, |
| "grad_norm": 2.8186585903167725, |
| "grad_norm_var": 0.05100626896681509, |
| "learning_rate": 0.0001, |
| "loss": 39.47, |
| "loss/crossentropy": 2.75411057472229, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26445087790489197, |
| "loss/reg": 36.451393127441406, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.023625, |
| "grad_norm": 2.9633240699768066, |
| "grad_norm_var": 0.04686836082955779, |
| "learning_rate": 0.0001, |
| "loss": 39.3906, |
| "loss/crossentropy": 2.7534892559051514, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25982236862182617, |
| "loss/reg": 36.377254486083984, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.02365, |
| "grad_norm": 2.929901123046875, |
| "grad_norm_var": 0.04721409535759804, |
| "learning_rate": 0.0001, |
| "loss": 39.2373, |
| "loss/crossentropy": 2.6932575702667236, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2382897436618805, |
| "loss/reg": 36.305755615234375, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.023675, |
| "grad_norm": 2.675022602081299, |
| "grad_norm_var": 0.04559039355538038, |
| "learning_rate": 0.0001, |
| "loss": 39.2789, |
| "loss/crossentropy": 2.8028876781463623, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24303217232227325, |
| "loss/reg": 36.23302459716797, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.0237, |
| "grad_norm": 2.8433735370635986, |
| "grad_norm_var": 0.0282651774333182, |
| "learning_rate": 0.0001, |
| "loss": 39.181, |
| "loss/crossentropy": 2.774716854095459, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24557200074195862, |
| "loss/reg": 36.16071319580078, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.023725, |
| "grad_norm": 2.809319257736206, |
| "grad_norm_var": 0.022532732866163013, |
| "learning_rate": 0.0001, |
| "loss": 39.33, |
| "loss/crossentropy": 3.011596918106079, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.228348970413208, |
| "loss/reg": 36.09001159667969, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.02375, |
| "grad_norm": 2.9273681640625, |
| "grad_norm_var": 0.023470027420536846, |
| "learning_rate": 0.0001, |
| "loss": 38.9413, |
| "loss/crossentropy": 2.6855454444885254, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23887701332569122, |
| "loss/reg": 36.01683044433594, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.023775, |
| "grad_norm": 3.389801263809204, |
| "grad_norm_var": 0.043589378411585736, |
| "learning_rate": 0.0001, |
| "loss": 39.4398, |
| "loss/crossentropy": 3.211602210998535, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.283491313457489, |
| "loss/reg": 35.944705963134766, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.0238, |
| "grad_norm": 2.8046481609344482, |
| "grad_norm_var": 0.03566620795445464, |
| "learning_rate": 0.0001, |
| "loss": 38.8264, |
| "loss/crossentropy": 2.689462661743164, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2632759213447571, |
| "loss/reg": 35.87367248535156, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.023825, |
| "grad_norm": 2.9345734119415283, |
| "grad_norm_var": 0.03516626203340228, |
| "learning_rate": 0.0001, |
| "loss": 39.0027, |
| "loss/crossentropy": 2.9450294971466064, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2565668821334839, |
| "loss/reg": 35.8011474609375, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.02385, |
| "grad_norm": 2.4629881381988525, |
| "grad_norm_var": 0.03926570002250382, |
| "learning_rate": 0.0001, |
| "loss": 38.6895, |
| "loss/crossentropy": 2.7172951698303223, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24316415190696716, |
| "loss/reg": 35.72907257080078, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.023875, |
| "grad_norm": 3.0658795833587646, |
| "grad_norm_var": 0.041845868526290055, |
| "learning_rate": 0.0001, |
| "loss": 38.8817, |
| "loss/crossentropy": 2.9409422874450684, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2831678092479706, |
| "loss/reg": 35.65754699707031, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.0239, |
| "grad_norm": 2.8264553546905518, |
| "grad_norm_var": 0.04110340640186886, |
| "learning_rate": 0.0001, |
| "loss": 38.6798, |
| "loss/crossentropy": 2.841045618057251, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25310787558555603, |
| "loss/reg": 35.58567428588867, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.023925, |
| "grad_norm": 2.8380401134490967, |
| "grad_norm_var": 0.041135667710252004, |
| "learning_rate": 0.0001, |
| "loss": 38.459, |
| "loss/crossentropy": 2.6821844577789307, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2615047097206116, |
| "loss/reg": 35.515350341796875, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.02395, |
| "grad_norm": 3.5808513164520264, |
| "grad_norm_var": 0.06723561445601618, |
| "learning_rate": 0.0001, |
| "loss": 38.8643, |
| "loss/crossentropy": 3.114391803741455, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.30714309215545654, |
| "loss/reg": 35.44272994995117, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.023975, |
| "grad_norm": 2.6595239639282227, |
| "grad_norm_var": 0.07150567007109672, |
| "learning_rate": 0.0001, |
| "loss": 38.0653, |
| "loss/crossentropy": 2.4589178562164307, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23348893225193024, |
| "loss/reg": 35.37288284301758, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 2.541332483291626, |
| "grad_norm_var": 0.07962008638647618, |
| "learning_rate": 0.0001, |
| "loss": 38.2545, |
| "loss/crossentropy": 2.7399823665618896, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2123335599899292, |
| "loss/reg": 35.30220413208008, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.024025, |
| "grad_norm": 3.438565254211426, |
| "grad_norm_var": 0.09833307021827574, |
| "learning_rate": 0.0001, |
| "loss": 38.371, |
| "loss/crossentropy": 2.8811817169189453, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25691962242126465, |
| "loss/reg": 35.232913970947266, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.02405, |
| "grad_norm": 2.7037241458892822, |
| "grad_norm_var": 0.10124614126430061, |
| "learning_rate": 0.0001, |
| "loss": 38.2645, |
| "loss/crossentropy": 2.8557090759277344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24377232789993286, |
| "loss/reg": 35.1650505065918, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.024075, |
| "grad_norm": 2.69218111038208, |
| "grad_norm_var": 0.10073533014069654, |
| "learning_rate": 0.0001, |
| "loss": 38.0522, |
| "loss/crossentropy": 2.7217583656311035, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2358003854751587, |
| "loss/reg": 35.09463882446289, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.0241, |
| "grad_norm": 3.008634090423584, |
| "grad_norm_var": 0.10103115408336763, |
| "learning_rate": 0.0001, |
| "loss": 38.3344, |
| "loss/crossentropy": 3.036750316619873, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2726588249206543, |
| "loss/reg": 35.02499771118164, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.024125, |
| "grad_norm": 2.662174940109253, |
| "grad_norm_var": 0.10451155871548182, |
| "learning_rate": 0.0001, |
| "loss": 38.3726, |
| "loss/crossentropy": 3.1680073738098145, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24728718400001526, |
| "loss/reg": 34.95726013183594, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.02415, |
| "grad_norm": 2.8360965251922607, |
| "grad_norm_var": 0.10480316259746587, |
| "learning_rate": 0.0001, |
| "loss": 37.8496, |
| "loss/crossentropy": 2.714099884033203, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2466110736131668, |
| "loss/reg": 34.888893127441406, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.024175, |
| "grad_norm": 2.738298177719116, |
| "grad_norm_var": 0.08903093948484392, |
| "learning_rate": 0.0001, |
| "loss": 38.2404, |
| "loss/crossentropy": 3.152971029281616, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.268291711807251, |
| "loss/reg": 34.819091796875, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.0242, |
| "grad_norm": 2.773857831954956, |
| "grad_norm_var": 0.08932614783542002, |
| "learning_rate": 0.0001, |
| "loss": 38.0543, |
| "loss/crossentropy": 3.0643906593322754, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2395814061164856, |
| "loss/reg": 34.7503547668457, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.024225, |
| "grad_norm": 2.555189609527588, |
| "grad_norm_var": 0.09455968532326603, |
| "learning_rate": 0.0001, |
| "loss": 37.7541, |
| "loss/crossentropy": 2.834400177001953, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23769161105155945, |
| "loss/reg": 34.68204879760742, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.02425, |
| "grad_norm": 3.086888551712036, |
| "grad_norm_var": 0.08781776896609182, |
| "learning_rate": 0.0001, |
| "loss": 37.8627, |
| "loss/crossentropy": 2.9810502529144287, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26743876934051514, |
| "loss/reg": 34.61421585083008, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.024275, |
| "grad_norm": 4.088360786437988, |
| "grad_norm_var": 0.17911672859183908, |
| "learning_rate": 0.0001, |
| "loss": 37.7253, |
| "loss/crossentropy": 2.8682823181152344, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.30868783593177795, |
| "loss/reg": 34.54836654663086, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.0243, |
| "grad_norm": 3.1919515132904053, |
| "grad_norm_var": 0.1819625200877444, |
| "learning_rate": 0.0001, |
| "loss": 37.4812, |
| "loss/crossentropy": 2.7390143871307373, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.26100480556488037, |
| "loss/reg": 34.481136322021484, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.024325, |
| "grad_norm": 2.575105667114258, |
| "grad_norm_var": 0.19063724665019186, |
| "learning_rate": 0.0001, |
| "loss": 37.4356, |
| "loss/crossentropy": 2.8003249168395996, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2204829901456833, |
| "loss/reg": 34.41484069824219, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.02435, |
| "grad_norm": 2.9630026817321777, |
| "grad_norm_var": 0.16218006358210366, |
| "learning_rate": 0.0001, |
| "loss": 37.519, |
| "loss/crossentropy": 2.894831657409668, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2747398912906647, |
| "loss/reg": 34.34943771362305, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.024375, |
| "grad_norm": 2.6025750637054443, |
| "grad_norm_var": 0.16426326415777884, |
| "learning_rate": 0.0001, |
| "loss": 37.0734, |
| "loss/crossentropy": 2.554583787918091, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23361030220985413, |
| "loss/reg": 34.28517532348633, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.0244, |
| "grad_norm": 2.6642050743103027, |
| "grad_norm_var": 0.15927148910242578, |
| "learning_rate": 0.0001, |
| "loss": 37.4822, |
| "loss/crossentropy": 3.0164380073547363, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24384766817092896, |
| "loss/reg": 34.22189712524414, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.024425, |
| "grad_norm": 3.38037371635437, |
| "grad_norm_var": 0.15539215192371383, |
| "learning_rate": 0.0001, |
| "loss": 37.2464, |
| "loss/crossentropy": 2.8206450939178467, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2702368497848511, |
| "loss/reg": 34.155494689941406, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.02445, |
| "grad_norm": 4.222375392913818, |
| "grad_norm_var": 0.2582409245142865, |
| "learning_rate": 0.0001, |
| "loss": 37.0559, |
| "loss/crossentropy": 2.7178001403808594, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24727877974510193, |
| "loss/reg": 34.0908317565918, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.024475, |
| "grad_norm": 2.7761597633361816, |
| "grad_norm_var": 0.2552061228003131, |
| "learning_rate": 0.0001, |
| "loss": 37.1297, |
| "loss/crossentropy": 2.860049247741699, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24505718052387238, |
| "loss/reg": 34.02457046508789, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.0245, |
| "grad_norm": 3.6322226524353027, |
| "grad_norm_var": 0.2795770565181319, |
| "learning_rate": 0.0001, |
| "loss": 36.8498, |
| "loss/crossentropy": 2.6204307079315186, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2698652446269989, |
| "loss/reg": 33.95948791503906, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.024525, |
| "grad_norm": 2.981858491897583, |
| "grad_norm_var": 0.2695698630538421, |
| "learning_rate": 0.0001, |
| "loss": 36.9264, |
| "loss/crossentropy": 2.737006187438965, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.29314208030700684, |
| "loss/reg": 33.89624786376953, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.02455, |
| "grad_norm": 2.74503755569458, |
| "grad_norm_var": 0.2728889013063165, |
| "learning_rate": 0.0001, |
| "loss": 36.943, |
| "loss/crossentropy": 2.8646838665008545, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24609914422035217, |
| "loss/reg": 33.8322639465332, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.024575, |
| "grad_norm": 3.004685640335083, |
| "grad_norm_var": 0.26585896787462554, |
| "learning_rate": 0.0001, |
| "loss": 36.8484, |
| "loss/crossentropy": 2.8109850883483887, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27065545320510864, |
| "loss/reg": 33.76673889160156, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.0246, |
| "grad_norm": 2.5903549194335938, |
| "grad_norm_var": 0.2753986673068833, |
| "learning_rate": 0.0001, |
| "loss": 36.6906, |
| "loss/crossentropy": 2.7470083236694336, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23907317221164703, |
| "loss/reg": 33.70456314086914, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.024625, |
| "grad_norm": 3.030545234680176, |
| "grad_norm_var": 0.2571285872129806, |
| "learning_rate": 0.0001, |
| "loss": 36.79, |
| "loss/crossentropy": 2.900418996810913, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.24839098751544952, |
| "loss/reg": 33.64120101928711, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.02465, |
| "grad_norm": 4.879204750061035, |
| "grad_norm_var": 0.45573044942737867, |
| "learning_rate": 0.0001, |
| "loss": 36.6408, |
| "loss/crossentropy": 2.826655626296997, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2366310954093933, |
| "loss/reg": 33.57756042480469, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.024675, |
| "grad_norm": 2.7205452919006348, |
| "grad_norm_var": 0.4121069666646612, |
| "learning_rate": 0.0001, |
| "loss": 36.6661, |
| "loss/crossentropy": 2.88576078414917, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2657344341278076, |
| "loss/reg": 33.51458740234375, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.0247, |
| "grad_norm": 2.815150737762451, |
| "grad_norm_var": 0.4174920306889968, |
| "learning_rate": 0.0001, |
| "loss": 36.4976, |
| "loss/crossentropy": 2.8254778385162354, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22087544202804565, |
| "loss/reg": 33.45124435424805, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.024725, |
| "grad_norm": 2.422433614730835, |
| "grad_norm_var": 0.4296126043076429, |
| "learning_rate": 0.0001, |
| "loss": 36.3345, |
| "loss/crossentropy": 2.748582124710083, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.19592465460300446, |
| "loss/reg": 33.390045166015625, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.02475, |
| "grad_norm": 3.05722713470459, |
| "grad_norm_var": 0.42857927278229774, |
| "learning_rate": 0.0001, |
| "loss": 36.6306, |
| "loss/crossentropy": 3.035149574279785, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2665707468986511, |
| "loss/reg": 33.32892990112305, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.024775, |
| "grad_norm": 2.711097240447998, |
| "grad_norm_var": 0.42218565465504604, |
| "learning_rate": 0.0001, |
| "loss": 36.1321, |
| "loss/crossentropy": 2.623349189758301, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.23991812765598297, |
| "loss/reg": 33.26885986328125, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.0248, |
| "grad_norm": 2.7607216835021973, |
| "grad_norm_var": 0.4171327516174946, |
| "learning_rate": 0.0001, |
| "loss": 36.2218, |
| "loss/crossentropy": 2.785637140274048, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.22857247292995453, |
| "loss/reg": 33.20759201049805, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.024825, |
| "grad_norm": 2.80138897895813, |
| "grad_norm_var": 0.4170671328116583, |
| "learning_rate": 0.0001, |
| "loss": 36.5573, |
| "loss/crossentropy": 3.1539735794067383, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2574368119239807, |
| "loss/reg": 33.14588165283203, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.02485, |
| "grad_norm": 2.9705312252044678, |
| "grad_norm_var": 0.3229893124537945, |
| "learning_rate": 0.0001, |
| "loss": 36.1044, |
| "loss/crossentropy": 2.742678165435791, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.27794939279556274, |
| "loss/reg": 33.08378219604492, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.024875, |
| "grad_norm": 3.338322401046753, |
| "grad_norm_var": 0.3264354213345731, |
| "learning_rate": 0.0001, |
| "loss": 36.47, |
| "loss/crossentropy": 3.1671180725097656, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2787696421146393, |
| "loss/reg": 33.024147033691406, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.0249, |
| "grad_norm": 2.7873196601867676, |
| "grad_norm_var": 0.3030776384523582, |
| "learning_rate": 0.0001, |
| "loss": 36.0396, |
| "loss/crossentropy": 2.798398733139038, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2761346995830536, |
| "loss/reg": 32.96507263183594, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.024925, |
| "grad_norm": 2.831047534942627, |
| "grad_norm_var": 0.3043818652657305, |
| "learning_rate": 0.0001, |
| "loss": 35.9753, |
| "loss/crossentropy": 2.8229103088378906, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2463880479335785, |
| "loss/reg": 32.90603256225586, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.02495, |
| "grad_norm": 2.631510019302368, |
| "grad_norm_var": 0.3085412012750756, |
| "learning_rate": 0.0001, |
| "loss": 35.9105, |
| "loss/crossentropy": 2.8491134643554688, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.21344730257987976, |
| "loss/reg": 32.84790802001953, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.024975, |
| "grad_norm": 2.7426092624664307, |
| "grad_norm_var": 0.311255195789613, |
| "learning_rate": 0.0001, |
| "loss": 35.8303, |
| "loss/crossentropy": 2.779572010040283, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.2633829116821289, |
| "loss/reg": 32.78731155395508, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.025, |
| "grad_norm": 2.9469587802886963, |
| "grad_norm_var": 0.30242983856450084, |
| "learning_rate": 0.0001, |
| "loss": 36.044, |
| "loss/crossentropy": 3.054746150970459, |
| "loss/hidden": 0.0, |
| "loss/logits": 0.25968533754348755, |
| "loss/reg": 32.72958755493164, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 40000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 9223372036854775807, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": true, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.455688167424e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |