diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24018 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 250, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000125, + "grad_norm": 4.097814559936523, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.1655, + "loss/crossentropy": 2.343535900115967, + "loss/hidden": 0.9296875, + "loss/logits": 0.17379230260849, + "loss/reg": 0.006198255345225334, + "step": 1 + }, + { + "epoch": 0.00025, + "grad_norm": 3.662576913833618, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.4973, + "loss/crossentropy": 2.318769931793213, + "loss/hidden": 1.1875, + "loss/logits": 0.24786217510700226, + "loss/reg": 0.006198255345225334, + "step": 2 + }, + { + "epoch": 0.000375, + "grad_norm": 2.8296749591827393, + "learning_rate": 3e-06, + "loss": 1.2258, + "loss/crossentropy": 2.4907937049865723, + "loss/hidden": 0.97265625, + "loss/logits": 0.19112952053546906, + "loss/reg": 0.006198245566338301, + "step": 3 + }, + { + "epoch": 0.0005, + "grad_norm": 3.057624578475952, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1136, + "loss/crossentropy": 2.744520902633667, + "loss/hidden": 0.890625, + "loss/logits": 0.16101403534412384, + "loss/reg": 0.006198232993483543, + "step": 4 + }, + { + "epoch": 0.000625, + "grad_norm": 2.7055587768554688, + "learning_rate": 5e-06, + "loss": 1.1943, + "loss/crossentropy": 2.5722062587738037, + "loss/hidden": 0.94921875, + "loss/logits": 0.18310005962848663, + "loss/reg": 0.0061982134357094765, + "step": 5 + }, + { + "epoch": 0.00075, + "grad_norm": 3.789276361465454, + "learning_rate": 6e-06, + "loss": 1.247, + "loss/crossentropy": 2.613312005996704, + "loss/hidden": 1.0078125, + "loss/logits": 0.17725251615047455, + "loss/reg": 0.006198191549628973, + "step": 6 + }, + { + "epoch": 0.000875, + "grad_norm": 3.997910499572754, + "learning_rate": 7.000000000000001e-06, + "loss": 1.4206, + "loss/crossentropy": 2.4207534790039062, + "loss/hidden": 1.125, + "loss/logits": 0.2336406409740448, + "loss/reg": 0.006198164541274309, + "step": 7 + }, + { + "epoch": 0.001, + "grad_norm": 2.5986244678497314, + "learning_rate": 8.000000000000001e-06, + "loss": 1.0878, + "loss/crossentropy": 2.536424160003662, + "loss/hidden": 0.8671875, + "loss/logits": 0.1585812270641327, + "loss/reg": 0.006198132876306772, + "step": 8 + }, + { + "epoch": 0.001125, + "grad_norm": 2.2757976055145264, + "learning_rate": 9e-06, + "loss": 1.1175, + "loss/crossentropy": 2.745281219482422, + "loss/hidden": 0.89453125, + "loss/logits": 0.16094230115413666, + "loss/reg": 0.006198094692081213, + "step": 9 + }, + { + "epoch": 0.00125, + "grad_norm": 2.261094808578491, + "learning_rate": 1e-05, + "loss": 1.0803, + "loss/crossentropy": 2.3173577785491943, + "loss/hidden": 0.8671875, + "loss/logits": 0.15108685195446014, + "loss/reg": 0.0061980499885976315, + "step": 10 + }, + { + "epoch": 0.001375, + "grad_norm": 21.777265548706055, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.0501, + "loss/crossentropy": 3.2122714519500732, + "loss/hidden": 1.7109375, + "loss/logits": 0.27713608741760254, + "loss/reg": 0.006198008079081774, + "step": 11 + }, + { + "epoch": 0.0015, + "grad_norm": 2.5655505657196045, + "learning_rate": 1.2e-05, + "loss": 1.151, + "loss/crossentropy": 2.706430196762085, + "loss/hidden": 0.8984375, + "loss/logits": 0.19056561589241028, + "loss/reg": 0.0061979577876627445, + "step": 12 + }, + { + "epoch": 0.001625, + "grad_norm": 2.403053045272827, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.0719, + "loss/crossentropy": 2.0466296672821045, + "loss/hidden": 0.88671875, + "loss/logits": 0.12316589802503586, + "loss/reg": 0.0061978911980986595, + "step": 13 + }, + { + "epoch": 0.00175, + "grad_norm": 3.840881586074829, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.5441, + "loss/crossentropy": 2.3191423416137695, + "loss/hidden": 1.234375, + "loss/logits": 0.24779079854488373, + "loss/reg": 0.00619781669229269, + "step": 14 + }, + { + "epoch": 0.001875, + "grad_norm": 2.557331085205078, + "learning_rate": 1.5e-05, + "loss": 0.9444, + "loss/crossentropy": 2.6370084285736084, + "loss/hidden": 0.76953125, + "loss/logits": 0.11287336051464081, + "loss/reg": 0.006197733338922262, + "step": 15 + }, + { + "epoch": 0.002, + "grad_norm": 3.1850404739379883, + "grad_norm_var": 22.31061335402559, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3213, + "loss/crossentropy": 2.676577091217041, + "loss/hidden": 1.0546875, + "loss/logits": 0.2046227753162384, + "loss/reg": 0.006197639741003513, + "step": 16 + }, + { + "epoch": 0.002125, + "grad_norm": 2.2587289810180664, + "grad_norm_var": 22.553268201402446, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.0312, + "loss/crossentropy": 2.4961040019989014, + "loss/hidden": 0.8203125, + "loss/logits": 0.148894801735878, + "loss/reg": 0.006197560112923384, + "step": 17 + }, + { + "epoch": 0.00225, + "grad_norm": 3.3259811401367188, + "grad_norm_var": 22.58044614452358, + "learning_rate": 1.8e-05, + "loss": 1.3626, + "loss/crossentropy": 2.5914387702941895, + "loss/hidden": 1.046875, + "loss/logits": 0.25370728969573975, + "loss/reg": 0.006197475362569094, + "step": 18 + }, + { + "epoch": 0.002375, + "grad_norm": 2.468914747238159, + "grad_norm_var": 22.649171856957494, + "learning_rate": 1.9e-05, + "loss": 1.1683, + "loss/crossentropy": 2.6096584796905518, + "loss/hidden": 0.921875, + "loss/logits": 0.18447336554527283, + "loss/reg": 0.00619738781824708, + "step": 19 + }, + { + "epoch": 0.0025, + "grad_norm": 2.3097646236419678, + "grad_norm_var": 22.784756315801523, + "learning_rate": 2e-05, + "loss": 1.1605, + "loss/crossentropy": 2.299048662185669, + "loss/hidden": 0.9375, + "loss/logits": 0.16106057167053223, + "loss/reg": 0.006197274662554264, + "step": 20 + }, + { + "epoch": 0.002625, + "grad_norm": 2.1111207008361816, + "grad_norm_var": 22.911025462198744, + "learning_rate": 2.1e-05, + "loss": 0.939, + "loss/crossentropy": 2.547258138656616, + "loss/hidden": 0.75, + "loss/logits": 0.12698382139205933, + "loss/reg": 0.006197154987603426, + "step": 21 + }, + { + "epoch": 0.00275, + "grad_norm": 2.4918222427368164, + "grad_norm_var": 23.049732177187614, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.2047, + "loss/crossentropy": 2.2802374362945557, + "loss/hidden": 0.953125, + "loss/logits": 0.18965375423431396, + "loss/reg": 0.006197045091539621, + "step": 22 + }, + { + "epoch": 0.002875, + "grad_norm": 3.3273494243621826, + "grad_norm_var": 23.069242834486193, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.2554, + "loss/crossentropy": 2.3062734603881836, + "loss/hidden": 1.0078125, + "loss/logits": 0.18566077947616577, + "loss/reg": 0.006196921691298485, + "step": 23 + }, + { + "epoch": 0.003, + "grad_norm": 2.5644068717956543, + "grad_norm_var": 23.075070365271714, + "learning_rate": 2.4e-05, + "loss": 1.2266, + "loss/crossentropy": 2.460878372192383, + "loss/hidden": 0.98046875, + "loss/logits": 0.18418912589550018, + "loss/reg": 0.006196786183863878, + "step": 24 + }, + { + "epoch": 0.003125, + "grad_norm": 2.3506264686584473, + "grad_norm_var": 23.059636834121356, + "learning_rate": 2.5e-05, + "loss": 1.0205, + "loss/crossentropy": 2.4281811714172363, + "loss/hidden": 0.82421875, + "loss/logits": 0.13434948027133942, + "loss/reg": 0.0061966474168002605, + "step": 25 + }, + { + "epoch": 0.00325, + "grad_norm": 2.25004506111145, + "grad_norm_var": 23.062003716592635, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.1133, + "loss/crossentropy": 2.326843500137329, + "loss/hidden": 0.9140625, + "loss/logits": 0.13725802302360535, + "loss/reg": 0.006196498870849609, + "step": 26 + }, + { + "epoch": 0.003375, + "grad_norm": 2.283770799636841, + "grad_norm_var": 0.2469546323472817, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.1459, + "loss/crossentropy": 2.3002493381500244, + "loss/hidden": 0.9140625, + "loss/logits": 0.16987068951129913, + "loss/reg": 0.006196335889399052, + "step": 27 + }, + { + "epoch": 0.0035, + "grad_norm": 2.805088758468628, + "grad_norm_var": 0.24805442740468303, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.0272, + "loss/crossentropy": 2.510472536087036, + "loss/hidden": 0.8359375, + "loss/logits": 0.12927240133285522, + "loss/reg": 0.006196176633238792, + "step": 28 + }, + { + "epoch": 0.003625, + "grad_norm": 2.0331132411956787, + "grad_norm_var": 0.2692014993258605, + "learning_rate": 2.9e-05, + "loss": 1.0913, + "loss/crossentropy": 2.51584529876709, + "loss/hidden": 0.87109375, + "loss/logits": 0.15820594131946564, + "loss/reg": 0.006195997819304466, + "step": 29 + }, + { + "epoch": 0.00375, + "grad_norm": 2.1523566246032715, + "grad_norm_var": 0.17596421900176604, + "learning_rate": 3e-05, + "loss": 1.0026, + "loss/crossentropy": 2.704220771789551, + "loss/hidden": 0.796875, + "loss/logits": 0.14372289180755615, + "loss/reg": 0.0061958180740475655, + "step": 30 + }, + { + "epoch": 0.003875, + "grad_norm": 2.6658694744110107, + "grad_norm_var": 0.1771001402109505, + "learning_rate": 3.1e-05, + "loss": 1.122, + "loss/crossentropy": 2.4840426445007324, + "loss/hidden": 0.89453125, + "loss/logits": 0.1655040979385376, + "loss/reg": 0.006195634603500366, + "step": 31 + }, + { + "epoch": 0.004, + "grad_norm": 2.813079595565796, + "grad_norm_var": 0.153583095436327, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.0653, + "loss/crossentropy": 2.442962646484375, + "loss/hidden": 0.859375, + "loss/logits": 0.14400474727153778, + "loss/reg": 0.00619542459025979, + "step": 32 + }, + { + "epoch": 0.004125, + "grad_norm": 2.4273953437805176, + "grad_norm_var": 0.1496371777315666, + "learning_rate": 3.3e-05, + "loss": 1.1025, + "loss/crossentropy": 2.515721559524536, + "loss/hidden": 0.89453125, + "loss/logits": 0.1460331827402115, + "loss/reg": 0.006195210851728916, + "step": 33 + }, + { + "epoch": 0.00425, + "grad_norm": 2.0594100952148438, + "grad_norm_var": 0.11442956053255457, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.118, + "loss/crossentropy": 2.5347506999969482, + "loss/hidden": 0.8984375, + "loss/logits": 0.15760375559329987, + "loss/reg": 0.006195001769810915, + "step": 34 + }, + { + "epoch": 0.004375, + "grad_norm": 2.497893810272217, + "grad_norm_var": 0.11457586733464495, + "learning_rate": 3.5e-05, + "loss": 1.2359, + "loss/crossentropy": 1.7681002616882324, + "loss/hidden": 1.0390625, + "loss/logits": 0.13490143418312073, + "loss/reg": 0.006194803398102522, + "step": 35 + }, + { + "epoch": 0.0045, + "grad_norm": 3.3231709003448486, + "grad_norm_var": 0.16029457606237638, + "learning_rate": 3.6e-05, + "loss": 1.3588, + "loss/crossentropy": 2.729518175125122, + "loss/hidden": 1.09375, + "loss/logits": 0.20313453674316406, + "loss/reg": 0.00619460316374898, + "step": 36 + }, + { + "epoch": 0.004625, + "grad_norm": 2.5542962551116943, + "grad_norm_var": 0.14901290879942408, + "learning_rate": 3.7e-05, + "loss": 1.1671, + "loss/crossentropy": 2.3359429836273193, + "loss/hidden": 0.9296875, + "loss/logits": 0.17546769976615906, + "loss/reg": 0.006194361485540867, + "step": 37 + }, + { + "epoch": 0.00475, + "grad_norm": 3.5138309001922607, + "grad_norm_var": 0.2080724542279834, + "learning_rate": 3.8e-05, + "loss": 1.2044, + "loss/crossentropy": 2.447890520095825, + "loss/hidden": 0.96484375, + "loss/logits": 0.17756858468055725, + "loss/reg": 0.0061941081658005714, + "step": 38 + }, + { + "epoch": 0.004875, + "grad_norm": 3.813410758972168, + "grad_norm_var": 0.2698887106917669, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.0819, + "loss/crossentropy": 2.766765832901001, + "loss/hidden": 0.88671875, + "loss/logits": 0.13325469195842743, + "loss/reg": 0.006193886045366526, + "step": 39 + }, + { + "epoch": 0.005, + "grad_norm": 3.1502718925476074, + "grad_norm_var": 0.2860816910243668, + "learning_rate": 4e-05, + "loss": 1.3622, + "loss/crossentropy": 2.3325388431549072, + "loss/hidden": 1.109375, + "loss/logits": 0.19087004661560059, + "loss/reg": 0.006193609442561865, + "step": 40 + }, + { + "epoch": 0.005125, + "grad_norm": 2.422366142272949, + "grad_norm_var": 0.28336421674108553, + "learning_rate": 4.1e-05, + "loss": 1.2212, + "loss/crossentropy": 2.3002498149871826, + "loss/hidden": 0.96875, + "loss/logits": 0.19054222106933594, + "loss/reg": 0.00619333703070879, + "step": 41 + }, + { + "epoch": 0.00525, + "grad_norm": 2.7353622913360596, + "grad_norm_var": 0.2707266796228128, + "learning_rate": 4.2e-05, + "loss": 1.0549, + "loss/crossentropy": 2.0319221019744873, + "loss/hidden": 0.87890625, + "loss/logits": 0.1140664741396904, + "loss/reg": 0.006193041335791349, + "step": 42 + }, + { + "epoch": 0.005375, + "grad_norm": 1.9425387382507324, + "grad_norm_var": 0.2970857034274398, + "learning_rate": 4.3e-05, + "loss": 1.0366, + "loss/crossentropy": 2.431666374206543, + "loss/hidden": 0.83203125, + "loss/logits": 0.1426728069782257, + "loss/reg": 0.006192733999341726, + "step": 43 + }, + { + "epoch": 0.0055, + "grad_norm": 2.7009642124176025, + "grad_norm_var": 0.2960522402202514, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.9824, + "loss/crossentropy": 2.391608476638794, + "loss/hidden": 0.78515625, + "loss/logits": 0.13533324003219604, + "loss/reg": 0.006192411296069622, + "step": 44 + }, + { + "epoch": 0.005625, + "grad_norm": 2.6632983684539795, + "grad_norm_var": 0.2669107471214488, + "learning_rate": 4.5e-05, + "loss": 1.1067, + "loss/crossentropy": 2.7733116149902344, + "loss/hidden": 0.87109375, + "loss/logits": 0.1736893653869629, + "loss/reg": 0.006192059256136417, + "step": 45 + }, + { + "epoch": 0.00575, + "grad_norm": 2.1037468910217285, + "grad_norm_var": 0.2707032714108967, + "learning_rate": 4.600000000000001e-05, + "loss": 0.9831, + "loss/crossentropy": 2.4606895446777344, + "loss/hidden": 0.7890625, + "loss/logits": 0.13213258981704712, + "loss/reg": 0.006191718857735395, + "step": 46 + }, + { + "epoch": 0.005875, + "grad_norm": 2.1911983489990234, + "grad_norm_var": 0.28768473978113296, + "learning_rate": 4.7e-05, + "loss": 0.9509, + "loss/crossentropy": 2.6825270652770996, + "loss/hidden": 0.76953125, + "loss/logits": 0.11942489445209503, + "loss/reg": 0.006191306747496128, + "step": 47 + }, + { + "epoch": 0.006, + "grad_norm": 3.2640700340270996, + "grad_norm_var": 0.30827796768009724, + "learning_rate": 4.8e-05, + "loss": 1.0346, + "loss/crossentropy": 2.3665199279785156, + "loss/hidden": 0.83203125, + "loss/logits": 0.14068934321403503, + "loss/reg": 0.0061909533105790615, + "step": 48 + }, + { + "epoch": 0.006125, + "grad_norm": 2.259894847869873, + "grad_norm_var": 0.3163475179157634, + "learning_rate": 4.9e-05, + "loss": 0.9647, + "loss/crossentropy": 2.4414587020874023, + "loss/hidden": 0.79296875, + "loss/logits": 0.10987477004528046, + "loss/reg": 0.0061905342154204845, + "step": 49 + }, + { + "epoch": 0.00625, + "grad_norm": 2.7616565227508545, + "grad_norm_var": 0.28721415330329, + "learning_rate": 5e-05, + "loss": 1.019, + "loss/crossentropy": 2.0829460620880127, + "loss/hidden": 0.83984375, + "loss/logits": 0.11724002659320831, + "loss/reg": 0.0061900559812784195, + "step": 50 + }, + { + "epoch": 0.006375, + "grad_norm": 2.7897861003875732, + "grad_norm_var": 0.28297568806904866, + "learning_rate": 5.1000000000000006e-05, + "loss": 0.853, + "loss/crossentropy": 2.5636909008026123, + "loss/hidden": 0.6953125, + "loss/logits": 0.09577471762895584, + "loss/reg": 0.00618965458124876, + "step": 51 + }, + { + "epoch": 0.0065, + "grad_norm": 2.3134403228759766, + "grad_norm_var": 0.2711290924819705, + "learning_rate": 5.2000000000000004e-05, + "loss": 1.0497, + "loss/crossentropy": 2.440258026123047, + "loss/hidden": 0.83984375, + "loss/logits": 0.14791719615459442, + "loss/reg": 0.006189141888171434, + "step": 52 + }, + { + "epoch": 0.006625, + "grad_norm": 2.2032997608184814, + "grad_norm_var": 0.2855897568404882, + "learning_rate": 5.300000000000001e-05, + "loss": 0.9934, + "loss/crossentropy": 2.4747955799102783, + "loss/hidden": 0.796875, + "loss/logits": 0.13461169600486755, + "loss/reg": 0.006188610102981329, + "step": 53 + }, + { + "epoch": 0.00675, + "grad_norm": 2.267400026321411, + "grad_norm_var": 0.24358579758792467, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.1149, + "loss/crossentropy": 2.705127477645874, + "loss/hidden": 0.89453125, + "loss/logits": 0.1585235595703125, + "loss/reg": 0.0061880191788077354, + "step": 54 + }, + { + "epoch": 0.006875, + "grad_norm": 2.281036853790283, + "grad_norm_var": 0.14220569464836952, + "learning_rate": 5.500000000000001e-05, + "loss": 0.9642, + "loss/crossentropy": 2.545010805130005, + "loss/hidden": 0.78515625, + "loss/logits": 0.11717304587364197, + "loss/reg": 0.006187579594552517, + "step": 55 + }, + { + "epoch": 0.007, + "grad_norm": 4.942420959472656, + "grad_norm_var": 0.4975759650139497, + "learning_rate": 5.6000000000000006e-05, + "loss": 1.1237, + "loss/crossentropy": 2.7698795795440674, + "loss/hidden": 0.91796875, + "loss/logits": 0.14385326206684113, + "loss/reg": 0.006187067367136478, + "step": 56 + }, + { + "epoch": 0.007125, + "grad_norm": 2.4213955402374268, + "grad_norm_var": 0.4976009733976563, + "learning_rate": 5.6999999999999996e-05, + "loss": 1.0386, + "loss/crossentropy": 2.572023868560791, + "loss/hidden": 0.84765625, + "loss/logits": 0.12909512221813202, + "loss/reg": 0.006186594720929861, + "step": 57 + }, + { + "epoch": 0.00725, + "grad_norm": 2.15891695022583, + "grad_norm_var": 0.5091253321428854, + "learning_rate": 5.8e-05, + "loss": 0.961, + "loss/crossentropy": 2.283557415008545, + "loss/hidden": 0.7734375, + "loss/logits": 0.12568500638008118, + "loss/reg": 0.006185955833643675, + "step": 58 + }, + { + "epoch": 0.007375, + "grad_norm": 2.36811900138855, + "grad_norm_var": 0.48432608682591366, + "learning_rate": 5.9e-05, + "loss": 0.8386, + "loss/crossentropy": 2.453810453414917, + "loss/hidden": 0.6796875, + "loss/logits": 0.09709502756595612, + "loss/reg": 0.0061853062361478806, + "step": 59 + }, + { + "epoch": 0.0075, + "grad_norm": 2.591327667236328, + "grad_norm_var": 0.4836842483889178, + "learning_rate": 6e-05, + "loss": 1.033, + "loss/crossentropy": 2.8110511302948, + "loss/hidden": 0.81640625, + "loss/logits": 0.1547423005104065, + "loss/reg": 0.006184632424265146, + "step": 60 + }, + { + "epoch": 0.007625, + "grad_norm": 2.0103816986083984, + "grad_norm_var": 0.5047142009615214, + "learning_rate": 6.1e-05, + "loss": 0.9296, + "loss/crossentropy": 2.15134334564209, + "loss/hidden": 0.7578125, + "loss/logits": 0.1099701076745987, + "loss/reg": 0.0061841062270104885, + "step": 61 + }, + { + "epoch": 0.00775, + "grad_norm": 1.80124831199646, + "grad_norm_var": 0.5287549745746596, + "learning_rate": 6.2e-05, + "loss": 0.9266, + "loss/crossentropy": 2.7054479122161865, + "loss/hidden": 0.7421875, + "loss/logits": 0.12253857403993607, + "loss/reg": 0.0061835781671106815, + "step": 62 + }, + { + "epoch": 0.007875, + "grad_norm": 2.277440309524536, + "grad_norm_var": 0.5252193383179133, + "learning_rate": 6.3e-05, + "loss": 0.914, + "loss/crossentropy": 2.6631381511688232, + "loss/hidden": 0.734375, + "loss/logits": 0.1177992895245552, + "loss/reg": 0.0061830319464206696, + "step": 63 + }, + { + "epoch": 0.008, + "grad_norm": 3.3314151763916016, + "grad_norm_var": 0.531964164332922, + "learning_rate": 6.400000000000001e-05, + "loss": 1.29, + "loss/crossentropy": 2.1269633769989014, + "loss/hidden": 1.0625, + "loss/logits": 0.16565865278244019, + "loss/reg": 0.006182366982102394, + "step": 64 + }, + { + "epoch": 0.008125, + "grad_norm": 4.333358287811279, + "grad_norm_var": 0.7208240839518936, + "learning_rate": 6.500000000000001e-05, + "loss": 1.1615, + "loss/crossentropy": 2.714442491531372, + "loss/hidden": 0.94140625, + "loss/logits": 0.15825161337852478, + "loss/reg": 0.006181675940752029, + "step": 65 + }, + { + "epoch": 0.00825, + "grad_norm": 2.853740930557251, + "grad_norm_var": 0.7223776199927481, + "learning_rate": 6.6e-05, + "loss": 1.062, + "loss/crossentropy": 2.2147135734558105, + "loss/hidden": 0.8515625, + "loss/logits": 0.14859826862812042, + "loss/reg": 0.006180979777127504, + "step": 66 + }, + { + "epoch": 0.008375, + "grad_norm": 2.8853657245635986, + "grad_norm_var": 0.7242961395218184, + "learning_rate": 6.7e-05, + "loss": 0.9533, + "loss/crossentropy": 2.619598388671875, + "loss/hidden": 0.7734375, + "loss/logits": 0.11804014444351196, + "loss/reg": 0.006180332973599434, + "step": 67 + }, + { + "epoch": 0.0085, + "grad_norm": 2.725229501724243, + "grad_norm_var": 0.7142181363616674, + "learning_rate": 6.800000000000001e-05, + "loss": 1.1308, + "loss/crossentropy": 2.4091367721557617, + "loss/hidden": 0.90234375, + "loss/logits": 0.16662752628326416, + "loss/reg": 0.006179714575409889, + "step": 68 + }, + { + "epoch": 0.008625, + "grad_norm": 2.93643856048584, + "grad_norm_var": 0.6977178730278022, + "learning_rate": 6.9e-05, + "loss": 1.1414, + "loss/crossentropy": 2.509793281555176, + "loss/hidden": 0.90234375, + "loss/logits": 0.17730477452278137, + "loss/reg": 0.0061789220198988914, + "step": 69 + }, + { + "epoch": 0.00875, + "grad_norm": 2.4086973667144775, + "grad_norm_var": 0.6896555586144653, + "learning_rate": 7e-05, + "loss": 0.9852, + "loss/crossentropy": 2.7080371379852295, + "loss/hidden": 0.7890625, + "loss/logits": 0.1343374401330948, + "loss/reg": 0.0061781019903719425, + "step": 70 + }, + { + "epoch": 0.008875, + "grad_norm": 1.9355547428131104, + "grad_norm_var": 0.7196579708330165, + "learning_rate": 7.1e-05, + "loss": 0.9176, + "loss/crossentropy": 2.451488494873047, + "loss/hidden": 0.7421875, + "loss/logits": 0.11365102231502533, + "loss/reg": 0.006177456583827734, + "step": 71 + }, + { + "epoch": 0.009, + "grad_norm": 2.273902654647827, + "grad_norm_var": 0.38422972669649574, + "learning_rate": 7.2e-05, + "loss": 1.0112, + "loss/crossentropy": 2.4479947090148926, + "loss/hidden": 0.8125, + "loss/logits": 0.13690924644470215, + "loss/reg": 0.006176764145493507, + "step": 72 + }, + { + "epoch": 0.009125, + "grad_norm": 3.385849952697754, + "grad_norm_var": 0.4217084598233742, + "learning_rate": 7.3e-05, + "loss": 1.3992, + "loss/crossentropy": 2.3916804790496826, + "loss/hidden": 1.1484375, + "loss/logits": 0.18896484375, + "loss/reg": 0.006176079623401165, + "step": 73 + }, + { + "epoch": 0.00925, + "grad_norm": 1.893932580947876, + "grad_norm_var": 0.44317594415441114, + "learning_rate": 7.4e-05, + "loss": 0.9357, + "loss/crossentropy": 2.3809518814086914, + "loss/hidden": 0.74609375, + "loss/logits": 0.12787015736103058, + "loss/reg": 0.00617539556697011, + "step": 74 + }, + { + "epoch": 0.009375, + "grad_norm": 2.431032657623291, + "grad_norm_var": 0.4412621914582907, + "learning_rate": 7.500000000000001e-05, + "loss": 1.0796, + "loss/crossentropy": 2.5346295833587646, + "loss/hidden": 0.86328125, + "loss/logits": 0.1545613557100296, + "loss/reg": 0.006174764130264521, + "step": 75 + }, + { + "epoch": 0.0095, + "grad_norm": 2.2421321868896484, + "grad_norm_var": 0.45066905079875685, + "learning_rate": 7.6e-05, + "loss": 0.9869, + "loss/crossentropy": 2.756843090057373, + "loss/hidden": 0.796875, + "loss/logits": 0.1282375454902649, + "loss/reg": 0.006174163427203894, + "step": 76 + }, + { + "epoch": 0.009625, + "grad_norm": 2.7022979259490967, + "grad_norm_var": 0.4254703741989109, + "learning_rate": 7.7e-05, + "loss": 1.2503, + "loss/crossentropy": 2.0696699619293213, + "loss/hidden": 1.015625, + "loss/logits": 0.1729813814163208, + "loss/reg": 0.006173421163111925, + "step": 77 + }, + { + "epoch": 0.00975, + "grad_norm": 2.501106023788452, + "grad_norm_var": 0.37677934250983375, + "learning_rate": 7.800000000000001e-05, + "loss": 1.0516, + "loss/crossentropy": 2.629380941390991, + "loss/hidden": 0.83984375, + "loss/logits": 0.15003597736358643, + "loss/reg": 0.006172672379761934, + "step": 78 + }, + { + "epoch": 0.009875, + "grad_norm": 2.137601137161255, + "grad_norm_var": 0.3857841035513881, + "learning_rate": 7.900000000000001e-05, + "loss": 0.9388, + "loss/crossentropy": 2.6841280460357666, + "loss/hidden": 0.75, + "loss/logits": 0.12706515192985535, + "loss/reg": 0.006171974819153547, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 4.655951976776123, + "grad_norm_var": 0.6093991769416703, + "learning_rate": 8e-05, + "loss": 1.2659, + "loss/crossentropy": 2.4634439945220947, + "loss/hidden": 1.0390625, + "loss/logits": 0.16511483490467072, + "loss/reg": 0.006171175744384527, + "step": 80 + }, + { + "epoch": 0.010125, + "grad_norm": 2.2418179512023926, + "grad_norm_var": 0.44652068466097317, + "learning_rate": 8.1e-05, + "loss": 1.0773, + "loss/crossentropy": 2.479743480682373, + "loss/hidden": 0.87890625, + "loss/logits": 0.1366729438304901, + "loss/reg": 0.006170437205582857, + "step": 81 + }, + { + "epoch": 0.01025, + "grad_norm": 2.0470192432403564, + "grad_norm_var": 0.4640077865797357, + "learning_rate": 8.2e-05, + "loss": 0.8599, + "loss/crossentropy": 2.440803050994873, + "loss/hidden": 0.68359375, + "loss/logits": 0.11458206921815872, + "loss/reg": 0.0061697582714259624, + "step": 82 + }, + { + "epoch": 0.010375, + "grad_norm": 2.0131125450134277, + "grad_norm_var": 0.47694604476552793, + "learning_rate": 8.3e-05, + "loss": 0.8585, + "loss/crossentropy": 2.480877637863159, + "loss/hidden": 0.6875, + "loss/logits": 0.10927767306566238, + "loss/reg": 0.006169027183204889, + "step": 83 + }, + { + "epoch": 0.0105, + "grad_norm": 2.2644267082214355, + "grad_norm_var": 0.47842071328175656, + "learning_rate": 8.4e-05, + "loss": 0.8351, + "loss/crossentropy": 2.693246841430664, + "loss/hidden": 0.67578125, + "loss/logits": 0.09764716029167175, + "loss/reg": 0.006168315652757883, + "step": 84 + }, + { + "epoch": 0.010625, + "grad_norm": 3.1729207038879395, + "grad_norm_var": 0.4955376038232837, + "learning_rate": 8.5e-05, + "loss": 1.2314, + "loss/crossentropy": 2.3339309692382812, + "loss/hidden": 1.015625, + "loss/logits": 0.15408015251159668, + "loss/reg": 0.006167604587972164, + "step": 85 + }, + { + "epoch": 0.01075, + "grad_norm": 2.281872510910034, + "grad_norm_var": 0.4984116504809473, + "learning_rate": 8.6e-05, + "loss": 1.1113, + "loss/crossentropy": 2.410794258117676, + "loss/hidden": 0.8828125, + "loss/logits": 0.16686803102493286, + "loss/reg": 0.0061669000424444675, + "step": 86 + }, + { + "epoch": 0.010875, + "grad_norm": 2.701244354248047, + "grad_norm_var": 0.4762769450482454, + "learning_rate": 8.7e-05, + "loss": 0.9115, + "loss/crossentropy": 2.5270962715148926, + "loss/hidden": 0.73046875, + "loss/logits": 0.11935658752918243, + "loss/reg": 0.0061660343781113625, + "step": 87 + }, + { + "epoch": 0.011, + "grad_norm": 2.0738677978515625, + "grad_norm_var": 0.4863854399313406, + "learning_rate": 8.800000000000001e-05, + "loss": 0.9634, + "loss/crossentropy": 2.625903844833374, + "loss/hidden": 0.7734375, + "loss/logits": 0.12826378643512726, + "loss/reg": 0.006165289785712957, + "step": 88 + }, + { + "epoch": 0.011125, + "grad_norm": 2.827744245529175, + "grad_norm_var": 0.44340376520124375, + "learning_rate": 8.900000000000001e-05, + "loss": 1.0134, + "loss/crossentropy": 2.2436654567718506, + "loss/hidden": 0.80078125, + "loss/logits": 0.15097512304782867, + "loss/reg": 0.006164397578686476, + "step": 89 + }, + { + "epoch": 0.01125, + "grad_norm": 2.412203788757324, + "grad_norm_var": 0.4174983019540292, + "learning_rate": 9e-05, + "loss": 0.9541, + "loss/crossentropy": 2.4847052097320557, + "loss/hidden": 0.78515625, + "loss/logits": 0.10735376924276352, + "loss/reg": 0.006163434591144323, + "step": 90 + }, + { + "epoch": 0.011375, + "grad_norm": 2.385309934616089, + "grad_norm_var": 0.41831854842319344, + "learning_rate": 9.1e-05, + "loss": 1.0455, + "loss/crossentropy": 2.1011688709259033, + "loss/hidden": 0.828125, + "loss/logits": 0.15577414631843567, + "loss/reg": 0.0061626131646335125, + "step": 91 + }, + { + "epoch": 0.0115, + "grad_norm": 2.779266595840454, + "grad_norm_var": 0.4149256226543306, + "learning_rate": 9.200000000000001e-05, + "loss": 0.9782, + "loss/crossentropy": 2.770954132080078, + "loss/hidden": 0.78125, + "loss/logits": 0.13530117273330688, + "loss/reg": 0.006161784287542105, + "step": 92 + }, + { + "epoch": 0.011625, + "grad_norm": 2.816206216812134, + "grad_norm_var": 0.41767206123470924, + "learning_rate": 9.300000000000001e-05, + "loss": 1.2584, + "loss/crossentropy": 2.4919488430023193, + "loss/hidden": 1.0234375, + "loss/logits": 0.17335021495819092, + "loss/reg": 0.006160792429000139, + "step": 93 + }, + { + "epoch": 0.01175, + "grad_norm": 2.1000349521636963, + "grad_norm_var": 0.4320504871954351, + "learning_rate": 9.4e-05, + "loss": 0.9293, + "loss/crossentropy": 2.6951355934143066, + "loss/hidden": 0.7421875, + "loss/logits": 0.12551091611385345, + "loss/reg": 0.006159830838441849, + "step": 94 + }, + { + "epoch": 0.011875, + "grad_norm": 2.6696228981018066, + "grad_norm_var": 0.4199965621062515, + "learning_rate": 9.5e-05, + "loss": 1.0491, + "loss/crossentropy": 2.6532485485076904, + "loss/hidden": 0.83984375, + "loss/logits": 0.14771661162376404, + "loss/reg": 0.006158801261335611, + "step": 95 + }, + { + "epoch": 0.012, + "grad_norm": 2.308758020401001, + "grad_norm_var": 0.11782165750081125, + "learning_rate": 9.6e-05, + "loss": 1.1178, + "loss/crossentropy": 2.38185977935791, + "loss/hidden": 0.90625, + "loss/logits": 0.1499352604150772, + "loss/reg": 0.006157839670777321, + "step": 96 + }, + { + "epoch": 0.012125, + "grad_norm": 2.4204304218292236, + "grad_norm_var": 0.11501335190634426, + "learning_rate": 9.7e-05, + "loss": 1.092, + "loss/crossentropy": 2.4358534812927246, + "loss/hidden": 0.86328125, + "loss/logits": 0.16712763905525208, + "loss/reg": 0.006156752817332745, + "step": 97 + }, + { + "epoch": 0.01225, + "grad_norm": 3.7184524536132812, + "grad_norm_var": 0.198780236272727, + "learning_rate": 9.8e-05, + "loss": 1.4311, + "loss/crossentropy": 2.1283679008483887, + "loss/hidden": 1.171875, + "loss/logits": 0.1976230889558792, + "loss/reg": 0.006155804730951786, + "step": 98 + }, + { + "epoch": 0.012375, + "grad_norm": 3.2656571865081787, + "grad_norm_var": 0.20565265002658914, + "learning_rate": 9.900000000000001e-05, + "loss": 1.017, + "loss/crossentropy": 2.6715664863586426, + "loss/hidden": 0.80078125, + "loss/logits": 0.15465494990348816, + "loss/reg": 0.006154791917651892, + "step": 99 + }, + { + "epoch": 0.0125, + "grad_norm": 2.915663719177246, + "grad_norm_var": 0.19977570339779593, + "learning_rate": 0.0001, + "loss": 0.98, + "loss/crossentropy": 2.5455305576324463, + "loss/hidden": 0.77734375, + "loss/logits": 0.1410846710205078, + "loss/reg": 0.0061536673456430435, + "step": 100 + }, + { + "epoch": 0.012625, + "grad_norm": 3.3153059482574463, + "grad_norm_var": 0.2104372314148539, + "learning_rate": 0.0001, + "loss": 1.1039, + "loss/crossentropy": 2.455479621887207, + "loss/hidden": 0.90625, + "loss/logits": 0.13615351915359497, + "loss/reg": 0.0061526307836174965, + "step": 101 + }, + { + "epoch": 0.01275, + "grad_norm": 2.40315318107605, + "grad_norm_var": 0.20480568897691, + "learning_rate": 0.0001, + "loss": 0.9588, + "loss/crossentropy": 2.6359853744506836, + "loss/hidden": 0.76953125, + "loss/logits": 0.1277719885110855, + "loss/reg": 0.006151493173092604, + "step": 102 + }, + { + "epoch": 0.012875, + "grad_norm": 3.625624895095825, + "grad_norm_var": 0.25903479701245613, + "learning_rate": 0.0001, + "loss": 1.2481, + "loss/crossentropy": 2.0148656368255615, + "loss/hidden": 1.046875, + "loss/logits": 0.13969773054122925, + "loss/reg": 0.006150420755147934, + "step": 103 + }, + { + "epoch": 0.013, + "grad_norm": 2.497906446456909, + "grad_norm_var": 0.23191354079432358, + "learning_rate": 0.0001, + "loss": 1.0603, + "loss/crossentropy": 2.3493525981903076, + "loss/hidden": 0.86328125, + "loss/logits": 0.13548779487609863, + "loss/reg": 0.006149281747639179, + "step": 104 + }, + { + "epoch": 0.013125, + "grad_norm": 3.258059501647949, + "grad_norm_var": 0.24629299643454275, + "learning_rate": 0.0001, + "loss": 0.9497, + "loss/crossentropy": 2.6988418102264404, + "loss/hidden": 0.7734375, + "loss/logits": 0.11473990976810455, + "loss/reg": 0.006148339249193668, + "step": 105 + }, + { + "epoch": 0.01325, + "grad_norm": 3.1279666423797607, + "grad_norm_var": 0.24075672502018505, + "learning_rate": 0.0001, + "loss": 1.1195, + "loss/crossentropy": 2.578716278076172, + "loss/hidden": 0.875, + "loss/logits": 0.18304204940795898, + "loss/reg": 0.006147205363959074, + "step": 106 + }, + { + "epoch": 0.013375, + "grad_norm": 2.760901927947998, + "grad_norm_var": 0.22627915570051277, + "learning_rate": 0.0001, + "loss": 0.9369, + "loss/crossentropy": 2.5835328102111816, + "loss/hidden": 0.75, + "loss/logits": 0.12544697523117065, + "loss/reg": 0.006146106868982315, + "step": 107 + }, + { + "epoch": 0.0135, + "grad_norm": 3.2917559146881104, + "grad_norm_var": 0.23622539643692994, + "learning_rate": 0.0001, + "loss": 1.1437, + "loss/crossentropy": 2.6001460552215576, + "loss/hidden": 0.91796875, + "loss/logits": 0.16428819298744202, + "loss/reg": 0.006144997663795948, + "step": 108 + }, + { + "epoch": 0.013625, + "grad_norm": 3.3908517360687256, + "grad_norm_var": 0.2499864352593607, + "learning_rate": 0.0001, + "loss": 1.0747, + "loss/crossentropy": 2.6003377437591553, + "loss/hidden": 0.87109375, + "loss/logits": 0.14213082194328308, + "loss/reg": 0.00614393362775445, + "step": 109 + }, + { + "epoch": 0.01375, + "grad_norm": 2.7455620765686035, + "grad_norm_var": 0.2035723185991922, + "learning_rate": 0.0001, + "loss": 1.1844, + "loss/crossentropy": 2.446432113647461, + "loss/hidden": 0.94921875, + "loss/logits": 0.17372827231884003, + "loss/reg": 0.00614282488822937, + "step": 110 + }, + { + "epoch": 0.013875, + "grad_norm": 2.899392604827881, + "grad_norm_var": 0.1972949454934593, + "learning_rate": 0.0001, + "loss": 1.0314, + "loss/crossentropy": 2.4233920574188232, + "loss/hidden": 0.83984375, + "loss/logits": 0.13018067181110382, + "loss/reg": 0.00614172825589776, + "step": 111 + }, + { + "epoch": 0.014, + "grad_norm": 2.204866647720337, + "grad_norm_var": 0.20749751086427656, + "learning_rate": 0.0001, + "loss": 0.9867, + "loss/crossentropy": 2.4006736278533936, + "loss/hidden": 0.79296875, + "loss/logits": 0.13233302533626556, + "loss/reg": 0.006140332669019699, + "step": 112 + }, + { + "epoch": 0.014125, + "grad_norm": 2.5094263553619385, + "grad_norm_var": 0.20123279411857975, + "learning_rate": 0.0001, + "loss": 1.2429, + "loss/crossentropy": 2.2730560302734375, + "loss/hidden": 1.0078125, + "loss/logits": 0.1737476885318756, + "loss/reg": 0.006138913799077272, + "step": 113 + }, + { + "epoch": 0.01425, + "grad_norm": 2.590543031692505, + "grad_norm_var": 0.17204464736018749, + "learning_rate": 0.0001, + "loss": 1.0086, + "loss/crossentropy": 2.5709896087646484, + "loss/hidden": 0.79296875, + "loss/logits": 0.1542350947856903, + "loss/reg": 0.0061377594247460365, + "step": 114 + }, + { + "epoch": 0.014375, + "grad_norm": 2.5024876594543457, + "grad_norm_var": 0.17379926494707643, + "learning_rate": 0.0001, + "loss": 1.0309, + "loss/crossentropy": 2.539165496826172, + "loss/hidden": 0.828125, + "loss/logits": 0.14142319560050964, + "loss/reg": 0.006136584095656872, + "step": 115 + }, + { + "epoch": 0.0145, + "grad_norm": 3.2216732501983643, + "grad_norm_var": 0.18121036366206128, + "learning_rate": 0.0001, + "loss": 0.9404, + "loss/crossentropy": 2.7685325145721436, + "loss/hidden": 0.765625, + "loss/logits": 0.1133967787027359, + "loss/reg": 0.006135319825261831, + "step": 116 + }, + { + "epoch": 0.014625, + "grad_norm": 2.3834009170532227, + "grad_norm_var": 0.18346146088524526, + "learning_rate": 0.0001, + "loss": 1.1432, + "loss/crossentropy": 2.4507999420166016, + "loss/hidden": 0.92578125, + "loss/logits": 0.1561031937599182, + "loss/reg": 0.006133983377367258, + "step": 117 + }, + { + "epoch": 0.01475, + "grad_norm": 2.4703636169433594, + "grad_norm_var": 0.17984383474256424, + "learning_rate": 0.0001, + "loss": 1.0541, + "loss/crossentropy": 2.3506076335906982, + "loss/hidden": 0.84765625, + "loss/logits": 0.14511807262897491, + "loss/reg": 0.006132753100246191, + "step": 118 + }, + { + "epoch": 0.014875, + "grad_norm": 2.5960817337036133, + "grad_norm_var": 0.13859654880591943, + "learning_rate": 0.0001, + "loss": 1.2156, + "loss/crossentropy": 2.427006244659424, + "loss/hidden": 0.96875, + "loss/logits": 0.1855170726776123, + "loss/reg": 0.006131566129624844, + "step": 119 + }, + { + "epoch": 0.015, + "grad_norm": 2.908734083175659, + "grad_norm_var": 0.13379147574996655, + "learning_rate": 0.0001, + "loss": 1.0136, + "loss/crossentropy": 2.4075210094451904, + "loss/hidden": 0.81640625, + "loss/logits": 0.13592825829982758, + "loss/reg": 0.006130332592874765, + "step": 120 + }, + { + "epoch": 0.015125, + "grad_norm": 3.450002670288086, + "grad_norm_var": 0.147717685364636, + "learning_rate": 0.0001, + "loss": 1.1584, + "loss/crossentropy": 2.446925640106201, + "loss/hidden": 0.92578125, + "loss/logits": 0.17129938304424286, + "loss/reg": 0.0061291721649467945, + "step": 121 + }, + { + "epoch": 0.01525, + "grad_norm": 2.941195011138916, + "grad_norm_var": 0.14212594790061886, + "learning_rate": 0.0001, + "loss": 1.0996, + "loss/crossentropy": 2.5499086380004883, + "loss/hidden": 0.87109375, + "loss/logits": 0.1672220528125763, + "loss/reg": 0.006127914879471064, + "step": 122 + }, + { + "epoch": 0.015375, + "grad_norm": 2.951799154281616, + "grad_norm_var": 0.14330143067309015, + "learning_rate": 0.0001, + "loss": 1.0862, + "loss/crossentropy": 2.654383420944214, + "loss/hidden": 0.87109375, + "loss/logits": 0.15379250049591064, + "loss/reg": 0.006126696243882179, + "step": 123 + }, + { + "epoch": 0.0155, + "grad_norm": 2.5093131065368652, + "grad_norm_var": 0.13194533540905293, + "learning_rate": 0.0001, + "loss": 1.0905, + "loss/crossentropy": 2.4646618366241455, + "loss/hidden": 0.87890625, + "loss/logits": 0.15029752254486084, + "loss/reg": 0.006125394720584154, + "step": 124 + }, + { + "epoch": 0.015625, + "grad_norm": 2.357142448425293, + "grad_norm_var": 0.11277765633995311, + "learning_rate": 0.0001, + "loss": 1.0794, + "loss/crossentropy": 2.4590322971343994, + "loss/hidden": 0.87109375, + "loss/logits": 0.1471107453107834, + "loss/reg": 0.0061240773648023605, + "step": 125 + }, + { + "epoch": 0.01575, + "grad_norm": 2.0443954467773438, + "grad_norm_var": 0.13949059079901172, + "learning_rate": 0.0001, + "loss": 1.0064, + "loss/crossentropy": 2.6105568408966064, + "loss/hidden": 0.80859375, + "loss/logits": 0.13658249378204346, + "loss/reg": 0.006122750695794821, + "step": 126 + }, + { + "epoch": 0.015875, + "grad_norm": 2.334003448486328, + "grad_norm_var": 0.1413326038540049, + "learning_rate": 0.0001, + "loss": 1.128, + "loss/crossentropy": 2.3226428031921387, + "loss/hidden": 0.8984375, + "loss/logits": 0.16836631298065186, + "loss/reg": 0.006121381651610136, + "step": 127 + }, + { + "epoch": 0.016, + "grad_norm": 2.6693766117095947, + "grad_norm_var": 0.12889249481462456, + "learning_rate": 0.0001, + "loss": 1.0478, + "loss/crossentropy": 2.5844597816467285, + "loss/hidden": 0.84765625, + "loss/logits": 0.1388963758945465, + "loss/reg": 0.006120136007666588, + "step": 128 + }, + { + "epoch": 0.016125, + "grad_norm": 3.935439348220825, + "grad_norm_var": 0.22878447427120438, + "learning_rate": 0.0001, + "loss": 1.1726, + "loss/crossentropy": 2.7213780879974365, + "loss/hidden": 0.9375, + "loss/logits": 0.1738772690296173, + "loss/reg": 0.006118897348642349, + "step": 129 + }, + { + "epoch": 0.01625, + "grad_norm": 3.463432788848877, + "grad_norm_var": 0.25882213944617144, + "learning_rate": 0.0001, + "loss": 1.0898, + "loss/crossentropy": 2.3635873794555664, + "loss/hidden": 0.8828125, + "loss/logits": 0.1457763910293579, + "loss/reg": 0.006117486394941807, + "step": 130 + }, + { + "epoch": 0.016375, + "grad_norm": 3.779526948928833, + "grad_norm_var": 0.31074183113488135, + "learning_rate": 0.0001, + "loss": 1.2078, + "loss/crossentropy": 2.316762924194336, + "loss/hidden": 0.98046875, + "loss/logits": 0.16614478826522827, + "loss/reg": 0.006116243079304695, + "step": 131 + }, + { + "epoch": 0.0165, + "grad_norm": 2.7554008960723877, + "grad_norm_var": 0.3028391023812749, + "learning_rate": 0.0001, + "loss": 0.9769, + "loss/crossentropy": 2.458954095840454, + "loss/hidden": 0.7890625, + "loss/logits": 0.12667913734912872, + "loss/reg": 0.006114880088716745, + "step": 132 + }, + { + "epoch": 0.016625, + "grad_norm": 2.342526435852051, + "grad_norm_var": 0.30546929082944035, + "learning_rate": 0.0001, + "loss": 1.1137, + "loss/crossentropy": 2.6329517364501953, + "loss/hidden": 0.890625, + "loss/logits": 0.161947563290596, + "loss/reg": 0.0061136274598538876, + "step": 133 + }, + { + "epoch": 0.01675, + "grad_norm": 2.2754058837890625, + "grad_norm_var": 0.31756495416411024, + "learning_rate": 0.0001, + "loss": 1.1703, + "loss/crossentropy": 2.2747550010681152, + "loss/hidden": 0.94921875, + "loss/logits": 0.15994513034820557, + "loss/reg": 0.006112351547926664, + "step": 134 + }, + { + "epoch": 0.016875, + "grad_norm": 3.1313912868499756, + "grad_norm_var": 0.3186282278045513, + "learning_rate": 0.0001, + "loss": 1.2333, + "loss/crossentropy": 2.4932894706726074, + "loss/hidden": 0.99609375, + "loss/logits": 0.17612434923648834, + "loss/reg": 0.006111042574048042, + "step": 135 + }, + { + "epoch": 0.017, + "grad_norm": 3.960482358932495, + "grad_norm_var": 0.39381746513703864, + "learning_rate": 0.0001, + "loss": 1.3101, + "loss/crossentropy": 2.581660747528076, + "loss/hidden": 1.0625, + "loss/logits": 0.18646802008152008, + "loss/reg": 0.006109676789492369, + "step": 136 + }, + { + "epoch": 0.017125, + "grad_norm": 2.7605810165405273, + "grad_norm_var": 0.37584340109069647, + "learning_rate": 0.0001, + "loss": 0.8792, + "loss/crossentropy": 2.6490936279296875, + "loss/hidden": 0.703125, + "loss/logits": 0.1150316372513771, + "loss/reg": 0.006108277477324009, + "step": 137 + }, + { + "epoch": 0.01725, + "grad_norm": 2.6196203231811523, + "grad_norm_var": 0.38003486499210315, + "learning_rate": 0.0001, + "loss": 0.955, + "loss/crossentropy": 2.633441209793091, + "loss/hidden": 0.76953125, + "loss/logits": 0.1244344562292099, + "loss/reg": 0.006106934975832701, + "step": 138 + }, + { + "epoch": 0.017375, + "grad_norm": 4.534512519836426, + "grad_norm_var": 0.554255985026353, + "learning_rate": 0.0001, + "loss": 1.4104, + "loss/crossentropy": 2.2204151153564453, + "loss/hidden": 1.1796875, + "loss/logits": 0.1696874350309372, + "loss/reg": 0.0061056241393089294, + "step": 139 + }, + { + "epoch": 0.0175, + "grad_norm": 2.192370653152466, + "grad_norm_var": 0.5798771099829023, + "learning_rate": 0.0001, + "loss": 1.1299, + "loss/crossentropy": 2.375506639480591, + "loss/hidden": 0.921875, + "loss/logits": 0.14694982767105103, + "loss/reg": 0.0061043244786560535, + "step": 140 + }, + { + "epoch": 0.017625, + "grad_norm": 4.368403911590576, + "grad_norm_var": 0.6744588881998081, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.3692545890808105, + "loss/hidden": 1.03125, + "loss/logits": 0.18568292260169983, + "loss/reg": 0.006102937273681164, + "step": 141 + }, + { + "epoch": 0.01775, + "grad_norm": 2.2753779888153076, + "grad_norm_var": 0.6461169960118004, + "learning_rate": 0.0001, + "loss": 1.0276, + "loss/crossentropy": 2.470676898956299, + "loss/hidden": 0.82421875, + "loss/logits": 0.14231771230697632, + "loss/reg": 0.006101653911173344, + "step": 142 + }, + { + "epoch": 0.017875, + "grad_norm": 2.6550562381744385, + "grad_norm_var": 0.6203099666067883, + "learning_rate": 0.0001, + "loss": 0.8712, + "loss/crossentropy": 2.8198063373565674, + "loss/hidden": 0.69921875, + "loss/logits": 0.11099085956811905, + "loss/reg": 0.006100376136600971, + "step": 143 + }, + { + "epoch": 0.018, + "grad_norm": 2.8701858520507812, + "grad_norm_var": 0.6111015072729884, + "learning_rate": 0.0001, + "loss": 1.1794, + "loss/crossentropy": 2.413463830947876, + "loss/hidden": 0.96484375, + "loss/logits": 0.15351834893226624, + "loss/reg": 0.006099053658545017, + "step": 144 + }, + { + "epoch": 0.018125, + "grad_norm": 2.2347958087921143, + "grad_norm_var": 0.6069563505613275, + "learning_rate": 0.0001, + "loss": 1.0832, + "loss/crossentropy": 2.446056604385376, + "loss/hidden": 0.8671875, + "loss/logits": 0.1550455242395401, + "loss/reg": 0.006097796373069286, + "step": 145 + }, + { + "epoch": 0.01825, + "grad_norm": 2.60143780708313, + "grad_norm_var": 0.6017061449507364, + "learning_rate": 0.0001, + "loss": 1.1216, + "loss/crossentropy": 2.2890260219573975, + "loss/hidden": 0.8984375, + "loss/logits": 0.16223573684692383, + "loss/reg": 0.006096460856497288, + "step": 146 + }, + { + "epoch": 0.018375, + "grad_norm": 3.656100273132324, + "grad_norm_var": 0.5891684064627459, + "learning_rate": 0.0001, + "loss": 1.2759, + "loss/crossentropy": 2.2077646255493164, + "loss/hidden": 1.0546875, + "loss/logits": 0.16024138033390045, + "loss/reg": 0.006095105782151222, + "step": 147 + }, + { + "epoch": 0.0185, + "grad_norm": 2.8190999031066895, + "grad_norm_var": 0.5877513730221795, + "learning_rate": 0.0001, + "loss": 1.1416, + "loss/crossentropy": 2.4892842769622803, + "loss/hidden": 0.9140625, + "loss/logits": 0.1665700376033783, + "loss/reg": 0.0060938019305467606, + "step": 148 + }, + { + "epoch": 0.018625, + "grad_norm": 2.6578848361968994, + "grad_norm_var": 0.568168306773175, + "learning_rate": 0.0001, + "loss": 1.1443, + "loss/crossentropy": 2.3138527870178223, + "loss/hidden": 0.93359375, + "loss/logits": 0.14977282285690308, + "loss/reg": 0.006092346739023924, + "step": 149 + }, + { + "epoch": 0.01875, + "grad_norm": 2.656559944152832, + "grad_norm_var": 0.5416540961853636, + "learning_rate": 0.0001, + "loss": 0.9868, + "loss/crossentropy": 2.7701377868652344, + "loss/hidden": 0.796875, + "loss/logits": 0.12901648879051208, + "loss/reg": 0.006090943701565266, + "step": 150 + }, + { + "epoch": 0.018875, + "grad_norm": 1.9359983205795288, + "grad_norm_var": 0.6099613145708634, + "learning_rate": 0.0001, + "loss": 0.9127, + "loss/crossentropy": 2.55560040473938, + "loss/hidden": 0.73828125, + "loss/logits": 0.11351295560598373, + "loss/reg": 0.00608965614810586, + "step": 151 + }, + { + "epoch": 0.019, + "grad_norm": 3.7978732585906982, + "grad_norm_var": 0.5891613317586338, + "learning_rate": 0.0001, + "loss": 1.2275, + "loss/crossentropy": 2.4227731227874756, + "loss/hidden": 0.98828125, + "loss/logits": 0.17836451530456543, + "loss/reg": 0.006088252179324627, + "step": 152 + }, + { + "epoch": 0.019125, + "grad_norm": 2.8193647861480713, + "grad_norm_var": 0.588169020521083, + "learning_rate": 0.0001, + "loss": 0.9739, + "loss/crossentropy": 2.474368095397949, + "loss/hidden": 0.80078125, + "loss/logits": 0.11225409805774689, + "loss/reg": 0.006086937617510557, + "step": 153 + }, + { + "epoch": 0.01925, + "grad_norm": 2.2882325649261475, + "grad_norm_var": 0.6082348956957436, + "learning_rate": 0.0001, + "loss": 1.0395, + "loss/crossentropy": 2.3776350021362305, + "loss/hidden": 0.82421875, + "loss/logits": 0.15443992614746094, + "loss/reg": 0.0060854703187942505, + "step": 154 + }, + { + "epoch": 0.019375, + "grad_norm": 2.006150245666504, + "grad_norm_var": 0.4559805309993303, + "learning_rate": 0.0001, + "loss": 0.9762, + "loss/crossentropy": 2.7556076049804688, + "loss/hidden": 0.78515625, + "loss/logits": 0.13019207119941711, + "loss/reg": 0.006084186024963856, + "step": 155 + }, + { + "epoch": 0.0195, + "grad_norm": 2.8143231868743896, + "grad_norm_var": 0.43477030174237014, + "learning_rate": 0.0001, + "loss": 1.1927, + "loss/crossentropy": 2.652045249938965, + "loss/hidden": 0.94140625, + "loss/logits": 0.19042611122131348, + "loss/reg": 0.00608274107798934, + "step": 156 + }, + { + "epoch": 0.019625, + "grad_norm": 2.957540988922119, + "grad_norm_var": 0.2601037584282233, + "learning_rate": 0.0001, + "loss": 1.0641, + "loss/crossentropy": 2.546213150024414, + "loss/hidden": 0.86328125, + "loss/logits": 0.14000022411346436, + "loss/reg": 0.006081291940063238, + "step": 157 + }, + { + "epoch": 0.01975, + "grad_norm": 2.625493288040161, + "grad_norm_var": 0.24839219907499052, + "learning_rate": 0.0001, + "loss": 1.012, + "loss/crossentropy": 2.5120432376861572, + "loss/hidden": 0.81640625, + "loss/logits": 0.13474689424037933, + "loss/reg": 0.006079958751797676, + "step": 158 + }, + { + "epoch": 0.019875, + "grad_norm": 2.6614878177642822, + "grad_norm_var": 0.2483457330217589, + "learning_rate": 0.0001, + "loss": 0.9873, + "loss/crossentropy": 2.312061071395874, + "loss/hidden": 0.80859375, + "loss/logits": 0.11790065467357635, + "loss/reg": 0.006078665144741535, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 2.6204919815063477, + "grad_norm_var": 0.24699792562249925, + "learning_rate": 0.0001, + "loss": 1.0488, + "loss/crossentropy": 2.505072593688965, + "loss/hidden": 0.84375, + "loss/logits": 0.14428117871284485, + "loss/reg": 0.006077310536056757, + "step": 160 + }, + { + "epoch": 0.020125, + "grad_norm": 3.107072591781616, + "grad_norm_var": 0.24079003208151678, + "learning_rate": 0.0001, + "loss": 1.1736, + "loss/crossentropy": 2.6514599323272705, + "loss/hidden": 0.96484375, + "loss/logits": 0.1480400413274765, + "loss/reg": 0.006076075602322817, + "step": 161 + }, + { + "epoch": 0.02025, + "grad_norm": 2.669001817703247, + "grad_norm_var": 0.23972287159530806, + "learning_rate": 0.0001, + "loss": 1.1966, + "loss/crossentropy": 2.4616479873657227, + "loss/hidden": 0.9765625, + "loss/logits": 0.15933012962341309, + "loss/reg": 0.006074720993638039, + "step": 162 + }, + { + "epoch": 0.020375, + "grad_norm": 2.5872421264648438, + "grad_norm_var": 0.1828196031273113, + "learning_rate": 0.0001, + "loss": 1.0551, + "loss/crossentropy": 2.5483999252319336, + "loss/hidden": 0.83984375, + "loss/logits": 0.1544739305973053, + "loss/reg": 0.006073469761759043, + "step": 163 + }, + { + "epoch": 0.0205, + "grad_norm": 2.3342509269714355, + "grad_norm_var": 0.1891007671877621, + "learning_rate": 0.0001, + "loss": 1.1418, + "loss/crossentropy": 2.610344171524048, + "loss/hidden": 0.90234375, + "loss/logits": 0.17876723408699036, + "loss/reg": 0.006072178483009338, + "step": 164 + }, + { + "epoch": 0.020625, + "grad_norm": 2.548274278640747, + "grad_norm_var": 0.18986337395058156, + "learning_rate": 0.0001, + "loss": 0.9512, + "loss/crossentropy": 2.747725009918213, + "loss/hidden": 0.7734375, + "loss/logits": 0.11706214398145676, + "loss/reg": 0.00607073912397027, + "step": 165 + }, + { + "epoch": 0.02075, + "grad_norm": 2.666066884994507, + "grad_norm_var": 0.18987501227134793, + "learning_rate": 0.0001, + "loss": 1.0557, + "loss/crossentropy": 2.3086578845977783, + "loss/hidden": 0.83984375, + "loss/logits": 0.1551416665315628, + "loss/reg": 0.006069260183721781, + "step": 166 + }, + { + "epoch": 0.020875, + "grad_norm": 3.363084554672241, + "grad_norm_var": 0.18083982986582872, + "learning_rate": 0.0001, + "loss": 0.9886, + "loss/crossentropy": 2.7422661781311035, + "loss/hidden": 0.79296875, + "loss/logits": 0.13497118651866913, + "loss/reg": 0.006067754700779915, + "step": 167 + }, + { + "epoch": 0.021, + "grad_norm": 2.717400550842285, + "grad_norm_var": 0.10163689874761227, + "learning_rate": 0.0001, + "loss": 1.2413, + "loss/crossentropy": 2.341296672821045, + "loss/hidden": 1.0078125, + "loss/logits": 0.17277640104293823, + "loss/reg": 0.006066245958209038, + "step": 168 + }, + { + "epoch": 0.021125, + "grad_norm": 2.2773897647857666, + "grad_norm_var": 0.10949759007257095, + "learning_rate": 0.0001, + "loss": 0.9531, + "loss/crossentropy": 2.492532968521118, + "loss/hidden": 0.76953125, + "loss/logits": 0.12295819818973541, + "loss/reg": 0.006064848508685827, + "step": 169 + }, + { + "epoch": 0.02125, + "grad_norm": 2.7625067234039307, + "grad_norm_var": 0.1012976809853086, + "learning_rate": 0.0001, + "loss": 1.0102, + "loss/crossentropy": 2.3799381256103516, + "loss/hidden": 0.80859375, + "loss/logits": 0.140989288687706, + "loss/reg": 0.0060633583925664425, + "step": 170 + }, + { + "epoch": 0.021375, + "grad_norm": 3.713162899017334, + "grad_norm_var": 0.1323542313667114, + "learning_rate": 0.0001, + "loss": 1.0173, + "loss/crossentropy": 2.7296385765075684, + "loss/hidden": 0.80078125, + "loss/logits": 0.1559314727783203, + "loss/reg": 0.006062004715204239, + "step": 171 + }, + { + "epoch": 0.0215, + "grad_norm": 2.8448026180267334, + "grad_norm_var": 0.13256580340874963, + "learning_rate": 0.0001, + "loss": 1.0945, + "loss/crossentropy": 2.211848497390747, + "loss/hidden": 0.87890625, + "loss/logits": 0.15503031015396118, + "loss/reg": 0.006060663145035505, + "step": 172 + }, + { + "epoch": 0.021625, + "grad_norm": 2.951566696166992, + "grad_norm_var": 0.13242537871232402, + "learning_rate": 0.0001, + "loss": 1.243, + "loss/crossentropy": 2.6379833221435547, + "loss/hidden": 0.96484375, + "loss/logits": 0.21754613518714905, + "loss/reg": 0.00605935649946332, + "step": 173 + }, + { + "epoch": 0.02175, + "grad_norm": 2.6862404346466064, + "grad_norm_var": 0.13142011502921586, + "learning_rate": 0.0001, + "loss": 1.0053, + "loss/crossentropy": 2.3807766437530518, + "loss/hidden": 0.80078125, + "loss/logits": 0.14393460750579834, + "loss/reg": 0.006058130878955126, + "step": 174 + }, + { + "epoch": 0.021875, + "grad_norm": 2.5145609378814697, + "grad_norm_var": 0.13512780159794507, + "learning_rate": 0.0001, + "loss": 1.0609, + "loss/crossentropy": 2.4608380794525146, + "loss/hidden": 0.85546875, + "loss/logits": 0.14485566318035126, + "loss/reg": 0.006056922487914562, + "step": 175 + }, + { + "epoch": 0.022, + "grad_norm": 3.23178768157959, + "grad_norm_var": 0.14607750168249728, + "learning_rate": 0.0001, + "loss": 1.1294, + "loss/crossentropy": 2.9791719913482666, + "loss/hidden": 0.91796875, + "loss/logits": 0.1508345603942871, + "loss/reg": 0.006055623292922974, + "step": 176 + }, + { + "epoch": 0.022125, + "grad_norm": 2.7397234439849854, + "grad_norm_var": 0.14000512423072375, + "learning_rate": 0.0001, + "loss": 1.0578, + "loss/crossentropy": 2.4559919834136963, + "loss/hidden": 0.86328125, + "loss/logits": 0.1340080350637436, + "loss/reg": 0.0060544307343661785, + "step": 177 + }, + { + "epoch": 0.02225, + "grad_norm": 2.6637048721313477, + "grad_norm_var": 0.14009088002925954, + "learning_rate": 0.0001, + "loss": 1.076, + "loss/crossentropy": 2.3794586658477783, + "loss/hidden": 0.86328125, + "loss/logits": 0.15214313566684723, + "loss/reg": 0.0060530174523591995, + "step": 178 + }, + { + "epoch": 0.022375, + "grad_norm": 2.0105221271514893, + "grad_norm_var": 0.17628626628935157, + "learning_rate": 0.0001, + "loss": 0.9703, + "loss/crossentropy": 2.3926336765289307, + "loss/hidden": 0.77734375, + "loss/logits": 0.13244566321372986, + "loss/reg": 0.0060517978854477406, + "step": 179 + }, + { + "epoch": 0.0225, + "grad_norm": 2.571902275085449, + "grad_norm_var": 0.16659277386996318, + "learning_rate": 0.0001, + "loss": 1.0739, + "loss/crossentropy": 2.7502923011779785, + "loss/hidden": 0.8515625, + "loss/logits": 0.16181406378746033, + "loss/reg": 0.006050686351954937, + "step": 180 + }, + { + "epoch": 0.022625, + "grad_norm": 2.700366973876953, + "grad_norm_var": 0.1636147823311904, + "learning_rate": 0.0001, + "loss": 1.0113, + "loss/crossentropy": 2.502389669418335, + "loss/hidden": 0.8125, + "loss/logits": 0.138347327709198, + "loss/reg": 0.006049246061593294, + "step": 181 + }, + { + "epoch": 0.02275, + "grad_norm": 2.7259435653686523, + "grad_norm_var": 0.1629618050893432, + "learning_rate": 0.0001, + "loss": 1.0192, + "loss/crossentropy": 2.2493560314178467, + "loss/hidden": 0.82421875, + "loss/logits": 0.1344609260559082, + "loss/reg": 0.006048021838068962, + "step": 182 + }, + { + "epoch": 0.022875, + "grad_norm": 4.930091857910156, + "grad_norm_var": 0.43832731745023895, + "learning_rate": 0.0001, + "loss": 1.1874, + "loss/crossentropy": 2.649231433868408, + "loss/hidden": 0.94140625, + "loss/logits": 0.1855432242155075, + "loss/reg": 0.006046844646334648, + "step": 183 + }, + { + "epoch": 0.023, + "grad_norm": 2.288604259490967, + "grad_norm_var": 0.4589782783160859, + "learning_rate": 0.0001, + "loss": 1.0354, + "loss/crossentropy": 3.0482568740844727, + "loss/hidden": 0.8203125, + "loss/logits": 0.15461647510528564, + "loss/reg": 0.006045445334166288, + "step": 184 + }, + { + "epoch": 0.023125, + "grad_norm": 2.7902991771698, + "grad_norm_var": 0.4362058684835667, + "learning_rate": 0.0001, + "loss": 1.0744, + "loss/crossentropy": 2.726069211959839, + "loss/hidden": 0.8359375, + "loss/logits": 0.17799492180347443, + "loss/reg": 0.006044231820851564, + "step": 185 + }, + { + "epoch": 0.02325, + "grad_norm": 3.597017526626587, + "grad_norm_var": 0.46633972017124825, + "learning_rate": 0.0001, + "loss": 1.0985, + "loss/crossentropy": 2.200692892074585, + "loss/hidden": 0.8984375, + "loss/logits": 0.13961729407310486, + "loss/reg": 0.006042772904038429, + "step": 186 + }, + { + "epoch": 0.023375, + "grad_norm": 2.969062566757202, + "grad_norm_var": 0.42374272593361867, + "learning_rate": 0.0001, + "loss": 1.2314, + "loss/crossentropy": 2.3744540214538574, + "loss/hidden": 0.96875, + "loss/logits": 0.20225511491298676, + "loss/reg": 0.006041594315320253, + "step": 187 + }, + { + "epoch": 0.0235, + "grad_norm": 3.2257020473480225, + "grad_norm_var": 0.4305906329857976, + "learning_rate": 0.0001, + "loss": 1.0982, + "loss/crossentropy": 2.442505121231079, + "loss/hidden": 0.875, + "loss/logits": 0.16284233331680298, + "loss/reg": 0.006040407810360193, + "step": 188 + }, + { + "epoch": 0.023625, + "grad_norm": 3.670443058013916, + "grad_norm_var": 0.4666515285365591, + "learning_rate": 0.0001, + "loss": 1.2391, + "loss/crossentropy": 2.533158540725708, + "loss/hidden": 0.98046875, + "loss/logits": 0.19827201962471008, + "loss/reg": 0.0060392809100449085, + "step": 189 + }, + { + "epoch": 0.02375, + "grad_norm": 7.53206729888916, + "grad_norm_var": 1.7591779439754056, + "learning_rate": 0.0001, + "loss": 1.1689, + "loss/crossentropy": 2.3104734420776367, + "loss/hidden": 0.96875, + "loss/logits": 0.13976144790649414, + "loss/reg": 0.006038178689777851, + "step": 190 + }, + { + "epoch": 0.023875, + "grad_norm": 4.658889293670654, + "grad_norm_var": 1.833400975261701, + "learning_rate": 0.0001, + "loss": 1.3266, + "loss/crossentropy": 2.286229133605957, + "loss/hidden": 1.1015625, + "loss/logits": 0.16465552151203156, + "loss/reg": 0.006036726757884026, + "step": 191 + }, + { + "epoch": 0.024, + "grad_norm": 3.2109904289245605, + "grad_norm_var": 1.8338781863373583, + "learning_rate": 0.0001, + "loss": 1.278, + "loss/crossentropy": 2.5849151611328125, + "loss/hidden": 1.0078125, + "loss/logits": 0.20983844995498657, + "loss/reg": 0.006035543512552977, + "step": 192 + }, + { + "epoch": 0.024125, + "grad_norm": 2.556408643722534, + "grad_norm_var": 1.8519417466969637, + "learning_rate": 0.0001, + "loss": 1.0335, + "loss/crossentropy": 2.635669231414795, + "loss/hidden": 0.8359375, + "loss/logits": 0.13721294701099396, + "loss/reg": 0.006034051068127155, + "step": 193 + }, + { + "epoch": 0.02425, + "grad_norm": 3.4185855388641357, + "grad_norm_var": 1.8153229069184569, + "learning_rate": 0.0001, + "loss": 1.0115, + "loss/crossentropy": 2.3127341270446777, + "loss/hidden": 0.828125, + "loss/logits": 0.12303752452135086, + "loss/reg": 0.00603274954482913, + "step": 194 + }, + { + "epoch": 0.024375, + "grad_norm": 3.639681816101074, + "grad_norm_var": 1.6731808292397734, + "learning_rate": 0.0001, + "loss": 1.2374, + "loss/crossentropy": 2.4363749027252197, + "loss/hidden": 0.98046875, + "loss/logits": 0.19659578800201416, + "loss/reg": 0.006031363736838102, + "step": 195 + }, + { + "epoch": 0.0245, + "grad_norm": 3.266385078430176, + "grad_norm_var": 1.614572274352353, + "learning_rate": 0.0001, + "loss": 1.19, + "loss/crossentropy": 2.2824337482452393, + "loss/hidden": 0.9609375, + "loss/logits": 0.16878634691238403, + "loss/reg": 0.006029782351106405, + "step": 196 + }, + { + "epoch": 0.024625, + "grad_norm": 3.0692105293273926, + "grad_norm_var": 1.5801212385016838, + "learning_rate": 0.0001, + "loss": 1.1495, + "loss/crossentropy": 2.518056631088257, + "loss/hidden": 0.921875, + "loss/logits": 0.16731634736061096, + "loss/reg": 0.006028252653777599, + "step": 197 + }, + { + "epoch": 0.02475, + "grad_norm": 3.390202283859253, + "grad_norm_var": 1.530565626963321, + "learning_rate": 0.0001, + "loss": 1.1783, + "loss/crossentropy": 2.3565316200256348, + "loss/hidden": 0.9375, + "loss/logits": 0.18055224418640137, + "loss/reg": 0.006026738323271275, + "step": 198 + }, + { + "epoch": 0.024875, + "grad_norm": 2.524461030960083, + "grad_norm_var": 1.4779304822181976, + "learning_rate": 0.0001, + "loss": 1.095, + "loss/crossentropy": 2.3489255905151367, + "loss/hidden": 0.88671875, + "loss/logits": 0.1480264812707901, + "loss/reg": 0.006025230046361685, + "step": 199 + }, + { + "epoch": 0.025, + "grad_norm": 2.8753433227539062, + "grad_norm_var": 1.4056158732497617, + "learning_rate": 0.0001, + "loss": 1.1396, + "loss/crossentropy": 2.379971504211426, + "loss/hidden": 0.90625, + "loss/logits": 0.17312359809875488, + "loss/reg": 0.0060236188583076, + "step": 200 + }, + { + "epoch": 0.025125, + "grad_norm": 2.2297983169555664, + "grad_norm_var": 1.4801331513155804, + "learning_rate": 0.0001, + "loss": 1.1642, + "loss/crossentropy": 2.401499032974243, + "loss/hidden": 0.9296875, + "loss/logits": 0.1743072271347046, + "loss/reg": 0.006021994166076183, + "step": 201 + }, + { + "epoch": 0.02525, + "grad_norm": 2.7430193424224854, + "grad_norm_var": 1.5134885749372204, + "learning_rate": 0.0001, + "loss": 1.3503, + "loss/crossentropy": 2.3397345542907715, + "loss/hidden": 1.09375, + "loss/logits": 0.1963859498500824, + "loss/reg": 0.006020485423505306, + "step": 202 + }, + { + "epoch": 0.025375, + "grad_norm": 3.3862688541412354, + "grad_norm_var": 1.4983780502999742, + "learning_rate": 0.0001, + "loss": 1.3154, + "loss/crossentropy": 2.3259048461914062, + "loss/hidden": 1.09375, + "loss/logits": 0.1614416241645813, + "loss/reg": 0.0060190120711922646, + "step": 203 + }, + { + "epoch": 0.0255, + "grad_norm": 2.554938316345215, + "grad_norm_var": 1.547662147741073, + "learning_rate": 0.0001, + "loss": 1.1147, + "loss/crossentropy": 2.559544801712036, + "loss/hidden": 0.890625, + "loss/logits": 0.16388913989067078, + "loss/reg": 0.006017730105668306, + "step": 204 + }, + { + "epoch": 0.025625, + "grad_norm": 2.6290361881256104, + "grad_norm_var": 1.5807281675134672, + "learning_rate": 0.0001, + "loss": 1.049, + "loss/crossentropy": 2.7080090045928955, + "loss/hidden": 0.828125, + "loss/logits": 0.16068041324615479, + "loss/reg": 0.006016433704644442, + "step": 205 + }, + { + "epoch": 0.02575, + "grad_norm": 2.234259605407715, + "grad_norm_var": 0.38456120947827777, + "learning_rate": 0.0001, + "loss": 1.0392, + "loss/crossentropy": 2.3816347122192383, + "loss/hidden": 0.8359375, + "loss/logits": 0.14315146207809448, + "loss/reg": 0.0060149249620735645, + "step": 206 + }, + { + "epoch": 0.025875, + "grad_norm": 2.810352325439453, + "grad_norm_var": 0.19522907990381644, + "learning_rate": 0.0001, + "loss": 1.1385, + "loss/crossentropy": 2.6245384216308594, + "loss/hidden": 0.90625, + "loss/logits": 0.17206540703773499, + "loss/reg": 0.006013684440404177, + "step": 207 + }, + { + "epoch": 0.026, + "grad_norm": 2.198707342147827, + "grad_norm_var": 0.21847125065788287, + "learning_rate": 0.0001, + "loss": 0.9762, + "loss/crossentropy": 2.3812787532806396, + "loss/hidden": 0.796875, + "loss/logits": 0.119233138859272, + "loss/reg": 0.006012204568833113, + "step": 208 + }, + { + "epoch": 0.026125, + "grad_norm": 2.5001378059387207, + "grad_norm_var": 0.22083751043745087, + "learning_rate": 0.0001, + "loss": 1.2526, + "loss/crossentropy": 2.5999109745025635, + "loss/hidden": 0.984375, + "loss/logits": 0.20815744996070862, + "loss/reg": 0.006010920740664005, + "step": 209 + }, + { + "epoch": 0.02625, + "grad_norm": 3.175185203552246, + "grad_norm_var": 0.20582482438127556, + "learning_rate": 0.0001, + "loss": 1.239, + "loss/crossentropy": 2.3893682956695557, + "loss/hidden": 1.0234375, + "loss/logits": 0.15550163388252258, + "loss/reg": 0.006009369157254696, + "step": 210 + }, + { + "epoch": 0.026375, + "grad_norm": 3.482342481613159, + "grad_norm_var": 0.19031657232839597, + "learning_rate": 0.0001, + "loss": 1.1572, + "loss/crossentropy": 2.382542848587036, + "loss/hidden": 0.94921875, + "loss/logits": 0.14788678288459778, + "loss/reg": 0.006007815711200237, + "step": 211 + }, + { + "epoch": 0.0265, + "grad_norm": 2.285135507583618, + "grad_norm_var": 0.19168098803167197, + "learning_rate": 0.0001, + "loss": 0.9667, + "loss/crossentropy": 2.552724838256836, + "loss/hidden": 0.78125, + "loss/logits": 0.1254206746816635, + "loss/reg": 0.006006232462823391, + "step": 212 + }, + { + "epoch": 0.026625, + "grad_norm": 2.991971969604492, + "grad_norm_var": 0.1888233667670041, + "learning_rate": 0.0001, + "loss": 1.1472, + "loss/crossentropy": 2.472437620162964, + "loss/hidden": 0.9296875, + "loss/logits": 0.15750399231910706, + "loss/reg": 0.0060045006684958935, + "step": 213 + }, + { + "epoch": 0.02675, + "grad_norm": 2.3775179386138916, + "grad_norm_var": 0.1665701003974154, + "learning_rate": 0.0001, + "loss": 1.1938, + "loss/crossentropy": 2.294337749481201, + "loss/hidden": 0.95703125, + "loss/logits": 0.17671090364456177, + "loss/reg": 0.006002978887408972, + "step": 214 + }, + { + "epoch": 0.026875, + "grad_norm": 2.2992701530456543, + "grad_norm_var": 0.17463199132661936, + "learning_rate": 0.0001, + "loss": 1.2097, + "loss/crossentropy": 2.3843300342559814, + "loss/hidden": 0.9609375, + "loss/logits": 0.18876615166664124, + "loss/reg": 0.006001432426273823, + "step": 215 + }, + { + "epoch": 0.027, + "grad_norm": 2.4926228523254395, + "grad_norm_var": 0.17347807328228151, + "learning_rate": 0.0001, + "loss": 1.3156, + "loss/crossentropy": 2.326836585998535, + "loss/hidden": 1.0625, + "loss/logits": 0.19308596849441528, + "loss/reg": 0.005999880842864513, + "step": 216 + }, + { + "epoch": 0.027125, + "grad_norm": 2.552459478378296, + "grad_norm_var": 0.16193263198218044, + "learning_rate": 0.0001, + "loss": 1.1424, + "loss/crossentropy": 2.6629388332366943, + "loss/hidden": 0.91015625, + "loss/logits": 0.1722826063632965, + "loss/reg": 0.005998372100293636, + "step": 217 + }, + { + "epoch": 0.02725, + "grad_norm": 2.866387128829956, + "grad_norm_var": 0.16409192036900605, + "learning_rate": 0.0001, + "loss": 1.0142, + "loss/crossentropy": 2.8154890537261963, + "loss/hidden": 0.80078125, + "loss/logits": 0.15349115431308746, + "loss/reg": 0.005996840540319681, + "step": 218 + }, + { + "epoch": 0.027375, + "grad_norm": 2.77524471282959, + "grad_norm_var": 0.12966566207502767, + "learning_rate": 0.0001, + "loss": 1.4111, + "loss/crossentropy": 2.4509928226470947, + "loss/hidden": 1.1015625, + "loss/logits": 0.249616801738739, + "loss/reg": 0.005995343904942274, + "step": 219 + }, + { + "epoch": 0.0275, + "grad_norm": 2.887923240661621, + "grad_norm_var": 0.13285907347625023, + "learning_rate": 0.0001, + "loss": 1.2886, + "loss/crossentropy": 2.4280507564544678, + "loss/hidden": 1.0234375, + "loss/logits": 0.20519307255744934, + "loss/reg": 0.005993579979985952, + "step": 220 + }, + { + "epoch": 0.027625, + "grad_norm": 2.5383920669555664, + "grad_norm_var": 0.1337457284607846, + "learning_rate": 0.0001, + "loss": 1.3292, + "loss/crossentropy": 2.0803585052490234, + "loss/hidden": 1.09375, + "loss/logits": 0.17551109194755554, + "loss/reg": 0.005991705227643251, + "step": 221 + }, + { + "epoch": 0.02775, + "grad_norm": 2.639490842819214, + "grad_norm_var": 0.12131687494494538, + "learning_rate": 0.0001, + "loss": 1.0593, + "loss/crossentropy": 2.293325901031494, + "loss/hidden": 0.8515625, + "loss/logits": 0.14782238006591797, + "loss/reg": 0.005989882629364729, + "step": 222 + }, + { + "epoch": 0.027875, + "grad_norm": 2.4396984577178955, + "grad_norm_var": 0.12344012810124999, + "learning_rate": 0.0001, + "loss": 1.0587, + "loss/crossentropy": 2.7268667221069336, + "loss/hidden": 0.84765625, + "loss/logits": 0.15114662051200867, + "loss/reg": 0.0059883627109229565, + "step": 223 + }, + { + "epoch": 0.028, + "grad_norm": 2.227886438369751, + "grad_norm_var": 0.12171264621671582, + "learning_rate": 0.0001, + "loss": 1.0087, + "loss/crossentropy": 2.4431943893432617, + "loss/hidden": 0.81640625, + "loss/logits": 0.13243696093559265, + "loss/reg": 0.005986812058836222, + "step": 224 + }, + { + "epoch": 0.028125, + "grad_norm": 3.690627098083496, + "grad_norm_var": 0.18519755428341872, + "learning_rate": 0.0001, + "loss": 1.0732, + "loss/crossentropy": 2.4630942344665527, + "loss/hidden": 0.875, + "loss/logits": 0.13830721378326416, + "loss/reg": 0.005985158029943705, + "step": 225 + }, + { + "epoch": 0.02825, + "grad_norm": 3.377890110015869, + "grad_norm_var": 0.19972658805784155, + "learning_rate": 0.0001, + "loss": 1.1848, + "loss/crossentropy": 2.2899203300476074, + "loss/hidden": 0.9609375, + "loss/logits": 0.16401749849319458, + "loss/reg": 0.005983633920550346, + "step": 226 + }, + { + "epoch": 0.028375, + "grad_norm": 2.7600386142730713, + "grad_norm_var": 0.16135214723361363, + "learning_rate": 0.0001, + "loss": 1.0223, + "loss/crossentropy": 2.8077659606933594, + "loss/hidden": 0.8203125, + "loss/logits": 0.14218226075172424, + "loss/reg": 0.005982026923447847, + "step": 227 + }, + { + "epoch": 0.0285, + "grad_norm": 2.3397345542907715, + "grad_norm_var": 0.15851713921701366, + "learning_rate": 0.0001, + "loss": 1.077, + "loss/crossentropy": 2.438030958175659, + "loss/hidden": 0.875, + "loss/logits": 0.14217695593833923, + "loss/reg": 0.005980519577860832, + "step": 228 + }, + { + "epoch": 0.028625, + "grad_norm": 2.744401216506958, + "grad_norm_var": 0.15282793193407448, + "learning_rate": 0.0001, + "loss": 1.1967, + "loss/crossentropy": 2.557457447052002, + "loss/hidden": 0.97265625, + "loss/logits": 0.16425767540931702, + "loss/reg": 0.005979116074740887, + "step": 229 + }, + { + "epoch": 0.02875, + "grad_norm": 2.4241418838500977, + "grad_norm_var": 0.15103305834679168, + "learning_rate": 0.0001, + "loss": 1.0402, + "loss/crossentropy": 2.743885040283203, + "loss/hidden": 0.828125, + "loss/logits": 0.15231972932815552, + "loss/reg": 0.005977709777653217, + "step": 230 + }, + { + "epoch": 0.028875, + "grad_norm": 2.0828442573547363, + "grad_norm_var": 0.16526500993595217, + "learning_rate": 0.0001, + "loss": 0.9747, + "loss/crossentropy": 2.719327688217163, + "loss/hidden": 0.78125, + "loss/logits": 0.133681058883667, + "loss/reg": 0.005976095795631409, + "step": 231 + }, + { + "epoch": 0.029, + "grad_norm": 2.127495527267456, + "grad_norm_var": 0.18259721536013085, + "learning_rate": 0.0001, + "loss": 1.0588, + "loss/crossentropy": 2.8147058486938477, + "loss/hidden": 0.85546875, + "loss/logits": 0.14354225993156433, + "loss/reg": 0.005974431522190571, + "step": 232 + }, + { + "epoch": 0.029125, + "grad_norm": 4.263195991516113, + "grad_norm_var": 0.34219781045772657, + "learning_rate": 0.0001, + "loss": 1.1724, + "loss/crossentropy": 2.5414481163024902, + "loss/hidden": 0.96484375, + "loss/logits": 0.1478062868118286, + "loss/reg": 0.005972826853394508, + "step": 233 + }, + { + "epoch": 0.02925, + "grad_norm": 2.9974324703216553, + "grad_norm_var": 0.34510225788824467, + "learning_rate": 0.0001, + "loss": 1.3152, + "loss/crossentropy": 2.697648763656616, + "loss/hidden": 1.0546875, + "loss/logits": 0.20080995559692383, + "loss/reg": 0.005971227772533894, + "step": 234 + }, + { + "epoch": 0.029375, + "grad_norm": 3.4798855781555176, + "grad_norm_var": 0.37664835069757197, + "learning_rate": 0.0001, + "loss": 1.2096, + "loss/crossentropy": 2.3990559577941895, + "loss/hidden": 0.95703125, + "loss/logits": 0.19287389516830444, + "loss/reg": 0.005969603545963764, + "step": 235 + }, + { + "epoch": 0.0295, + "grad_norm": 2.43911075592041, + "grad_norm_var": 0.3848032740432508, + "learning_rate": 0.0001, + "loss": 1.0658, + "loss/crossentropy": 1.966374158859253, + "loss/hidden": 0.875, + "loss/logits": 0.13115233182907104, + "loss/reg": 0.005967943929135799, + "step": 236 + }, + { + "epoch": 0.029625, + "grad_norm": 3.7423646450042725, + "grad_norm_var": 0.4356891905379257, + "learning_rate": 0.0001, + "loss": 1.2397, + "loss/crossentropy": 2.718675374984741, + "loss/hidden": 0.9921875, + "loss/logits": 0.18789833784103394, + "loss/reg": 0.00596608454361558, + "step": 237 + }, + { + "epoch": 0.02975, + "grad_norm": 3.328033924102783, + "grad_norm_var": 0.4449827328026664, + "learning_rate": 0.0001, + "loss": 1.5581, + "loss/crossentropy": 2.272303819656372, + "loss/hidden": 1.2421875, + "loss/logits": 0.2562662661075592, + "loss/reg": 0.005964066833257675, + "step": 238 + }, + { + "epoch": 0.029875, + "grad_norm": 2.8761045932769775, + "grad_norm_var": 0.42986649641521024, + "learning_rate": 0.0001, + "loss": 1.1392, + "loss/crossentropy": 2.6973013877868652, + "loss/hidden": 0.91796875, + "loss/logits": 0.16159963607788086, + "loss/reg": 0.005962541792541742, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 2.4458563327789307, + "grad_norm_var": 0.4123921579785623, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.5731561183929443, + "loss/hidden": 0.9375, + "loss/logits": 0.18093177676200867, + "loss/reg": 0.005961006972938776, + "step": 240 + }, + { + "epoch": 0.030125, + "grad_norm": 2.4645614624023438, + "grad_norm_var": 0.3844441578530656, + "learning_rate": 0.0001, + "loss": 1.0932, + "loss/crossentropy": 2.648738145828247, + "loss/hidden": 0.890625, + "loss/logits": 0.14302745461463928, + "loss/reg": 0.005959144793450832, + "step": 241 + }, + { + "epoch": 0.03025, + "grad_norm": 3.0715034008026123, + "grad_norm_var": 0.3694944025754277, + "learning_rate": 0.0001, + "loss": 1.1916, + "loss/crossentropy": 2.4820139408111572, + "loss/hidden": 0.94921875, + "loss/logits": 0.18278783559799194, + "loss/reg": 0.005957332905381918, + "step": 242 + }, + { + "epoch": 0.030375, + "grad_norm": 2.479677677154541, + "grad_norm_var": 0.37773887013444374, + "learning_rate": 0.0001, + "loss": 1.0787, + "loss/crossentropy": 2.614309549331665, + "loss/hidden": 0.87109375, + "loss/logits": 0.14808647334575653, + "loss/reg": 0.005955492611974478, + "step": 243 + }, + { + "epoch": 0.0305, + "grad_norm": 3.0970399379730225, + "grad_norm_var": 0.36391299171458796, + "learning_rate": 0.0001, + "loss": 1.1987, + "loss/crossentropy": 2.2731809616088867, + "loss/hidden": 0.95703125, + "loss/logits": 0.18210504949092865, + "loss/reg": 0.00595364673063159, + "step": 244 + }, + { + "epoch": 0.030625, + "grad_norm": 2.388214588165283, + "grad_norm_var": 0.37823356386532864, + "learning_rate": 0.0001, + "loss": 1.1283, + "loss/crossentropy": 2.532259225845337, + "loss/hidden": 0.91015625, + "loss/logits": 0.15858401358127594, + "loss/reg": 0.005952049978077412, + "step": 245 + }, + { + "epoch": 0.03075, + "grad_norm": 2.97310733795166, + "grad_norm_var": 0.36540629077152076, + "learning_rate": 0.0001, + "loss": 1.1177, + "loss/crossentropy": 2.5206258296966553, + "loss/hidden": 0.89453125, + "loss/logits": 0.16365137696266174, + "loss/reg": 0.005950110498815775, + "step": 246 + }, + { + "epoch": 0.030875, + "grad_norm": 2.15498423576355, + "grad_norm_var": 0.3579579158371985, + "learning_rate": 0.0001, + "loss": 1.1046, + "loss/crossentropy": 2.478773832321167, + "loss/hidden": 0.8828125, + "loss/logits": 0.162343829870224, + "loss/reg": 0.005948282778263092, + "step": 247 + }, + { + "epoch": 0.031, + "grad_norm": 2.3404128551483154, + "grad_norm_var": 0.338987407645584, + "learning_rate": 0.0001, + "loss": 1.1555, + "loss/crossentropy": 2.1949751377105713, + "loss/hidden": 0.93359375, + "loss/logits": 0.1624409407377243, + "loss/reg": 0.005946675315499306, + "step": 248 + }, + { + "epoch": 0.031125, + "grad_norm": 2.8813085556030273, + "grad_norm_var": 0.20879640313171802, + "learning_rate": 0.0001, + "loss": 1.1599, + "loss/crossentropy": 2.556128978729248, + "loss/hidden": 0.9296875, + "loss/logits": 0.1707805097103119, + "loss/reg": 0.005944731179624796, + "step": 249 + }, + { + "epoch": 0.03125, + "grad_norm": 3.309937000274658, + "grad_norm_var": 0.22219010027481143, + "learning_rate": 0.0001, + "loss": 1.0939, + "loss/crossentropy": 2.4590022563934326, + "loss/hidden": 0.88671875, + "loss/logits": 0.14774294197559357, + "loss/reg": 0.005942681338638067, + "step": 250 + }, + { + "epoch": 0.031375, + "grad_norm": 3.1676676273345947, + "grad_norm_var": 0.201728293925846, + "learning_rate": 0.0001, + "loss": 1.3162, + "loss/crossentropy": 2.419811487197876, + "loss/hidden": 1.015625, + "loss/logits": 0.24120670557022095, + "loss/reg": 0.005940672475844622, + "step": 251 + }, + { + "epoch": 0.0315, + "grad_norm": 2.6006832122802734, + "grad_norm_var": 0.1951007002723287, + "learning_rate": 0.0001, + "loss": 1.3903, + "loss/crossentropy": 2.170666456222534, + "loss/hidden": 1.140625, + "loss/logits": 0.19024603068828583, + "loss/reg": 0.005938523914664984, + "step": 252 + }, + { + "epoch": 0.031625, + "grad_norm": 2.4954755306243896, + "grad_norm_var": 0.14101991304577552, + "learning_rate": 0.0001, + "loss": 1.1465, + "loss/crossentropy": 2.262831449508667, + "loss/hidden": 0.93359375, + "loss/logits": 0.1535283327102661, + "loss/reg": 0.00593681400641799, + "step": 253 + }, + { + "epoch": 0.03175, + "grad_norm": 2.339406728744507, + "grad_norm_var": 0.12652605714113535, + "learning_rate": 0.0001, + "loss": 0.984, + "loss/crossentropy": 2.2793617248535156, + "loss/hidden": 0.796875, + "loss/logits": 0.12778240442276, + "loss/reg": 0.005935273133218288, + "step": 254 + }, + { + "epoch": 0.031875, + "grad_norm": 2.3391647338867188, + "grad_norm_var": 0.131427049667937, + "learning_rate": 0.0001, + "loss": 1.0622, + "loss/crossentropy": 2.4579379558563232, + "loss/hidden": 0.83984375, + "loss/logits": 0.16299216449260712, + "loss/reg": 0.0059331608936190605, + "step": 255 + }, + { + "epoch": 0.032, + "grad_norm": 2.3896231651306152, + "grad_norm_var": 0.13322512800125588, + "learning_rate": 0.0001, + "loss": 1.057, + "loss/crossentropy": 2.8022475242614746, + "loss/hidden": 0.85546875, + "loss/logits": 0.14219465851783752, + "loss/reg": 0.005931555759161711, + "step": 256 + }, + { + "epoch": 0.032125, + "grad_norm": 2.125249147415161, + "grad_norm_var": 0.14907278605534582, + "learning_rate": 0.0001, + "loss": 1.0611, + "loss/crossentropy": 2.33644700050354, + "loss/hidden": 0.8515625, + "loss/logits": 0.15020999312400818, + "loss/reg": 0.005930029321461916, + "step": 257 + }, + { + "epoch": 0.03225, + "grad_norm": 2.521933078765869, + "grad_norm_var": 0.13593429417580463, + "learning_rate": 0.0001, + "loss": 1.0436, + "loss/crossentropy": 2.512619733810425, + "loss/hidden": 0.8203125, + "loss/logits": 0.16396166384220123, + "loss/reg": 0.00592817785218358, + "step": 258 + }, + { + "epoch": 0.032375, + "grad_norm": 2.5966317653656006, + "grad_norm_var": 0.13490910688263208, + "learning_rate": 0.0001, + "loss": 1.1331, + "loss/crossentropy": 2.248013734817505, + "loss/hidden": 0.91015625, + "loss/logits": 0.16364812850952148, + "loss/reg": 0.00592625979334116, + "step": 259 + }, + { + "epoch": 0.0325, + "grad_norm": 2.2045137882232666, + "grad_norm_var": 0.12644607438415487, + "learning_rate": 0.0001, + "loss": 1.0015, + "loss/crossentropy": 2.3253698348999023, + "loss/hidden": 0.796875, + "loss/logits": 0.14540287852287292, + "loss/reg": 0.005924653727561235, + "step": 260 + }, + { + "epoch": 0.032625, + "grad_norm": 2.4450156688690186, + "grad_norm_var": 0.1254090419850094, + "learning_rate": 0.0001, + "loss": 0.9932, + "loss/crossentropy": 2.2374210357666016, + "loss/hidden": 0.80078125, + "loss/logits": 0.13316848874092102, + "loss/reg": 0.005922792013734579, + "step": 261 + }, + { + "epoch": 0.03275, + "grad_norm": 7.747511863708496, + "grad_norm_var": 1.8160510254643325, + "learning_rate": 0.0001, + "loss": 1.2542, + "loss/crossentropy": 2.8747429847717285, + "loss/hidden": 1.0234375, + "loss/logits": 0.17151576280593872, + "loss/reg": 0.005921173375099897, + "step": 262 + }, + { + "epoch": 0.032875, + "grad_norm": 2.1854233741760254, + "grad_norm_var": 1.8132730792650582, + "learning_rate": 0.0001, + "loss": 1.0069, + "loss/crossentropy": 2.4989960193634033, + "loss/hidden": 0.8125, + "loss/logits": 0.13518914580345154, + "loss/reg": 0.005919379647821188, + "step": 263 + }, + { + "epoch": 0.033, + "grad_norm": 3.5132219791412354, + "grad_norm_var": 1.8186749991604263, + "learning_rate": 0.0001, + "loss": 1.054, + "loss/crossentropy": 2.497178316116333, + "loss/hidden": 0.84765625, + "loss/logits": 0.1471494734287262, + "loss/reg": 0.005917761009186506, + "step": 264 + }, + { + "epoch": 0.033125, + "grad_norm": 4.302145481109619, + "grad_norm_var": 1.9358282916849012, + "learning_rate": 0.0001, + "loss": 1.3123, + "loss/crossentropy": 2.1725542545318604, + "loss/hidden": 1.0859375, + "loss/logits": 0.16722658276557922, + "loss/reg": 0.0059160212986171246, + "step": 265 + }, + { + "epoch": 0.03325, + "grad_norm": 2.3225510120391846, + "grad_norm_var": 1.9582913809461102, + "learning_rate": 0.0001, + "loss": 1.0153, + "loss/crossentropy": 2.6670029163360596, + "loss/hidden": 0.80859375, + "loss/logits": 0.1475904881954193, + "loss/reg": 0.0059142098762094975, + "step": 266 + }, + { + "epoch": 0.033375, + "grad_norm": 5.196990013122559, + "grad_norm_var": 2.27294427304937, + "learning_rate": 0.0001, + "loss": 1.1665, + "loss/crossentropy": 2.6792731285095215, + "loss/hidden": 0.94140625, + "loss/logits": 0.1659836769104004, + "loss/reg": 0.00591221172362566, + "step": 267 + }, + { + "epoch": 0.0335, + "grad_norm": 3.5144336223602295, + "grad_norm_var": 2.26638445070385, + "learning_rate": 0.0001, + "loss": 1.2502, + "loss/crossentropy": 2.2949023246765137, + "loss/hidden": 1.0234375, + "loss/logits": 0.1677004098892212, + "loss/reg": 0.005910532083362341, + "step": 268 + }, + { + "epoch": 0.033625, + "grad_norm": 2.861222267150879, + "grad_norm_var": 2.2433162495019436, + "learning_rate": 0.0001, + "loss": 1.3308, + "loss/crossentropy": 2.5955142974853516, + "loss/hidden": 1.0703125, + "loss/logits": 0.2013990730047226, + "loss/reg": 0.005908492021262646, + "step": 269 + }, + { + "epoch": 0.03375, + "grad_norm": 2.964390754699707, + "grad_norm_var": 2.1991134738974947, + "learning_rate": 0.0001, + "loss": 1.0975, + "loss/crossentropy": 2.483924150466919, + "loss/hidden": 0.8828125, + "loss/logits": 0.15562227368354797, + "loss/reg": 0.005906403064727783, + "step": 270 + }, + { + "epoch": 0.033875, + "grad_norm": 2.75604510307312, + "grad_norm_var": 2.1620222961988325, + "learning_rate": 0.0001, + "loss": 1.2196, + "loss/crossentropy": 2.39125394821167, + "loss/hidden": 0.9765625, + "loss/logits": 0.18403753638267517, + "loss/reg": 0.00590470340102911, + "step": 271 + }, + { + "epoch": 0.034, + "grad_norm": 2.360309362411499, + "grad_norm_var": 2.165352535939727, + "learning_rate": 0.0001, + "loss": 1.0194, + "loss/crossentropy": 2.530670404434204, + "loss/hidden": 0.8046875, + "loss/logits": 0.15565866231918335, + "loss/reg": 0.005902664735913277, + "step": 272 + }, + { + "epoch": 0.034125, + "grad_norm": 2.496027946472168, + "grad_norm_var": 2.1195219252368287, + "learning_rate": 0.0001, + "loss": 1.2228, + "loss/crossentropy": 2.7535252571105957, + "loss/hidden": 0.9609375, + "loss/logits": 0.20284873247146606, + "loss/reg": 0.005900639574974775, + "step": 273 + }, + { + "epoch": 0.03425, + "grad_norm": 2.854250431060791, + "grad_norm_var": 2.0941964139517344, + "learning_rate": 0.0001, + "loss": 1.1387, + "loss/crossentropy": 2.134964942932129, + "loss/hidden": 0.9296875, + "loss/logits": 0.15002194046974182, + "loss/reg": 0.005898929201066494, + "step": 274 + }, + { + "epoch": 0.034375, + "grad_norm": 4.497798442840576, + "grad_norm_var": 2.149396374832277, + "learning_rate": 0.0001, + "loss": 1.2312, + "loss/crossentropy": 2.3270835876464844, + "loss/hidden": 0.99609375, + "loss/logits": 0.17617599666118622, + "loss/reg": 0.0058972095139324665, + "step": 275 + }, + { + "epoch": 0.0345, + "grad_norm": 2.321152448654175, + "grad_norm_var": 2.1318278315927155, + "learning_rate": 0.0001, + "loss": 1.1523, + "loss/crossentropy": 1.858445644378662, + "loss/hidden": 0.94921875, + "loss/logits": 0.14408603310585022, + "loss/reg": 0.005895303096622229, + "step": 276 + }, + { + "epoch": 0.034625, + "grad_norm": 2.4426257610321045, + "grad_norm_var": 2.1321312734782243, + "learning_rate": 0.0001, + "loss": 1.0267, + "loss/crossentropy": 2.4483628273010254, + "loss/hidden": 0.82421875, + "loss/logits": 0.1435263752937317, + "loss/reg": 0.005893299821764231, + "step": 277 + }, + { + "epoch": 0.03475, + "grad_norm": 2.144637107849121, + "grad_norm_var": 0.843351985629086, + "learning_rate": 0.0001, + "loss": 1.0517, + "loss/crossentropy": 2.237915277481079, + "loss/hidden": 0.8515625, + "loss/logits": 0.14119011163711548, + "loss/reg": 0.005891298409551382, + "step": 278 + }, + { + "epoch": 0.034875, + "grad_norm": 2.32000732421875, + "grad_norm_var": 0.8290445100225684, + "learning_rate": 0.0001, + "loss": 1.0462, + "loss/crossentropy": 2.6588850021362305, + "loss/hidden": 0.83203125, + "loss/logits": 0.1552983820438385, + "loss/reg": 0.0058892290107905865, + "step": 279 + }, + { + "epoch": 0.035, + "grad_norm": 3.3390939235687256, + "grad_norm_var": 0.820283282746707, + "learning_rate": 0.0001, + "loss": 1.1937, + "loss/crossentropy": 2.5243186950683594, + "loss/hidden": 0.953125, + "loss/logits": 0.1817275732755661, + "loss/reg": 0.00588742271065712, + "step": 280 + }, + { + "epoch": 0.035125, + "grad_norm": 3.1800894737243652, + "grad_norm_var": 0.7106469411621028, + "learning_rate": 0.0001, + "loss": 1.1937, + "loss/crossentropy": 2.556126832962036, + "loss/hidden": 0.953125, + "loss/logits": 0.18167603015899658, + "loss/reg": 0.005885709077119827, + "step": 281 + }, + { + "epoch": 0.03525, + "grad_norm": 4.466390132904053, + "grad_norm_var": 0.8119073339313209, + "learning_rate": 0.0001, + "loss": 1.27, + "loss/crossentropy": 2.5671539306640625, + "loss/hidden": 0.984375, + "loss/logits": 0.2267427146434784, + "loss/reg": 0.0058837407268583775, + "step": 282 + }, + { + "epoch": 0.035375, + "grad_norm": 3.2809953689575195, + "grad_norm_var": 0.5074810718943117, + "learning_rate": 0.0001, + "loss": 1.1245, + "loss/crossentropy": 2.1554338932037354, + "loss/hidden": 0.9140625, + "loss/logits": 0.1516391634941101, + "loss/reg": 0.005881770513951778, + "step": 283 + }, + { + "epoch": 0.0355, + "grad_norm": 2.9982316493988037, + "grad_norm_var": 0.48786559613454966, + "learning_rate": 0.0001, + "loss": 1.1286, + "loss/crossentropy": 2.6773006916046143, + "loss/hidden": 0.90625, + "loss/logits": 0.1635606288909912, + "loss/reg": 0.005880062934011221, + "step": 284 + }, + { + "epoch": 0.035625, + "grad_norm": 2.387657880783081, + "grad_norm_var": 0.5078162485774572, + "learning_rate": 0.0001, + "loss": 1.1214, + "loss/crossentropy": 2.4741320610046387, + "loss/hidden": 0.8984375, + "loss/logits": 0.1641697734594345, + "loss/reg": 0.0058782072737813, + "step": 285 + }, + { + "epoch": 0.03575, + "grad_norm": 271.6628112792969, + "grad_norm_var": 4514.324895160767, + "learning_rate": 0.0001, + "loss": 1.6171, + "loss/crossentropy": 2.5766143798828125, + "loss/hidden": 1.375, + "loss/logits": 0.1833469420671463, + "loss/reg": 0.005876271054148674, + "step": 286 + }, + { + "epoch": 0.035875, + "grad_norm": 3.545677900314331, + "grad_norm_var": 4512.577903953303, + "learning_rate": 0.0001, + "loss": 1.1466, + "loss/crossentropy": 2.5389881134033203, + "loss/hidden": 0.88671875, + "loss/logits": 0.20117658376693726, + "loss/reg": 0.005874336697161198, + "step": 287 + }, + { + "epoch": 0.036, + "grad_norm": 2.9219233989715576, + "grad_norm_var": 4511.294050983276, + "learning_rate": 0.0001, + "loss": 1.1121, + "loss/crossentropy": 2.3270509243011475, + "loss/hidden": 0.8828125, + "loss/logits": 0.17058232426643372, + "loss/reg": 0.005872361361980438, + "step": 288 + }, + { + "epoch": 0.036125, + "grad_norm": 2.831878423690796, + "grad_norm_var": 4510.526061571783, + "learning_rate": 0.0001, + "loss": 1.148, + "loss/crossentropy": 2.4853744506835938, + "loss/hidden": 0.91796875, + "loss/logits": 0.17128118872642517, + "loss/reg": 0.005870639346539974, + "step": 289 + }, + { + "epoch": 0.03625, + "grad_norm": 2.284134864807129, + "grad_norm_var": 4511.83639181831, + "learning_rate": 0.0001, + "loss": 1.0599, + "loss/crossentropy": 2.3107759952545166, + "loss/hidden": 0.8515625, + "loss/logits": 0.14969472587108612, + "loss/reg": 0.005868903826922178, + "step": 290 + }, + { + "epoch": 0.036375, + "grad_norm": 2.2008161544799805, + "grad_norm_var": 4516.84932017332, + "learning_rate": 0.0001, + "loss": 1.0902, + "loss/crossentropy": 2.4265358448028564, + "loss/hidden": 0.86328125, + "loss/logits": 0.1682073473930359, + "loss/reg": 0.0058671231381595135, + "step": 291 + }, + { + "epoch": 0.0365, + "grad_norm": 2.6285743713378906, + "grad_norm_var": 4516.145108725088, + "learning_rate": 0.0001, + "loss": 1.2494, + "loss/crossentropy": 2.372230291366577, + "loss/hidden": 0.98046875, + "loss/logits": 0.2102714478969574, + "loss/reg": 0.005865375977009535, + "step": 292 + }, + { + "epoch": 0.036625, + "grad_norm": 2.6784040927886963, + "grad_norm_var": 4515.607170253259, + "learning_rate": 0.0001, + "loss": 1.0752, + "loss/crossentropy": 2.6276440620422363, + "loss/hidden": 0.875, + "loss/logits": 0.14159329235553741, + "loss/reg": 0.005863656289875507, + "step": 293 + }, + { + "epoch": 0.03675, + "grad_norm": 2.6373047828674316, + "grad_norm_var": 4514.470495103465, + "learning_rate": 0.0001, + "loss": 1.1694, + "loss/crossentropy": 2.70892333984375, + "loss/hidden": 0.9453125, + "loss/logits": 0.16546514630317688, + "loss/reg": 0.005862091202288866, + "step": 294 + }, + { + "epoch": 0.036875, + "grad_norm": 2.384430170059204, + "grad_norm_var": 4514.321377312488, + "learning_rate": 0.0001, + "loss": 1.2472, + "loss/crossentropy": 2.1273090839385986, + "loss/hidden": 1.0, + "loss/logits": 0.18860690295696259, + "loss/reg": 0.005860424134880304, + "step": 295 + }, + { + "epoch": 0.037, + "grad_norm": 2.5959692001342773, + "grad_norm_var": 4515.978398966678, + "learning_rate": 0.0001, + "loss": 1.0376, + "loss/crossentropy": 2.7293522357940674, + "loss/hidden": 0.8203125, + "loss/logits": 0.1587076485157013, + "loss/reg": 0.0058588446117937565, + "step": 296 + }, + { + "epoch": 0.037125, + "grad_norm": 2.2753238677978516, + "grad_norm_var": 4518.0185669920775, + "learning_rate": 0.0001, + "loss": 1.0063, + "loss/crossentropy": 2.4602949619293213, + "loss/hidden": 0.8125, + "loss/logits": 0.13525693118572235, + "loss/reg": 0.005857320036739111, + "step": 297 + }, + { + "epoch": 0.03725, + "grad_norm": 3.009300708770752, + "grad_norm_var": 4521.093589717446, + "learning_rate": 0.0001, + "loss": 1.2573, + "loss/crossentropy": 2.8883349895477295, + "loss/hidden": 0.9921875, + "loss/logits": 0.20657645165920258, + "loss/reg": 0.005855792202055454, + "step": 298 + }, + { + "epoch": 0.037375, + "grad_norm": 2.700221538543701, + "grad_norm_var": 4522.372179334166, + "learning_rate": 0.0001, + "loss": 1.1557, + "loss/crossentropy": 2.5446314811706543, + "loss/hidden": 0.90234375, + "loss/logits": 0.19479964673519135, + "loss/reg": 0.005854278337210417, + "step": 299 + }, + { + "epoch": 0.0375, + "grad_norm": 2.3786559104919434, + "grad_norm_var": 4523.758055495688, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.469960927963257, + "loss/hidden": 0.90234375, + "loss/logits": 0.16156738996505737, + "loss/reg": 0.00585273839533329, + "step": 300 + }, + { + "epoch": 0.037625, + "grad_norm": 2.7032158374786377, + "grad_norm_var": 4523.046593599144, + "learning_rate": 0.0001, + "loss": 1.1947, + "loss/crossentropy": 2.7451162338256836, + "loss/hidden": 0.94140625, + "loss/logits": 0.19476984441280365, + "loss/reg": 0.0058509958907961845, + "step": 301 + }, + { + "epoch": 0.03775, + "grad_norm": 2.507664442062378, + "grad_norm_var": 0.11250867537391755, + "learning_rate": 0.0001, + "loss": 0.9899, + "loss/crossentropy": 2.53341007232666, + "loss/hidden": 0.796875, + "loss/logits": 0.1345081329345703, + "loss/reg": 0.005849248263984919, + "step": 302 + }, + { + "epoch": 0.037875, + "grad_norm": 3.027892589569092, + "grad_norm_var": 0.06692647718721882, + "learning_rate": 0.0001, + "loss": 1.0973, + "loss/crossentropy": 2.7899296283721924, + "loss/hidden": 0.890625, + "loss/logits": 0.1482122391462326, + "loss/reg": 0.005847662687301636, + "step": 303 + }, + { + "epoch": 0.038, + "grad_norm": 2.1617183685302734, + "grad_norm_var": 0.07146536810277529, + "learning_rate": 0.0001, + "loss": 0.969, + "loss/crossentropy": 2.4700305461883545, + "loss/hidden": 0.78125, + "loss/logits": 0.12925508618354797, + "loss/reg": 0.005846073850989342, + "step": 304 + }, + { + "epoch": 0.038125, + "grad_norm": 2.3791332244873047, + "grad_norm_var": 0.06803597239225306, + "learning_rate": 0.0001, + "loss": 1.1912, + "loss/crossentropy": 2.4171202182769775, + "loss/hidden": 0.9453125, + "loss/logits": 0.18739524483680725, + "loss/reg": 0.005844476167112589, + "step": 305 + }, + { + "epoch": 0.03825, + "grad_norm": 2.7622976303100586, + "grad_norm_var": 0.06636088237049004, + "learning_rate": 0.0001, + "loss": 1.0808, + "loss/crossentropy": 2.5030367374420166, + "loss/hidden": 0.8359375, + "loss/logits": 0.18643516302108765, + "loss/reg": 0.005842759273946285, + "step": 306 + }, + { + "epoch": 0.038375, + "grad_norm": 2.4079246520996094, + "grad_norm_var": 0.059000676657357566, + "learning_rate": 0.0001, + "loss": 1.0359, + "loss/crossentropy": 2.381542682647705, + "loss/hidden": 0.828125, + "loss/logits": 0.1493588387966156, + "loss/reg": 0.0058412267826497555, + "step": 307 + }, + { + "epoch": 0.0385, + "grad_norm": 2.5356478691101074, + "grad_norm_var": 0.058906038923372726, + "learning_rate": 0.0001, + "loss": 1.087, + "loss/crossentropy": 2.4928808212280273, + "loss/hidden": 0.875, + "loss/logits": 0.15363982319831848, + "loss/reg": 0.0058394852094352245, + "step": 308 + }, + { + "epoch": 0.038625, + "grad_norm": 2.4036688804626465, + "grad_norm_var": 0.0597099908353601, + "learning_rate": 0.0001, + "loss": 0.986, + "loss/crossentropy": 2.5816946029663086, + "loss/hidden": 0.7890625, + "loss/logits": 0.13851355016231537, + "loss/reg": 0.005837727338075638, + "step": 309 + }, + { + "epoch": 0.03875, + "grad_norm": 2.630572557449341, + "grad_norm_var": 0.05963840398777146, + "learning_rate": 0.0001, + "loss": 1.0333, + "loss/crossentropy": 2.140015125274658, + "loss/hidden": 0.828125, + "loss/logits": 0.14680367708206177, + "loss/reg": 0.005835913587361574, + "step": 310 + }, + { + "epoch": 0.038875, + "grad_norm": 2.3641905784606934, + "grad_norm_var": 0.06012154861927167, + "learning_rate": 0.0001, + "loss": 1.0947, + "loss/crossentropy": 2.3300833702087402, + "loss/hidden": 0.8828125, + "loss/logits": 0.15358075499534607, + "loss/reg": 0.005834224168211222, + "step": 311 + }, + { + "epoch": 0.039, + "grad_norm": 2.215728759765625, + "grad_norm_var": 0.06696490679455162, + "learning_rate": 0.0001, + "loss": 1.1411, + "loss/crossentropy": 2.4583277702331543, + "loss/hidden": 0.9140625, + "loss/logits": 0.1687404215335846, + "loss/reg": 0.005832599475979805, + "step": 312 + }, + { + "epoch": 0.039125, + "grad_norm": 2.8934550285339355, + "grad_norm_var": 0.06994228066174794, + "learning_rate": 0.0001, + "loss": 1.2763, + "loss/crossentropy": 2.409702777862549, + "loss/hidden": 1.0390625, + "loss/logits": 0.17889352142810822, + "loss/reg": 0.005831001792103052, + "step": 313 + }, + { + "epoch": 0.03925, + "grad_norm": 8.741681098937988, + "grad_norm_var": 2.4613182467650705, + "learning_rate": 0.0001, + "loss": 1.1972, + "loss/crossentropy": 2.3858492374420166, + "loss/hidden": 0.96875, + "loss/logits": 0.1701970100402832, + "loss/reg": 0.005829236935824156, + "step": 314 + }, + { + "epoch": 0.039375, + "grad_norm": 7.412417411804199, + "grad_norm_var": 3.707354176329111, + "learning_rate": 0.0001, + "loss": 1.3096, + "loss/crossentropy": 2.3804125785827637, + "loss/hidden": 1.1015625, + "loss/logits": 0.149795800447464, + "loss/reg": 0.005827469285577536, + "step": 315 + }, + { + "epoch": 0.0395, + "grad_norm": 3.1443870067596436, + "grad_norm_var": 3.6580641482995806, + "learning_rate": 0.0001, + "loss": 1.1365, + "loss/crossentropy": 2.481820583343506, + "loss/hidden": 0.90234375, + "loss/logits": 0.1759084165096283, + "loss/reg": 0.005825776606798172, + "step": 316 + }, + { + "epoch": 0.039625, + "grad_norm": 2.8567562103271484, + "grad_norm_var": 3.6479706732170993, + "learning_rate": 0.0001, + "loss": 1.0023, + "loss/crossentropy": 2.5141823291778564, + "loss/hidden": 0.80078125, + "loss/logits": 0.14331723749637604, + "loss/reg": 0.005824015475809574, + "step": 317 + }, + { + "epoch": 0.03975, + "grad_norm": 2.2817444801330566, + "grad_norm_var": 3.674359828489624, + "learning_rate": 0.0001, + "loss": 1.0893, + "loss/crossentropy": 2.184128999710083, + "loss/hidden": 0.875, + "loss/logits": 0.15605026483535767, + "loss/reg": 0.00582248717546463, + "step": 318 + }, + { + "epoch": 0.039875, + "grad_norm": 2.249969005584717, + "grad_norm_var": 3.736641439481692, + "learning_rate": 0.0001, + "loss": 1.008, + "loss/crossentropy": 2.768484354019165, + "loss/hidden": 0.80078125, + "loss/logits": 0.14897163212299347, + "loss/reg": 0.00582079216837883, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 2.6358306407928467, + "grad_norm_var": 3.684102068428194, + "learning_rate": 0.0001, + "loss": 1.3237, + "loss/crossentropy": 2.301954507827759, + "loss/hidden": 1.015625, + "loss/logits": 0.24987459182739258, + "loss/reg": 0.005819002632051706, + "step": 320 + }, + { + "epoch": 0.040125, + "grad_norm": 2.353457450866699, + "grad_norm_var": 3.6871065280104496, + "learning_rate": 0.0001, + "loss": 1.1095, + "loss/crossentropy": 2.379765272140503, + "loss/hidden": 0.89453125, + "loss/logits": 0.15680107474327087, + "loss/reg": 0.005817302968353033, + "step": 321 + }, + { + "epoch": 0.04025, + "grad_norm": 2.4568967819213867, + "grad_norm_var": 3.712514538750317, + "learning_rate": 0.0001, + "loss": 0.9706, + "loss/crossentropy": 2.380795955657959, + "loss/hidden": 0.77734375, + "loss/logits": 0.13508911430835724, + "loss/reg": 0.005815597716718912, + "step": 322 + }, + { + "epoch": 0.040375, + "grad_norm": 3.207794189453125, + "grad_norm_var": 3.6654654630236734, + "learning_rate": 0.0001, + "loss": 1.3668, + "loss/crossentropy": 1.949703574180603, + "loss/hidden": 1.1171875, + "loss/logits": 0.19150257110595703, + "loss/reg": 0.005813860800117254, + "step": 323 + }, + { + "epoch": 0.0405, + "grad_norm": 3.156318187713623, + "grad_norm_var": 3.6284383166396252, + "learning_rate": 0.0001, + "loss": 1.2742, + "loss/crossentropy": 2.1970410346984863, + "loss/hidden": 1.0, + "loss/logits": 0.21606677770614624, + "loss/reg": 0.005812041461467743, + "step": 324 + }, + { + "epoch": 0.040625, + "grad_norm": 2.556889533996582, + "grad_norm_var": 3.611332493108523, + "learning_rate": 0.0001, + "loss": 0.9529, + "loss/crossentropy": 2.7647974491119385, + "loss/hidden": 0.7578125, + "loss/logits": 0.1369488537311554, + "loss/reg": 0.00581031059846282, + "step": 325 + }, + { + "epoch": 0.04075, + "grad_norm": 2.2634167671203613, + "grad_norm_var": 3.653624545749698, + "learning_rate": 0.0001, + "loss": 1.0757, + "loss/crossentropy": 2.334134340286255, + "loss/hidden": 0.859375, + "loss/logits": 0.1581987738609314, + "loss/reg": 0.005808570422232151, + "step": 326 + }, + { + "epoch": 0.040875, + "grad_norm": 2.3521125316619873, + "grad_norm_var": 3.6551397839485555, + "learning_rate": 0.0001, + "loss": 0.9965, + "loss/crossentropy": 2.78828763961792, + "loss/hidden": 0.79296875, + "loss/logits": 0.1454332172870636, + "loss/reg": 0.005806888919323683, + "step": 327 + }, + { + "epoch": 0.041, + "grad_norm": 3.0836093425750732, + "grad_norm_var": 3.5768996944618254, + "learning_rate": 0.0001, + "loss": 1.1938, + "loss/crossentropy": 2.2781612873077393, + "loss/hidden": 0.9609375, + "loss/logits": 0.1747758537530899, + "loss/reg": 0.005805303808301687, + "step": 328 + }, + { + "epoch": 0.041125, + "grad_norm": 3.6110970973968506, + "grad_norm_var": 3.5651235487558246, + "learning_rate": 0.0001, + "loss": 1.1693, + "loss/crossentropy": 2.812913417816162, + "loss/hidden": 0.9375, + "loss/logits": 0.17377659678459167, + "loss/reg": 0.005803780164569616, + "step": 329 + }, + { + "epoch": 0.04125, + "grad_norm": 2.5020155906677246, + "grad_norm_var": 1.552569952590708, + "learning_rate": 0.0001, + "loss": 1.0862, + "loss/crossentropy": 2.6585140228271484, + "loss/hidden": 0.86328125, + "loss/logits": 0.16489718854427338, + "loss/reg": 0.005802258383482695, + "step": 330 + }, + { + "epoch": 0.041375, + "grad_norm": 2.383924961090088, + "grad_norm_var": 0.17978007457456116, + "learning_rate": 0.0001, + "loss": 1.1592, + "loss/crossentropy": 2.4862210750579834, + "loss/hidden": 0.94921875, + "loss/logits": 0.15199331939220428, + "loss/reg": 0.005800731014460325, + "step": 331 + }, + { + "epoch": 0.0415, + "grad_norm": 2.187321424484253, + "grad_norm_var": 0.17949311071790794, + "learning_rate": 0.0001, + "loss": 1.0507, + "loss/crossentropy": 2.6380603313446045, + "loss/hidden": 0.84765625, + "loss/logits": 0.14507073163986206, + "loss/reg": 0.005798923317342997, + "step": 332 + }, + { + "epoch": 0.041625, + "grad_norm": 2.21768856048584, + "grad_norm_var": 0.18601193201957902, + "learning_rate": 0.0001, + "loss": 1.1027, + "loss/crossentropy": 2.3925793170928955, + "loss/hidden": 0.875, + "loss/logits": 0.16972869634628296, + "loss/reg": 0.00579707371070981, + "step": 333 + }, + { + "epoch": 0.04175, + "grad_norm": 2.682497262954712, + "grad_norm_var": 0.17937770683656615, + "learning_rate": 0.0001, + "loss": 1.3272, + "loss/crossentropy": 2.3586106300354004, + "loss/hidden": 1.078125, + "loss/logits": 0.1911502480506897, + "loss/reg": 0.005795224104076624, + "step": 334 + }, + { + "epoch": 0.041875, + "grad_norm": 3.0983307361602783, + "grad_norm_var": 0.1826395003188658, + "learning_rate": 0.0001, + "loss": 1.1675, + "loss/crossentropy": 2.436326265335083, + "loss/hidden": 0.91796875, + "loss/logits": 0.1915540099143982, + "loss/reg": 0.005793258547782898, + "step": 335 + }, + { + "epoch": 0.042, + "grad_norm": 6.251674652099609, + "grad_norm_var": 0.982431631272856, + "learning_rate": 0.0001, + "loss": 1.6879, + "loss/crossentropy": 2.3841142654418945, + "loss/hidden": 1.265625, + "loss/logits": 0.3643344044685364, + "loss/reg": 0.0057912725023925304, + "step": 336 + }, + { + "epoch": 0.042125, + "grad_norm": 3.0111782550811768, + "grad_norm_var": 0.9617308564996427, + "learning_rate": 0.0001, + "loss": 1.3497, + "loss/crossentropy": 2.430532217025757, + "loss/hidden": 1.0703125, + "loss/logits": 0.2214677333831787, + "loss/reg": 0.00578899122774601, + "step": 337 + }, + { + "epoch": 0.04225, + "grad_norm": 2.4221205711364746, + "grad_norm_var": 0.9640415151512265, + "learning_rate": 0.0001, + "loss": 1.0955, + "loss/crossentropy": 2.4376015663146973, + "loss/hidden": 0.890625, + "loss/logits": 0.1470467746257782, + "loss/reg": 0.005786662455648184, + "step": 338 + }, + { + "epoch": 0.042375, + "grad_norm": 2.615758180618286, + "grad_norm_var": 0.9645524062068328, + "learning_rate": 0.0001, + "loss": 1.0887, + "loss/crossentropy": 2.5318005084991455, + "loss/hidden": 0.875, + "loss/logits": 0.15580901503562927, + "loss/reg": 0.0057848175056278706, + "step": 339 + }, + { + "epoch": 0.0425, + "grad_norm": 2.857177972793579, + "grad_norm_var": 0.9599117798964886, + "learning_rate": 0.0001, + "loss": 1.1153, + "loss/crossentropy": 2.4260058403015137, + "loss/hidden": 0.89453125, + "loss/logits": 0.16291844844818115, + "loss/reg": 0.005782809574157, + "step": 340 + }, + { + "epoch": 0.042625, + "grad_norm": 2.4030630588531494, + "grad_norm_var": 0.9680393035693963, + "learning_rate": 0.0001, + "loss": 1.2054, + "loss/crossentropy": 2.3009443283081055, + "loss/hidden": 0.953125, + "loss/logits": 0.194431871175766, + "loss/reg": 0.005780525505542755, + "step": 341 + }, + { + "epoch": 0.04275, + "grad_norm": 2.264251470565796, + "grad_norm_var": 0.9679716782722624, + "learning_rate": 0.0001, + "loss": 1.0227, + "loss/crossentropy": 2.597288131713867, + "loss/hidden": 0.8203125, + "loss/logits": 0.14457917213439941, + "loss/reg": 0.005778233055025339, + "step": 342 + }, + { + "epoch": 0.042875, + "grad_norm": 2.2368180751800537, + "grad_norm_var": 0.9767866404468121, + "learning_rate": 0.0001, + "loss": 0.943, + "loss/crossentropy": 2.4534237384796143, + "loss/hidden": 0.7578125, + "loss/logits": 0.12742644548416138, + "loss/reg": 0.005776000674813986, + "step": 343 + }, + { + "epoch": 0.043, + "grad_norm": 2.469120979309082, + "grad_norm_var": 0.9824165851632264, + "learning_rate": 0.0001, + "loss": 1.0531, + "loss/crossentropy": 2.793834686279297, + "loss/hidden": 0.83984375, + "loss/logits": 0.15554235875606537, + "loss/reg": 0.005774145945906639, + "step": 344 + }, + { + "epoch": 0.043125, + "grad_norm": 2.8334686756134033, + "grad_norm_var": 0.9387961568478952, + "learning_rate": 0.0001, + "loss": 0.9467, + "loss/crossentropy": 2.678666830062866, + "loss/hidden": 0.7578125, + "loss/logits": 0.13116785883903503, + "loss/reg": 0.005771928001195192, + "step": 345 + }, + { + "epoch": 0.04325, + "grad_norm": 7.863356590270996, + "grad_norm_var": 2.5385263322105893, + "learning_rate": 0.0001, + "loss": 1.4695, + "loss/crossentropy": 2.613318920135498, + "loss/hidden": 1.2734375, + "loss/logits": 0.13832132518291473, + "loss/reg": 0.005770097486674786, + "step": 346 + }, + { + "epoch": 0.043375, + "grad_norm": 2.763582468032837, + "grad_norm_var": 2.510660987467067, + "learning_rate": 0.0001, + "loss": 1.1302, + "loss/crossentropy": 2.846453905105591, + "loss/hidden": 0.90625, + "loss/logits": 0.166295126080513, + "loss/reg": 0.0057678911834955215, + "step": 347 + }, + { + "epoch": 0.0435, + "grad_norm": 3.600456714630127, + "grad_norm_var": 2.4567056984087676, + "learning_rate": 0.0001, + "loss": 1.2108, + "loss/crossentropy": 2.515092372894287, + "loss/hidden": 0.96875, + "loss/logits": 0.18436874449253082, + "loss/reg": 0.005765695124864578, + "step": 348 + }, + { + "epoch": 0.043625, + "grad_norm": 4.2698073387146, + "grad_norm_var": 2.4444505062987636, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.3673834800720215, + "loss/hidden": 0.8984375, + "loss/logits": 0.16628439724445343, + "loss/reg": 0.005763507913798094, + "step": 349 + }, + { + "epoch": 0.04375, + "grad_norm": 2.962045192718506, + "grad_norm_var": 2.42435544256402, + "learning_rate": 0.0001, + "loss": 1.079, + "loss/crossentropy": 2.9470205307006836, + "loss/hidden": 0.83203125, + "loss/logits": 0.1893935650587082, + "loss/reg": 0.005761242005974054, + "step": 350 + }, + { + "epoch": 0.043875, + "grad_norm": 3.0306880474090576, + "grad_norm_var": 2.427092851603572, + "learning_rate": 0.0001, + "loss": 1.0201, + "loss/crossentropy": 2.3637542724609375, + "loss/hidden": 0.83203125, + "loss/logits": 0.13047108054161072, + "loss/reg": 0.0057592191733419895, + "step": 351 + }, + { + "epoch": 0.044, + "grad_norm": 2.599585771560669, + "grad_norm_var": 1.855493477227511, + "learning_rate": 0.0001, + "loss": 0.9429, + "loss/crossentropy": 2.9222559928894043, + "loss/hidden": 0.7578125, + "loss/logits": 0.12747693061828613, + "loss/reg": 0.005757040809839964, + "step": 352 + }, + { + "epoch": 0.044125, + "grad_norm": 2.4723081588745117, + "grad_norm_var": 1.882729557078295, + "learning_rate": 0.0001, + "loss": 1.2276, + "loss/crossentropy": 2.5835001468658447, + "loss/hidden": 0.94921875, + "loss/logits": 0.220790833234787, + "loss/reg": 0.005754764657467604, + "step": 353 + }, + { + "epoch": 0.04425, + "grad_norm": 2.5266165733337402, + "grad_norm_var": 1.873911870827686, + "learning_rate": 0.0001, + "loss": 1.1879, + "loss/crossentropy": 2.4273722171783447, + "loss/hidden": 0.97265625, + "loss/logits": 0.15772980451583862, + "loss/reg": 0.005752884317189455, + "step": 354 + }, + { + "epoch": 0.044375, + "grad_norm": 2.8139867782592773, + "grad_norm_var": 1.8632913443851133, + "learning_rate": 0.0001, + "loss": 1.2803, + "loss/crossentropy": 2.591078996658325, + "loss/hidden": 1.0234375, + "loss/logits": 0.19931599497795105, + "loss/reg": 0.0057507967576384544, + "step": 355 + }, + { + "epoch": 0.0445, + "grad_norm": 2.0173490047454834, + "grad_norm_var": 1.9371277324683585, + "learning_rate": 0.0001, + "loss": 1.0066, + "loss/crossentropy": 2.415416955947876, + "loss/hidden": 0.80859375, + "loss/logits": 0.14050991833209991, + "loss/reg": 0.005748571362346411, + "step": 356 + }, + { + "epoch": 0.044625, + "grad_norm": 3.5304269790649414, + "grad_norm_var": 1.916250206343263, + "learning_rate": 0.0001, + "loss": 1.2665, + "loss/crossentropy": 2.7149741649627686, + "loss/hidden": 1.0390625, + "loss/logits": 0.16997796297073364, + "loss/reg": 0.005746254697442055, + "step": 357 + }, + { + "epoch": 0.04475, + "grad_norm": 47.96537399291992, + "grad_norm_var": 127.11164707702224, + "learning_rate": 0.0001, + "loss": 1.4579, + "loss/crossentropy": 2.7637100219726562, + "loss/hidden": 1.2265625, + "loss/logits": 0.17390823364257812, + "loss/reg": 0.005744417663663626, + "step": 358 + }, + { + "epoch": 0.044875, + "grad_norm": 2.253833055496216, + "grad_norm_var": 127.10313415769795, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.3016419410705566, + "loss/hidden": 0.9140625, + "loss/logits": 0.16676074266433716, + "loss/reg": 0.005742207169532776, + "step": 359 + }, + { + "epoch": 0.045, + "grad_norm": 3.2059576511383057, + "grad_norm_var": 126.79034824550331, + "learning_rate": 0.0001, + "loss": 1.2389, + "loss/crossentropy": 2.624589204788208, + "loss/hidden": 1.0, + "loss/logits": 0.18154433369636536, + "loss/reg": 0.005740353371948004, + "step": 360 + }, + { + "epoch": 0.045125, + "grad_norm": 2.456129789352417, + "grad_norm_var": 126.9607902891753, + "learning_rate": 0.0001, + "loss": 1.0342, + "loss/crossentropy": 2.500290870666504, + "loss/hidden": 0.83203125, + "loss/logits": 0.14475134015083313, + "loss/reg": 0.005738324951380491, + "step": 361 + }, + { + "epoch": 0.04525, + "grad_norm": 3.081372022628784, + "grad_norm_var": 127.21513938268541, + "learning_rate": 0.0001, + "loss": 1.1093, + "loss/crossentropy": 2.3305118083953857, + "loss/hidden": 0.8984375, + "loss/logits": 0.15346962213516235, + "loss/reg": 0.0057361493818461895, + "step": 362 + }, + { + "epoch": 0.045375, + "grad_norm": 2.2634801864624023, + "grad_norm_var": 127.4280286195785, + "learning_rate": 0.0001, + "loss": 1.0956, + "loss/crossentropy": 2.4553990364074707, + "loss/hidden": 0.875, + "loss/logits": 0.16324618458747864, + "loss/reg": 0.005734298378229141, + "step": 363 + }, + { + "epoch": 0.0455, + "grad_norm": 3.9597907066345215, + "grad_norm_var": 127.3359579534097, + "learning_rate": 0.0001, + "loss": 1.3557, + "loss/crossentropy": 2.6449685096740723, + "loss/hidden": 1.078125, + "loss/logits": 0.2202637791633606, + "loss/reg": 0.005732398014515638, + "step": 364 + }, + { + "epoch": 0.045625, + "grad_norm": 2.7794013023376465, + "grad_norm_var": 127.76159157574789, + "learning_rate": 0.0001, + "loss": 1.0787, + "loss/crossentropy": 2.3118059635162354, + "loss/hidden": 0.86328125, + "loss/logits": 0.1581302285194397, + "loss/reg": 0.005730301141738892, + "step": 365 + }, + { + "epoch": 0.04575, + "grad_norm": 4.7589192390441895, + "grad_norm_var": 127.32661229099328, + "learning_rate": 0.0001, + "loss": 1.3244, + "loss/crossentropy": 2.5914306640625, + "loss/hidden": 1.078125, + "loss/logits": 0.18898184597492218, + "loss/reg": 0.005728167947381735, + "step": 366 + }, + { + "epoch": 0.045875, + "grad_norm": 4.024761199951172, + "grad_norm_var": 127.03030673720949, + "learning_rate": 0.0001, + "loss": 1.421, + "loss/crossentropy": 2.083667755126953, + "loss/hidden": 1.1640625, + "loss/logits": 0.1997053027153015, + "loss/reg": 0.005726283416152, + "step": 367 + }, + { + "epoch": 0.046, + "grad_norm": 2.9291043281555176, + "grad_norm_var": 126.89672944049376, + "learning_rate": 0.0001, + "loss": 1.1321, + "loss/crossentropy": 2.7017500400543213, + "loss/hidden": 0.90625, + "loss/logits": 0.1686232089996338, + "loss/reg": 0.005724436603486538, + "step": 368 + }, + { + "epoch": 0.046125, + "grad_norm": 2.289379119873047, + "grad_norm_var": 126.98034912166224, + "learning_rate": 0.0001, + "loss": 1.0433, + "loss/crossentropy": 2.404045581817627, + "loss/hidden": 0.8359375, + "loss/logits": 0.1501048356294632, + "loss/reg": 0.005722455680370331, + "step": 369 + }, + { + "epoch": 0.04625, + "grad_norm": 2.5955307483673096, + "grad_norm_var": 126.95053618311779, + "learning_rate": 0.0001, + "loss": 1.1052, + "loss/crossentropy": 2.555497407913208, + "loss/hidden": 0.87890625, + "loss/logits": 0.16912290453910828, + "loss/reg": 0.0057206167839467525, + "step": 370 + }, + { + "epoch": 0.046375, + "grad_norm": 2.5631515979766846, + "grad_norm_var": 127.05459572518181, + "learning_rate": 0.0001, + "loss": 1.0105, + "loss/crossentropy": 2.3253824710845947, + "loss/hidden": 0.80859375, + "loss/logits": 0.14470672607421875, + "loss/reg": 0.005718756001442671, + "step": 371 + }, + { + "epoch": 0.0465, + "grad_norm": 2.8995003700256348, + "grad_norm_var": 126.65924311218065, + "learning_rate": 0.0001, + "loss": 1.0727, + "loss/crossentropy": 2.5171523094177246, + "loss/hidden": 0.859375, + "loss/logits": 0.15616215765476227, + "loss/reg": 0.005716769490391016, + "step": 372 + }, + { + "epoch": 0.046625, + "grad_norm": 2.4674322605133057, + "grad_norm_var": 127.0582358856119, + "learning_rate": 0.0001, + "loss": 0.9544, + "loss/crossentropy": 2.426679849624634, + "loss/hidden": 0.765625, + "loss/logits": 0.13166998326778412, + "loss/reg": 0.005714884493499994, + "step": 373 + }, + { + "epoch": 0.04675, + "grad_norm": 2.1486146450042725, + "grad_norm_var": 0.5554253140062239, + "learning_rate": 0.0001, + "loss": 1.0123, + "loss/crossentropy": 2.3567564487457275, + "loss/hidden": 0.8203125, + "loss/logits": 0.1348218023777008, + "loss/reg": 0.0057129692286252975, + "step": 374 + }, + { + "epoch": 0.046875, + "grad_norm": 2.4249770641326904, + "grad_norm_var": 0.5421168003854054, + "learning_rate": 0.0001, + "loss": 1.0005, + "loss/crossentropy": 2.575383424758911, + "loss/hidden": 0.80078125, + "loss/logits": 0.1425924003124237, + "loss/reg": 0.005710979457944632, + "step": 375 + }, + { + "epoch": 0.047, + "grad_norm": 3.9449760913848877, + "grad_norm_var": 0.6036429091311817, + "learning_rate": 0.0001, + "loss": 1.1428, + "loss/crossentropy": 2.5839173793792725, + "loss/hidden": 0.94921875, + "loss/logits": 0.13653349876403809, + "loss/reg": 0.0057089440524578094, + "step": 376 + }, + { + "epoch": 0.047125, + "grad_norm": 2.3119592666625977, + "grad_norm_var": 0.6148998912723904, + "learning_rate": 0.0001, + "loss": 1.088, + "loss/crossentropy": 2.492663860321045, + "loss/hidden": 0.859375, + "loss/logits": 0.1715661883354187, + "loss/reg": 0.005707095842808485, + "step": 377 + }, + { + "epoch": 0.04725, + "grad_norm": 3.586817979812622, + "grad_norm_var": 0.6386998540868449, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.8210177421569824, + "loss/hidden": 0.87890625, + "loss/logits": 0.15476316213607788, + "loss/reg": 0.005705154500901699, + "step": 378 + }, + { + "epoch": 0.047375, + "grad_norm": 2.805647850036621, + "grad_norm_var": 0.6040650287121667, + "learning_rate": 0.0001, + "loss": 1.0792, + "loss/crossentropy": 2.54019832611084, + "loss/hidden": 0.859375, + "loss/logits": 0.16280022263526917, + "loss/reg": 0.005703243892639875, + "step": 379 + }, + { + "epoch": 0.0475, + "grad_norm": 2.7932748794555664, + "grad_norm_var": 0.5445939245804574, + "learning_rate": 0.0001, + "loss": 1.4621, + "loss/crossentropy": 2.2343437671661377, + "loss/hidden": 1.1953125, + "loss/logits": 0.20978981256484985, + "loss/reg": 0.005701290909200907, + "step": 380 + }, + { + "epoch": 0.047625, + "grad_norm": 2.661917209625244, + "grad_norm_var": 0.5482497924242672, + "learning_rate": 0.0001, + "loss": 0.9746, + "loss/crossentropy": 2.782052516937256, + "loss/hidden": 0.78125, + "loss/logits": 0.13640211522579193, + "loss/reg": 0.0056994096376001835, + "step": 381 + }, + { + "epoch": 0.04775, + "grad_norm": 2.4914302825927734, + "grad_norm_var": 0.3228126995822395, + "learning_rate": 0.0001, + "loss": 1.126, + "loss/crossentropy": 2.166295051574707, + "loss/hidden": 0.91015625, + "loss/logits": 0.1589164137840271, + "loss/reg": 0.005697426851838827, + "step": 382 + }, + { + "epoch": 0.047875, + "grad_norm": 2.961653709411621, + "grad_norm_var": 0.22106978564282992, + "learning_rate": 0.0001, + "loss": 1.1071, + "loss/crossentropy": 2.5477302074432373, + "loss/hidden": 0.8828125, + "loss/logits": 0.16730068624019623, + "loss/reg": 0.005695413798093796, + "step": 383 + }, + { + "epoch": 0.048, + "grad_norm": 2.9396286010742188, + "grad_norm_var": 0.22133896443579198, + "learning_rate": 0.0001, + "loss": 1.0254, + "loss/crossentropy": 2.555258274078369, + "loss/hidden": 0.828125, + "loss/logits": 0.1403425633907318, + "loss/reg": 0.005693417973816395, + "step": 384 + }, + { + "epoch": 0.048125, + "grad_norm": 2.8298912048339844, + "grad_norm_var": 0.20691636732209961, + "learning_rate": 0.0001, + "loss": 1.195, + "loss/crossentropy": 2.472844362258911, + "loss/hidden": 0.984375, + "loss/logits": 0.15367946028709412, + "loss/reg": 0.005691539496183395, + "step": 385 + }, + { + "epoch": 0.04825, + "grad_norm": 15.47062873840332, + "grad_norm_var": 10.256501481265339, + "learning_rate": 0.0001, + "loss": 1.4448, + "loss/crossentropy": 2.521524667739868, + "loss/hidden": 1.203125, + "loss/logits": 0.1847420334815979, + "loss/reg": 0.005689616315066814, + "step": 386 + }, + { + "epoch": 0.048375, + "grad_norm": 2.455294370651245, + "grad_norm_var": 10.271871141002237, + "learning_rate": 0.0001, + "loss": 1.1018, + "loss/crossentropy": 2.309390068054199, + "loss/hidden": 0.89453125, + "loss/logits": 0.15039557218551636, + "loss/reg": 0.005687698721885681, + "step": 387 + }, + { + "epoch": 0.0485, + "grad_norm": 3.23420786857605, + "grad_norm_var": 10.248744715041969, + "learning_rate": 0.0001, + "loss": 1.2879, + "loss/crossentropy": 2.4902544021606445, + "loss/hidden": 1.015625, + "loss/logits": 0.2154603898525238, + "loss/reg": 0.005685731768608093, + "step": 388 + }, + { + "epoch": 0.048625, + "grad_norm": 2.660858631134033, + "grad_norm_var": 10.221989434520331, + "learning_rate": 0.0001, + "loss": 1.025, + "loss/crossentropy": 2.31535267829895, + "loss/hidden": 0.8359375, + "loss/logits": 0.13224059343338013, + "loss/reg": 0.005683773662894964, + "step": 389 + }, + { + "epoch": 0.04875, + "grad_norm": 2.4209847450256348, + "grad_norm_var": 10.173641089965429, + "learning_rate": 0.0001, + "loss": 0.9974, + "loss/crossentropy": 2.1761093139648438, + "loss/hidden": 0.8125, + "loss/logits": 0.12805956602096558, + "loss/reg": 0.005681932438164949, + "step": 390 + }, + { + "epoch": 0.048875, + "grad_norm": 3.108008623123169, + "grad_norm_var": 10.09354551501582, + "learning_rate": 0.0001, + "loss": 0.979, + "loss/crossentropy": 2.721165657043457, + "loss/hidden": 0.78125, + "loss/logits": 0.14099523425102234, + "loss/reg": 0.005679869093000889, + "step": 391 + }, + { + "epoch": 0.049, + "grad_norm": 2.6531527042388916, + "grad_norm_var": 10.150022289467502, + "learning_rate": 0.0001, + "loss": 1.1723, + "loss/crossentropy": 2.518146514892578, + "loss/hidden": 0.9375, + "loss/logits": 0.17805764079093933, + "loss/reg": 0.005677856504917145, + "step": 392 + }, + { + "epoch": 0.049125, + "grad_norm": 2.2534499168395996, + "grad_norm_var": 10.160179916565673, + "learning_rate": 0.0001, + "loss": 1.1292, + "loss/crossentropy": 2.633385181427002, + "loss/hidden": 0.91015625, + "loss/logits": 0.16230204701423645, + "loss/reg": 0.005675735417753458, + "step": 393 + }, + { + "epoch": 0.04925, + "grad_norm": 2.9424333572387695, + "grad_norm_var": 10.185797665159741, + "learning_rate": 0.0001, + "loss": 1.4214, + "loss/crossentropy": 2.62923002243042, + "loss/hidden": 1.15625, + "loss/logits": 0.20838308334350586, + "loss/reg": 0.00567356962710619, + "step": 394 + }, + { + "epoch": 0.049375, + "grad_norm": 2.622178792953491, + "grad_norm_var": 10.20593051221178, + "learning_rate": 0.0001, + "loss": 0.9697, + "loss/crossentropy": 2.5544826984405518, + "loss/hidden": 0.78125, + "loss/logits": 0.13172510266304016, + "loss/reg": 0.005671407096087933, + "step": 395 + }, + { + "epoch": 0.0495, + "grad_norm": 2.635505199432373, + "grad_norm_var": 10.223008906743342, + "learning_rate": 0.0001, + "loss": 0.933, + "loss/crossentropy": 2.5959105491638184, + "loss/hidden": 0.75390625, + "loss/logits": 0.12239634245634079, + "loss/reg": 0.0056692929938435555, + "step": 396 + }, + { + "epoch": 0.049625, + "grad_norm": 2.6063406467437744, + "grad_norm_var": 10.229570355797922, + "learning_rate": 0.0001, + "loss": 1.0478, + "loss/crossentropy": 2.719916343688965, + "loss/hidden": 0.83984375, + "loss/logits": 0.15127256512641907, + "loss/reg": 0.0056673381477594376, + "step": 397 + }, + { + "epoch": 0.04975, + "grad_norm": 2.589893102645874, + "grad_norm_var": 10.216701025853546, + "learning_rate": 0.0001, + "loss": 1.1265, + "loss/crossentropy": 2.3730130195617676, + "loss/hidden": 0.90234375, + "loss/logits": 0.16749918460845947, + "loss/reg": 0.0056652189232409, + "step": 398 + }, + { + "epoch": 0.049875, + "grad_norm": 2.1503751277923584, + "grad_norm_var": 10.318666846324161, + "learning_rate": 0.0001, + "loss": 1.1685, + "loss/crossentropy": 2.2147741317749023, + "loss/hidden": 0.92578125, + "loss/logits": 0.1860472559928894, + "loss/reg": 0.005663097370415926, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 3.6945109367370605, + "grad_norm_var": 10.300567557859127, + "learning_rate": 0.0001, + "loss": 1.1272, + "loss/crossentropy": 2.4212143421173096, + "loss/hidden": 0.921875, + "loss/logits": 0.1487593650817871, + "loss/reg": 0.005661314353346825, + "step": 400 + }, + { + "epoch": 0.050125, + "grad_norm": 3.7444777488708496, + "grad_norm_var": 10.268632820538057, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.5369904041290283, + "loss/hidden": 0.90625, + "loss/logits": 0.15929211676120758, + "loss/reg": 0.005659462418407202, + "step": 401 + }, + { + "epoch": 0.05025, + "grad_norm": 5.121776580810547, + "grad_norm_var": 0.5518050614602837, + "learning_rate": 0.0001, + "loss": 1.4671, + "loss/crossentropy": 2.2371129989624023, + "loss/hidden": 1.2109375, + "loss/logits": 0.19960111379623413, + "loss/reg": 0.005657529458403587, + "step": 402 + }, + { + "epoch": 0.050375, + "grad_norm": 28.607572555541992, + "grad_norm_var": 41.63994308721723, + "learning_rate": 0.0001, + "loss": 1.1515, + "loss/crossentropy": 2.84385347366333, + "loss/hidden": 0.90234375, + "loss/logits": 0.19263674318790436, + "loss/reg": 0.005655454937368631, + "step": 403 + }, + { + "epoch": 0.0505, + "grad_norm": 2.38948655128479, + "grad_norm_var": 41.834466994087045, + "learning_rate": 0.0001, + "loss": 1.0929, + "loss/crossentropy": 2.2518088817596436, + "loss/hidden": 0.8984375, + "loss/logits": 0.13791221380233765, + "loss/reg": 0.005653408356010914, + "step": 404 + }, + { + "epoch": 0.050625, + "grad_norm": 6.887917518615723, + "grad_norm_var": 41.907583648135414, + "learning_rate": 0.0001, + "loss": 1.2522, + "loss/crossentropy": 2.8729405403137207, + "loss/hidden": 1.046875, + "loss/logits": 0.14880970120429993, + "loss/reg": 0.005651514511555433, + "step": 405 + }, + { + "epoch": 0.05075, + "grad_norm": 3.2420449256896973, + "grad_norm_var": 41.69182027548524, + "learning_rate": 0.0001, + "loss": 1.2031, + "loss/crossentropy": 2.598705530166626, + "loss/hidden": 0.98046875, + "loss/logits": 0.16617505252361298, + "loss/reg": 0.005649634636938572, + "step": 406 + }, + { + "epoch": 0.050875, + "grad_norm": 2.3294692039489746, + "grad_norm_var": 41.9082544413822, + "learning_rate": 0.0001, + "loss": 1.0316, + "loss/crossentropy": 2.7743589878082275, + "loss/hidden": 0.84375, + "loss/logits": 0.13134868443012238, + "loss/reg": 0.005647764541208744, + "step": 407 + }, + { + "epoch": 0.051, + "grad_norm": 2.3849406242370605, + "grad_norm_var": 41.988788990047645, + "learning_rate": 0.0001, + "loss": 1.1579, + "loss/crossentropy": 2.2934722900390625, + "loss/hidden": 0.9375, + "loss/logits": 0.16397064924240112, + "loss/reg": 0.00564591446891427, + "step": 408 + }, + { + "epoch": 0.051125, + "grad_norm": 2.616523504257202, + "grad_norm_var": 41.875558070811756, + "learning_rate": 0.0001, + "loss": 0.9281, + "loss/crossentropy": 2.617312431335449, + "loss/hidden": 0.7734375, + "loss/logits": 0.09819567203521729, + "loss/reg": 0.005644225515425205, + "step": 409 + }, + { + "epoch": 0.05125, + "grad_norm": 2.302281141281128, + "grad_norm_var": 42.058469053043055, + "learning_rate": 0.0001, + "loss": 1.0583, + "loss/crossentropy": 2.8029561042785645, + "loss/hidden": 0.859375, + "loss/logits": 0.14253735542297363, + "loss/reg": 0.005642317235469818, + "step": 410 + }, + { + "epoch": 0.051375, + "grad_norm": 2.1521739959716797, + "grad_norm_var": 42.20532780726832, + "learning_rate": 0.0001, + "loss": 0.996, + "loss/crossentropy": 2.5798304080963135, + "loss/hidden": 0.80078125, + "loss/logits": 0.13881272077560425, + "loss/reg": 0.005640234332531691, + "step": 411 + }, + { + "epoch": 0.0515, + "grad_norm": 4.3292155265808105, + "grad_norm_var": 41.914794683811124, + "learning_rate": 0.0001, + "loss": 1.3517, + "loss/crossentropy": 2.4219868183135986, + "loss/hidden": 1.0390625, + "loss/logits": 0.2562292516231537, + "loss/reg": 0.005638125352561474, + "step": 412 + }, + { + "epoch": 0.051625, + "grad_norm": 19.01975440979004, + "grad_norm_var": 53.903843358167165, + "learning_rate": 0.0001, + "loss": 1.3283, + "loss/crossentropy": 2.2926077842712402, + "loss/hidden": 1.078125, + "loss/logits": 0.19380658864974976, + "loss/reg": 0.005636140704154968, + "step": 413 + }, + { + "epoch": 0.05175, + "grad_norm": 2.859027862548828, + "grad_norm_var": 53.791467006877085, + "learning_rate": 0.0001, + "loss": 1.1115, + "loss/crossentropy": 2.429117441177368, + "loss/hidden": 0.90234375, + "loss/logits": 0.1528070569038391, + "loss/reg": 0.005634027067571878, + "step": 414 + }, + { + "epoch": 0.051875, + "grad_norm": 2.385204792022705, + "grad_norm_var": 53.67862289213027, + "learning_rate": 0.0001, + "loss": 1.0186, + "loss/crossentropy": 2.710325002670288, + "loss/hidden": 0.81640625, + "loss/logits": 0.1458669900894165, + "loss/reg": 0.005631967913359404, + "step": 415 + }, + { + "epoch": 0.052, + "grad_norm": 2.3011677265167236, + "grad_norm_var": 54.20582073402194, + "learning_rate": 0.0001, + "loss": 1.0843, + "loss/crossentropy": 2.485734701156616, + "loss/hidden": 0.87109375, + "loss/logits": 0.1569264829158783, + "loss/reg": 0.0056300037540495396, + "step": 416 + }, + { + "epoch": 0.052125, + "grad_norm": 2.7714357376098633, + "grad_norm_var": 54.53064815195892, + "learning_rate": 0.0001, + "loss": 1.0741, + "loss/crossentropy": 2.6249403953552246, + "loss/hidden": 0.85546875, + "loss/logits": 0.1623522937297821, + "loss/reg": 0.0056281075812876225, + "step": 417 + }, + { + "epoch": 0.05225, + "grad_norm": 2.376473903656006, + "grad_norm_var": 55.22478277620113, + "learning_rate": 0.0001, + "loss": 1.2116, + "loss/crossentropy": 2.5150105953216553, + "loss/hidden": 0.95703125, + "loss/logits": 0.19830524921417236, + "loss/reg": 0.005626222584396601, + "step": 418 + }, + { + "epoch": 0.052375, + "grad_norm": 2.6247470378875732, + "grad_norm_var": 17.572360223815615, + "learning_rate": 0.0001, + "loss": 1.172, + "loss/crossentropy": 2.7201685905456543, + "loss/hidden": 0.9453125, + "loss/logits": 0.17042091488838196, + "loss/reg": 0.005624283570796251, + "step": 419 + }, + { + "epoch": 0.0525, + "grad_norm": 49.02815628051758, + "grad_norm_var": 143.90483482694842, + "learning_rate": 0.0001, + "loss": 5.3824, + "loss/crossentropy": 2.692047357559204, + "loss/hidden": 4.84375, + "loss/logits": 0.48245739936828613, + "loss/reg": 0.005622203927487135, + "step": 420 + }, + { + "epoch": 0.052625, + "grad_norm": 2.6867082118988037, + "grad_norm_var": 144.9870986829453, + "learning_rate": 0.0001, + "loss": 1.2507, + "loss/crossentropy": 2.404517412185669, + "loss/hidden": 1.0, + "loss/logits": 0.19445687532424927, + "loss/reg": 0.005620268173515797, + "step": 421 + }, + { + "epoch": 0.05275, + "grad_norm": 4.397704124450684, + "grad_norm_var": 144.55498651709914, + "learning_rate": 0.0001, + "loss": 1.4596, + "loss/crossentropy": 2.1510226726531982, + "loss/hidden": 1.2109375, + "loss/logits": 0.19246640801429749, + "loss/reg": 0.005618296563625336, + "step": 422 + }, + { + "epoch": 0.052875, + "grad_norm": 4.239573955535889, + "grad_norm_var": 143.68003611616095, + "learning_rate": 0.0001, + "loss": 1.3275, + "loss/crossentropy": 2.686849355697632, + "loss/hidden": 1.09375, + "loss/logits": 0.17758557200431824, + "loss/reg": 0.005616751033812761, + "step": 423 + }, + { + "epoch": 0.053, + "grad_norm": 2.749202251434326, + "grad_norm_var": 143.4748837350726, + "learning_rate": 0.0001, + "loss": 1.0827, + "loss/crossentropy": 2.8104846477508545, + "loss/hidden": 0.8828125, + "loss/logits": 0.1437493860721588, + "loss/reg": 0.005615332629531622, + "step": 424 + }, + { + "epoch": 0.053125, + "grad_norm": 2.459291458129883, + "grad_norm_var": 143.5641839570371, + "learning_rate": 0.0001, + "loss": 1.0548, + "loss/crossentropy": 2.5806379318237305, + "loss/hidden": 0.8515625, + "loss/logits": 0.14714661240577698, + "loss/reg": 0.005613364279270172, + "step": 425 + }, + { + "epoch": 0.05325, + "grad_norm": 2.294171094894409, + "grad_norm_var": 143.56904366210821, + "learning_rate": 0.0001, + "loss": 1.1486, + "loss/crossentropy": 2.6366002559661865, + "loss/hidden": 0.90234375, + "loss/logits": 0.19014191627502441, + "loss/reg": 0.005611394997686148, + "step": 426 + }, + { + "epoch": 0.053375, + "grad_norm": 2.2255382537841797, + "grad_norm_var": 143.52399251007708, + "learning_rate": 0.0001, + "loss": 1.0752, + "loss/crossentropy": 2.542306661605835, + "loss/hidden": 0.875, + "loss/logits": 0.14408408105373383, + "loss/reg": 0.005609368905425072, + "step": 427 + }, + { + "epoch": 0.0535, + "grad_norm": 3.5708723068237305, + "grad_norm_var": 143.80942972780392, + "learning_rate": 0.0001, + "loss": 1.0863, + "loss/crossentropy": 2.2636356353759766, + "loss/hidden": 0.8828125, + "loss/logits": 0.14744916558265686, + "loss/reg": 0.005607361439615488, + "step": 428 + }, + { + "epoch": 0.053625, + "grad_norm": 2.9189610481262207, + "grad_norm_var": 133.66980873374825, + "learning_rate": 0.0001, + "loss": 0.9895, + "loss/crossentropy": 2.7651426792144775, + "loss/hidden": 0.78515625, + "loss/logits": 0.1482805609703064, + "loss/reg": 0.005605428479611874, + "step": 429 + }, + { + "epoch": 0.05375, + "grad_norm": 3.2735564708709717, + "grad_norm_var": 133.5211490137515, + "learning_rate": 0.0001, + "loss": 1.2363, + "loss/crossentropy": 2.248082399368286, + "loss/hidden": 0.98046875, + "loss/logits": 0.19977417588233948, + "loss/reg": 0.0056034415028989315, + "step": 430 + }, + { + "epoch": 0.053875, + "grad_norm": 3.5670769214630127, + "grad_norm_var": 133.0752341056661, + "learning_rate": 0.0001, + "loss": 1.2766, + "loss/crossentropy": 2.500338554382324, + "loss/hidden": 1.0234375, + "loss/logits": 0.19719059765338898, + "loss/reg": 0.005601502023637295, + "step": 431 + }, + { + "epoch": 0.054, + "grad_norm": 2.2697787284851074, + "grad_norm_var": 133.0901180807591, + "learning_rate": 0.0001, + "loss": 0.9931, + "loss/crossentropy": 2.6418793201446533, + "loss/hidden": 0.7890625, + "loss/logits": 0.14799568057060242, + "loss/reg": 0.005599519703537226, + "step": 432 + }, + { + "epoch": 0.054125, + "grad_norm": 3.220383405685425, + "grad_norm_var": 132.91898234062202, + "learning_rate": 0.0001, + "loss": 1.2515, + "loss/crossentropy": 2.5073025226593018, + "loss/hidden": 1.0390625, + "loss/logits": 0.15643876791000366, + "loss/reg": 0.005597477313131094, + "step": 433 + }, + { + "epoch": 0.05425, + "grad_norm": 3.2845206260681152, + "grad_norm_var": 132.5476800488924, + "learning_rate": 0.0001, + "loss": 1.1441, + "loss/crossentropy": 2.509037971496582, + "loss/hidden": 0.9296875, + "loss/logits": 0.15849418938159943, + "loss/reg": 0.005595567170530558, + "step": 434 + }, + { + "epoch": 0.054375, + "grad_norm": 2.254239320755005, + "grad_norm_var": 132.71932731242507, + "learning_rate": 0.0001, + "loss": 0.9815, + "loss/crossentropy": 2.567584991455078, + "loss/hidden": 0.78125, + "loss/logits": 0.14433184266090393, + "loss/reg": 0.005593593697994947, + "step": 435 + }, + { + "epoch": 0.0545, + "grad_norm": 3.2273480892181396, + "grad_norm_var": 0.4676980414191933, + "learning_rate": 0.0001, + "loss": 1.1645, + "loss/crossentropy": 2.3639349937438965, + "loss/hidden": 0.94921875, + "loss/logits": 0.15934088826179504, + "loss/reg": 0.0055916691198945045, + "step": 436 + }, + { + "epoch": 0.054625, + "grad_norm": 2.6044058799743652, + "grad_norm_var": 0.47199755801328347, + "learning_rate": 0.0001, + "loss": 1.1033, + "loss/crossentropy": 2.539247989654541, + "loss/hidden": 0.8984375, + "loss/logits": 0.14898554980754852, + "loss/reg": 0.005589775741100311, + "step": 437 + }, + { + "epoch": 0.05475, + "grad_norm": 2.9674391746520996, + "grad_norm_var": 0.3399405404704983, + "learning_rate": 0.0001, + "loss": 1.252, + "loss/crossentropy": 2.5642499923706055, + "loss/hidden": 0.9921875, + "loss/logits": 0.20391228795051575, + "loss/reg": 0.005587900057435036, + "step": 438 + }, + { + "epoch": 0.054875, + "grad_norm": 2.4164047241210938, + "grad_norm_var": 0.23308679379454797, + "learning_rate": 0.0001, + "loss": 1.1939, + "loss/crossentropy": 2.3462696075439453, + "loss/hidden": 0.93359375, + "loss/logits": 0.2044137418270111, + "loss/reg": 0.005585688166320324, + "step": 439 + }, + { + "epoch": 0.055, + "grad_norm": 2.7590599060058594, + "grad_norm_var": 0.2329847653181711, + "learning_rate": 0.0001, + "loss": 1.0377, + "loss/crossentropy": 2.775485038757324, + "loss/hidden": 0.84375, + "loss/logits": 0.13808496296405792, + "loss/reg": 0.0055835009552538395, + "step": 440 + }, + { + "epoch": 0.055125, + "grad_norm": 2.7251267433166504, + "grad_norm_var": 0.224188675724659, + "learning_rate": 0.0001, + "loss": 1.0001, + "loss/crossentropy": 2.4934420585632324, + "loss/hidden": 0.80859375, + "loss/logits": 0.1357189267873764, + "loss/reg": 0.005581483710557222, + "step": 441 + }, + { + "epoch": 0.05525, + "grad_norm": 2.4774584770202637, + "grad_norm_var": 0.21273704839308963, + "learning_rate": 0.0001, + "loss": 1.2166, + "loss/crossentropy": 2.426271438598633, + "loss/hidden": 0.95703125, + "loss/logits": 0.20375394821166992, + "loss/reg": 0.0055792308412492275, + "step": 442 + }, + { + "epoch": 0.055375, + "grad_norm": 3.2236833572387695, + "grad_norm_var": 0.1905493662305197, + "learning_rate": 0.0001, + "loss": 1.1724, + "loss/crossentropy": 2.9799797534942627, + "loss/hidden": 0.92578125, + "loss/logits": 0.19083930552005768, + "loss/reg": 0.005577271804213524, + "step": 443 + }, + { + "epoch": 0.0555, + "grad_norm": 2.5997183322906494, + "grad_norm_var": 0.16554225723918894, + "learning_rate": 0.0001, + "loss": 1.126, + "loss/crossentropy": 2.2098257541656494, + "loss/hidden": 0.92578125, + "loss/logits": 0.14447355270385742, + "loss/reg": 0.005575183313339949, + "step": 444 + }, + { + "epoch": 0.055625, + "grad_norm": 2.5179152488708496, + "grad_norm_var": 0.1725392629592297, + "learning_rate": 0.0001, + "loss": 1.2018, + "loss/crossentropy": 2.0029213428497314, + "loss/hidden": 0.98046875, + "loss/logits": 0.1655960977077484, + "loss/reg": 0.005572900176048279, + "step": 445 + }, + { + "epoch": 0.05575, + "grad_norm": 2.5075204372406006, + "grad_norm_var": 0.16460110044899826, + "learning_rate": 0.0001, + "loss": 1.0614, + "loss/crossentropy": 2.3672924041748047, + "loss/hidden": 0.85546875, + "loss/logits": 0.15021467208862305, + "loss/reg": 0.005570439621806145, + "step": 446 + }, + { + "epoch": 0.055875, + "grad_norm": 2.441183567047119, + "grad_norm_var": 0.12700610259855102, + "learning_rate": 0.0001, + "loss": 0.9323, + "loss/crossentropy": 2.311056137084961, + "loss/hidden": 0.7578125, + "loss/logits": 0.11881721019744873, + "loss/reg": 0.00556844100356102, + "step": 447 + }, + { + "epoch": 0.056, + "grad_norm": 2.6724319458007812, + "grad_norm_var": 0.11304803744365562, + "learning_rate": 0.0001, + "loss": 1.0937, + "loss/crossentropy": 2.562101364135742, + "loss/hidden": 0.8671875, + "loss/logits": 0.1708334982395172, + "loss/reg": 0.005566492676734924, + "step": 448 + }, + { + "epoch": 0.056125, + "grad_norm": 2.196300506591797, + "grad_norm_var": 0.11350312697665288, + "learning_rate": 0.0001, + "loss": 0.9882, + "loss/crossentropy": 2.4227116107940674, + "loss/hidden": 0.80078125, + "loss/logits": 0.13182450830936432, + "loss/reg": 0.00556437112390995, + "step": 449 + }, + { + "epoch": 0.05625, + "grad_norm": 2.912667989730835, + "grad_norm_var": 0.0921566818687341, + "learning_rate": 0.0001, + "loss": 1.3721, + "loss/crossentropy": 1.9439491033554077, + "loss/hidden": 1.109375, + "loss/logits": 0.2070913016796112, + "loss/reg": 0.0055623650550842285, + "step": 450 + }, + { + "epoch": 0.056375, + "grad_norm": 2.011991500854492, + "grad_norm_var": 0.10881512213368959, + "learning_rate": 0.0001, + "loss": 1.0172, + "loss/crossentropy": 2.498812675476074, + "loss/hidden": 0.81640625, + "loss/logits": 0.14521706104278564, + "loss/reg": 0.005560221150517464, + "step": 451 + }, + { + "epoch": 0.0565, + "grad_norm": 2.2709267139434814, + "grad_norm_var": 0.0912508163184422, + "learning_rate": 0.0001, + "loss": 1.1384, + "loss/crossentropy": 2.320579767227173, + "loss/hidden": 0.9140625, + "loss/logits": 0.16879746317863464, + "loss/reg": 0.005558326840400696, + "step": 452 + }, + { + "epoch": 0.056625, + "grad_norm": 2.954127788543701, + "grad_norm_var": 0.09996231296479816, + "learning_rate": 0.0001, + "loss": 1.2415, + "loss/crossentropy": 2.483376979827881, + "loss/hidden": 0.99609375, + "loss/logits": 0.18988527357578278, + "loss/reg": 0.005556488875299692, + "step": 453 + }, + { + "epoch": 0.05675, + "grad_norm": 2.442729949951172, + "grad_norm_var": 0.0916992305907788, + "learning_rate": 0.0001, + "loss": 1.0533, + "loss/crossentropy": 2.414472818374634, + "loss/hidden": 0.84765625, + "loss/logits": 0.1501239389181137, + "loss/reg": 0.005554646719247103, + "step": 454 + }, + { + "epoch": 0.056875, + "grad_norm": 2.598292589187622, + "grad_norm_var": 0.09002796513685567, + "learning_rate": 0.0001, + "loss": 0.9797, + "loss/crossentropy": 2.8175811767578125, + "loss/hidden": 0.78515625, + "loss/logits": 0.13899990916252136, + "loss/reg": 0.005552831571549177, + "step": 455 + }, + { + "epoch": 0.057, + "grad_norm": 2.284618616104126, + "grad_norm_var": 0.09289234998963139, + "learning_rate": 0.0001, + "loss": 1.1767, + "loss/crossentropy": 2.5178730487823486, + "loss/hidden": 0.953125, + "loss/logits": 0.1680239588022232, + "loss/reg": 0.005550856236368418, + "step": 456 + }, + { + "epoch": 0.057125, + "grad_norm": 2.9749691486358643, + "grad_norm_var": 0.10255115779464533, + "learning_rate": 0.0001, + "loss": 1.146, + "loss/crossentropy": 2.6965036392211914, + "loss/hidden": 0.89453125, + "loss/logits": 0.19602364301681519, + "loss/reg": 0.005548745859414339, + "step": 457 + }, + { + "epoch": 0.05725, + "grad_norm": 2.4419991970062256, + "grad_norm_var": 0.10305738190390912, + "learning_rate": 0.0001, + "loss": 1.0782, + "loss/crossentropy": 2.507200241088867, + "loss/hidden": 0.87890625, + "loss/logits": 0.14385411143302917, + "loss/reg": 0.005546758882701397, + "step": 458 + }, + { + "epoch": 0.057375, + "grad_norm": 2.41898250579834, + "grad_norm_var": 0.07293072023693033, + "learning_rate": 0.0001, + "loss": 1.0665, + "loss/crossentropy": 2.4068796634674072, + "loss/hidden": 0.87109375, + "loss/logits": 0.13996180891990662, + "loss/reg": 0.005544655025005341, + "step": 459 + }, + { + "epoch": 0.0575, + "grad_norm": 3.584895372390747, + "grad_norm_var": 0.1446675774892469, + "learning_rate": 0.0001, + "loss": 1.419, + "loss/crossentropy": 2.4029970169067383, + "loss/hidden": 1.15625, + "loss/logits": 0.20734865963459015, + "loss/reg": 0.005542535334825516, + "step": 460 + }, + { + "epoch": 0.057625, + "grad_norm": 2.5190699100494385, + "grad_norm_var": 0.14465856873481447, + "learning_rate": 0.0001, + "loss": 1.0687, + "loss/crossentropy": 2.632817268371582, + "loss/hidden": 0.84375, + "loss/logits": 0.16959112882614136, + "loss/reg": 0.005540382582694292, + "step": 461 + }, + { + "epoch": 0.05775, + "grad_norm": 3.293412446975708, + "grad_norm_var": 0.1759751166057581, + "learning_rate": 0.0001, + "loss": 1.2079, + "loss/crossentropy": 1.8526346683502197, + "loss/hidden": 0.984375, + "loss/logits": 0.16817334294319153, + "loss/reg": 0.005538390018045902, + "step": 462 + }, + { + "epoch": 0.057875, + "grad_norm": 2.090097665786743, + "grad_norm_var": 0.1923380804679141, + "learning_rate": 0.0001, + "loss": 1.0403, + "loss/crossentropy": 2.7256767749786377, + "loss/hidden": 0.83984375, + "loss/logits": 0.14509689807891846, + "loss/reg": 0.005536381620913744, + "step": 463 + }, + { + "epoch": 0.058, + "grad_norm": 2.367372751235962, + "grad_norm_var": 0.19537989350592183, + "learning_rate": 0.0001, + "loss": 0.967, + "loss/crossentropy": 2.440683603286743, + "loss/hidden": 0.78125, + "loss/logits": 0.13041679561138153, + "loss/reg": 0.005534291733056307, + "step": 464 + }, + { + "epoch": 0.058125, + "grad_norm": 2.5434730052948, + "grad_norm_var": 0.18491306851457617, + "learning_rate": 0.0001, + "loss": 1.1396, + "loss/crossentropy": 2.811406373977661, + "loss/hidden": 0.91015625, + "loss/logits": 0.1740744560956955, + "loss/reg": 0.005532294511795044, + "step": 465 + }, + { + "epoch": 0.05825, + "grad_norm": 2.613758087158203, + "grad_norm_var": 0.17830906169392974, + "learning_rate": 0.0001, + "loss": 1.0313, + "loss/crossentropy": 2.5138356685638428, + "loss/hidden": 0.828125, + "loss/logits": 0.1479034125804901, + "loss/reg": 0.005530340131372213, + "step": 466 + }, + { + "epoch": 0.058375, + "grad_norm": 3.6053991317749023, + "grad_norm_var": 0.21458171164135606, + "learning_rate": 0.0001, + "loss": 1.2109, + "loss/crossentropy": 1.9949983358383179, + "loss/hidden": 1.0, + "loss/logits": 0.155661940574646, + "loss/reg": 0.0055284383706748486, + "step": 467 + }, + { + "epoch": 0.0585, + "grad_norm": 2.2574644088745117, + "grad_norm_var": 0.21534123971961966, + "learning_rate": 0.0001, + "loss": 1.08, + "loss/crossentropy": 2.514662981033325, + "loss/hidden": 0.859375, + "loss/logits": 0.16538314521312714, + "loss/reg": 0.005526562221348286, + "step": 468 + }, + { + "epoch": 0.058625, + "grad_norm": 2.2614095211029053, + "grad_norm_var": 0.2206521084247221, + "learning_rate": 0.0001, + "loss": 1.2297, + "loss/crossentropy": 2.4910507202148438, + "loss/hidden": 0.98046875, + "loss/logits": 0.19400066137313843, + "loss/reg": 0.005524714011698961, + "step": 469 + }, + { + "epoch": 0.05875, + "grad_norm": 3.083524465560913, + "grad_norm_var": 0.22915168035201153, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.5548853874206543, + "loss/hidden": 0.92578125, + "loss/logits": 0.19151920080184937, + "loss/reg": 0.005522689316421747, + "step": 470 + }, + { + "epoch": 0.058875, + "grad_norm": 2.6530709266662598, + "grad_norm_var": 0.2287156357176549, + "learning_rate": 0.0001, + "loss": 0.9819, + "loss/crossentropy": 2.5769848823547363, + "loss/hidden": 0.79296875, + "loss/logits": 0.1337730437517166, + "loss/reg": 0.00552078802138567, + "step": 471 + }, + { + "epoch": 0.059, + "grad_norm": 2.857489585876465, + "grad_norm_var": 0.21848469951039154, + "learning_rate": 0.0001, + "loss": 1.2335, + "loss/crossentropy": 2.6933629512786865, + "loss/hidden": 0.98828125, + "loss/logits": 0.19003306329250336, + "loss/reg": 0.005518974736332893, + "step": 472 + }, + { + "epoch": 0.059125, + "grad_norm": 1.960106372833252, + "grad_norm_var": 0.24874750636482734, + "learning_rate": 0.0001, + "loss": 0.9776, + "loss/crossentropy": 2.534855365753174, + "loss/hidden": 0.7890625, + "loss/logits": 0.13338381052017212, + "loss/reg": 0.005517229437828064, + "step": 473 + }, + { + "epoch": 0.05925, + "grad_norm": 2.787822961807251, + "grad_norm_var": 0.24619457779295406, + "learning_rate": 0.0001, + "loss": 1.0858, + "loss/crossentropy": 2.396390438079834, + "loss/hidden": 0.88671875, + "loss/logits": 0.14397624135017395, + "loss/reg": 0.005515479948371649, + "step": 474 + }, + { + "epoch": 0.059375, + "grad_norm": 2.3396122455596924, + "grad_norm_var": 0.24936205040752385, + "learning_rate": 0.0001, + "loss": 1.0392, + "loss/crossentropy": 2.6306259632110596, + "loss/hidden": 0.83984375, + "loss/logits": 0.14426180720329285, + "loss/reg": 0.005513759795576334, + "step": 475 + }, + { + "epoch": 0.0595, + "grad_norm": 2.367551803588867, + "grad_norm_var": 0.19447740210993794, + "learning_rate": 0.0001, + "loss": 1.1071, + "loss/crossentropy": 2.342672348022461, + "loss/hidden": 0.890625, + "loss/logits": 0.16136375069618225, + "loss/reg": 0.0055120959877967834, + "step": 476 + }, + { + "epoch": 0.059625, + "grad_norm": 2.3029873371124268, + "grad_norm_var": 0.19972845357339655, + "learning_rate": 0.0001, + "loss": 0.9785, + "loss/crossentropy": 2.725276231765747, + "loss/hidden": 0.796875, + "loss/logits": 0.12647491693496704, + "loss/reg": 0.0055101178586483, + "step": 477 + }, + { + "epoch": 0.05975, + "grad_norm": 2.3109138011932373, + "grad_norm_var": 0.1674590503375268, + "learning_rate": 0.0001, + "loss": 1.012, + "loss/crossentropy": 2.6665799617767334, + "loss/hidden": 0.81640625, + "loss/logits": 0.14054208993911743, + "loss/reg": 0.005508116912096739, + "step": 478 + }, + { + "epoch": 0.059875, + "grad_norm": 2.8778023719787598, + "grad_norm_var": 0.1605488706137739, + "learning_rate": 0.0001, + "loss": 1.0028, + "loss/crossentropy": 2.599010705947876, + "loss/hidden": 0.80078125, + "loss/logits": 0.14697444438934326, + "loss/reg": 0.0055063748732209206, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 2.7762978076934814, + "grad_norm_var": 0.15971446982347573, + "learning_rate": 0.0001, + "loss": 1.1492, + "loss/crossentropy": 2.6345436573028564, + "loss/hidden": 0.9296875, + "loss/logits": 0.1645045280456543, + "loss/reg": 0.005504653323441744, + "step": 480 + }, + { + "epoch": 0.060125, + "grad_norm": 3.0745112895965576, + "grad_norm_var": 0.1733429982183973, + "learning_rate": 0.0001, + "loss": 1.2914, + "loss/crossentropy": 2.1021008491516113, + "loss/hidden": 1.0546875, + "loss/logits": 0.18168240785598755, + "loss/reg": 0.005502650048583746, + "step": 481 + }, + { + "epoch": 0.06025, + "grad_norm": 2.5635828971862793, + "grad_norm_var": 0.17362979402171655, + "learning_rate": 0.0001, + "loss": 1.1746, + "loss/crossentropy": 2.599754810333252, + "loss/hidden": 0.9453125, + "loss/logits": 0.1743006557226181, + "loss/reg": 0.005500909872353077, + "step": 482 + }, + { + "epoch": 0.060375, + "grad_norm": 2.982170343399048, + "grad_norm_var": 0.11685041441696337, + "learning_rate": 0.0001, + "loss": 1.084, + "loss/crossentropy": 2.780411958694458, + "loss/hidden": 0.875, + "loss/logits": 0.15399503707885742, + "loss/reg": 0.005499421618878841, + "step": 483 + }, + { + "epoch": 0.0605, + "grad_norm": 6.475743770599365, + "grad_norm_var": 1.0413639393420129, + "learning_rate": 0.0001, + "loss": 2.1473, + "loss/crossentropy": 2.3867931365966797, + "loss/hidden": 1.703125, + "loss/logits": 0.38922837376594543, + "loss/reg": 0.005497433710843325, + "step": 484 + }, + { + "epoch": 0.060625, + "grad_norm": 2.522434711456299, + "grad_norm_var": 1.024975132918582, + "learning_rate": 0.0001, + "loss": 1.0915, + "loss/crossentropy": 2.741684675216675, + "loss/hidden": 0.88671875, + "loss/logits": 0.14987404644489288, + "loss/reg": 0.0054954588413238525, + "step": 485 + }, + { + "epoch": 0.06075, + "grad_norm": 2.6852359771728516, + "grad_norm_var": 1.0236023483547378, + "learning_rate": 0.0001, + "loss": 1.0905, + "loss/crossentropy": 2.2552525997161865, + "loss/hidden": 0.8984375, + "loss/logits": 0.13711076974868774, + "loss/reg": 0.005493887234479189, + "step": 486 + }, + { + "epoch": 0.060875, + "grad_norm": 6.048346996307373, + "grad_norm_var": 1.65671866064532, + "learning_rate": 0.0001, + "loss": 1.4058, + "loss/crossentropy": 3.1526873111724854, + "loss/hidden": 1.0625, + "loss/logits": 0.2884060740470886, + "loss/reg": 0.005492268595844507, + "step": 487 + }, + { + "epoch": 0.061, + "grad_norm": 5.24729061126709, + "grad_norm_var": 1.9496829900519608, + "learning_rate": 0.0001, + "loss": 1.5487, + "loss/crossentropy": 2.391798496246338, + "loss/hidden": 1.234375, + "loss/logits": 0.2594112157821655, + "loss/reg": 0.0054903156124055386, + "step": 488 + }, + { + "epoch": 0.061125, + "grad_norm": 3.4879932403564453, + "grad_norm_var": 1.8414378354073275, + "learning_rate": 0.0001, + "loss": 1.2408, + "loss/crossentropy": 2.3853161334991455, + "loss/hidden": 1.015625, + "loss/logits": 0.1702655553817749, + "loss/reg": 0.005488729570060968, + "step": 489 + }, + { + "epoch": 0.06125, + "grad_norm": 2.416243076324463, + "grad_norm_var": 1.875598350696971, + "learning_rate": 0.0001, + "loss": 1.0646, + "loss/crossentropy": 2.310605049133301, + "loss/hidden": 0.86328125, + "loss/logits": 0.146418958902359, + "loss/reg": 0.005487216170877218, + "step": 490 + }, + { + "epoch": 0.061375, + "grad_norm": 2.9619152545928955, + "grad_norm_var": 1.8217813283025472, + "learning_rate": 0.0001, + "loss": 1.2577, + "loss/crossentropy": 2.3735132217407227, + "loss/hidden": 1.015625, + "loss/logits": 0.18721503019332886, + "loss/reg": 0.005485245026648045, + "step": 491 + }, + { + "epoch": 0.0615, + "grad_norm": 2.9602112770080566, + "grad_norm_var": 1.7685642295810833, + "learning_rate": 0.0001, + "loss": 1.1274, + "loss/crossentropy": 2.6420083045959473, + "loss/hidden": 0.90234375, + "loss/logits": 0.17025524377822876, + "loss/reg": 0.005483296699821949, + "step": 492 + }, + { + "epoch": 0.061625, + "grad_norm": 2.5772223472595215, + "grad_norm_var": 1.7347667738241757, + "learning_rate": 0.0001, + "loss": 1.1004, + "loss/crossentropy": 2.4166319370269775, + "loss/hidden": 0.890625, + "loss/logits": 0.15491390228271484, + "loss/reg": 0.005481342785060406, + "step": 493 + }, + { + "epoch": 0.06175, + "grad_norm": 2.6494603157043457, + "grad_norm_var": 1.693988292922673, + "learning_rate": 0.0001, + "loss": 1.0762, + "loss/crossentropy": 2.7021005153656006, + "loss/hidden": 0.8671875, + "loss/logits": 0.1542307734489441, + "loss/reg": 0.005479689687490463, + "step": 494 + }, + { + "epoch": 0.061875, + "grad_norm": 2.065351963043213, + "grad_norm_var": 1.7911776893626628, + "learning_rate": 0.0001, + "loss": 1.015, + "loss/crossentropy": 2.4842755794525146, + "loss/hidden": 0.8203125, + "loss/logits": 0.13995476067066193, + "loss/reg": 0.005478002596646547, + "step": 495 + }, + { + "epoch": 0.062, + "grad_norm": 2.650660753250122, + "grad_norm_var": 1.8016636980513454, + "learning_rate": 0.0001, + "loss": 1.1699, + "loss/crossentropy": 2.3899097442626953, + "loss/hidden": 0.94921875, + "loss/logits": 0.16591498255729675, + "loss/reg": 0.005476430524140596, + "step": 496 + }, + { + "epoch": 0.062125, + "grad_norm": 3.412050724029541, + "grad_norm_var": 1.7970375838694677, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.4459383487701416, + "loss/hidden": 0.94140625, + "loss/logits": 0.20212361216545105, + "loss/reg": 0.005474465899169445, + "step": 497 + }, + { + "epoch": 0.06225, + "grad_norm": 2.7389674186706543, + "grad_norm_var": 1.7804152177025587, + "learning_rate": 0.0001, + "loss": 1.1076, + "loss/crossentropy": 2.6794888973236084, + "loss/hidden": 0.90625, + "loss/logits": 0.1465749740600586, + "loss/reg": 0.005472847726196051, + "step": 498 + }, + { + "epoch": 0.062375, + "grad_norm": 20.56003761291504, + "grad_norm_var": 20.18846043733062, + "learning_rate": 0.0001, + "loss": 1.0568, + "loss/crossentropy": 2.527268409729004, + "loss/hidden": 0.859375, + "loss/logits": 0.14275437593460083, + "loss/reg": 0.005471326876431704, + "step": 499 + }, + { + "epoch": 0.0625, + "grad_norm": 2.9909119606018066, + "grad_norm_var": 20.013739807194945, + "learning_rate": 0.0001, + "loss": 1.0002, + "loss/crossentropy": 2.311053991317749, + "loss/hidden": 0.80859375, + "loss/logits": 0.13688521087169647, + "loss/reg": 0.005469587165862322, + "step": 500 + }, + { + "epoch": 0.062625, + "grad_norm": 2.6013972759246826, + "grad_norm_var": 19.995957990662593, + "learning_rate": 0.0001, + "loss": 1.1804, + "loss/crossentropy": 2.4651801586151123, + "loss/hidden": 0.94921875, + "loss/logits": 0.17647257447242737, + "loss/reg": 0.005467594135552645, + "step": 501 + }, + { + "epoch": 0.06275, + "grad_norm": 3.800658702850342, + "grad_norm_var": 19.840506630980723, + "learning_rate": 0.0001, + "loss": 0.9929, + "loss/crossentropy": 2.3829755783081055, + "loss/hidden": 0.8125, + "loss/logits": 0.1257120817899704, + "loss/reg": 0.005465896334499121, + "step": 502 + }, + { + "epoch": 0.062875, + "grad_norm": 2.5520195960998535, + "grad_norm_var": 19.800229612103674, + "learning_rate": 0.0001, + "loss": 0.9335, + "loss/crossentropy": 2.318957567214966, + "loss/hidden": 0.73828125, + "loss/logits": 0.14062564074993134, + "loss/reg": 0.005463926587253809, + "step": 503 + }, + { + "epoch": 0.063, + "grad_norm": 2.2392287254333496, + "grad_norm_var": 19.907422060176042, + "learning_rate": 0.0001, + "loss": 0.9748, + "loss/crossentropy": 2.6403889656066895, + "loss/hidden": 0.79296875, + "loss/logits": 0.12718063592910767, + "loss/reg": 0.005461950786411762, + "step": 504 + }, + { + "epoch": 0.063125, + "grad_norm": 59.44195556640625, + "grad_norm_var": 212.38825001063205, + "learning_rate": 0.0001, + "loss": 1.0377, + "loss/crossentropy": 2.3318405151367188, + "loss/hidden": 0.859375, + "loss/logits": 0.12373203039169312, + "loss/reg": 0.005460206884890795, + "step": 505 + }, + { + "epoch": 0.06325, + "grad_norm": 2.711045265197754, + "grad_norm_var": 212.19724917857505, + "learning_rate": 0.0001, + "loss": 1.0909, + "loss/crossentropy": 2.5284109115600586, + "loss/hidden": 0.8984375, + "loss/logits": 0.13792011141777039, + "loss/reg": 0.0054582892917096615, + "step": 506 + }, + { + "epoch": 0.063375, + "grad_norm": 2.2481954097747803, + "grad_norm_var": 212.65447803299938, + "learning_rate": 0.0001, + "loss": 0.9413, + "loss/crossentropy": 2.5293643474578857, + "loss/hidden": 0.7578125, + "loss/logits": 0.12894126772880554, + "loss/reg": 0.005456262268126011, + "step": 507 + }, + { + "epoch": 0.0635, + "grad_norm": 2.7053897380828857, + "grad_norm_var": 212.80895755175322, + "learning_rate": 0.0001, + "loss": 1.0773, + "loss/crossentropy": 2.506075143814087, + "loss/hidden": 0.87109375, + "loss/logits": 0.15166552364826202, + "loss/reg": 0.005454184953123331, + "step": 508 + }, + { + "epoch": 0.063625, + "grad_norm": 2.2455244064331055, + "grad_norm_var": 213.0278691549671, + "learning_rate": 0.0001, + "loss": 0.9768, + "loss/crossentropy": 2.6757209300994873, + "loss/hidden": 0.7890625, + "loss/logits": 0.13317805528640747, + "loss/reg": 0.005452104844152927, + "step": 509 + }, + { + "epoch": 0.06375, + "grad_norm": 2.599388360977173, + "grad_norm_var": 213.05941324718256, + "learning_rate": 0.0001, + "loss": 0.9486, + "loss/crossentropy": 2.5610482692718506, + "loss/hidden": 0.76953125, + "loss/logits": 0.1245487853884697, + "loss/reg": 0.005449967924505472, + "step": 510 + }, + { + "epoch": 0.063875, + "grad_norm": 2.648411989212036, + "grad_norm_var": 212.67000591016287, + "learning_rate": 0.0001, + "loss": 1.0344, + "loss/crossentropy": 2.4520514011383057, + "loss/hidden": 0.86328125, + "loss/logits": 0.11664330959320068, + "loss/reg": 0.005447922740131617, + "step": 511 + }, + { + "epoch": 0.064, + "grad_norm": 2.380727767944336, + "grad_norm_var": 212.84492196466834, + "learning_rate": 0.0001, + "loss": 1.0266, + "loss/crossentropy": 2.1030967235565186, + "loss/hidden": 0.83984375, + "loss/logits": 0.13228739798069, + "loss/reg": 0.0054458137601614, + "step": 512 + }, + { + "epoch": 0.064125, + "grad_norm": 2.198631763458252, + "grad_norm_var": 213.5768536641417, + "learning_rate": 0.0001, + "loss": 1.1228, + "loss/crossentropy": 2.3845298290252686, + "loss/hidden": 0.90625, + "loss/logits": 0.16212098300457, + "loss/reg": 0.0054436735808849335, + "step": 513 + }, + { + "epoch": 0.06425, + "grad_norm": 2.8499302864074707, + "grad_norm_var": 213.51026966359936, + "learning_rate": 0.0001, + "loss": 1.114, + "loss/crossentropy": 2.8064866065979004, + "loss/hidden": 0.89453125, + "loss/logits": 0.16508570313453674, + "loss/reg": 0.0054416959173977375, + "step": 514 + }, + { + "epoch": 0.064375, + "grad_norm": 2.5610392093658447, + "grad_norm_var": 201.9317150765609, + "learning_rate": 0.0001, + "loss": 0.9877, + "loss/crossentropy": 2.331996202468872, + "loss/hidden": 0.79296875, + "loss/logits": 0.14031504094600677, + "loss/reg": 0.005439713131636381, + "step": 515 + }, + { + "epoch": 0.0645, + "grad_norm": 2.0442378520965576, + "grad_norm_var": 202.38943138060174, + "learning_rate": 0.0001, + "loss": 1.0106, + "loss/crossentropy": 2.4245524406433105, + "loss/hidden": 0.8203125, + "loss/logits": 0.13589094579219818, + "loss/reg": 0.0054377601481974125, + "step": 516 + }, + { + "epoch": 0.064625, + "grad_norm": 2.7959375381469727, + "grad_norm_var": 202.30067826507621, + "learning_rate": 0.0001, + "loss": 1.1462, + "loss/crossentropy": 2.4928247928619385, + "loss/hidden": 0.9375, + "loss/logits": 0.154384583234787, + "loss/reg": 0.005435979925096035, + "step": 517 + }, + { + "epoch": 0.06475, + "grad_norm": 2.5521795749664307, + "grad_norm_var": 202.78524814255965, + "learning_rate": 0.0001, + "loss": 1.1979, + "loss/crossentropy": 2.4683828353881836, + "loss/hidden": 0.96484375, + "loss/logits": 0.17875435948371887, + "loss/reg": 0.005434305872768164, + "step": 518 + }, + { + "epoch": 0.064875, + "grad_norm": 3.04142165184021, + "grad_norm_var": 202.5720686279479, + "learning_rate": 0.0001, + "loss": 1.0734, + "loss/crossentropy": 2.5951480865478516, + "loss/hidden": 0.87890625, + "loss/logits": 0.14021140336990356, + "loss/reg": 0.005432285368442535, + "step": 519 + }, + { + "epoch": 0.065, + "grad_norm": 2.7776551246643066, + "grad_norm_var": 202.31453305561996, + "learning_rate": 0.0001, + "loss": 1.0546, + "loss/crossentropy": 2.4622325897216797, + "loss/hidden": 0.85546875, + "loss/logits": 0.14482024312019348, + "loss/reg": 0.0054303682409226894, + "step": 520 + }, + { + "epoch": 0.065125, + "grad_norm": 3.091510057449341, + "grad_norm_var": 0.0909682998746592, + "learning_rate": 0.0001, + "loss": 1.2902, + "loss/crossentropy": 2.1909449100494385, + "loss/hidden": 1.0703125, + "loss/logits": 0.1655557006597519, + "loss/reg": 0.005428609903901815, + "step": 521 + }, + { + "epoch": 0.06525, + "grad_norm": 2.5140562057495117, + "grad_norm_var": 0.09023274223209772, + "learning_rate": 0.0001, + "loss": 1.0304, + "loss/crossentropy": 2.4252545833587646, + "loss/hidden": 0.8125, + "loss/logits": 0.163617342710495, + "loss/reg": 0.005426718853414059, + "step": 522 + }, + { + "epoch": 0.065375, + "grad_norm": 2.4871199131011963, + "grad_norm_var": 0.08328167859519695, + "learning_rate": 0.0001, + "loss": 1.0136, + "loss/crossentropy": 2.8315296173095703, + "loss/hidden": 0.80859375, + "loss/logits": 0.15072785317897797, + "loss/reg": 0.0054249088279902935, + "step": 523 + }, + { + "epoch": 0.0655, + "grad_norm": 2.5452873706817627, + "grad_norm_var": 0.082491431169228, + "learning_rate": 0.0001, + "loss": 1.1747, + "loss/crossentropy": 2.4235100746154785, + "loss/hidden": 0.953125, + "loss/logits": 0.1673499345779419, + "loss/reg": 0.005423161666840315, + "step": 524 + }, + { + "epoch": 0.065625, + "grad_norm": 2.5849223136901855, + "grad_norm_var": 0.0744047548688132, + "learning_rate": 0.0001, + "loss": 1.1537, + "loss/crossentropy": 2.517444133758545, + "loss/hidden": 0.9375, + "loss/logits": 0.16202498972415924, + "loss/reg": 0.005421151407063007, + "step": 525 + }, + { + "epoch": 0.06575, + "grad_norm": 2.5873005390167236, + "grad_norm_var": 0.07442217159387093, + "learning_rate": 0.0001, + "loss": 1.0671, + "loss/crossentropy": 2.467041254043579, + "loss/hidden": 0.8671875, + "loss/logits": 0.1457323431968689, + "loss/reg": 0.0054191285744309425, + "step": 526 + }, + { + "epoch": 0.065875, + "grad_norm": 2.362294912338257, + "grad_norm_var": 0.07783568042826777, + "learning_rate": 0.0001, + "loss": 1.0313, + "loss/crossentropy": 2.1884958744049072, + "loss/hidden": 0.8359375, + "loss/logits": 0.14114870131015778, + "loss/reg": 0.005417390260845423, + "step": 527 + }, + { + "epoch": 0.066, + "grad_norm": 2.9457032680511475, + "grad_norm_var": 0.08233057116115011, + "learning_rate": 0.0001, + "loss": 1.1121, + "loss/crossentropy": 2.4786124229431152, + "loss/hidden": 0.9140625, + "loss/logits": 0.14386197924613953, + "loss/reg": 0.005415752530097961, + "step": 528 + }, + { + "epoch": 0.066125, + "grad_norm": 2.302025556564331, + "grad_norm_var": 0.07717323196560505, + "learning_rate": 0.0001, + "loss": 1.0216, + "loss/crossentropy": 2.3902788162231445, + "loss/hidden": 0.81640625, + "loss/logits": 0.15102702379226685, + "loss/reg": 0.005413680803030729, + "step": 529 + }, + { + "epoch": 0.06625, + "grad_norm": 2.2427210807800293, + "grad_norm_var": 0.08222220602995639, + "learning_rate": 0.0001, + "loss": 1.0503, + "loss/crossentropy": 2.5187177658081055, + "loss/hidden": 0.85546875, + "loss/logits": 0.1406846046447754, + "loss/reg": 0.005411935038864613, + "step": 530 + }, + { + "epoch": 0.066375, + "grad_norm": 2.215160846710205, + "grad_norm_var": 0.09102156065525453, + "learning_rate": 0.0001, + "loss": 1.0564, + "loss/crossentropy": 2.7218358516693115, + "loss/hidden": 0.8515625, + "loss/logits": 0.15077106654644012, + "loss/reg": 0.00541025260463357, + "step": 531 + }, + { + "epoch": 0.0665, + "grad_norm": 49.282718658447266, + "grad_norm_var": 136.25864998814177, + "learning_rate": 0.0001, + "loss": 1.0971, + "loss/crossentropy": 2.2880303859710693, + "loss/hidden": 0.90625, + "loss/logits": 0.136735200881958, + "loss/reg": 0.005408551078289747, + "step": 532 + }, + { + "epoch": 0.066625, + "grad_norm": 2.9009287357330322, + "grad_norm_var": 136.22119824556128, + "learning_rate": 0.0001, + "loss": 1.2051, + "loss/crossentropy": 2.069505214691162, + "loss/hidden": 0.96875, + "loss/logits": 0.18224555253982544, + "loss/reg": 0.00540671544149518, + "step": 533 + }, + { + "epoch": 0.06675, + "grad_norm": 3.366948127746582, + "grad_norm_var": 135.93950988587417, + "learning_rate": 0.0001, + "loss": 1.2696, + "loss/crossentropy": 2.5054562091827393, + "loss/hidden": 1.0234375, + "loss/logits": 0.19209496676921844, + "loss/reg": 0.005404717288911343, + "step": 534 + }, + { + "epoch": 0.066875, + "grad_norm": 2.380380868911743, + "grad_norm_var": 136.19039047350162, + "learning_rate": 0.0001, + "loss": 1.0532, + "loss/crossentropy": 2.4644546508789062, + "loss/hidden": 0.85546875, + "loss/logits": 0.14370107650756836, + "loss/reg": 0.005402736831456423, + "step": 535 + }, + { + "epoch": 0.067, + "grad_norm": 2.2630538940429688, + "grad_norm_var": 136.3962470934156, + "learning_rate": 0.0001, + "loss": 1.106, + "loss/crossentropy": 2.362761974334717, + "loss/hidden": 0.88671875, + "loss/logits": 0.16529247164726257, + "loss/reg": 0.005401079077273607, + "step": 536 + }, + { + "epoch": 0.067125, + "grad_norm": 2.2985637187957764, + "grad_norm_var": 136.69066191681563, + "learning_rate": 0.0001, + "loss": 1.1188, + "loss/crossentropy": 2.2513458728790283, + "loss/hidden": 0.90625, + "loss/logits": 0.158562570810318, + "loss/reg": 0.005399197805672884, + "step": 537 + }, + { + "epoch": 0.06725, + "grad_norm": 2.9178526401519775, + "grad_norm_var": 136.54251636267432, + "learning_rate": 0.0001, + "loss": 1.2523, + "loss/crossentropy": 2.4714279174804688, + "loss/hidden": 1.0, + "loss/logits": 0.1983010470867157, + "loss/reg": 0.005397453438490629, + "step": 538 + }, + { + "epoch": 0.067375, + "grad_norm": 2.1185367107391357, + "grad_norm_var": 136.69809974879476, + "learning_rate": 0.0001, + "loss": 1.0218, + "loss/crossentropy": 2.5675318241119385, + "loss/hidden": 0.83203125, + "loss/logits": 0.135833740234375, + "loss/reg": 0.005395461805164814, + "step": 539 + }, + { + "epoch": 0.0675, + "grad_norm": 2.471010684967041, + "grad_norm_var": 136.72728236316837, + "learning_rate": 0.0001, + "loss": 0.9973, + "loss/crossentropy": 2.434710741043091, + "loss/hidden": 0.80078125, + "loss/logits": 0.1425883173942566, + "loss/reg": 0.005393547471612692, + "step": 540 + }, + { + "epoch": 0.067625, + "grad_norm": 2.24953556060791, + "grad_norm_var": 136.86254598209106, + "learning_rate": 0.0001, + "loss": 1.031, + "loss/crossentropy": 2.2015278339385986, + "loss/hidden": 0.83984375, + "loss/logits": 0.13719773292541504, + "loss/reg": 0.005391509272158146, + "step": 541 + }, + { + "epoch": 0.06775, + "grad_norm": 3.6081595420837402, + "grad_norm_var": 136.54053740800046, + "learning_rate": 0.0001, + "loss": 1.4308, + "loss/crossentropy": 2.6074843406677246, + "loss/hidden": 1.1328125, + "loss/logits": 0.244051992893219, + "loss/reg": 0.0053895004093647, + "step": 542 + }, + { + "epoch": 0.067875, + "grad_norm": 2.293153762817383, + "grad_norm_var": 136.5697192568711, + "learning_rate": 0.0001, + "loss": 1.0019, + "loss/crossentropy": 2.604656934738159, + "loss/hidden": 0.8125, + "loss/logits": 0.13555657863616943, + "loss/reg": 0.005387555807828903, + "step": 543 + }, + { + "epoch": 0.068, + "grad_norm": 3.9054906368255615, + "grad_norm_var": 136.30156429508213, + "learning_rate": 0.0001, + "loss": 1.2067, + "loss/crossentropy": 2.723475456237793, + "loss/hidden": 1.0078125, + "loss/logits": 0.14498497545719147, + "loss/reg": 0.005385412368923426, + "step": 544 + }, + { + "epoch": 0.068125, + "grad_norm": 2.571354389190674, + "grad_norm_var": 136.18942504783266, + "learning_rate": 0.0001, + "loss": 1.1473, + "loss/crossentropy": 2.686601400375366, + "loss/hidden": 0.87109375, + "loss/logits": 0.22237557172775269, + "loss/reg": 0.005383248440921307, + "step": 545 + }, + { + "epoch": 0.06825, + "grad_norm": 2.71347975730896, + "grad_norm_var": 135.99456491905767, + "learning_rate": 0.0001, + "loss": 1.2265, + "loss/crossentropy": 2.3059206008911133, + "loss/hidden": 0.99609375, + "loss/logits": 0.17658907175064087, + "loss/reg": 0.00538119999691844, + "step": 546 + }, + { + "epoch": 0.068375, + "grad_norm": 2.2055041790008545, + "grad_norm_var": 135.99892540184646, + "learning_rate": 0.0001, + "loss": 1.227, + "loss/crossentropy": 2.1022770404815674, + "loss/hidden": 1.0078125, + "loss/logits": 0.165423184633255, + "loss/reg": 0.00537898438051343, + "step": 547 + }, + { + "epoch": 0.0685, + "grad_norm": 2.9577088356018066, + "grad_norm_var": 0.2900975003793434, + "learning_rate": 0.0001, + "loss": 1.0223, + "loss/crossentropy": 2.7894651889801025, + "loss/hidden": 0.8359375, + "loss/logits": 0.13254427909851074, + "loss/reg": 0.0053769489750266075, + "step": 548 + }, + { + "epoch": 0.068625, + "grad_norm": 2.289632797241211, + "grad_norm_var": 0.2971860973100412, + "learning_rate": 0.0001, + "loss": 0.9706, + "loss/crossentropy": 1.9662660360336304, + "loss/hidden": 0.796875, + "loss/logits": 0.11997567117214203, + "loss/reg": 0.005374929867684841, + "step": 549 + }, + { + "epoch": 0.06875, + "grad_norm": 2.7140934467315674, + "grad_norm_var": 0.26256089477725764, + "learning_rate": 0.0001, + "loss": 1.0877, + "loss/crossentropy": 2.600255012512207, + "loss/hidden": 0.875, + "loss/logits": 0.15900644659996033, + "loss/reg": 0.005373071413487196, + "step": 550 + }, + { + "epoch": 0.068875, + "grad_norm": 2.7848618030548096, + "grad_norm_var": 0.25973690827483153, + "learning_rate": 0.0001, + "loss": 1.1564, + "loss/crossentropy": 2.347534656524658, + "loss/hidden": 0.9296875, + "loss/logits": 0.1730039119720459, + "loss/reg": 0.005370850209146738, + "step": 551 + }, + { + "epoch": 0.069, + "grad_norm": 2.1625566482543945, + "grad_norm_var": 0.26552124449597064, + "learning_rate": 0.0001, + "loss": 1.0454, + "loss/crossentropy": 2.766214609146118, + "loss/hidden": 0.84765625, + "loss/logits": 0.1440483182668686, + "loss/reg": 0.005368667654693127, + "step": 552 + }, + { + "epoch": 0.069125, + "grad_norm": 2.8881192207336426, + "grad_norm_var": 0.2602997020069113, + "learning_rate": 0.0001, + "loss": 1.0958, + "loss/crossentropy": 2.808046817779541, + "loss/hidden": 0.89453125, + "loss/logits": 0.14757747948169708, + "loss/reg": 0.005366665776818991, + "step": 553 + }, + { + "epoch": 0.06925, + "grad_norm": 2.6187915802001953, + "grad_norm_var": 0.2563330715515538, + "learning_rate": 0.0001, + "loss": 1.2873, + "loss/crossentropy": 2.5937459468841553, + "loss/hidden": 1.03125, + "loss/logits": 0.20239150524139404, + "loss/reg": 0.005364455748349428, + "step": 554 + }, + { + "epoch": 0.069375, + "grad_norm": 2.6418044567108154, + "grad_norm_var": 0.23570370249948383, + "learning_rate": 0.0001, + "loss": 1.1877, + "loss/crossentropy": 2.5635995864868164, + "loss/hidden": 0.9609375, + "loss/logits": 0.17318235337734222, + "loss/reg": 0.005362290423363447, + "step": 555 + }, + { + "epoch": 0.0695, + "grad_norm": 2.79367733001709, + "grad_norm_var": 0.2326946034348102, + "learning_rate": 0.0001, + "loss": 1.2018, + "loss/crossentropy": 2.13350510597229, + "loss/hidden": 1.0, + "loss/logits": 0.14820048213005066, + "loss/reg": 0.005360215436667204, + "step": 556 + }, + { + "epoch": 0.069625, + "grad_norm": 3.387333869934082, + "grad_norm_var": 0.2433911623755942, + "learning_rate": 0.0001, + "loss": 1.0866, + "loss/crossentropy": 2.4682462215423584, + "loss/hidden": 0.859375, + "loss/logits": 0.17359653115272522, + "loss/reg": 0.005357977002859116, + "step": 557 + }, + { + "epoch": 0.06975, + "grad_norm": 2.9143950939178467, + "grad_norm_var": 0.19718877969401258, + "learning_rate": 0.0001, + "loss": 1.1093, + "loss/crossentropy": 2.6049532890319824, + "loss/hidden": 0.87890625, + "loss/logits": 0.1768435537815094, + "loss/reg": 0.005355944857001305, + "step": 558 + }, + { + "epoch": 0.069875, + "grad_norm": 2.495455741882324, + "grad_norm_var": 0.18769030937939207, + "learning_rate": 0.0001, + "loss": 1.0522, + "loss/crossentropy": 2.2471513748168945, + "loss/hidden": 0.8203125, + "loss/logits": 0.17833727598190308, + "loss/reg": 0.005353772081434727, + "step": 559 + }, + { + "epoch": 0.07, + "grad_norm": 3.548495054244995, + "grad_norm_var": 0.140786672247814, + "learning_rate": 0.0001, + "loss": 1.2284, + "loss/crossentropy": 2.0618865489959717, + "loss/hidden": 0.984375, + "loss/logits": 0.1905221790075302, + "loss/reg": 0.005351651925593615, + "step": 560 + }, + { + "epoch": 0.070125, + "grad_norm": 2.8271002769470215, + "grad_norm_var": 0.1394493347625937, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.392343282699585, + "loss/hidden": 0.87109375, + "loss/logits": 0.1387328803539276, + "loss/reg": 0.005349620711058378, + "step": 561 + }, + { + "epoch": 0.07025, + "grad_norm": 2.02376389503479, + "grad_norm_var": 0.17221200465598158, + "learning_rate": 0.0001, + "loss": 1.0841, + "loss/crossentropy": 2.26953125, + "loss/hidden": 0.87890625, + "loss/logits": 0.15167732536792755, + "loss/reg": 0.005347614176571369, + "step": 562 + }, + { + "epoch": 0.070375, + "grad_norm": 3.3085341453552246, + "grad_norm_var": 0.17503849488183504, + "learning_rate": 0.0001, + "loss": 0.9333, + "loss/crossentropy": 2.278923511505127, + "loss/hidden": 0.76171875, + "loss/logits": 0.11809547245502472, + "loss/reg": 0.005345623474568129, + "step": 563 + }, + { + "epoch": 0.0705, + "grad_norm": 6.067057132720947, + "grad_norm_var": 0.8561705035714908, + "learning_rate": 0.0001, + "loss": 1.6764, + "loss/crossentropy": 2.513504981994629, + "loss/hidden": 1.421875, + "loss/logits": 0.20107370615005493, + "loss/reg": 0.00534354243427515, + "step": 564 + }, + { + "epoch": 0.070625, + "grad_norm": 2.2686562538146973, + "grad_norm_var": 0.8580914081280743, + "learning_rate": 0.0001, + "loss": 1.0927, + "loss/crossentropy": 2.2088499069213867, + "loss/hidden": 0.88671875, + "loss/logits": 0.15257079899311066, + "loss/reg": 0.005341436248272657, + "step": 565 + }, + { + "epoch": 0.07075, + "grad_norm": 3.1334891319274902, + "grad_norm_var": 0.8550377421403706, + "learning_rate": 0.0001, + "loss": 1.3017, + "loss/crossentropy": 2.117647886276245, + "loss/hidden": 1.0703125, + "loss/logits": 0.17795339226722717, + "loss/reg": 0.005339318886399269, + "step": 566 + }, + { + "epoch": 0.070875, + "grad_norm": 4.6122727394104, + "grad_norm_var": 1.0134023805364716, + "learning_rate": 0.0001, + "loss": 1.4641, + "loss/crossentropy": 2.7603936195373535, + "loss/hidden": 1.140625, + "loss/logits": 0.27013444900512695, + "loss/reg": 0.005337177775800228, + "step": 567 + }, + { + "epoch": 0.071, + "grad_norm": 2.583162307739258, + "grad_norm_var": 0.9715659491999304, + "learning_rate": 0.0001, + "loss": 1.2051, + "loss/crossentropy": 2.380053758621216, + "loss/hidden": 0.9765625, + "loss/logits": 0.1751583367586136, + "loss/reg": 0.005335117690265179, + "step": 568 + }, + { + "epoch": 0.071125, + "grad_norm": 2.2990646362304688, + "grad_norm_var": 1.0124076074311148, + "learning_rate": 0.0001, + "loss": 1.1188, + "loss/crossentropy": 2.587369203567505, + "loss/hidden": 0.8984375, + "loss/logits": 0.16701750457286835, + "loss/reg": 0.005332810804247856, + "step": 569 + }, + { + "epoch": 0.07125, + "grad_norm": 3.1470165252685547, + "grad_norm_var": 0.9962936596825245, + "learning_rate": 0.0001, + "loss": 1.0239, + "loss/crossentropy": 2.492009162902832, + "loss/hidden": 0.79296875, + "loss/logits": 0.17766177654266357, + "loss/reg": 0.0053307050839066505, + "step": 570 + }, + { + "epoch": 0.071375, + "grad_norm": 2.8765156269073486, + "grad_norm_var": 0.9845149270166076, + "learning_rate": 0.0001, + "loss": 1.1206, + "loss/crossentropy": 2.714184045791626, + "loss/hidden": 0.91015625, + "loss/logits": 0.15715843439102173, + "loss/reg": 0.005328655708581209, + "step": 571 + }, + { + "epoch": 0.0715, + "grad_norm": 3.696258068084717, + "grad_norm_var": 0.9934068745617035, + "learning_rate": 0.0001, + "loss": 1.7557, + "loss/crossentropy": 2.322871685028076, + "loss/hidden": 1.40625, + "loss/logits": 0.2961430847644806, + "loss/reg": 0.005326449871063232, + "step": 572 + }, + { + "epoch": 0.071625, + "grad_norm": 2.3756725788116455, + "grad_norm_var": 1.0320075552341808, + "learning_rate": 0.0001, + "loss": 1.0069, + "loss/crossentropy": 2.446782350540161, + "loss/hidden": 0.80859375, + "loss/logits": 0.14505374431610107, + "loss/reg": 0.005324224475771189, + "step": 573 + }, + { + "epoch": 0.07175, + "grad_norm": 3.0114002227783203, + "grad_norm_var": 1.0297287032782758, + "learning_rate": 0.0001, + "loss": 1.0473, + "loss/crossentropy": 2.815708637237549, + "loss/hidden": 0.84765625, + "loss/logits": 0.146395742893219, + "loss/reg": 0.005321910604834557, + "step": 574 + }, + { + "epoch": 0.071875, + "grad_norm": 2.7700350284576416, + "grad_norm_var": 1.0107660796879199, + "learning_rate": 0.0001, + "loss": 0.9849, + "loss/crossentropy": 2.579951286315918, + "loss/hidden": 0.79296875, + "loss/logits": 0.13869163393974304, + "loss/reg": 0.005319789983332157, + "step": 575 + }, + { + "epoch": 0.072, + "grad_norm": 2.1934142112731934, + "grad_norm_var": 1.0552091073780958, + "learning_rate": 0.0001, + "loss": 1.0376, + "loss/crossentropy": 2.7635080814361572, + "loss/hidden": 0.83203125, + "loss/logits": 0.1523815095424652, + "loss/reg": 0.005317789036780596, + "step": 576 + }, + { + "epoch": 0.072125, + "grad_norm": 2.203432321548462, + "grad_norm_var": 1.1000992612664597, + "learning_rate": 0.0001, + "loss": 1.0214, + "loss/crossentropy": 2.2990267276763916, + "loss/hidden": 0.81640625, + "loss/logits": 0.15180249512195587, + "loss/reg": 0.005315590649843216, + "step": 577 + }, + { + "epoch": 0.07225, + "grad_norm": 2.7597663402557373, + "grad_norm_var": 1.0346594183063509, + "learning_rate": 0.0001, + "loss": 1.2895, + "loss/crossentropy": 2.7941789627075195, + "loss/hidden": 1.0390625, + "loss/logits": 0.1973191797733307, + "loss/reg": 0.00531340204179287, + "step": 578 + }, + { + "epoch": 0.072375, + "grad_norm": 2.151498794555664, + "grad_norm_var": 1.0833220696738444, + "learning_rate": 0.0001, + "loss": 0.9999, + "loss/crossentropy": 2.4545745849609375, + "loss/hidden": 0.8125, + "loss/logits": 0.1343034952878952, + "loss/reg": 0.005311093758791685, + "step": 579 + }, + { + "epoch": 0.0725, + "grad_norm": 2.758521318435669, + "grad_norm_var": 0.4185770203540026, + "learning_rate": 0.0001, + "loss": 1.1404, + "loss/crossentropy": 2.3098721504211426, + "loss/hidden": 0.93359375, + "loss/logits": 0.1536703109741211, + "loss/reg": 0.00530878035351634, + "step": 580 + }, + { + "epoch": 0.072625, + "grad_norm": 3.102933406829834, + "grad_norm_var": 0.40269379192041677, + "learning_rate": 0.0001, + "loss": 1.4046, + "loss/crossentropy": 2.4322452545166016, + "loss/hidden": 1.125, + "loss/logits": 0.22653597593307495, + "loss/reg": 0.005306490696966648, + "step": 581 + }, + { + "epoch": 0.07275, + "grad_norm": 2.831894636154175, + "grad_norm_var": 0.39716603194751554, + "learning_rate": 0.0001, + "loss": 1.0557, + "loss/crossentropy": 2.5340616703033447, + "loss/hidden": 0.859375, + "loss/logits": 0.14325933158397675, + "loss/reg": 0.005304399877786636, + "step": 582 + }, + { + "epoch": 0.072875, + "grad_norm": 2.5802910327911377, + "grad_norm_var": 0.17392503265121587, + "learning_rate": 0.0001, + "loss": 1.0795, + "loss/crossentropy": 2.5839056968688965, + "loss/hidden": 0.86328125, + "loss/logits": 0.16314582526683807, + "loss/reg": 0.005302343517541885, + "step": 583 + }, + { + "epoch": 0.073, + "grad_norm": 2.650399684906006, + "grad_norm_var": 0.17308120367785024, + "learning_rate": 0.0001, + "loss": 1.1285, + "loss/crossentropy": 2.487835645675659, + "loss/hidden": 0.91796875, + "loss/logits": 0.1575045883655548, + "loss/reg": 0.005300293210893869, + "step": 584 + }, + { + "epoch": 0.073125, + "grad_norm": 2.5146095752716064, + "grad_norm_var": 0.16408850139509898, + "learning_rate": 0.0001, + "loss": 0.8801, + "loss/crossentropy": 2.710824728012085, + "loss/hidden": 0.71875, + "loss/logits": 0.10839369148015976, + "loss/reg": 0.005298234056681395, + "step": 585 + }, + { + "epoch": 0.07325, + "grad_norm": 3.5579047203063965, + "grad_norm_var": 0.19767952383566936, + "learning_rate": 0.0001, + "loss": 1.0659, + "loss/crossentropy": 2.5979011058807373, + "loss/hidden": 0.86328125, + "loss/logits": 0.1496235430240631, + "loss/reg": 0.005296017974615097, + "step": 586 + }, + { + "epoch": 0.073375, + "grad_norm": 3.229036569595337, + "grad_norm_var": 0.21129156050842327, + "learning_rate": 0.0001, + "loss": 1.3736, + "loss/crossentropy": 2.0404820442199707, + "loss/hidden": 1.109375, + "loss/logits": 0.2113049328327179, + "loss/reg": 0.005293776281177998, + "step": 587 + }, + { + "epoch": 0.0735, + "grad_norm": 3.219778537750244, + "grad_norm_var": 0.1669016788033178, + "learning_rate": 0.0001, + "loss": 1.1673, + "loss/crossentropy": 2.5946946144104004, + "loss/hidden": 0.96875, + "loss/logits": 0.14563477039337158, + "loss/reg": 0.005291698966175318, + "step": 588 + }, + { + "epoch": 0.073625, + "grad_norm": 2.622143030166626, + "grad_norm_var": 0.15858063234232014, + "learning_rate": 0.0001, + "loss": 1.2658, + "loss/crossentropy": 2.2183616161346436, + "loss/hidden": 1.0078125, + "loss/logits": 0.20505878329277039, + "loss/reg": 0.0052896649576723576, + "step": 589 + }, + { + "epoch": 0.07375, + "grad_norm": 2.496985673904419, + "grad_norm_var": 0.15786373129452994, + "learning_rate": 0.0001, + "loss": 1.008, + "loss/crossentropy": 2.4871459007263184, + "loss/hidden": 0.80078125, + "loss/logits": 0.15438680350780487, + "loss/reg": 0.005287437699735165, + "step": 590 + }, + { + "epoch": 0.073875, + "grad_norm": 3.0701065063476562, + "grad_norm_var": 0.1651866047673136, + "learning_rate": 0.0001, + "loss": 1.1327, + "loss/crossentropy": 2.414074420928955, + "loss/hidden": 0.9140625, + "loss/logits": 0.16581328213214874, + "loss/reg": 0.005285393912345171, + "step": 591 + }, + { + "epoch": 0.074, + "grad_norm": 3.9464497566223145, + "grad_norm_var": 0.22799900213871613, + "learning_rate": 0.0001, + "loss": 1.5728, + "loss/crossentropy": 2.3605363368988037, + "loss/hidden": 1.265625, + "loss/logits": 0.25435870885849, + "loss/reg": 0.0052834744565188885, + "step": 592 + }, + { + "epoch": 0.074125, + "grad_norm": 2.5776402950286865, + "grad_norm_var": 0.20419228079197158, + "learning_rate": 0.0001, + "loss": 1.2113, + "loss/crossentropy": 2.3519833087921143, + "loss/hidden": 0.9765625, + "loss/logits": 0.18191702663898468, + "loss/reg": 0.0052813272923231125, + "step": 593 + }, + { + "epoch": 0.07425, + "grad_norm": 2.2281813621520996, + "grad_norm_var": 0.23033113710586123, + "learning_rate": 0.0001, + "loss": 1.072, + "loss/crossentropy": 2.38511323928833, + "loss/hidden": 0.875, + "loss/logits": 0.14420956373214722, + "loss/reg": 0.0052796173840761185, + "step": 594 + }, + { + "epoch": 0.074375, + "grad_norm": 3.1069376468658447, + "grad_norm_var": 0.19889239941204717, + "learning_rate": 0.0001, + "loss": 1.1811, + "loss/crossentropy": 2.413785696029663, + "loss/hidden": 0.984375, + "loss/logits": 0.14399147033691406, + "loss/reg": 0.005277944263070822, + "step": 595 + }, + { + "epoch": 0.0745, + "grad_norm": 3.3450682163238525, + "grad_norm_var": 0.2088716594217845, + "learning_rate": 0.0001, + "loss": 1.2822, + "loss/crossentropy": 2.434509038925171, + "loss/hidden": 1.0390625, + "loss/logits": 0.1903418004512787, + "loss/reg": 0.005276298616081476, + "step": 596 + }, + { + "epoch": 0.074625, + "grad_norm": 2.371161937713623, + "grad_norm_var": 0.22668853941960734, + "learning_rate": 0.0001, + "loss": 1.0528, + "loss/crossentropy": 2.396265745162964, + "loss/hidden": 0.85546875, + "loss/logits": 0.14461319148540497, + "loss/reg": 0.005274245049804449, + "step": 597 + }, + { + "epoch": 0.07475, + "grad_norm": 3.314265251159668, + "grad_norm_var": 0.2370575162921483, + "learning_rate": 0.0001, + "loss": 1.1987, + "loss/crossentropy": 2.7262494564056396, + "loss/hidden": 0.9765625, + "loss/logits": 0.16941672563552856, + "loss/reg": 0.0052725388668477535, + "step": 598 + }, + { + "epoch": 0.074875, + "grad_norm": 3.7527589797973633, + "grad_norm_var": 0.2687845607887461, + "learning_rate": 0.0001, + "loss": 1.2156, + "loss/crossentropy": 2.6718039512634277, + "loss/hidden": 0.98046875, + "loss/logits": 0.18241068720817566, + "loss/reg": 0.005270869936794043, + "step": 599 + }, + { + "epoch": 0.075, + "grad_norm": 2.5073466300964355, + "grad_norm_var": 0.2767358438012515, + "learning_rate": 0.0001, + "loss": 1.0856, + "loss/crossentropy": 2.3735952377319336, + "loss/hidden": 0.875, + "loss/logits": 0.15789943933486938, + "loss/reg": 0.005268939305096865, + "step": 600 + }, + { + "epoch": 0.075125, + "grad_norm": 4.061317443847656, + "grad_norm_var": 0.3279536252115766, + "learning_rate": 0.0001, + "loss": 1.3519, + "loss/crossentropy": 2.5530035495758057, + "loss/hidden": 1.1015625, + "loss/logits": 0.19769783318042755, + "loss/reg": 0.005267218686640263, + "step": 601 + }, + { + "epoch": 0.07525, + "grad_norm": 2.4795703887939453, + "grad_norm_var": 0.33305877012988483, + "learning_rate": 0.0001, + "loss": 1.1642, + "loss/crossentropy": 2.4662020206451416, + "loss/hidden": 0.93359375, + "loss/logits": 0.17794585227966309, + "loss/reg": 0.005265380721539259, + "step": 602 + }, + { + "epoch": 0.075375, + "grad_norm": 3.2844204902648926, + "grad_norm_var": 0.33479007900948143, + "learning_rate": 0.0001, + "loss": 1.0854, + "loss/crossentropy": 2.5502281188964844, + "loss/hidden": 0.87890625, + "loss/logits": 0.15388712286949158, + "loss/reg": 0.005263412371277809, + "step": 603 + }, + { + "epoch": 0.0755, + "grad_norm": 2.4871444702148438, + "grad_norm_var": 0.3492133912507728, + "learning_rate": 0.0001, + "loss": 1.0922, + "loss/crossentropy": 2.5286808013916016, + "loss/hidden": 0.86328125, + "loss/logits": 0.17630262672901154, + "loss/reg": 0.005261610262095928, + "step": 604 + }, + { + "epoch": 0.075625, + "grad_norm": 2.9881107807159424, + "grad_norm_var": 0.3402092077326716, + "learning_rate": 0.0001, + "loss": 1.1525, + "loss/crossentropy": 2.522861957550049, + "loss/hidden": 0.9375, + "loss/logits": 0.16244357824325562, + "loss/reg": 0.005259564146399498, + "step": 605 + }, + { + "epoch": 0.07575, + "grad_norm": 2.3586983680725098, + "grad_norm_var": 0.35069927923201, + "learning_rate": 0.0001, + "loss": 1.1314, + "loss/crossentropy": 2.2875781059265137, + "loss/hidden": 0.921875, + "loss/logits": 0.15690375864505768, + "loss/reg": 0.0052574859000742435, + "step": 606 + }, + { + "epoch": 0.075875, + "grad_norm": 2.6491522789001465, + "grad_norm_var": 0.35741571312755316, + "learning_rate": 0.0001, + "loss": 1.1971, + "loss/crossentropy": 2.2390084266662598, + "loss/hidden": 0.95703125, + "loss/logits": 0.1875428408384323, + "loss/reg": 0.005255614407360554, + "step": 607 + }, + { + "epoch": 0.076, + "grad_norm": 2.073080539703369, + "grad_norm_var": 0.33189569909254335, + "learning_rate": 0.0001, + "loss": 0.9664, + "loss/crossentropy": 2.3911304473876953, + "loss/hidden": 0.7890625, + "loss/logits": 0.12475378811359406, + "loss/reg": 0.005253734532743692, + "step": 608 + }, + { + "epoch": 0.076125, + "grad_norm": 2.265080451965332, + "grad_norm_var": 0.34931259933064945, + "learning_rate": 0.0001, + "loss": 1.1539, + "loss/crossentropy": 2.5041847229003906, + "loss/hidden": 0.9375, + "loss/logits": 0.16390517354011536, + "loss/reg": 0.005251840688288212, + "step": 609 + }, + { + "epoch": 0.07625, + "grad_norm": 2.0803020000457764, + "grad_norm_var": 0.3625360811458743, + "learning_rate": 0.0001, + "loss": 1.0503, + "loss/crossentropy": 2.504490375518799, + "loss/hidden": 0.84765625, + "loss/logits": 0.15017710626125336, + "loss/reg": 0.0052499608136713505, + "step": 610 + }, + { + "epoch": 0.076375, + "grad_norm": 2.5562875270843506, + "grad_norm_var": 0.3604403500297356, + "learning_rate": 0.0001, + "loss": 1.0444, + "loss/crossentropy": 2.462942361831665, + "loss/hidden": 0.82421875, + "loss/logits": 0.16772450506687164, + "loss/reg": 0.005247869063168764, + "step": 611 + }, + { + "epoch": 0.0765, + "grad_norm": 2.2976746559143066, + "grad_norm_var": 0.3509101683634808, + "learning_rate": 0.0001, + "loss": 1.0751, + "loss/crossentropy": 2.423419713973999, + "loss/hidden": 0.87890625, + "loss/logits": 0.1437685340642929, + "loss/reg": 0.005245808511972427, + "step": 612 + }, + { + "epoch": 0.076625, + "grad_norm": 2.0925452709198, + "grad_norm_var": 0.368735612720054, + "learning_rate": 0.0001, + "loss": 1.0097, + "loss/crossentropy": 2.422513961791992, + "loss/hidden": 0.80078125, + "loss/logits": 0.15644872188568115, + "loss/reg": 0.0052436222322285175, + "step": 613 + }, + { + "epoch": 0.07675, + "grad_norm": 3.361826181411743, + "grad_norm_var": 0.3727533997750771, + "learning_rate": 0.0001, + "loss": 1.0847, + "loss/crossentropy": 2.6778650283813477, + "loss/hidden": 0.88671875, + "loss/logits": 0.14552772045135498, + "loss/reg": 0.005241374485194683, + "step": 614 + }, + { + "epoch": 0.076875, + "grad_norm": 2.418203353881836, + "grad_norm_var": 0.29779963975303353, + "learning_rate": 0.0001, + "loss": 0.9395, + "loss/crossentropy": 2.5650947093963623, + "loss/hidden": 0.765625, + "loss/logits": 0.12145733833312988, + "loss/reg": 0.005239336285740137, + "step": 615 + }, + { + "epoch": 0.077, + "grad_norm": 2.0879790782928467, + "grad_norm_var": 0.3152329983661199, + "learning_rate": 0.0001, + "loss": 1.0952, + "loss/crossentropy": 2.393650531768799, + "loss/hidden": 0.8828125, + "loss/logits": 0.15999376773834229, + "loss/reg": 0.005237067583948374, + "step": 616 + }, + { + "epoch": 0.077125, + "grad_norm": 2.236255168914795, + "grad_norm_var": 0.1669205481679067, + "learning_rate": 0.0001, + "loss": 0.997, + "loss/crossentropy": 2.7451670169830322, + "loss/hidden": 0.796875, + "loss/logits": 0.1478239744901657, + "loss/reg": 0.005235039163380861, + "step": 617 + }, + { + "epoch": 0.07725, + "grad_norm": 3.4291763305664062, + "grad_norm_var": 0.2229381174587303, + "learning_rate": 0.0001, + "loss": 1.4063, + "loss/crossentropy": 2.2041330337524414, + "loss/hidden": 1.1484375, + "loss/logits": 0.2055673450231552, + "loss/reg": 0.005233013071119785, + "step": 618 + }, + { + "epoch": 0.077375, + "grad_norm": 2.1689059734344482, + "grad_norm_var": 0.19023093415819758, + "learning_rate": 0.0001, + "loss": 1.1399, + "loss/crossentropy": 2.3984005451202393, + "loss/hidden": 0.9296875, + "loss/logits": 0.1579177975654602, + "loss/reg": 0.0052308449521660805, + "step": 619 + }, + { + "epoch": 0.0775, + "grad_norm": 2.2414422035217285, + "grad_norm_var": 0.1935046668737487, + "learning_rate": 0.0001, + "loss": 0.997, + "loss/crossentropy": 2.448946475982666, + "loss/hidden": 0.81640625, + "loss/logits": 0.12831541895866394, + "loss/reg": 0.005228678695857525, + "step": 620 + }, + { + "epoch": 0.077625, + "grad_norm": 2.2024965286254883, + "grad_norm_var": 0.17639827374390885, + "learning_rate": 0.0001, + "loss": 1.1341, + "loss/crossentropy": 2.475011110305786, + "loss/hidden": 0.9140625, + "loss/logits": 0.16773179173469543, + "loss/reg": 0.005226653069257736, + "step": 621 + }, + { + "epoch": 0.07775, + "grad_norm": 3.0845813751220703, + "grad_norm_var": 0.20461207914334617, + "learning_rate": 0.0001, + "loss": 1.2647, + "loss/crossentropy": 2.6116580963134766, + "loss/hidden": 1.0234375, + "loss/logits": 0.188987135887146, + "loss/reg": 0.005224402993917465, + "step": 622 + }, + { + "epoch": 0.077875, + "grad_norm": 1.887999415397644, + "grad_norm_var": 0.2208956692966997, + "learning_rate": 0.0001, + "loss": 1.0901, + "loss/crossentropy": 2.1740164756774902, + "loss/hidden": 0.87890625, + "loss/logits": 0.15901124477386475, + "loss/reg": 0.005222304258495569, + "step": 623 + }, + { + "epoch": 0.078, + "grad_norm": 2.436877489089966, + "grad_norm_var": 0.2130556319156203, + "learning_rate": 0.0001, + "loss": 1.1171, + "loss/crossentropy": 2.405428647994995, + "loss/hidden": 0.9140625, + "loss/logits": 0.1507887840270996, + "loss/reg": 0.005220047663897276, + "step": 624 + }, + { + "epoch": 0.078125, + "grad_norm": 2.1241559982299805, + "grad_norm_var": 0.21735767872165226, + "learning_rate": 0.0001, + "loss": 1.0586, + "loss/crossentropy": 2.534162759780884, + "loss/hidden": 0.84765625, + "loss/logits": 0.1587330847978592, + "loss/reg": 0.005217918660491705, + "step": 625 + }, + { + "epoch": 0.07825, + "grad_norm": 3.6194941997528076, + "grad_norm_var": 0.2958829671732156, + "learning_rate": 0.0001, + "loss": 1.1975, + "loss/crossentropy": 2.4529123306274414, + "loss/hidden": 0.953125, + "loss/logits": 0.1922522485256195, + "loss/reg": 0.005215668119490147, + "step": 626 + }, + { + "epoch": 0.078375, + "grad_norm": 2.9666078090667725, + "grad_norm_var": 0.3086442760245996, + "learning_rate": 0.0001, + "loss": 1.3623, + "loss/crossentropy": 2.2351934909820557, + "loss/hidden": 1.1015625, + "loss/logits": 0.2085917890071869, + "loss/reg": 0.005213598720729351, + "step": 627 + }, + { + "epoch": 0.0785, + "grad_norm": 3.424123764038086, + "grad_norm_var": 0.35140186017642155, + "learning_rate": 0.0001, + "loss": 1.0209, + "loss/crossentropy": 2.593291997909546, + "loss/hidden": 0.8359375, + "loss/logits": 0.13284316658973694, + "loss/reg": 0.005211306270211935, + "step": 628 + }, + { + "epoch": 0.078625, + "grad_norm": 3.1431374549865723, + "grad_norm_var": 0.34770286145400553, + "learning_rate": 0.0001, + "loss": 1.1602, + "loss/crossentropy": 2.5281360149383545, + "loss/hidden": 0.9375, + "loss/logits": 0.1706121563911438, + "loss/reg": 0.005208863411098719, + "step": 629 + }, + { + "epoch": 0.07875, + "grad_norm": 2.7911853790283203, + "grad_norm_var": 0.315955495515612, + "learning_rate": 0.0001, + "loss": 1.2247, + "loss/crossentropy": 1.7649813890457153, + "loss/hidden": 0.99609375, + "loss/logits": 0.17649176716804504, + "loss/reg": 0.005206458270549774, + "step": 630 + }, + { + "epoch": 0.078875, + "grad_norm": 2.941204309463501, + "grad_norm_var": 0.3174858804582356, + "learning_rate": 0.0001, + "loss": 1.1613, + "loss/crossentropy": 2.402109146118164, + "loss/hidden": 0.921875, + "loss/logits": 0.1873561143875122, + "loss/reg": 0.0052040074951946735, + "step": 631 + }, + { + "epoch": 0.079, + "grad_norm": 2.391481876373291, + "grad_norm_var": 0.2995243667524064, + "learning_rate": 0.0001, + "loss": 1.178, + "loss/crossentropy": 2.353907346725464, + "loss/hidden": 0.9609375, + "loss/logits": 0.165082648396492, + "loss/reg": 0.005201911553740501, + "step": 632 + }, + { + "epoch": 0.079125, + "grad_norm": 2.87488055229187, + "grad_norm_var": 0.2861166812267043, + "learning_rate": 0.0001, + "loss": 1.1462, + "loss/crossentropy": 2.653827667236328, + "loss/hidden": 0.91796875, + "loss/logits": 0.17622330784797668, + "loss/reg": 0.00519942119717598, + "step": 633 + }, + { + "epoch": 0.07925, + "grad_norm": 2.4115476608276367, + "grad_norm_var": 0.2563777078404484, + "learning_rate": 0.0001, + "loss": 1.1225, + "loss/crossentropy": 2.4889907836914062, + "loss/hidden": 0.8984375, + "loss/logits": 0.1720612496137619, + "loss/reg": 0.005197320133447647, + "step": 634 + }, + { + "epoch": 0.079375, + "grad_norm": 2.4616434574127197, + "grad_norm_var": 0.24219922325532028, + "learning_rate": 0.0001, + "loss": 1.1805, + "loss/crossentropy": 2.3522822856903076, + "loss/hidden": 0.94140625, + "loss/logits": 0.1871228963136673, + "loss/reg": 0.005195194855332375, + "step": 635 + }, + { + "epoch": 0.0795, + "grad_norm": 2.386276960372925, + "grad_norm_var": 0.23489288483797985, + "learning_rate": 0.0001, + "loss": 0.9463, + "loss/crossentropy": 2.584338426589966, + "loss/hidden": 0.765625, + "loss/logits": 0.12873858213424683, + "loss/reg": 0.005192761775106192, + "step": 636 + }, + { + "epoch": 0.079625, + "grad_norm": 2.454456090927124, + "grad_norm_var": 0.22225700139133117, + "learning_rate": 0.0001, + "loss": 1.0199, + "loss/crossentropy": 2.4049668312072754, + "loss/hidden": 0.80859375, + "loss/logits": 0.15936976671218872, + "loss/reg": 0.005190614145249128, + "step": 637 + }, + { + "epoch": 0.07975, + "grad_norm": 2.1882073879241943, + "grad_norm_var": 0.22800243516591642, + "learning_rate": 0.0001, + "loss": 1.1182, + "loss/crossentropy": 2.5147759914398193, + "loss/hidden": 0.92578125, + "loss/logits": 0.14053833484649658, + "loss/reg": 0.005188319832086563, + "step": 638 + }, + { + "epoch": 0.079875, + "grad_norm": 2.214505434036255, + "grad_norm_var": 0.20121127216839246, + "learning_rate": 0.0001, + "loss": 0.9821, + "loss/crossentropy": 2.6784555912017822, + "loss/hidden": 0.79296875, + "loss/logits": 0.13726986944675446, + "loss/reg": 0.0051859593950212, + "step": 639 + }, + { + "epoch": 0.08, + "grad_norm": 2.8519279956817627, + "grad_norm_var": 0.19869721717550323, + "learning_rate": 0.0001, + "loss": 1.3825, + "loss/crossentropy": 2.4976460933685303, + "loss/hidden": 1.1171875, + "loss/logits": 0.21343019604682922, + "loss/reg": 0.005183514207601547, + "step": 640 + }, + { + "epoch": 0.080125, + "grad_norm": 3.2747607231140137, + "grad_norm_var": 0.1926680012221444, + "learning_rate": 0.0001, + "loss": 1.0585, + "loss/crossentropy": 2.265321969985962, + "loss/hidden": 0.83203125, + "loss/logits": 0.174637109041214, + "loss/reg": 0.005180996377021074, + "step": 641 + }, + { + "epoch": 0.08025, + "grad_norm": 2.433096408843994, + "grad_norm_var": 0.14700668719525894, + "learning_rate": 0.0001, + "loss": 1.0417, + "loss/crossentropy": 2.4713757038116455, + "loss/hidden": 0.83984375, + "loss/logits": 0.15003418922424316, + "loss/reg": 0.005178460851311684, + "step": 642 + }, + { + "epoch": 0.080375, + "grad_norm": 3.037181854248047, + "grad_norm_var": 0.149821407729875, + "learning_rate": 0.0001, + "loss": 1.2828, + "loss/crossentropy": 2.387122631072998, + "loss/hidden": 1.0625, + "loss/logits": 0.16852089762687683, + "loss/reg": 0.005175705999135971, + "step": 643 + }, + { + "epoch": 0.0805, + "grad_norm": 4.6729350090026855, + "grad_norm_var": 0.3670359647179557, + "learning_rate": 0.0001, + "loss": 1.4674, + "loss/crossentropy": 1.862856149673462, + "loss/hidden": 1.21875, + "loss/logits": 0.196872740983963, + "loss/reg": 0.005172953009605408, + "step": 644 + }, + { + "epoch": 0.080625, + "grad_norm": 3.1887784004211426, + "grad_norm_var": 0.3693575970723629, + "learning_rate": 0.0001, + "loss": 1.3902, + "loss/crossentropy": 2.418116569519043, + "loss/hidden": 1.0703125, + "loss/logits": 0.26818597316741943, + "loss/reg": 0.005170162301510572, + "step": 645 + }, + { + "epoch": 0.08075, + "grad_norm": 2.502976179122925, + "grad_norm_var": 0.37434523124654007, + "learning_rate": 0.0001, + "loss": 0.9914, + "loss/crossentropy": 2.666684865951538, + "loss/hidden": 0.796875, + "loss/logits": 0.14282414317131042, + "loss/reg": 0.005168135743588209, + "step": 646 + }, + { + "epoch": 0.080875, + "grad_norm": 3.457416534423828, + "grad_norm_var": 0.4029304846601008, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.6663055419921875, + "loss/hidden": 0.8671875, + "loss/logits": 0.18181806802749634, + "loss/reg": 0.005165606737136841, + "step": 647 + }, + { + "epoch": 0.081, + "grad_norm": 3.3614838123321533, + "grad_norm_var": 0.40888510034567177, + "learning_rate": 0.0001, + "loss": 1.1832, + "loss/crossentropy": 2.758859157562256, + "loss/hidden": 0.97265625, + "loss/logits": 0.15888546407222748, + "loss/reg": 0.005163096822798252, + "step": 648 + }, + { + "epoch": 0.081125, + "grad_norm": 3.84016752243042, + "grad_norm_var": 0.46893935653157826, + "learning_rate": 0.0001, + "loss": 1.1826, + "loss/crossentropy": 2.6556475162506104, + "loss/hidden": 0.91796875, + "loss/logits": 0.21306458115577698, + "loss/reg": 0.00516059435904026, + "step": 649 + }, + { + "epoch": 0.08125, + "grad_norm": 3.2409677505493164, + "grad_norm_var": 0.45558605122396995, + "learning_rate": 0.0001, + "loss": 1.3848, + "loss/crossentropy": 2.2988944053649902, + "loss/hidden": 1.078125, + "loss/logits": 0.25512105226516724, + "loss/reg": 0.005158509127795696, + "step": 650 + }, + { + "epoch": 0.081375, + "grad_norm": 5.813977241516113, + "grad_norm_var": 0.9294389115085245, + "learning_rate": 0.0001, + "loss": 1.3714, + "loss/crossentropy": 2.1522276401519775, + "loss/hidden": 1.09375, + "loss/logits": 0.22606953978538513, + "loss/reg": 0.005156443454325199, + "step": 651 + }, + { + "epoch": 0.0815, + "grad_norm": 3.5314903259277344, + "grad_norm_var": 0.8898375889113737, + "learning_rate": 0.0001, + "loss": 1.2683, + "loss/crossentropy": 2.523782253265381, + "loss/hidden": 1.015625, + "loss/logits": 0.2011091113090515, + "loss/reg": 0.00515406858175993, + "step": 652 + }, + { + "epoch": 0.081625, + "grad_norm": 2.5944650173187256, + "grad_norm_var": 0.8761365904132077, + "learning_rate": 0.0001, + "loss": 1.1349, + "loss/crossentropy": 2.3588359355926514, + "loss/hidden": 0.921875, + "loss/logits": 0.16150620579719543, + "loss/reg": 0.005151691380888224, + "step": 653 + }, + { + "epoch": 0.08175, + "grad_norm": 3.0321786403656006, + "grad_norm_var": 0.7997344400314499, + "learning_rate": 0.0001, + "loss": 1.2751, + "loss/crossentropy": 2.35185170173645, + "loss/hidden": 1.046875, + "loss/logits": 0.17669130861759186, + "loss/reg": 0.005149615928530693, + "step": 654 + }, + { + "epoch": 0.081875, + "grad_norm": 3.2158820629119873, + "grad_norm_var": 0.7154026962141908, + "learning_rate": 0.0001, + "loss": 1.0974, + "loss/crossentropy": 2.809157133102417, + "loss/hidden": 0.8984375, + "loss/logits": 0.14747856557369232, + "loss/reg": 0.005147217772901058, + "step": 655 + }, + { + "epoch": 0.082, + "grad_norm": 2.1148674488067627, + "grad_norm_var": 0.8010662785463902, + "learning_rate": 0.0001, + "loss": 1.056, + "loss/crossentropy": 2.5246095657348633, + "loss/hidden": 0.84765625, + "loss/logits": 0.1568629890680313, + "loss/reg": 0.005145091563463211, + "step": 656 + }, + { + "epoch": 0.082125, + "grad_norm": 2.541887044906616, + "grad_norm_var": 0.8402323056924367, + "learning_rate": 0.0001, + "loss": 1.2087, + "loss/crossentropy": 2.3598315715789795, + "loss/hidden": 1.0, + "loss/logits": 0.15723757445812225, + "loss/reg": 0.005142755340784788, + "step": 657 + }, + { + "epoch": 0.08225, + "grad_norm": 2.292616605758667, + "grad_norm_var": 0.8574455385669723, + "learning_rate": 0.0001, + "loss": 1.0515, + "loss/crossentropy": 2.5242886543273926, + "loss/hidden": 0.84375, + "loss/logits": 0.15634778141975403, + "loss/reg": 0.005140629597008228, + "step": 658 + }, + { + "epoch": 0.082375, + "grad_norm": 3.6106507778167725, + "grad_norm_var": 0.8596278513525417, + "learning_rate": 0.0001, + "loss": 1.3178, + "loss/crossentropy": 2.6528077125549316, + "loss/hidden": 1.09375, + "loss/logits": 0.17268945276737213, + "loss/reg": 0.005138530861586332, + "step": 659 + }, + { + "epoch": 0.0825, + "grad_norm": 2.4270260334014893, + "grad_norm_var": 0.7677345681069748, + "learning_rate": 0.0001, + "loss": 1.072, + "loss/crossentropy": 2.5859150886535645, + "loss/hidden": 0.875, + "loss/logits": 0.14567336440086365, + "loss/reg": 0.005136391613632441, + "step": 660 + }, + { + "epoch": 0.082625, + "grad_norm": 10.746210098266602, + "grad_norm_var": 4.3533807562107985, + "learning_rate": 0.0001, + "loss": 1.3555, + "loss/crossentropy": 2.6105966567993164, + "loss/hidden": 1.09375, + "loss/logits": 0.21044138073921204, + "loss/reg": 0.005134167615324259, + "step": 661 + }, + { + "epoch": 0.08275, + "grad_norm": 2.277845621109009, + "grad_norm_var": 4.3908370843377496, + "learning_rate": 0.0001, + "loss": 1.1244, + "loss/crossentropy": 2.624403715133667, + "loss/hidden": 0.9140625, + "loss/logits": 0.15903490781784058, + "loss/reg": 0.005131968762725592, + "step": 662 + }, + { + "epoch": 0.082875, + "grad_norm": 2.2439072132110596, + "grad_norm_var": 4.510992587376572, + "learning_rate": 0.0001, + "loss": 1.1974, + "loss/crossentropy": 2.325004816055298, + "loss/hidden": 0.96875, + "loss/logits": 0.1773640513420105, + "loss/reg": 0.005129888188093901, + "step": 663 + }, + { + "epoch": 0.083, + "grad_norm": 2.5274457931518555, + "grad_norm_var": 4.57602786514919, + "learning_rate": 0.0001, + "loss": 1.0481, + "loss/crossentropy": 2.7819478511810303, + "loss/hidden": 0.8515625, + "loss/logits": 0.1452445089817047, + "loss/reg": 0.005127874203026295, + "step": 664 + }, + { + "epoch": 0.083125, + "grad_norm": 2.381653308868408, + "grad_norm_var": 4.643456939433319, + "learning_rate": 0.0001, + "loss": 1.0594, + "loss/crossentropy": 2.476351737976074, + "loss/hidden": 0.859375, + "loss/logits": 0.14875781536102295, + "loss/reg": 0.005125833675265312, + "step": 665 + }, + { + "epoch": 0.08325, + "grad_norm": 2.565531015396118, + "grad_norm_var": 4.687379253455119, + "learning_rate": 0.0001, + "loss": 1.092, + "loss/crossentropy": 2.164882183074951, + "loss/hidden": 0.88671875, + "loss/logits": 0.15401628613471985, + "loss/reg": 0.005123757291585207, + "step": 666 + }, + { + "epoch": 0.083375, + "grad_norm": 2.686464309692383, + "grad_norm_var": 4.279508443254811, + "learning_rate": 0.0001, + "loss": 1.1857, + "loss/crossentropy": 2.49807071685791, + "loss/hidden": 0.9609375, + "loss/logits": 0.17354023456573486, + "loss/reg": 0.005121580790728331, + "step": 667 + }, + { + "epoch": 0.0835, + "grad_norm": 2.211970806121826, + "grad_norm_var": 4.325501093334068, + "learning_rate": 0.0001, + "loss": 0.9944, + "loss/crossentropy": 2.3563051223754883, + "loss/hidden": 0.80078125, + "loss/logits": 0.14245735108852386, + "loss/reg": 0.005119378212839365, + "step": 668 + }, + { + "epoch": 0.083625, + "grad_norm": 40.70305252075195, + "grad_norm_var": 92.56442532718542, + "learning_rate": 0.0001, + "loss": 1.1762, + "loss/crossentropy": 2.488436460494995, + "loss/hidden": 0.9453125, + "loss/logits": 0.1797066330909729, + "loss/reg": 0.0051171439699828625, + "step": 669 + }, + { + "epoch": 0.08375, + "grad_norm": 2.748046875, + "grad_norm_var": 92.66196615048598, + "learning_rate": 0.0001, + "loss": 1.1097, + "loss/crossentropy": 2.310842990875244, + "loss/hidden": 0.91796875, + "loss/logits": 0.14054188132286072, + "loss/reg": 0.0051149362698197365, + "step": 670 + }, + { + "epoch": 0.083875, + "grad_norm": 3.018019199371338, + "grad_norm_var": 92.72350960683757, + "learning_rate": 0.0001, + "loss": 1.1882, + "loss/crossentropy": 2.5257725715637207, + "loss/hidden": 0.9921875, + "loss/logits": 0.144926518201828, + "loss/reg": 0.005112735088914633, + "step": 671 + }, + { + "epoch": 0.084, + "grad_norm": 3.2833805084228516, + "grad_norm_var": 92.29023014918404, + "learning_rate": 0.0001, + "loss": 1.3506, + "loss/crossentropy": 2.737273693084717, + "loss/hidden": 1.0703125, + "loss/logits": 0.22916561365127563, + "loss/reg": 0.005110514350235462, + "step": 672 + }, + { + "epoch": 0.084125, + "grad_norm": 3.5235512256622314, + "grad_norm_var": 91.96110241564834, + "learning_rate": 0.0001, + "loss": 1.1667, + "loss/crossentropy": 2.603550434112549, + "loss/hidden": 0.97265625, + "loss/logits": 0.14292669296264648, + "loss/reg": 0.00510829733684659, + "step": 673 + }, + { + "epoch": 0.08425, + "grad_norm": 8.889531135559082, + "grad_norm_var": 91.7913062331738, + "learning_rate": 0.0001, + "loss": 1.6779, + "loss/crossentropy": 2.7345638275146484, + "loss/hidden": 1.25, + "loss/logits": 0.37679582834243774, + "loss/reg": 0.005105969030410051, + "step": 674 + }, + { + "epoch": 0.084375, + "grad_norm": 3.833319664001465, + "grad_norm_var": 91.72375618009856, + "learning_rate": 0.0001, + "loss": 1.1576, + "loss/crossentropy": 2.670295000076294, + "loss/hidden": 0.91015625, + "loss/logits": 0.19638602435588837, + "loss/reg": 0.005103633739054203, + "step": 675 + }, + { + "epoch": 0.0845, + "grad_norm": 13.332308769226074, + "grad_norm_var": 93.95525708687954, + "learning_rate": 0.0001, + "loss": 1.2802, + "loss/crossentropy": 2.4972240924835205, + "loss/hidden": 1.046875, + "loss/logits": 0.18228942155838013, + "loss/reg": 0.005101518705487251, + "step": 676 + }, + { + "epoch": 0.084625, + "grad_norm": 3.3041481971740723, + "grad_norm_var": 93.38769696489507, + "learning_rate": 0.0001, + "loss": 1.1323, + "loss/crossentropy": 2.318303108215332, + "loss/hidden": 0.92578125, + "loss/logits": 0.15550082921981812, + "loss/reg": 0.005099330097436905, + "step": 677 + }, + { + "epoch": 0.08475, + "grad_norm": 3.0213379859924316, + "grad_norm_var": 93.03138783085473, + "learning_rate": 0.0001, + "loss": 1.2046, + "loss/crossentropy": 2.330695629119873, + "loss/hidden": 0.921875, + "loss/logits": 0.23171411454677582, + "loss/reg": 0.005097060929983854, + "step": 678 + }, + { + "epoch": 0.084875, + "grad_norm": 2.5998549461364746, + "grad_norm_var": 92.84836678832802, + "learning_rate": 0.0001, + "loss": 1.0649, + "loss/crossentropy": 2.699117422103882, + "loss/hidden": 0.859375, + "loss/logits": 0.15458270907402039, + "loss/reg": 0.005094949621707201, + "step": 679 + }, + { + "epoch": 0.085, + "grad_norm": 2.244635581970215, + "grad_norm_var": 92.99521966737969, + "learning_rate": 0.0001, + "loss": 1.1168, + "loss/crossentropy": 2.572654962539673, + "loss/hidden": 0.90234375, + "loss/logits": 0.1635233759880066, + "loss/reg": 0.005092862527817488, + "step": 680 + }, + { + "epoch": 0.085125, + "grad_norm": 2.6163716316223145, + "grad_norm_var": 92.87692169982786, + "learning_rate": 0.0001, + "loss": 1.1651, + "loss/crossentropy": 2.735013961791992, + "loss/hidden": 0.93359375, + "loss/logits": 0.18059232831001282, + "loss/reg": 0.005090588703751564, + "step": 681 + }, + { + "epoch": 0.08525, + "grad_norm": 2.5760252475738525, + "grad_norm_var": 92.8717223043897, + "learning_rate": 0.0001, + "loss": 0.9637, + "loss/crossentropy": 2.4715609550476074, + "loss/hidden": 0.7890625, + "loss/logits": 0.1237054169178009, + "loss/reg": 0.0050884694792330265, + "step": 682 + }, + { + "epoch": 0.085375, + "grad_norm": 2.3052964210510254, + "grad_norm_var": 93.06379073504952, + "learning_rate": 0.0001, + "loss": 1.1353, + "loss/crossentropy": 2.2887136936187744, + "loss/hidden": 0.92578125, + "loss/logits": 0.15868628025054932, + "loss/reg": 0.005086386110633612, + "step": 683 + }, + { + "epoch": 0.0855, + "grad_norm": 2.239668130874634, + "grad_norm_var": 93.04887766727983, + "learning_rate": 0.0001, + "loss": 1.033, + "loss/crossentropy": 2.420260429382324, + "loss/hidden": 0.84765625, + "loss/logits": 0.13450753688812256, + "loss/reg": 0.005084337200969458, + "step": 684 + }, + { + "epoch": 0.085625, + "grad_norm": 2.7788403034210205, + "grad_norm_var": 8.800650863003963, + "learning_rate": 0.0001, + "loss": 1.2414, + "loss/crossentropy": 2.505138397216797, + "loss/hidden": 1.03125, + "loss/logits": 0.15931686758995056, + "loss/reg": 0.005082385148853064, + "step": 685 + }, + { + "epoch": 0.08575, + "grad_norm": 2.274430513381958, + "grad_norm_var": 8.887076805039055, + "learning_rate": 0.0001, + "loss": 1.0324, + "loss/crossentropy": 2.24831485748291, + "loss/hidden": 0.83984375, + "loss/logits": 0.14174425601959229, + "loss/reg": 0.00508028594776988, + "step": 686 + }, + { + "epoch": 0.085875, + "grad_norm": 2.8570923805236816, + "grad_norm_var": 8.906869950057784, + "learning_rate": 0.0001, + "loss": 1.2426, + "loss/crossentropy": 2.1708295345306396, + "loss/hidden": 1.0078125, + "loss/logits": 0.18402233719825745, + "loss/reg": 0.005078236572444439, + "step": 687 + }, + { + "epoch": 0.086, + "grad_norm": 2.6168949604034424, + "grad_norm_var": 8.985428302339566, + "learning_rate": 0.0001, + "loss": 1.1911, + "loss/crossentropy": 2.4759016036987305, + "loss/hidden": 0.97265625, + "loss/logits": 0.1677204668521881, + "loss/reg": 0.005076236091554165, + "step": 688 + }, + { + "epoch": 0.086125, + "grad_norm": 2.738102674484253, + "grad_norm_var": 9.054334077972502, + "learning_rate": 0.0001, + "loss": 1.1386, + "loss/crossentropy": 2.4160187244415283, + "loss/hidden": 0.94140625, + "loss/logits": 0.1464519500732422, + "loss/reg": 0.005074144806712866, + "step": 689 + }, + { + "epoch": 0.08625, + "grad_norm": 2.7573044300079346, + "grad_norm_var": 7.214004841898908, + "learning_rate": 0.0001, + "loss": 1.1327, + "loss/crossentropy": 2.342536449432373, + "loss/hidden": 0.9296875, + "loss/logits": 0.15226896107196808, + "loss/reg": 0.0050718653947114944, + "step": 690 + }, + { + "epoch": 0.086375, + "grad_norm": 2.4906835556030273, + "grad_norm_var": 7.245694276683736, + "learning_rate": 0.0001, + "loss": 1.043, + "loss/crossentropy": 2.6861307621002197, + "loss/hidden": 0.84765625, + "loss/logits": 0.14462026953697205, + "loss/reg": 0.005069798789918423, + "step": 691 + }, + { + "epoch": 0.0865, + "grad_norm": 2.3750221729278564, + "grad_norm_var": 0.08836772564593408, + "learning_rate": 0.0001, + "loss": 1.1331, + "loss/crossentropy": 2.5460920333862305, + "loss/hidden": 0.91796875, + "loss/logits": 0.1644265055656433, + "loss/reg": 0.0050675952807068825, + "step": 692 + }, + { + "epoch": 0.086625, + "grad_norm": 2.2382612228393555, + "grad_norm_var": 0.06104096205675281, + "learning_rate": 0.0001, + "loss": 1.1182, + "loss/crossentropy": 2.5386240482330322, + "loss/hidden": 0.9140625, + "loss/logits": 0.15351390838623047, + "loss/reg": 0.005065726116299629, + "step": 693 + }, + { + "epoch": 0.08675, + "grad_norm": 2.582509994506836, + "grad_norm_var": 0.04524178053572901, + "learning_rate": 0.0001, + "loss": 1.1349, + "loss/crossentropy": 2.5054309368133545, + "loss/hidden": 0.92578125, + "loss/logits": 0.15845400094985962, + "loss/reg": 0.00506393238902092, + "step": 694 + }, + { + "epoch": 0.086875, + "grad_norm": 3.3852474689483643, + "grad_norm_var": 0.09234654068112012, + "learning_rate": 0.0001, + "loss": 1.375, + "loss/crossentropy": 2.463137626647949, + "loss/hidden": 1.09375, + "loss/logits": 0.23065921664237976, + "loss/reg": 0.005062177777290344, + "step": 695 + }, + { + "epoch": 0.087, + "grad_norm": 2.7022039890289307, + "grad_norm_var": 0.085748197103725, + "learning_rate": 0.0001, + "loss": 1.1825, + "loss/crossentropy": 2.2784640789031982, + "loss/hidden": 0.953125, + "loss/logits": 0.17874625325202942, + "loss/reg": 0.005060084629803896, + "step": 696 + }, + { + "epoch": 0.087125, + "grad_norm": 3.218095064163208, + "grad_norm_var": 0.11002230581329756, + "learning_rate": 0.0001, + "loss": 1.4532, + "loss/crossentropy": 2.372589111328125, + "loss/hidden": 1.1875, + "loss/logits": 0.21512824296951294, + "loss/reg": 0.005058267153799534, + "step": 697 + }, + { + "epoch": 0.08725, + "grad_norm": 2.2941925525665283, + "grad_norm_var": 0.11714567363764346, + "learning_rate": 0.0001, + "loss": 1.2103, + "loss/crossentropy": 2.2349698543548584, + "loss/hidden": 0.99609375, + "loss/logits": 0.16368569433689117, + "loss/reg": 0.005056225229054689, + "step": 698 + }, + { + "epoch": 0.087375, + "grad_norm": 2.4463765621185303, + "grad_norm_var": 0.11254763430842919, + "learning_rate": 0.0001, + "loss": 1.0284, + "loss/crossentropy": 2.49548077583313, + "loss/hidden": 0.84375, + "loss/logits": 0.13410091400146484, + "loss/reg": 0.005054513458162546, + "step": 699 + }, + { + "epoch": 0.0875, + "grad_norm": 2.5363550186157227, + "grad_norm_var": 0.1028185685472406, + "learning_rate": 0.0001, + "loss": 0.9756, + "loss/crossentropy": 2.900705099105835, + "loss/hidden": 0.7890625, + "loss/logits": 0.13605856895446777, + "loss/reg": 0.005052678752690554, + "step": 700 + }, + { + "epoch": 0.087625, + "grad_norm": 3.4383292198181152, + "grad_norm_var": 0.1419262550473822, + "learning_rate": 0.0001, + "loss": 1.2789, + "loss/crossentropy": 2.239861488342285, + "loss/hidden": 1.0390625, + "loss/logits": 0.18931907415390015, + "loss/reg": 0.005051023792475462, + "step": 701 + }, + { + "epoch": 0.08775, + "grad_norm": 2.1923646926879883, + "grad_norm_var": 0.14683359089866196, + "learning_rate": 0.0001, + "loss": 1.0723, + "loss/crossentropy": 2.5594210624694824, + "loss/hidden": 0.8515625, + "loss/logits": 0.17024339735507965, + "loss/reg": 0.005049179773777723, + "step": 702 + }, + { + "epoch": 0.087875, + "grad_norm": 2.492584466934204, + "grad_norm_var": 0.1464975365420211, + "learning_rate": 0.0001, + "loss": 1.0832, + "loss/crossentropy": 2.549513339996338, + "loss/hidden": 0.890625, + "loss/logits": 0.1421511173248291, + "loss/reg": 0.005047108978033066, + "step": 703 + }, + { + "epoch": 0.088, + "grad_norm": 3.04917311668396, + "grad_norm_var": 0.15589194049567348, + "learning_rate": 0.0001, + "loss": 0.9286, + "loss/crossentropy": 2.3838376998901367, + "loss/hidden": 0.7578125, + "loss/logits": 0.12035049498081207, + "loss/reg": 0.005045315716415644, + "step": 704 + }, + { + "epoch": 0.088125, + "grad_norm": 3.7284188270568848, + "grad_norm_var": 0.22439052206896856, + "learning_rate": 0.0001, + "loss": 1.2283, + "loss/crossentropy": 2.4002139568328857, + "loss/hidden": 1.0, + "loss/logits": 0.17786875367164612, + "loss/reg": 0.005043353885412216, + "step": 705 + }, + { + "epoch": 0.08825, + "grad_norm": 2.3665406703948975, + "grad_norm_var": 0.2333161514143832, + "learning_rate": 0.0001, + "loss": 1.0864, + "loss/crossentropy": 2.2537431716918945, + "loss/hidden": 0.88671875, + "loss/logits": 0.14927825331687927, + "loss/reg": 0.005041591357439756, + "step": 706 + }, + { + "epoch": 0.088375, + "grad_norm": 2.461461067199707, + "grad_norm_var": 0.23426700013735413, + "learning_rate": 0.0001, + "loss": 0.9665, + "loss/crossentropy": 2.267686605453491, + "loss/hidden": 0.78125, + "loss/logits": 0.1348324567079544, + "loss/reg": 0.005039647221565247, + "step": 707 + }, + { + "epoch": 0.0885, + "grad_norm": 2.219465494155884, + "grad_norm_var": 0.24291783945608714, + "learning_rate": 0.0001, + "loss": 1.0682, + "loss/crossentropy": 2.5170199871063232, + "loss/hidden": 0.8828125, + "loss/logits": 0.13497616350650787, + "loss/reg": 0.0050375694409012794, + "step": 708 + }, + { + "epoch": 0.088625, + "grad_norm": 2.5682785511016846, + "grad_norm_var": 0.22899036593855726, + "learning_rate": 0.0001, + "loss": 1.1712, + "loss/crossentropy": 2.3696398735046387, + "loss/hidden": 0.95703125, + "loss/logits": 0.16378942131996155, + "loss/reg": 0.00503552844747901, + "step": 709 + }, + { + "epoch": 0.08875, + "grad_norm": 2.2680654525756836, + "grad_norm_var": 0.2413579176160397, + "learning_rate": 0.0001, + "loss": 1.1427, + "loss/crossentropy": 2.4121482372283936, + "loss/hidden": 0.9296875, + "loss/logits": 0.16271916031837463, + "loss/reg": 0.005033775232732296, + "step": 710 + }, + { + "epoch": 0.088875, + "grad_norm": 7.707209587097168, + "grad_norm_var": 1.7976793028734939, + "learning_rate": 0.0001, + "loss": 1.4757, + "loss/crossentropy": 2.64532470703125, + "loss/hidden": 1.1953125, + "loss/logits": 0.23005220293998718, + "loss/reg": 0.005031922832131386, + "step": 711 + }, + { + "epoch": 0.089, + "grad_norm": 2.4962596893310547, + "grad_norm_var": 1.8079738281494115, + "learning_rate": 0.0001, + "loss": 1.1922, + "loss/crossentropy": 2.481624126434326, + "loss/hidden": 0.96875, + "loss/logits": 0.17317567765712738, + "loss/reg": 0.005030201282352209, + "step": 712 + }, + { + "epoch": 0.089125, + "grad_norm": 2.164900779724121, + "grad_norm_var": 1.842137749294079, + "learning_rate": 0.0001, + "loss": 1.064, + "loss/crossentropy": 2.228675365447998, + "loss/hidden": 0.85546875, + "loss/logits": 0.1582651436328888, + "loss/reg": 0.005028109531849623, + "step": 713 + }, + { + "epoch": 0.08925, + "grad_norm": 2.5871829986572266, + "grad_norm_var": 1.8237636675870703, + "learning_rate": 0.0001, + "loss": 1.1265, + "loss/crossentropy": 2.4661970138549805, + "loss/hidden": 0.9140625, + "loss/logits": 0.16222231090068817, + "loss/reg": 0.005026375409215689, + "step": 714 + }, + { + "epoch": 0.089375, + "grad_norm": 2.6158599853515625, + "grad_norm_var": 1.814851924792763, + "learning_rate": 0.0001, + "loss": 1.2786, + "loss/crossentropy": 2.4959585666656494, + "loss/hidden": 1.0390625, + "loss/logits": 0.18929770588874817, + "loss/reg": 0.005024294834583998, + "step": 715 + }, + { + "epoch": 0.0895, + "grad_norm": 1.9927250146865845, + "grad_norm_var": 1.8619121365324653, + "learning_rate": 0.0001, + "loss": 1.0324, + "loss/crossentropy": 2.471590518951416, + "loss/hidden": 0.828125, + "loss/logits": 0.15405681729316711, + "loss/reg": 0.005022158846259117, + "step": 716 + }, + { + "epoch": 0.089625, + "grad_norm": 2.2087814807891846, + "grad_norm_var": 1.8676209281098621, + "learning_rate": 0.0001, + "loss": 1.0376, + "loss/crossentropy": 2.4889180660247803, + "loss/hidden": 0.83984375, + "loss/logits": 0.14751726388931274, + "loss/reg": 0.005020026583224535, + "step": 717 + }, + { + "epoch": 0.08975, + "grad_norm": 2.185058116912842, + "grad_norm_var": 1.8682356690613582, + "learning_rate": 0.0001, + "loss": 1.1229, + "loss/crossentropy": 2.4008331298828125, + "loss/hidden": 0.921875, + "loss/logits": 0.15089130401611328, + "loss/reg": 0.005017881281673908, + "step": 718 + }, + { + "epoch": 0.089875, + "grad_norm": 2.22664737701416, + "grad_norm_var": 1.8842476127138506, + "learning_rate": 0.0001, + "loss": 0.9793, + "loss/crossentropy": 2.6462786197662354, + "loss/hidden": 0.7890625, + "loss/logits": 0.14010348916053772, + "loss/reg": 0.0050159962847828865, + "step": 719 + }, + { + "epoch": 0.09, + "grad_norm": 7.475221633911133, + "grad_norm_var": 3.2539659864593964, + "learning_rate": 0.0001, + "loss": 1.2258, + "loss/crossentropy": 2.450388193130493, + "loss/hidden": 1.015625, + "loss/logits": 0.16006067395210266, + "loss/reg": 0.005013884510844946, + "step": 720 + }, + { + "epoch": 0.090125, + "grad_norm": 2.1259288787841797, + "grad_norm_var": 3.275813935195105, + "learning_rate": 0.0001, + "loss": 1.1558, + "loss/crossentropy": 2.3211934566497803, + "loss/hidden": 0.9375, + "loss/logits": 0.1681801825761795, + "loss/reg": 0.0050118486396968365, + "step": 721 + }, + { + "epoch": 0.09025, + "grad_norm": 3.284715414047241, + "grad_norm_var": 3.2534822002252284, + "learning_rate": 0.0001, + "loss": 1.227, + "loss/crossentropy": 2.3303604125976562, + "loss/hidden": 1.0, + "loss/logits": 0.17691665887832642, + "loss/reg": 0.005009867250919342, + "step": 722 + }, + { + "epoch": 0.090375, + "grad_norm": 2.481712818145752, + "grad_norm_var": 3.2519544593852943, + "learning_rate": 0.0001, + "loss": 1.0745, + "loss/crossentropy": 2.516172409057617, + "loss/hidden": 0.86328125, + "loss/logits": 0.1610938012599945, + "loss/reg": 0.005008057691156864, + "step": 723 + }, + { + "epoch": 0.0905, + "grad_norm": 2.6934256553649902, + "grad_norm_var": 3.2142672637689955, + "learning_rate": 0.0001, + "loss": 1.0241, + "loss/crossentropy": 2.4445412158966064, + "loss/hidden": 0.83203125, + "loss/logits": 0.14205417037010193, + "loss/reg": 0.0050062634982168674, + "step": 724 + }, + { + "epoch": 0.090625, + "grad_norm": 2.8393290042877197, + "grad_norm_var": 3.2008126847008653, + "learning_rate": 0.0001, + "loss": 1.4807, + "loss/crossentropy": 2.155627489089966, + "loss/hidden": 1.203125, + "loss/logits": 0.22749567031860352, + "loss/reg": 0.005004186183214188, + "step": 725 + }, + { + "epoch": 0.09075, + "grad_norm": 2.7673983573913574, + "grad_norm_var": 3.16203540734179, + "learning_rate": 0.0001, + "loss": 1.0853, + "loss/crossentropy": 2.841604709625244, + "loss/hidden": 0.859375, + "loss/logits": 0.17592039704322815, + "loss/reg": 0.005002181977033615, + "step": 726 + }, + { + "epoch": 0.090875, + "grad_norm": 2.4957730770111084, + "grad_norm_var": 1.6690794582387851, + "learning_rate": 0.0001, + "loss": 1.0616, + "loss/crossentropy": 2.4109010696411133, + "loss/hidden": 0.85546875, + "loss/logits": 0.15614046156406403, + "loss/reg": 0.005000332836061716, + "step": 727 + }, + { + "epoch": 0.091, + "grad_norm": 2.2517614364624023, + "grad_norm_var": 1.6823934112280023, + "learning_rate": 0.0001, + "loss": 0.9522, + "loss/crossentropy": 2.5019116401672363, + "loss/hidden": 0.77734375, + "loss/logits": 0.12488029897212982, + "loss/reg": 0.004998230375349522, + "step": 728 + }, + { + "epoch": 0.091125, + "grad_norm": 3.1151885986328125, + "grad_norm_var": 1.6615595314428224, + "learning_rate": 0.0001, + "loss": 1.3341, + "loss/crossentropy": 2.0731663703918457, + "loss/hidden": 1.140625, + "loss/logits": 0.1435013860464096, + "loss/reg": 0.00499630905687809, + "step": 729 + }, + { + "epoch": 0.09125, + "grad_norm": 2.27622127532959, + "grad_norm_var": 1.6778435468635766, + "learning_rate": 0.0001, + "loss": 0.9945, + "loss/crossentropy": 2.5911362171173096, + "loss/hidden": 0.8046875, + "loss/logits": 0.13988272845745087, + "loss/reg": 0.004994215443730354, + "step": 730 + }, + { + "epoch": 0.091375, + "grad_norm": 2.634037971496582, + "grad_norm_var": 1.6773821814765582, + "learning_rate": 0.0001, + "loss": 1.0821, + "loss/crossentropy": 2.217550754547119, + "loss/hidden": 0.8828125, + "loss/logits": 0.14941135048866272, + "loss/reg": 0.004992038011550903, + "step": 731 + }, + { + "epoch": 0.0915, + "grad_norm": 3.696157693862915, + "grad_norm_var": 1.6717809998289452, + "learning_rate": 0.0001, + "loss": 1.4666, + "loss/crossentropy": 2.527979850769043, + "loss/hidden": 1.1796875, + "loss/logits": 0.23702046275138855, + "loss/reg": 0.004989837761968374, + "step": 732 + }, + { + "epoch": 0.091625, + "grad_norm": 2.2505931854248047, + "grad_norm_var": 1.6679122787177638, + "learning_rate": 0.0001, + "loss": 1.1594, + "loss/crossentropy": 2.539890766143799, + "loss/hidden": 0.93359375, + "loss/logits": 0.17588791251182556, + "loss/reg": 0.004987762775272131, + "step": 733 + }, + { + "epoch": 0.09175, + "grad_norm": 2.468350887298584, + "grad_norm_var": 1.6449808034713405, + "learning_rate": 0.0001, + "loss": 0.9491, + "loss/crossentropy": 2.3645944595336914, + "loss/hidden": 0.77734375, + "loss/logits": 0.12187166512012482, + "loss/reg": 0.004985733889043331, + "step": 734 + }, + { + "epoch": 0.091875, + "grad_norm": 2.4574854373931885, + "grad_norm_var": 1.6262736490095788, + "learning_rate": 0.0001, + "loss": 0.9553, + "loss/crossentropy": 2.5622363090515137, + "loss/hidden": 0.78515625, + "loss/logits": 0.12034176290035248, + "loss/reg": 0.004983709193766117, + "step": 735 + }, + { + "epoch": 0.092, + "grad_norm": 2.2961714267730713, + "grad_norm_var": 0.18272698620191838, + "learning_rate": 0.0001, + "loss": 0.9857, + "loss/crossentropy": 2.65541934967041, + "loss/hidden": 0.8046875, + "loss/logits": 0.13119381666183472, + "loss/reg": 0.004981704521924257, + "step": 736 + }, + { + "epoch": 0.092125, + "grad_norm": 1.9556196928024292, + "grad_norm_var": 0.19606320022039506, + "learning_rate": 0.0001, + "loss": 1.1183, + "loss/crossentropy": 2.4507250785827637, + "loss/hidden": 0.90625, + "loss/logits": 0.16227680444717407, + "loss/reg": 0.004979623947292566, + "step": 737 + }, + { + "epoch": 0.09225, + "grad_norm": 4.161533832550049, + "grad_norm_var": 0.32150407886374516, + "learning_rate": 0.0001, + "loss": 1.2986, + "loss/crossentropy": 2.07208514213562, + "loss/hidden": 1.0703125, + "loss/logits": 0.17855030298233032, + "loss/reg": 0.0049775131046772, + "step": 738 + }, + { + "epoch": 0.092375, + "grad_norm": 3.0142910480499268, + "grad_norm_var": 0.3253252453994368, + "learning_rate": 0.0001, + "loss": 0.9659, + "loss/crossentropy": 2.569051742553711, + "loss/hidden": 0.78515625, + "loss/logits": 0.13097813725471497, + "loss/reg": 0.004975371062755585, + "step": 739 + }, + { + "epoch": 0.0925, + "grad_norm": 2.2635934352874756, + "grad_norm_var": 0.33787014856401254, + "learning_rate": 0.0001, + "loss": 1.0329, + "loss/crossentropy": 2.4809088706970215, + "loss/hidden": 0.83203125, + "loss/logits": 0.15113815665245056, + "loss/reg": 0.004973322618752718, + "step": 740 + }, + { + "epoch": 0.092625, + "grad_norm": 2.633608818054199, + "grad_norm_var": 0.33625377709689125, + "learning_rate": 0.0001, + "loss": 1.1759, + "loss/crossentropy": 2.336703062057495, + "loss/hidden": 0.953125, + "loss/logits": 0.17310968041419983, + "loss/reg": 0.004971369635313749, + "step": 741 + }, + { + "epoch": 0.09275, + "grad_norm": 2.431776285171509, + "grad_norm_var": 0.3389851198561174, + "learning_rate": 0.0001, + "loss": 1.0201, + "loss/crossentropy": 2.4961469173431396, + "loss/hidden": 0.8203125, + "loss/logits": 0.15009689331054688, + "loss/reg": 0.004969437140971422, + "step": 742 + }, + { + "epoch": 0.092875, + "grad_norm": 2.756232976913452, + "grad_norm_var": 0.3378643921182785, + "learning_rate": 0.0001, + "loss": 1.0276, + "loss/crossentropy": 2.4552102088928223, + "loss/hidden": 0.8515625, + "loss/logits": 0.12634404003620148, + "loss/reg": 0.0049674008041620255, + "step": 743 + }, + { + "epoch": 0.093, + "grad_norm": 2.5648443698883057, + "grad_norm_var": 0.32668128102186145, + "learning_rate": 0.0001, + "loss": 1.14, + "loss/crossentropy": 2.811657667160034, + "loss/hidden": 0.93359375, + "loss/logits": 0.15678739547729492, + "loss/reg": 0.0049654701724648476, + "step": 744 + }, + { + "epoch": 0.093125, + "grad_norm": 2.283196210861206, + "grad_norm_var": 0.32233157119037936, + "learning_rate": 0.0001, + "loss": 1.0877, + "loss/crossentropy": 2.3052849769592285, + "loss/hidden": 0.8828125, + "loss/logits": 0.1552238166332245, + "loss/reg": 0.004963380750268698, + "step": 745 + }, + { + "epoch": 0.09325, + "grad_norm": 2.500383138656616, + "grad_norm_var": 0.3147792588205774, + "learning_rate": 0.0001, + "loss": 1.0794, + "loss/crossentropy": 2.6310482025146484, + "loss/hidden": 0.88671875, + "loss/logits": 0.14305052161216736, + "loss/reg": 0.0049613784067332745, + "step": 746 + }, + { + "epoch": 0.093375, + "grad_norm": 2.7470545768737793, + "grad_norm_var": 0.3153672801439085, + "learning_rate": 0.0001, + "loss": 1.1538, + "loss/crossentropy": 2.645195722579956, + "loss/hidden": 0.921875, + "loss/logits": 0.18234500288963318, + "loss/reg": 0.004959197249263525, + "step": 747 + }, + { + "epoch": 0.0935, + "grad_norm": 2.790817975997925, + "grad_norm_var": 0.24092132942115865, + "learning_rate": 0.0001, + "loss": 1.015, + "loss/crossentropy": 2.0862972736358643, + "loss/hidden": 0.8359375, + "loss/logits": 0.12952454388141632, + "loss/reg": 0.004957180004566908, + "step": 748 + }, + { + "epoch": 0.093625, + "grad_norm": 2.898916482925415, + "grad_norm_var": 0.23711979067914365, + "learning_rate": 0.0001, + "loss": 1.0498, + "loss/crossentropy": 2.522505760192871, + "loss/hidden": 0.8515625, + "loss/logits": 0.14867964386940002, + "loss/reg": 0.004955058917403221, + "step": 749 + }, + { + "epoch": 0.09375, + "grad_norm": 3.0828936100006104, + "grad_norm_var": 0.24674152232076801, + "learning_rate": 0.0001, + "loss": 1.3487, + "loss/crossentropy": 2.078137159347534, + "loss/hidden": 1.0703125, + "loss/logits": 0.2288488745689392, + "loss/reg": 0.004952888935804367, + "step": 750 + }, + { + "epoch": 0.093875, + "grad_norm": 4.464056491851807, + "grad_norm_var": 0.439550102142455, + "learning_rate": 0.0001, + "loss": 1.125, + "loss/crossentropy": 2.6811301708221436, + "loss/hidden": 0.9140625, + "loss/logits": 0.16146372258663177, + "loss/reg": 0.004950782749801874, + "step": 751 + }, + { + "epoch": 0.094, + "grad_norm": 2.9327120780944824, + "grad_norm_var": 0.4218744680947139, + "learning_rate": 0.0001, + "loss": 1.1226, + "loss/crossentropy": 2.502683639526367, + "loss/hidden": 0.91015625, + "loss/logits": 0.16296005249023438, + "loss/reg": 0.004948711488395929, + "step": 752 + }, + { + "epoch": 0.094125, + "grad_norm": 2.4569215774536133, + "grad_norm_var": 0.3782952433457505, + "learning_rate": 0.0001, + "loss": 1.1117, + "loss/crossentropy": 2.5091257095336914, + "loss/hidden": 0.90234375, + "loss/logits": 0.15988323092460632, + "loss/reg": 0.004946760833263397, + "step": 753 + }, + { + "epoch": 0.09425, + "grad_norm": 5.152282238006592, + "grad_norm_var": 0.6097367248532388, + "learning_rate": 0.0001, + "loss": 1.3929, + "loss/crossentropy": 2.2930901050567627, + "loss/hidden": 1.1484375, + "loss/logits": 0.19504866003990173, + "loss/reg": 0.004944849293678999, + "step": 754 + }, + { + "epoch": 0.094375, + "grad_norm": 2.604393243789673, + "grad_norm_var": 0.6159506323654304, + "learning_rate": 0.0001, + "loss": 1.0457, + "loss/crossentropy": 2.59228777885437, + "loss/hidden": 0.8359375, + "loss/logits": 0.16035211086273193, + "loss/reg": 0.004942973144352436, + "step": 755 + }, + { + "epoch": 0.0945, + "grad_norm": 2.197974443435669, + "grad_norm_var": 0.6218773019698792, + "learning_rate": 0.0001, + "loss": 1.0755, + "loss/crossentropy": 2.3389058113098145, + "loss/hidden": 0.8828125, + "loss/logits": 0.14329570531845093, + "loss/reg": 0.004941044840961695, + "step": 756 + }, + { + "epoch": 0.094625, + "grad_norm": 3.0741820335388184, + "grad_norm_var": 0.6180001684099087, + "learning_rate": 0.0001, + "loss": 1.4422, + "loss/crossentropy": 2.2101480960845947, + "loss/hidden": 1.1796875, + "loss/logits": 0.2130812704563141, + "loss/reg": 0.004938756115734577, + "step": 757 + }, + { + "epoch": 0.09475, + "grad_norm": 2.604829788208008, + "grad_norm_var": 0.6082914113291829, + "learning_rate": 0.0001, + "loss": 1.1533, + "loss/crossentropy": 2.431821584701538, + "loss/hidden": 0.94140625, + "loss/logits": 0.16252049803733826, + "loss/reg": 0.0049363370053470135, + "step": 758 + }, + { + "epoch": 0.094875, + "grad_norm": 2.824411630630493, + "grad_norm_var": 0.606870668349819, + "learning_rate": 0.0001, + "loss": 1.2267, + "loss/crossentropy": 2.3011207580566406, + "loss/hidden": 0.97265625, + "loss/logits": 0.20468328893184662, + "loss/reg": 0.004933919291943312, + "step": 759 + }, + { + "epoch": 0.095, + "grad_norm": 2.5809361934661865, + "grad_norm_var": 0.606063171082104, + "learning_rate": 0.0001, + "loss": 1.0216, + "loss/crossentropy": 2.5506176948547363, + "loss/hidden": 0.8359375, + "loss/logits": 0.13629919290542603, + "loss/reg": 0.004931787494570017, + "step": 760 + }, + { + "epoch": 0.095125, + "grad_norm": 4.444363117218018, + "grad_norm_var": 0.7059078117077803, + "learning_rate": 0.0001, + "loss": 1.3498, + "loss/crossentropy": 2.8235206604003906, + "loss/hidden": 1.109375, + "loss/logits": 0.191168874502182, + "loss/reg": 0.0049294959753751755, + "step": 761 + }, + { + "epoch": 0.09525, + "grad_norm": 2.169010639190674, + "grad_norm_var": 0.7385929926525419, + "learning_rate": 0.0001, + "loss": 0.9763, + "loss/crossentropy": 2.706693172454834, + "loss/hidden": 0.78515625, + "loss/logits": 0.1419064998626709, + "loss/reg": 0.004927367437630892, + "step": 762 + }, + { + "epoch": 0.095375, + "grad_norm": 2.4050183296203613, + "grad_norm_var": 0.7603640408605048, + "learning_rate": 0.0001, + "loss": 1.043, + "loss/crossentropy": 2.622105836868286, + "loss/hidden": 0.84765625, + "loss/logits": 0.1461138278245926, + "loss/reg": 0.004925237502902746, + "step": 763 + }, + { + "epoch": 0.0955, + "grad_norm": 2.4372246265411377, + "grad_norm_var": 0.7800550132454624, + "learning_rate": 0.0001, + "loss": 1.058, + "loss/crossentropy": 2.4866652488708496, + "loss/hidden": 0.86328125, + "loss/logits": 0.14548787474632263, + "loss/reg": 0.004923122003674507, + "step": 764 + }, + { + "epoch": 0.095625, + "grad_norm": 2.517997980117798, + "grad_norm_var": 0.7953055666315338, + "learning_rate": 0.0001, + "loss": 1.2213, + "loss/crossentropy": 2.3578498363494873, + "loss/hidden": 1.015625, + "loss/logits": 0.1564468890428543, + "loss/reg": 0.004920901730656624, + "step": 765 + }, + { + "epoch": 0.09575, + "grad_norm": 2.278327226638794, + "grad_norm_var": 0.8265305072858796, + "learning_rate": 0.0001, + "loss": 1.0466, + "loss/crossentropy": 2.55232310295105, + "loss/hidden": 0.84765625, + "loss/logits": 0.14976537227630615, + "loss/reg": 0.0049185301177203655, + "step": 766 + }, + { + "epoch": 0.095875, + "grad_norm": 2.337240219116211, + "grad_norm_var": 0.6789092499013821, + "learning_rate": 0.0001, + "loss": 1.1614, + "loss/crossentropy": 2.672149658203125, + "loss/hidden": 0.94921875, + "loss/logits": 0.16301041841506958, + "loss/reg": 0.004916144534945488, + "step": 767 + }, + { + "epoch": 0.096, + "grad_norm": 8.908835411071777, + "grad_norm_var": 3.005936619726153, + "learning_rate": 0.0001, + "loss": 0.9922, + "loss/crossentropy": 2.3541464805603027, + "loss/hidden": 0.8359375, + "loss/logits": 0.10713944584131241, + "loss/reg": 0.0049139889888465405, + "step": 768 + }, + { + "epoch": 0.096125, + "grad_norm": 2.1774449348449707, + "grad_norm_var": 3.0380281733161834, + "learning_rate": 0.0001, + "loss": 0.9611, + "loss/crossentropy": 2.5367493629455566, + "loss/hidden": 0.80078125, + "loss/logits": 0.11117491126060486, + "loss/reg": 0.00491185300052166, + "step": 769 + }, + { + "epoch": 0.09625, + "grad_norm": 4.010209083557129, + "grad_norm_var": 2.8176414116633506, + "learning_rate": 0.0001, + "loss": 1.6116, + "loss/crossentropy": 2.4206178188323975, + "loss/hidden": 1.2734375, + "loss/logits": 0.28903743624687195, + "loss/reg": 0.00490949209779501, + "step": 770 + }, + { + "epoch": 0.096375, + "grad_norm": 2.3479456901550293, + "grad_norm_var": 2.838639045972002, + "learning_rate": 0.0001, + "loss": 1.0775, + "loss/crossentropy": 2.3782246112823486, + "loss/hidden": 0.87109375, + "loss/logits": 0.15734228491783142, + "loss/reg": 0.004907363560050726, + "step": 771 + }, + { + "epoch": 0.0965, + "grad_norm": 2.5986974239349365, + "grad_norm_var": 2.801428785253172, + "learning_rate": 0.0001, + "loss": 1.0758, + "loss/crossentropy": 2.4752702713012695, + "loss/hidden": 0.890625, + "loss/logits": 0.13615593314170837, + "loss/reg": 0.0049048978835344315, + "step": 772 + }, + { + "epoch": 0.096625, + "grad_norm": 2.3621203899383545, + "grad_norm_var": 2.8362617972026043, + "learning_rate": 0.0001, + "loss": 0.9929, + "loss/crossentropy": 2.6385865211486816, + "loss/hidden": 0.80859375, + "loss/logits": 0.13527554273605347, + "loss/reg": 0.004902740474790335, + "step": 773 + }, + { + "epoch": 0.09675, + "grad_norm": 2.2466084957122803, + "grad_norm_var": 2.866155351424041, + "learning_rate": 0.0001, + "loss": 1.1506, + "loss/crossentropy": 2.5522897243499756, + "loss/hidden": 0.93359375, + "loss/logits": 0.1680143177509308, + "loss/reg": 0.004900622647255659, + "step": 774 + }, + { + "epoch": 0.096875, + "grad_norm": 2.6764907836914062, + "grad_norm_var": 2.871782767876315, + "learning_rate": 0.0001, + "loss": 1.1166, + "loss/crossentropy": 2.5808451175689697, + "loss/hidden": 0.90625, + "loss/logits": 0.16133888065814972, + "loss/reg": 0.004898467101156712, + "step": 775 + }, + { + "epoch": 0.097, + "grad_norm": 2.1325623989105225, + "grad_norm_var": 2.911263182397389, + "learning_rate": 0.0001, + "loss": 1.1775, + "loss/crossentropy": 2.3073110580444336, + "loss/hidden": 0.953125, + "loss/logits": 0.17539924383163452, + "loss/reg": 0.004896300844848156, + "step": 776 + }, + { + "epoch": 0.097125, + "grad_norm": 2.845750570297241, + "grad_norm_var": 2.7637895893424673, + "learning_rate": 0.0001, + "loss": 0.9791, + "loss/crossentropy": 2.452115774154663, + "loss/hidden": 0.796875, + "loss/logits": 0.13326548039913177, + "loss/reg": 0.004894034005701542, + "step": 777 + }, + { + "epoch": 0.09725, + "grad_norm": 6.139473915100098, + "grad_norm_var": 3.360390097314651, + "learning_rate": 0.0001, + "loss": 1.1467, + "loss/crossentropy": 2.44069242477417, + "loss/hidden": 0.94140625, + "loss/logits": 0.15639880299568176, + "loss/reg": 0.0048917257227003574, + "step": 778 + }, + { + "epoch": 0.097375, + "grad_norm": 2.3391048908233643, + "grad_norm_var": 3.3672209294330075, + "learning_rate": 0.0001, + "loss": 1.137, + "loss/crossentropy": 2.5613744258880615, + "loss/hidden": 0.92578125, + "loss/logits": 0.16234168410301208, + "loss/reg": 0.004889402538537979, + "step": 779 + }, + { + "epoch": 0.0975, + "grad_norm": 2.2655279636383057, + "grad_norm_var": 3.3853179937680844, + "learning_rate": 0.0001, + "loss": 1.1061, + "loss/crossentropy": 2.3578813076019287, + "loss/hidden": 0.90234375, + "loss/logits": 0.15491583943367004, + "loss/reg": 0.004887087736278772, + "step": 780 + }, + { + "epoch": 0.097625, + "grad_norm": 2.2749481201171875, + "grad_norm_var": 3.4090543314963564, + "learning_rate": 0.0001, + "loss": 1.0216, + "loss/crossentropy": 2.4622697830200195, + "loss/hidden": 0.828125, + "loss/logits": 0.14463937282562256, + "loss/reg": 0.004884790629148483, + "step": 781 + }, + { + "epoch": 0.09775, + "grad_norm": 2.891165256500244, + "grad_norm_var": 3.363644225109554, + "learning_rate": 0.0001, + "loss": 1.1747, + "loss/crossentropy": 2.5788161754608154, + "loss/hidden": 0.953125, + "loss/logits": 0.1727641224861145, + "loss/reg": 0.0048824455589056015, + "step": 782 + }, + { + "epoch": 0.097875, + "grad_norm": 2.347449541091919, + "grad_norm_var": 3.362531263350426, + "learning_rate": 0.0001, + "loss": 0.995, + "loss/crossentropy": 2.5813019275665283, + "loss/hidden": 0.8046875, + "loss/logits": 0.14150115847587585, + "loss/reg": 0.004880187567323446, + "step": 783 + }, + { + "epoch": 0.098, + "grad_norm": 3.219748020172119, + "grad_norm_var": 1.0248437110061321, + "learning_rate": 0.0001, + "loss": 1.331, + "loss/crossentropy": 2.4393255710601807, + "loss/hidden": 1.078125, + "loss/logits": 0.20414334535598755, + "loss/reg": 0.004877839703112841, + "step": 784 + }, + { + "epoch": 0.098125, + "grad_norm": 2.7481374740600586, + "grad_norm_var": 0.997469803821544, + "learning_rate": 0.0001, + "loss": 1.1664, + "loss/crossentropy": 2.856651544570923, + "loss/hidden": 0.93359375, + "loss/logits": 0.18401256203651428, + "loss/reg": 0.0048755621537566185, + "step": 785 + }, + { + "epoch": 0.09825, + "grad_norm": 3.8722689151763916, + "grad_norm_var": 0.9771433382716601, + "learning_rate": 0.0001, + "loss": 1.1632, + "loss/crossentropy": 2.7695257663726807, + "loss/hidden": 0.92578125, + "loss/logits": 0.18866363167762756, + "loss/reg": 0.004873441066592932, + "step": 786 + }, + { + "epoch": 0.098375, + "grad_norm": 2.877189874649048, + "grad_norm_var": 0.9605094695400339, + "learning_rate": 0.0001, + "loss": 1.1935, + "loss/crossentropy": 2.569603443145752, + "loss/hidden": 0.92578125, + "loss/logits": 0.21895866096019745, + "loss/reg": 0.004871242213994265, + "step": 787 + }, + { + "epoch": 0.0985, + "grad_norm": 2.430058717727661, + "grad_norm_var": 0.9682708910971911, + "learning_rate": 0.0001, + "loss": 0.9791, + "loss/crossentropy": 2.767273187637329, + "loss/hidden": 0.80078125, + "loss/logits": 0.12963160872459412, + "loss/reg": 0.004869125317782164, + "step": 788 + }, + { + "epoch": 0.098625, + "grad_norm": 2.63755202293396, + "grad_norm_var": 0.9549378382865437, + "learning_rate": 0.0001, + "loss": 0.9765, + "loss/crossentropy": 2.550255537033081, + "loss/hidden": 0.79296875, + "loss/logits": 0.13482339680194855, + "loss/reg": 0.004866961855441332, + "step": 789 + }, + { + "epoch": 0.09875, + "grad_norm": 2.9017562866210938, + "grad_norm_var": 0.9271776289312942, + "learning_rate": 0.0001, + "loss": 1.152, + "loss/crossentropy": 2.3682990074157715, + "loss/hidden": 0.91796875, + "loss/logits": 0.18535348773002625, + "loss/reg": 0.004864787682890892, + "step": 790 + }, + { + "epoch": 0.098875, + "grad_norm": 2.388214349746704, + "grad_norm_var": 0.9414410795556288, + "learning_rate": 0.0001, + "loss": 1.4406, + "loss/crossentropy": 2.2244787216186523, + "loss/hidden": 1.1640625, + "loss/logits": 0.2279416173696518, + "loss/reg": 0.0048627713695168495, + "step": 791 + }, + { + "epoch": 0.099, + "grad_norm": 2.3403820991516113, + "grad_norm_var": 0.9230295318881718, + "learning_rate": 0.0001, + "loss": 1.1661, + "loss/crossentropy": 2.3729617595672607, + "loss/hidden": 0.95703125, + "loss/logits": 0.16043922305107117, + "loss/reg": 0.004860777873545885, + "step": 792 + }, + { + "epoch": 0.099125, + "grad_norm": 2.9416215419769287, + "grad_norm_var": 0.9228156704300846, + "learning_rate": 0.0001, + "loss": 1.0463, + "loss/crossentropy": 2.6738815307617188, + "loss/hidden": 0.85546875, + "loss/logits": 0.14225679636001587, + "loss/reg": 0.0048586721532046795, + "step": 793 + }, + { + "epoch": 0.09925, + "grad_norm": 3.0802805423736572, + "grad_norm_var": 0.1918460569244303, + "learning_rate": 0.0001, + "loss": 1.095, + "loss/crossentropy": 2.368016242980957, + "loss/hidden": 0.890625, + "loss/logits": 0.15585456788539886, + "loss/reg": 0.0048565310426056385, + "step": 794 + }, + { + "epoch": 0.099375, + "grad_norm": 2.2744922637939453, + "grad_norm_var": 0.19540746296378658, + "learning_rate": 0.0001, + "loss": 1.056, + "loss/crossentropy": 2.4861905574798584, + "loss/hidden": 0.85546875, + "loss/logits": 0.15199777483940125, + "loss/reg": 0.004854561761021614, + "step": 795 + }, + { + "epoch": 0.0995, + "grad_norm": 2.4716484546661377, + "grad_norm_var": 0.1856228513035127, + "learning_rate": 0.0001, + "loss": 1.1547, + "loss/crossentropy": 2.315619468688965, + "loss/hidden": 0.95703125, + "loss/logits": 0.1491631120443344, + "loss/reg": 0.004852783400565386, + "step": 796 + }, + { + "epoch": 0.099625, + "grad_norm": 1.8618899583816528, + "grad_norm_var": 0.22140635444161577, + "learning_rate": 0.0001, + "loss": 0.9489, + "loss/crossentropy": 2.5243325233459473, + "loss/hidden": 0.77734375, + "loss/logits": 0.12307024002075195, + "loss/reg": 0.004850673023611307, + "step": 797 + }, + { + "epoch": 0.09975, + "grad_norm": 2.3018720149993896, + "grad_norm_var": 0.22850198783917974, + "learning_rate": 0.0001, + "loss": 1.1395, + "loss/crossentropy": 2.437481164932251, + "loss/hidden": 0.93359375, + "loss/logits": 0.1574450135231018, + "loss/reg": 0.0048488411121070385, + "step": 798 + }, + { + "epoch": 0.099875, + "grad_norm": 2.351703643798828, + "grad_norm_var": 0.22832106568478797, + "learning_rate": 0.0001, + "loss": 1.1459, + "loss/crossentropy": 2.7124335765838623, + "loss/hidden": 0.91796875, + "loss/logits": 0.17945504188537598, + "loss/reg": 0.004846641793847084, + "step": 799 + }, + { + "epoch": 0.1, + "grad_norm": 3.841269016265869, + "grad_norm_var": 0.29813113065747426, + "learning_rate": 0.0001, + "loss": 1.4057, + "loss/crossentropy": 2.2013328075408936, + "loss/hidden": 1.0703125, + "loss/logits": 0.2869468331336975, + "loss/reg": 0.00484456866979599, + "step": 800 + }, + { + "epoch": 0.100125, + "grad_norm": 2.1647138595581055, + "grad_norm_var": 0.3162455329850776, + "learning_rate": 0.0001, + "loss": 1.1089, + "loss/crossentropy": 2.5788028240203857, + "loss/hidden": 0.91015625, + "loss/logits": 0.15036620199680328, + "loss/reg": 0.004842570051550865, + "step": 801 + }, + { + "epoch": 0.10025, + "grad_norm": 7.012945652008057, + "grad_norm_var": 1.4357519156716128, + "learning_rate": 0.0001, + "loss": 1.3117, + "loss/crossentropy": 2.785094738006592, + "loss/hidden": 1.078125, + "loss/logits": 0.18520238995552063, + "loss/reg": 0.004840615671128035, + "step": 802 + }, + { + "epoch": 0.100375, + "grad_norm": 2.403449535369873, + "grad_norm_var": 1.4491572072812606, + "learning_rate": 0.0001, + "loss": 1.0264, + "loss/crossentropy": 2.464657783508301, + "loss/hidden": 0.83984375, + "loss/logits": 0.1381913125514984, + "loss/reg": 0.004838695749640465, + "step": 803 + }, + { + "epoch": 0.1005, + "grad_norm": 2.554766893386841, + "grad_norm_var": 1.4433503798033878, + "learning_rate": 0.0001, + "loss": 1.025, + "loss/crossentropy": 2.506692409515381, + "loss/hidden": 0.84765625, + "loss/logits": 0.1289561688899994, + "loss/reg": 0.0048366026021540165, + "step": 804 + }, + { + "epoch": 0.100625, + "grad_norm": 2.103414535522461, + "grad_norm_var": 1.4759940006075438, + "learning_rate": 0.0001, + "loss": 1.0631, + "loss/crossentropy": 2.4695632457733154, + "loss/hidden": 0.86328125, + "loss/logits": 0.15149196982383728, + "loss/reg": 0.004834519233554602, + "step": 805 + }, + { + "epoch": 0.10075, + "grad_norm": 2.1140382289886475, + "grad_norm_var": 1.5053641044502488, + "learning_rate": 0.0001, + "loss": 1.0509, + "loss/crossentropy": 2.536261796951294, + "loss/hidden": 0.84375, + "loss/logits": 0.15881776809692383, + "loss/reg": 0.004832423757761717, + "step": 806 + }, + { + "epoch": 0.100875, + "grad_norm": 2.4623467922210693, + "grad_norm_var": 1.5020038787198757, + "learning_rate": 0.0001, + "loss": 1.0163, + "loss/crossentropy": 2.6407546997070312, + "loss/hidden": 0.828125, + "loss/logits": 0.1398705244064331, + "loss/reg": 0.004830438643693924, + "step": 807 + }, + { + "epoch": 0.101, + "grad_norm": 2.196262836456299, + "grad_norm_var": 1.5115104848001217, + "learning_rate": 0.0001, + "loss": 0.9786, + "loss/crossentropy": 2.7860398292541504, + "loss/hidden": 0.78515625, + "loss/logits": 0.1451636254787445, + "loss/reg": 0.0048283860087394714, + "step": 808 + }, + { + "epoch": 0.101125, + "grad_norm": 1.9615752696990967, + "grad_norm_var": 1.547617987738648, + "learning_rate": 0.0001, + "loss": 0.9815, + "loss/crossentropy": 2.4804906845092773, + "loss/hidden": 0.796875, + "loss/logits": 0.13634686172008514, + "loss/reg": 0.004826539196074009, + "step": 809 + }, + { + "epoch": 0.10125, + "grad_norm": 2.813462257385254, + "grad_norm_var": 1.5384423691934326, + "learning_rate": 0.0001, + "loss": 1.0387, + "loss/crossentropy": 2.4615039825439453, + "loss/hidden": 0.84375, + "loss/logits": 0.1467183232307434, + "loss/reg": 0.004824436269700527, + "step": 810 + }, + { + "epoch": 0.101375, + "grad_norm": 2.155719518661499, + "grad_norm_var": 1.5457555739015585, + "learning_rate": 0.0001, + "loss": 1.0501, + "loss/crossentropy": 2.5954880714416504, + "loss/hidden": 0.85546875, + "loss/logits": 0.1464114934206009, + "loss/reg": 0.004822410177439451, + "step": 811 + }, + { + "epoch": 0.1015, + "grad_norm": 9.283843040466309, + "grad_norm_var": 4.263069385825235, + "learning_rate": 0.0001, + "loss": 2.7627, + "loss/crossentropy": 2.139838218688965, + "loss/hidden": 2.375, + "loss/logits": 0.3395351767539978, + "loss/reg": 0.0048205070197582245, + "step": 812 + }, + { + "epoch": 0.101625, + "grad_norm": 2.0784647464752197, + "grad_norm_var": 4.230278658390638, + "learning_rate": 0.0001, + "loss": 0.9664, + "loss/crossentropy": 2.3511369228363037, + "loss/hidden": 0.7890625, + "loss/logits": 0.12912708520889282, + "loss/reg": 0.0048186322674155235, + "step": 813 + }, + { + "epoch": 0.10175, + "grad_norm": 2.328005313873291, + "grad_norm_var": 4.227496791404921, + "learning_rate": 0.0001, + "loss": 1.2465, + "loss/crossentropy": 2.512450933456421, + "loss/hidden": 0.96875, + "loss/logits": 0.22958813607692719, + "loss/reg": 0.004816535394638777, + "step": 814 + }, + { + "epoch": 0.101875, + "grad_norm": 2.315840721130371, + "grad_norm_var": 4.231222857846664, + "learning_rate": 0.0001, + "loss": 1.1597, + "loss/crossentropy": 2.5316786766052246, + "loss/hidden": 0.94140625, + "loss/logits": 0.1701403111219406, + "loss/reg": 0.004814418964087963, + "step": 815 + }, + { + "epoch": 0.102, + "grad_norm": 2.384153127670288, + "grad_norm_var": 4.222215248181442, + "learning_rate": 0.0001, + "loss": 1.0444, + "loss/crossentropy": 2.540562629699707, + "loss/hidden": 0.86328125, + "loss/logits": 0.13295237720012665, + "loss/reg": 0.004812437575310469, + "step": 816 + }, + { + "epoch": 0.102125, + "grad_norm": 2.4128434658050537, + "grad_norm_var": 4.197740139734587, + "learning_rate": 0.0001, + "loss": 1.1886, + "loss/crossentropy": 2.070441246032715, + "loss/hidden": 0.96875, + "loss/logits": 0.17178985476493835, + "loss/reg": 0.0048103369772434235, + "step": 817 + }, + { + "epoch": 0.10225, + "grad_norm": 2.2246007919311523, + "grad_norm_var": 3.0918953553368502, + "learning_rate": 0.0001, + "loss": 1.1071, + "loss/crossentropy": 2.6368751525878906, + "loss/hidden": 0.8984375, + "loss/logits": 0.16057901084423065, + "loss/reg": 0.004808461759239435, + "step": 818 + }, + { + "epoch": 0.102375, + "grad_norm": 2.450014591217041, + "grad_norm_var": 3.089959662810087, + "learning_rate": 0.0001, + "loss": 0.9846, + "loss/crossentropy": 2.521289587020874, + "loss/hidden": 0.8046875, + "loss/logits": 0.13186746835708618, + "loss/reg": 0.004806382581591606, + "step": 819 + }, + { + "epoch": 0.1025, + "grad_norm": 2.4022092819213867, + "grad_norm_var": 3.095181282590954, + "learning_rate": 0.0001, + "loss": 1.0461, + "loss/crossentropy": 2.563452959060669, + "loss/hidden": 0.8515625, + "loss/logits": 0.146540105342865, + "loss/reg": 0.00480444822460413, + "step": 820 + }, + { + "epoch": 0.102625, + "grad_norm": 2.3734519481658936, + "grad_norm_var": 3.0771633032177723, + "learning_rate": 0.0001, + "loss": 1.0145, + "loss/crossentropy": 2.565798759460449, + "loss/hidden": 0.8125, + "loss/logits": 0.15395045280456543, + "loss/reg": 0.004802408628165722, + "step": 821 + }, + { + "epoch": 0.10275, + "grad_norm": 2.9813950061798096, + "grad_norm_var": 3.0509471234209884, + "learning_rate": 0.0001, + "loss": 1.1867, + "loss/crossentropy": 2.463094711303711, + "loss/hidden": 0.96875, + "loss/logits": 0.169905886054039, + "loss/reg": 0.004800358321517706, + "step": 822 + }, + { + "epoch": 0.102875, + "grad_norm": 2.231248617172241, + "grad_norm_var": 3.06473574306492, + "learning_rate": 0.0001, + "loss": 0.95, + "loss/crossentropy": 2.479421615600586, + "loss/hidden": 0.78125, + "loss/logits": 0.12073921412229538, + "loss/reg": 0.00479841185733676, + "step": 823 + }, + { + "epoch": 0.103, + "grad_norm": 2.4154672622680664, + "grad_norm_var": 3.050471285485306, + "learning_rate": 0.0001, + "loss": 1.0843, + "loss/crossentropy": 2.3831257820129395, + "loss/hidden": 0.85546875, + "loss/logits": 0.1808249056339264, + "loss/reg": 0.004796158988028765, + "step": 824 + }, + { + "epoch": 0.103125, + "grad_norm": 2.0772101879119873, + "grad_norm_var": 3.0383683290584145, + "learning_rate": 0.0001, + "loss": 1.0639, + "loss/crossentropy": 2.4004011154174805, + "loss/hidden": 0.859375, + "loss/logits": 0.15656441450119019, + "loss/reg": 0.004794239532202482, + "step": 825 + }, + { + "epoch": 0.10325, + "grad_norm": 2.839860200881958, + "grad_norm_var": 3.0384311233460473, + "learning_rate": 0.0001, + "loss": 1.0861, + "loss/crossentropy": 2.6619362831115723, + "loss/hidden": 0.87890625, + "loss/logits": 0.15924152731895447, + "loss/reg": 0.004792260471731424, + "step": 826 + }, + { + "epoch": 0.103375, + "grad_norm": 1.972954511642456, + "grad_norm_var": 3.056454118437356, + "learning_rate": 0.0001, + "loss": 0.9135, + "loss/crossentropy": 2.7229561805725098, + "loss/hidden": 0.7421875, + "loss/logits": 0.1234164908528328, + "loss/reg": 0.004790398757904768, + "step": 827 + }, + { + "epoch": 0.1035, + "grad_norm": 2.682563066482544, + "grad_norm_var": 0.07155742185727737, + "learning_rate": 0.0001, + "loss": 1.1117, + "loss/crossentropy": 2.5051097869873047, + "loss/hidden": 0.89453125, + "loss/logits": 0.16926732659339905, + "loss/reg": 0.00478832283988595, + "step": 828 + }, + { + "epoch": 0.103625, + "grad_norm": 3.4523675441741943, + "grad_norm_var": 0.13326196210074007, + "learning_rate": 0.0001, + "loss": 0.9601, + "loss/crossentropy": 2.658190965652466, + "loss/hidden": 0.78125, + "loss/logits": 0.13097809255123138, + "loss/reg": 0.004786360543221235, + "step": 829 + }, + { + "epoch": 0.10375, + "grad_norm": 3.48335599899292, + "grad_norm_var": 0.19458248394843167, + "learning_rate": 0.0001, + "loss": 1.2642, + "loss/crossentropy": 2.538571357727051, + "loss/hidden": 1.0234375, + "loss/logits": 0.1929442286491394, + "loss/reg": 0.00478436890989542, + "step": 830 + }, + { + "epoch": 0.103875, + "grad_norm": 2.62361216545105, + "grad_norm_var": 0.19115134798182468, + "learning_rate": 0.0001, + "loss": 1.0218, + "loss/crossentropy": 2.067270040512085, + "loss/hidden": 0.83984375, + "loss/logits": 0.13410484790802002, + "loss/reg": 0.004782302770763636, + "step": 831 + }, + { + "epoch": 0.104, + "grad_norm": 2.9096603393554688, + "grad_norm_var": 0.1958828676095777, + "learning_rate": 0.0001, + "loss": 1.0541, + "loss/crossentropy": 2.664367198944092, + "loss/hidden": 0.859375, + "loss/logits": 0.1468919813632965, + "loss/reg": 0.004780208226293325, + "step": 832 + }, + { + "epoch": 0.104125, + "grad_norm": 2.942896604537964, + "grad_norm_var": 0.20051234736722562, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 3.0427563190460205, + "loss/hidden": 0.89453125, + "loss/logits": 0.18868675827980042, + "loss/reg": 0.004778183531016111, + "step": 833 + }, + { + "epoch": 0.10425, + "grad_norm": 2.589113712310791, + "grad_norm_var": 0.18916564172237998, + "learning_rate": 0.0001, + "loss": 1.0925, + "loss/crossentropy": 2.8398096561431885, + "loss/hidden": 0.8828125, + "loss/logits": 0.16195189952850342, + "loss/reg": 0.004776162561029196, + "step": 834 + }, + { + "epoch": 0.104375, + "grad_norm": 3.4404890537261963, + "grad_norm_var": 0.22384389332806312, + "learning_rate": 0.0001, + "loss": 1.3237, + "loss/crossentropy": 2.627953290939331, + "loss/hidden": 1.0234375, + "loss/logits": 0.252508282661438, + "loss/reg": 0.004774080123752356, + "step": 835 + }, + { + "epoch": 0.1045, + "grad_norm": 3.8319575786590576, + "grad_norm_var": 0.2922407313041238, + "learning_rate": 0.0001, + "loss": 1.1363, + "loss/crossentropy": 2.598100185394287, + "loss/hidden": 0.94140625, + "loss/logits": 0.14722198247909546, + "loss/reg": 0.004771828651428223, + "step": 836 + }, + { + "epoch": 0.104625, + "grad_norm": 2.5748939514160156, + "grad_norm_var": 0.28324037377024425, + "learning_rate": 0.0001, + "loss": 1.154, + "loss/crossentropy": 2.4594998359680176, + "loss/hidden": 0.92578125, + "loss/logits": 0.1805158108472824, + "loss/reg": 0.004769548308104277, + "step": 837 + }, + { + "epoch": 0.10475, + "grad_norm": 2.4466588497161865, + "grad_norm_var": 0.2892884485845587, + "learning_rate": 0.0001, + "loss": 1.1832, + "loss/crossentropy": 2.3166677951812744, + "loss/hidden": 0.94921875, + "loss/logits": 0.18628260493278503, + "loss/reg": 0.004767347127199173, + "step": 838 + }, + { + "epoch": 0.104875, + "grad_norm": 2.3794689178466797, + "grad_norm_var": 0.2797743363037663, + "learning_rate": 0.0001, + "loss": 1.0935, + "loss/crossentropy": 2.5804922580718994, + "loss/hidden": 0.8984375, + "loss/logits": 0.14742916822433472, + "loss/reg": 0.004765105899423361, + "step": 839 + }, + { + "epoch": 0.105, + "grad_norm": 3.077510118484497, + "grad_norm_var": 0.27398293806763996, + "learning_rate": 0.0001, + "loss": 1.3542, + "loss/crossentropy": 2.3323357105255127, + "loss/hidden": 1.1015625, + "loss/logits": 0.20499414205551147, + "loss/reg": 0.004762987140566111, + "step": 840 + }, + { + "epoch": 0.105125, + "grad_norm": 2.878331422805786, + "grad_norm_var": 0.23338745113117412, + "learning_rate": 0.0001, + "loss": 1.1036, + "loss/crossentropy": 2.450366258621216, + "loss/hidden": 0.88671875, + "loss/logits": 0.16925078630447388, + "loss/reg": 0.004760903771966696, + "step": 841 + }, + { + "epoch": 0.10525, + "grad_norm": 3.3161306381225586, + "grad_norm_var": 0.24483420410498696, + "learning_rate": 0.0001, + "loss": 1.2537, + "loss/crossentropy": 2.5501890182495117, + "loss/hidden": 1.0234375, + "loss/logits": 0.18264800310134888, + "loss/reg": 0.004758887458592653, + "step": 842 + }, + { + "epoch": 0.105375, + "grad_norm": 3.6941001415252686, + "grad_norm_var": 0.21433980549929005, + "learning_rate": 0.0001, + "loss": 1.1001, + "loss/crossentropy": 2.984022378921509, + "loss/hidden": 0.890625, + "loss/logits": 0.16192708909511566, + "loss/reg": 0.004756839480251074, + "step": 843 + }, + { + "epoch": 0.1055, + "grad_norm": 2.4516656398773193, + "grad_norm_var": 0.22806633375319052, + "learning_rate": 0.0001, + "loss": 1.1966, + "loss/crossentropy": 2.304518699645996, + "loss/hidden": 0.96484375, + "loss/logits": 0.18419940769672394, + "loss/reg": 0.0047547114081680775, + "step": 844 + }, + { + "epoch": 0.105625, + "grad_norm": 2.1033735275268555, + "grad_norm_var": 0.2614740255031826, + "learning_rate": 0.0001, + "loss": 1.1366, + "loss/crossentropy": 2.54909086227417, + "loss/hidden": 0.93359375, + "loss/logits": 0.15546000003814697, + "loss/reg": 0.004752539098262787, + "step": 845 + }, + { + "epoch": 0.10575, + "grad_norm": 2.3745031356811523, + "grad_norm_var": 0.2552452215100343, + "learning_rate": 0.0001, + "loss": 1.0427, + "loss/crossentropy": 2.629120349884033, + "loss/hidden": 0.828125, + "loss/logits": 0.16704407334327698, + "loss/reg": 0.004750436637550592, + "step": 846 + }, + { + "epoch": 0.105875, + "grad_norm": 2.338932514190674, + "grad_norm_var": 0.26898497299798596, + "learning_rate": 0.0001, + "loss": 1.0028, + "loss/crossentropy": 2.5220935344696045, + "loss/hidden": 0.828125, + "loss/logits": 0.12721288204193115, + "loss/reg": 0.004748245235532522, + "step": 847 + }, + { + "epoch": 0.106, + "grad_norm": 2.051365375518799, + "grad_norm_var": 0.3064090147954744, + "learning_rate": 0.0001, + "loss": 1.0144, + "loss/crossentropy": 2.5917866230010986, + "loss/hidden": 0.828125, + "loss/logits": 0.13877776265144348, + "loss/reg": 0.0047462498769164085, + "step": 848 + }, + { + "epoch": 0.106125, + "grad_norm": 2.7592124938964844, + "grad_norm_var": 0.3045456563454231, + "learning_rate": 0.0001, + "loss": 1.0817, + "loss/crossentropy": 2.400892972946167, + "loss/hidden": 0.88671875, + "loss/logits": 0.14757487177848816, + "loss/reg": 0.004744186066091061, + "step": 849 + }, + { + "epoch": 0.10625, + "grad_norm": 2.843409538269043, + "grad_norm_var": 0.3024802042353009, + "learning_rate": 0.0001, + "loss": 1.1031, + "loss/crossentropy": 2.4847569465637207, + "loss/hidden": 0.90234375, + "loss/logits": 0.153322234749794, + "loss/reg": 0.004742183722555637, + "step": 850 + }, + { + "epoch": 0.106375, + "grad_norm": 2.1840505599975586, + "grad_norm_var": 0.291355140168911, + "learning_rate": 0.0001, + "loss": 0.9921, + "loss/crossentropy": 2.5778043270111084, + "loss/hidden": 0.80859375, + "loss/logits": 0.13607874512672424, + "loss/reg": 0.004740222357213497, + "step": 851 + }, + { + "epoch": 0.1065, + "grad_norm": 2.0575978755950928, + "grad_norm_var": 0.2218880841874949, + "learning_rate": 0.0001, + "loss": 0.97, + "loss/crossentropy": 2.3618547916412354, + "loss/hidden": 0.79296875, + "loss/logits": 0.12968632578849792, + "loss/reg": 0.004738117568194866, + "step": 852 + }, + { + "epoch": 0.106625, + "grad_norm": 1.9274516105651855, + "grad_norm_var": 0.2498830541667985, + "learning_rate": 0.0001, + "loss": 0.9944, + "loss/crossentropy": 2.616579532623291, + "loss/hidden": 0.80859375, + "loss/logits": 0.13845369219779968, + "loss/reg": 0.004735942464321852, + "step": 853 + }, + { + "epoch": 0.10675, + "grad_norm": 4.250617980957031, + "grad_norm_var": 0.42715921119526734, + "learning_rate": 0.0001, + "loss": 1.0122, + "loss/crossentropy": 2.658142328262329, + "loss/hidden": 0.828125, + "loss/logits": 0.13677959144115448, + "loss/reg": 0.004733935464173555, + "step": 854 + }, + { + "epoch": 0.106875, + "grad_norm": 2.366472005844116, + "grad_norm_var": 0.42766974025784443, + "learning_rate": 0.0001, + "loss": 0.951, + "loss/crossentropy": 2.4926974773406982, + "loss/hidden": 0.78125, + "loss/logits": 0.12242163717746735, + "loss/reg": 0.004731933120638132, + "step": 855 + }, + { + "epoch": 0.107, + "grad_norm": 2.754833221435547, + "grad_norm_var": 0.41652297282437467, + "learning_rate": 0.0001, + "loss": 1.1565, + "loss/crossentropy": 2.3678700923919678, + "loss/hidden": 0.93359375, + "loss/logits": 0.17560826241970062, + "loss/reg": 0.004729805048555136, + "step": 856 + }, + { + "epoch": 0.107125, + "grad_norm": 3.3536951541900635, + "grad_norm_var": 0.44530816036993104, + "learning_rate": 0.0001, + "loss": 1.3138, + "loss/crossentropy": 2.4103269577026367, + "loss/hidden": 1.0703125, + "loss/logits": 0.19625738263130188, + "loss/reg": 0.004727587569504976, + "step": 857 + }, + { + "epoch": 0.10725, + "grad_norm": 2.150568962097168, + "grad_norm_var": 0.43084581061497124, + "learning_rate": 0.0001, + "loss": 1.1975, + "loss/crossentropy": 2.288213014602661, + "loss/hidden": 0.984375, + "loss/logits": 0.16585032641887665, + "loss/reg": 0.004725386388599873, + "step": 858 + }, + { + "epoch": 0.107375, + "grad_norm": 2.526709794998169, + "grad_norm_var": 0.3463235885418951, + "learning_rate": 0.0001, + "loss": 1.0485, + "loss/crossentropy": 2.668027639389038, + "loss/hidden": 0.8515625, + "loss/logits": 0.14968323707580566, + "loss/reg": 0.0047230906784534454, + "step": 859 + }, + { + "epoch": 0.1075, + "grad_norm": 2.573915958404541, + "grad_norm_var": 0.3459660758761667, + "learning_rate": 0.0001, + "loss": 1.285, + "loss/crossentropy": 2.4578278064727783, + "loss/hidden": 1.046875, + "loss/logits": 0.19090218842029572, + "loss/reg": 0.004720703698694706, + "step": 860 + }, + { + "epoch": 0.107625, + "grad_norm": 2.0752198696136475, + "grad_norm_var": 0.3476491685761097, + "learning_rate": 0.0001, + "loss": 1.0997, + "loss/crossentropy": 2.5527398586273193, + "loss/hidden": 0.8984375, + "loss/logits": 0.15407393872737885, + "loss/reg": 0.004718627315014601, + "step": 861 + }, + { + "epoch": 0.10775, + "grad_norm": 2.0546956062316895, + "grad_norm_var": 0.3609613231593753, + "learning_rate": 0.0001, + "loss": 0.9752, + "loss/crossentropy": 2.371561050415039, + "loss/hidden": 0.796875, + "loss/logits": 0.13113868236541748, + "loss/reg": 0.0047163935378193855, + "step": 862 + }, + { + "epoch": 0.107875, + "grad_norm": 2.4428114891052246, + "grad_norm_var": 0.35917223636502416, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.4627277851104736, + "loss/hidden": 0.90625, + "loss/logits": 0.15898552536964417, + "loss/reg": 0.004714163951575756, + "step": 863 + }, + { + "epoch": 0.108, + "grad_norm": 9.508520126342773, + "grad_norm_var": 3.365516663733607, + "learning_rate": 0.0001, + "loss": 2.2474, + "loss/crossentropy": 2.571873426437378, + "loss/hidden": 1.8203125, + "loss/logits": 0.3799425959587097, + "loss/reg": 0.00471192691475153, + "step": 864 + }, + { + "epoch": 0.108125, + "grad_norm": 1.8954740762710571, + "grad_norm_var": 3.4386495429465245, + "learning_rate": 0.0001, + "loss": 0.9501, + "loss/crossentropy": 2.6599161624908447, + "loss/hidden": 0.7734375, + "loss/logits": 0.12957873940467834, + "loss/reg": 0.00470972154289484, + "step": 865 + }, + { + "epoch": 0.10825, + "grad_norm": 2.4904675483703613, + "grad_norm_var": 3.4507629712816508, + "learning_rate": 0.0001, + "loss": 1.0515, + "loss/crossentropy": 2.4854214191436768, + "loss/hidden": 0.8671875, + "loss/logits": 0.13725632429122925, + "loss/reg": 0.004707681480795145, + "step": 866 + }, + { + "epoch": 0.108375, + "grad_norm": 2.947456121444702, + "grad_norm_var": 3.4129568938410895, + "learning_rate": 0.0001, + "loss": 1.2279, + "loss/crossentropy": 2.4842288494110107, + "loss/hidden": 0.9921875, + "loss/logits": 0.1886221170425415, + "loss/reg": 0.004705703817307949, + "step": 867 + }, + { + "epoch": 0.1085, + "grad_norm": 2.2447702884674072, + "grad_norm_var": 3.3926001028643817, + "learning_rate": 0.0001, + "loss": 1.0195, + "loss/crossentropy": 2.148575782775879, + "loss/hidden": 0.84375, + "loss/logits": 0.1287107914686203, + "loss/reg": 0.0047035738825798035, + "step": 868 + }, + { + "epoch": 0.108625, + "grad_norm": 2.057748317718506, + "grad_norm_var": 3.3755016690991604, + "learning_rate": 0.0001, + "loss": 1.0202, + "loss/crossentropy": 2.483203172683716, + "loss/hidden": 0.83203125, + "loss/logits": 0.14113682508468628, + "loss/reg": 0.004701647907495499, + "step": 869 + }, + { + "epoch": 0.10875, + "grad_norm": 2.5249640941619873, + "grad_norm_var": 3.2694673269882863, + "learning_rate": 0.0001, + "loss": 1.113, + "loss/crossentropy": 2.5260114669799805, + "loss/hidden": 0.91015625, + "loss/logits": 0.1558540314435959, + "loss/reg": 0.004699505399912596, + "step": 870 + }, + { + "epoch": 0.108875, + "grad_norm": 2.737452507019043, + "grad_norm_var": 3.2530130532767125, + "learning_rate": 0.0001, + "loss": 1.281, + "loss/crossentropy": 2.0429792404174805, + "loss/hidden": 1.0625, + "loss/logits": 0.1715242862701416, + "loss/reg": 0.004697592929005623, + "step": 871 + }, + { + "epoch": 0.109, + "grad_norm": 3.320223569869995, + "grad_norm_var": 3.2623347194326366, + "learning_rate": 0.0001, + "loss": 1.1007, + "loss/crossentropy": 2.4485509395599365, + "loss/hidden": 0.91015625, + "loss/logits": 0.14355334639549255, + "loss/reg": 0.004695762414485216, + "step": 872 + }, + { + "epoch": 0.109125, + "grad_norm": 2.352004051208496, + "grad_norm_var": 3.268664190896945, + "learning_rate": 0.0001, + "loss": 0.9754, + "loss/crossentropy": 2.566997528076172, + "loss/hidden": 0.81640625, + "loss/logits": 0.11203782260417938, + "loss/reg": 0.004693967290222645, + "step": 873 + }, + { + "epoch": 0.10925, + "grad_norm": 2.532027244567871, + "grad_norm_var": 3.2412215675029845, + "learning_rate": 0.0001, + "loss": 1.019, + "loss/crossentropy": 2.5623605251312256, + "loss/hidden": 0.82421875, + "loss/logits": 0.14787867665290833, + "loss/reg": 0.0046923235058784485, + "step": 874 + }, + { + "epoch": 0.109375, + "grad_norm": 2.850015878677368, + "grad_norm_var": 3.231974182838817, + "learning_rate": 0.0001, + "loss": 1.3183, + "loss/crossentropy": 2.292745351791382, + "loss/hidden": 1.09375, + "loss/logits": 0.17765681445598602, + "loss/reg": 0.004690241534262896, + "step": 875 + }, + { + "epoch": 0.1095, + "grad_norm": 2.694929361343384, + "grad_norm_var": 3.2274185214577464, + "learning_rate": 0.0001, + "loss": 1.3804, + "loss/crossentropy": 2.5017001628875732, + "loss/hidden": 1.1171875, + "loss/logits": 0.21630127727985382, + "loss/reg": 0.004688601475208998, + "step": 876 + }, + { + "epoch": 0.109625, + "grad_norm": 3.1318111419677734, + "grad_norm_var": 3.1781036409629513, + "learning_rate": 0.0001, + "loss": 1.0142, + "loss/crossentropy": 2.467827796936035, + "loss/hidden": 0.80859375, + "loss/logits": 0.15873777866363525, + "loss/reg": 0.004686909727752209, + "step": 877 + }, + { + "epoch": 0.10975, + "grad_norm": 2.534363269805908, + "grad_norm_var": 3.132884034258479, + "learning_rate": 0.0001, + "loss": 1.0536, + "loss/crossentropy": 2.559304714202881, + "loss/hidden": 0.84765625, + "loss/logits": 0.15912304818630219, + "loss/reg": 0.004684917628765106, + "step": 878 + }, + { + "epoch": 0.109875, + "grad_norm": 2.3481605052948, + "grad_norm_var": 3.1406848036532913, + "learning_rate": 0.0001, + "loss": 1.1184, + "loss/crossentropy": 2.436594247817993, + "loss/hidden": 0.91796875, + "loss/logits": 0.15359237790107727, + "loss/reg": 0.004682839848101139, + "step": 879 + }, + { + "epoch": 0.11, + "grad_norm": 2.825532913208008, + "grad_norm_var": 0.1420546261528272, + "learning_rate": 0.0001, + "loss": 1.05, + "loss/crossentropy": 2.62540602684021, + "loss/hidden": 0.859375, + "loss/logits": 0.14383457601070404, + "loss/reg": 0.004681065212935209, + "step": 880 + }, + { + "epoch": 0.110125, + "grad_norm": 2.246893882751465, + "grad_norm_var": 0.11709161648718099, + "learning_rate": 0.0001, + "loss": 1.1922, + "loss/crossentropy": 2.3017749786376953, + "loss/hidden": 0.98046875, + "loss/logits": 0.1649210900068283, + "loss/reg": 0.004679176490753889, + "step": 881 + }, + { + "epoch": 0.11025, + "grad_norm": 2.405453681945801, + "grad_norm_var": 0.11895408888104815, + "learning_rate": 0.0001, + "loss": 1.0762, + "loss/crossentropy": 2.4893195629119873, + "loss/hidden": 0.87109375, + "loss/logits": 0.15834550559520721, + "loss/reg": 0.004677077289670706, + "step": 882 + }, + { + "epoch": 0.110375, + "grad_norm": 2.809741258621216, + "grad_norm_var": 0.11393595478610692, + "learning_rate": 0.0001, + "loss": 1.0382, + "loss/crossentropy": 2.4846301078796387, + "loss/hidden": 0.84375, + "loss/logits": 0.1476888507604599, + "loss/reg": 0.004674948286265135, + "step": 883 + }, + { + "epoch": 0.1105, + "grad_norm": 2.79677152633667, + "grad_norm_var": 0.10676105158749939, + "learning_rate": 0.0001, + "loss": 1.0628, + "loss/crossentropy": 2.4412240982055664, + "loss/hidden": 0.859375, + "loss/logits": 0.15672242641448975, + "loss/reg": 0.0046728490851819515, + "step": 884 + }, + { + "epoch": 0.110625, + "grad_norm": 2.2540183067321777, + "grad_norm_var": 0.09404914291929553, + "learning_rate": 0.0001, + "loss": 1.0763, + "loss/crossentropy": 2.42918062210083, + "loss/hidden": 0.87890625, + "loss/logits": 0.15069469809532166, + "loss/reg": 0.004670663271099329, + "step": 885 + }, + { + "epoch": 0.11075, + "grad_norm": 2.4896061420440674, + "grad_norm_var": 0.09470624757332567, + "learning_rate": 0.0001, + "loss": 1.0982, + "loss/crossentropy": 2.2188639640808105, + "loss/hidden": 0.91796875, + "loss/logits": 0.13358637690544128, + "loss/reg": 0.004668715409934521, + "step": 886 + }, + { + "epoch": 0.110875, + "grad_norm": 2.1856141090393066, + "grad_norm_var": 0.10697799820098434, + "learning_rate": 0.0001, + "loss": 1.0706, + "loss/crossentropy": 2.508702039718628, + "loss/hidden": 0.87109375, + "loss/logits": 0.15285125374794006, + "loss/reg": 0.0046665905974805355, + "step": 887 + }, + { + "epoch": 0.111, + "grad_norm": 2.0393009185791016, + "grad_norm_var": 0.08841005951741536, + "learning_rate": 0.0001, + "loss": 1.1924, + "loss/crossentropy": 2.38267183303833, + "loss/hidden": 0.97265625, + "loss/logits": 0.17313829064369202, + "loss/reg": 0.004664612468332052, + "step": 888 + }, + { + "epoch": 0.111125, + "grad_norm": 2.3410797119140625, + "grad_norm_var": 0.08867826223563284, + "learning_rate": 0.0001, + "loss": 1.074, + "loss/crossentropy": 2.1943254470825195, + "loss/hidden": 0.87109375, + "loss/logits": 0.1563197374343872, + "loss/reg": 0.004662699997425079, + "step": 889 + }, + { + "epoch": 0.11125, + "grad_norm": 2.1273703575134277, + "grad_norm_var": 0.09882102282956354, + "learning_rate": 0.0001, + "loss": 1.1126, + "loss/crossentropy": 2.685147762298584, + "loss/hidden": 0.8984375, + "loss/logits": 0.16760051250457764, + "loss/reg": 0.00466081453487277, + "step": 890 + }, + { + "epoch": 0.111375, + "grad_norm": 1.990721583366394, + "grad_norm_var": 0.1054455812123658, + "learning_rate": 0.0001, + "loss": 1.0555, + "loss/crossentropy": 2.5216498374938965, + "loss/hidden": 0.86328125, + "loss/logits": 0.14559441804885864, + "loss/reg": 0.004659009166061878, + "step": 891 + }, + { + "epoch": 0.1115, + "grad_norm": 2.070897340774536, + "grad_norm_var": 0.10951603310177038, + "learning_rate": 0.0001, + "loss": 1.1023, + "loss/crossentropy": 2.40985369682312, + "loss/hidden": 0.890625, + "loss/logits": 0.16513003408908844, + "loss/reg": 0.0046569365076720715, + "step": 892 + }, + { + "epoch": 0.111625, + "grad_norm": 4.464876651763916, + "grad_norm_var": 0.3484639481637311, + "learning_rate": 0.0001, + "loss": 0.9826, + "loss/crossentropy": 2.7498655319213867, + "loss/hidden": 0.80859375, + "loss/logits": 0.12744669616222382, + "loss/reg": 0.0046548242680728436, + "step": 893 + }, + { + "epoch": 0.11175, + "grad_norm": 3.390195608139038, + "grad_norm_var": 0.39865960381583576, + "learning_rate": 0.0001, + "loss": 1.221, + "loss/crossentropy": 2.4615988731384277, + "loss/hidden": 0.9453125, + "loss/logits": 0.22916388511657715, + "loss/reg": 0.004652821458876133, + "step": 894 + }, + { + "epoch": 0.111875, + "grad_norm": 5.686069488525391, + "grad_norm_var": 1.005565195852795, + "learning_rate": 0.0001, + "loss": 1.6881, + "loss/crossentropy": 2.871785879135132, + "loss/hidden": 1.3515625, + "loss/logits": 0.2899933457374573, + "loss/reg": 0.0046508111990988255, + "step": 895 + }, + { + "epoch": 0.112, + "grad_norm": 2.610992193222046, + "grad_norm_var": 1.006503225573829, + "learning_rate": 0.0001, + "loss": 1.2867, + "loss/crossentropy": 2.433950901031494, + "loss/hidden": 1.046875, + "loss/logits": 0.19335989654064178, + "loss/reg": 0.004648844711482525, + "step": 896 + }, + { + "epoch": 0.112125, + "grad_norm": 2.4823808670043945, + "grad_norm_var": 0.9943498438598022, + "learning_rate": 0.0001, + "loss": 1.0789, + "loss/crossentropy": 2.469367742538452, + "loss/hidden": 0.86328125, + "loss/logits": 0.16918183863162994, + "loss/reg": 0.004646934103220701, + "step": 897 + }, + { + "epoch": 0.11225, + "grad_norm": 2.520416736602783, + "grad_norm_var": 0.9897555293936913, + "learning_rate": 0.0001, + "loss": 1.0086, + "loss/crossentropy": 2.4925904273986816, + "loss/hidden": 0.828125, + "loss/logits": 0.13399645686149597, + "loss/reg": 0.004645092878490686, + "step": 898 + }, + { + "epoch": 0.112375, + "grad_norm": 3.859619140625, + "grad_norm_var": 1.064733358455831, + "learning_rate": 0.0001, + "loss": 1.392, + "loss/crossentropy": 2.2787818908691406, + "loss/hidden": 1.125, + "loss/logits": 0.22061912715435028, + "loss/reg": 0.004643063060939312, + "step": 899 + }, + { + "epoch": 0.1125, + "grad_norm": 2.3090193271636963, + "grad_norm_var": 1.081884870890947, + "learning_rate": 0.0001, + "loss": 1.0228, + "loss/crossentropy": 2.5625975131988525, + "loss/hidden": 0.83984375, + "loss/logits": 0.13652384281158447, + "loss/reg": 0.004641035571694374, + "step": 900 + }, + { + "epoch": 0.112625, + "grad_norm": 3.0584754943847656, + "grad_norm_var": 1.063620631316445, + "learning_rate": 0.0001, + "loss": 1.1209, + "loss/crossentropy": 2.5125515460968018, + "loss/hidden": 0.91015625, + "loss/logits": 0.16438385844230652, + "loss/reg": 0.004638944752514362, + "step": 901 + }, + { + "epoch": 0.11275, + "grad_norm": 2.350011110305786, + "grad_norm_var": 1.0715774319545346, + "learning_rate": 0.0001, + "loss": 1.1854, + "loss/crossentropy": 2.548809766769409, + "loss/hidden": 0.9765625, + "loss/logits": 0.16251316666603088, + "loss/reg": 0.004636852536350489, + "step": 902 + }, + { + "epoch": 0.112875, + "grad_norm": 2.3605165481567383, + "grad_norm_var": 1.0581603064239917, + "learning_rate": 0.0001, + "loss": 1.0876, + "loss/crossentropy": 2.338804006576538, + "loss/hidden": 0.90234375, + "loss/logits": 0.13895326852798462, + "loss/reg": 0.004634756129235029, + "step": 903 + }, + { + "epoch": 0.113, + "grad_norm": 2.982060432434082, + "grad_norm_var": 1.0113174770986684, + "learning_rate": 0.0001, + "loss": 1.122, + "loss/crossentropy": 2.604545831680298, + "loss/hidden": 0.91796875, + "loss/logits": 0.15767651796340942, + "loss/reg": 0.004632753320038319, + "step": 904 + }, + { + "epoch": 0.113125, + "grad_norm": 2.4179961681365967, + "grad_norm_var": 1.0058240052270713, + "learning_rate": 0.0001, + "loss": 1.0464, + "loss/crossentropy": 2.3685691356658936, + "loss/hidden": 0.85546875, + "loss/logits": 0.14459514617919922, + "loss/reg": 0.0046308403834700584, + "step": 905 + }, + { + "epoch": 0.11325, + "grad_norm": 2.9855105876922607, + "grad_norm_var": 0.9614321136201335, + "learning_rate": 0.0001, + "loss": 1.0734, + "loss/crossentropy": 2.4018514156341553, + "loss/hidden": 0.875, + "loss/logits": 0.15208232402801514, + "loss/reg": 0.00462888041511178, + "step": 906 + }, + { + "epoch": 0.113375, + "grad_norm": 3.7471723556518555, + "grad_norm_var": 0.9246222750158322, + "learning_rate": 0.0001, + "loss": 1.5348, + "loss/crossentropy": 2.3634986877441406, + "loss/hidden": 1.2265625, + "loss/logits": 0.26200127601623535, + "loss/reg": 0.0046269698068499565, + "step": 907 + }, + { + "epoch": 0.1135, + "grad_norm": 2.5677998065948486, + "grad_norm_var": 0.8731304087305998, + "learning_rate": 0.0001, + "loss": 1.2106, + "loss/crossentropy": 2.3270931243896484, + "loss/hidden": 0.984375, + "loss/logits": 0.17999057471752167, + "loss/reg": 0.004624930210411549, + "step": 908 + }, + { + "epoch": 0.113625, + "grad_norm": 2.649965286254883, + "grad_norm_var": 0.7516360272379186, + "learning_rate": 0.0001, + "loss": 1.1275, + "loss/crossentropy": 2.627746343612671, + "loss/hidden": 0.9140625, + "loss/logits": 0.16723725199699402, + "loss/reg": 0.0046230582520365715, + "step": 909 + }, + { + "epoch": 0.11375, + "grad_norm": 2.4064176082611084, + "grad_norm_var": 0.7607639000768924, + "learning_rate": 0.0001, + "loss": 1.0182, + "loss/crossentropy": 2.47015380859375, + "loss/hidden": 0.81640625, + "loss/logits": 0.15559975802898407, + "loss/reg": 0.00462103309109807, + "step": 910 + }, + { + "epoch": 0.113875, + "grad_norm": 2.1189992427825928, + "grad_norm_var": 0.24860211648851874, + "learning_rate": 0.0001, + "loss": 1.1855, + "loss/crossentropy": 2.2618770599365234, + "loss/hidden": 0.96484375, + "loss/logits": 0.1744484156370163, + "loss/reg": 0.004618941340595484, + "step": 911 + }, + { + "epoch": 0.114, + "grad_norm": 2.220656633377075, + "grad_norm_var": 0.26349665304340514, + "learning_rate": 0.0001, + "loss": 1.2433, + "loss/crossentropy": 2.232293128967285, + "loss/hidden": 1.0, + "loss/logits": 0.19716452062129974, + "loss/reg": 0.004616775084286928, + "step": 912 + }, + { + "epoch": 0.114125, + "grad_norm": 2.525851011276245, + "grad_norm_var": 0.2624124723651427, + "learning_rate": 0.0001, + "loss": 1.2848, + "loss/crossentropy": 2.2225029468536377, + "loss/hidden": 1.0703125, + "loss/logits": 0.16834387183189392, + "loss/reg": 0.004614519886672497, + "step": 913 + }, + { + "epoch": 0.11425, + "grad_norm": 2.6499905586242676, + "grad_norm_var": 0.2604882837896163, + "learning_rate": 0.0001, + "loss": 1.1621, + "loss/crossentropy": 2.7070581912994385, + "loss/hidden": 0.953125, + "loss/logits": 0.16290049254894257, + "loss/reg": 0.004612345714122057, + "step": 914 + }, + { + "epoch": 0.114375, + "grad_norm": 2.417423963546753, + "grad_norm_var": 0.1676183523849204, + "learning_rate": 0.0001, + "loss": 1.2021, + "loss/crossentropy": 1.8924601078033447, + "loss/hidden": 0.97265625, + "loss/logits": 0.183339461684227, + "loss/reg": 0.004610271658748388, + "step": 915 + }, + { + "epoch": 0.1145, + "grad_norm": 2.935338258743286, + "grad_norm_var": 0.16695985677138575, + "learning_rate": 0.0001, + "loss": 1.2591, + "loss/crossentropy": 2.2739417552948, + "loss/hidden": 1.046875, + "loss/logits": 0.1661624014377594, + "loss/reg": 0.00460821995511651, + "step": 916 + }, + { + "epoch": 0.114625, + "grad_norm": 2.1026499271392822, + "grad_norm_var": 0.17195618728893744, + "learning_rate": 0.0001, + "loss": 0.9901, + "loss/crossentropy": 2.655440330505371, + "loss/hidden": 0.796875, + "loss/logits": 0.14714528620243073, + "loss/reg": 0.004606001079082489, + "step": 917 + }, + { + "epoch": 0.11475, + "grad_norm": 2.9179234504699707, + "grad_norm_var": 0.17394937416602924, + "learning_rate": 0.0001, + "loss": 1.014, + "loss/crossentropy": 2.5501797199249268, + "loss/hidden": 0.8125, + "loss/logits": 0.15548643469810486, + "loss/reg": 0.004603679291903973, + "step": 918 + }, + { + "epoch": 0.114875, + "grad_norm": 2.9957125186920166, + "grad_norm_var": 0.17673345245174207, + "learning_rate": 0.0001, + "loss": 1.0982, + "loss/crossentropy": 2.3883516788482666, + "loss/hidden": 0.8828125, + "loss/logits": 0.16936160624027252, + "loss/reg": 0.004601585678756237, + "step": 919 + }, + { + "epoch": 0.115, + "grad_norm": 2.67694354057312, + "grad_norm_var": 0.16965697193046006, + "learning_rate": 0.0001, + "loss": 1.1013, + "loss/crossentropy": 2.3806746006011963, + "loss/hidden": 0.91015625, + "loss/logits": 0.14516542851924896, + "loss/reg": 0.004599516745656729, + "step": 920 + }, + { + "epoch": 0.115125, + "grad_norm": 2.1424858570098877, + "grad_norm_var": 0.1827775525511394, + "learning_rate": 0.0001, + "loss": 1.0246, + "loss/crossentropy": 2.2575674057006836, + "loss/hidden": 0.828125, + "loss/logits": 0.15050096809864044, + "loss/reg": 0.004597416613250971, + "step": 921 + }, + { + "epoch": 0.11525, + "grad_norm": 5.457708358764648, + "grad_norm_var": 0.682343045667132, + "learning_rate": 0.0001, + "loss": 1.7819, + "loss/crossentropy": 2.941784381866455, + "loss/hidden": 1.40625, + "loss/logits": 0.3296935558319092, + "loss/reg": 0.004595189820975065, + "step": 922 + }, + { + "epoch": 0.115375, + "grad_norm": 2.692840814590454, + "grad_norm_var": 0.6163222739990933, + "learning_rate": 0.0001, + "loss": 1.3758, + "loss/crossentropy": 2.374514102935791, + "loss/hidden": 1.1171875, + "loss/logits": 0.21266797184944153, + "loss/reg": 0.004593092482537031, + "step": 923 + }, + { + "epoch": 0.1155, + "grad_norm": 3.2622177600860596, + "grad_norm_var": 0.6326076754218235, + "learning_rate": 0.0001, + "loss": 1.1365, + "loss/crossentropy": 2.4111597537994385, + "loss/hidden": 0.89453125, + "loss/logits": 0.19610172510147095, + "loss/reg": 0.004590968135744333, + "step": 924 + }, + { + "epoch": 0.115625, + "grad_norm": 3.9593584537506104, + "grad_norm_var": 0.7204108733775624, + "learning_rate": 0.0001, + "loss": 1.4237, + "loss/crossentropy": 2.770081043243408, + "loss/hidden": 1.1796875, + "loss/logits": 0.19811320304870605, + "loss/reg": 0.004588917829096317, + "step": 925 + }, + { + "epoch": 0.11575, + "grad_norm": 2.504918336868286, + "grad_norm_var": 0.7152879483588092, + "learning_rate": 0.0001, + "loss": 1.0618, + "loss/crossentropy": 2.4681365489959717, + "loss/hidden": 0.86328125, + "loss/logits": 0.1526341289281845, + "loss/reg": 0.004586971364915371, + "step": 926 + }, + { + "epoch": 0.115875, + "grad_norm": 3.0209288597106934, + "grad_norm_var": 0.6783647636612234, + "learning_rate": 0.0001, + "loss": 1.138, + "loss/crossentropy": 2.3891263008117676, + "loss/hidden": 0.91796875, + "loss/logits": 0.17415405809879303, + "loss/reg": 0.004584896378219128, + "step": 927 + }, + { + "epoch": 0.116, + "grad_norm": 2.3569037914276123, + "grad_norm_var": 0.6670896431721521, + "learning_rate": 0.0001, + "loss": 1.1342, + "loss/crossentropy": 2.373786687850952, + "loss/hidden": 0.93359375, + "loss/logits": 0.15477266907691956, + "loss/reg": 0.004582802765071392, + "step": 928 + }, + { + "epoch": 0.116125, + "grad_norm": 2.584897041320801, + "grad_norm_var": 0.6642540884373107, + "learning_rate": 0.0001, + "loss": 1.187, + "loss/crossentropy": 2.5142221450805664, + "loss/hidden": 0.95703125, + "loss/logits": 0.18413202464580536, + "loss/reg": 0.004580747336149216, + "step": 929 + }, + { + "epoch": 0.11625, + "grad_norm": 3.1578471660614014, + "grad_norm_var": 0.6622672348995062, + "learning_rate": 0.0001, + "loss": 1.1606, + "loss/crossentropy": 2.6126952171325684, + "loss/hidden": 0.91796875, + "loss/logits": 0.19688570499420166, + "loss/reg": 0.004578826949000359, + "step": 930 + }, + { + "epoch": 0.116375, + "grad_norm": 2.6005330085754395, + "grad_norm_var": 0.6513814069878736, + "learning_rate": 0.0001, + "loss": 1.0783, + "loss/crossentropy": 2.405707597732544, + "loss/hidden": 0.87109375, + "loss/logits": 0.16143286228179932, + "loss/reg": 0.004576742183417082, + "step": 931 + }, + { + "epoch": 0.1165, + "grad_norm": 2.879091501235962, + "grad_norm_var": 0.6517684060932252, + "learning_rate": 0.0001, + "loss": 1.0623, + "loss/crossentropy": 2.5898799896240234, + "loss/hidden": 0.84765625, + "loss/logits": 0.16887424886226654, + "loss/reg": 0.004575024824589491, + "step": 932 + }, + { + "epoch": 0.116625, + "grad_norm": 3.1379029750823975, + "grad_norm_var": 0.6008152897241831, + "learning_rate": 0.0001, + "loss": 1.2402, + "loss/crossentropy": 2.3071014881134033, + "loss/hidden": 1.0234375, + "loss/logits": 0.1710711419582367, + "loss/reg": 0.004573314916342497, + "step": 933 + }, + { + "epoch": 0.11675, + "grad_norm": 4.292084217071533, + "grad_norm_var": 0.6998094594428453, + "learning_rate": 0.0001, + "loss": 1.2793, + "loss/crossentropy": 2.428403854370117, + "loss/hidden": 1.0078125, + "loss/logits": 0.22576534748077393, + "loss/reg": 0.004571723286062479, + "step": 934 + }, + { + "epoch": 0.116875, + "grad_norm": 3.4453883171081543, + "grad_norm_var": 0.7057361661794924, + "learning_rate": 0.0001, + "loss": 1.3151, + "loss/crossentropy": 2.603928327560425, + "loss/hidden": 1.09375, + "loss/logits": 0.1756502389907837, + "loss/reg": 0.004570134915411472, + "step": 935 + }, + { + "epoch": 0.117, + "grad_norm": 2.8401777744293213, + "grad_norm_var": 0.6974157138244702, + "learning_rate": 0.0001, + "loss": 1.1098, + "loss/crossentropy": 2.304800033569336, + "loss/hidden": 0.91015625, + "loss/logits": 0.15392211079597473, + "loss/reg": 0.004568077158182859, + "step": 936 + }, + { + "epoch": 0.117125, + "grad_norm": 2.5680997371673584, + "grad_norm_var": 0.6517920111715199, + "learning_rate": 0.0001, + "loss": 1.1496, + "loss/crossentropy": 2.341132402420044, + "loss/hidden": 0.94140625, + "loss/logits": 0.1625438630580902, + "loss/reg": 0.004566343035548925, + "step": 937 + }, + { + "epoch": 0.11725, + "grad_norm": 2.748947858810425, + "grad_norm_var": 0.2850544648160998, + "learning_rate": 0.0001, + "loss": 1.2239, + "loss/crossentropy": 2.4194369316101074, + "loss/hidden": 0.99609375, + "loss/logits": 0.18219077587127686, + "loss/reg": 0.004564360249787569, + "step": 938 + }, + { + "epoch": 0.117375, + "grad_norm": 3.1744375228881836, + "grad_norm_var": 0.2796176021162296, + "learning_rate": 0.0001, + "loss": 1.049, + "loss/crossentropy": 2.328961133956909, + "loss/hidden": 0.86328125, + "loss/logits": 0.14012068510055542, + "loss/reg": 0.0045626200735569, + "step": 939 + }, + { + "epoch": 0.1175, + "grad_norm": 2.8904807567596436, + "grad_norm_var": 0.276910977824096, + "learning_rate": 0.0001, + "loss": 1.0396, + "loss/crossentropy": 2.96333646774292, + "loss/hidden": 0.84765625, + "loss/logits": 0.146368145942688, + "loss/reg": 0.004560848698019981, + "step": 940 + }, + { + "epoch": 0.117625, + "grad_norm": 2.7542107105255127, + "grad_norm_var": 0.2151558946350927, + "learning_rate": 0.0001, + "loss": 1.0154, + "loss/crossentropy": 2.326488494873047, + "loss/hidden": 0.828125, + "loss/logits": 0.14169706404209137, + "loss/reg": 0.0045591117814183235, + "step": 941 + }, + { + "epoch": 0.11775, + "grad_norm": 2.8061575889587402, + "grad_norm_var": 0.20356104069779402, + "learning_rate": 0.0001, + "loss": 1.067, + "loss/crossentropy": 2.483823537826538, + "loss/hidden": 0.85546875, + "loss/logits": 0.16598659753799438, + "loss/reg": 0.004557049833238125, + "step": 942 + }, + { + "epoch": 0.117875, + "grad_norm": 2.4513025283813477, + "grad_norm_var": 0.2187293570918861, + "learning_rate": 0.0001, + "loss": 1.1831, + "loss/crossentropy": 2.3770830631256104, + "loss/hidden": 0.9609375, + "loss/logits": 0.176588237285614, + "loss/reg": 0.0045554060488939285, + "step": 943 + }, + { + "epoch": 0.118, + "grad_norm": 2.757690906524658, + "grad_norm_var": 0.19878318945221735, + "learning_rate": 0.0001, + "loss": 1.2487, + "loss/crossentropy": 2.335298538208008, + "loss/hidden": 1.0078125, + "loss/logits": 0.19532084465026855, + "loss/reg": 0.00455334922298789, + "step": 944 + }, + { + "epoch": 0.118125, + "grad_norm": 3.0741937160491943, + "grad_norm_var": 0.19037881818987876, + "learning_rate": 0.0001, + "loss": 1.1111, + "loss/crossentropy": 2.5113272666931152, + "loss/hidden": 0.8984375, + "loss/logits": 0.16711819171905518, + "loss/reg": 0.004551599267870188, + "step": 945 + }, + { + "epoch": 0.11825, + "grad_norm": 2.156649589538574, + "grad_norm_var": 0.22844079123048383, + "learning_rate": 0.0001, + "loss": 1.0124, + "loss/crossentropy": 2.654160976409912, + "loss/hidden": 0.8203125, + "loss/logits": 0.14654606580734253, + "loss/reg": 0.004549470264464617, + "step": 946 + }, + { + "epoch": 0.118375, + "grad_norm": 3.1699886322021484, + "grad_norm_var": 0.22512891612332073, + "learning_rate": 0.0001, + "loss": 1.2032, + "loss/crossentropy": 2.6897716522216797, + "loss/hidden": 0.9921875, + "loss/logits": 0.16554811596870422, + "loss/reg": 0.0045473333448171616, + "step": 947 + }, + { + "epoch": 0.1185, + "grad_norm": 54.68584442138672, + "grad_norm_var": 167.50451750408678, + "learning_rate": 0.0001, + "loss": 1.0296, + "loss/crossentropy": 2.5791327953338623, + "loss/hidden": 0.85546875, + "loss/logits": 0.12866336107254028, + "loss/reg": 0.004545523319393396, + "step": 948 + }, + { + "epoch": 0.118625, + "grad_norm": 3.2524545192718506, + "grad_norm_var": 167.45880382689273, + "learning_rate": 0.0001, + "loss": 1.1858, + "loss/crossentropy": 2.6711151599884033, + "loss/hidden": 0.95703125, + "loss/logits": 0.1833563894033432, + "loss/reg": 0.00454343156889081, + "step": 949 + }, + { + "epoch": 0.11875, + "grad_norm": 5.301136016845703, + "grad_norm_var": 167.26685801766013, + "learning_rate": 0.0001, + "loss": 1.553, + "loss/crossentropy": 2.7770638465881348, + "loss/hidden": 1.28125, + "loss/logits": 0.22637835144996643, + "loss/reg": 0.004541344009339809, + "step": 950 + }, + { + "epoch": 0.118875, + "grad_norm": 2.384737730026245, + "grad_norm_var": 167.73447965423813, + "learning_rate": 0.0001, + "loss": 1.0308, + "loss/crossentropy": 2.795858144760132, + "loss/hidden": 0.83984375, + "loss/logits": 0.14552150666713715, + "loss/reg": 0.004539397079497576, + "step": 951 + }, + { + "epoch": 0.119, + "grad_norm": 3.921651601791382, + "grad_norm_var": 167.32475778000344, + "learning_rate": 0.0001, + "loss": 1.3721, + "loss/crossentropy": 2.6767590045928955, + "loss/hidden": 1.140625, + "loss/logits": 0.1861056089401245, + "loss/reg": 0.004537293687462807, + "step": 952 + }, + { + "epoch": 0.119125, + "grad_norm": 2.567948818206787, + "grad_norm_var": 167.32483199379854, + "learning_rate": 0.0001, + "loss": 1.022, + "loss/crossentropy": 2.5112462043762207, + "loss/hidden": 0.82421875, + "loss/logits": 0.1524544656276703, + "loss/reg": 0.004535375162959099, + "step": 953 + }, + { + "epoch": 0.11925, + "grad_norm": 3.9113171100616455, + "grad_norm_var": 166.86572618880632, + "learning_rate": 0.0001, + "loss": 1.4182, + "loss/crossentropy": 2.447330951690674, + "loss/hidden": 1.1640625, + "loss/logits": 0.20876702666282654, + "loss/reg": 0.004533402621746063, + "step": 954 + }, + { + "epoch": 0.119375, + "grad_norm": 2.407547950744629, + "grad_norm_var": 167.22501953627514, + "learning_rate": 0.0001, + "loss": 1.0107, + "loss/crossentropy": 2.4667739868164062, + "loss/hidden": 0.82421875, + "loss/logits": 0.1411362886428833, + "loss/reg": 0.004531473852694035, + "step": 955 + }, + { + "epoch": 0.1195, + "grad_norm": 3.312300205230713, + "grad_norm_var": 167.0454581165803, + "learning_rate": 0.0001, + "loss": 1.1527, + "loss/crossentropy": 2.6299078464508057, + "loss/hidden": 0.9375, + "loss/logits": 0.16988055408000946, + "loss/reg": 0.0045296428725123405, + "step": 956 + }, + { + "epoch": 0.119625, + "grad_norm": 2.991645097732544, + "grad_norm_var": 166.93650144942362, + "learning_rate": 0.0001, + "loss": 1.145, + "loss/crossentropy": 2.484005928039551, + "loss/hidden": 0.94140625, + "loss/logits": 0.158270001411438, + "loss/reg": 0.0045278542675077915, + "step": 957 + }, + { + "epoch": 0.11975, + "grad_norm": 2.3066608905792236, + "grad_norm_var": 167.18625092351098, + "learning_rate": 0.0001, + "loss": 1.0326, + "loss/crossentropy": 2.3425681591033936, + "loss/hidden": 0.828125, + "loss/logits": 0.1592123955488205, + "loss/reg": 0.004525760654360056, + "step": 958 + }, + { + "epoch": 0.119875, + "grad_norm": 2.4849298000335693, + "grad_norm_var": 167.16910661257995, + "learning_rate": 0.0001, + "loss": 1.016, + "loss/crossentropy": 2.577565908432007, + "loss/hidden": 0.8203125, + "loss/logits": 0.15045437216758728, + "loss/reg": 0.00452386075630784, + "step": 959 + }, + { + "epoch": 0.12, + "grad_norm": 3.1377410888671875, + "grad_norm_var": 166.99899214100878, + "learning_rate": 0.0001, + "loss": 1.1344, + "loss/crossentropy": 2.570946455001831, + "loss/hidden": 0.921875, + "loss/logits": 0.16726532578468323, + "loss/reg": 0.004521827679127455, + "step": 960 + }, + { + "epoch": 0.120125, + "grad_norm": 4.738165378570557, + "grad_norm_var": 166.45265671613615, + "learning_rate": 0.0001, + "loss": 1.4551, + "loss/crossentropy": 2.71376371383667, + "loss/hidden": 1.203125, + "loss/logits": 0.20679137110710144, + "loss/reg": 0.0045196013525128365, + "step": 961 + }, + { + "epoch": 0.12025, + "grad_norm": 3.7636489868164062, + "grad_norm_var": 165.70042257567124, + "learning_rate": 0.0001, + "loss": 1.3982, + "loss/crossentropy": 2.3803255558013916, + "loss/hidden": 1.1015625, + "loss/logits": 0.2515062689781189, + "loss/reg": 0.004517595283687115, + "step": 962 + }, + { + "epoch": 0.120375, + "grad_norm": 2.638967752456665, + "grad_norm_var": 165.95531506158162, + "learning_rate": 0.0001, + "loss": 1.2337, + "loss/crossentropy": 2.2372846603393555, + "loss/hidden": 1.0234375, + "loss/logits": 0.16512709856033325, + "loss/reg": 0.004515463951975107, + "step": 963 + }, + { + "epoch": 0.1205, + "grad_norm": 45.81782531738281, + "grad_norm_var": 113.88107496030446, + "learning_rate": 0.0001, + "loss": 1.1605, + "loss/crossentropy": 2.742631435394287, + "loss/hidden": 0.9453125, + "loss/logits": 0.17006908357143402, + "loss/reg": 0.004513174295425415, + "step": 964 + }, + { + "epoch": 0.120625, + "grad_norm": 2.8806581497192383, + "grad_norm_var": 114.02262985566777, + "learning_rate": 0.0001, + "loss": 1.2776, + "loss/crossentropy": 2.4858973026275635, + "loss/hidden": 1.0234375, + "loss/logits": 0.20908400416374207, + "loss/reg": 0.004510868340730667, + "step": 965 + }, + { + "epoch": 0.12075, + "grad_norm": 2.2246646881103516, + "grad_norm_var": 114.86410220669458, + "learning_rate": 0.0001, + "loss": 1.1264, + "loss/crossentropy": 2.499250888824463, + "loss/hidden": 0.91015625, + "loss/logits": 0.17112451791763306, + "loss/reg": 0.004508919548243284, + "step": 966 + }, + { + "epoch": 0.120875, + "grad_norm": 2.6645731925964355, + "grad_norm_var": 114.74462216300226, + "learning_rate": 0.0001, + "loss": 1.2525, + "loss/crossentropy": 2.5269155502319336, + "loss/hidden": 1.03125, + "loss/logits": 0.17614489793777466, + "loss/reg": 0.004506917670369148, + "step": 967 + }, + { + "epoch": 0.121, + "grad_norm": 2.5289218425750732, + "grad_norm_var": 115.20270599436985, + "learning_rate": 0.0001, + "loss": 0.9631, + "loss/crossentropy": 2.5555944442749023, + "loss/hidden": 0.78515625, + "loss/logits": 0.13286443054676056, + "loss/reg": 0.004505137912929058, + "step": 968 + }, + { + "epoch": 0.121125, + "grad_norm": 2.0906901359558105, + "grad_norm_var": 115.41297732177252, + "learning_rate": 0.0001, + "loss": 1.0284, + "loss/crossentropy": 2.4322237968444824, + "loss/hidden": 0.8515625, + "loss/logits": 0.1318206787109375, + "loss/reg": 0.0045034573413431644, + "step": 969 + }, + { + "epoch": 0.12125, + "grad_norm": 2.5444202423095703, + "grad_norm_var": 115.84094031889703, + "learning_rate": 0.0001, + "loss": 1.1368, + "loss/crossentropy": 2.548710584640503, + "loss/hidden": 0.9296875, + "loss/logits": 0.16206462681293488, + "loss/reg": 0.004501515068113804, + "step": 970 + }, + { + "epoch": 0.121375, + "grad_norm": 2.175011157989502, + "grad_norm_var": 115.94123463799326, + "learning_rate": 0.0001, + "loss": 1.0439, + "loss/crossentropy": 2.5955300331115723, + "loss/hidden": 0.8515625, + "loss/logits": 0.14735567569732666, + "loss/reg": 0.004499473143368959, + "step": 971 + }, + { + "epoch": 0.1215, + "grad_norm": 2.360872507095337, + "grad_norm_var": 116.2777207470046, + "learning_rate": 0.0001, + "loss": 1.078, + "loss/crossentropy": 2.328791379928589, + "loss/hidden": 0.88671875, + "loss/logits": 0.14633293449878693, + "loss/reg": 0.00449743214994669, + "step": 972 + }, + { + "epoch": 0.121625, + "grad_norm": 2.2626869678497314, + "grad_norm_var": 116.55077789644868, + "learning_rate": 0.0001, + "loss": 1.0265, + "loss/crossentropy": 2.6862330436706543, + "loss/hidden": 0.83984375, + "loss/logits": 0.1417045295238495, + "loss/reg": 0.004495698027312756, + "step": 973 + }, + { + "epoch": 0.12175, + "grad_norm": 2.239927291870117, + "grad_norm_var": 116.57870277427698, + "learning_rate": 0.0001, + "loss": 0.9865, + "loss/crossentropy": 2.6493136882781982, + "loss/hidden": 0.8046875, + "loss/logits": 0.13682736456394196, + "loss/reg": 0.004493638873100281, + "step": 974 + }, + { + "epoch": 0.121875, + "grad_norm": 3.526413917541504, + "grad_norm_var": 116.24036193195982, + "learning_rate": 0.0001, + "loss": 1.4212, + "loss/crossentropy": 2.3598437309265137, + "loss/hidden": 1.1640625, + "loss/logits": 0.21225669980049133, + "loss/reg": 0.004491583444178104, + "step": 975 + }, + { + "epoch": 0.122, + "grad_norm": 2.4012386798858643, + "grad_norm_var": 116.5037542152016, + "learning_rate": 0.0001, + "loss": 1.0692, + "loss/crossentropy": 2.8136839866638184, + "loss/hidden": 0.8828125, + "loss/logits": 0.14152291417121887, + "loss/reg": 0.004489597398787737, + "step": 976 + }, + { + "epoch": 0.122125, + "grad_norm": 3.48690128326416, + "grad_norm_var": 116.71680821300758, + "learning_rate": 0.0001, + "loss": 1.1166, + "loss/crossentropy": 2.5272679328918457, + "loss/hidden": 0.921875, + "loss/logits": 0.1498267650604248, + "loss/reg": 0.004487714730203152, + "step": 977 + }, + { + "epoch": 0.12225, + "grad_norm": 2.596402406692505, + "grad_norm_var": 117.0489228171561, + "learning_rate": 0.0001, + "loss": 1.0618, + "loss/crossentropy": 2.5853097438812256, + "loss/hidden": 0.875, + "loss/logits": 0.14198589324951172, + "loss/reg": 0.0044856141321361065, + "step": 978 + }, + { + "epoch": 0.122375, + "grad_norm": 2.4075088500976562, + "grad_norm_var": 117.1336997192439, + "learning_rate": 0.0001, + "loss": 1.0708, + "loss/crossentropy": 2.4603147506713867, + "loss/hidden": 0.875, + "loss/logits": 0.1509513258934021, + "loss/reg": 0.004483620636165142, + "step": 979 + }, + { + "epoch": 0.1225, + "grad_norm": 2.511711597442627, + "grad_norm_var": 0.17809257377307758, + "learning_rate": 0.0001, + "loss": 1.0112, + "loss/crossentropy": 2.369588613510132, + "loss/hidden": 0.80859375, + "loss/logits": 0.15779206156730652, + "loss/reg": 0.00448161456733942, + "step": 980 + }, + { + "epoch": 0.122625, + "grad_norm": 2.4200518131256104, + "grad_norm_var": 0.1714391921620101, + "learning_rate": 0.0001, + "loss": 1.1021, + "loss/crossentropy": 2.5299947261810303, + "loss/hidden": 0.8828125, + "loss/logits": 0.17450904846191406, + "loss/reg": 0.004479666240513325, + "step": 981 + }, + { + "epoch": 0.12275, + "grad_norm": 2.180694580078125, + "grad_norm_var": 0.17333618624260225, + "learning_rate": 0.0001, + "loss": 0.9857, + "loss/crossentropy": 2.4115102291107178, + "loss/hidden": 0.80859375, + "loss/logits": 0.13229887187480927, + "loss/reg": 0.004477777983993292, + "step": 982 + }, + { + "epoch": 0.122875, + "grad_norm": 2.063762664794922, + "grad_norm_var": 0.1847061967544647, + "learning_rate": 0.0001, + "loss": 1.0019, + "loss/crossentropy": 2.617342948913574, + "loss/hidden": 0.8046875, + "loss/logits": 0.1524919718503952, + "loss/reg": 0.004475918132811785, + "step": 983 + }, + { + "epoch": 0.123, + "grad_norm": 2.1369118690490723, + "grad_norm_var": 0.19213655390988696, + "learning_rate": 0.0001, + "loss": 0.9803, + "loss/crossentropy": 2.2203562259674072, + "loss/hidden": 0.796875, + "loss/logits": 0.13871444761753082, + "loss/reg": 0.004473875742405653, + "step": 984 + }, + { + "epoch": 0.123125, + "grad_norm": 2.5142624378204346, + "grad_norm_var": 0.18233307349070932, + "learning_rate": 0.0001, + "loss": 1.2147, + "loss/crossentropy": 2.369795322418213, + "loss/hidden": 0.98046875, + "loss/logits": 0.1895258128643036, + "loss/reg": 0.00447199959307909, + "step": 985 + }, + { + "epoch": 0.12325, + "grad_norm": 2.2619707584381104, + "grad_norm_var": 0.18524330473807685, + "learning_rate": 0.0001, + "loss": 1.0356, + "loss/crossentropy": 2.594536781311035, + "loss/hidden": 0.84765625, + "loss/logits": 0.1432015299797058, + "loss/reg": 0.0044701374135911465, + "step": 986 + }, + { + "epoch": 0.123375, + "grad_norm": 2.548429012298584, + "grad_norm_var": 0.1791892169034893, + "learning_rate": 0.0001, + "loss": 1.1041, + "loss/crossentropy": 2.4283149242401123, + "loss/hidden": 0.88671875, + "loss/logits": 0.17271637916564941, + "loss/reg": 0.004468323662877083, + "step": 987 + }, + { + "epoch": 0.1235, + "grad_norm": 1.967695951461792, + "grad_norm_var": 0.19588156260189724, + "learning_rate": 0.0001, + "loss": 1.1141, + "loss/crossentropy": 2.6532421112060547, + "loss/hidden": 0.9140625, + "loss/logits": 0.15532562136650085, + "loss/reg": 0.004466407001018524, + "step": 988 + }, + { + "epoch": 0.123625, + "grad_norm": 1.9731650352478027, + "grad_norm_var": 0.2091392377621749, + "learning_rate": 0.0001, + "loss": 0.9699, + "loss/crossentropy": 2.6200947761535645, + "loss/hidden": 0.796875, + "loss/logits": 0.1283724009990692, + "loss/reg": 0.004464692436158657, + "step": 989 + }, + { + "epoch": 0.12375, + "grad_norm": 2.0132744312286377, + "grad_norm_var": 0.21876841065467237, + "learning_rate": 0.0001, + "loss": 1.0469, + "loss/crossentropy": 2.4755969047546387, + "loss/hidden": 0.84375, + "loss/logits": 0.1585705578327179, + "loss/reg": 0.00446262676268816, + "step": 990 + }, + { + "epoch": 0.123875, + "grad_norm": 2.1972060203552246, + "grad_norm_var": 0.13632242813181178, + "learning_rate": 0.0001, + "loss": 1.0328, + "loss/crossentropy": 2.3555867671966553, + "loss/hidden": 0.85546875, + "loss/logits": 0.13275080919265747, + "loss/reg": 0.004460789728909731, + "step": 991 + }, + { + "epoch": 0.124, + "grad_norm": 3.8369944095611572, + "grad_norm_var": 0.2739970385829828, + "learning_rate": 0.0001, + "loss": 1.7339, + "loss/crossentropy": 2.2441718578338623, + "loss/hidden": 1.3984375, + "loss/logits": 0.2908269166946411, + "loss/reg": 0.00445876969024539, + "step": 992 + }, + { + "epoch": 0.124125, + "grad_norm": 2.786052703857422, + "grad_norm_var": 0.20731647630768535, + "learning_rate": 0.0001, + "loss": 1.1283, + "loss/crossentropy": 2.646028995513916, + "loss/hidden": 0.9140625, + "loss/logits": 0.1697021722793579, + "loss/reg": 0.004456843715161085, + "step": 993 + }, + { + "epoch": 0.12425, + "grad_norm": 2.5664174556732178, + "grad_norm_var": 0.20659147596586322, + "learning_rate": 0.0001, + "loss": 1.1557, + "loss/crossentropy": 2.3940696716308594, + "loss/hidden": 0.94921875, + "loss/logits": 0.16191905736923218, + "loss/reg": 0.004455073736608028, + "step": 994 + }, + { + "epoch": 0.124375, + "grad_norm": 2.2383058071136475, + "grad_norm_var": 0.20819184179118794, + "learning_rate": 0.0001, + "loss": 0.9266, + "loss/crossentropy": 2.499830722808838, + "loss/hidden": 0.76171875, + "loss/logits": 0.12036766111850739, + "loss/reg": 0.004453308880329132, + "step": 995 + }, + { + "epoch": 0.1245, + "grad_norm": 2.340665340423584, + "grad_norm_var": 0.207211701006554, + "learning_rate": 0.0001, + "loss": 1.1757, + "loss/crossentropy": 2.1450212001800537, + "loss/hidden": 0.96484375, + "loss/logits": 0.16630741953849792, + "loss/reg": 0.004451683722436428, + "step": 996 + }, + { + "epoch": 0.124625, + "grad_norm": 2.18617582321167, + "grad_norm_var": 0.20931483319411062, + "learning_rate": 0.0001, + "loss": 1.1681, + "loss/crossentropy": 2.145817518234253, + "loss/hidden": 0.94140625, + "loss/logits": 0.18221250176429749, + "loss/reg": 0.004450384993106127, + "step": 997 + }, + { + "epoch": 0.12475, + "grad_norm": 2.809575319290161, + "grad_norm_var": 0.218725690321934, + "learning_rate": 0.0001, + "loss": 1.0028, + "loss/crossentropy": 2.4370713233947754, + "loss/hidden": 0.81640625, + "loss/logits": 0.14186908304691315, + "loss/reg": 0.004448299296200275, + "step": 998 + }, + { + "epoch": 0.124875, + "grad_norm": 2.1984119415283203, + "grad_norm_var": 0.21377643978811706, + "learning_rate": 0.0001, + "loss": 1.3013, + "loss/crossentropy": 2.4677696228027344, + "loss/hidden": 1.078125, + "loss/logits": 0.1787460744380951, + "loss/reg": 0.004446576349437237, + "step": 999 + }, + { + "epoch": 0.125, + "grad_norm": 2.6378896236419678, + "grad_norm_var": 0.2111563626515095, + "learning_rate": 0.0001, + "loss": 1.1774, + "loss/crossentropy": 2.496150255203247, + "loss/hidden": 0.93359375, + "loss/logits": 0.19933247566223145, + "loss/reg": 0.004444715566933155, + "step": 1000 + }, + { + "epoch": 0.125125, + "grad_norm": 2.227482795715332, + "grad_norm_var": 0.213544138660752, + "learning_rate": 0.0001, + "loss": 1.0916, + "loss/crossentropy": 2.3874671459198, + "loss/hidden": 0.89453125, + "loss/logits": 0.15264838933944702, + "loss/reg": 0.004442666191607714, + "step": 1001 + }, + { + "epoch": 0.12525, + "grad_norm": 2.6360232830047607, + "grad_norm_var": 0.21419004520447135, + "learning_rate": 0.0001, + "loss": 1.2063, + "loss/crossentropy": 2.181767225265503, + "loss/hidden": 1.0234375, + "loss/logits": 0.1384468972682953, + "loss/reg": 0.004440974909812212, + "step": 1002 + }, + { + "epoch": 0.125375, + "grad_norm": 2.564113140106201, + "grad_norm_var": 0.2144159920830437, + "learning_rate": 0.0001, + "loss": 1.2358, + "loss/crossentropy": 2.3964531421661377, + "loss/hidden": 1.0234375, + "loss/logits": 0.16797608137130737, + "loss/reg": 0.00443902425467968, + "step": 1003 + }, + { + "epoch": 0.1255, + "grad_norm": 2.2647745609283447, + "grad_norm_var": 0.20087855485435188, + "learning_rate": 0.0001, + "loss": 1.1095, + "loss/crossentropy": 2.4876301288604736, + "loss/hidden": 0.90625, + "loss/logits": 0.1588534414768219, + "loss/reg": 0.004437169525772333, + "step": 1004 + }, + { + "epoch": 0.125625, + "grad_norm": 2.9978485107421875, + "grad_norm_var": 0.19899346976312698, + "learning_rate": 0.0001, + "loss": 1.0965, + "loss/crossentropy": 2.5442988872528076, + "loss/hidden": 0.8984375, + "loss/logits": 0.15375682711601257, + "loss/reg": 0.00443507032468915, + "step": 1005 + }, + { + "epoch": 0.12575, + "grad_norm": 3.734666585922241, + "grad_norm_var": 0.26529031932980823, + "learning_rate": 0.0001, + "loss": 1.3977, + "loss/crossentropy": 2.6442348957061768, + "loss/hidden": 1.109375, + "loss/logits": 0.24400165677070618, + "loss/reg": 0.0044328500516712666, + "step": 1006 + }, + { + "epoch": 0.125875, + "grad_norm": 2.0463345050811768, + "grad_norm_var": 0.27559841867322327, + "learning_rate": 0.0001, + "loss": 0.9624, + "loss/crossentropy": 2.744716167449951, + "loss/hidden": 0.77734375, + "loss/logits": 0.14069810509681702, + "loss/reg": 0.004430860280990601, + "step": 1007 + }, + { + "epoch": 0.126, + "grad_norm": 2.5981385707855225, + "grad_norm_var": 0.1720635201097591, + "learning_rate": 0.0001, + "loss": 1.1825, + "loss/crossentropy": 2.3077027797698975, + "loss/hidden": 0.95703125, + "loss/logits": 0.18115082383155823, + "loss/reg": 0.004428706131875515, + "step": 1008 + }, + { + "epoch": 0.126125, + "grad_norm": 2.8853800296783447, + "grad_norm_var": 0.17577912545770383, + "learning_rate": 0.0001, + "loss": 1.1727, + "loss/crossentropy": 3.0455784797668457, + "loss/hidden": 0.9375, + "loss/logits": 0.19088850915431976, + "loss/reg": 0.004426531493663788, + "step": 1009 + }, + { + "epoch": 0.12625, + "grad_norm": 3.3546810150146484, + "grad_norm_var": 0.2154711693487454, + "learning_rate": 0.0001, + "loss": 1.356, + "loss/crossentropy": 2.360203504562378, + "loss/hidden": 1.125, + "loss/logits": 0.18671754002571106, + "loss/reg": 0.004424425307661295, + "step": 1010 + }, + { + "epoch": 0.126375, + "grad_norm": 3.1136724948883057, + "grad_norm_var": 0.2202687348010554, + "learning_rate": 0.0001, + "loss": 1.5006, + "loss/crossentropy": 1.7926069498062134, + "loss/hidden": 1.2265625, + "loss/logits": 0.22981694340705872, + "loss/reg": 0.004422247409820557, + "step": 1011 + }, + { + "epoch": 0.1265, + "grad_norm": 2.9608895778656006, + "grad_norm_var": 0.2177180299988663, + "learning_rate": 0.0001, + "loss": 0.9969, + "loss/crossentropy": 2.716583251953125, + "loss/hidden": 0.80859375, + "loss/logits": 0.14412102103233337, + "loss/reg": 0.004419958218932152, + "step": 1012 + }, + { + "epoch": 0.126625, + "grad_norm": 2.627195358276367, + "grad_norm_var": 0.1996009545068233, + "learning_rate": 0.0001, + "loss": 1.2014, + "loss/crossentropy": 2.4077627658843994, + "loss/hidden": 0.96484375, + "loss/logits": 0.19235679507255554, + "loss/reg": 0.004417847376316786, + "step": 1013 + }, + { + "epoch": 0.12675, + "grad_norm": 2.6041698455810547, + "grad_norm_var": 0.20001931967983994, + "learning_rate": 0.0001, + "loss": 1.0357, + "loss/crossentropy": 2.6691579818725586, + "loss/hidden": 0.85546875, + "loss/logits": 0.13609513640403748, + "loss/reg": 0.0044156271032989025, + "step": 1014 + }, + { + "epoch": 0.126875, + "grad_norm": 2.6999282836914062, + "grad_norm_var": 0.18114680748988857, + "learning_rate": 0.0001, + "loss": 1.1957, + "loss/crossentropy": 2.2058401107788086, + "loss/hidden": 0.98828125, + "loss/logits": 0.16330450773239136, + "loss/reg": 0.004413560498505831, + "step": 1015 + }, + { + "epoch": 0.127, + "grad_norm": 2.9605417251586914, + "grad_norm_var": 0.1829561774470515, + "learning_rate": 0.0001, + "loss": 1.1698, + "loss/crossentropy": 2.5311779975891113, + "loss/hidden": 0.9296875, + "loss/logits": 0.19603696465492249, + "loss/reg": 0.0044115264900028706, + "step": 1016 + }, + { + "epoch": 0.127125, + "grad_norm": 3.1632213592529297, + "grad_norm_var": 0.17033870731749232, + "learning_rate": 0.0001, + "loss": 1.1371, + "loss/crossentropy": 2.628852367401123, + "loss/hidden": 0.9453125, + "loss/logits": 0.1477031111717224, + "loss/reg": 0.0044094715267419815, + "step": 1017 + }, + { + "epoch": 0.12725, + "grad_norm": 2.1563079357147217, + "grad_norm_var": 0.19685525865983494, + "learning_rate": 0.0001, + "loss": 1.156, + "loss/crossentropy": 2.51649808883667, + "loss/hidden": 0.93359375, + "loss/logits": 0.17829856276512146, + "loss/reg": 0.004407336004078388, + "step": 1018 + }, + { + "epoch": 0.127375, + "grad_norm": 2.838027238845825, + "grad_norm_var": 0.19308506502144737, + "learning_rate": 0.0001, + "loss": 1.1716, + "loss/crossentropy": 1.851514458656311, + "loss/hidden": 0.984375, + "loss/logits": 0.1431439369916916, + "loss/reg": 0.004405440296977758, + "step": 1019 + }, + { + "epoch": 0.1275, + "grad_norm": 3.7514472007751465, + "grad_norm_var": 0.2225789179284817, + "learning_rate": 0.0001, + "loss": 1.2725, + "loss/crossentropy": 2.4725522994995117, + "loss/hidden": 0.99609375, + "loss/logits": 0.23235675692558289, + "loss/reg": 0.004403635859489441, + "step": 1020 + }, + { + "epoch": 0.127625, + "grad_norm": 2.899569034576416, + "grad_norm_var": 0.22197611268337217, + "learning_rate": 0.0001, + "loss": 1.3405, + "loss/crossentropy": 2.393155336380005, + "loss/hidden": 1.109375, + "loss/logits": 0.187089741230011, + "loss/reg": 0.004401590209454298, + "step": 1021 + }, + { + "epoch": 0.12775, + "grad_norm": 3.2884371280670166, + "grad_norm_var": 0.18473910601510302, + "learning_rate": 0.0001, + "loss": 1.6095, + "loss/crossentropy": 2.3898444175720215, + "loss/hidden": 1.3203125, + "loss/logits": 0.24515338242053986, + "loss/reg": 0.004399486817419529, + "step": 1022 + }, + { + "epoch": 0.127875, + "grad_norm": 3.160599708557129, + "grad_norm_var": 0.13970793310639895, + "learning_rate": 0.0001, + "loss": 1.1549, + "loss/crossentropy": 2.3978278636932373, + "loss/hidden": 0.96484375, + "loss/logits": 0.14608745276927948, + "loss/reg": 0.004397205542773008, + "step": 1023 + }, + { + "epoch": 0.128, + "grad_norm": 3.4500718116760254, + "grad_norm_var": 0.14607975431924464, + "learning_rate": 0.0001, + "loss": 1.3923, + "loss/crossentropy": 1.8279949426651, + "loss/hidden": 1.1640625, + "loss/logits": 0.18429754674434662, + "loss/reg": 0.00439491355791688, + "step": 1024 + }, + { + "epoch": 0.128125, + "grad_norm": 3.99407696723938, + "grad_norm_var": 0.20675474417581274, + "learning_rate": 0.0001, + "loss": 1.5049, + "loss/crossentropy": 2.702716588973999, + "loss/hidden": 1.2265625, + "loss/logits": 0.23438766598701477, + "loss/reg": 0.004392672795802355, + "step": 1025 + }, + { + "epoch": 0.12825, + "grad_norm": 2.2928214073181152, + "grad_norm_var": 0.23606107387848377, + "learning_rate": 0.0001, + "loss": 1.2118, + "loss/crossentropy": 2.398599624633789, + "loss/hidden": 0.9921875, + "loss/logits": 0.17572686076164246, + "loss/reg": 0.004390507936477661, + "step": 1026 + }, + { + "epoch": 0.128375, + "grad_norm": 3.0106916427612305, + "grad_norm_var": 0.2351295893724907, + "learning_rate": 0.0001, + "loss": 1.2084, + "loss/crossentropy": 2.242279529571533, + "loss/hidden": 0.98046875, + "loss/logits": 0.18401223421096802, + "loss/reg": 0.004388165660202503, + "step": 1027 + }, + { + "epoch": 0.1285, + "grad_norm": 3.3926002979278564, + "grad_norm_var": 0.24503759295084418, + "learning_rate": 0.0001, + "loss": 1.2829, + "loss/crossentropy": 2.689535617828369, + "loss/hidden": 1.0625, + "loss/logits": 0.1765148937702179, + "loss/reg": 0.0043861158192157745, + "step": 1028 + }, + { + "epoch": 0.128625, + "grad_norm": 4.1095452308654785, + "grad_norm_var": 0.3051103506304455, + "learning_rate": 0.0001, + "loss": 1.4167, + "loss/crossentropy": 2.3078904151916504, + "loss/hidden": 1.1484375, + "loss/logits": 0.224439799785614, + "loss/reg": 0.004383730702102184, + "step": 1029 + }, + { + "epoch": 0.12875, + "grad_norm": 2.599076747894287, + "grad_norm_var": 0.30545598256471346, + "learning_rate": 0.0001, + "loss": 1.1392, + "loss/crossentropy": 2.7188191413879395, + "loss/hidden": 0.890625, + "loss/logits": 0.20472605526447296, + "loss/reg": 0.004381467588245869, + "step": 1030 + }, + { + "epoch": 0.128875, + "grad_norm": 2.078481435775757, + "grad_norm_var": 0.36360767736667393, + "learning_rate": 0.0001, + "loss": 1.0199, + "loss/crossentropy": 2.741642713546753, + "loss/hidden": 0.82421875, + "loss/logits": 0.15186432003974915, + "loss/reg": 0.00437910296022892, + "step": 1031 + }, + { + "epoch": 0.129, + "grad_norm": 5.224343776702881, + "grad_norm_var": 0.6503873685492408, + "learning_rate": 0.0001, + "loss": 1.3017, + "loss/crossentropy": 2.672182559967041, + "loss/hidden": 1.0390625, + "loss/logits": 0.21886497735977173, + "loss/reg": 0.0043770503252744675, + "step": 1032 + }, + { + "epoch": 0.129125, + "grad_norm": 3.197111129760742, + "grad_norm_var": 0.6502338467882434, + "learning_rate": 0.0001, + "loss": 1.3376, + "loss/crossentropy": 2.442309617996216, + "loss/hidden": 1.1015625, + "loss/logits": 0.19233301281929016, + "loss/reg": 0.004375019110739231, + "step": 1033 + }, + { + "epoch": 0.12925, + "grad_norm": 2.78690767288208, + "grad_norm_var": 0.5860556952241872, + "learning_rate": 0.0001, + "loss": 1.1761, + "loss/crossentropy": 2.430758237838745, + "loss/hidden": 0.9609375, + "loss/logits": 0.17138496041297913, + "loss/reg": 0.00437304237857461, + "step": 1034 + }, + { + "epoch": 0.129375, + "grad_norm": 2.5011260509490967, + "grad_norm_var": 0.6118626954588627, + "learning_rate": 0.0001, + "loss": 1.0916, + "loss/crossentropy": 2.541623115539551, + "loss/hidden": 0.89453125, + "loss/logits": 0.15334823727607727, + "loss/reg": 0.004370801616460085, + "step": 1035 + }, + { + "epoch": 0.1295, + "grad_norm": 2.1834847927093506, + "grad_norm_var": 0.6572482832048148, + "learning_rate": 0.0001, + "loss": 1.002, + "loss/crossentropy": 2.252703905105591, + "loss/hidden": 0.81640625, + "loss/logits": 0.1418866515159607, + "loss/reg": 0.004368768073618412, + "step": 1036 + }, + { + "epoch": 0.129625, + "grad_norm": 3.4379196166992188, + "grad_norm_var": 0.6584227357505256, + "learning_rate": 0.0001, + "loss": 1.2578, + "loss/crossentropy": 2.5080173015594482, + "loss/hidden": 1.03125, + "loss/logits": 0.18284769356250763, + "loss/reg": 0.004366564564406872, + "step": 1037 + }, + { + "epoch": 0.12975, + "grad_norm": 2.793656587600708, + "grad_norm_var": 0.6658574542034102, + "learning_rate": 0.0001, + "loss": 1.5344, + "loss/crossentropy": 2.028933048248291, + "loss/hidden": 1.265625, + "loss/logits": 0.22508540749549866, + "loss/reg": 0.004364544991403818, + "step": 1038 + }, + { + "epoch": 0.129875, + "grad_norm": 2.5851385593414307, + "grad_norm_var": 0.6848422923307773, + "learning_rate": 0.0001, + "loss": 1.1177, + "loss/crossentropy": 2.3001692295074463, + "loss/hidden": 0.9140625, + "loss/logits": 0.16001108288764954, + "loss/reg": 0.0043626632541418076, + "step": 1039 + }, + { + "epoch": 0.13, + "grad_norm": 3.0864601135253906, + "grad_norm_var": 0.6762458829728395, + "learning_rate": 0.0001, + "loss": 1.2109, + "loss/crossentropy": 2.3810689449310303, + "loss/hidden": 0.98828125, + "loss/logits": 0.17900380492210388, + "loss/reg": 0.0043608080595731735, + "step": 1040 + }, + { + "epoch": 0.130125, + "grad_norm": 2.6737496852874756, + "grad_norm_var": 0.6242103012797673, + "learning_rate": 0.0001, + "loss": 1.3919, + "loss/crossentropy": 2.0557336807250977, + "loss/hidden": 1.1328125, + "loss/logits": 0.2154931128025055, + "loss/reg": 0.004358771722763777, + "step": 1041 + }, + { + "epoch": 0.13025, + "grad_norm": 2.583439350128174, + "grad_norm_var": 0.6022000179942284, + "learning_rate": 0.0001, + "loss": 1.0657, + "loss/crossentropy": 2.530609607696533, + "loss/hidden": 0.875, + "loss/logits": 0.14716514945030212, + "loss/reg": 0.004356934688985348, + "step": 1042 + }, + { + "epoch": 0.130375, + "grad_norm": 2.3127689361572266, + "grad_norm_var": 0.6330661539787814, + "learning_rate": 0.0001, + "loss": 1.0062, + "loss/crossentropy": 2.5741043090820312, + "loss/hidden": 0.83203125, + "loss/logits": 0.1306590735912323, + "loss/reg": 0.004354908596724272, + "step": 1043 + }, + { + "epoch": 0.1305, + "grad_norm": 2.1028034687042236, + "grad_norm_var": 0.6646412556630875, + "learning_rate": 0.0001, + "loss": 1.005, + "loss/crossentropy": 2.2496836185455322, + "loss/hidden": 0.8359375, + "loss/logits": 0.12550613284111023, + "loss/reg": 0.004353053402155638, + "step": 1044 + }, + { + "epoch": 0.130625, + "grad_norm": 2.9018990993499756, + "grad_norm_var": 0.5595824371855532, + "learning_rate": 0.0001, + "loss": 1.0929, + "loss/crossentropy": 2.459836959838867, + "loss/hidden": 0.88671875, + "loss/logits": 0.16267293691635132, + "loss/reg": 0.004351151175796986, + "step": 1045 + }, + { + "epoch": 0.13075, + "grad_norm": 2.1798477172851562, + "grad_norm_var": 0.582665735357125, + "learning_rate": 0.0001, + "loss": 1.0568, + "loss/crossentropy": 2.393702983856201, + "loss/hidden": 0.85546875, + "loss/logits": 0.15785646438598633, + "loss/reg": 0.004349268972873688, + "step": 1046 + }, + { + "epoch": 0.130875, + "grad_norm": 3.7185163497924805, + "grad_norm_var": 0.5953326384247966, + "learning_rate": 0.0001, + "loss": 1.114, + "loss/crossentropy": 2.5875675678253174, + "loss/hidden": 0.91796875, + "loss/logits": 0.1525106430053711, + "loss/reg": 0.004347451031208038, + "step": 1047 + }, + { + "epoch": 0.131, + "grad_norm": 2.5062150955200195, + "grad_norm_var": 0.21175117035612606, + "learning_rate": 0.0001, + "loss": 1.2681, + "loss/crossentropy": 2.432136297225952, + "loss/hidden": 1.0390625, + "loss/logits": 0.18555600941181183, + "loss/reg": 0.004345426335930824, + "step": 1048 + }, + { + "epoch": 0.131125, + "grad_norm": 2.811901807785034, + "grad_norm_var": 0.19661994295048071, + "learning_rate": 0.0001, + "loss": 1.041, + "loss/crossentropy": 2.586029291152954, + "loss/hidden": 0.85546875, + "loss/logits": 0.14211300015449524, + "loss/reg": 0.004343352280557156, + "step": 1049 + }, + { + "epoch": 0.13125, + "grad_norm": 2.6836400032043457, + "grad_norm_var": 0.19606042121245745, + "learning_rate": 0.0001, + "loss": 1.1189, + "loss/crossentropy": 2.2877132892608643, + "loss/hidden": 0.92578125, + "loss/logits": 0.14965856075286865, + "loss/reg": 0.004341335967183113, + "step": 1050 + }, + { + "epoch": 0.131375, + "grad_norm": 3.004340171813965, + "grad_norm_var": 0.19911977640688458, + "learning_rate": 0.0001, + "loss": 0.9704, + "loss/crossentropy": 2.545414686203003, + "loss/hidden": 0.79296875, + "loss/logits": 0.13407136499881744, + "loss/reg": 0.004339275881648064, + "step": 1051 + }, + { + "epoch": 0.1315, + "grad_norm": 2.3175387382507324, + "grad_norm_var": 0.19060218969874068, + "learning_rate": 0.0001, + "loss": 1.2081, + "loss/crossentropy": 2.1056127548217773, + "loss/hidden": 1.0, + "loss/logits": 0.16477006673812866, + "loss/reg": 0.004337204620242119, + "step": 1052 + }, + { + "epoch": 0.131625, + "grad_norm": 2.9183707237243652, + "grad_norm_var": 0.15851891177443728, + "learning_rate": 0.0001, + "loss": 1.1596, + "loss/crossentropy": 2.5549087524414062, + "loss/hidden": 0.93359375, + "loss/logits": 0.18266820907592773, + "loss/reg": 0.004335105884820223, + "step": 1053 + }, + { + "epoch": 0.13175, + "grad_norm": 2.005140781402588, + "grad_norm_var": 0.18740257136220345, + "learning_rate": 0.0001, + "loss": 1.0446, + "loss/crossentropy": 2.5802786350250244, + "loss/hidden": 0.84765625, + "loss/logits": 0.1536553055047989, + "loss/reg": 0.00433309143409133, + "step": 1054 + }, + { + "epoch": 0.131875, + "grad_norm": 2.5984582901000977, + "grad_norm_var": 0.18729938166855695, + "learning_rate": 0.0001, + "loss": 1.1794, + "loss/crossentropy": 2.447845458984375, + "loss/hidden": 0.95703125, + "loss/logits": 0.17910084128379822, + "loss/reg": 0.0043309698812663555, + "step": 1055 + }, + { + "epoch": 0.132, + "grad_norm": 2.2338852882385254, + "grad_norm_var": 0.1831504662831539, + "learning_rate": 0.0001, + "loss": 1.0552, + "loss/crossentropy": 2.322484254837036, + "loss/hidden": 0.859375, + "loss/logits": 0.15252941846847534, + "loss/reg": 0.004328942392021418, + "step": 1056 + }, + { + "epoch": 0.132125, + "grad_norm": 2.383525848388672, + "grad_norm_var": 0.18544613518572697, + "learning_rate": 0.0001, + "loss": 1.2197, + "loss/crossentropy": 2.3245205879211426, + "loss/hidden": 1.0078125, + "loss/logits": 0.16857783496379852, + "loss/reg": 0.004326963797211647, + "step": 1057 + }, + { + "epoch": 0.13225, + "grad_norm": 3.00066876411438, + "grad_norm_var": 0.19657906255275273, + "learning_rate": 0.0001, + "loss": 1.3914, + "loss/crossentropy": 2.3864874839782715, + "loss/hidden": 1.109375, + "loss/logits": 0.23875172436237335, + "loss/reg": 0.004325190093368292, + "step": 1058 + }, + { + "epoch": 0.132375, + "grad_norm": 3.0184719562530518, + "grad_norm_var": 0.20021081345078998, + "learning_rate": 0.0001, + "loss": 1.1478, + "loss/crossentropy": 2.5393733978271484, + "loss/hidden": 0.91015625, + "loss/logits": 0.1943821907043457, + "loss/reg": 0.004323435481637716, + "step": 1059 + }, + { + "epoch": 0.1325, + "grad_norm": 4.587968826293945, + "grad_norm_var": 0.4052032312871603, + "learning_rate": 0.0001, + "loss": 1.2345, + "loss/crossentropy": 2.2922558784484863, + "loss/hidden": 1.015625, + "loss/logits": 0.17564015090465546, + "loss/reg": 0.00432176748290658, + "step": 1060 + }, + { + "epoch": 0.132625, + "grad_norm": 8.61639404296875, + "grad_norm_var": 2.5204572599607027, + "learning_rate": 0.0001, + "loss": 1.7566, + "loss/crossentropy": 2.5616378784179688, + "loss/hidden": 1.5, + "loss/logits": 0.21341325342655182, + "loss/reg": 0.0043197330087423325, + "step": 1061 + }, + { + "epoch": 0.13275, + "grad_norm": 3.9714202880859375, + "grad_norm_var": 2.486558816101914, + "learning_rate": 0.0001, + "loss": 1.449, + "loss/crossentropy": 2.4953112602233887, + "loss/hidden": 1.1875, + "loss/logits": 0.21827445924282074, + "loss/reg": 0.0043179914355278015, + "step": 1062 + }, + { + "epoch": 0.132875, + "grad_norm": 4.598790168762207, + "grad_norm_var": 2.587217087573132, + "learning_rate": 0.0001, + "loss": 1.46, + "loss/crossentropy": 2.4032106399536133, + "loss/hidden": 1.1796875, + "loss/logits": 0.23717749118804932, + "loss/reg": 0.004315928090363741, + "step": 1063 + }, + { + "epoch": 0.133, + "grad_norm": 2.2761240005493164, + "grad_norm_var": 2.6157540828571424, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.2180681228637695, + "loss/hidden": 0.91015625, + "loss/logits": 0.15992170572280884, + "loss/reg": 0.004314035642892122, + "step": 1064 + }, + { + "epoch": 0.133125, + "grad_norm": 5.624876022338867, + "grad_norm_var": 2.921925131142435, + "learning_rate": 0.0001, + "loss": 1.5429, + "loss/crossentropy": 2.552769899368286, + "loss/hidden": 1.2734375, + "loss/logits": 0.22636112570762634, + "loss/reg": 0.004311975557357073, + "step": 1065 + }, + { + "epoch": 0.13325, + "grad_norm": 2.610703945159912, + "grad_norm_var": 2.9300990717021325, + "learning_rate": 0.0001, + "loss": 1.152, + "loss/crossentropy": 2.693028450012207, + "loss/hidden": 0.92578125, + "loss/logits": 0.18307983875274658, + "loss/reg": 0.00430967565625906, + "step": 1066 + }, + { + "epoch": 0.133375, + "grad_norm": 2.063502311706543, + "grad_norm_var": 3.0457713158671065, + "learning_rate": 0.0001, + "loss": 0.9481, + "loss/crossentropy": 2.3977253437042236, + "loss/hidden": 0.7890625, + "loss/logits": 0.11597828567028046, + "loss/reg": 0.004307459108531475, + "step": 1067 + }, + { + "epoch": 0.1335, + "grad_norm": 2.2253830432891846, + "grad_norm_var": 3.059929800360324, + "learning_rate": 0.0001, + "loss": 1.1051, + "loss/crossentropy": 2.43769907951355, + "loss/hidden": 0.90234375, + "loss/logits": 0.1597452163696289, + "loss/reg": 0.0043051764369010925, + "step": 1068 + }, + { + "epoch": 0.133625, + "grad_norm": 2.5021073818206787, + "grad_norm_var": 3.0986482846073766, + "learning_rate": 0.0001, + "loss": 1.213, + "loss/crossentropy": 2.4540703296661377, + "loss/hidden": 0.99609375, + "loss/logits": 0.1738748550415039, + "loss/reg": 0.004303151275962591, + "step": 1069 + }, + { + "epoch": 0.13375, + "grad_norm": 2.6333425045013428, + "grad_norm_var": 3.006911696263074, + "learning_rate": 0.0001, + "loss": 1.1423, + "loss/crossentropy": 2.481794595718384, + "loss/hidden": 0.9375, + "loss/logits": 0.16177596151828766, + "loss/reg": 0.00430120388045907, + "step": 1070 + }, + { + "epoch": 0.133875, + "grad_norm": 4.2749247550964355, + "grad_norm_var": 2.995780076937819, + "learning_rate": 0.0001, + "loss": 1.232, + "loss/crossentropy": 2.376904249191284, + "loss/hidden": 1.046875, + "loss/logits": 0.1420845091342926, + "loss/reg": 0.004299336113035679, + "step": 1071 + }, + { + "epoch": 0.134, + "grad_norm": 3.749925374984741, + "grad_norm_var": 2.8756386517730372, + "learning_rate": 0.0001, + "loss": 1.6449, + "loss/crossentropy": 2.1668522357940674, + "loss/hidden": 1.3515625, + "loss/logits": 0.2503596842288971, + "loss/reg": 0.004297502338886261, + "step": 1072 + }, + { + "epoch": 0.134125, + "grad_norm": 2.394890069961548, + "grad_norm_var": 2.873752523963793, + "learning_rate": 0.0001, + "loss": 1.0081, + "loss/crossentropy": 2.4769225120544434, + "loss/hidden": 0.8203125, + "loss/logits": 0.14485791325569153, + "loss/reg": 0.004295617341995239, + "step": 1073 + }, + { + "epoch": 0.13425, + "grad_norm": 2.521232843399048, + "grad_norm_var": 2.9286262129865914, + "learning_rate": 0.0001, + "loss": 1.1533, + "loss/crossentropy": 2.302316665649414, + "loss/hidden": 0.9375, + "loss/logits": 0.1728420853614807, + "loss/reg": 0.004293751437216997, + "step": 1074 + }, + { + "epoch": 0.134375, + "grad_norm": 3.0982587337493896, + "grad_norm_var": 2.92279106991038, + "learning_rate": 0.0001, + "loss": 1.0719, + "loss/crossentropy": 2.5007832050323486, + "loss/hidden": 0.8828125, + "loss/logits": 0.14618419110774994, + "loss/reg": 0.004291870631277561, + "step": 1075 + }, + { + "epoch": 0.1345, + "grad_norm": 2.1802334785461426, + "grad_norm_var": 2.9709529639568184, + "learning_rate": 0.0001, + "loss": 1.0165, + "loss/crossentropy": 2.736433982849121, + "loss/hidden": 0.83984375, + "loss/logits": 0.1337248980998993, + "loss/reg": 0.00428979704156518, + "step": 1076 + }, + { + "epoch": 0.134625, + "grad_norm": 2.309565782546997, + "grad_norm_var": 1.1199522794543773, + "learning_rate": 0.0001, + "loss": 1.1145, + "loss/crossentropy": 2.277212619781494, + "loss/hidden": 0.91796875, + "loss/logits": 0.1536218523979187, + "loss/reg": 0.004287887830287218, + "step": 1077 + }, + { + "epoch": 0.13475, + "grad_norm": 2.270759105682373, + "grad_norm_var": 1.0951157521634287, + "learning_rate": 0.0001, + "loss": 0.9921, + "loss/crossentropy": 2.6023480892181396, + "loss/hidden": 0.8046875, + "loss/logits": 0.14457595348358154, + "loss/reg": 0.00428583100438118, + "step": 1078 + }, + { + "epoch": 0.134875, + "grad_norm": 2.0448989868164062, + "grad_norm_var": 0.9441842031089095, + "learning_rate": 0.0001, + "loss": 1.0484, + "loss/crossentropy": 2.619910955429077, + "loss/hidden": 0.84765625, + "loss/logits": 0.15788725018501282, + "loss/reg": 0.0042838454246521, + "step": 1079 + }, + { + "epoch": 0.135, + "grad_norm": 2.8515126705169678, + "grad_norm_var": 0.9247776412201837, + "learning_rate": 0.0001, + "loss": 0.9959, + "loss/crossentropy": 2.4871740341186523, + "loss/hidden": 0.8203125, + "loss/logits": 0.13277903199195862, + "loss/reg": 0.004281722474843264, + "step": 1080 + }, + { + "epoch": 0.135125, + "grad_norm": 2.5293588638305664, + "grad_norm_var": 0.3720854176506897, + "learning_rate": 0.0001, + "loss": 1.0284, + "loss/crossentropy": 2.8959267139434814, + "loss/hidden": 0.828125, + "loss/logits": 0.15750843286514282, + "loss/reg": 0.004279691725969315, + "step": 1081 + }, + { + "epoch": 0.13525, + "grad_norm": 2.639998197555542, + "grad_norm_var": 0.37201959594675976, + "learning_rate": 0.0001, + "loss": 1.1434, + "loss/crossentropy": 2.4725499153137207, + "loss/hidden": 0.93359375, + "loss/logits": 0.16706131398677826, + "loss/reg": 0.0042777759954333305, + "step": 1082 + }, + { + "epoch": 0.135375, + "grad_norm": 2.474238157272339, + "grad_norm_var": 0.35082104566976846, + "learning_rate": 0.0001, + "loss": 0.9969, + "loss/crossentropy": 2.4064362049102783, + "loss/hidden": 0.81640625, + "loss/logits": 0.13769997656345367, + "loss/reg": 0.004275754559785128, + "step": 1083 + }, + { + "epoch": 0.1355, + "grad_norm": 5.083343982696533, + "grad_norm_var": 0.6923522790607459, + "learning_rate": 0.0001, + "loss": 1.1973, + "loss/crossentropy": 2.635760545730591, + "loss/hidden": 0.98828125, + "loss/logits": 0.16631773114204407, + "loss/reg": 0.0042738220654428005, + "step": 1084 + }, + { + "epoch": 0.135625, + "grad_norm": 2.9828197956085205, + "grad_norm_var": 0.6846627645265992, + "learning_rate": 0.0001, + "loss": 1.4219, + "loss/crossentropy": 2.64700984954834, + "loss/hidden": 1.171875, + "loss/logits": 0.20732998847961426, + "loss/reg": 0.0042719184421002865, + "step": 1085 + }, + { + "epoch": 0.13575, + "grad_norm": 2.5733866691589355, + "grad_norm_var": 0.6868389075343996, + "learning_rate": 0.0001, + "loss": 1.0162, + "loss/crossentropy": 2.6855356693267822, + "loss/hidden": 0.83203125, + "loss/logits": 0.14148542284965515, + "loss/reg": 0.004269769415259361, + "step": 1086 + }, + { + "epoch": 0.135875, + "grad_norm": 3.7361340522766113, + "grad_norm_var": 0.6043207840777595, + "learning_rate": 0.0001, + "loss": 1.6576, + "loss/crossentropy": 2.38763689994812, + "loss/hidden": 1.328125, + "loss/logits": 0.286837637424469, + "loss/reg": 0.004267562180757523, + "step": 1087 + }, + { + "epoch": 0.136, + "grad_norm": 2.4554789066314697, + "grad_norm_var": 0.5520046435601859, + "learning_rate": 0.0001, + "loss": 1.0165, + "loss/crossentropy": 2.697847366333008, + "loss/hidden": 0.8203125, + "loss/logits": 0.15349683165550232, + "loss/reg": 0.004265283700078726, + "step": 1088 + }, + { + "epoch": 0.136125, + "grad_norm": 2.2833757400512695, + "grad_norm_var": 0.5581976166383347, + "learning_rate": 0.0001, + "loss": 1.1297, + "loss/crossentropy": 2.6681339740753174, + "loss/hidden": 0.91796875, + "loss/logits": 0.16904997825622559, + "loss/reg": 0.004263162147253752, + "step": 1089 + }, + { + "epoch": 0.13625, + "grad_norm": 2.1826672554016113, + "grad_norm_var": 0.5757864160069344, + "learning_rate": 0.0001, + "loss": 1.0232, + "loss/crossentropy": 2.3187673091888428, + "loss/hidden": 0.84375, + "loss/logits": 0.136864572763443, + "loss/reg": 0.004261130001395941, + "step": 1090 + }, + { + "epoch": 0.136375, + "grad_norm": 3.739326238632202, + "grad_norm_var": 0.632863410677898, + "learning_rate": 0.0001, + "loss": 1.1344, + "loss/crossentropy": 2.590574264526367, + "loss/hidden": 0.9140625, + "loss/logits": 0.1777951866388321, + "loss/reg": 0.004259143024682999, + "step": 1091 + }, + { + "epoch": 0.1365, + "grad_norm": 2.1081202030181885, + "grad_norm_var": 0.6388693719171433, + "learning_rate": 0.0001, + "loss": 1.0255, + "loss/crossentropy": 2.8422060012817383, + "loss/hidden": 0.84375, + "loss/logits": 0.13917985558509827, + "loss/reg": 0.0042572119273245335, + "step": 1092 + }, + { + "epoch": 0.136625, + "grad_norm": 1.9327036142349243, + "grad_norm_var": 0.6707091951265027, + "learning_rate": 0.0001, + "loss": 0.9773, + "loss/crossentropy": 2.6250529289245605, + "loss/hidden": 0.796875, + "loss/logits": 0.13788098096847534, + "loss/reg": 0.004255138337612152, + "step": 1093 + }, + { + "epoch": 0.13675, + "grad_norm": 2.4659841060638428, + "grad_norm_var": 0.6607986154781931, + "learning_rate": 0.0001, + "loss": 1.159, + "loss/crossentropy": 2.2569518089294434, + "loss/hidden": 0.94140625, + "loss/logits": 0.1750330626964569, + "loss/reg": 0.004253007471561432, + "step": 1094 + }, + { + "epoch": 0.136875, + "grad_norm": 2.6554629802703857, + "grad_norm_var": 0.6262725765926574, + "learning_rate": 0.0001, + "loss": 1.3871, + "loss/crossentropy": 2.323014974594116, + "loss/hidden": 1.1015625, + "loss/logits": 0.2430596649646759, + "loss/reg": 0.004250808618962765, + "step": 1095 + }, + { + "epoch": 0.137, + "grad_norm": 2.722032070159912, + "grad_norm_var": 0.6263166142478738, + "learning_rate": 0.0001, + "loss": 1.0409, + "loss/crossentropy": 2.4675052165985107, + "loss/hidden": 0.86328125, + "loss/logits": 0.13518026471138, + "loss/reg": 0.004248757380992174, + "step": 1096 + }, + { + "epoch": 0.137125, + "grad_norm": 2.510000228881836, + "grad_norm_var": 0.6270005997929298, + "learning_rate": 0.0001, + "loss": 1.0616, + "loss/crossentropy": 2.3804080486297607, + "loss/hidden": 0.87109375, + "loss/logits": 0.14798954129219055, + "loss/reg": 0.004246733151376247, + "step": 1097 + }, + { + "epoch": 0.13725, + "grad_norm": 3.058847188949585, + "grad_norm_var": 0.6299195109389221, + "learning_rate": 0.0001, + "loss": 1.0296, + "loss/crossentropy": 2.7722012996673584, + "loss/hidden": 0.83984375, + "loss/logits": 0.1473253071308136, + "loss/reg": 0.004244515672326088, + "step": 1098 + }, + { + "epoch": 0.137375, + "grad_norm": 4.170520782470703, + "grad_norm_var": 0.7337604064260393, + "learning_rate": 0.0001, + "loss": 1.1115, + "loss/crossentropy": 3.227797508239746, + "loss/hidden": 0.8828125, + "loss/logits": 0.18627075850963593, + "loss/reg": 0.004242491442710161, + "step": 1099 + }, + { + "epoch": 0.1375, + "grad_norm": 2.20058536529541, + "grad_norm_var": 0.4201977001022351, + "learning_rate": 0.0001, + "loss": 1.0558, + "loss/crossentropy": 2.337442636489868, + "loss/hidden": 0.8671875, + "loss/logits": 0.14616578817367554, + "loss/reg": 0.004240325652062893, + "step": 1100 + }, + { + "epoch": 0.137625, + "grad_norm": 2.378139019012451, + "grad_norm_var": 0.42315778530047454, + "learning_rate": 0.0001, + "loss": 0.9849, + "loss/crossentropy": 2.418541431427002, + "loss/hidden": 0.82421875, + "loss/logits": 0.1183251217007637, + "loss/reg": 0.00423810537904501, + "step": 1101 + }, + { + "epoch": 0.13775, + "grad_norm": 2.4013891220092773, + "grad_norm_var": 0.42787131976948645, + "learning_rate": 0.0001, + "loss": 1.044, + "loss/crossentropy": 2.642296552658081, + "loss/hidden": 0.8359375, + "loss/logits": 0.16568773984909058, + "loss/reg": 0.004236077889800072, + "step": 1102 + }, + { + "epoch": 0.137875, + "grad_norm": 2.395822286605835, + "grad_norm_var": 0.35275757091918025, + "learning_rate": 0.0001, + "loss": 1.0828, + "loss/crossentropy": 2.457338571548462, + "loss/hidden": 0.8828125, + "loss/logits": 0.15768851339817047, + "loss/reg": 0.004234058782458305, + "step": 1103 + }, + { + "epoch": 0.138, + "grad_norm": 2.5931308269500732, + "grad_norm_var": 0.35121999529942605, + "learning_rate": 0.0001, + "loss": 1.2445, + "loss/crossentropy": 2.370553731918335, + "loss/hidden": 1.015625, + "loss/logits": 0.1865496039390564, + "loss/reg": 0.004232100211083889, + "step": 1104 + }, + { + "epoch": 0.138125, + "grad_norm": 2.5963783264160156, + "grad_norm_var": 0.3436125305875331, + "learning_rate": 0.0001, + "loss": 0.9801, + "loss/crossentropy": 2.5301551818847656, + "loss/hidden": 0.80078125, + "loss/logits": 0.1369716078042984, + "loss/reg": 0.004230163525789976, + "step": 1105 + }, + { + "epoch": 0.13825, + "grad_norm": 2.951883316040039, + "grad_norm_var": 0.3345145438296379, + "learning_rate": 0.0001, + "loss": 0.9827, + "loss/crossentropy": 3.0806362628936768, + "loss/hidden": 0.8046875, + "loss/logits": 0.13572098314762115, + "loss/reg": 0.0042281243950128555, + "step": 1106 + }, + { + "epoch": 0.138375, + "grad_norm": 2.111954927444458, + "grad_norm_var": 0.2701844296523925, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.3894336223602295, + "loss/hidden": 0.984375, + "loss/logits": 0.1716623604297638, + "loss/reg": 0.004226126708090305, + "step": 1107 + }, + { + "epoch": 0.1385, + "grad_norm": 3.0929460525512695, + "grad_norm_var": 0.2690614225265311, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.3786656856536865, + "loss/hidden": 1.0, + "loss/logits": 0.15606652200222015, + "loss/reg": 0.004224089439958334, + "step": 1108 + }, + { + "epoch": 0.138625, + "grad_norm": 3.325866460800171, + "grad_norm_var": 0.25900974055676873, + "learning_rate": 0.0001, + "loss": 1.0968, + "loss/crossentropy": 2.668332099914551, + "loss/hidden": 0.8671875, + "loss/logits": 0.1874256730079651, + "loss/reg": 0.00422210618853569, + "step": 1109 + }, + { + "epoch": 0.13875, + "grad_norm": 3.774113178253174, + "grad_norm_var": 0.32044570279647266, + "learning_rate": 0.0001, + "loss": 0.9994, + "loss/crossentropy": 2.6192150115966797, + "loss/hidden": 0.8203125, + "loss/logits": 0.13685137033462524, + "loss/reg": 0.004220074508339167, + "step": 1110 + }, + { + "epoch": 0.138875, + "grad_norm": 3.656229019165039, + "grad_norm_var": 0.362595306683378, + "learning_rate": 0.0001, + "loss": 1.1159, + "loss/crossentropy": 2.3791658878326416, + "loss/hidden": 0.91796875, + "loss/logits": 0.1557137668132782, + "loss/reg": 0.004218076355755329, + "step": 1111 + }, + { + "epoch": 0.139, + "grad_norm": 2.500005006790161, + "grad_norm_var": 0.37009339748612907, + "learning_rate": 0.0001, + "loss": 1.1123, + "loss/crossentropy": 2.2807750701904297, + "loss/hidden": 0.89453125, + "loss/logits": 0.17557448148727417, + "loss/reg": 0.004216110333800316, + "step": 1112 + }, + { + "epoch": 0.139125, + "grad_norm": 2.6660091876983643, + "grad_norm_var": 0.36438900758074033, + "learning_rate": 0.0001, + "loss": 1.0914, + "loss/crossentropy": 2.6014909744262695, + "loss/hidden": 0.89453125, + "loss/logits": 0.1547611951828003, + "loss/reg": 0.004214086104184389, + "step": 1113 + }, + { + "epoch": 0.13925, + "grad_norm": 2.3377296924591064, + "grad_norm_var": 0.37845468238227015, + "learning_rate": 0.0001, + "loss": 1.1139, + "loss/crossentropy": 2.7464191913604736, + "loss/hidden": 0.90234375, + "loss/logits": 0.16947275400161743, + "loss/reg": 0.004212013445794582, + "step": 1114 + }, + { + "epoch": 0.139375, + "grad_norm": 2.4301981925964355, + "grad_norm_var": 0.2548452172505712, + "learning_rate": 0.0001, + "loss": 1.072, + "loss/crossentropy": 2.5051262378692627, + "loss/hidden": 0.8828125, + "loss/logits": 0.14710885286331177, + "loss/reg": 0.0042099012061953545, + "step": 1115 + }, + { + "epoch": 0.1395, + "grad_norm": 2.815537452697754, + "grad_norm_var": 0.23644342440034408, + "learning_rate": 0.0001, + "loss": 1.1469, + "loss/crossentropy": 2.6059226989746094, + "loss/hidden": 0.92578125, + "loss/logits": 0.17900194227695465, + "loss/reg": 0.0042078145779669285, + "step": 1116 + }, + { + "epoch": 0.139625, + "grad_norm": 2.9746217727661133, + "grad_norm_var": 0.22897005663627562, + "learning_rate": 0.0001, + "loss": 1.1095, + "loss/crossentropy": 2.892439842224121, + "loss/hidden": 0.91015625, + "loss/logits": 0.15726403892040253, + "loss/reg": 0.004205791745334864, + "step": 1117 + }, + { + "epoch": 0.13975, + "grad_norm": 2.1311089992523193, + "grad_norm_var": 0.24750381735717988, + "learning_rate": 0.0001, + "loss": 1.0609, + "loss/crossentropy": 2.477149248123169, + "loss/hidden": 0.8671875, + "loss/logits": 0.15162619948387146, + "loss/reg": 0.004203863441944122, + "step": 1118 + }, + { + "epoch": 0.139875, + "grad_norm": 2.1927154064178467, + "grad_norm_var": 0.2602719277895896, + "learning_rate": 0.0001, + "loss": 1.0056, + "loss/crossentropy": 2.3840315341949463, + "loss/hidden": 0.8046875, + "loss/logits": 0.15886801481246948, + "loss/reg": 0.004202014300972223, + "step": 1119 + }, + { + "epoch": 0.14, + "grad_norm": 2.850994348526001, + "grad_norm_var": 0.25871108381456814, + "learning_rate": 0.0001, + "loss": 1.2744, + "loss/crossentropy": 2.935702085494995, + "loss/hidden": 1.0390625, + "loss/logits": 0.1933366060256958, + "loss/reg": 0.004200007766485214, + "step": 1120 + }, + { + "epoch": 0.140125, + "grad_norm": 2.7692806720733643, + "grad_norm_var": 0.2564497076881023, + "learning_rate": 0.0001, + "loss": 1.1429, + "loss/crossentropy": 2.305704355239868, + "loss/hidden": 0.89453125, + "loss/logits": 0.20640595257282257, + "loss/reg": 0.0041981167159974575, + "step": 1121 + }, + { + "epoch": 0.14025, + "grad_norm": 2.4898860454559326, + "grad_norm_var": 0.2595914437365072, + "learning_rate": 0.0001, + "loss": 0.9756, + "loss/crossentropy": 2.6060853004455566, + "loss/hidden": 0.7890625, + "loss/logits": 0.14455264806747437, + "loss/reg": 0.004196107853204012, + "step": 1122 + }, + { + "epoch": 0.140375, + "grad_norm": 31.69025421142578, + "grad_norm_var": 52.39364291838152, + "learning_rate": 0.0001, + "loss": 1.1367, + "loss/crossentropy": 2.816709518432617, + "loss/hidden": 0.9453125, + "loss/logits": 0.14948531985282898, + "loss/reg": 0.004194286651909351, + "step": 1123 + }, + { + "epoch": 0.1405, + "grad_norm": 2.6551873683929443, + "grad_norm_var": 52.493939083618166, + "learning_rate": 0.0001, + "loss": 1.1028, + "loss/crossentropy": 2.7032201290130615, + "loss/hidden": 0.88671875, + "loss/logits": 0.17413891851902008, + "loss/reg": 0.004192298278212547, + "step": 1124 + }, + { + "epoch": 0.140625, + "grad_norm": 2.712380886077881, + "grad_norm_var": 52.61994398728479, + "learning_rate": 0.0001, + "loss": 1.1739, + "loss/crossentropy": 2.622096300125122, + "loss/hidden": 0.95703125, + "loss/logits": 0.17497727274894714, + "loss/reg": 0.0041902982629835606, + "step": 1125 + }, + { + "epoch": 0.14075, + "grad_norm": 2.355632781982422, + "grad_norm_var": 52.890626023811876, + "learning_rate": 0.0001, + "loss": 0.9784, + "loss/crossentropy": 2.3552989959716797, + "loss/hidden": 0.79296875, + "loss/logits": 0.143496572971344, + "loss/reg": 0.004188500810414553, + "step": 1126 + }, + { + "epoch": 0.140875, + "grad_norm": 2.7159671783447266, + "grad_norm_var": 53.045613069983446, + "learning_rate": 0.0001, + "loss": 1.1407, + "loss/crossentropy": 2.45786190032959, + "loss/hidden": 0.93359375, + "loss/logits": 0.16524501144886017, + "loss/reg": 0.004186683334410191, + "step": 1127 + }, + { + "epoch": 0.141, + "grad_norm": 3.830094575881958, + "grad_norm_var": 52.820475932071744, + "learning_rate": 0.0001, + "loss": 1.0811, + "loss/crossentropy": 2.1976234912872314, + "loss/hidden": 0.88671875, + "loss/logits": 0.1525820791721344, + "loss/reg": 0.004184682387858629, + "step": 1128 + }, + { + "epoch": 0.141125, + "grad_norm": 2.6621882915496826, + "grad_norm_var": 52.82139900035407, + "learning_rate": 0.0001, + "loss": 1.0459, + "loss/crossentropy": 2.539736270904541, + "loss/hidden": 0.81640625, + "loss/logits": 0.1876693218946457, + "loss/reg": 0.004182685166597366, + "step": 1129 + }, + { + "epoch": 0.14125, + "grad_norm": 2.478451728820801, + "grad_norm_var": 52.78251904082665, + "learning_rate": 0.0001, + "loss": 1.0815, + "loss/crossentropy": 2.4196884632110596, + "loss/hidden": 0.88671875, + "loss/logits": 0.1529858261346817, + "loss/reg": 0.00418076990172267, + "step": 1130 + }, + { + "epoch": 0.141375, + "grad_norm": 3.299400568008423, + "grad_norm_var": 52.59163994639377, + "learning_rate": 0.0001, + "loss": 1.2759, + "loss/crossentropy": 2.5420329570770264, + "loss/hidden": 1.0625, + "loss/logits": 0.17164292931556702, + "loss/reg": 0.004178792238235474, + "step": 1131 + }, + { + "epoch": 0.1415, + "grad_norm": 2.3637001514434814, + "grad_norm_var": 52.70822859008106, + "learning_rate": 0.0001, + "loss": 1.1613, + "loss/crossentropy": 2.6717777252197266, + "loss/hidden": 0.9453125, + "loss/logits": 0.17417655885219574, + "loss/reg": 0.004176879767328501, + "step": 1132 + }, + { + "epoch": 0.141625, + "grad_norm": 2.500570058822632, + "grad_norm_var": 52.819367266798494, + "learning_rate": 0.0001, + "loss": 0.9633, + "loss/crossentropy": 2.6206371784210205, + "loss/hidden": 0.78515625, + "loss/logits": 0.13640643656253815, + "loss/reg": 0.004175043664872646, + "step": 1133 + }, + { + "epoch": 0.14175, + "grad_norm": 2.323843479156494, + "grad_norm_var": 52.76129867971668, + "learning_rate": 0.0001, + "loss": 1.0942, + "loss/crossentropy": 2.5218851566314697, + "loss/hidden": 0.8984375, + "loss/logits": 0.15399572253227234, + "loss/reg": 0.004173224791884422, + "step": 1134 + }, + { + "epoch": 0.141875, + "grad_norm": 2.0677201747894287, + "grad_norm_var": 52.80061443559793, + "learning_rate": 0.0001, + "loss": 1.0591, + "loss/crossentropy": 2.580735683441162, + "loss/hidden": 0.87109375, + "loss/logits": 0.14630158245563507, + "loss/reg": 0.0041715288534760475, + "step": 1135 + }, + { + "epoch": 0.142, + "grad_norm": 2.8460326194763184, + "grad_norm_var": 52.80169720296209, + "learning_rate": 0.0001, + "loss": 1.2675, + "loss/crossentropy": 2.315237283706665, + "loss/hidden": 1.0625, + "loss/logits": 0.16331645846366882, + "loss/reg": 0.0041695088148117065, + "step": 1136 + }, + { + "epoch": 0.142125, + "grad_norm": 2.169602632522583, + "grad_norm_var": 52.96135990851253, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.5278215408325195, + "loss/hidden": 0.8828125, + "loss/logits": 0.1388036012649536, + "loss/reg": 0.004167445003986359, + "step": 1137 + }, + { + "epoch": 0.14225, + "grad_norm": 2.01615047454834, + "grad_norm_var": 53.099042280735006, + "learning_rate": 0.0001, + "loss": 0.9819, + "loss/crossentropy": 2.4882190227508545, + "loss/hidden": 0.8046875, + "loss/logits": 0.13553820550441742, + "loss/reg": 0.004165465943515301, + "step": 1138 + }, + { + "epoch": 0.142375, + "grad_norm": 2.4339070320129395, + "grad_norm_var": 0.2098356411642726, + "learning_rate": 0.0001, + "loss": 0.9859, + "loss/crossentropy": 2.644569158554077, + "loss/hidden": 0.80859375, + "loss/logits": 0.13571619987487793, + "loss/reg": 0.00416343891993165, + "step": 1139 + }, + { + "epoch": 0.1425, + "grad_norm": 2.3045334815979004, + "grad_norm_var": 0.2144459690924632, + "learning_rate": 0.0001, + "loss": 1.061, + "loss/crossentropy": 2.491943597793579, + "loss/hidden": 0.859375, + "loss/logits": 0.16004428267478943, + "loss/reg": 0.004161354620009661, + "step": 1140 + }, + { + "epoch": 0.142625, + "grad_norm": 2.173408031463623, + "grad_norm_var": 0.22219091176189235, + "learning_rate": 0.0001, + "loss": 1.0067, + "loss/crossentropy": 2.374779224395752, + "loss/hidden": 0.8203125, + "loss/logits": 0.14481596648693085, + "loss/reg": 0.004159385804086924, + "step": 1141 + }, + { + "epoch": 0.14275, + "grad_norm": 3.0710461139678955, + "grad_norm_var": 0.23718192859111392, + "learning_rate": 0.0001, + "loss": 1.0164, + "loss/crossentropy": 2.6076908111572266, + "loss/hidden": 0.83984375, + "loss/logits": 0.13495643436908722, + "loss/reg": 0.004157309886068106, + "step": 1142 + }, + { + "epoch": 0.142875, + "grad_norm": 3.763521432876587, + "grad_norm_var": 0.3249627427406568, + "learning_rate": 0.0001, + "loss": 1.1313, + "loss/crossentropy": 2.3016669750213623, + "loss/hidden": 0.92578125, + "loss/logits": 0.16395646333694458, + "loss/reg": 0.004155360162258148, + "step": 1143 + }, + { + "epoch": 0.143, + "grad_norm": 8.867222785949707, + "grad_norm_var": 2.70734825211357, + "learning_rate": 0.0001, + "loss": 1.3971, + "loss/crossentropy": 2.549853563308716, + "loss/hidden": 1.1171875, + "loss/logits": 0.23840667307376862, + "loss/reg": 0.0041534146293997765, + "step": 1144 + }, + { + "epoch": 0.143125, + "grad_norm": 4.079037189483643, + "grad_norm_var": 2.7767747967197427, + "learning_rate": 0.0001, + "loss": 1.2853, + "loss/crossentropy": 2.298107862472534, + "loss/hidden": 1.0859375, + "loss/logits": 0.15788918733596802, + "loss/reg": 0.004151403903961182, + "step": 1145 + }, + { + "epoch": 0.14325, + "grad_norm": 2.5469048023223877, + "grad_norm_var": 2.7718749700746383, + "learning_rate": 0.0001, + "loss": 1.1133, + "loss/crossentropy": 2.588602066040039, + "loss/hidden": 0.8984375, + "loss/logits": 0.17332546412944794, + "loss/reg": 0.004149466287344694, + "step": 1146 + }, + { + "epoch": 0.143375, + "grad_norm": 2.2044925689697266, + "grad_norm_var": 2.8106347308786437, + "learning_rate": 0.0001, + "loss": 0.9728, + "loss/crossentropy": 2.4486637115478516, + "loss/hidden": 0.79296875, + "loss/logits": 0.13837072253227234, + "loss/reg": 0.004147485829889774, + "step": 1147 + }, + { + "epoch": 0.1435, + "grad_norm": 2.0629138946533203, + "grad_norm_var": 2.841135428686917, + "learning_rate": 0.0001, + "loss": 1.0498, + "loss/crossentropy": 2.611081123352051, + "loss/hidden": 0.8515625, + "loss/logits": 0.1567818820476532, + "loss/reg": 0.004145504906773567, + "step": 1148 + }, + { + "epoch": 0.143625, + "grad_norm": 2.686124324798584, + "grad_norm_var": 2.8318111276034035, + "learning_rate": 0.0001, + "loss": 1.1465, + "loss/crossentropy": 2.3890397548675537, + "loss/hidden": 0.9453125, + "loss/logits": 0.1597655862569809, + "loss/reg": 0.004143500700592995, + "step": 1149 + }, + { + "epoch": 0.14375, + "grad_norm": 2.1094777584075928, + "grad_norm_var": 2.8533239929343903, + "learning_rate": 0.0001, + "loss": 1.1325, + "loss/crossentropy": 2.349745273590088, + "loss/hidden": 0.91796875, + "loss/logits": 0.17312946915626526, + "loss/reg": 0.0041414061561226845, + "step": 1150 + }, + { + "epoch": 0.143875, + "grad_norm": 2.2789037227630615, + "grad_norm_var": 2.830912674059921, + "learning_rate": 0.0001, + "loss": 1.0119, + "loss/crossentropy": 2.482297897338867, + "loss/hidden": 0.828125, + "loss/logits": 0.14236654341220856, + "loss/reg": 0.004139502998441458, + "step": 1151 + }, + { + "epoch": 0.144, + "grad_norm": 2.3409323692321777, + "grad_norm_var": 2.855599485961874, + "learning_rate": 0.0001, + "loss": 1.0217, + "loss/crossentropy": 2.4027068614959717, + "loss/hidden": 0.8359375, + "loss/logits": 0.1443997174501419, + "loss/reg": 0.004137733485549688, + "step": 1152 + }, + { + "epoch": 0.144125, + "grad_norm": 3.2646842002868652, + "grad_norm_var": 2.8174411429914015, + "learning_rate": 0.0001, + "loss": 1.1094, + "loss/crossentropy": 2.5013201236724854, + "loss/hidden": 0.91015625, + "loss/logits": 0.15787330269813538, + "loss/reg": 0.004135794471949339, + "step": 1153 + }, + { + "epoch": 0.14425, + "grad_norm": 3.01409912109375, + "grad_norm_var": 2.747083786295129, + "learning_rate": 0.0001, + "loss": 1.4099, + "loss/crossentropy": 2.341641426086426, + "loss/hidden": 1.171875, + "loss/logits": 0.1966913640499115, + "loss/reg": 0.004133842419832945, + "step": 1154 + }, + { + "epoch": 0.144375, + "grad_norm": 2.1998982429504395, + "grad_norm_var": 2.770511502568856, + "learning_rate": 0.0001, + "loss": 1.027, + "loss/crossentropy": 2.67824649810791, + "loss/hidden": 0.8359375, + "loss/logits": 0.1497696489095688, + "loss/reg": 0.004131934605538845, + "step": 1155 + }, + { + "epoch": 0.1445, + "grad_norm": 2.5142602920532227, + "grad_norm_var": 2.7521224578865087, + "learning_rate": 0.0001, + "loss": 1.1456, + "loss/crossentropy": 2.4931130409240723, + "loss/hidden": 0.921875, + "loss/logits": 0.18238465487957, + "loss/reg": 0.004129941575229168, + "step": 1156 + }, + { + "epoch": 0.144625, + "grad_norm": 2.1878349781036377, + "grad_norm_var": 2.750403944498737, + "learning_rate": 0.0001, + "loss": 1.0802, + "loss/crossentropy": 2.387873888015747, + "loss/hidden": 0.890625, + "loss/logits": 0.1483183354139328, + "loss/reg": 0.004127953667193651, + "step": 1157 + }, + { + "epoch": 0.14475, + "grad_norm": 2.688089609146118, + "grad_norm_var": 2.759744220974267, + "learning_rate": 0.0001, + "loss": 1.0354, + "loss/crossentropy": 2.6154494285583496, + "loss/hidden": 0.85546875, + "loss/logits": 0.13863371312618256, + "loss/reg": 0.004126036539673805, + "step": 1158 + }, + { + "epoch": 0.144875, + "grad_norm": 3.1651244163513184, + "grad_norm_var": 2.7252368192156586, + "learning_rate": 0.0001, + "loss": 1.2689, + "loss/crossentropy": 2.3369271755218506, + "loss/hidden": 1.0234375, + "loss/logits": 0.20426858961582184, + "loss/reg": 0.004124056547880173, + "step": 1159 + }, + { + "epoch": 0.145, + "grad_norm": 2.437448740005493, + "grad_norm_var": 0.290374675783868, + "learning_rate": 0.0001, + "loss": 1.1068, + "loss/crossentropy": 2.446481227874756, + "loss/hidden": 0.91015625, + "loss/logits": 0.15547212958335876, + "loss/reg": 0.0041221086867153645, + "step": 1160 + }, + { + "epoch": 0.145125, + "grad_norm": 3.550361394882202, + "grad_norm_var": 0.20437982896584472, + "learning_rate": 0.0001, + "loss": 1.6847, + "loss/crossentropy": 2.6945204734802246, + "loss/hidden": 1.28125, + "loss/logits": 0.36227577924728394, + "loss/reg": 0.004120130091905594, + "step": 1161 + }, + { + "epoch": 0.14525, + "grad_norm": 2.150529384613037, + "grad_norm_var": 0.21585453142654387, + "learning_rate": 0.0001, + "loss": 1.0391, + "loss/crossentropy": 2.3554482460021973, + "loss/hidden": 0.84765625, + "loss/logits": 0.15022103488445282, + "loss/reg": 0.0041182260029017925, + "step": 1162 + }, + { + "epoch": 0.145375, + "grad_norm": 4.6839704513549805, + "grad_norm_var": 0.48472891056564626, + "learning_rate": 0.0001, + "loss": 1.5962, + "loss/crossentropy": 2.456437587738037, + "loss/hidden": 1.2890625, + "loss/logits": 0.2660132944583893, + "loss/reg": 0.004116271156817675, + "step": 1163 + }, + { + "epoch": 0.1455, + "grad_norm": 2.3992183208465576, + "grad_norm_var": 0.4628530155911977, + "learning_rate": 0.0001, + "loss": 1.0986, + "loss/crossentropy": 2.535423755645752, + "loss/hidden": 0.9140625, + "loss/logits": 0.14340469241142273, + "loss/reg": 0.0041144127026200294, + "step": 1164 + }, + { + "epoch": 0.145625, + "grad_norm": 2.455538034439087, + "grad_norm_var": 0.4675077175097224, + "learning_rate": 0.0001, + "loss": 1.1422, + "loss/crossentropy": 2.5659542083740234, + "loss/hidden": 0.9296875, + "loss/logits": 0.17135412991046906, + "loss/reg": 0.004112581722438335, + "step": 1165 + }, + { + "epoch": 0.14575, + "grad_norm": 3.7746567726135254, + "grad_norm_var": 0.5063635000809102, + "learning_rate": 0.0001, + "loss": 1.4588, + "loss/crossentropy": 2.807483196258545, + "loss/hidden": 1.1953125, + "loss/logits": 0.22239741683006287, + "loss/reg": 0.004110958427190781, + "step": 1166 + }, + { + "epoch": 0.145875, + "grad_norm": 2.2929201126098633, + "grad_norm_var": 0.5053662377320952, + "learning_rate": 0.0001, + "loss": 1.1351, + "loss/crossentropy": 2.5097239017486572, + "loss/hidden": 0.921875, + "loss/logits": 0.17209036648273468, + "loss/reg": 0.004109338391572237, + "step": 1167 + }, + { + "epoch": 0.146, + "grad_norm": 4.034673690795898, + "grad_norm_var": 0.5764809506272021, + "learning_rate": 0.0001, + "loss": 1.4816, + "loss/crossentropy": 2.6598453521728516, + "loss/hidden": 1.1953125, + "loss/logits": 0.24519138038158417, + "loss/reg": 0.0041076927445828915, + "step": 1168 + }, + { + "epoch": 0.146125, + "grad_norm": 2.2866017818450928, + "grad_norm_var": 0.5920811915580522, + "learning_rate": 0.0001, + "loss": 1.177, + "loss/crossentropy": 2.3927576541900635, + "loss/hidden": 0.953125, + "loss/logits": 0.18283367156982422, + "loss/reg": 0.004106137901544571, + "step": 1169 + }, + { + "epoch": 0.14625, + "grad_norm": 2.899941921234131, + "grad_norm_var": 0.5906217092668515, + "learning_rate": 0.0001, + "loss": 1.1387, + "loss/crossentropy": 2.4218740463256836, + "loss/hidden": 0.92578125, + "loss/logits": 0.17189282178878784, + "loss/reg": 0.004104320891201496, + "step": 1170 + }, + { + "epoch": 0.146375, + "grad_norm": 2.9730427265167236, + "grad_norm_var": 0.5601848624373048, + "learning_rate": 0.0001, + "loss": 1.0993, + "loss/crossentropy": 2.669074296951294, + "loss/hidden": 0.90234375, + "loss/logits": 0.15591827034950256, + "loss/reg": 0.004102461040019989, + "step": 1171 + }, + { + "epoch": 0.1465, + "grad_norm": 3.55232572555542, + "grad_norm_var": 0.5733288711493515, + "learning_rate": 0.0001, + "loss": 1.2912, + "loss/crossentropy": 2.384329319000244, + "loss/hidden": 1.0234375, + "loss/logits": 0.22678744792938232, + "loss/reg": 0.004100624471902847, + "step": 1172 + }, + { + "epoch": 0.146625, + "grad_norm": 2.591209650039673, + "grad_norm_var": 0.541389636484242, + "learning_rate": 0.0001, + "loss": 1.1135, + "loss/crossentropy": 2.367767810821533, + "loss/hidden": 0.9375, + "loss/logits": 0.13497930765151978, + "loss/reg": 0.004098633769899607, + "step": 1173 + }, + { + "epoch": 0.14675, + "grad_norm": 2.9488017559051514, + "grad_norm_var": 0.534935103556153, + "learning_rate": 0.0001, + "loss": 1.4669, + "loss/crossentropy": 1.9882688522338867, + "loss/hidden": 1.25, + "loss/logits": 0.17595870792865753, + "loss/reg": 0.004096675664186478, + "step": 1174 + }, + { + "epoch": 0.146875, + "grad_norm": 2.837010145187378, + "grad_norm_var": 0.5349767501482856, + "learning_rate": 0.0001, + "loss": 1.1548, + "loss/crossentropy": 2.8100757598876953, + "loss/hidden": 0.97265625, + "loss/logits": 0.14116618037223816, + "loss/reg": 0.004094698466360569, + "step": 1175 + }, + { + "epoch": 0.147, + "grad_norm": 2.5989038944244385, + "grad_norm_var": 0.5246730089916675, + "learning_rate": 0.0001, + "loss": 1.2746, + "loss/crossentropy": 2.367872953414917, + "loss/hidden": 1.0703125, + "loss/logits": 0.16331195831298828, + "loss/reg": 0.004092712886631489, + "step": 1176 + }, + { + "epoch": 0.147125, + "grad_norm": 3.459014892578125, + "grad_norm_var": 0.5185139879820743, + "learning_rate": 0.0001, + "loss": 1.1223, + "loss/crossentropy": 2.533998727798462, + "loss/hidden": 0.92578125, + "loss/logits": 0.15561142563819885, + "loss/reg": 0.004090711008757353, + "step": 1177 + }, + { + "epoch": 0.14725, + "grad_norm": 5.084310054779053, + "grad_norm_var": 0.7256747423481064, + "learning_rate": 0.0001, + "loss": 1.2782, + "loss/crossentropy": 2.582167863845825, + "loss/hidden": 1.0625, + "loss/logits": 0.1748521625995636, + "loss/reg": 0.004088713321834803, + "step": 1178 + }, + { + "epoch": 0.147375, + "grad_norm": 2.778517961502075, + "grad_norm_var": 0.5703725263929961, + "learning_rate": 0.0001, + "loss": 1.0471, + "loss/crossentropy": 2.336113929748535, + "loss/hidden": 0.8515625, + "loss/logits": 0.15465718507766724, + "loss/reg": 0.004086779430508614, + "step": 1179 + }, + { + "epoch": 0.1475, + "grad_norm": 2.8686232566833496, + "grad_norm_var": 0.5427611216294432, + "learning_rate": 0.0001, + "loss": 1.1435, + "loss/crossentropy": 2.776197910308838, + "loss/hidden": 0.9453125, + "loss/logits": 0.1573391556739807, + "loss/reg": 0.0040848455391824245, + "step": 1180 + }, + { + "epoch": 0.147625, + "grad_norm": 2.511221170425415, + "grad_norm_var": 0.5382462121749844, + "learning_rate": 0.0001, + "loss": 1.0821, + "loss/crossentropy": 2.7198173999786377, + "loss/hidden": 0.87890625, + "loss/logits": 0.16234460473060608, + "loss/reg": 0.004082926083356142, + "step": 1181 + }, + { + "epoch": 0.14775, + "grad_norm": 2.1064679622650146, + "grad_norm_var": 0.5606094401847085, + "learning_rate": 0.0001, + "loss": 0.98, + "loss/crossentropy": 2.6126627922058105, + "loss/hidden": 0.8046875, + "loss/logits": 0.1345212161540985, + "loss/reg": 0.004080874379724264, + "step": 1182 + }, + { + "epoch": 0.147875, + "grad_norm": 4.242970943450928, + "grad_norm_var": 0.6172993082607185, + "learning_rate": 0.0001, + "loss": 1.2843, + "loss/crossentropy": 2.121168375015259, + "loss/hidden": 1.0703125, + "loss/logits": 0.1731598824262619, + "loss/reg": 0.0040789819322526455, + "step": 1183 + }, + { + "epoch": 0.148, + "grad_norm": 2.6352462768554688, + "grad_norm_var": 0.5673230040929977, + "learning_rate": 0.0001, + "loss": 1.0252, + "loss/crossentropy": 2.692413091659546, + "loss/hidden": 0.84375, + "loss/logits": 0.14067476987838745, + "loss/reg": 0.004077126272022724, + "step": 1184 + }, + { + "epoch": 0.148125, + "grad_norm": 2.905735731124878, + "grad_norm_var": 0.5304583396362827, + "learning_rate": 0.0001, + "loss": 1.2004, + "loss/crossentropy": 2.290217399597168, + "loss/hidden": 0.98046875, + "loss/logits": 0.17918136715888977, + "loss/reg": 0.004075322765856981, + "step": 1185 + }, + { + "epoch": 0.14825, + "grad_norm": 2.0793650150299072, + "grad_norm_var": 0.5902824998400082, + "learning_rate": 0.0001, + "loss": 0.949, + "loss/crossentropy": 2.605687379837036, + "loss/hidden": 0.77734375, + "loss/logits": 0.1309652477502823, + "loss/reg": 0.00407352764159441, + "step": 1186 + }, + { + "epoch": 0.148375, + "grad_norm": 2.730095624923706, + "grad_norm_var": 0.5951944585982081, + "learning_rate": 0.0001, + "loss": 1.018, + "loss/crossentropy": 2.399637222290039, + "loss/hidden": 0.84375, + "loss/logits": 0.13348934054374695, + "loss/reg": 0.004071622621268034, + "step": 1187 + }, + { + "epoch": 0.1485, + "grad_norm": 2.3118958473205566, + "grad_norm_var": 0.5992861461620653, + "learning_rate": 0.0001, + "loss": 0.969, + "loss/crossentropy": 2.583070993423462, + "loss/hidden": 0.7890625, + "loss/logits": 0.13926547765731812, + "loss/reg": 0.004069886170327663, + "step": 1188 + }, + { + "epoch": 0.148625, + "grad_norm": 2.281296491622925, + "grad_norm_var": 0.6187961724202968, + "learning_rate": 0.0001, + "loss": 1.1247, + "loss/crossentropy": 2.4077699184417725, + "loss/hidden": 0.91796875, + "loss/logits": 0.16607880592346191, + "loss/reg": 0.00406790804117918, + "step": 1189 + }, + { + "epoch": 0.14875, + "grad_norm": 3.76084041595459, + "grad_norm_var": 0.6654318302540614, + "learning_rate": 0.0001, + "loss": 1.3819, + "loss/crossentropy": 2.7298569679260254, + "loss/hidden": 1.0546875, + "loss/logits": 0.28656214475631714, + "loss/reg": 0.004065926186740398, + "step": 1190 + }, + { + "epoch": 0.148875, + "grad_norm": 3.076002597808838, + "grad_norm_var": 0.6654180683387788, + "learning_rate": 0.0001, + "loss": 1.605, + "loss/crossentropy": 1.9846049547195435, + "loss/hidden": 1.3125, + "loss/logits": 0.2518633008003235, + "loss/reg": 0.004063920117914677, + "step": 1191 + }, + { + "epoch": 0.149, + "grad_norm": 2.316555976867676, + "grad_norm_var": 0.6841604530042008, + "learning_rate": 0.0001, + "loss": 1.0979, + "loss/crossentropy": 2.3328921794891357, + "loss/hidden": 0.89453125, + "loss/logits": 0.16276977956295013, + "loss/reg": 0.004061955027282238, + "step": 1192 + }, + { + "epoch": 0.149125, + "grad_norm": 2.814150810241699, + "grad_norm_var": 0.6661064219784556, + "learning_rate": 0.0001, + "loss": 1.203, + "loss/crossentropy": 2.3084676265716553, + "loss/hidden": 0.984375, + "loss/logits": 0.17800584435462952, + "loss/reg": 0.004059869330376387, + "step": 1193 + }, + { + "epoch": 0.14925, + "grad_norm": 4.5728559494018555, + "grad_norm_var": 0.5339391843004021, + "learning_rate": 0.0001, + "loss": 1.235, + "loss/crossentropy": 2.786238431930542, + "loss/hidden": 1.0078125, + "loss/logits": 0.18664950132369995, + "loss/reg": 0.004058040212839842, + "step": 1194 + }, + { + "epoch": 0.149375, + "grad_norm": 2.267996311187744, + "grad_norm_var": 0.556761488955063, + "learning_rate": 0.0001, + "loss": 1.1708, + "loss/crossentropy": 2.1988203525543213, + "loss/hidden": 0.96484375, + "loss/logits": 0.1653607189655304, + "loss/reg": 0.004056154750287533, + "step": 1195 + }, + { + "epoch": 0.1495, + "grad_norm": 2.1123440265655518, + "grad_norm_var": 0.5898830056876873, + "learning_rate": 0.0001, + "loss": 0.9765, + "loss/crossentropy": 2.301168441772461, + "loss/hidden": 0.80859375, + "loss/logits": 0.12740209698677063, + "loss/reg": 0.004054322373121977, + "step": 1196 + }, + { + "epoch": 0.149625, + "grad_norm": 3.071465253829956, + "grad_norm_var": 0.5882785049222033, + "learning_rate": 0.0001, + "loss": 1.2782, + "loss/crossentropy": 2.6485557556152344, + "loss/hidden": 1.0234375, + "loss/logits": 0.21422387659549713, + "loss/reg": 0.004052514210343361, + "step": 1197 + }, + { + "epoch": 0.14975, + "grad_norm": 2.4168970584869385, + "grad_norm_var": 0.5643403352790185, + "learning_rate": 0.0001, + "loss": 1.0625, + "loss/crossentropy": 2.5202739238739014, + "loss/hidden": 0.87109375, + "loss/logits": 0.15086334943771362, + "loss/reg": 0.004050557967275381, + "step": 1198 + }, + { + "epoch": 0.149875, + "grad_norm": 2.1462841033935547, + "grad_norm_var": 0.44960492320894496, + "learning_rate": 0.0001, + "loss": 1.0988, + "loss/crossentropy": 2.435528039932251, + "loss/hidden": 0.90234375, + "loss/logits": 0.1560034155845642, + "loss/reg": 0.004048600792884827, + "step": 1199 + }, + { + "epoch": 0.15, + "grad_norm": 2.1528284549713135, + "grad_norm_var": 0.46951760615473387, + "learning_rate": 0.0001, + "loss": 1.3208, + "loss/crossentropy": 2.4528579711914062, + "loss/hidden": 1.078125, + "loss/logits": 0.2022087574005127, + "loss/reg": 0.0040466394275426865, + "step": 1200 + }, + { + "epoch": 0.150125, + "grad_norm": 2.827105760574341, + "grad_norm_var": 0.46762692410470286, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.634087562561035, + "loss/hidden": 0.88671875, + "loss/logits": 0.17087715864181519, + "loss/reg": 0.004044875968247652, + "step": 1201 + }, + { + "epoch": 0.15025, + "grad_norm": 2.268160104751587, + "grad_norm_var": 0.45464383775398576, + "learning_rate": 0.0001, + "loss": 1.0992, + "loss/crossentropy": 2.574035167694092, + "loss/hidden": 0.89453125, + "loss/logits": 0.16423508524894714, + "loss/reg": 0.004043125547468662, + "step": 1202 + }, + { + "epoch": 0.150375, + "grad_norm": 2.10304594039917, + "grad_norm_var": 0.4763194687664772, + "learning_rate": 0.0001, + "loss": 1.0253, + "loss/crossentropy": 2.686739444732666, + "loss/hidden": 0.83984375, + "loss/logits": 0.14499551057815552, + "loss/reg": 0.004041461274027824, + "step": 1203 + }, + { + "epoch": 0.1505, + "grad_norm": 2.8614940643310547, + "grad_norm_var": 0.46996517485336897, + "learning_rate": 0.0001, + "loss": 1.3262, + "loss/crossentropy": 2.327242136001587, + "loss/hidden": 1.09375, + "loss/logits": 0.1920863389968872, + "loss/reg": 0.004039805382490158, + "step": 1204 + }, + { + "epoch": 0.150625, + "grad_norm": 2.0004308223724365, + "grad_norm_var": 0.4902227797061412, + "learning_rate": 0.0001, + "loss": 0.9759, + "loss/crossentropy": 2.55458664894104, + "loss/hidden": 0.80078125, + "loss/logits": 0.13473068177700043, + "loss/reg": 0.00403786962851882, + "step": 1205 + }, + { + "epoch": 0.15075, + "grad_norm": 2.0800395011901855, + "grad_norm_var": 0.42300499990140544, + "learning_rate": 0.0001, + "loss": 1.0823, + "loss/crossentropy": 2.2755930423736572, + "loss/hidden": 0.89453125, + "loss/logits": 0.14743617177009583, + "loss/reg": 0.00403629383072257, + "step": 1206 + }, + { + "epoch": 0.150875, + "grad_norm": 2.8714046478271484, + "grad_norm_var": 0.41176251270089, + "learning_rate": 0.0001, + "loss": 1.0406, + "loss/crossentropy": 2.389319896697998, + "loss/hidden": 0.83984375, + "loss/logits": 0.16044044494628906, + "loss/reg": 0.004034355282783508, + "step": 1207 + }, + { + "epoch": 0.151, + "grad_norm": 2.136133909225464, + "grad_norm_var": 0.41953769445076433, + "learning_rate": 0.0001, + "loss": 1.0709, + "loss/crossentropy": 2.745492458343506, + "loss/hidden": 0.8828125, + "loss/logits": 0.14779764413833618, + "loss/reg": 0.004032687284052372, + "step": 1208 + }, + { + "epoch": 0.151125, + "grad_norm": 2.6204607486724854, + "grad_norm_var": 0.4149034970549467, + "learning_rate": 0.0001, + "loss": 1.0942, + "loss/crossentropy": 2.473696708679199, + "loss/hidden": 0.91015625, + "loss/logits": 0.14370602369308472, + "loss/reg": 0.004030975513160229, + "step": 1209 + }, + { + "epoch": 0.15125, + "grad_norm": 2.2523136138916016, + "grad_norm_var": 0.11994939680660437, + "learning_rate": 0.0001, + "loss": 1.1321, + "loss/crossentropy": 2.7228078842163086, + "loss/hidden": 0.9296875, + "loss/logits": 0.16212627291679382, + "loss/reg": 0.004029178526252508, + "step": 1210 + }, + { + "epoch": 0.151375, + "grad_norm": 3.1284432411193848, + "grad_norm_var": 0.15259538885302745, + "learning_rate": 0.0001, + "loss": 1.0887, + "loss/crossentropy": 2.81272029876709, + "loss/hidden": 0.88671875, + "loss/logits": 0.16168195009231567, + "loss/reg": 0.004027185495942831, + "step": 1211 + }, + { + "epoch": 0.1515, + "grad_norm": 2.160048246383667, + "grad_norm_var": 0.15065002461184704, + "learning_rate": 0.0001, + "loss": 0.9783, + "loss/crossentropy": 2.311624526977539, + "loss/hidden": 0.8046875, + "loss/logits": 0.13339203596115112, + "loss/reg": 0.0040252963081002235, + "step": 1212 + }, + { + "epoch": 0.151625, + "grad_norm": 2.9693639278411865, + "grad_norm_var": 0.14275322843417157, + "learning_rate": 0.0001, + "loss": 0.9642, + "loss/crossentropy": 2.664057970046997, + "loss/hidden": 0.7890625, + "loss/logits": 0.13490962982177734, + "loss/reg": 0.0040232837200164795, + "step": 1213 + }, + { + "epoch": 0.15175, + "grad_norm": 3.021174669265747, + "grad_norm_var": 0.16394313365960494, + "learning_rate": 0.0001, + "loss": 1.2676, + "loss/crossentropy": 2.6081109046936035, + "loss/hidden": 1.046875, + "loss/logits": 0.18048575520515442, + "loss/reg": 0.004021205008029938, + "step": 1214 + }, + { + "epoch": 0.151875, + "grad_norm": 2.3486392498016357, + "grad_norm_var": 0.15763551716868943, + "learning_rate": 0.0001, + "loss": 0.9701, + "loss/crossentropy": 2.4917783737182617, + "loss/hidden": 0.79296875, + "loss/logits": 0.13695400953292847, + "loss/reg": 0.004019314423203468, + "step": 1215 + }, + { + "epoch": 0.152, + "grad_norm": 2.184790849685669, + "grad_norm_var": 0.15627282346626145, + "learning_rate": 0.0001, + "loss": 1.0597, + "loss/crossentropy": 2.4108786582946777, + "loss/hidden": 0.86328125, + "loss/logits": 0.15620392560958862, + "loss/reg": 0.004017516039311886, + "step": 1216 + }, + { + "epoch": 0.152125, + "grad_norm": 2.46875262260437, + "grad_norm_var": 0.1481710731830276, + "learning_rate": 0.0001, + "loss": 1.067, + "loss/crossentropy": 2.67449688911438, + "loss/hidden": 0.86328125, + "loss/logits": 0.1635635495185852, + "loss/reg": 0.004015681799501181, + "step": 1217 + }, + { + "epoch": 0.15225, + "grad_norm": 2.1573519706726074, + "grad_norm_var": 0.15187870918378293, + "learning_rate": 0.0001, + "loss": 1.1704, + "loss/crossentropy": 2.501345634460449, + "loss/hidden": 0.94140625, + "loss/logits": 0.18887701630592346, + "loss/reg": 0.0040138899348676205, + "step": 1218 + }, + { + "epoch": 0.152375, + "grad_norm": 2.381070137023926, + "grad_norm_var": 0.14346854325687347, + "learning_rate": 0.0001, + "loss": 1.0696, + "loss/crossentropy": 2.485503673553467, + "loss/hidden": 0.859375, + "loss/logits": 0.1700785756111145, + "loss/reg": 0.004012054763734341, + "step": 1219 + }, + { + "epoch": 0.1525, + "grad_norm": 2.212719678878784, + "grad_norm_var": 0.13656890921584572, + "learning_rate": 0.0001, + "loss": 1.0381, + "loss/crossentropy": 2.6198275089263916, + "loss/hidden": 0.84765625, + "loss/logits": 0.15031346678733826, + "loss/reg": 0.004010040778666735, + "step": 1220 + }, + { + "epoch": 0.152625, + "grad_norm": 2.1536829471588135, + "grad_norm_var": 0.12911465723150664, + "learning_rate": 0.0001, + "loss": 0.9917, + "loss/crossentropy": 2.542593240737915, + "loss/hidden": 0.8203125, + "loss/logits": 0.13133659958839417, + "loss/reg": 0.0040080067701637745, + "step": 1221 + }, + { + "epoch": 0.15275, + "grad_norm": 2.5055301189422607, + "grad_norm_var": 0.11963125742360768, + "learning_rate": 0.0001, + "loss": 1.1078, + "loss/crossentropy": 2.522934675216675, + "loss/hidden": 0.8671875, + "loss/logits": 0.20054848492145538, + "loss/reg": 0.004005954600870609, + "step": 1222 + }, + { + "epoch": 0.152875, + "grad_norm": 2.0575757026672363, + "grad_norm_var": 0.1178213242465496, + "learning_rate": 0.0001, + "loss": 1.0593, + "loss/crossentropy": 2.6986868381500244, + "loss/hidden": 0.875, + "loss/logits": 0.14429835975170135, + "loss/reg": 0.004004053305834532, + "step": 1223 + }, + { + "epoch": 0.153, + "grad_norm": 6.1870503425598145, + "grad_norm_var": 0.9888346629412268, + "learning_rate": 0.0001, + "loss": 1.1983, + "loss/crossentropy": 2.4771060943603516, + "loss/hidden": 0.99609375, + "loss/logits": 0.1621606945991516, + "loss/reg": 0.004002041183412075, + "step": 1224 + }, + { + "epoch": 0.153125, + "grad_norm": 2.094531774520874, + "grad_norm_var": 1.009986051026931, + "learning_rate": 0.0001, + "loss": 1.0328, + "loss/crossentropy": 2.4067788124084473, + "loss/hidden": 0.8515625, + "loss/logits": 0.141241192817688, + "loss/reg": 0.004000107757747173, + "step": 1225 + }, + { + "epoch": 0.15325, + "grad_norm": 2.305340528488159, + "grad_norm_var": 1.007401731577304, + "learning_rate": 0.0001, + "loss": 1.1055, + "loss/crossentropy": 2.624967336654663, + "loss/hidden": 0.91796875, + "loss/logits": 0.1475597620010376, + "loss/reg": 0.003998105879873037, + "step": 1226 + }, + { + "epoch": 0.153375, + "grad_norm": 9.206901550292969, + "grad_norm_var": 3.7076283352537036, + "learning_rate": 0.0001, + "loss": 1.7336, + "loss/crossentropy": 2.340308666229248, + "loss/hidden": 1.265625, + "loss/logits": 0.4280090034008026, + "loss/reg": 0.003996132407337427, + "step": 1227 + }, + { + "epoch": 0.1535, + "grad_norm": 7.1393914222717285, + "grad_norm_var": 4.682389594648043, + "learning_rate": 0.0001, + "loss": 2.1514, + "loss/crossentropy": 2.1572506427764893, + "loss/hidden": 1.7578125, + "loss/logits": 0.3536328077316284, + "loss/reg": 0.0039941999129951, + "step": 1228 + }, + { + "epoch": 0.153625, + "grad_norm": 2.1037001609802246, + "grad_norm_var": 4.771672156590602, + "learning_rate": 0.0001, + "loss": 0.9941, + "loss/crossentropy": 2.7013680934906006, + "loss/hidden": 0.80859375, + "loss/logits": 0.14557519555091858, + "loss/reg": 0.003992319572716951, + "step": 1229 + }, + { + "epoch": 0.15375, + "grad_norm": 2.157334804534912, + "grad_norm_var": 4.84846901790952, + "learning_rate": 0.0001, + "loss": 1.0878, + "loss/crossentropy": 2.556549310684204, + "loss/hidden": 0.88671875, + "loss/logits": 0.16115835309028625, + "loss/reg": 0.003990530967712402, + "step": 1230 + }, + { + "epoch": 0.153875, + "grad_norm": 2.0853333473205566, + "grad_norm_var": 4.883710165437185, + "learning_rate": 0.0001, + "loss": 1.0472, + "loss/crossentropy": 2.114297866821289, + "loss/hidden": 0.86328125, + "loss/logits": 0.14400479197502136, + "loss/reg": 0.003988809883594513, + "step": 1231 + }, + { + "epoch": 0.154, + "grad_norm": 9.08906364440918, + "grad_norm_var": 6.916882811324148, + "learning_rate": 0.0001, + "loss": 1.2453, + "loss/crossentropy": 2.644493818283081, + "loss/hidden": 1.046875, + "loss/logits": 0.15859541296958923, + "loss/reg": 0.003986929077655077, + "step": 1232 + }, + { + "epoch": 0.154125, + "grad_norm": 2.883406639099121, + "grad_norm_var": 6.862648195671316, + "learning_rate": 0.0001, + "loss": 1.4104, + "loss/crossentropy": 2.49361252784729, + "loss/hidden": 1.140625, + "loss/logits": 0.22993981838226318, + "loss/reg": 0.003985217306762934, + "step": 1233 + }, + { + "epoch": 0.15425, + "grad_norm": 2.5608956813812256, + "grad_norm_var": 6.7914369374581725, + "learning_rate": 0.0001, + "loss": 1.0612, + "loss/crossentropy": 2.4652509689331055, + "loss/hidden": 0.8671875, + "loss/logits": 0.15421175956726074, + "loss/reg": 0.003983261063694954, + "step": 1234 + }, + { + "epoch": 0.154375, + "grad_norm": 2.1633126735687256, + "grad_norm_var": 6.832556056171203, + "learning_rate": 0.0001, + "loss": 1.0324, + "loss/crossentropy": 2.364274024963379, + "loss/hidden": 0.8515625, + "loss/logits": 0.14098191261291504, + "loss/reg": 0.003981346730142832, + "step": 1235 + }, + { + "epoch": 0.1545, + "grad_norm": 2.7880406379699707, + "grad_norm_var": 6.740565356111725, + "learning_rate": 0.0001, + "loss": 1.2871, + "loss/crossentropy": 2.7725203037261963, + "loss/hidden": 1.0390625, + "loss/logits": 0.20828330516815186, + "loss/reg": 0.003979409113526344, + "step": 1236 + }, + { + "epoch": 0.154625, + "grad_norm": 2.0032100677490234, + "grad_norm_var": 6.773356796491393, + "learning_rate": 0.0001, + "loss": 1.0215, + "loss/crossentropy": 2.426025390625, + "loss/hidden": 0.84375, + "loss/logits": 0.13792634010314941, + "loss/reg": 0.003977475222200155, + "step": 1237 + }, + { + "epoch": 0.15475, + "grad_norm": 2.803041696548462, + "grad_norm_var": 6.731182546058613, + "learning_rate": 0.0001, + "loss": 1.1383, + "loss/crossentropy": 2.7533788681030273, + "loss/hidden": 0.93359375, + "loss/logits": 0.1649492084980011, + "loss/reg": 0.0039755916222929955, + "step": 1238 + }, + { + "epoch": 0.154875, + "grad_norm": 2.6718337535858154, + "grad_norm_var": 6.618056769993946, + "learning_rate": 0.0001, + "loss": 1.1889, + "loss/crossentropy": 2.4473633766174316, + "loss/hidden": 0.96484375, + "loss/logits": 0.1843622773885727, + "loss/reg": 0.003973691258579493, + "step": 1239 + }, + { + "epoch": 0.155, + "grad_norm": 2.8771820068359375, + "grad_norm_var": 6.233935399852195, + "learning_rate": 0.0001, + "loss": 1.1242, + "loss/crossentropy": 2.7280538082122803, + "loss/hidden": 0.9140625, + "loss/logits": 0.17046083509922028, + "loss/reg": 0.0039716921746730804, + "step": 1240 + }, + { + "epoch": 0.155125, + "grad_norm": 2.2324299812316895, + "grad_norm_var": 6.208210747435883, + "learning_rate": 0.0001, + "loss": 1.001, + "loss/crossentropy": 2.7301042079925537, + "loss/hidden": 0.82421875, + "loss/logits": 0.1370982974767685, + "loss/reg": 0.003969752229750156, + "step": 1241 + }, + { + "epoch": 0.15525, + "grad_norm": 2.414759874343872, + "grad_norm_var": 6.1905538159398015, + "learning_rate": 0.0001, + "loss": 1.0086, + "loss/crossentropy": 2.161578893661499, + "loss/hidden": 0.83203125, + "loss/logits": 0.13684576749801636, + "loss/reg": 0.00396784907206893, + "step": 1242 + }, + { + "epoch": 0.155375, + "grad_norm": 2.278144359588623, + "grad_norm_var": 3.986925647036762, + "learning_rate": 0.0001, + "loss": 1.0588, + "loss/crossentropy": 2.1866931915283203, + "loss/hidden": 0.8828125, + "loss/logits": 0.13629356026649475, + "loss/reg": 0.0039659528993070126, + "step": 1243 + }, + { + "epoch": 0.1555, + "grad_norm": 2.3172078132629395, + "grad_norm_var": 2.8692718796241876, + "learning_rate": 0.0001, + "loss": 1.0269, + "loss/crossentropy": 2.7066776752471924, + "loss/hidden": 0.84375, + "loss/logits": 0.14347760379314423, + "loss/reg": 0.003964039962738752, + "step": 1244 + }, + { + "epoch": 0.155625, + "grad_norm": 2.5149173736572266, + "grad_norm_var": 2.8395080960927275, + "learning_rate": 0.0001, + "loss": 1.147, + "loss/crossentropy": 2.6849911212921143, + "loss/hidden": 0.93359375, + "loss/logits": 0.17379310727119446, + "loss/reg": 0.00396218616515398, + "step": 1245 + }, + { + "epoch": 0.15575, + "grad_norm": 3.1449217796325684, + "grad_norm_var": 2.807281033079679, + "learning_rate": 0.0001, + "loss": 1.3347, + "loss/crossentropy": 2.340134620666504, + "loss/hidden": 1.09375, + "loss/logits": 0.20130982995033264, + "loss/reg": 0.003960458096116781, + "step": 1246 + }, + { + "epoch": 0.155875, + "grad_norm": 2.4941136837005615, + "grad_norm_var": 2.771865274736687, + "learning_rate": 0.0001, + "loss": 1.0381, + "loss/crossentropy": 2.569666862487793, + "loss/hidden": 0.84375, + "loss/logits": 0.15474390983581543, + "loss/reg": 0.003958826884627342, + "step": 1247 + }, + { + "epoch": 0.156, + "grad_norm": 2.905941963195801, + "grad_norm_var": 0.10203846777971345, + "learning_rate": 0.0001, + "loss": 1.0627, + "loss/crossentropy": 2.7658021450042725, + "loss/hidden": 0.87109375, + "loss/logits": 0.1519913673400879, + "loss/reg": 0.003957261331379414, + "step": 1248 + }, + { + "epoch": 0.156125, + "grad_norm": 2.955136299133301, + "grad_norm_var": 0.10539728005771849, + "learning_rate": 0.0001, + "loss": 1.1496, + "loss/crossentropy": 2.783196449279785, + "loss/hidden": 0.94140625, + "loss/logits": 0.16862890124320984, + "loss/reg": 0.0039556450210511684, + "step": 1249 + }, + { + "epoch": 0.15625, + "grad_norm": 2.683770179748535, + "grad_norm_var": 0.10618654391323404, + "learning_rate": 0.0001, + "loss": 1.2244, + "loss/crossentropy": 2.306131362915039, + "loss/hidden": 1.0, + "loss/logits": 0.18487581610679626, + "loss/reg": 0.0039537097327411175, + "step": 1250 + }, + { + "epoch": 0.156375, + "grad_norm": 2.1440393924713135, + "grad_norm_var": 0.10727540575258346, + "learning_rate": 0.0001, + "loss": 1.0298, + "loss/crossentropy": 2.5097758769989014, + "loss/hidden": 0.8515625, + "loss/logits": 0.13874852657318115, + "loss/reg": 0.003951748367398977, + "step": 1251 + }, + { + "epoch": 0.1565, + "grad_norm": 5.823795795440674, + "grad_norm_var": 0.7687695668695557, + "learning_rate": 0.0001, + "loss": 1.229, + "loss/crossentropy": 2.7339322566986084, + "loss/hidden": 1.046875, + "loss/logits": 0.14258000254631042, + "loss/reg": 0.003950015641748905, + "step": 1252 + }, + { + "epoch": 0.156625, + "grad_norm": 3.07033109664917, + "grad_norm_var": 0.7313342744887732, + "learning_rate": 0.0001, + "loss": 1.2339, + "loss/crossentropy": 2.9465441703796387, + "loss/hidden": 1.03125, + "loss/logits": 0.16319361329078674, + "loss/reg": 0.003948097582906485, + "step": 1253 + }, + { + "epoch": 0.15675, + "grad_norm": 2.2648935317993164, + "grad_norm_var": 0.7516000874171217, + "learning_rate": 0.0001, + "loss": 1.0369, + "loss/crossentropy": 2.462608814239502, + "loss/hidden": 0.83984375, + "loss/logits": 0.157545804977417, + "loss/reg": 0.003946339711546898, + "step": 1254 + }, + { + "epoch": 0.156875, + "grad_norm": 8.53079605102539, + "grad_norm_var": 2.7972635310947167, + "learning_rate": 0.0001, + "loss": 1.3134, + "loss/crossentropy": 2.634783983230591, + "loss/hidden": 1.1328125, + "loss/logits": 0.1411134898662567, + "loss/reg": 0.003944624215364456, + "step": 1255 + }, + { + "epoch": 0.157, + "grad_norm": 2.3669466972351074, + "grad_norm_var": 2.833168083556588, + "learning_rate": 0.0001, + "loss": 1.1473, + "loss/crossentropy": 2.3000428676605225, + "loss/hidden": 0.95703125, + "loss/logits": 0.15085983276367188, + "loss/reg": 0.003942654933780432, + "step": 1256 + }, + { + "epoch": 0.157125, + "grad_norm": 3.7946250438690186, + "grad_norm_var": 2.797930128567604, + "learning_rate": 0.0001, + "loss": 1.257, + "loss/crossentropy": 2.2643165588378906, + "loss/hidden": 1.0546875, + "loss/logits": 0.16289997100830078, + "loss/reg": 0.003940712660551071, + "step": 1257 + }, + { + "epoch": 0.15725, + "grad_norm": 2.7023470401763916, + "grad_norm_var": 2.7717805963911966, + "learning_rate": 0.0001, + "loss": 1.0415, + "loss/crossentropy": 2.6158485412597656, + "loss/hidden": 0.8515625, + "loss/logits": 0.15053331851959229, + "loss/reg": 0.003938745241612196, + "step": 1258 + }, + { + "epoch": 0.157375, + "grad_norm": 2.1059186458587646, + "grad_norm_var": 2.7959400050235406, + "learning_rate": 0.0001, + "loss": 0.9954, + "loss/crossentropy": 2.2793667316436768, + "loss/hidden": 0.828125, + "loss/logits": 0.12795627117156982, + "loss/reg": 0.003936750814318657, + "step": 1259 + }, + { + "epoch": 0.1575, + "grad_norm": 3.712144136428833, + "grad_norm_var": 2.74615990110932, + "learning_rate": 0.0001, + "loss": 1.5484, + "loss/crossentropy": 2.348719835281372, + "loss/hidden": 1.265625, + "loss/logits": 0.2434225231409073, + "loss/reg": 0.003934717271476984, + "step": 1260 + }, + { + "epoch": 0.157625, + "grad_norm": 2.565690755844116, + "grad_norm_var": 2.7408307436849646, + "learning_rate": 0.0001, + "loss": 0.9844, + "loss/crossentropy": 2.635305404663086, + "loss/hidden": 0.80078125, + "loss/logits": 0.14425452053546906, + "loss/reg": 0.003932674881070852, + "step": 1261 + }, + { + "epoch": 0.15775, + "grad_norm": 2.1676135063171387, + "grad_norm_var": 2.824524782775095, + "learning_rate": 0.0001, + "loss": 1.0319, + "loss/crossentropy": 2.582833766937256, + "loss/hidden": 0.84375, + "loss/logits": 0.1488630175590515, + "loss/reg": 0.003930607810616493, + "step": 1262 + }, + { + "epoch": 0.157875, + "grad_norm": 2.2690494060516357, + "grad_norm_var": 2.8509140700262754, + "learning_rate": 0.0001, + "loss": 0.984, + "loss/crossentropy": 2.694347620010376, + "loss/hidden": 0.8125, + "loss/logits": 0.13225148618221283, + "loss/reg": 0.003928603138774633, + "step": 1263 + }, + { + "epoch": 0.158, + "grad_norm": 3.369201183319092, + "grad_norm_var": 2.842832034310379, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.450927972793579, + "loss/hidden": 0.94140625, + "loss/logits": 0.1599656641483307, + "loss/reg": 0.003926686476916075, + "step": 1264 + }, + { + "epoch": 0.158125, + "grad_norm": 1.9869253635406494, + "grad_norm_var": 2.9437333300576247, + "learning_rate": 0.0001, + "loss": 1.0556, + "loss/crossentropy": 2.3994815349578857, + "loss/hidden": 0.87890625, + "loss/logits": 0.1374390572309494, + "loss/reg": 0.003924795426428318, + "step": 1265 + }, + { + "epoch": 0.15825, + "grad_norm": 2.7239654064178467, + "grad_norm_var": 2.9409477001102284, + "learning_rate": 0.0001, + "loss": 1.1347, + "loss/crossentropy": 2.549830913543701, + "loss/hidden": 0.9453125, + "loss/logits": 0.15013578534126282, + "loss/reg": 0.003923265729099512, + "step": 1266 + }, + { + "epoch": 0.158375, + "grad_norm": 3.253939151763916, + "grad_norm_var": 2.857988200257295, + "learning_rate": 0.0001, + "loss": 1.5404, + "loss/crossentropy": 2.297355890274048, + "loss/hidden": 1.234375, + "loss/logits": 0.2668222188949585, + "loss/reg": 0.003921460825949907, + "step": 1267 + }, + { + "epoch": 0.1585, + "grad_norm": 4.972027778625488, + "grad_norm_var": 2.6160556342714547, + "learning_rate": 0.0001, + "loss": 1.1048, + "loss/crossentropy": 2.5779943466186523, + "loss/hidden": 0.8828125, + "loss/logits": 0.18274636566638947, + "loss/reg": 0.003919865936040878, + "step": 1268 + }, + { + "epoch": 0.158625, + "grad_norm": 3.8399877548217773, + "grad_norm_var": 2.635561990200084, + "learning_rate": 0.0001, + "loss": 1.1315, + "loss/crossentropy": 2.8448750972747803, + "loss/hidden": 0.9296875, + "loss/logits": 0.16261914372444153, + "loss/reg": 0.003917965106666088, + "step": 1269 + }, + { + "epoch": 0.15875, + "grad_norm": 4.097829341888428, + "grad_norm_var": 2.595225849251786, + "learning_rate": 0.0001, + "loss": 1.2206, + "loss/crossentropy": 2.9463918209075928, + "loss/hidden": 0.98828125, + "loss/logits": 0.19318252801895142, + "loss/reg": 0.0039165741764009, + "step": 1270 + }, + { + "epoch": 0.158875, + "grad_norm": 4.908977508544922, + "grad_norm_var": 0.9391465897119967, + "learning_rate": 0.0001, + "loss": 1.1711, + "loss/crossentropy": 2.5089914798736572, + "loss/hidden": 0.96875, + "loss/logits": 0.16315871477127075, + "loss/reg": 0.003915099892765284, + "step": 1271 + }, + { + "epoch": 0.159, + "grad_norm": 2.4336774349212646, + "grad_norm_var": 0.9322146223506894, + "learning_rate": 0.0001, + "loss": 1.0449, + "loss/crossentropy": 2.3010151386260986, + "loss/hidden": 0.85546875, + "loss/logits": 0.15030014514923096, + "loss/reg": 0.0039135850965976715, + "step": 1272 + }, + { + "epoch": 0.159125, + "grad_norm": 2.9904537200927734, + "grad_norm_var": 0.9068912920584419, + "learning_rate": 0.0001, + "loss": 1.1666, + "loss/crossentropy": 2.428278923034668, + "loss/hidden": 0.96484375, + "loss/logits": 0.16259220242500305, + "loss/reg": 0.0039116572588682175, + "step": 1273 + }, + { + "epoch": 0.15925, + "grad_norm": 1.83692467212677, + "grad_norm_var": 1.0031901798578553, + "learning_rate": 0.0001, + "loss": 0.9189, + "loss/crossentropy": 2.4613776206970215, + "loss/hidden": 0.76171875, + "loss/logits": 0.11810196936130524, + "loss/reg": 0.003909708932042122, + "step": 1274 + }, + { + "epoch": 0.159375, + "grad_norm": 2.4019277095794678, + "grad_norm_var": 0.9703342604960014, + "learning_rate": 0.0001, + "loss": 1.1094, + "loss/crossentropy": 2.298656702041626, + "loss/hidden": 0.9296875, + "loss/logits": 0.14063113927841187, + "loss/reg": 0.003907748498022556, + "step": 1275 + }, + { + "epoch": 0.1595, + "grad_norm": 2.206735372543335, + "grad_norm_var": 0.9882309911375784, + "learning_rate": 0.0001, + "loss": 1.0258, + "loss/crossentropy": 2.4418487548828125, + "loss/hidden": 0.84375, + "loss/logits": 0.1430271565914154, + "loss/reg": 0.003905918914824724, + "step": 1276 + }, + { + "epoch": 0.159625, + "grad_norm": 2.794393539428711, + "grad_norm_var": 0.9782088480890478, + "learning_rate": 0.0001, + "loss": 1.1423, + "loss/crossentropy": 2.5516860485076904, + "loss/hidden": 0.92578125, + "loss/logits": 0.1774648129940033, + "loss/reg": 0.003904127748683095, + "step": 1277 + }, + { + "epoch": 0.15975, + "grad_norm": 2.9287729263305664, + "grad_norm_var": 0.928333134335493, + "learning_rate": 0.0001, + "loss": 1.2932, + "loss/crossentropy": 2.533024311065674, + "loss/hidden": 1.0625, + "loss/logits": 0.19170062243938446, + "loss/reg": 0.0039022008422762156, + "step": 1278 + }, + { + "epoch": 0.159875, + "grad_norm": 2.364227056503296, + "grad_norm_var": 0.918818410696285, + "learning_rate": 0.0001, + "loss": 1.0528, + "loss/crossentropy": 2.4888012409210205, + "loss/hidden": 0.859375, + "loss/logits": 0.15442782640457153, + "loss/reg": 0.0039004147984087467, + "step": 1279 + }, + { + "epoch": 0.16, + "grad_norm": 2.9671173095703125, + "grad_norm_var": 0.9128487251694849, + "learning_rate": 0.0001, + "loss": 1.4056, + "loss/crossentropy": 2.2953407764434814, + "loss/hidden": 1.15625, + "loss/logits": 0.21034780144691467, + "loss/reg": 0.0038986552972346544, + "step": 1280 + }, + { + "epoch": 0.160125, + "grad_norm": 2.1895735263824463, + "grad_norm_var": 0.8868469140494813, + "learning_rate": 0.0001, + "loss": 1.1023, + "loss/crossentropy": 2.418267011642456, + "loss/hidden": 0.91015625, + "loss/logits": 0.15315604209899902, + "loss/reg": 0.0038967591244727373, + "step": 1281 + }, + { + "epoch": 0.16025, + "grad_norm": 2.0041797161102295, + "grad_norm_var": 0.9511806175749209, + "learning_rate": 0.0001, + "loss": 1.0711, + "loss/crossentropy": 2.6270132064819336, + "loss/hidden": 0.87890625, + "loss/logits": 0.15329018235206604, + "loss/reg": 0.0038950201123952866, + "step": 1282 + }, + { + "epoch": 0.160375, + "grad_norm": 2.7158069610595703, + "grad_norm_var": 0.9519147622693618, + "learning_rate": 0.0001, + "loss": 1.0333, + "loss/crossentropy": 2.49900221824646, + "loss/hidden": 0.84765625, + "loss/logits": 0.14673107862472534, + "loss/reg": 0.0038934045005589724, + "step": 1283 + }, + { + "epoch": 0.1605, + "grad_norm": 2.5366370677948, + "grad_norm_var": 0.6752056332094691, + "learning_rate": 0.0001, + "loss": 1.1833, + "loss/crossentropy": 2.5442113876342773, + "loss/hidden": 0.98046875, + "loss/logits": 0.1639258712530136, + "loss/reg": 0.003891737898811698, + "step": 1284 + }, + { + "epoch": 0.160625, + "grad_norm": 3.805074453353882, + "grad_norm_var": 0.6705619509398928, + "learning_rate": 0.0001, + "loss": 1.2758, + "loss/crossentropy": 2.0399181842803955, + "loss/hidden": 1.078125, + "loss/logits": 0.15876121819019318, + "loss/reg": 0.003889812156558037, + "step": 1285 + }, + { + "epoch": 0.16075, + "grad_norm": 2.670008420944214, + "grad_norm_var": 0.5554521676123362, + "learning_rate": 0.0001, + "loss": 1.2431, + "loss/crossentropy": 2.538208484649658, + "loss/hidden": 1.0234375, + "loss/logits": 0.18076473474502563, + "loss/reg": 0.003887931350618601, + "step": 1286 + }, + { + "epoch": 0.160875, + "grad_norm": 2.1082379817962646, + "grad_norm_var": 0.23374974294705825, + "learning_rate": 0.0001, + "loss": 1.0764, + "loss/crossentropy": 2.4281837940216064, + "loss/hidden": 0.87109375, + "loss/logits": 0.16639548540115356, + "loss/reg": 0.003886270336806774, + "step": 1287 + }, + { + "epoch": 0.161, + "grad_norm": 2.232421398162842, + "grad_norm_var": 0.23966051398124927, + "learning_rate": 0.0001, + "loss": 0.9928, + "loss/crossentropy": 2.5111913681030273, + "loss/hidden": 0.8203125, + "loss/logits": 0.1336727738380432, + "loss/reg": 0.003884353907778859, + "step": 1288 + }, + { + "epoch": 0.161125, + "grad_norm": 1.8333826065063477, + "grad_norm_var": 0.25492677200505887, + "learning_rate": 0.0001, + "loss": 1.0349, + "loss/crossentropy": 2.4361870288848877, + "loss/hidden": 0.859375, + "loss/logits": 0.13674689829349518, + "loss/reg": 0.0038827096577733755, + "step": 1289 + }, + { + "epoch": 0.16125, + "grad_norm": 2.227004051208496, + "grad_norm_var": 0.23126510746358592, + "learning_rate": 0.0001, + "loss": 1.0776, + "loss/crossentropy": 2.1035265922546387, + "loss/hidden": 0.88671875, + "loss/logits": 0.1520470380783081, + "loss/reg": 0.0038811014965176582, + "step": 1290 + }, + { + "epoch": 0.161375, + "grad_norm": 2.4726340770721436, + "grad_norm_var": 0.23066153493828262, + "learning_rate": 0.0001, + "loss": 1.3176, + "loss/crossentropy": 2.5521106719970703, + "loss/hidden": 1.078125, + "loss/logits": 0.20070654153823853, + "loss/reg": 0.0038791669066995382, + "step": 1291 + }, + { + "epoch": 0.1615, + "grad_norm": 2.505643606185913, + "grad_norm_var": 0.22441776850007666, + "learning_rate": 0.0001, + "loss": 0.9609, + "loss/crossentropy": 2.562110662460327, + "loss/hidden": 0.7890625, + "loss/logits": 0.1330765336751938, + "loss/reg": 0.003877209033817053, + "step": 1292 + }, + { + "epoch": 0.161625, + "grad_norm": 2.2074098587036133, + "grad_norm_var": 0.22464862758212098, + "learning_rate": 0.0001, + "loss": 1.0223, + "loss/crossentropy": 2.4465317726135254, + "loss/hidden": 0.8359375, + "loss/logits": 0.1476120948791504, + "loss/reg": 0.0038752437103539705, + "step": 1293 + }, + { + "epoch": 0.16175, + "grad_norm": 4.475348949432373, + "grad_norm_var": 0.4655478968178611, + "learning_rate": 0.0001, + "loss": 1.2869, + "loss/crossentropy": 2.6663005352020264, + "loss/hidden": 1.0546875, + "loss/logits": 0.1934768706560135, + "loss/reg": 0.0038730644155293703, + "step": 1294 + }, + { + "epoch": 0.161875, + "grad_norm": 3.0794217586517334, + "grad_norm_var": 0.4767340552867606, + "learning_rate": 0.0001, + "loss": 1.1619, + "loss/crossentropy": 2.463402271270752, + "loss/hidden": 0.96484375, + "loss/logits": 0.15831388533115387, + "loss/reg": 0.0038711209781467915, + "step": 1295 + }, + { + "epoch": 0.162, + "grad_norm": 2.185600757598877, + "grad_norm_var": 0.47945242338888056, + "learning_rate": 0.0001, + "loss": 1.2293, + "loss/crossentropy": 2.240428924560547, + "loss/hidden": 1.0078125, + "loss/logits": 0.18282610177993774, + "loss/reg": 0.003869203384965658, + "step": 1296 + }, + { + "epoch": 0.162125, + "grad_norm": 2.6001839637756348, + "grad_norm_var": 0.46872306833601746, + "learning_rate": 0.0001, + "loss": 1.1285, + "loss/crossentropy": 2.6634023189544678, + "loss/hidden": 0.9296875, + "loss/logits": 0.16017360985279083, + "loss/reg": 0.0038670580834150314, + "step": 1297 + }, + { + "epoch": 0.16225, + "grad_norm": 2.9891393184661865, + "grad_norm_var": 0.4506250664032753, + "learning_rate": 0.0001, + "loss": 1.2209, + "loss/crossentropy": 2.2928833961486816, + "loss/hidden": 0.98046875, + "loss/logits": 0.2018088847398758, + "loss/reg": 0.0038651188369840384, + "step": 1298 + }, + { + "epoch": 0.162375, + "grad_norm": 2.5674245357513428, + "grad_norm_var": 0.45100085978748794, + "learning_rate": 0.0001, + "loss": 1.099, + "loss/crossentropy": 2.604619026184082, + "loss/hidden": 0.87109375, + "loss/logits": 0.18927830457687378, + "loss/reg": 0.003863039892166853, + "step": 1299 + }, + { + "epoch": 0.1625, + "grad_norm": 5.368757247924805, + "grad_norm_var": 0.9072441308021212, + "learning_rate": 0.0001, + "loss": 1.5845, + "loss/crossentropy": 1.9008941650390625, + "loss/hidden": 1.234375, + "loss/logits": 0.31154388189315796, + "loss/reg": 0.0038609288167208433, + "step": 1300 + }, + { + "epoch": 0.162625, + "grad_norm": 2.402621030807495, + "grad_norm_var": 0.8483983819636706, + "learning_rate": 0.0001, + "loss": 1.224, + "loss/crossentropy": 2.582455635070801, + "loss/hidden": 1.0078125, + "loss/logits": 0.17763689160346985, + "loss/reg": 0.0038588044699281454, + "step": 1301 + }, + { + "epoch": 0.16275, + "grad_norm": 2.1669483184814453, + "grad_norm_var": 0.8692672249500539, + "learning_rate": 0.0001, + "loss": 1.1932, + "loss/crossentropy": 2.2148444652557373, + "loss/hidden": 0.98828125, + "loss/logits": 0.1663748174905777, + "loss/reg": 0.0038566740695387125, + "step": 1302 + }, + { + "epoch": 0.162875, + "grad_norm": 2.7048940658569336, + "grad_norm_var": 0.8433353029278644, + "learning_rate": 0.0001, + "loss": 1.0917, + "loss/crossentropy": 2.576660394668579, + "loss/hidden": 0.89453125, + "loss/logits": 0.15860411524772644, + "loss/reg": 0.0038547737058252096, + "step": 1303 + }, + { + "epoch": 0.163, + "grad_norm": 7.6692304611206055, + "grad_norm_var": 2.314715920521659, + "learning_rate": 0.0001, + "loss": 1.7499, + "loss/crossentropy": 2.1765997409820557, + "loss/hidden": 1.5078125, + "loss/logits": 0.2035428285598755, + "loss/reg": 0.0038528875447809696, + "step": 1304 + }, + { + "epoch": 0.163125, + "grad_norm": 2.3525166511535645, + "grad_norm_var": 2.2445116172134316, + "learning_rate": 0.0001, + "loss": 1.0827, + "loss/crossentropy": 2.7200188636779785, + "loss/hidden": 0.875, + "loss/logits": 0.16918785870075226, + "loss/reg": 0.003850834909826517, + "step": 1305 + }, + { + "epoch": 0.16325, + "grad_norm": 2.1109790802001953, + "grad_norm_var": 2.259220587304002, + "learning_rate": 0.0001, + "loss": 0.9954, + "loss/crossentropy": 2.4676554203033447, + "loss/hidden": 0.81640625, + "loss/logits": 0.14051809906959534, + "loss/reg": 0.003848861902952194, + "step": 1306 + }, + { + "epoch": 0.163375, + "grad_norm": 2.2503230571746826, + "grad_norm_var": 2.281384886865043, + "learning_rate": 0.0001, + "loss": 1.0349, + "loss/crossentropy": 2.4450364112854004, + "loss/hidden": 0.8515625, + "loss/logits": 0.14489130675792694, + "loss/reg": 0.0038469466380774975, + "step": 1307 + }, + { + "epoch": 0.1635, + "grad_norm": 3.661198139190674, + "grad_norm_var": 2.272915770254127, + "learning_rate": 0.0001, + "loss": 1.2995, + "loss/crossentropy": 3.0266950130462646, + "loss/hidden": 1.0390625, + "loss/logits": 0.22195252776145935, + "loss/reg": 0.0038448853883892298, + "step": 1308 + }, + { + "epoch": 0.163625, + "grad_norm": 2.6725175380706787, + "grad_norm_var": 2.226462629702375, + "learning_rate": 0.0001, + "loss": 1.1629, + "loss/crossentropy": 2.4850494861602783, + "loss/hidden": 0.9609375, + "loss/logits": 0.16352644562721252, + "loss/reg": 0.003842818085104227, + "step": 1309 + }, + { + "epoch": 0.16375, + "grad_norm": 2.4281601905822754, + "grad_norm_var": 2.1412558591766695, + "learning_rate": 0.0001, + "loss": 1.0183, + "loss/crossentropy": 2.629995107650757, + "loss/hidden": 0.83984375, + "loss/logits": 0.14006099104881287, + "loss/reg": 0.0038410108536481857, + "step": 1310 + }, + { + "epoch": 0.163875, + "grad_norm": 2.779705762863159, + "grad_norm_var": 2.146718277972097, + "learning_rate": 0.0001, + "loss": 1.0696, + "loss/crossentropy": 2.845191240310669, + "loss/hidden": 0.87890625, + "loss/logits": 0.1523093283176422, + "loss/reg": 0.0038392541464418173, + "step": 1311 + }, + { + "epoch": 0.164, + "grad_norm": 2.1593542098999023, + "grad_norm_var": 2.1498104356164496, + "learning_rate": 0.0001, + "loss": 1.1257, + "loss/crossentropy": 2.5214879512786865, + "loss/hidden": 0.93359375, + "loss/logits": 0.15370512008666992, + "loss/reg": 0.0038373004645109177, + "step": 1312 + }, + { + "epoch": 0.164125, + "grad_norm": 2.261361837387085, + "grad_norm_var": 2.177543523879501, + "learning_rate": 0.0001, + "loss": 1.193, + "loss/crossentropy": 2.370466947555542, + "loss/hidden": 0.9921875, + "loss/logits": 0.16246996819972992, + "loss/reg": 0.0038352562114596367, + "step": 1313 + }, + { + "epoch": 0.16425, + "grad_norm": 4.186817169189453, + "grad_norm_var": 2.260020426671607, + "learning_rate": 0.0001, + "loss": 1.3862, + "loss/crossentropy": 2.2702040672302246, + "loss/hidden": 1.1328125, + "loss/logits": 0.21508020162582397, + "loss/reg": 0.0038333996199071407, + "step": 1314 + }, + { + "epoch": 0.164375, + "grad_norm": 3.48109769821167, + "grad_norm_var": 2.2462280124966996, + "learning_rate": 0.0001, + "loss": 1.0443, + "loss/crossentropy": 2.6975324153900146, + "loss/hidden": 0.85546875, + "loss/logits": 0.15051524341106415, + "loss/reg": 0.0038314287085086107, + "step": 1315 + }, + { + "epoch": 0.1645, + "grad_norm": 2.3781416416168213, + "grad_norm_var": 1.9268796990894694, + "learning_rate": 0.0001, + "loss": 1.069, + "loss/crossentropy": 2.565978527069092, + "loss/hidden": 0.8671875, + "loss/logits": 0.1635463535785675, + "loss/reg": 0.003829606808722019, + "step": 1316 + }, + { + "epoch": 0.164625, + "grad_norm": 2.532841682434082, + "grad_norm_var": 1.9179299858722438, + "learning_rate": 0.0001, + "loss": 1.1731, + "loss/crossentropy": 2.379420757293701, + "loss/hidden": 0.9453125, + "loss/logits": 0.18955053389072418, + "loss/reg": 0.003827982349321246, + "step": 1317 + }, + { + "epoch": 0.16475, + "grad_norm": 2.824406862258911, + "grad_norm_var": 1.8730366601404502, + "learning_rate": 0.0001, + "loss": 1.2196, + "loss/crossentropy": 2.745607852935791, + "loss/hidden": 0.98828125, + "loss/logits": 0.19303223490715027, + "loss/reg": 0.003826139261946082, + "step": 1318 + }, + { + "epoch": 0.164875, + "grad_norm": 2.927591562271118, + "grad_norm_var": 1.866532019300668, + "learning_rate": 0.0001, + "loss": 1.0242, + "loss/crossentropy": 2.745173215866089, + "loss/hidden": 0.8515625, + "loss/logits": 0.13440603017807007, + "loss/reg": 0.003824233775958419, + "step": 1319 + }, + { + "epoch": 0.165, + "grad_norm": 2.249950408935547, + "grad_norm_var": 0.3587598970037938, + "learning_rate": 0.0001, + "loss": 1.1167, + "loss/crossentropy": 2.182314395904541, + "loss/hidden": 0.9140625, + "loss/logits": 0.16442248225212097, + "loss/reg": 0.0038224998861551285, + "step": 1320 + }, + { + "epoch": 0.165125, + "grad_norm": 2.136213779449463, + "grad_norm_var": 0.3718083111594938, + "learning_rate": 0.0001, + "loss": 1.2208, + "loss/crossentropy": 2.3340518474578857, + "loss/hidden": 0.99609375, + "loss/logits": 0.18647333979606628, + "loss/reg": 0.0038206097669899464, + "step": 1321 + }, + { + "epoch": 0.16525, + "grad_norm": 2.3353726863861084, + "grad_norm_var": 0.3576302941917286, + "learning_rate": 0.0001, + "loss": 1.0998, + "loss/crossentropy": 2.558197498321533, + "loss/hidden": 0.890625, + "loss/logits": 0.17098355293273926, + "loss/reg": 0.003818872617557645, + "step": 1322 + }, + { + "epoch": 0.165375, + "grad_norm": 2.0035364627838135, + "grad_norm_var": 0.376367123736596, + "learning_rate": 0.0001, + "loss": 0.9672, + "loss/crossentropy": 2.0580060482025146, + "loss/hidden": 0.8046875, + "loss/logits": 0.12435194849967957, + "loss/reg": 0.003817170625552535, + "step": 1323 + }, + { + "epoch": 0.1655, + "grad_norm": 2.704374074935913, + "grad_norm_var": 0.3095112579833424, + "learning_rate": 0.0001, + "loss": 1.2064, + "loss/crossentropy": 2.056586742401123, + "loss/hidden": 1.03125, + "loss/logits": 0.1370464414358139, + "loss/reg": 0.003815267700701952, + "step": 1324 + }, + { + "epoch": 0.165625, + "grad_norm": 2.861746072769165, + "grad_norm_var": 0.3128512221250444, + "learning_rate": 0.0001, + "loss": 1.2048, + "loss/crossentropy": 2.649263381958008, + "loss/hidden": 0.9921875, + "loss/logits": 0.1744484305381775, + "loss/reg": 0.0038135608192533255, + "step": 1325 + }, + { + "epoch": 0.16575, + "grad_norm": 4.303040027618408, + "grad_norm_var": 0.4794263231115619, + "learning_rate": 0.0001, + "loss": 1.396, + "loss/crossentropy": 2.5463757514953613, + "loss/hidden": 1.0859375, + "loss/logits": 0.27196431159973145, + "loss/reg": 0.003811680944636464, + "step": 1326 + }, + { + "epoch": 0.165875, + "grad_norm": 2.6042354106903076, + "grad_norm_var": 0.48083927966075424, + "learning_rate": 0.0001, + "loss": 1.3305, + "loss/crossentropy": 2.652782917022705, + "loss/hidden": 1.0625, + "loss/logits": 0.2298596352338791, + "loss/reg": 0.003809748450294137, + "step": 1327 + }, + { + "epoch": 0.166, + "grad_norm": 2.099306344985962, + "grad_norm_var": 0.4857685954885028, + "learning_rate": 0.0001, + "loss": 1.1101, + "loss/crossentropy": 2.54836368560791, + "loss/hidden": 0.89453125, + "loss/logits": 0.17748317122459412, + "loss/reg": 0.0038078485522419214, + "step": 1328 + }, + { + "epoch": 0.166125, + "grad_norm": 5.760173320770264, + "grad_norm_var": 1.026126259782783, + "learning_rate": 0.0001, + "loss": 1.4286, + "loss/crossentropy": 2.3031833171844482, + "loss/hidden": 1.0703125, + "loss/logits": 0.3202149271965027, + "loss/reg": 0.003806003602221608, + "step": 1329 + }, + { + "epoch": 0.16625, + "grad_norm": 2.2108118534088135, + "grad_norm_var": 0.9474122587329703, + "learning_rate": 0.0001, + "loss": 1.0457, + "loss/crossentropy": 2.5074656009674072, + "loss/hidden": 0.859375, + "loss/logits": 0.14826124906539917, + "loss/reg": 0.003804128849878907, + "step": 1330 + }, + { + "epoch": 0.166375, + "grad_norm": 3.3911850452423096, + "grad_norm_var": 0.9402114702613211, + "learning_rate": 0.0001, + "loss": 1.2478, + "loss/crossentropy": 2.4859514236450195, + "loss/hidden": 1.015625, + "loss/logits": 0.19414550065994263, + "loss/reg": 0.0038022748194634914, + "step": 1331 + }, + { + "epoch": 0.1665, + "grad_norm": 2.3483588695526123, + "grad_norm_var": 0.9420719086390626, + "learning_rate": 0.0001, + "loss": 1.1072, + "loss/crossentropy": 2.7186105251312256, + "loss/hidden": 0.90234375, + "loss/logits": 0.16681891679763794, + "loss/reg": 0.003800415899604559, + "step": 1332 + }, + { + "epoch": 0.166625, + "grad_norm": 2.478461503982544, + "grad_norm_var": 0.9444172935081412, + "learning_rate": 0.0001, + "loss": 1.2747, + "loss/crossentropy": 2.414044141769409, + "loss/hidden": 1.046875, + "loss/logits": 0.18985730409622192, + "loss/reg": 0.003798494813963771, + "step": 1333 + }, + { + "epoch": 0.16675, + "grad_norm": 3.826606273651123, + "grad_norm_var": 1.0067895170922097, + "learning_rate": 0.0001, + "loss": 1.5191, + "loss/crossentropy": 2.52571964263916, + "loss/hidden": 1.25, + "loss/logits": 0.2311021387577057, + "loss/reg": 0.0037966351956129074, + "step": 1334 + }, + { + "epoch": 0.166875, + "grad_norm": 2.2956812381744385, + "grad_norm_var": 1.0285842417783648, + "learning_rate": 0.0001, + "loss": 0.9857, + "loss/crossentropy": 2.333362579345703, + "loss/hidden": 0.8046875, + "loss/logits": 0.14301563799381256, + "loss/reg": 0.003794773481786251, + "step": 1335 + }, + { + "epoch": 0.167, + "grad_norm": 2.215144634246826, + "grad_norm_var": 1.0314472749301025, + "learning_rate": 0.0001, + "loss": 1.0197, + "loss/crossentropy": 2.6435439586639404, + "loss/hidden": 0.83203125, + "loss/logits": 0.14977750182151794, + "loss/reg": 0.003792962059378624, + "step": 1336 + }, + { + "epoch": 0.167125, + "grad_norm": 2.289198875427246, + "grad_norm_var": 1.0183830630566924, + "learning_rate": 0.0001, + "loss": 1.0596, + "loss/crossentropy": 2.4188473224639893, + "loss/hidden": 0.875, + "loss/logits": 0.14672745764255524, + "loss/reg": 0.0037910572718828917, + "step": 1337 + }, + { + "epoch": 0.16725, + "grad_norm": 2.585996627807617, + "grad_norm_var": 1.0048460491356954, + "learning_rate": 0.0001, + "loss": 1.1657, + "loss/crossentropy": 2.459104537963867, + "loss/hidden": 0.95703125, + "loss/logits": 0.17073991894721985, + "loss/reg": 0.0037892020773142576, + "step": 1338 + }, + { + "epoch": 0.167375, + "grad_norm": 2.2020533084869385, + "grad_norm_var": 0.9842790473450236, + "learning_rate": 0.0001, + "loss": 1.0632, + "loss/crossentropy": 2.602936029434204, + "loss/hidden": 0.875, + "loss/logits": 0.1503719538450241, + "loss/reg": 0.00378743140026927, + "step": 1339 + }, + { + "epoch": 0.1675, + "grad_norm": 2.0642025470733643, + "grad_norm_var": 1.0253976633091109, + "learning_rate": 0.0001, + "loss": 1.0186, + "loss/crossentropy": 2.5779831409454346, + "loss/hidden": 0.8359375, + "loss/logits": 0.14476892352104187, + "loss/reg": 0.0037853880785405636, + "step": 1340 + }, + { + "epoch": 0.167625, + "grad_norm": 2.0386452674865723, + "grad_norm_var": 1.0660144013342174, + "learning_rate": 0.0001, + "loss": 1.0255, + "loss/crossentropy": 2.599996328353882, + "loss/hidden": 0.8359375, + "loss/logits": 0.15170088410377502, + "loss/reg": 0.0037834926042705774, + "step": 1341 + }, + { + "epoch": 0.16775, + "grad_norm": 2.469545841217041, + "grad_norm_var": 0.9073509513912974, + "learning_rate": 0.0001, + "loss": 1.1727, + "loss/crossentropy": 2.4470956325531006, + "loss/hidden": 0.96875, + "loss/logits": 0.1660883128643036, + "loss/reg": 0.003781634848564863, + "step": 1342 + }, + { + "epoch": 0.167875, + "grad_norm": 2.972076892852783, + "grad_norm_var": 0.9120929514276962, + "learning_rate": 0.0001, + "loss": 1.0577, + "loss/crossentropy": 2.2371230125427246, + "loss/hidden": 0.87890625, + "loss/logits": 0.14097647368907928, + "loss/reg": 0.0037799072451889515, + "step": 1343 + }, + { + "epoch": 0.168, + "grad_norm": 2.47365403175354, + "grad_norm_var": 0.8907210075164879, + "learning_rate": 0.0001, + "loss": 1.1224, + "loss/crossentropy": 2.574859857559204, + "loss/hidden": 0.91015625, + "loss/logits": 0.1744510978460312, + "loss/reg": 0.003778197569772601, + "step": 1344 + }, + { + "epoch": 0.168125, + "grad_norm": 3.3522775173187256, + "grad_norm_var": 0.27908018822892944, + "learning_rate": 0.0001, + "loss": 1.1439, + "loss/crossentropy": 2.3602190017700195, + "loss/hidden": 0.94921875, + "loss/logits": 0.15688063204288483, + "loss/reg": 0.0037766145542263985, + "step": 1345 + }, + { + "epoch": 0.16825, + "grad_norm": 2.2945148944854736, + "grad_norm_var": 0.27544389245511314, + "learning_rate": 0.0001, + "loss": 1.0967, + "loss/crossentropy": 2.0383851528167725, + "loss/hidden": 0.94140625, + "loss/logits": 0.11755125969648361, + "loss/reg": 0.0037747540045529604, + "step": 1346 + }, + { + "epoch": 0.168375, + "grad_norm": 2.3891069889068604, + "grad_norm_var": 0.2299681545095474, + "learning_rate": 0.0001, + "loss": 1.1306, + "loss/crossentropy": 2.3858978748321533, + "loss/hidden": 0.91796875, + "loss/logits": 0.174921452999115, + "loss/reg": 0.003772968426346779, + "step": 1347 + }, + { + "epoch": 0.1685, + "grad_norm": 2.070483684539795, + "grad_norm_var": 0.24109670204344696, + "learning_rate": 0.0001, + "loss": 0.9534, + "loss/crossentropy": 2.616994619369507, + "loss/hidden": 0.7890625, + "loss/logits": 0.12660518288612366, + "loss/reg": 0.003771234769374132, + "step": 1348 + }, + { + "epoch": 0.168625, + "grad_norm": 2.5373406410217285, + "grad_norm_var": 0.24113562481536305, + "learning_rate": 0.0001, + "loss": 1.149, + "loss/crossentropy": 2.5550339221954346, + "loss/hidden": 0.9296875, + "loss/logits": 0.18165861070156097, + "loss/reg": 0.0037693637423217297, + "step": 1349 + }, + { + "epoch": 0.16875, + "grad_norm": 2.4782979488372803, + "grad_norm_var": 0.11712655452237944, + "learning_rate": 0.0001, + "loss": 1.1058, + "loss/crossentropy": 2.424433469772339, + "loss/hidden": 0.90625, + "loss/logits": 0.16192512214183807, + "loss/reg": 0.003767443122342229, + "step": 1350 + }, + { + "epoch": 0.168875, + "grad_norm": 2.141442060470581, + "grad_norm_var": 0.12118062200624896, + "learning_rate": 0.0001, + "loss": 1.0005, + "loss/crossentropy": 2.4995741844177246, + "loss/hidden": 0.828125, + "loss/logits": 0.13474415242671967, + "loss/reg": 0.0037655706983059645, + "step": 1351 + }, + { + "epoch": 0.169, + "grad_norm": 3.0165703296661377, + "grad_norm_var": 0.1404083277914895, + "learning_rate": 0.0001, + "loss": 1.0107, + "loss/crossentropy": 2.753091812133789, + "loss/hidden": 0.81640625, + "loss/logits": 0.1566263735294342, + "loss/reg": 0.0037636584602296352, + "step": 1352 + }, + { + "epoch": 0.169125, + "grad_norm": 2.388427257537842, + "grad_norm_var": 0.1387512034039254, + "learning_rate": 0.0001, + "loss": 0.9903, + "loss/crossentropy": 2.3668482303619385, + "loss/hidden": 0.8125, + "loss/logits": 0.14017178118228912, + "loss/reg": 0.0037617513444274664, + "step": 1353 + }, + { + "epoch": 0.16925, + "grad_norm": 1.86200749874115, + "grad_norm_var": 0.1600401535940278, + "learning_rate": 0.0001, + "loss": 0.9282, + "loss/crossentropy": 2.6024041175842285, + "loss/hidden": 0.765625, + "loss/logits": 0.12501290440559387, + "loss/reg": 0.003759781364351511, + "step": 1354 + }, + { + "epoch": 0.169375, + "grad_norm": 2.6516194343566895, + "grad_norm_var": 0.15949300228247545, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.66141939163208, + "loss/hidden": 0.87109375, + "loss/logits": 0.1546727865934372, + "loss/reg": 0.0037579077761620283, + "step": 1355 + }, + { + "epoch": 0.1695, + "grad_norm": 2.536879777908325, + "grad_norm_var": 0.1491417929813607, + "learning_rate": 0.0001, + "loss": 1.0189, + "loss/crossentropy": 2.509375810623169, + "loss/hidden": 0.828125, + "loss/logits": 0.1532239019870758, + "loss/reg": 0.0037560255732387304, + "step": 1356 + }, + { + "epoch": 0.169625, + "grad_norm": 2.869354724884033, + "grad_norm_var": 0.14343589299983103, + "learning_rate": 0.0001, + "loss": 1.0562, + "loss/crossentropy": 2.676845073699951, + "loss/hidden": 0.84765625, + "loss/logits": 0.1710355579853058, + "loss/reg": 0.0037540122866630554, + "step": 1357 + }, + { + "epoch": 0.16975, + "grad_norm": 3.0909178256988525, + "grad_norm_var": 0.16243653600033944, + "learning_rate": 0.0001, + "loss": 0.9284, + "loss/crossentropy": 2.4438745975494385, + "loss/hidden": 0.76953125, + "loss/logits": 0.12132885307073593, + "loss/reg": 0.003752180142328143, + "step": 1358 + }, + { + "epoch": 0.169875, + "grad_norm": 2.1628170013427734, + "grad_norm_var": 0.16001678424908092, + "learning_rate": 0.0001, + "loss": 0.9578, + "loss/crossentropy": 2.4705512523651123, + "loss/hidden": 0.7890625, + "loss/logits": 0.13126134872436523, + "loss/reg": 0.0037503293715417385, + "step": 1359 + }, + { + "epoch": 0.17, + "grad_norm": 2.031930923461914, + "grad_norm_var": 0.17492556648024848, + "learning_rate": 0.0001, + "loss": 0.9667, + "loss/crossentropy": 2.6681065559387207, + "loss/hidden": 0.796875, + "loss/logits": 0.1323142796754837, + "loss/reg": 0.003748575458303094, + "step": 1360 + }, + { + "epoch": 0.170125, + "grad_norm": 2.473466634750366, + "grad_norm_var": 0.12240658206718862, + "learning_rate": 0.0001, + "loss": 1.1075, + "loss/crossentropy": 2.744309902191162, + "loss/hidden": 0.8984375, + "loss/logits": 0.17157718539237976, + "loss/reg": 0.003746669040992856, + "step": 1361 + }, + { + "epoch": 0.17025, + "grad_norm": 2.9182069301605225, + "grad_norm_var": 0.1348531412058311, + "learning_rate": 0.0001, + "loss": 0.9539, + "loss/crossentropy": 2.7478702068328857, + "loss/hidden": 0.77734375, + "loss/logits": 0.13914981484413147, + "loss/reg": 0.003744750050827861, + "step": 1362 + }, + { + "epoch": 0.170375, + "grad_norm": 2.5078988075256348, + "grad_norm_var": 0.13435597843808764, + "learning_rate": 0.0001, + "loss": 0.99, + "loss/crossentropy": 2.5372326374053955, + "loss/hidden": 0.8046875, + "loss/logits": 0.14791148900985718, + "loss/reg": 0.003742997534573078, + "step": 1363 + }, + { + "epoch": 0.1705, + "grad_norm": 2.880740165710449, + "grad_norm_var": 0.13075709652999348, + "learning_rate": 0.0001, + "loss": 1.2749, + "loss/crossentropy": 2.247453451156616, + "loss/hidden": 1.0546875, + "loss/logits": 0.18284665048122406, + "loss/reg": 0.003741198917850852, + "step": 1364 + }, + { + "epoch": 0.170625, + "grad_norm": 2.802058219909668, + "grad_norm_var": 0.13524607605756855, + "learning_rate": 0.0001, + "loss": 1.0738, + "loss/crossentropy": 2.5267081260681152, + "loss/hidden": 0.890625, + "loss/logits": 0.1457740068435669, + "loss/reg": 0.0037392526865005493, + "step": 1365 + }, + { + "epoch": 0.17075, + "grad_norm": 2.8533761501312256, + "grad_norm_var": 0.14041346014174008, + "learning_rate": 0.0001, + "loss": 1.1666, + "loss/crossentropy": 2.620199680328369, + "loss/hidden": 0.94921875, + "loss/logits": 0.1800282597541809, + "loss/reg": 0.0037372722290456295, + "step": 1366 + }, + { + "epoch": 0.170875, + "grad_norm": 3.108058214187622, + "grad_norm_var": 0.14303122083475486, + "learning_rate": 0.0001, + "loss": 1.223, + "loss/crossentropy": 2.40995717048645, + "loss/hidden": 1.015625, + "loss/logits": 0.17001160979270935, + "loss/reg": 0.0037354743108153343, + "step": 1367 + }, + { + "epoch": 0.171, + "grad_norm": 2.5328550338745117, + "grad_norm_var": 0.13302262467848933, + "learning_rate": 0.0001, + "loss": 1.1243, + "loss/crossentropy": 2.48756742477417, + "loss/hidden": 0.91015625, + "loss/logits": 0.17680081725120544, + "loss/reg": 0.0037336875684559345, + "step": 1368 + }, + { + "epoch": 0.171125, + "grad_norm": 2.2643351554870605, + "grad_norm_var": 0.13755867625505444, + "learning_rate": 0.0001, + "loss": 0.9683, + "loss/crossentropy": 2.642735719680786, + "loss/hidden": 0.80078125, + "loss/logits": 0.13022208213806152, + "loss/reg": 0.003731830744072795, + "step": 1369 + }, + { + "epoch": 0.17125, + "grad_norm": 1.9663736820220947, + "grad_norm_var": 0.12801642728851045, + "learning_rate": 0.0001, + "loss": 1.0785, + "loss/crossentropy": 2.4266912937164307, + "loss/hidden": 0.90625, + "loss/logits": 0.13490843772888184, + "loss/reg": 0.0037299375981092453, + "step": 1370 + }, + { + "epoch": 0.171375, + "grad_norm": 2.1911253929138184, + "grad_norm_var": 0.1382957404551554, + "learning_rate": 0.0001, + "loss": 1.1049, + "loss/crossentropy": 2.4400012493133545, + "loss/hidden": 0.90625, + "loss/logits": 0.1613638699054718, + "loss/reg": 0.003728190902620554, + "step": 1371 + }, + { + "epoch": 0.1715, + "grad_norm": 2.3214714527130127, + "grad_norm_var": 0.14227339992063978, + "learning_rate": 0.0001, + "loss": 0.9918, + "loss/crossentropy": 2.5530331134796143, + "loss/hidden": 0.81640625, + "loss/logits": 0.13810178637504578, + "loss/reg": 0.003726301481947303, + "step": 1372 + }, + { + "epoch": 0.171625, + "grad_norm": 2.2973015308380127, + "grad_norm_var": 0.13920199708697206, + "learning_rate": 0.0001, + "loss": 0.997, + "loss/crossentropy": 2.51094913482666, + "loss/hidden": 0.82421875, + "loss/logits": 0.13557234406471252, + "loss/reg": 0.0037244099657982588, + "step": 1373 + }, + { + "epoch": 0.17175, + "grad_norm": 2.0010879039764404, + "grad_norm_var": 0.1312278234613122, + "learning_rate": 0.0001, + "loss": 1.0181, + "loss/crossentropy": 2.556290864944458, + "loss/hidden": 0.8359375, + "loss/logits": 0.1449136734008789, + "loss/reg": 0.00372238177806139, + "step": 1374 + }, + { + "epoch": 0.171875, + "grad_norm": 2.9626214504241943, + "grad_norm_var": 0.13982906840783826, + "learning_rate": 0.0001, + "loss": 1.1252, + "loss/crossentropy": 2.6870641708374023, + "loss/hidden": 0.9453125, + "loss/logits": 0.14271126687526703, + "loss/reg": 0.003720562905073166, + "step": 1375 + }, + { + "epoch": 0.172, + "grad_norm": 2.2013375759124756, + "grad_norm_var": 0.1308908021708324, + "learning_rate": 0.0001, + "loss": 0.9615, + "loss/crossentropy": 2.4653432369232178, + "loss/hidden": 0.79296875, + "loss/logits": 0.13133826851844788, + "loss/reg": 0.0037186951376497746, + "step": 1376 + }, + { + "epoch": 0.172125, + "grad_norm": 1.8874136209487915, + "grad_norm_var": 0.15580902298580662, + "learning_rate": 0.0001, + "loss": 0.9726, + "loss/crossentropy": 2.7080636024475098, + "loss/hidden": 0.80078125, + "loss/logits": 0.13469372689723969, + "loss/reg": 0.0037167875561863184, + "step": 1377 + }, + { + "epoch": 0.17225, + "grad_norm": 2.2416839599609375, + "grad_norm_var": 0.14497829998406733, + "learning_rate": 0.0001, + "loss": 1.0436, + "loss/crossentropy": 2.523420572280884, + "loss/hidden": 0.8515625, + "loss/logits": 0.15487292408943176, + "loss/reg": 0.003714931197464466, + "step": 1378 + }, + { + "epoch": 0.172375, + "grad_norm": 2.8195207118988037, + "grad_norm_var": 0.15392134715338787, + "learning_rate": 0.0001, + "loss": 1.1432, + "loss/crossentropy": 2.4862663745880127, + "loss/hidden": 0.99609375, + "loss/logits": 0.10999385267496109, + "loss/reg": 0.003713154001161456, + "step": 1379 + }, + { + "epoch": 0.1725, + "grad_norm": 2.118751287460327, + "grad_norm_var": 0.14728210095098948, + "learning_rate": 0.0001, + "loss": 1.0546, + "loss/crossentropy": 2.33610200881958, + "loss/hidden": 0.875, + "loss/logits": 0.14247506856918335, + "loss/reg": 0.0037111735437065363, + "step": 1380 + }, + { + "epoch": 0.172625, + "grad_norm": 2.783078193664551, + "grad_norm_var": 0.14631392823386963, + "learning_rate": 0.0001, + "loss": 1.1765, + "loss/crossentropy": 2.6499156951904297, + "loss/hidden": 0.96484375, + "loss/logits": 0.17454375326633453, + "loss/reg": 0.0037092994898557663, + "step": 1381 + }, + { + "epoch": 0.17275, + "grad_norm": 2.4130048751831055, + "grad_norm_var": 0.13236574600067233, + "learning_rate": 0.0001, + "loss": 0.9868, + "loss/crossentropy": 2.5483062267303467, + "loss/hidden": 0.80859375, + "loss/logits": 0.14112114906311035, + "loss/reg": 0.003707532538101077, + "step": 1382 + }, + { + "epoch": 0.172875, + "grad_norm": 2.2584409713745117, + "grad_norm_var": 0.09521777507376417, + "learning_rate": 0.0001, + "loss": 1.1614, + "loss/crossentropy": 2.5460643768310547, + "loss/hidden": 0.9609375, + "loss/logits": 0.16338081657886505, + "loss/reg": 0.0037057846784591675, + "step": 1383 + }, + { + "epoch": 0.173, + "grad_norm": 3.7609903812408447, + "grad_norm_var": 0.2229059105024603, + "learning_rate": 0.0001, + "loss": 1.0447, + "loss/crossentropy": 2.775035858154297, + "loss/hidden": 0.85546875, + "loss/logits": 0.15221986174583435, + "loss/reg": 0.0037039562594145536, + "step": 1384 + }, + { + "epoch": 0.173125, + "grad_norm": 2.734567642211914, + "grad_norm_var": 0.22787300757804296, + "learning_rate": 0.0001, + "loss": 1.0205, + "loss/crossentropy": 2.524040937423706, + "loss/hidden": 0.83984375, + "loss/logits": 0.14358995854854584, + "loss/reg": 0.003702066373080015, + "step": 1385 + }, + { + "epoch": 0.17325, + "grad_norm": 25.204544067382812, + "grad_norm_var": 32.52689382508577, + "learning_rate": 0.0001, + "loss": 1.2611, + "loss/crossentropy": 2.7667076587677, + "loss/hidden": 1.078125, + "loss/logits": 0.1459766924381256, + "loss/reg": 0.0037002949975430965, + "step": 1386 + }, + { + "epoch": 0.173375, + "grad_norm": 6.416390419006348, + "grad_norm_var": 32.687121260827944, + "learning_rate": 0.0001, + "loss": 1.2171, + "loss/crossentropy": 2.594097137451172, + "loss/hidden": 1.0546875, + "loss/logits": 0.1254766285419464, + "loss/reg": 0.0036984088364988565, + "step": 1387 + }, + { + "epoch": 0.1735, + "grad_norm": 1.9167765378952026, + "grad_norm_var": 32.796098433775775, + "learning_rate": 0.0001, + "loss": 0.9744, + "loss/crossentropy": 2.2913591861724854, + "loss/hidden": 0.8125, + "loss/logits": 0.12497787177562714, + "loss/reg": 0.003696783911436796, + "step": 1388 + }, + { + "epoch": 0.173625, + "grad_norm": 2.2160396575927734, + "grad_norm_var": 32.816325970432494, + "learning_rate": 0.0001, + "loss": 0.9752, + "loss/crossentropy": 2.8192343711853027, + "loss/hidden": 0.8046875, + "loss/logits": 0.13359323143959045, + "loss/reg": 0.0036951396614313126, + "step": 1389 + }, + { + "epoch": 0.17375, + "grad_norm": 2.288888931274414, + "grad_norm_var": 32.74015382821924, + "learning_rate": 0.0001, + "loss": 1.1827, + "loss/crossentropy": 2.637406826019287, + "loss/hidden": 0.97265625, + "loss/logits": 0.17314687371253967, + "loss/reg": 0.0036930718924850225, + "step": 1390 + }, + { + "epoch": 0.173875, + "grad_norm": 2.411794662475586, + "grad_norm_var": 32.8455146358098, + "learning_rate": 0.0001, + "loss": 1.0526, + "loss/crossentropy": 2.189383029937744, + "loss/hidden": 0.86328125, + "loss/logits": 0.1524190902709961, + "loss/reg": 0.0036910499911755323, + "step": 1391 + }, + { + "epoch": 0.174, + "grad_norm": 4.155782222747803, + "grad_norm_var": 32.58828549446247, + "learning_rate": 0.0001, + "loss": 1.0565, + "loss/crossentropy": 2.463545560836792, + "loss/hidden": 0.87890625, + "loss/logits": 0.14067277312278748, + "loss/reg": 0.0036890122573822737, + "step": 1392 + }, + { + "epoch": 0.174125, + "grad_norm": 5.347140312194824, + "grad_norm_var": 32.25727325951478, + "learning_rate": 0.0001, + "loss": 1.3215, + "loss/crossentropy": 2.153430223464966, + "loss/hidden": 1.1171875, + "loss/logits": 0.1673976629972458, + "loss/reg": 0.0036869607865810394, + "step": 1393 + }, + { + "epoch": 0.17425, + "grad_norm": 2.7766270637512207, + "grad_norm_var": 32.11815070371227, + "learning_rate": 0.0001, + "loss": 1.0493, + "loss/crossentropy": 2.4740800857543945, + "loss/hidden": 0.88671875, + "loss/logits": 0.1257045567035675, + "loss/reg": 0.003685306990519166, + "step": 1394 + }, + { + "epoch": 0.174375, + "grad_norm": 4.125362396240234, + "grad_norm_var": 31.936244846904135, + "learning_rate": 0.0001, + "loss": 1.3623, + "loss/crossentropy": 2.216123342514038, + "loss/hidden": 1.1328125, + "loss/logits": 0.1926833689212799, + "loss/reg": 0.003683644812554121, + "step": 1395 + }, + { + "epoch": 0.1745, + "grad_norm": 2.5013086795806885, + "grad_norm_var": 31.82097080900541, + "learning_rate": 0.0001, + "loss": 1.1619, + "loss/crossentropy": 2.2907092571258545, + "loss/hidden": 0.98046875, + "loss/logits": 0.14458391070365906, + "loss/reg": 0.00368195166811347, + "step": 1396 + }, + { + "epoch": 0.174625, + "grad_norm": 2.9556281566619873, + "grad_norm_var": 31.78144628269959, + "learning_rate": 0.0001, + "loss": 1.3064, + "loss/crossentropy": 2.600534439086914, + "loss/hidden": 0.98828125, + "loss/logits": 0.2812826335430145, + "loss/reg": 0.0036801020614802837, + "step": 1397 + }, + { + "epoch": 0.17475, + "grad_norm": 2.312662363052368, + "grad_norm_var": 31.811237788762753, + "learning_rate": 0.0001, + "loss": 1.2056, + "loss/crossentropy": 2.346238613128662, + "loss/hidden": 1.0078125, + "loss/logits": 0.16105079650878906, + "loss/reg": 0.0036781977396458387, + "step": 1398 + }, + { + "epoch": 0.174875, + "grad_norm": 2.161569356918335, + "grad_norm_var": 31.841893155076757, + "learning_rate": 0.0001, + "loss": 1.0891, + "loss/crossentropy": 2.6312341690063477, + "loss/hidden": 0.8984375, + "loss/logits": 0.15386250615119934, + "loss/reg": 0.0036762990057468414, + "step": 1399 + }, + { + "epoch": 0.175, + "grad_norm": 2.7927346229553223, + "grad_norm_var": 32.00627187711323, + "learning_rate": 0.0001, + "loss": 1.0835, + "loss/crossentropy": 2.3059043884277344, + "loss/hidden": 0.90234375, + "loss/logits": 0.1444191038608551, + "loss/reg": 0.0036745734978467226, + "step": 1400 + }, + { + "epoch": 0.175125, + "grad_norm": 3.7161078453063965, + "grad_norm_var": 31.832840403479917, + "learning_rate": 0.0001, + "loss": 1.2519, + "loss/crossentropy": 2.26311993598938, + "loss/hidden": 1.015625, + "loss/logits": 0.19954730570316315, + "loss/reg": 0.0036726652178913355, + "step": 1401 + }, + { + "epoch": 0.17525, + "grad_norm": 2.494056224822998, + "grad_norm_var": 1.6194340047826798, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.2084007263183594, + "loss/hidden": 0.97265625, + "loss/logits": 0.16313332319259644, + "loss/reg": 0.00367086473852396, + "step": 1402 + }, + { + "epoch": 0.175375, + "grad_norm": 8.896705627441406, + "grad_norm_var": 3.0802516385388357, + "learning_rate": 0.0001, + "loss": 1.8271, + "loss/crossentropy": 2.3072290420532227, + "loss/hidden": 1.546875, + "loss/logits": 0.24357590079307556, + "loss/reg": 0.0036691350396722555, + "step": 1403 + }, + { + "epoch": 0.1755, + "grad_norm": 2.695955753326416, + "grad_norm_var": 2.972744932112194, + "learning_rate": 0.0001, + "loss": 1.2248, + "loss/crossentropy": 2.350301504135132, + "loss/hidden": 1.015625, + "loss/logits": 0.1725081503391266, + "loss/reg": 0.003667246550321579, + "step": 1404 + }, + { + "epoch": 0.175625, + "grad_norm": 2.44710373878479, + "grad_norm_var": 2.940667944838992, + "learning_rate": 0.0001, + "loss": 1.1112, + "loss/crossentropy": 2.5243077278137207, + "loss/hidden": 0.921875, + "loss/logits": 0.15269093215465546, + "loss/reg": 0.0036653466522693634, + "step": 1405 + }, + { + "epoch": 0.17575, + "grad_norm": 2.1895623207092285, + "grad_norm_var": 2.9557342642141196, + "learning_rate": 0.0001, + "loss": 1.035, + "loss/crossentropy": 2.4991495609283447, + "loss/hidden": 0.85546875, + "loss/logits": 0.14287039637565613, + "loss/reg": 0.003663522657006979, + "step": 1406 + }, + { + "epoch": 0.175875, + "grad_norm": 6.32980489730835, + "grad_norm_var": 3.412629436693123, + "learning_rate": 0.0001, + "loss": 1.4889, + "loss/crossentropy": 2.193056583404541, + "loss/hidden": 1.265625, + "loss/logits": 0.18663693964481354, + "loss/reg": 0.0036618507001549006, + "step": 1407 + }, + { + "epoch": 0.176, + "grad_norm": 2.4646599292755127, + "grad_norm_var": 3.4702546151327094, + "learning_rate": 0.0001, + "loss": 1.1915, + "loss/crossentropy": 2.23488450050354, + "loss/hidden": 0.984375, + "loss/logits": 0.17051902413368225, + "loss/reg": 0.003659995039924979, + "step": 1408 + }, + { + "epoch": 0.176125, + "grad_norm": 2.211646318435669, + "grad_norm_var": 3.317894410006067, + "learning_rate": 0.0001, + "loss": 0.9964, + "loss/crossentropy": 2.5030038356781006, + "loss/hidden": 0.8125, + "loss/logits": 0.1473085880279541, + "loss/reg": 0.0036581193562597036, + "step": 1409 + }, + { + "epoch": 0.17625, + "grad_norm": 2.623199224472046, + "grad_norm_var": 3.330419454675635, + "learning_rate": 0.0001, + "loss": 1.3115, + "loss/crossentropy": 2.550668716430664, + "loss/hidden": 1.0859375, + "loss/logits": 0.18897610902786255, + "loss/reg": 0.0036564578767865896, + "step": 1410 + }, + { + "epoch": 0.176375, + "grad_norm": 2.587066173553467, + "grad_norm_var": 3.3105432674443476, + "learning_rate": 0.0001, + "loss": 1.0134, + "loss/crossentropy": 2.560816764831543, + "loss/hidden": 0.8359375, + "loss/logits": 0.14094150066375732, + "loss/reg": 0.003654823638498783, + "step": 1411 + }, + { + "epoch": 0.1765, + "grad_norm": 3.003955602645874, + "grad_norm_var": 3.278755120231675, + "learning_rate": 0.0001, + "loss": 1.1132, + "loss/crossentropy": 2.602020740509033, + "loss/hidden": 0.9296875, + "loss/logits": 0.14699101448059082, + "loss/reg": 0.0036529425997287035, + "step": 1412 + }, + { + "epoch": 0.176625, + "grad_norm": 2.4333064556121826, + "grad_norm_var": 3.315795478379337, + "learning_rate": 0.0001, + "loss": 0.9971, + "loss/crossentropy": 2.402600049972534, + "loss/hidden": 0.82421875, + "loss/logits": 0.13639256358146667, + "loss/reg": 0.003651064820587635, + "step": 1413 + }, + { + "epoch": 0.17675, + "grad_norm": 3.2620654106140137, + "grad_norm_var": 3.2585387544687285, + "learning_rate": 0.0001, + "loss": 1.2037, + "loss/crossentropy": 2.5123348236083984, + "loss/hidden": 0.98828125, + "loss/logits": 0.17892900109291077, + "loss/reg": 0.0036491919308900833, + "step": 1414 + }, + { + "epoch": 0.176875, + "grad_norm": 2.7178690433502197, + "grad_norm_var": 3.195713317595645, + "learning_rate": 0.0001, + "loss": 1.1421, + "loss/crossentropy": 2.5939574241638184, + "loss/hidden": 0.94140625, + "loss/logits": 0.16417454183101654, + "loss/reg": 0.003647380042821169, + "step": 1415 + }, + { + "epoch": 0.177, + "grad_norm": 2.7069778442382812, + "grad_norm_var": 3.2020201720099597, + "learning_rate": 0.0001, + "loss": 1.0438, + "loss/crossentropy": 2.4623637199401855, + "loss/hidden": 0.859375, + "loss/logits": 0.14793148636817932, + "loss/reg": 0.0036454948130995035, + "step": 1416 + }, + { + "epoch": 0.177125, + "grad_norm": 3.3774526119232178, + "grad_norm_var": 3.1903428630054806, + "learning_rate": 0.0001, + "loss": 1.2305, + "loss/crossentropy": 2.493734836578369, + "loss/hidden": 1.03125, + "loss/logits": 0.1627763956785202, + "loss/reg": 0.003643598174676299, + "step": 1417 + }, + { + "epoch": 0.17725, + "grad_norm": 3.0849242210388184, + "grad_norm_var": 3.1504347640183825, + "learning_rate": 0.0001, + "loss": 1.1358, + "loss/crossentropy": 2.631613254547119, + "loss/hidden": 0.92578125, + "loss/logits": 0.1735854148864746, + "loss/reg": 0.0036417231895029545, + "step": 1418 + }, + { + "epoch": 0.177375, + "grad_norm": 2.294229030609131, + "grad_norm_var": 0.9608081109988983, + "learning_rate": 0.0001, + "loss": 1.0362, + "loss/crossentropy": 2.3616018295288086, + "loss/hidden": 0.8515625, + "loss/logits": 0.1482659876346588, + "loss/reg": 0.003640011651441455, + "step": 1419 + }, + { + "epoch": 0.1775, + "grad_norm": 2.5457763671875, + "grad_norm_var": 0.9663407595303662, + "learning_rate": 0.0001, + "loss": 1.1153, + "loss/crossentropy": 2.293968439102173, + "loss/hidden": 0.92578125, + "loss/logits": 0.1531478762626648, + "loss/reg": 0.003638186492025852, + "step": 1420 + }, + { + "epoch": 0.177625, + "grad_norm": 2.2807705402374268, + "grad_norm_var": 0.9779472660718359, + "learning_rate": 0.0001, + "loss": 1.043, + "loss/crossentropy": 2.3743157386779785, + "loss/hidden": 0.859375, + "loss/logits": 0.14730778336524963, + "loss/reg": 0.003636348759755492, + "step": 1421 + }, + { + "epoch": 0.17775, + "grad_norm": 3.111150026321411, + "grad_norm_var": 0.9459346801334104, + "learning_rate": 0.0001, + "loss": 1.445, + "loss/crossentropy": 2.40647292137146, + "loss/hidden": 1.1875, + "loss/logits": 0.22118544578552246, + "loss/reg": 0.0036345720291137695, + "step": 1422 + }, + { + "epoch": 0.177875, + "grad_norm": 2.957676410675049, + "grad_norm_var": 0.13237886720594336, + "learning_rate": 0.0001, + "loss": 1.454, + "loss/crossentropy": 2.2662835121154785, + "loss/hidden": 1.1953125, + "loss/logits": 0.22237327694892883, + "loss/reg": 0.0036327948328107595, + "step": 1423 + }, + { + "epoch": 0.178, + "grad_norm": 2.8840086460113525, + "grad_norm_var": 0.12859406693209482, + "learning_rate": 0.0001, + "loss": 1.1926, + "loss/crossentropy": 2.29878830909729, + "loss/hidden": 0.98828125, + "loss/logits": 0.16803184151649475, + "loss/reg": 0.003631110303103924, + "step": 1424 + }, + { + "epoch": 0.178125, + "grad_norm": 2.56199049949646, + "grad_norm_var": 0.11087788727616968, + "learning_rate": 0.0001, + "loss": 1.1011, + "loss/crossentropy": 2.510556221008301, + "loss/hidden": 0.88671875, + "loss/logits": 0.1780451089143753, + "loss/reg": 0.003629653248935938, + "step": 1425 + }, + { + "epoch": 0.17825, + "grad_norm": 2.547821044921875, + "grad_norm_var": 0.11277902977970579, + "learning_rate": 0.0001, + "loss": 1.1145, + "loss/crossentropy": 2.4963526725769043, + "loss/hidden": 0.9140625, + "loss/logits": 0.1641579121351242, + "loss/reg": 0.003628302598372102, + "step": 1426 + }, + { + "epoch": 0.178375, + "grad_norm": 4.021576404571533, + "grad_norm_var": 0.20596057757328007, + "learning_rate": 0.0001, + "loss": 1.0427, + "loss/crossentropy": 2.6380093097686768, + "loss/hidden": 0.875, + "loss/logits": 0.13147366046905518, + "loss/reg": 0.00362660875543952, + "step": 1427 + }, + { + "epoch": 0.1785, + "grad_norm": 2.6613025665283203, + "grad_norm_var": 0.2068119512618426, + "learning_rate": 0.0001, + "loss": 1.1507, + "loss/crossentropy": 2.449720859527588, + "loss/hidden": 0.94140625, + "loss/logits": 0.17299975454807281, + "loss/reg": 0.003625056240707636, + "step": 1428 + }, + { + "epoch": 0.178625, + "grad_norm": 2.810811996459961, + "grad_norm_var": 0.19522032187841584, + "learning_rate": 0.0001, + "loss": 1.2604, + "loss/crossentropy": 2.5181782245635986, + "loss/hidden": 1.0390625, + "loss/logits": 0.18506762385368347, + "loss/reg": 0.0036235651932656765, + "step": 1429 + }, + { + "epoch": 0.17875, + "grad_norm": 2.3159515857696533, + "grad_norm_var": 0.20096961733446103, + "learning_rate": 0.0001, + "loss": 1.0049, + "loss/crossentropy": 2.4013493061065674, + "loss/hidden": 0.83203125, + "loss/logits": 0.13662859797477722, + "loss/reg": 0.0036217791493982077, + "step": 1430 + }, + { + "epoch": 0.178875, + "grad_norm": 2.760279893875122, + "grad_norm_var": 0.20058922636977528, + "learning_rate": 0.0001, + "loss": 1.1095, + "loss/crossentropy": 2.494328737258911, + "loss/hidden": 0.90234375, + "loss/logits": 0.17097754776477814, + "loss/reg": 0.0036199174355715513, + "step": 1431 + }, + { + "epoch": 0.179, + "grad_norm": 2.1179399490356445, + "grad_norm_var": 0.23018267869760295, + "learning_rate": 0.0001, + "loss": 0.9199, + "loss/crossentropy": 2.328307628631592, + "loss/hidden": 0.76171875, + "loss/logits": 0.12199117988348007, + "loss/reg": 0.0036181595642119646, + "step": 1432 + }, + { + "epoch": 0.179125, + "grad_norm": 1.972982406616211, + "grad_norm_var": 0.23987289746597373, + "learning_rate": 0.0001, + "loss": 1.1017, + "loss/crossentropy": 2.3198962211608887, + "loss/hidden": 0.91015625, + "loss/logits": 0.15536776185035706, + "loss/reg": 0.0036163018085062504, + "step": 1433 + }, + { + "epoch": 0.17925, + "grad_norm": 2.297431707382202, + "grad_norm_var": 0.23643810387164474, + "learning_rate": 0.0001, + "loss": 1.0198, + "loss/crossentropy": 2.3877716064453125, + "loss/hidden": 0.84765625, + "loss/logits": 0.13596263527870178, + "loss/reg": 0.003614293411374092, + "step": 1434 + }, + { + "epoch": 0.179375, + "grad_norm": 4.3237714767456055, + "grad_norm_var": 0.4019732306137899, + "learning_rate": 0.0001, + "loss": 1.8528, + "loss/crossentropy": 2.4731862545013428, + "loss/hidden": 1.515625, + "loss/logits": 0.3010145425796509, + "loss/reg": 0.0036123625468462706, + "step": 1435 + }, + { + "epoch": 0.1795, + "grad_norm": 2.4513933658599854, + "grad_norm_var": 0.4052347077082838, + "learning_rate": 0.0001, + "loss": 1.0992, + "loss/crossentropy": 2.420868396759033, + "loss/hidden": 0.91796875, + "loss/logits": 0.14515507221221924, + "loss/reg": 0.003610546700656414, + "step": 1436 + }, + { + "epoch": 0.179625, + "grad_norm": 3.088550329208374, + "grad_norm_var": 0.39496121989805694, + "learning_rate": 0.0001, + "loss": 0.9986, + "loss/crossentropy": 2.5977416038513184, + "loss/hidden": 0.8046875, + "loss/logits": 0.1577831655740738, + "loss/reg": 0.0036085534375160933, + "step": 1437 + }, + { + "epoch": 0.17975, + "grad_norm": 2.4659948348999023, + "grad_norm_var": 0.3946649959456747, + "learning_rate": 0.0001, + "loss": 1.12, + "loss/crossentropy": 2.3049263954162598, + "loss/hidden": 0.9296875, + "loss/logits": 0.1542476862668991, + "loss/reg": 0.0036067250184714794, + "step": 1438 + }, + { + "epoch": 0.179875, + "grad_norm": 3.8259365558624268, + "grad_norm_var": 0.46409173226906736, + "learning_rate": 0.0001, + "loss": 1.0567, + "loss/crossentropy": 2.495133876800537, + "loss/hidden": 0.87890625, + "loss/logits": 0.14177045226097107, + "loss/reg": 0.0036046463064849377, + "step": 1439 + }, + { + "epoch": 0.18, + "grad_norm": 4.203658103942871, + "grad_norm_var": 0.5843312188094536, + "learning_rate": 0.0001, + "loss": 1.1694, + "loss/crossentropy": 2.5536646842956543, + "loss/hidden": 0.98828125, + "loss/logits": 0.14505554735660553, + "loss/reg": 0.0036028120666742325, + "step": 1440 + }, + { + "epoch": 0.180125, + "grad_norm": 2.6005194187164307, + "grad_norm_var": 0.5826787847955602, + "learning_rate": 0.0001, + "loss": 1.0808, + "loss/crossentropy": 2.723029613494873, + "loss/hidden": 0.87890625, + "loss/logits": 0.16591691970825195, + "loss/reg": 0.003600981319323182, + "step": 1441 + }, + { + "epoch": 0.18025, + "grad_norm": 2.6143887042999268, + "grad_norm_var": 0.5797933388848217, + "learning_rate": 0.0001, + "loss": 1.0984, + "loss/crossentropy": 2.677304744720459, + "loss/hidden": 0.890625, + "loss/logits": 0.17178688943386078, + "loss/reg": 0.003598999697715044, + "step": 1442 + }, + { + "epoch": 0.180375, + "grad_norm": 3.0147054195404053, + "grad_norm_var": 0.4936957943628516, + "learning_rate": 0.0001, + "loss": 1.1792, + "loss/crossentropy": 2.567859649658203, + "loss/hidden": 0.95703125, + "loss/logits": 0.18622992932796478, + "loss/reg": 0.00359702087007463, + "step": 1443 + }, + { + "epoch": 0.1805, + "grad_norm": 2.5004806518554688, + "grad_norm_var": 0.4992588141750974, + "learning_rate": 0.0001, + "loss": 1.0141, + "loss/crossentropy": 2.335355043411255, + "loss/hidden": 0.84765625, + "loss/logits": 0.1304875910282135, + "loss/reg": 0.0035950199235230684, + "step": 1444 + }, + { + "epoch": 0.180625, + "grad_norm": 2.9756336212158203, + "grad_norm_var": 0.5004185509481144, + "learning_rate": 0.0001, + "loss": 1.1931, + "loss/crossentropy": 2.262080430984497, + "loss/hidden": 1.0, + "loss/logits": 0.15718932449817657, + "loss/reg": 0.00359291210770607, + "step": 1445 + }, + { + "epoch": 0.18075, + "grad_norm": 2.8728082180023193, + "grad_norm_var": 0.48047395147950145, + "learning_rate": 0.0001, + "loss": 1.1377, + "loss/crossentropy": 2.4912827014923096, + "loss/hidden": 0.91796875, + "loss/logits": 0.18385069072246552, + "loss/reg": 0.0035907707642763853, + "step": 1446 + }, + { + "epoch": 0.180875, + "grad_norm": 2.440415143966675, + "grad_norm_var": 0.4919916999810859, + "learning_rate": 0.0001, + "loss": 1.1763, + "loss/crossentropy": 2.539290189743042, + "loss/hidden": 0.95703125, + "loss/logits": 0.18333688378334045, + "loss/reg": 0.003588638501241803, + "step": 1447 + }, + { + "epoch": 0.181, + "grad_norm": 2.3080894947052, + "grad_norm_var": 0.4754273782914159, + "learning_rate": 0.0001, + "loss": 1.0376, + "loss/crossentropy": 2.552766799926758, + "loss/hidden": 0.8515625, + "loss/logits": 0.1501522660255432, + "loss/reg": 0.0035868044942617416, + "step": 1448 + }, + { + "epoch": 0.181125, + "grad_norm": 2.343794822692871, + "grad_norm_var": 0.43955761846480074, + "learning_rate": 0.0001, + "loss": 1.3478, + "loss/crossentropy": 2.1275408267974854, + "loss/hidden": 1.109375, + "loss/logits": 0.2026042938232422, + "loss/reg": 0.0035850289277732372, + "step": 1449 + }, + { + "epoch": 0.18125, + "grad_norm": 2.139036178588867, + "grad_norm_var": 0.45375597061421685, + "learning_rate": 0.0001, + "loss": 1.0913, + "loss/crossentropy": 2.6152234077453613, + "loss/hidden": 0.89453125, + "loss/logits": 0.16098400950431824, + "loss/reg": 0.003583215642720461, + "step": 1450 + }, + { + "epoch": 0.181375, + "grad_norm": 6.257389545440674, + "grad_norm_var": 1.0582259715841054, + "learning_rate": 0.0001, + "loss": 1.7871, + "loss/crossentropy": 2.396705150604248, + "loss/hidden": 1.4140625, + "loss/logits": 0.3372613787651062, + "loss/reg": 0.003581451950594783, + "step": 1451 + }, + { + "epoch": 0.1815, + "grad_norm": 2.660068988800049, + "grad_norm_var": 1.0455046997651758, + "learning_rate": 0.0001, + "loss": 1.1302, + "loss/crossentropy": 2.4636411666870117, + "loss/hidden": 0.9375, + "loss/logits": 0.15687166154384613, + "loss/reg": 0.0035797141026705503, + "step": 1452 + }, + { + "epoch": 0.181625, + "grad_norm": 2.0276317596435547, + "grad_norm_var": 1.1060792073261607, + "learning_rate": 0.0001, + "loss": 1.0576, + "loss/crossentropy": 2.645092010498047, + "loss/hidden": 0.8671875, + "loss/logits": 0.15465402603149414, + "loss/reg": 0.003578024450689554, + "step": 1453 + }, + { + "epoch": 0.18175, + "grad_norm": 2.706411123275757, + "grad_norm_var": 1.0940753984711251, + "learning_rate": 0.0001, + "loss": 1.2783, + "loss/crossentropy": 2.465183973312378, + "loss/hidden": 1.0703125, + "loss/logits": 0.1722460836172104, + "loss/reg": 0.003576185554265976, + "step": 1454 + }, + { + "epoch": 0.181875, + "grad_norm": 2.883852243423462, + "grad_norm_var": 1.0418023995859433, + "learning_rate": 0.0001, + "loss": 1.1553, + "loss/crossentropy": 2.62016224861145, + "loss/hidden": 0.9609375, + "loss/logits": 0.15863552689552307, + "loss/reg": 0.0035745068453252316, + "step": 1455 + }, + { + "epoch": 0.182, + "grad_norm": 2.712892532348633, + "grad_norm_var": 0.9234243773258602, + "learning_rate": 0.0001, + "loss": 1.2436, + "loss/crossentropy": 2.7399840354919434, + "loss/hidden": 1.0078125, + "loss/logits": 0.20005175471305847, + "loss/reg": 0.0035726907663047314, + "step": 1456 + }, + { + "epoch": 0.182125, + "grad_norm": 2.2389307022094727, + "grad_norm_var": 0.9419911218676927, + "learning_rate": 0.0001, + "loss": 1.0805, + "loss/crossentropy": 2.4630889892578125, + "loss/hidden": 0.88671875, + "loss/logits": 0.15808376669883728, + "loss/reg": 0.003570869332179427, + "step": 1457 + }, + { + "epoch": 0.18225, + "grad_norm": 2.272373676300049, + "grad_norm_var": 0.9574713564477899, + "learning_rate": 0.0001, + "loss": 1.0423, + "loss/crossentropy": 2.420292377471924, + "loss/hidden": 0.8515625, + "loss/logits": 0.1550384759902954, + "loss/reg": 0.003569073276594281, + "step": 1458 + }, + { + "epoch": 0.182375, + "grad_norm": 1.9881032705307007, + "grad_norm_var": 0.9901407757083646, + "learning_rate": 0.0001, + "loss": 1.0698, + "loss/crossentropy": 2.412853240966797, + "loss/hidden": 0.890625, + "loss/logits": 0.14347431063652039, + "loss/reg": 0.0035672772210091352, + "step": 1459 + }, + { + "epoch": 0.1825, + "grad_norm": 9.163015365600586, + "grad_norm_var": 3.5801338990358595, + "learning_rate": 0.0001, + "loss": 1.2578, + "loss/crossentropy": 2.492182731628418, + "loss/hidden": 1.0390625, + "loss/logits": 0.18306787312030792, + "loss/reg": 0.003565459046512842, + "step": 1460 + }, + { + "epoch": 0.182625, + "grad_norm": 2.5656776428222656, + "grad_norm_var": 3.598769741394437, + "learning_rate": 0.0001, + "loss": 1.001, + "loss/crossentropy": 2.5562655925750732, + "loss/hidden": 0.82421875, + "loss/logits": 0.14114579558372498, + "loss/reg": 0.003563658567145467, + "step": 1461 + }, + { + "epoch": 0.18275, + "grad_norm": 2.286931276321411, + "grad_norm_var": 3.6378752514728876, + "learning_rate": 0.0001, + "loss": 0.9807, + "loss/crossentropy": 2.698147773742676, + "loss/hidden": 0.8125, + "loss/logits": 0.13260522484779358, + "loss/reg": 0.0035618396941572428, + "step": 1462 + }, + { + "epoch": 0.182875, + "grad_norm": 2.371244430541992, + "grad_norm_var": 3.643908523891269, + "learning_rate": 0.0001, + "loss": 1.1124, + "loss/crossentropy": 2.432371139526367, + "loss/hidden": 0.91015625, + "loss/logits": 0.16662147641181946, + "loss/reg": 0.0035599328111857176, + "step": 1463 + }, + { + "epoch": 0.183, + "grad_norm": 2.6184229850769043, + "grad_norm_var": 3.6189046702026477, + "learning_rate": 0.0001, + "loss": 1.0458, + "loss/crossentropy": 2.947530508041382, + "loss/hidden": 0.8671875, + "loss/logits": 0.143006831407547, + "loss/reg": 0.0035581255797296762, + "step": 1464 + }, + { + "epoch": 0.183125, + "grad_norm": 2.143239736557007, + "grad_norm_var": 3.641031281987516, + "learning_rate": 0.0001, + "loss": 1.0678, + "loss/crossentropy": 2.6318516731262207, + "loss/hidden": 0.88671875, + "loss/logits": 0.14551308751106262, + "loss/reg": 0.00355625175870955, + "step": 1465 + }, + { + "epoch": 0.18325, + "grad_norm": 2.7335619926452637, + "grad_norm_var": 3.589745013057074, + "learning_rate": 0.0001, + "loss": 1.2078, + "loss/crossentropy": 2.375401020050049, + "loss/hidden": 0.9765625, + "loss/logits": 0.19566710293293, + "loss/reg": 0.0035545255523175, + "step": 1466 + }, + { + "epoch": 0.183375, + "grad_norm": 2.3009843826293945, + "grad_norm_var": 2.903458838671109, + "learning_rate": 0.0001, + "loss": 1.1591, + "loss/crossentropy": 2.5083110332489014, + "loss/hidden": 0.94921875, + "loss/logits": 0.1743055284023285, + "loss/reg": 0.003552833804860711, + "step": 1467 + }, + { + "epoch": 0.1835, + "grad_norm": 1.9567177295684814, + "grad_norm_var": 2.952619415111201, + "learning_rate": 0.0001, + "loss": 0.9881, + "loss/crossentropy": 2.5743629932403564, + "loss/hidden": 0.80859375, + "loss/logits": 0.14398899674415588, + "loss/reg": 0.003551185131072998, + "step": 1468 + }, + { + "epoch": 0.183625, + "grad_norm": 2.2741682529449463, + "grad_norm_var": 2.9306800113679072, + "learning_rate": 0.0001, + "loss": 1.0048, + "loss/crossentropy": 2.4656572341918945, + "loss/hidden": 0.83203125, + "loss/logits": 0.1372774988412857, + "loss/reg": 0.003549505490809679, + "step": 1469 + }, + { + "epoch": 0.18375, + "grad_norm": 2.9343631267547607, + "grad_norm_var": 2.9302919053315675, + "learning_rate": 0.0001, + "loss": 1.1254, + "loss/crossentropy": 2.4442336559295654, + "loss/hidden": 0.8984375, + "loss/logits": 0.19146251678466797, + "loss/reg": 0.003547689877450466, + "step": 1470 + }, + { + "epoch": 0.183875, + "grad_norm": 2.1364428997039795, + "grad_norm_var": 2.9608635231208154, + "learning_rate": 0.0001, + "loss": 1.0281, + "loss/crossentropy": 2.4912197589874268, + "loss/hidden": 0.84765625, + "loss/logits": 0.14495311677455902, + "loss/reg": 0.003545962506905198, + "step": 1471 + }, + { + "epoch": 0.184, + "grad_norm": 2.244561195373535, + "grad_norm_var": 2.979609556239146, + "learning_rate": 0.0001, + "loss": 0.9575, + "loss/crossentropy": 2.461921215057373, + "loss/hidden": 0.80078125, + "loss/logits": 0.12127329409122467, + "loss/reg": 0.0035440947394818068, + "step": 1472 + }, + { + "epoch": 0.184125, + "grad_norm": 2.6610682010650635, + "grad_norm_var": 2.96117686540241, + "learning_rate": 0.0001, + "loss": 1.0065, + "loss/crossentropy": 2.4418022632598877, + "loss/hidden": 0.828125, + "loss/logits": 0.14295433461666107, + "loss/reg": 0.003542231861501932, + "step": 1473 + }, + { + "epoch": 0.18425, + "grad_norm": 2.17217755317688, + "grad_norm_var": 2.9687286207062233, + "learning_rate": 0.0001, + "loss": 1.0648, + "loss/crossentropy": 2.4521894454956055, + "loss/hidden": 0.8828125, + "loss/logits": 0.146602600812912, + "loss/reg": 0.003540375269949436, + "step": 1474 + }, + { + "epoch": 0.184375, + "grad_norm": 2.5094003677368164, + "grad_norm_var": 2.930364197494137, + "learning_rate": 0.0001, + "loss": 1.0535, + "loss/crossentropy": 2.2975356578826904, + "loss/hidden": 0.890625, + "loss/logits": 0.12753836810588837, + "loss/reg": 0.003538495395332575, + "step": 1475 + }, + { + "epoch": 0.1845, + "grad_norm": 2.8615875244140625, + "grad_norm_var": 0.08025149529701801, + "learning_rate": 0.0001, + "loss": 1.1913, + "loss/crossentropy": 2.4363348484039307, + "loss/hidden": 0.98046875, + "loss/logits": 0.1754908561706543, + "loss/reg": 0.0035365556832402945, + "step": 1476 + }, + { + "epoch": 0.184625, + "grad_norm": 2.3295183181762695, + "grad_norm_var": 0.07924959319393471, + "learning_rate": 0.0001, + "loss": 1.085, + "loss/crossentropy": 2.641104221343994, + "loss/hidden": 0.89453125, + "loss/logits": 0.15514951944351196, + "loss/reg": 0.0035345894284546375, + "step": 1477 + }, + { + "epoch": 0.18475, + "grad_norm": 3.0680177211761475, + "grad_norm_var": 0.10473031746968976, + "learning_rate": 0.0001, + "loss": 1.0339, + "loss/crossentropy": 2.1963143348693848, + "loss/hidden": 0.84765625, + "loss/logits": 0.15095466375350952, + "loss/reg": 0.003532707691192627, + "step": 1478 + }, + { + "epoch": 0.184875, + "grad_norm": 2.1066739559173584, + "grad_norm_var": 0.11213794701280312, + "learning_rate": 0.0001, + "loss": 0.9353, + "loss/crossentropy": 2.988297700881958, + "loss/hidden": 0.78515625, + "loss/logits": 0.11482476443052292, + "loss/reg": 0.0035308676306158304, + "step": 1479 + }, + { + "epoch": 0.185, + "grad_norm": 2.4566586017608643, + "grad_norm_var": 0.10993979963402485, + "learning_rate": 0.0001, + "loss": 1.3199, + "loss/crossentropy": 2.324228048324585, + "loss/hidden": 1.09375, + "loss/logits": 0.19089123606681824, + "loss/reg": 0.0035288881044834852, + "step": 1480 + }, + { + "epoch": 0.185125, + "grad_norm": 2.360886573791504, + "grad_norm_var": 0.10456219156340367, + "learning_rate": 0.0001, + "loss": 1.11, + "loss/crossentropy": 2.5515263080596924, + "loss/hidden": 0.90625, + "loss/logits": 0.16846542060375214, + "loss/reg": 0.003527080873027444, + "step": 1481 + }, + { + "epoch": 0.18525, + "grad_norm": 2.6080853939056396, + "grad_norm_var": 0.10070469690842856, + "learning_rate": 0.0001, + "loss": 1.1455, + "loss/crossentropy": 2.456618070602417, + "loss/hidden": 0.9296875, + "loss/logits": 0.18056106567382812, + "loss/reg": 0.003525231732055545, + "step": 1482 + }, + { + "epoch": 0.185375, + "grad_norm": 2.5533339977264404, + "grad_norm_var": 0.10013072862828926, + "learning_rate": 0.0001, + "loss": 1.0745, + "loss/crossentropy": 2.6200571060180664, + "loss/hidden": 0.89453125, + "loss/logits": 0.14470210671424866, + "loss/reg": 0.003523309249430895, + "step": 1483 + }, + { + "epoch": 0.1855, + "grad_norm": 2.5463483333587646, + "grad_norm_var": 0.08291376946414909, + "learning_rate": 0.0001, + "loss": 1.1705, + "loss/crossentropy": 2.4352331161499023, + "loss/hidden": 0.95703125, + "loss/logits": 0.1782476007938385, + "loss/reg": 0.0035214636009186506, + "step": 1484 + }, + { + "epoch": 0.185625, + "grad_norm": 2.1944315433502197, + "grad_norm_var": 0.08559466734096356, + "learning_rate": 0.0001, + "loss": 0.9894, + "loss/crossentropy": 2.350627899169922, + "loss/hidden": 0.82421875, + "loss/logits": 0.13000428676605225, + "loss/reg": 0.0035196379758417606, + "step": 1485 + }, + { + "epoch": 0.18575, + "grad_norm": 2.352766990661621, + "grad_norm_var": 0.0718094639254095, + "learning_rate": 0.0001, + "loss": 1.1058, + "loss/crossentropy": 2.625410556793213, + "loss/hidden": 0.921875, + "loss/logits": 0.1486971080303192, + "loss/reg": 0.003517881967127323, + "step": 1486 + }, + { + "epoch": 0.185875, + "grad_norm": 2.4021310806274414, + "grad_norm_var": 0.06519778826045103, + "learning_rate": 0.0001, + "loss": 0.9818, + "loss/crossentropy": 2.3567581176757812, + "loss/hidden": 0.8203125, + "loss/logits": 0.1262809932231903, + "loss/reg": 0.0035160251427441835, + "step": 1487 + }, + { + "epoch": 0.186, + "grad_norm": 2.510427951812744, + "grad_norm_var": 0.06182866367774575, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.475712537765503, + "loss/hidden": 0.921875, + "loss/logits": 0.1811387538909912, + "loss/reg": 0.003514372045174241, + "step": 1488 + }, + { + "epoch": 0.186125, + "grad_norm": 2.0962274074554443, + "grad_norm_var": 0.0681959672911449, + "learning_rate": 0.0001, + "loss": 0.9282, + "loss/crossentropy": 2.600541830062866, + "loss/hidden": 0.76953125, + "loss/logits": 0.12352926284074783, + "loss/reg": 0.0035127766896039248, + "step": 1489 + }, + { + "epoch": 0.18625, + "grad_norm": 2.7332022190093994, + "grad_norm_var": 0.06741919371529713, + "learning_rate": 0.0001, + "loss": 1.0116, + "loss/crossentropy": 2.5983481407165527, + "loss/hidden": 0.8359375, + "loss/logits": 0.14054536819458008, + "loss/reg": 0.003511229529976845, + "step": 1490 + }, + { + "epoch": 0.186375, + "grad_norm": 2.262906789779663, + "grad_norm_var": 0.07027029030217118, + "learning_rate": 0.0001, + "loss": 1.2372, + "loss/crossentropy": 2.5684587955474854, + "loss/hidden": 1.015625, + "loss/logits": 0.1864907145500183, + "loss/reg": 0.0035094181075692177, + "step": 1491 + }, + { + "epoch": 0.1865, + "grad_norm": 2.142493963241577, + "grad_norm_var": 0.06458349300590362, + "learning_rate": 0.0001, + "loss": 0.9329, + "loss/crossentropy": 2.6954498291015625, + "loss/hidden": 0.765625, + "loss/logits": 0.13216395676136017, + "loss/reg": 0.003507613204419613, + "step": 1492 + }, + { + "epoch": 0.186625, + "grad_norm": 2.312235116958618, + "grad_norm_var": 0.06481126280718001, + "learning_rate": 0.0001, + "loss": 1.0631, + "loss/crossentropy": 2.7299952507019043, + "loss/hidden": 0.8828125, + "loss/logits": 0.1452367901802063, + "loss/reg": 0.003506068605929613, + "step": 1493 + }, + { + "epoch": 0.18675, + "grad_norm": 2.66845703125, + "grad_norm_var": 0.040222462022599596, + "learning_rate": 0.0001, + "loss": 1.1768, + "loss/crossentropy": 2.5572774410247803, + "loss/hidden": 0.96875, + "loss/logits": 0.17304158210754395, + "loss/reg": 0.0035042495001107454, + "step": 1494 + }, + { + "epoch": 0.186875, + "grad_norm": 2.0365381240844727, + "grad_norm_var": 0.04321872460463207, + "learning_rate": 0.0001, + "loss": 1.0286, + "loss/crossentropy": 2.845195770263672, + "loss/hidden": 0.8515625, + "loss/logits": 0.14202150702476501, + "loss/reg": 0.0035027535632252693, + "step": 1495 + }, + { + "epoch": 0.187, + "grad_norm": 2.2023096084594727, + "grad_norm_var": 0.04499537551175739, + "learning_rate": 0.0001, + "loss": 1.1091, + "loss/crossentropy": 2.4882686138153076, + "loss/hidden": 0.91796875, + "loss/logits": 0.15609663724899292, + "loss/reg": 0.0035012420266866684, + "step": 1496 + }, + { + "epoch": 0.187125, + "grad_norm": 2.208552122116089, + "grad_norm_var": 0.04671054126144914, + "learning_rate": 0.0001, + "loss": 1.2829, + "loss/crossentropy": 2.6602842807769775, + "loss/hidden": 1.0546875, + "loss/logits": 0.19319944083690643, + "loss/reg": 0.0034994245506823063, + "step": 1497 + }, + { + "epoch": 0.18725, + "grad_norm": 2.562239170074463, + "grad_norm_var": 0.04535231939183457, + "learning_rate": 0.0001, + "loss": 1.0075, + "loss/crossentropy": 2.238691806793213, + "loss/hidden": 0.8359375, + "loss/logits": 0.136610209941864, + "loss/reg": 0.0034975947346538305, + "step": 1498 + }, + { + "epoch": 0.187375, + "grad_norm": 2.401848316192627, + "grad_norm_var": 0.04291264261425264, + "learning_rate": 0.0001, + "loss": 1.0662, + "loss/crossentropy": 2.685908079147339, + "loss/hidden": 0.890625, + "loss/logits": 0.1406560093164444, + "loss/reg": 0.0034957744646817446, + "step": 1499 + }, + { + "epoch": 0.1875, + "grad_norm": 2.12306809425354, + "grad_norm_var": 0.04314595548621488, + "learning_rate": 0.0001, + "loss": 1.1621, + "loss/crossentropy": 2.2005772590637207, + "loss/hidden": 0.9609375, + "loss/logits": 0.1661786586046219, + "loss/reg": 0.0034941888879984617, + "step": 1500 + }, + { + "epoch": 0.187625, + "grad_norm": 3.0501010417938232, + "grad_norm_var": 0.07394000618437152, + "learning_rate": 0.0001, + "loss": 1.3451, + "loss/crossentropy": 2.623453140258789, + "loss/hidden": 1.09375, + "loss/logits": 0.21640917658805847, + "loss/reg": 0.003492384683340788, + "step": 1501 + }, + { + "epoch": 0.18775, + "grad_norm": 3.257550001144409, + "grad_norm_var": 0.12192848616994235, + "learning_rate": 0.0001, + "loss": 1.3142, + "loss/crossentropy": 2.670974016189575, + "loss/hidden": 1.0859375, + "loss/logits": 0.19331884384155273, + "loss/reg": 0.0034905769862234592, + "step": 1502 + }, + { + "epoch": 0.187875, + "grad_norm": 1.9728327989578247, + "grad_norm_var": 0.13536526430902043, + "learning_rate": 0.0001, + "loss": 0.9776, + "loss/crossentropy": 2.603712320327759, + "loss/hidden": 0.8046875, + "loss/logits": 0.1380743682384491, + "loss/reg": 0.0034887471701949835, + "step": 1503 + }, + { + "epoch": 0.188, + "grad_norm": 41.34659194946289, + "grad_norm_var": 94.9270262878801, + "learning_rate": 0.0001, + "loss": 1.3036, + "loss/crossentropy": 2.142944812774658, + "loss/hidden": 1.0859375, + "loss/logits": 0.1828157901763916, + "loss/reg": 0.003486843081191182, + "step": 1504 + }, + { + "epoch": 0.188125, + "grad_norm": 2.199233293533325, + "grad_norm_var": 94.89006007533027, + "learning_rate": 0.0001, + "loss": 1.027, + "loss/crossentropy": 2.5525684356689453, + "loss/hidden": 0.8515625, + "loss/logits": 0.14055398106575012, + "loss/reg": 0.0034850805532187223, + "step": 1505 + }, + { + "epoch": 0.18825, + "grad_norm": 2.0132434368133545, + "grad_norm_var": 95.12493831851324, + "learning_rate": 0.0001, + "loss": 0.9913, + "loss/crossentropy": 2.3946876525878906, + "loss/hidden": 0.82421875, + "loss/logits": 0.13224273920059204, + "loss/reg": 0.0034832614473998547, + "step": 1506 + }, + { + "epoch": 0.188375, + "grad_norm": 2.1540908813476562, + "grad_norm_var": 95.16245243204516, + "learning_rate": 0.0001, + "loss": 1.0228, + "loss/crossentropy": 2.4561750888824463, + "loss/hidden": 0.83203125, + "loss/logits": 0.15593823790550232, + "loss/reg": 0.003481344785541296, + "step": 1507 + }, + { + "epoch": 0.1885, + "grad_norm": 2.6965999603271484, + "grad_norm_var": 94.98598958949965, + "learning_rate": 0.0001, + "loss": 1.2656, + "loss/crossentropy": 2.38210391998291, + "loss/hidden": 1.0390625, + "loss/logits": 0.19175776839256287, + "loss/reg": 0.003479481441900134, + "step": 1508 + }, + { + "epoch": 0.188625, + "grad_norm": 3.4180994033813477, + "grad_norm_var": 94.69186888365496, + "learning_rate": 0.0001, + "loss": 1.2663, + "loss/crossentropy": 2.4697988033294678, + "loss/hidden": 1.046875, + "loss/logits": 0.18465159833431244, + "loss/reg": 0.0034777566324919462, + "step": 1509 + }, + { + "epoch": 0.18875, + "grad_norm": 2.273015022277832, + "grad_norm_var": 94.81900961164247, + "learning_rate": 0.0001, + "loss": 1.0814, + "loss/crossentropy": 2.623250722885132, + "loss/hidden": 0.87890625, + "loss/logits": 0.16774481534957886, + "loss/reg": 0.003475895617157221, + "step": 1510 + }, + { + "epoch": 0.188875, + "grad_norm": 2.247028112411499, + "grad_norm_var": 94.74226385976161, + "learning_rate": 0.0001, + "loss": 1.117, + "loss/crossentropy": 2.674126148223877, + "loss/hidden": 0.921875, + "loss/logits": 0.1603851318359375, + "loss/reg": 0.0034741731360554695, + "step": 1511 + }, + { + "epoch": 0.189, + "grad_norm": 4.356043338775635, + "grad_norm_var": 94.26240397096606, + "learning_rate": 0.0001, + "loss": 1.2795, + "loss/crossentropy": 2.6789023876190186, + "loss/hidden": 1.0546875, + "loss/logits": 0.19006717205047607, + "loss/reg": 0.0034723973367363214, + "step": 1512 + }, + { + "epoch": 0.189125, + "grad_norm": 2.381601572036743, + "grad_norm_var": 94.19946382080788, + "learning_rate": 0.0001, + "loss": 1.245, + "loss/crossentropy": 2.090708017349243, + "loss/hidden": 1.046875, + "loss/logits": 0.1634030044078827, + "loss/reg": 0.0034707069862633944, + "step": 1513 + }, + { + "epoch": 0.18925, + "grad_norm": 4.63723087310791, + "grad_norm_var": 93.78628244843514, + "learning_rate": 0.0001, + "loss": 1.0803, + "loss/crossentropy": 2.9273617267608643, + "loss/hidden": 0.90625, + "loss/logits": 0.13935977220535278, + "loss/reg": 0.0034691800829023123, + "step": 1514 + }, + { + "epoch": 0.189375, + "grad_norm": 3.1558918952941895, + "grad_norm_var": 93.54471655609011, + "learning_rate": 0.0001, + "loss": 1.1021, + "loss/crossentropy": 2.6062216758728027, + "loss/hidden": 0.8984375, + "loss/logits": 0.16897618770599365, + "loss/reg": 0.003467726521193981, + "step": 1515 + }, + { + "epoch": 0.1895, + "grad_norm": 2.6901845932006836, + "grad_norm_var": 93.331765452413, + "learning_rate": 0.0001, + "loss": 1.0927, + "loss/crossentropy": 2.563309907913208, + "loss/hidden": 0.9140625, + "loss/logits": 0.14401212334632874, + "loss/reg": 0.003465942805632949, + "step": 1516 + }, + { + "epoch": 0.189625, + "grad_norm": 2.6145591735839844, + "grad_norm_var": 93.47082774818871, + "learning_rate": 0.0001, + "loss": 1.1674, + "loss/crossentropy": 2.80975604057312, + "loss/hidden": 0.96484375, + "loss/logits": 0.1679481714963913, + "loss/reg": 0.0034644228871911764, + "step": 1517 + }, + { + "epoch": 0.18975, + "grad_norm": 2.1897029876708984, + "grad_norm_var": 93.82056409785089, + "learning_rate": 0.0001, + "loss": 1.1465, + "loss/crossentropy": 2.4143311977386475, + "loss/hidden": 0.94140625, + "loss/logits": 0.17049774527549744, + "loss/reg": 0.003462952096015215, + "step": 1518 + }, + { + "epoch": 0.189875, + "grad_norm": 2.565324068069458, + "grad_norm_var": 93.59177882800311, + "learning_rate": 0.0001, + "loss": 1.3113, + "loss/crossentropy": 2.0998597145080566, + "loss/hidden": 1.0703125, + "loss/logits": 0.20640692114830017, + "loss/reg": 0.0034615371841937304, + "step": 1519 + }, + { + "epoch": 0.19, + "grad_norm": 2.4154701232910156, + "grad_norm_var": 0.6036209187642118, + "learning_rate": 0.0001, + "loss": 0.9326, + "loss/crossentropy": 2.6523633003234863, + "loss/hidden": 0.76171875, + "loss/logits": 0.1362442970275879, + "loss/reg": 0.0034597725607454777, + "step": 1520 + }, + { + "epoch": 0.190125, + "grad_norm": 2.2444441318511963, + "grad_norm_var": 0.6004258293545394, + "learning_rate": 0.0001, + "loss": 1.128, + "loss/crossentropy": 2.0900661945343018, + "loss/hidden": 0.95703125, + "loss/logits": 0.136434406042099, + "loss/reg": 0.0034582833759486675, + "step": 1521 + }, + { + "epoch": 0.19025, + "grad_norm": 2.411386728286743, + "grad_norm_var": 0.5710476325004736, + "learning_rate": 0.0001, + "loss": 1.2059, + "loss/crossentropy": 2.381988286972046, + "loss/hidden": 0.99609375, + "loss/logits": 0.17528721690177917, + "loss/reg": 0.0034567993134260178, + "step": 1522 + }, + { + "epoch": 0.190375, + "grad_norm": 2.5889010429382324, + "grad_norm_var": 0.5466832532559577, + "learning_rate": 0.0001, + "loss": 1.0782, + "loss/crossentropy": 2.6577584743499756, + "loss/hidden": 0.8984375, + "loss/logits": 0.14523936808109283, + "loss/reg": 0.003455315949395299, + "step": 1523 + }, + { + "epoch": 0.1905, + "grad_norm": 2.219857692718506, + "grad_norm_var": 0.56780075329749, + "learning_rate": 0.0001, + "loss": 1.0392, + "loss/crossentropy": 2.7978203296661377, + "loss/hidden": 0.8359375, + "loss/logits": 0.16872358322143555, + "loss/reg": 0.0034539303742349148, + "step": 1524 + }, + { + "epoch": 0.190625, + "grad_norm": 2.3635013103485107, + "grad_norm_var": 0.5469604537174282, + "learning_rate": 0.0001, + "loss": 1.0257, + "loss/crossentropy": 2.6388540267944336, + "loss/hidden": 0.8359375, + "loss/logits": 0.15525703132152557, + "loss/reg": 0.0034521608613431454, + "step": 1525 + }, + { + "epoch": 0.19075, + "grad_norm": 2.082015037536621, + "grad_norm_var": 0.560359742807311, + "learning_rate": 0.0001, + "loss": 1.1153, + "loss/crossentropy": 2.446864128112793, + "loss/hidden": 0.921875, + "loss/logits": 0.15890395641326904, + "loss/reg": 0.003450631396844983, + "step": 1526 + }, + { + "epoch": 0.190875, + "grad_norm": 3.5639352798461914, + "grad_norm_var": 0.5896182471249951, + "learning_rate": 0.0001, + "loss": 1.193, + "loss/crossentropy": 2.174250602722168, + "loss/hidden": 1.0234375, + "loss/logits": 0.13503766059875488, + "loss/reg": 0.0034488984383642673, + "step": 1527 + }, + { + "epoch": 0.191, + "grad_norm": 3.0047361850738525, + "grad_norm_var": 0.4197832623445635, + "learning_rate": 0.0001, + "loss": 1.2854, + "loss/crossentropy": 2.883190631866455, + "loss/hidden": 1.0625, + "loss/logits": 0.1883969008922577, + "loss/reg": 0.003447153139859438, + "step": 1528 + }, + { + "epoch": 0.191125, + "grad_norm": 2.433673858642578, + "grad_norm_var": 0.4177730223981217, + "learning_rate": 0.0001, + "loss": 1.1853, + "loss/crossentropy": 2.390415668487549, + "loss/hidden": 0.9765625, + "loss/logits": 0.17427203059196472, + "loss/reg": 0.003445402719080448, + "step": 1529 + }, + { + "epoch": 0.19125, + "grad_norm": 3.7508089542388916, + "grad_norm_var": 0.23777977315326002, + "learning_rate": 0.0001, + "loss": 1.2249, + "loss/crossentropy": 2.995112895965576, + "loss/hidden": 1.015625, + "loss/logits": 0.17482982575893402, + "loss/reg": 0.003443735418841243, + "step": 1530 + }, + { + "epoch": 0.191375, + "grad_norm": 2.349748373031616, + "grad_norm_var": 0.22331083482360797, + "learning_rate": 0.0001, + "loss": 1.1455, + "loss/crossentropy": 2.5757083892822266, + "loss/hidden": 0.9453125, + "loss/logits": 0.16579627990722656, + "loss/reg": 0.00344208930619061, + "step": 1531 + }, + { + "epoch": 0.1915, + "grad_norm": 2.0953431129455566, + "grad_norm_var": 0.23771892232560557, + "learning_rate": 0.0001, + "loss": 1.0868, + "loss/crossentropy": 2.493220567703247, + "loss/hidden": 0.8828125, + "loss/logits": 0.169611394405365, + "loss/reg": 0.003440374741330743, + "step": 1532 + }, + { + "epoch": 0.191625, + "grad_norm": 2.2157857418060303, + "grad_norm_var": 0.24453549562240368, + "learning_rate": 0.0001, + "loss": 1.071, + "loss/crossentropy": 2.383723735809326, + "loss/hidden": 0.8984375, + "loss/logits": 0.1382179707288742, + "loss/reg": 0.0034387765917927027, + "step": 1533 + }, + { + "epoch": 0.19175, + "grad_norm": 2.3576011657714844, + "grad_norm_var": 0.23865884883084618, + "learning_rate": 0.0001, + "loss": 1.0066, + "loss/crossentropy": 2.542468547821045, + "loss/hidden": 0.828125, + "loss/logits": 0.14409103989601135, + "loss/reg": 0.0034371281508356333, + "step": 1534 + }, + { + "epoch": 0.191875, + "grad_norm": 3.1461169719696045, + "grad_norm_var": 0.2615933880776604, + "learning_rate": 0.0001, + "loss": 1.2554, + "loss/crossentropy": 2.378319025039673, + "loss/hidden": 1.046875, + "loss/logits": 0.17414087057113647, + "loss/reg": 0.0034355788957327604, + "step": 1535 + }, + { + "epoch": 0.192, + "grad_norm": 2.0135512351989746, + "grad_norm_var": 0.28038375054829506, + "learning_rate": 0.0001, + "loss": 1.083, + "loss/crossentropy": 1.9969367980957031, + "loss/hidden": 0.90625, + "loss/logits": 0.1424512416124344, + "loss/reg": 0.0034339565318077803, + "step": 1536 + }, + { + "epoch": 0.192125, + "grad_norm": 2.7598655223846436, + "grad_norm_var": 0.27581093075343593, + "learning_rate": 0.0001, + "loss": 1.2095, + "loss/crossentropy": 2.7056527137756348, + "loss/hidden": 1.0, + "loss/logits": 0.1751493215560913, + "loss/reg": 0.0034322130959481, + "step": 1537 + }, + { + "epoch": 0.19225, + "grad_norm": 2.116117000579834, + "grad_norm_var": 0.28808717203202694, + "learning_rate": 0.0001, + "loss": 1.0636, + "loss/crossentropy": 2.7393789291381836, + "loss/hidden": 0.87890625, + "loss/logits": 0.15042325854301453, + "loss/reg": 0.003430649871006608, + "step": 1538 + }, + { + "epoch": 0.192375, + "grad_norm": 3.571504592895508, + "grad_norm_var": 0.35138636847546134, + "learning_rate": 0.0001, + "loss": 1.3128, + "loss/crossentropy": 2.3102715015411377, + "loss/hidden": 1.0859375, + "loss/logits": 0.19261470437049866, + "loss/reg": 0.0034290915355086327, + "step": 1539 + }, + { + "epoch": 0.1925, + "grad_norm": 2.954730987548828, + "grad_norm_var": 0.34517124347645045, + "learning_rate": 0.0001, + "loss": 1.2081, + "loss/crossentropy": 2.3605659008026123, + "loss/hidden": 1.0078125, + "loss/logits": 0.16598157584667206, + "loss/reg": 0.003427294548600912, + "step": 1540 + }, + { + "epoch": 0.192625, + "grad_norm": 2.188232660293579, + "grad_norm_var": 0.3543400274390722, + "learning_rate": 0.0001, + "loss": 1.0359, + "loss/crossentropy": 2.4097092151641846, + "loss/hidden": 0.8515625, + "loss/logits": 0.15011939406394958, + "loss/reg": 0.0034254533238708973, + "step": 1541 + }, + { + "epoch": 0.19275, + "grad_norm": 1.8244503736495972, + "grad_norm_var": 0.37842932295744913, + "learning_rate": 0.0001, + "loss": 1.1172, + "loss/crossentropy": 2.4335646629333496, + "loss/hidden": 0.92578125, + "loss/logits": 0.15719163417816162, + "loss/reg": 0.003423537826165557, + "step": 1542 + }, + { + "epoch": 0.192875, + "grad_norm": 2.0547943115234375, + "grad_norm_var": 0.33619594757236554, + "learning_rate": 0.0001, + "loss": 1.1579, + "loss/crossentropy": 2.5234384536743164, + "loss/hidden": 0.9453125, + "loss/logits": 0.17836084961891174, + "loss/reg": 0.0034218020737171173, + "step": 1543 + }, + { + "epoch": 0.193, + "grad_norm": 2.1028945446014404, + "grad_norm_var": 0.33262686711846395, + "learning_rate": 0.0001, + "loss": 1.024, + "loss/crossentropy": 2.5168957710266113, + "loss/hidden": 0.86328125, + "loss/logits": 0.1265672892332077, + "loss/reg": 0.003420063992962241, + "step": 1544 + }, + { + "epoch": 0.193125, + "grad_norm": 3.1589317321777344, + "grad_norm_var": 0.3594795180238894, + "learning_rate": 0.0001, + "loss": 1.3999, + "loss/crossentropy": 2.19030499458313, + "loss/hidden": 1.171875, + "loss/logits": 0.19384470582008362, + "loss/reg": 0.0034181931987404823, + "step": 1545 + }, + { + "epoch": 0.19325, + "grad_norm": 2.3448944091796875, + "grad_norm_var": 0.2562841379896558, + "learning_rate": 0.0001, + "loss": 1.1032, + "loss/crossentropy": 2.6743974685668945, + "loss/hidden": 0.91015625, + "loss/logits": 0.15885460376739502, + "loss/reg": 0.003416434396058321, + "step": 1546 + }, + { + "epoch": 0.193375, + "grad_norm": 2.1164956092834473, + "grad_norm_var": 0.2629084863422197, + "learning_rate": 0.0001, + "loss": 1.1537, + "loss/crossentropy": 2.233337879180908, + "loss/hidden": 0.96875, + "loss/logits": 0.15075814723968506, + "loss/reg": 0.0034145053941756487, + "step": 1547 + }, + { + "epoch": 0.1935, + "grad_norm": 2.2009761333465576, + "grad_norm_var": 0.2587680482498602, + "learning_rate": 0.0001, + "loss": 1.066, + "loss/crossentropy": 2.364064931869507, + "loss/hidden": 0.87890625, + "loss/logits": 0.15300363302230835, + "loss/reg": 0.003412702353671193, + "step": 1548 + }, + { + "epoch": 0.193625, + "grad_norm": 2.375746250152588, + "grad_norm_var": 0.2554693062414391, + "learning_rate": 0.0001, + "loss": 1.1073, + "loss/crossentropy": 2.6182949542999268, + "loss/hidden": 0.92578125, + "loss/logits": 0.14743448793888092, + "loss/reg": 0.0034108073450624943, + "step": 1549 + }, + { + "epoch": 0.19375, + "grad_norm": 2.5327534675598145, + "grad_norm_var": 0.25510200809180733, + "learning_rate": 0.0001, + "loss": 1.0888, + "loss/crossentropy": 2.710033416748047, + "loss/hidden": 0.88671875, + "loss/logits": 0.16803640127182007, + "loss/reg": 0.0034089069813489914, + "step": 1550 + }, + { + "epoch": 0.193875, + "grad_norm": 2.195998430252075, + "grad_norm_var": 0.22541138413578582, + "learning_rate": 0.0001, + "loss": 1.1853, + "loss/crossentropy": 2.199061393737793, + "loss/hidden": 0.9765625, + "loss/logits": 0.17465950548648834, + "loss/reg": 0.003407144919037819, + "step": 1551 + }, + { + "epoch": 0.194, + "grad_norm": 2.297307014465332, + "grad_norm_var": 0.2155580849353057, + "learning_rate": 0.0001, + "loss": 0.9825, + "loss/crossentropy": 2.3787636756896973, + "loss/hidden": 0.82421875, + "loss/logits": 0.12426453083753586, + "loss/reg": 0.0034053786657750607, + "step": 1552 + }, + { + "epoch": 0.194125, + "grad_norm": 42.85431671142578, + "grad_norm_var": 102.47997721663579, + "learning_rate": 0.0001, + "loss": 1.0865, + "loss/crossentropy": 2.3751256465911865, + "loss/hidden": 0.8984375, + "loss/logits": 0.15405802428722382, + "loss/reg": 0.0034037018194794655, + "step": 1553 + }, + { + "epoch": 0.19425, + "grad_norm": 2.448805809020996, + "grad_norm_var": 102.36204705695503, + "learning_rate": 0.0001, + "loss": 1.1293, + "loss/crossentropy": 2.632035255432129, + "loss/hidden": 0.91796875, + "loss/logits": 0.17729425430297852, + "loss/reg": 0.0034019986633211374, + "step": 1554 + }, + { + "epoch": 0.194375, + "grad_norm": 2.3281965255737305, + "grad_norm_var": 102.68741632356571, + "learning_rate": 0.0001, + "loss": 1.1374, + "loss/crossentropy": 2.444570302963257, + "loss/hidden": 0.9375, + "loss/logits": 0.16586540639400482, + "loss/reg": 0.003400270827114582, + "step": 1555 + }, + { + "epoch": 0.1945, + "grad_norm": 2.293851852416992, + "grad_norm_var": 102.8838099010742, + "learning_rate": 0.0001, + "loss": 1.0756, + "loss/crossentropy": 2.545522928237915, + "loss/hidden": 0.875, + "loss/logits": 0.16658270359039307, + "loss/reg": 0.0033986270427703857, + "step": 1556 + }, + { + "epoch": 0.194625, + "grad_norm": 4.360113143920898, + "grad_norm_var": 102.41291327849744, + "learning_rate": 0.0001, + "loss": 1.2058, + "loss/crossentropy": 2.3416459560394287, + "loss/hidden": 1.0234375, + "loss/logits": 0.14838361740112305, + "loss/reg": 0.003396830288693309, + "step": 1557 + }, + { + "epoch": 0.19475, + "grad_norm": 2.38606858253479, + "grad_norm_var": 102.19721826513539, + "learning_rate": 0.0001, + "loss": 1.0499, + "loss/crossentropy": 2.7640974521636963, + "loss/hidden": 0.875, + "loss/logits": 0.1409429907798767, + "loss/reg": 0.0033951113000512123, + "step": 1558 + }, + { + "epoch": 0.194875, + "grad_norm": 2.7758493423461914, + "grad_norm_var": 101.94624591139775, + "learning_rate": 0.0001, + "loss": 1.0704, + "loss/crossentropy": 2.497638702392578, + "loss/hidden": 0.890625, + "loss/logits": 0.14583545923233032, + "loss/reg": 0.0033934745006263256, + "step": 1559 + }, + { + "epoch": 0.195, + "grad_norm": 3.479710340499878, + "grad_norm_var": 101.52401358472737, + "learning_rate": 0.0001, + "loss": 1.1734, + "loss/crossentropy": 2.457615375518799, + "loss/hidden": 0.9609375, + "loss/logits": 0.1785276234149933, + "loss/reg": 0.0033916765823960304, + "step": 1560 + }, + { + "epoch": 0.195125, + "grad_norm": 2.017589807510376, + "grad_norm_var": 101.90605089709193, + "learning_rate": 0.0001, + "loss": 1.1169, + "loss/crossentropy": 2.240902900695801, + "loss/hidden": 0.90625, + "loss/logits": 0.17671313881874084, + "loss/reg": 0.0033898656256496906, + "step": 1561 + }, + { + "epoch": 0.19525, + "grad_norm": 3.5527615547180176, + "grad_norm_var": 101.55947999989262, + "learning_rate": 0.0001, + "loss": 1.3431, + "loss/crossentropy": 1.965632677078247, + "loss/hidden": 1.1484375, + "loss/logits": 0.16078650951385498, + "loss/reg": 0.0033880271948873997, + "step": 1562 + }, + { + "epoch": 0.195375, + "grad_norm": 2.345229387283325, + "grad_norm_var": 101.47058431829664, + "learning_rate": 0.0001, + "loss": 1.0961, + "loss/crossentropy": 2.583942174911499, + "loss/hidden": 0.8984375, + "loss/logits": 0.1638316810131073, + "loss/reg": 0.003386161755770445, + "step": 1563 + }, + { + "epoch": 0.1955, + "grad_norm": 3.6912682056427, + "grad_norm_var": 101.02284512008372, + "learning_rate": 0.0001, + "loss": 1.1561, + "loss/crossentropy": 2.344486713409424, + "loss/hidden": 0.9140625, + "loss/logits": 0.2081700563430786, + "loss/reg": 0.0033844145946204662, + "step": 1564 + }, + { + "epoch": 0.195625, + "grad_norm": 2.0806732177734375, + "grad_norm_var": 101.14121040687311, + "learning_rate": 0.0001, + "loss": 1.1902, + "loss/crossentropy": 2.3066744804382324, + "loss/hidden": 0.9765625, + "loss/logits": 0.17980894446372986, + "loss/reg": 0.0033826008439064026, + "step": 1565 + }, + { + "epoch": 0.19575, + "grad_norm": 2.6736996173858643, + "grad_norm_var": 101.09180955446247, + "learning_rate": 0.0001, + "loss": 1.1123, + "loss/crossentropy": 2.6491916179656982, + "loss/hidden": 0.83984375, + "loss/logits": 0.23861975967884064, + "loss/reg": 0.0033807456493377686, + "step": 1566 + }, + { + "epoch": 0.195875, + "grad_norm": 2.320620059967041, + "grad_norm_var": 101.0422612381744, + "learning_rate": 0.0001, + "loss": 0.9997, + "loss/crossentropy": 2.432460069656372, + "loss/hidden": 0.83984375, + "loss/logits": 0.12610265612602234, + "loss/reg": 0.003379035508260131, + "step": 1567 + }, + { + "epoch": 0.196, + "grad_norm": 2.170931816101074, + "grad_norm_var": 101.09291343176476, + "learning_rate": 0.0001, + "loss": 1.0037, + "loss/crossentropy": 2.6432549953460693, + "loss/hidden": 0.83203125, + "loss/logits": 0.13785216212272644, + "loss/reg": 0.0033771616872400045, + "step": 1568 + }, + { + "epoch": 0.196125, + "grad_norm": 1.9352695941925049, + "grad_norm_var": 0.5014398496234489, + "learning_rate": 0.0001, + "loss": 1.0546, + "loss/crossentropy": 2.368238925933838, + "loss/hidden": 0.8671875, + "loss/logits": 0.15368467569351196, + "loss/reg": 0.0033752431627362967, + "step": 1569 + }, + { + "epoch": 0.19625, + "grad_norm": 1.7521263360977173, + "grad_norm_var": 0.5531383546467253, + "learning_rate": 0.0001, + "loss": 0.964, + "loss/crossentropy": 2.388716220855713, + "loss/hidden": 0.796875, + "loss/logits": 0.13340537250041962, + "loss/reg": 0.0033735185861587524, + "step": 1570 + }, + { + "epoch": 0.196375, + "grad_norm": 2.942868709564209, + "grad_norm_var": 0.5515874670900174, + "learning_rate": 0.0001, + "loss": 1.246, + "loss/crossentropy": 2.4898412227630615, + "loss/hidden": 1.03125, + "loss/logits": 0.18107619881629944, + "loss/reg": 0.0033718394115567207, + "step": 1571 + }, + { + "epoch": 0.1965, + "grad_norm": 2.828531503677368, + "grad_norm_var": 0.5423780354131977, + "learning_rate": 0.0001, + "loss": 1.1128, + "loss/crossentropy": 2.526690721511841, + "loss/hidden": 0.9140625, + "loss/logits": 0.1650082767009735, + "loss/reg": 0.0033701006323099136, + "step": 1572 + }, + { + "epoch": 0.196625, + "grad_norm": 1.8868728876113892, + "grad_norm_var": 0.3795729319831989, + "learning_rate": 0.0001, + "loss": 0.9528, + "loss/crossentropy": 2.4025442600250244, + "loss/hidden": 0.78125, + "loss/logits": 0.13787029683589935, + "loss/reg": 0.0033683953806757927, + "step": 1573 + }, + { + "epoch": 0.19675, + "grad_norm": 2.290252208709717, + "grad_norm_var": 0.38227303455985767, + "learning_rate": 0.0001, + "loss": 1.0692, + "loss/crossentropy": 2.349837064743042, + "loss/hidden": 0.87109375, + "loss/logits": 0.16445474326610565, + "loss/reg": 0.0033667993266135454, + "step": 1574 + }, + { + "epoch": 0.196875, + "grad_norm": 2.190880537033081, + "grad_norm_var": 0.385772762292572, + "learning_rate": 0.0001, + "loss": 0.9696, + "loss/crossentropy": 2.2366368770599365, + "loss/hidden": 0.796875, + "loss/logits": 0.13903136551380157, + "loss/reg": 0.0033651133999228477, + "step": 1575 + }, + { + "epoch": 0.197, + "grad_norm": 3.2513270378112793, + "grad_norm_var": 0.35950258294762044, + "learning_rate": 0.0001, + "loss": 1.2407, + "loss/crossentropy": 2.3932385444641113, + "loss/hidden": 1.03125, + "loss/logits": 0.1757686287164688, + "loss/reg": 0.003363401163369417, + "step": 1576 + }, + { + "epoch": 0.197125, + "grad_norm": 2.902137279510498, + "grad_norm_var": 0.35201813546933614, + "learning_rate": 0.0001, + "loss": 1.3339, + "loss/crossentropy": 2.5385305881500244, + "loss/hidden": 1.1171875, + "loss/logits": 0.18306049704551697, + "loss/reg": 0.0033617597073316574, + "step": 1577 + }, + { + "epoch": 0.19725, + "grad_norm": 2.4924919605255127, + "grad_norm_var": 0.2806556923123916, + "learning_rate": 0.0001, + "loss": 1.101, + "loss/crossentropy": 2.913592576980591, + "loss/hidden": 0.9140625, + "loss/logits": 0.15331600606441498, + "loss/reg": 0.003360015107318759, + "step": 1578 + }, + { + "epoch": 0.197375, + "grad_norm": 2.6192233562469482, + "grad_norm_var": 0.2802525663669532, + "learning_rate": 0.0001, + "loss": 1.2151, + "loss/crossentropy": 2.543704032897949, + "loss/hidden": 1.0234375, + "loss/logits": 0.1581249237060547, + "loss/reg": 0.003358310554176569, + "step": 1579 + }, + { + "epoch": 0.1975, + "grad_norm": 2.3134047985076904, + "grad_norm_var": 0.1803902922500375, + "learning_rate": 0.0001, + "loss": 1.217, + "loss/crossentropy": 2.3356986045837402, + "loss/hidden": 1.0234375, + "loss/logits": 0.15994945168495178, + "loss/reg": 0.003356639062985778, + "step": 1580 + }, + { + "epoch": 0.197625, + "grad_norm": 2.918966054916382, + "grad_norm_var": 0.18686370719447395, + "learning_rate": 0.0001, + "loss": 1.4755, + "loss/crossentropy": 2.0418221950531006, + "loss/hidden": 1.2109375, + "loss/logits": 0.2309650182723999, + "loss/reg": 0.00335493846796453, + "step": 1581 + }, + { + "epoch": 0.19775, + "grad_norm": 2.1771440505981445, + "grad_norm_var": 0.1886619692371113, + "learning_rate": 0.0001, + "loss": 0.9935, + "loss/crossentropy": 2.502734661102295, + "loss/hidden": 0.82421875, + "loss/logits": 0.1357189416885376, + "loss/reg": 0.0033531710505485535, + "step": 1582 + }, + { + "epoch": 0.197875, + "grad_norm": 2.2884721755981445, + "grad_norm_var": 0.18922569213149815, + "learning_rate": 0.0001, + "loss": 0.9984, + "loss/crossentropy": 2.4832651615142822, + "loss/hidden": 0.8125, + "loss/logits": 0.15235686302185059, + "loss/reg": 0.00335147837176919, + "step": 1583 + }, + { + "epoch": 0.198, + "grad_norm": 2.547328472137451, + "grad_norm_var": 0.18482493667708866, + "learning_rate": 0.0001, + "loss": 1.1119, + "loss/crossentropy": 2.575803279876709, + "loss/hidden": 0.92578125, + "loss/logits": 0.15261293947696686, + "loss/reg": 0.003349804785102606, + "step": 1584 + }, + { + "epoch": 0.198125, + "grad_norm": 3.402043581008911, + "grad_norm_var": 0.21694510449547597, + "learning_rate": 0.0001, + "loss": 1.3013, + "loss/crossentropy": 2.0987796783447266, + "loss/hidden": 1.09375, + "loss/logits": 0.17406561970710754, + "loss/reg": 0.0033480448182672262, + "step": 1585 + }, + { + "epoch": 0.19825, + "grad_norm": 2.545072317123413, + "grad_norm_var": 0.17185981683361665, + "learning_rate": 0.0001, + "loss": 1.1464, + "loss/crossentropy": 2.5816264152526855, + "loss/hidden": 0.9375, + "loss/logits": 0.17544244229793549, + "loss/reg": 0.003346419893205166, + "step": 1586 + }, + { + "epoch": 0.198375, + "grad_norm": 3.0395045280456543, + "grad_norm_var": 0.17686366063397868, + "learning_rate": 0.0001, + "loss": 1.248, + "loss/crossentropy": 2.302675247192383, + "loss/hidden": 1.015625, + "loss/logits": 0.19897626340389252, + "loss/reg": 0.0033445856533944607, + "step": 1587 + }, + { + "epoch": 0.1985, + "grad_norm": 2.4645659923553467, + "grad_norm_var": 0.17433679981741357, + "learning_rate": 0.0001, + "loss": 1.1567, + "loss/crossentropy": 2.4756343364715576, + "loss/hidden": 0.95703125, + "loss/logits": 0.16626408696174622, + "loss/reg": 0.0033427351154386997, + "step": 1588 + }, + { + "epoch": 0.198625, + "grad_norm": 2.810246706008911, + "grad_norm_var": 0.14190777744166377, + "learning_rate": 0.0001, + "loss": 1.1902, + "loss/crossentropy": 2.5803637504577637, + "loss/hidden": 0.9765625, + "loss/logits": 0.18018130958080292, + "loss/reg": 0.0033410657197237015, + "step": 1589 + }, + { + "epoch": 0.19875, + "grad_norm": 2.4339115619659424, + "grad_norm_var": 0.13648274466220206, + "learning_rate": 0.0001, + "loss": 1.028, + "loss/crossentropy": 2.399829149246216, + "loss/hidden": 0.828125, + "loss/logits": 0.16653020679950714, + "loss/reg": 0.0033393700141459703, + "step": 1590 + }, + { + "epoch": 0.198875, + "grad_norm": 3.239158868789673, + "grad_norm_var": 0.141020529033392, + "learning_rate": 0.0001, + "loss": 1.1675, + "loss/crossentropy": 2.2257063388824463, + "loss/hidden": 0.9609375, + "loss/logits": 0.17314554750919342, + "loss/reg": 0.0033375280909240246, + "step": 1591 + }, + { + "epoch": 0.199, + "grad_norm": 2.437912940979004, + "grad_norm_var": 0.12423960548648268, + "learning_rate": 0.0001, + "loss": 0.922, + "loss/crossentropy": 2.4248645305633545, + "loss/hidden": 0.765625, + "loss/logits": 0.12298044562339783, + "loss/reg": 0.00333569198846817, + "step": 1592 + }, + { + "epoch": 0.199125, + "grad_norm": 2.8941097259521484, + "grad_norm_var": 0.12398925250324358, + "learning_rate": 0.0001, + "loss": 1.1451, + "loss/crossentropy": 2.6563851833343506, + "loss/hidden": 0.93359375, + "loss/logits": 0.17815393209457397, + "loss/reg": 0.0033339851070195436, + "step": 1593 + }, + { + "epoch": 0.19925, + "grad_norm": 2.803880214691162, + "grad_norm_var": 0.12292982191307994, + "learning_rate": 0.0001, + "loss": 1.1357, + "loss/crossentropy": 2.2759344577789307, + "loss/hidden": 0.9609375, + "loss/logits": 0.141413152217865, + "loss/reg": 0.003332150634378195, + "step": 1594 + }, + { + "epoch": 0.199375, + "grad_norm": 2.237532138824463, + "grad_norm_var": 0.13530315628679926, + "learning_rate": 0.0001, + "loss": 1.2666, + "loss/crossentropy": 2.077180862426758, + "loss/hidden": 1.0625, + "loss/logits": 0.1708061397075653, + "loss/reg": 0.003330171573907137, + "step": 1595 + }, + { + "epoch": 0.1995, + "grad_norm": 1.8717583417892456, + "grad_norm_var": 0.16787872576412224, + "learning_rate": 0.0001, + "loss": 0.933, + "loss/crossentropy": 2.5325214862823486, + "loss/hidden": 0.7734375, + "loss/logits": 0.12631843984127045, + "loss/reg": 0.0033281673677265644, + "step": 1596 + }, + { + "epoch": 0.199625, + "grad_norm": 2.3740711212158203, + "grad_norm_var": 0.16558500323165828, + "learning_rate": 0.0001, + "loss": 0.9823, + "loss/crossentropy": 2.456437349319458, + "loss/hidden": 0.81640625, + "loss/logits": 0.13264372944831848, + "loss/reg": 0.003326438134536147, + "step": 1597 + }, + { + "epoch": 0.19975, + "grad_norm": 2.2797210216522217, + "grad_norm_var": 0.16048771364269968, + "learning_rate": 0.0001, + "loss": 1.1277, + "loss/crossentropy": 2.478566884994507, + "loss/hidden": 0.92578125, + "loss/logits": 0.16867585480213165, + "loss/reg": 0.0033244146034121513, + "step": 1598 + }, + { + "epoch": 0.199875, + "grad_norm": 2.273390293121338, + "grad_norm_var": 0.1611370953897993, + "learning_rate": 0.0001, + "loss": 1.0944, + "loss/crossentropy": 2.3444995880126953, + "loss/hidden": 0.91015625, + "loss/logits": 0.15098696947097778, + "loss/reg": 0.0033223910722881556, + "step": 1599 + }, + { + "epoch": 0.2, + "grad_norm": 2.5064868927001953, + "grad_norm_var": 0.1615466221150351, + "learning_rate": 0.0001, + "loss": 1.1207, + "loss/crossentropy": 2.6761395931243896, + "loss/hidden": 0.91796875, + "loss/logits": 0.16954849660396576, + "loss/reg": 0.003320206655189395, + "step": 1600 + }, + { + "epoch": 0.200125, + "grad_norm": 2.1371653079986572, + "grad_norm_var": 0.12641732646446915, + "learning_rate": 0.0001, + "loss": 1.1086, + "loss/crossentropy": 2.167865037918091, + "loss/hidden": 0.921875, + "loss/logits": 0.15358451008796692, + "loss/reg": 0.0033184492494910955, + "step": 1601 + }, + { + "epoch": 0.20025, + "grad_norm": 2.7362449169158936, + "grad_norm_var": 0.12929521265355667, + "learning_rate": 0.0001, + "loss": 1.1727, + "loss/crossentropy": 2.7397940158843994, + "loss/hidden": 0.9609375, + "loss/logits": 0.17860586941242218, + "loss/reg": 0.0033167051151394844, + "step": 1602 + }, + { + "epoch": 0.200375, + "grad_norm": 2.124201536178589, + "grad_norm_var": 0.11993136224217782, + "learning_rate": 0.0001, + "loss": 1.0476, + "loss/crossentropy": 2.3766019344329834, + "loss/hidden": 0.875, + "loss/logits": 0.13947069644927979, + "loss/reg": 0.003314658999443054, + "step": 1603 + }, + { + "epoch": 0.2005, + "grad_norm": 2.1093101501464844, + "grad_norm_var": 0.12838562389595007, + "learning_rate": 0.0001, + "loss": 1.0742, + "loss/crossentropy": 2.491436004638672, + "loss/hidden": 0.87109375, + "loss/logits": 0.16998031735420227, + "loss/reg": 0.0033127006608992815, + "step": 1604 + }, + { + "epoch": 0.200625, + "grad_norm": 2.2037789821624756, + "grad_norm_var": 0.12259215079400192, + "learning_rate": 0.0001, + "loss": 1.1443, + "loss/crossentropy": 2.429650068283081, + "loss/hidden": 0.93359375, + "loss/logits": 0.1775752156972885, + "loss/reg": 0.0033109041396528482, + "step": 1605 + }, + { + "epoch": 0.20075, + "grad_norm": 2.0681684017181396, + "grad_norm_var": 0.13009940320814947, + "learning_rate": 0.0001, + "loss": 0.9251, + "loss/crossentropy": 2.5921154022216797, + "loss/hidden": 0.7734375, + "loss/logits": 0.11855532228946686, + "loss/reg": 0.0033087998162955046, + "step": 1606 + }, + { + "epoch": 0.200875, + "grad_norm": 2.6388769149780273, + "grad_norm_var": 0.08494051001129363, + "learning_rate": 0.0001, + "loss": 1.0706, + "loss/crossentropy": 2.5055747032165527, + "loss/hidden": 0.89453125, + "loss/logits": 0.14303961396217346, + "loss/reg": 0.0033070738427340984, + "step": 1607 + }, + { + "epoch": 0.201, + "grad_norm": 2.181697368621826, + "grad_norm_var": 0.08624639517303852, + "learning_rate": 0.0001, + "loss": 1.0201, + "loss/crossentropy": 2.4525694847106934, + "loss/hidden": 0.84375, + "loss/logits": 0.14332106709480286, + "loss/reg": 0.003305203514173627, + "step": 1608 + }, + { + "epoch": 0.201125, + "grad_norm": 2.6697564125061035, + "grad_norm_var": 0.07281751738566834, + "learning_rate": 0.0001, + "loss": 1.0102, + "loss/crossentropy": 2.7797605991363525, + "loss/hidden": 0.828125, + "loss/logits": 0.14901109039783478, + "loss/reg": 0.003303457982838154, + "step": 1609 + }, + { + "epoch": 0.20125, + "grad_norm": 2.582929849624634, + "grad_norm_var": 0.06179040816687683, + "learning_rate": 0.0001, + "loss": 1.0996, + "loss/crossentropy": 2.530728578567505, + "loss/hidden": 0.91015625, + "loss/logits": 0.1564064621925354, + "loss/reg": 0.0033016535453498363, + "step": 1610 + }, + { + "epoch": 0.201375, + "grad_norm": 1.8796063661575317, + "grad_norm_var": 0.07336041461658145, + "learning_rate": 0.0001, + "loss": 1.0591, + "loss/crossentropy": 2.5042145252227783, + "loss/hidden": 0.87890625, + "loss/logits": 0.14722013473510742, + "loss/reg": 0.0032997329253703356, + "step": 1611 + }, + { + "epoch": 0.2015, + "grad_norm": 3.313866138458252, + "grad_norm_var": 0.1229542381526608, + "learning_rate": 0.0001, + "loss": 1.2483, + "loss/crossentropy": 2.173633575439453, + "loss/hidden": 1.0390625, + "loss/logits": 0.17622298002243042, + "loss/reg": 0.00329802418127656, + "step": 1612 + }, + { + "epoch": 0.201625, + "grad_norm": 2.6430091857910156, + "grad_norm_var": 0.12726375044356592, + "learning_rate": 0.0001, + "loss": 1.0964, + "loss/crossentropy": 2.1356022357940674, + "loss/hidden": 0.91015625, + "loss/logits": 0.15332993865013123, + "loss/reg": 0.0032957610674202442, + "step": 1613 + }, + { + "epoch": 0.20175, + "grad_norm": 2.283130645751953, + "grad_norm_var": 0.12721126777018643, + "learning_rate": 0.0001, + "loss": 1.1047, + "loss/crossentropy": 2.6446938514709473, + "loss/hidden": 0.9140625, + "loss/logits": 0.15773829817771912, + "loss/reg": 0.003294040448963642, + "step": 1614 + }, + { + "epoch": 0.201875, + "grad_norm": 2.009594678878784, + "grad_norm_var": 0.1359073820378919, + "learning_rate": 0.0001, + "loss": 1.3146, + "loss/crossentropy": 2.353555679321289, + "loss/hidden": 1.0859375, + "loss/logits": 0.1957610696554184, + "loss/reg": 0.0032923046965152025, + "step": 1615 + }, + { + "epoch": 0.202, + "grad_norm": 2.2532074451446533, + "grad_norm_var": 0.1356617628627058, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.421433210372925, + "loss/hidden": 0.890625, + "loss/logits": 0.139800027012825, + "loss/reg": 0.0032904213294386864, + "step": 1616 + }, + { + "epoch": 0.202125, + "grad_norm": 2.2480552196502686, + "grad_norm_var": 0.1330667309785141, + "learning_rate": 0.0001, + "loss": 1.1926, + "loss/crossentropy": 2.4565775394439697, + "loss/hidden": 0.99609375, + "loss/logits": 0.16361382603645325, + "loss/reg": 0.0032883703242987394, + "step": 1617 + }, + { + "epoch": 0.20225, + "grad_norm": 4.123042106628418, + "grad_norm_var": 0.3206941892301216, + "learning_rate": 0.0001, + "loss": 1.4135, + "loss/crossentropy": 2.3067095279693604, + "loss/hidden": 1.140625, + "loss/logits": 0.2400142401456833, + "loss/reg": 0.0032866299152374268, + "step": 1618 + }, + { + "epoch": 0.202375, + "grad_norm": 2.0010764598846436, + "grad_norm_var": 0.3271258788637292, + "learning_rate": 0.0001, + "loss": 1.0215, + "loss/crossentropy": 2.4679181575775146, + "loss/hidden": 0.83984375, + "loss/logits": 0.14884260296821594, + "loss/reg": 0.0032845879904925823, + "step": 1619 + }, + { + "epoch": 0.2025, + "grad_norm": 2.1155405044555664, + "grad_norm_var": 0.3268448163523752, + "learning_rate": 0.0001, + "loss": 1.1275, + "loss/crossentropy": 2.7230353355407715, + "loss/hidden": 0.9375, + "loss/logits": 0.15715520083904266, + "loss/reg": 0.0032828382682055235, + "step": 1620 + }, + { + "epoch": 0.202625, + "grad_norm": 2.0337374210357666, + "grad_norm_var": 0.3342560560773141, + "learning_rate": 0.0001, + "loss": 1.0079, + "loss/crossentropy": 2.726454734802246, + "loss/hidden": 0.8359375, + "loss/logits": 0.13911572098731995, + "loss/reg": 0.003280794247984886, + "step": 1621 + }, + { + "epoch": 0.20275, + "grad_norm": 2.493293285369873, + "grad_norm_var": 0.32445634627695763, + "learning_rate": 0.0001, + "loss": 1.3205, + "loss/crossentropy": 2.4674136638641357, + "loss/hidden": 1.1015625, + "loss/logits": 0.1861811727285385, + "loss/reg": 0.003278720658272505, + "step": 1622 + }, + { + "epoch": 0.202875, + "grad_norm": 2.0618650913238525, + "grad_norm_var": 0.3320343293044615, + "learning_rate": 0.0001, + "loss": 1.0074, + "loss/crossentropy": 2.5865259170532227, + "loss/hidden": 0.83984375, + "loss/logits": 0.13479462265968323, + "loss/reg": 0.0032766172662377357, + "step": 1623 + }, + { + "epoch": 0.203, + "grad_norm": 4.24573278427124, + "grad_norm_var": 0.5297347853177716, + "learning_rate": 0.0001, + "loss": 1.0286, + "loss/crossentropy": 2.7966721057891846, + "loss/hidden": 0.84765625, + "loss/logits": 0.1481596827507019, + "loss/reg": 0.0032749150414019823, + "step": 1624 + }, + { + "epoch": 0.203125, + "grad_norm": 4.453259468078613, + "grad_norm_var": 0.7546780963902354, + "learning_rate": 0.0001, + "loss": 1.4379, + "loss/crossentropy": 2.324061155319214, + "loss/hidden": 1.1875, + "loss/logits": 0.21765878796577454, + "loss/reg": 0.003273224225267768, + "step": 1625 + }, + { + "epoch": 0.20325, + "grad_norm": 2.1763815879821777, + "grad_norm_var": 0.7697989170952411, + "learning_rate": 0.0001, + "loss": 1.2117, + "loss/crossentropy": 2.2315030097961426, + "loss/hidden": 1.0234375, + "loss/logits": 0.1554989069700241, + "loss/reg": 0.0032715124543756247, + "step": 1626 + }, + { + "epoch": 0.203375, + "grad_norm": 2.147721529006958, + "grad_norm_var": 0.7468977871556014, + "learning_rate": 0.0001, + "loss": 1.1699, + "loss/crossentropy": 2.5481529235839844, + "loss/hidden": 0.953125, + "loss/logits": 0.18407508730888367, + "loss/reg": 0.0032695841509848833, + "step": 1627 + }, + { + "epoch": 0.2035, + "grad_norm": 2.1995062828063965, + "grad_norm_var": 0.727752660020807, + "learning_rate": 0.0001, + "loss": 1.0413, + "loss/crossentropy": 2.62490177154541, + "loss/hidden": 0.86328125, + "loss/logits": 0.14532649517059326, + "loss/reg": 0.0032678483985364437, + "step": 1628 + }, + { + "epoch": 0.203625, + "grad_norm": 2.6025071144104004, + "grad_norm_var": 0.7275851745925003, + "learning_rate": 0.0001, + "loss": 1.3071, + "loss/crossentropy": 2.261453628540039, + "loss/hidden": 1.0703125, + "loss/logits": 0.2041451334953308, + "loss/reg": 0.0032659387215971947, + "step": 1629 + }, + { + "epoch": 0.20375, + "grad_norm": 2.3895230293273926, + "grad_norm_var": 0.7239327077368195, + "learning_rate": 0.0001, + "loss": 1.1771, + "loss/crossentropy": 2.437429904937744, + "loss/hidden": 0.95703125, + "loss/logits": 0.1874021738767624, + "loss/reg": 0.003264203667640686, + "step": 1630 + }, + { + "epoch": 0.203875, + "grad_norm": 2.5536670684814453, + "grad_norm_var": 0.6998122275898206, + "learning_rate": 0.0001, + "loss": 1.2855, + "loss/crossentropy": 2.308814287185669, + "loss/hidden": 1.078125, + "loss/logits": 0.17473715543746948, + "loss/reg": 0.003262232756242156, + "step": 1631 + }, + { + "epoch": 0.204, + "grad_norm": 1.893472671508789, + "grad_norm_var": 0.7260273238761611, + "learning_rate": 0.0001, + "loss": 1.0432, + "loss/crossentropy": 2.4681177139282227, + "loss/hidden": 0.859375, + "loss/logits": 0.15119843184947968, + "loss/reg": 0.003260491183027625, + "step": 1632 + }, + { + "epoch": 0.204125, + "grad_norm": 2.6406593322753906, + "grad_norm_var": 0.7167848758234858, + "learning_rate": 0.0001, + "loss": 0.9315, + "loss/crossentropy": 2.692917585372925, + "loss/hidden": 0.76953125, + "loss/logits": 0.12942397594451904, + "loss/reg": 0.003258763812482357, + "step": 1633 + }, + { + "epoch": 0.20425, + "grad_norm": 2.6017487049102783, + "grad_norm_var": 0.5592297482073356, + "learning_rate": 0.0001, + "loss": 1.4767, + "loss/crossentropy": 1.8698807954788208, + "loss/hidden": 1.2109375, + "loss/logits": 0.2331976294517517, + "loss/reg": 0.0032570629846304655, + "step": 1634 + }, + { + "epoch": 0.204375, + "grad_norm": 2.134735107421875, + "grad_norm_var": 0.5507758063156113, + "learning_rate": 0.0001, + "loss": 0.9424, + "loss/crossentropy": 2.657256841659546, + "loss/hidden": 0.78515625, + "loss/logits": 0.1246921718120575, + "loss/reg": 0.003255224786698818, + "step": 1635 + }, + { + "epoch": 0.2045, + "grad_norm": 2.775542736053467, + "grad_norm_var": 0.5400799961918049, + "learning_rate": 0.0001, + "loss": 1.1952, + "loss/crossentropy": 2.3927488327026367, + "loss/hidden": 0.98046875, + "loss/logits": 0.18224084377288818, + "loss/reg": 0.003253570292145014, + "step": 1636 + }, + { + "epoch": 0.204625, + "grad_norm": 3.4010424613952637, + "grad_norm_var": 0.5559319990050954, + "learning_rate": 0.0001, + "loss": 0.9753, + "loss/crossentropy": 2.674126386642456, + "loss/hidden": 0.80859375, + "loss/logits": 0.1341724693775177, + "loss/reg": 0.0032518133521080017, + "step": 1637 + }, + { + "epoch": 0.20475, + "grad_norm": 2.3625357151031494, + "grad_norm_var": 0.5601365603978583, + "learning_rate": 0.0001, + "loss": 1.1191, + "loss/crossentropy": 2.00201416015625, + "loss/hidden": 0.93359375, + "loss/logits": 0.15301145613193512, + "loss/reg": 0.003249979577958584, + "step": 1638 + }, + { + "epoch": 0.204875, + "grad_norm": 2.233456611633301, + "grad_norm_var": 0.5481778857364833, + "learning_rate": 0.0001, + "loss": 1.0923, + "loss/crossentropy": 2.260629892349243, + "loss/hidden": 0.9140625, + "loss/logits": 0.14572405815124512, + "loss/reg": 0.003248338820412755, + "step": 1639 + }, + { + "epoch": 0.205, + "grad_norm": 2.4094839096069336, + "grad_norm_var": 0.37452435323996436, + "learning_rate": 0.0001, + "loss": 1.0426, + "loss/crossentropy": 2.3188180923461914, + "loss/hidden": 0.8515625, + "loss/logits": 0.15855032205581665, + "loss/reg": 0.0032465672120451927, + "step": 1640 + }, + { + "epoch": 0.205125, + "grad_norm": 2.5767383575439453, + "grad_norm_var": 0.12114709294457929, + "learning_rate": 0.0001, + "loss": 1.107, + "loss/crossentropy": 2.5933563709259033, + "loss/hidden": 0.92578125, + "loss/logits": 0.14877736568450928, + "loss/reg": 0.003244933672249317, + "step": 1641 + }, + { + "epoch": 0.20525, + "grad_norm": 2.55334734916687, + "grad_norm_var": 0.11659405774919757, + "learning_rate": 0.0001, + "loss": 1.4642, + "loss/crossentropy": 2.26149582862854, + "loss/hidden": 1.1953125, + "loss/logits": 0.23644839227199554, + "loss/reg": 0.0032434100285172462, + "step": 1642 + }, + { + "epoch": 0.205375, + "grad_norm": 2.3793838024139404, + "grad_norm_var": 0.11007918089816542, + "learning_rate": 0.0001, + "loss": 1.0786, + "loss/crossentropy": 2.1200153827667236, + "loss/hidden": 0.89453125, + "loss/logits": 0.15168824791908264, + "loss/reg": 0.0032420321367681026, + "step": 1643 + }, + { + "epoch": 0.2055, + "grad_norm": 2.159534215927124, + "grad_norm_var": 0.11168307348258182, + "learning_rate": 0.0001, + "loss": 1.2839, + "loss/crossentropy": 2.5448434352874756, + "loss/hidden": 1.078125, + "loss/logits": 0.17339983582496643, + "loss/reg": 0.003240725724026561, + "step": 1644 + }, + { + "epoch": 0.205625, + "grad_norm": 2.512781858444214, + "grad_norm_var": 0.1107112022419983, + "learning_rate": 0.0001, + "loss": 1.0038, + "loss/crossentropy": 2.5887203216552734, + "loss/hidden": 0.83984375, + "loss/logits": 0.1315690279006958, + "loss/reg": 0.0032390966080129147, + "step": 1645 + }, + { + "epoch": 0.20575, + "grad_norm": 2.471038818359375, + "grad_norm_var": 0.11021265436342276, + "learning_rate": 0.0001, + "loss": 1.0674, + "loss/crossentropy": 2.4827725887298584, + "loss/hidden": 0.8984375, + "loss/logits": 0.1366003006696701, + "loss/reg": 0.003237416036427021, + "step": 1646 + }, + { + "epoch": 0.205875, + "grad_norm": 2.5491740703582764, + "grad_norm_var": 0.11016900462870065, + "learning_rate": 0.0001, + "loss": 0.9898, + "loss/crossentropy": 2.808387279510498, + "loss/hidden": 0.8203125, + "loss/logits": 0.1371297538280487, + "loss/reg": 0.0032356702722609043, + "step": 1647 + }, + { + "epoch": 0.206, + "grad_norm": 2.476713180541992, + "grad_norm_var": 0.08594114936163895, + "learning_rate": 0.0001, + "loss": 1.1398, + "loss/crossentropy": 2.4765665531158447, + "loss/hidden": 0.9375, + "loss/logits": 0.1700000762939453, + "loss/reg": 0.003234060015529394, + "step": 1648 + }, + { + "epoch": 0.206125, + "grad_norm": 2.1351890563964844, + "grad_norm_var": 0.09343219350858452, + "learning_rate": 0.0001, + "loss": 1.1968, + "loss/crossentropy": 2.2499215602874756, + "loss/hidden": 0.98828125, + "loss/logits": 0.176192969083786, + "loss/reg": 0.0032324332278221846, + "step": 1649 + }, + { + "epoch": 0.20625, + "grad_norm": 2.1201257705688477, + "grad_norm_var": 0.10032196484461338, + "learning_rate": 0.0001, + "loss": 1.0841, + "loss/crossentropy": 2.6280465126037598, + "loss/hidden": 0.890625, + "loss/logits": 0.16118907928466797, + "loss/reg": 0.003230888629332185, + "step": 1650 + }, + { + "epoch": 0.206375, + "grad_norm": 2.829747200012207, + "grad_norm_var": 0.10100266775164073, + "learning_rate": 0.0001, + "loss": 1.0978, + "loss/crossentropy": 2.610457420349121, + "loss/hidden": 0.90234375, + "loss/logits": 0.16314734518527985, + "loss/reg": 0.003229183377698064, + "step": 1651 + }, + { + "epoch": 0.2065, + "grad_norm": 2.5310165882110596, + "grad_norm_var": 0.09564570596240832, + "learning_rate": 0.0001, + "loss": 0.9424, + "loss/crossentropy": 2.548105478286743, + "loss/hidden": 0.7890625, + "loss/logits": 0.1211063414812088, + "loss/reg": 0.0032274452969431877, + "step": 1652 + }, + { + "epoch": 0.206625, + "grad_norm": 2.257979393005371, + "grad_norm_var": 0.037136142432717394, + "learning_rate": 0.0001, + "loss": 1.1971, + "loss/crossentropy": 2.516902446746826, + "loss/hidden": 0.97265625, + "loss/logits": 0.19221451878547668, + "loss/reg": 0.003225695574656129, + "step": 1653 + }, + { + "epoch": 0.20675, + "grad_norm": 2.3023855686187744, + "grad_norm_var": 0.03774205518613461, + "learning_rate": 0.0001, + "loss": 0.9739, + "loss/crossentropy": 2.71262788772583, + "loss/hidden": 0.80859375, + "loss/logits": 0.1330820769071579, + "loss/reg": 0.003224144922569394, + "step": 1654 + }, + { + "epoch": 0.206875, + "grad_norm": 2.521340847015381, + "grad_norm_var": 0.0362938578599632, + "learning_rate": 0.0001, + "loss": 1.098, + "loss/crossentropy": 2.643906593322754, + "loss/hidden": 0.90625, + "loss/logits": 0.15947577357292175, + "loss/reg": 0.003222482278943062, + "step": 1655 + }, + { + "epoch": 0.207, + "grad_norm": 5.88134765625, + "grad_norm_var": 0.7828817213139186, + "learning_rate": 0.0001, + "loss": 1.4211, + "loss/crossentropy": 2.313748836517334, + "loss/hidden": 1.2109375, + "loss/logits": 0.17793220281600952, + "loss/reg": 0.0032208384945988655, + "step": 1656 + }, + { + "epoch": 0.207125, + "grad_norm": 2.5916645526885986, + "grad_norm_var": 0.7827675255288795, + "learning_rate": 0.0001, + "loss": 1.2521, + "loss/crossentropy": 2.4316515922546387, + "loss/hidden": 1.03125, + "loss/logits": 0.18862737715244293, + "loss/reg": 0.0032191697973757982, + "step": 1657 + }, + { + "epoch": 0.20725, + "grad_norm": 2.2206692695617676, + "grad_norm_var": 0.7936192015383082, + "learning_rate": 0.0001, + "loss": 1.0245, + "loss/crossentropy": 2.6382174491882324, + "loss/hidden": 0.84765625, + "loss/logits": 0.14466163516044617, + "loss/reg": 0.00321741821244359, + "step": 1658 + }, + { + "epoch": 0.207375, + "grad_norm": 2.8307604789733887, + "grad_norm_var": 0.7917962945035991, + "learning_rate": 0.0001, + "loss": 1.4057, + "loss/crossentropy": 2.4240641593933105, + "loss/hidden": 1.15625, + "loss/logits": 0.21726641058921814, + "loss/reg": 0.0032156051602214575, + "step": 1659 + }, + { + "epoch": 0.2075, + "grad_norm": 2.049522638320923, + "grad_norm_var": 0.7997391376511624, + "learning_rate": 0.0001, + "loss": 0.9971, + "loss/crossentropy": 2.6553359031677246, + "loss/hidden": 0.83203125, + "loss/logits": 0.1329321563243866, + "loss/reg": 0.003213758347555995, + "step": 1660 + }, + { + "epoch": 0.207625, + "grad_norm": 3.276784658432007, + "grad_norm_var": 0.8229971260041339, + "learning_rate": 0.0001, + "loss": 1.44, + "loss/crossentropy": 2.5629796981811523, + "loss/hidden": 1.171875, + "loss/logits": 0.23601150512695312, + "loss/reg": 0.0032118717208504677, + "step": 1661 + }, + { + "epoch": 0.20775, + "grad_norm": 3.7711448669433594, + "grad_norm_var": 0.8906238399602217, + "learning_rate": 0.0001, + "loss": 1.4247, + "loss/crossentropy": 2.474085569381714, + "loss/hidden": 1.1875, + "loss/logits": 0.20507219433784485, + "loss/reg": 0.0032101524993777275, + "step": 1662 + }, + { + "epoch": 0.207875, + "grad_norm": 2.741032361984253, + "grad_norm_var": 0.887234593717244, + "learning_rate": 0.0001, + "loss": 1.1324, + "loss/crossentropy": 2.6519734859466553, + "loss/hidden": 0.9296875, + "loss/logits": 0.17065690457820892, + "loss/reg": 0.0032084728591144085, + "step": 1663 + }, + { + "epoch": 0.208, + "grad_norm": 3.385438919067383, + "grad_norm_var": 0.9016638698725956, + "learning_rate": 0.0001, + "loss": 1.4632, + "loss/crossentropy": 2.8452768325805664, + "loss/hidden": 1.21875, + "loss/logits": 0.2123820185661316, + "loss/reg": 0.0032068106811493635, + "step": 1664 + }, + { + "epoch": 0.208125, + "grad_norm": 2.4284839630126953, + "grad_norm_var": 0.8794628798393888, + "learning_rate": 0.0001, + "loss": 1.1069, + "loss/crossentropy": 2.3913896083831787, + "loss/hidden": 0.91796875, + "loss/logits": 0.15689219534397125, + "loss/reg": 0.0032050481531769037, + "step": 1665 + }, + { + "epoch": 0.20825, + "grad_norm": 1.9970413446426392, + "grad_norm_var": 0.8925309231944419, + "learning_rate": 0.0001, + "loss": 1.0111, + "loss/crossentropy": 2.1553092002868652, + "loss/hidden": 0.8359375, + "loss/logits": 0.14312410354614258, + "loss/reg": 0.003203297033905983, + "step": 1666 + }, + { + "epoch": 0.208375, + "grad_norm": 2.629917621612549, + "grad_norm_var": 0.8955935228773692, + "learning_rate": 0.0001, + "loss": 1.0293, + "loss/crossentropy": 2.4769906997680664, + "loss/hidden": 0.828125, + "loss/logits": 0.16919545829296112, + "loss/reg": 0.003201601095497608, + "step": 1667 + }, + { + "epoch": 0.2085, + "grad_norm": 2.4040727615356445, + "grad_norm_var": 0.9018056713863368, + "learning_rate": 0.0001, + "loss": 0.9916, + "loss/crossentropy": 2.4735305309295654, + "loss/hidden": 0.8046875, + "loss/logits": 0.15492868423461914, + "loss/reg": 0.003199809929355979, + "step": 1668 + }, + { + "epoch": 0.208625, + "grad_norm": 2.666597843170166, + "grad_norm_var": 0.8810435015232821, + "learning_rate": 0.0001, + "loss": 0.9514, + "loss/crossentropy": 2.219966173171997, + "loss/hidden": 0.78515625, + "loss/logits": 0.13430990278720856, + "loss/reg": 0.003198012476786971, + "step": 1669 + }, + { + "epoch": 0.20875, + "grad_norm": 2.6586005687713623, + "grad_norm_var": 0.8626734234562614, + "learning_rate": 0.0001, + "loss": 1.162, + "loss/crossentropy": 2.411811351776123, + "loss/hidden": 0.9765625, + "loss/logits": 0.15343676507472992, + "loss/reg": 0.0031961523927748203, + "step": 1670 + }, + { + "epoch": 0.208875, + "grad_norm": 3.573573350906372, + "grad_norm_var": 0.8817782564271193, + "learning_rate": 0.0001, + "loss": 1.0668, + "loss/crossentropy": 2.475907802581787, + "loss/hidden": 0.88671875, + "loss/logits": 0.14815643429756165, + "loss/reg": 0.003194718388840556, + "step": 1671 + }, + { + "epoch": 0.209, + "grad_norm": 2.6452560424804688, + "grad_norm_var": 0.26896437314466315, + "learning_rate": 0.0001, + "loss": 1.0764, + "loss/crossentropy": 2.2734243869781494, + "loss/hidden": 0.8828125, + "loss/logits": 0.16163568198680878, + "loss/reg": 0.0031934131402522326, + "step": 1672 + }, + { + "epoch": 0.209125, + "grad_norm": 2.2780601978302, + "grad_norm_var": 0.28139345731230725, + "learning_rate": 0.0001, + "loss": 1.1114, + "loss/crossentropy": 2.6828384399414062, + "loss/hidden": 0.90625, + "loss/logits": 0.17327217757701874, + "loss/reg": 0.003191707655787468, + "step": 1673 + }, + { + "epoch": 0.20925, + "grad_norm": 3.0598275661468506, + "grad_norm_var": 0.2692776803865986, + "learning_rate": 0.0001, + "loss": 1.1438, + "loss/crossentropy": 2.626523017883301, + "loss/hidden": 0.94921875, + "loss/logits": 0.16267018020153046, + "loss/reg": 0.0031899947207421064, + "step": 1674 + }, + { + "epoch": 0.209375, + "grad_norm": 2.3135793209075928, + "grad_norm_var": 0.28213310678472675, + "learning_rate": 0.0001, + "loss": 1.2395, + "loss/crossentropy": 2.501697063446045, + "loss/hidden": 1.015625, + "loss/logits": 0.19198425114154816, + "loss/reg": 0.003188441740348935, + "step": 1675 + }, + { + "epoch": 0.2095, + "grad_norm": 3.061948537826538, + "grad_norm_var": 0.25265989074379286, + "learning_rate": 0.0001, + "loss": 1.0785, + "loss/crossentropy": 2.630337715148926, + "loss/hidden": 0.8984375, + "loss/logits": 0.14819283783435822, + "loss/reg": 0.003186658024787903, + "step": 1676 + }, + { + "epoch": 0.209625, + "grad_norm": 2.0030603408813477, + "grad_norm_var": 0.27405567589367946, + "learning_rate": 0.0001, + "loss": 1.0526, + "loss/crossentropy": 2.529303789138794, + "loss/hidden": 0.87109375, + "loss/logits": 0.1496969759464264, + "loss/reg": 0.0031848950311541557, + "step": 1677 + }, + { + "epoch": 0.20975, + "grad_norm": 5.00962495803833, + "grad_norm_var": 0.5424888351687083, + "learning_rate": 0.0001, + "loss": 1.1409, + "loss/crossentropy": 2.745102882385254, + "loss/hidden": 0.95703125, + "loss/logits": 0.15204139053821564, + "loss/reg": 0.0031832093372941017, + "step": 1678 + }, + { + "epoch": 0.209875, + "grad_norm": 2.7985355854034424, + "grad_norm_var": 0.5422164981145214, + "learning_rate": 0.0001, + "loss": 1.2753, + "loss/crossentropy": 2.2465641498565674, + "loss/hidden": 1.0625, + "loss/logits": 0.1809433400630951, + "loss/reg": 0.0031814700923860073, + "step": 1679 + }, + { + "epoch": 0.21, + "grad_norm": 4.798881530761719, + "grad_norm_var": 0.7760732092314867, + "learning_rate": 0.0001, + "loss": 1.466, + "loss/crossentropy": 2.329308271408081, + "loss/hidden": 1.2421875, + "loss/logits": 0.1920267790555954, + "loss/reg": 0.003179659601300955, + "step": 1680 + }, + { + "epoch": 0.210125, + "grad_norm": 2.5762155055999756, + "grad_norm_var": 0.7682393360080743, + "learning_rate": 0.0001, + "loss": 1.0703, + "loss/crossentropy": 2.5117287635803223, + "loss/hidden": 0.8828125, + "loss/logits": 0.1556781381368637, + "loss/reg": 0.003177785314619541, + "step": 1681 + }, + { + "epoch": 0.21025, + "grad_norm": 2.2130141258239746, + "grad_norm_var": 0.7450180582948017, + "learning_rate": 0.0001, + "loss": 1.0062, + "loss/crossentropy": 2.4735710620880127, + "loss/hidden": 0.83203125, + "loss/logits": 0.14245635271072388, + "loss/reg": 0.0031759634148329496, + "step": 1682 + }, + { + "epoch": 0.210375, + "grad_norm": 2.1119225025177, + "grad_norm_var": 0.7816966335511644, + "learning_rate": 0.0001, + "loss": 1.1029, + "loss/crossentropy": 2.5301568508148193, + "loss/hidden": 0.90234375, + "loss/logits": 0.1688534915447235, + "loss/reg": 0.0031742649152874947, + "step": 1683 + }, + { + "epoch": 0.2105, + "grad_norm": 2.5934348106384277, + "grad_norm_var": 0.7717750228974445, + "learning_rate": 0.0001, + "loss": 1.3001, + "loss/crossentropy": 2.7114899158477783, + "loss/hidden": 1.078125, + "loss/logits": 0.19023996591567993, + "loss/reg": 0.0031723512802273035, + "step": 1684 + }, + { + "epoch": 0.210625, + "grad_norm": 2.822317361831665, + "grad_norm_var": 0.7684936610933231, + "learning_rate": 0.0001, + "loss": 1.1469, + "loss/crossentropy": 2.4820547103881836, + "loss/hidden": 0.95703125, + "loss/logits": 0.15812493860721588, + "loss/reg": 0.0031706641893833876, + "step": 1685 + }, + { + "epoch": 0.21075, + "grad_norm": 2.204843044281006, + "grad_norm_var": 0.7964126984830915, + "learning_rate": 0.0001, + "loss": 1.0662, + "loss/crossentropy": 2.5764718055725098, + "loss/hidden": 0.8828125, + "loss/logits": 0.15168514847755432, + "loss/reg": 0.0031687715090811253, + "step": 1686 + }, + { + "epoch": 0.210875, + "grad_norm": 1.9805725812911987, + "grad_norm_var": 0.8074897214563511, + "learning_rate": 0.0001, + "loss": 0.9752, + "loss/crossentropy": 2.5879499912261963, + "loss/hidden": 0.81640625, + "loss/logits": 0.12714409828186035, + "loss/reg": 0.0031668762676417828, + "step": 1687 + }, + { + "epoch": 0.211, + "grad_norm": 2.5287587642669678, + "grad_norm_var": 0.8104222753256015, + "learning_rate": 0.0001, + "loss": 1.0874, + "loss/crossentropy": 2.712538957595825, + "loss/hidden": 0.90234375, + "loss/logits": 0.1533837765455246, + "loss/reg": 0.0031649123411625624, + "step": 1688 + }, + { + "epoch": 0.211125, + "grad_norm": 3.2568001747131348, + "grad_norm_var": 0.8058133582529289, + "learning_rate": 0.0001, + "loss": 1.0633, + "loss/crossentropy": 2.5451202392578125, + "loss/hidden": 0.90625, + "loss/logits": 0.12538479268550873, + "loss/reg": 0.0031630489975214005, + "step": 1689 + }, + { + "epoch": 0.21125, + "grad_norm": 2.1492838859558105, + "grad_norm_var": 0.8301337770059201, + "learning_rate": 0.0001, + "loss": 1.0673, + "loss/crossentropy": 2.457829236984253, + "loss/hidden": 0.89453125, + "loss/logits": 0.14114046096801758, + "loss/reg": 0.0031612419988960028, + "step": 1690 + }, + { + "epoch": 0.211375, + "grad_norm": 2.0851340293884277, + "grad_norm_var": 0.8474934557513625, + "learning_rate": 0.0001, + "loss": 1.0295, + "loss/crossentropy": 2.593254327774048, + "loss/hidden": 0.8515625, + "loss/logits": 0.14630341529846191, + "loss/reg": 0.0031592664308845997, + "step": 1691 + }, + { + "epoch": 0.2115, + "grad_norm": 2.692077159881592, + "grad_norm_var": 0.8412586771616655, + "learning_rate": 0.0001, + "loss": 1.1157, + "loss/crossentropy": 2.491499900817871, + "loss/hidden": 0.9375, + "loss/logits": 0.14665257930755615, + "loss/reg": 0.003157339058816433, + "step": 1692 + }, + { + "epoch": 0.211625, + "grad_norm": 1.9899176359176636, + "grad_norm_var": 0.8425591567104391, + "learning_rate": 0.0001, + "loss": 1.043, + "loss/crossentropy": 2.550938367843628, + "loss/hidden": 0.8671875, + "loss/logits": 0.14427784085273743, + "loss/reg": 0.0031556852627545595, + "step": 1693 + }, + { + "epoch": 0.21175, + "grad_norm": 1.861344575881958, + "grad_norm_var": 0.508564313907548, + "learning_rate": 0.0001, + "loss": 0.9407, + "loss/crossentropy": 2.7122299671173096, + "loss/hidden": 0.78515625, + "loss/logits": 0.12400847673416138, + "loss/reg": 0.0031540419440716505, + "step": 1694 + }, + { + "epoch": 0.211875, + "grad_norm": 2.5342886447906494, + "grad_norm_var": 0.5038702664044002, + "learning_rate": 0.0001, + "loss": 1.2289, + "loss/crossentropy": 2.5080294609069824, + "loss/hidden": 0.9921875, + "loss/logits": 0.20523084700107574, + "loss/reg": 0.00315248966217041, + "step": 1695 + }, + { + "epoch": 0.212, + "grad_norm": 2.043546438217163, + "grad_norm_var": 0.14296074842546982, + "learning_rate": 0.0001, + "loss": 0.9838, + "loss/crossentropy": 2.683417320251465, + "loss/hidden": 0.80078125, + "loss/logits": 0.15149196982383728, + "loss/reg": 0.003150953445583582, + "step": 1696 + }, + { + "epoch": 0.212125, + "grad_norm": 2.110501289367676, + "grad_norm_var": 0.14263816283126, + "learning_rate": 0.0001, + "loss": 1.1321, + "loss/crossentropy": 2.24113392829895, + "loss/hidden": 0.9453125, + "loss/logits": 0.155296191573143, + "loss/reg": 0.003149296622723341, + "step": 1697 + }, + { + "epoch": 0.21225, + "grad_norm": 2.7359836101531982, + "grad_norm_var": 0.1520199744222225, + "learning_rate": 0.0001, + "loss": 1.1932, + "loss/crossentropy": 2.5510122776031494, + "loss/hidden": 0.9765625, + "loss/logits": 0.1851990818977356, + "loss/reg": 0.003147589974105358, + "step": 1698 + }, + { + "epoch": 0.212375, + "grad_norm": 2.346010208129883, + "grad_norm_var": 0.1478174979612748, + "learning_rate": 0.0001, + "loss": 1.0448, + "loss/crossentropy": 2.4974029064178467, + "loss/hidden": 0.87890625, + "loss/logits": 0.13440260291099548, + "loss/reg": 0.0031458197627216578, + "step": 1699 + }, + { + "epoch": 0.2125, + "grad_norm": 2.1124165058135986, + "grad_norm_var": 0.14800787911656718, + "learning_rate": 0.0001, + "loss": 1.1699, + "loss/crossentropy": 2.265637159347534, + "loss/hidden": 0.9765625, + "loss/logits": 0.16191065311431885, + "loss/reg": 0.003143977839499712, + "step": 1700 + }, + { + "epoch": 0.212625, + "grad_norm": 2.4635298252105713, + "grad_norm_var": 0.13302139739859153, + "learning_rate": 0.0001, + "loss": 1.2526, + "loss/crossentropy": 2.3409652709960938, + "loss/hidden": 1.03125, + "loss/logits": 0.18988975882530212, + "loss/reg": 0.0031423657201230526, + "step": 1701 + }, + { + "epoch": 0.21275, + "grad_norm": 2.1505675315856934, + "grad_norm_var": 0.1340275686171452, + "learning_rate": 0.0001, + "loss": 1.1472, + "loss/crossentropy": 2.48573899269104, + "loss/hidden": 0.95703125, + "loss/logits": 0.15877564251422882, + "loss/reg": 0.0031405584886670113, + "step": 1702 + }, + { + "epoch": 0.212875, + "grad_norm": 2.3113481998443604, + "grad_norm_var": 0.12611443887347676, + "learning_rate": 0.0001, + "loss": 1.0576, + "loss/crossentropy": 2.165038824081421, + "loss/hidden": 0.88671875, + "loss/logits": 0.139505535364151, + "loss/reg": 0.0031387642957270145, + "step": 1703 + }, + { + "epoch": 0.213, + "grad_norm": 2.0424857139587402, + "grad_norm_var": 0.12837729482331822, + "learning_rate": 0.0001, + "loss": 1.189, + "loss/crossentropy": 2.2224714756011963, + "loss/hidden": 0.9609375, + "loss/logits": 0.19673088192939758, + "loss/reg": 0.0031370187643915415, + "step": 1704 + }, + { + "epoch": 0.213125, + "grad_norm": 2.016124725341797, + "grad_norm_var": 0.06718613229377196, + "learning_rate": 0.0001, + "loss": 0.9392, + "loss/crossentropy": 2.3173725605010986, + "loss/hidden": 0.78125, + "loss/logits": 0.1265917271375656, + "loss/reg": 0.0031352161895483732, + "step": 1705 + }, + { + "epoch": 0.21325, + "grad_norm": 2.1489810943603516, + "grad_norm_var": 0.06718930728756754, + "learning_rate": 0.0001, + "loss": 0.9914, + "loss/crossentropy": 2.3617665767669678, + "loss/hidden": 0.828125, + "loss/logits": 0.13194003701210022, + "loss/reg": 0.0031336136162281036, + "step": 1706 + }, + { + "epoch": 0.213375, + "grad_norm": 2.2248895168304443, + "grad_norm_var": 0.06575221726070399, + "learning_rate": 0.0001, + "loss": 0.9928, + "loss/crossentropy": 2.3003880977630615, + "loss/hidden": 0.83984375, + "loss/logits": 0.12159569561481476, + "loss/reg": 0.0031318794935941696, + "step": 1707 + }, + { + "epoch": 0.2135, + "grad_norm": 2.7656562328338623, + "grad_norm_var": 0.07056003633158045, + "learning_rate": 0.0001, + "loss": 1.0895, + "loss/crossentropy": 2.4966132640838623, + "loss/hidden": 0.90625, + "loss/logits": 0.1519893854856491, + "loss/reg": 0.0031300997361540794, + "step": 1708 + }, + { + "epoch": 0.213625, + "grad_norm": 2.4991726875305176, + "grad_norm_var": 0.06971341387025494, + "learning_rate": 0.0001, + "loss": 1.0979, + "loss/crossentropy": 2.0994441509246826, + "loss/hidden": 0.90234375, + "loss/logits": 0.16422367095947266, + "loss/reg": 0.003128266194835305, + "step": 1709 + }, + { + "epoch": 0.21375, + "grad_norm": 2.4266979694366455, + "grad_norm_var": 0.05866460350893187, + "learning_rate": 0.0001, + "loss": 1.0559, + "loss/crossentropy": 2.5089967250823975, + "loss/hidden": 0.87890625, + "loss/logits": 0.14569693803787231, + "loss/reg": 0.003126643830910325, + "step": 1710 + }, + { + "epoch": 0.213875, + "grad_norm": 2.5570080280303955, + "grad_norm_var": 0.05938155406816629, + "learning_rate": 0.0001, + "loss": 1.1512, + "loss/crossentropy": 2.390232801437378, + "loss/hidden": 0.98046875, + "loss/logits": 0.13949596881866455, + "loss/reg": 0.0031250508036464453, + "step": 1711 + }, + { + "epoch": 0.214, + "grad_norm": 2.4763524532318115, + "grad_norm_var": 0.05573108256271908, + "learning_rate": 0.0001, + "loss": 1.2109, + "loss/crossentropy": 2.4115543365478516, + "loss/hidden": 1.0, + "loss/logits": 0.17969435453414917, + "loss/reg": 0.003123391419649124, + "step": 1712 + }, + { + "epoch": 0.214125, + "grad_norm": 1.9605647325515747, + "grad_norm_var": 0.061658860743410496, + "learning_rate": 0.0001, + "loss": 1.004, + "loss/crossentropy": 2.1503031253814697, + "loss/hidden": 0.83203125, + "loss/logits": 0.14071419835090637, + "loss/reg": 0.0031219625379890203, + "step": 1713 + }, + { + "epoch": 0.21425, + "grad_norm": 2.206909656524658, + "grad_norm_var": 0.05032832725278471, + "learning_rate": 0.0001, + "loss": 0.9881, + "loss/crossentropy": 2.740894079208374, + "loss/hidden": 0.81640625, + "loss/logits": 0.14050860702991486, + "loss/reg": 0.0031206535641103983, + "step": 1714 + }, + { + "epoch": 0.214375, + "grad_norm": 2.084909677505493, + "grad_norm_var": 0.05278877705570242, + "learning_rate": 0.0001, + "loss": 1.1267, + "loss/crossentropy": 2.4331443309783936, + "loss/hidden": 0.9375, + "loss/logits": 0.1579878181219101, + "loss/reg": 0.003117976011708379, + "step": 1715 + }, + { + "epoch": 0.2145, + "grad_norm": 2.2570641040802, + "grad_norm_var": 0.050903424022511426, + "learning_rate": 0.0001, + "loss": 0.9784, + "loss/crossentropy": 2.688154935836792, + "loss/hidden": 0.80078125, + "loss/logits": 0.14644555747509003, + "loss/reg": 0.0031148470006883144, + "step": 1716 + }, + { + "epoch": 0.214625, + "grad_norm": 2.1008963584899902, + "grad_norm_var": 0.05058773933843851, + "learning_rate": 0.0001, + "loss": 1.0819, + "loss/crossentropy": 2.5256693363189697, + "loss/hidden": 0.90234375, + "loss/logits": 0.14845484495162964, + "loss/reg": 0.003113400423899293, + "step": 1717 + }, + { + "epoch": 0.21475, + "grad_norm": 2.4614713191986084, + "grad_norm_var": 0.05191226779637403, + "learning_rate": 0.0001, + "loss": 1.0564, + "loss/crossentropy": 2.5726735591888428, + "loss/hidden": 0.87109375, + "loss/logits": 0.15416929125785828, + "loss/reg": 0.0031117405742406845, + "step": 1718 + }, + { + "epoch": 0.214875, + "grad_norm": 2.3064160346984863, + "grad_norm_var": 0.05189566088851283, + "learning_rate": 0.0001, + "loss": 1.1013, + "loss/crossentropy": 2.7065393924713135, + "loss/hidden": 0.90234375, + "loss/logits": 0.16784769296646118, + "loss/reg": 0.0031088702380657196, + "step": 1719 + }, + { + "epoch": 0.215, + "grad_norm": 9.079465866088867, + "grad_norm_var": 2.9207271705017384, + "learning_rate": 0.0001, + "loss": 1.3854, + "loss/crossentropy": 2.3284268379211426, + "loss/hidden": 1.1328125, + "loss/logits": 0.22156238555908203, + "loss/reg": 0.003107408294454217, + "step": 1720 + }, + { + "epoch": 0.215125, + "grad_norm": 2.1745572090148926, + "grad_norm_var": 2.9073576589134493, + "learning_rate": 0.0001, + "loss": 1.1169, + "loss/crossentropy": 2.095036745071411, + "loss/hidden": 0.93359375, + "loss/logits": 0.15225940942764282, + "loss/reg": 0.003105347976088524, + "step": 1721 + }, + { + "epoch": 0.21525, + "grad_norm": 2.243680238723755, + "grad_norm_var": 2.9005416312984256, + "learning_rate": 0.0001, + "loss": 1.0107, + "loss/crossentropy": 2.2994542121887207, + "loss/hidden": 0.84375, + "loss/logits": 0.1359337866306305, + "loss/reg": 0.003103848546743393, + "step": 1722 + }, + { + "epoch": 0.215375, + "grad_norm": 2.6446003913879395, + "grad_norm_var": 2.882775101197618, + "learning_rate": 0.0001, + "loss": 1.2141, + "loss/crossentropy": 2.498286008834839, + "loss/hidden": 1.0234375, + "loss/logits": 0.1595914363861084, + "loss/reg": 0.003102482995018363, + "step": 1723 + }, + { + "epoch": 0.2155, + "grad_norm": 4.21211576461792, + "grad_norm_var": 3.0136016192372757, + "learning_rate": 0.0001, + "loss": 1.0643, + "loss/crossentropy": 2.2280969619750977, + "loss/hidden": 0.8828125, + "loss/logits": 0.15044564008712769, + "loss/reg": 0.0031010708771646023, + "step": 1724 + }, + { + "epoch": 0.215625, + "grad_norm": 2.093863010406494, + "grad_norm_var": 3.0431383662912457, + "learning_rate": 0.0001, + "loss": 0.9815, + "loss/crossentropy": 2.27380108833313, + "loss/hidden": 0.82421875, + "loss/logits": 0.12631164491176605, + "loss/reg": 0.0030996648129075766, + "step": 1725 + }, + { + "epoch": 0.21575, + "grad_norm": 2.16963267326355, + "grad_norm_var": 3.061105934508265, + "learning_rate": 0.0001, + "loss": 1.1956, + "loss/crossentropy": 2.180711269378662, + "loss/hidden": 0.9921875, + "loss/logits": 0.17242172360420227, + "loss/reg": 0.00309771322645247, + "step": 1726 + }, + { + "epoch": 0.215875, + "grad_norm": 3.120957136154175, + "grad_norm_var": 3.0616334113432386, + "learning_rate": 0.0001, + "loss": 1.1386, + "loss/crossentropy": 2.4755892753601074, + "loss/hidden": 0.92578125, + "loss/logits": 0.18183037638664246, + "loss/reg": 0.0030961879529058933, + "step": 1727 + }, + { + "epoch": 0.216, + "grad_norm": 2.810375213623047, + "grad_norm_var": 3.051983920589513, + "learning_rate": 0.0001, + "loss": 1.0356, + "loss/crossentropy": 2.844090461730957, + "loss/hidden": 0.85546875, + "loss/logits": 0.14921194314956665, + "loss/reg": 0.0030942922458052635, + "step": 1728 + }, + { + "epoch": 0.216125, + "grad_norm": 2.351074695587158, + "grad_norm_var": 3.01413823672746, + "learning_rate": 0.0001, + "loss": 1.1406, + "loss/crossentropy": 2.617952823638916, + "loss/hidden": 0.9375, + "loss/logits": 0.17222052812576294, + "loss/reg": 0.00309238163754344, + "step": 1729 + }, + { + "epoch": 0.21625, + "grad_norm": 3.7283058166503906, + "grad_norm_var": 3.0192480530971872, + "learning_rate": 0.0001, + "loss": 1.3238, + "loss/crossentropy": 2.559141159057617, + "loss/hidden": 0.99609375, + "loss/logits": 0.2968454360961914, + "loss/reg": 0.0030908475164324045, + "step": 1730 + }, + { + "epoch": 0.216375, + "grad_norm": 2.9617199897766113, + "grad_norm_var": 2.961489976152211, + "learning_rate": 0.0001, + "loss": 1.3122, + "loss/crossentropy": 2.598041534423828, + "loss/hidden": 1.078125, + "loss/logits": 0.2031644731760025, + "loss/reg": 0.00308916624635458, + "step": 1731 + }, + { + "epoch": 0.2165, + "grad_norm": 2.236485481262207, + "grad_norm_var": 2.9636777426758716, + "learning_rate": 0.0001, + "loss": 1.0227, + "loss/crossentropy": 2.625614881515503, + "loss/hidden": 0.83984375, + "loss/logits": 0.15201476216316223, + "loss/reg": 0.003087525488808751, + "step": 1732 + }, + { + "epoch": 0.216625, + "grad_norm": 2.3135018348693848, + "grad_norm_var": 2.9397831294271772, + "learning_rate": 0.0001, + "loss": 0.9559, + "loss/crossentropy": 2.3203794956207275, + "loss/hidden": 0.7890625, + "loss/logits": 0.13600721955299377, + "loss/reg": 0.003086032345890999, + "step": 1733 + }, + { + "epoch": 0.21675, + "grad_norm": 2.0412943363189697, + "grad_norm_var": 2.984167856020982, + "learning_rate": 0.0001, + "loss": 1.0881, + "loss/crossentropy": 2.715367317199707, + "loss/hidden": 0.89453125, + "loss/logits": 0.1627357304096222, + "loss/reg": 0.0030844947323203087, + "step": 1734 + }, + { + "epoch": 0.216875, + "grad_norm": 3.1161649227142334, + "grad_norm_var": 2.946971551780882, + "learning_rate": 0.0001, + "loss": 1.0572, + "loss/crossentropy": 2.4200503826141357, + "loss/hidden": 0.8828125, + "loss/logits": 0.14354635775089264, + "loss/reg": 0.0030825661960989237, + "step": 1735 + }, + { + "epoch": 0.217, + "grad_norm": 3.2668862342834473, + "grad_norm_var": 0.4098138660932987, + "learning_rate": 0.0001, + "loss": 1.1414, + "loss/crossentropy": 2.4813458919525146, + "loss/hidden": 0.953125, + "loss/logits": 0.15744319558143616, + "loss/reg": 0.003080642083659768, + "step": 1736 + }, + { + "epoch": 0.217125, + "grad_norm": 1.9718143939971924, + "grad_norm_var": 0.42706875074818434, + "learning_rate": 0.0001, + "loss": 1.0276, + "loss/crossentropy": 2.2978994846343994, + "loss/hidden": 0.8671875, + "loss/logits": 0.12966391444206238, + "loss/reg": 0.0030790595337748528, + "step": 1737 + }, + { + "epoch": 0.21725, + "grad_norm": 2.1405398845672607, + "grad_norm_var": 0.4340798374863008, + "learning_rate": 0.0001, + "loss": 1.1405, + "loss/crossentropy": 2.091478109359741, + "loss/hidden": 0.94140625, + "loss/logits": 0.16836108267307281, + "loss/reg": 0.0030774776823818684, + "step": 1738 + }, + { + "epoch": 0.217375, + "grad_norm": 3.8788137435913086, + "grad_norm_var": 0.5203809166356061, + "learning_rate": 0.0001, + "loss": 1.751, + "loss/crossentropy": 2.3941147327423096, + "loss/hidden": 1.4453125, + "loss/logits": 0.274971067905426, + "loss/reg": 0.003075655549764633, + "step": 1739 + }, + { + "epoch": 0.2175, + "grad_norm": 1.9825366735458374, + "grad_norm_var": 0.4040997474989869, + "learning_rate": 0.0001, + "loss": 1.1001, + "loss/crossentropy": 2.233893632888794, + "loss/hidden": 0.91796875, + "loss/logits": 0.15135906636714935, + "loss/reg": 0.0030739654321223497, + "step": 1740 + }, + { + "epoch": 0.217625, + "grad_norm": 2.0987682342529297, + "grad_norm_var": 0.4037463519266102, + "learning_rate": 0.0001, + "loss": 1.0675, + "loss/crossentropy": 2.4455137252807617, + "loss/hidden": 0.875, + "loss/logits": 0.1617324948310852, + "loss/reg": 0.0030723894014954567, + "step": 1741 + }, + { + "epoch": 0.21775, + "grad_norm": 3.920135498046875, + "grad_norm_var": 0.48622454106490515, + "learning_rate": 0.0001, + "loss": 1.1559, + "loss/crossentropy": 2.6298489570617676, + "loss/hidden": 0.94140625, + "loss/logits": 0.18376988172531128, + "loss/reg": 0.0030707602854818106, + "step": 1742 + }, + { + "epoch": 0.217875, + "grad_norm": 2.690436363220215, + "grad_norm_var": 0.4762973265463903, + "learning_rate": 0.0001, + "loss": 1.2467, + "loss/crossentropy": 2.1424267292022705, + "loss/hidden": 1.0625, + "loss/logits": 0.15353702008724213, + "loss/reg": 0.0030691707506775856, + "step": 1743 + }, + { + "epoch": 0.218, + "grad_norm": 2.604951858520508, + "grad_norm_var": 0.47644030986631136, + "learning_rate": 0.0001, + "loss": 1.1014, + "loss/crossentropy": 2.54174542427063, + "loss/hidden": 0.91015625, + "loss/logits": 0.1606130450963974, + "loss/reg": 0.00306738936342299, + "step": 1744 + }, + { + "epoch": 0.218125, + "grad_norm": 3.3228073120117188, + "grad_norm_var": 0.48941099514094233, + "learning_rate": 0.0001, + "loss": 1.0578, + "loss/crossentropy": 2.3931949138641357, + "loss/hidden": 0.890625, + "loss/logits": 0.13647425174713135, + "loss/reg": 0.003065774915739894, + "step": 1745 + }, + { + "epoch": 0.21825, + "grad_norm": 2.641117811203003, + "grad_norm_var": 0.42396390393689143, + "learning_rate": 0.0001, + "loss": 0.971, + "loss/crossentropy": 2.4205129146575928, + "loss/hidden": 0.83203125, + "loss/logits": 0.10831993818283081, + "loss/reg": 0.003064037999138236, + "step": 1746 + }, + { + "epoch": 0.218375, + "grad_norm": 2.7706568241119385, + "grad_norm_var": 0.4195589879953497, + "learning_rate": 0.0001, + "loss": 1.1104, + "loss/crossentropy": 2.719468355178833, + "loss/hidden": 0.9140625, + "loss/logits": 0.16571125388145447, + "loss/reg": 0.0030624454375356436, + "step": 1747 + }, + { + "epoch": 0.2185, + "grad_norm": 2.2494888305664062, + "grad_norm_var": 0.41878793071204773, + "learning_rate": 0.0001, + "loss": 1.0321, + "loss/crossentropy": 2.7627296447753906, + "loss/hidden": 0.84765625, + "loss/logits": 0.15384793281555176, + "loss/reg": 0.003060864983126521, + "step": 1748 + }, + { + "epoch": 0.218625, + "grad_norm": 2.4056599140167236, + "grad_norm_var": 0.41471554214321593, + "learning_rate": 0.0001, + "loss": 1.0339, + "loss/crossentropy": 2.546539068222046, + "loss/hidden": 0.8671875, + "loss/logits": 0.13611117005348206, + "loss/reg": 0.0030592146795243025, + "step": 1749 + }, + { + "epoch": 0.21875, + "grad_norm": 2.493166923522949, + "grad_norm_var": 0.38815929501957475, + "learning_rate": 0.0001, + "loss": 1.0489, + "loss/crossentropy": 2.243028402328491, + "loss/hidden": 0.87109375, + "loss/logits": 0.1472695916891098, + "loss/reg": 0.0030575725249946117, + "step": 1750 + }, + { + "epoch": 0.218875, + "grad_norm": 2.213844060897827, + "grad_norm_var": 0.3916385925474808, + "learning_rate": 0.0001, + "loss": 1.0008, + "loss/crossentropy": 2.419025182723999, + "loss/hidden": 0.83984375, + "loss/logits": 0.13044574856758118, + "loss/reg": 0.003055924316868186, + "step": 1751 + }, + { + "epoch": 0.219, + "grad_norm": 2.769224166870117, + "grad_norm_var": 0.3672278962107095, + "learning_rate": 0.0001, + "loss": 1.0385, + "loss/crossentropy": 2.7889208793640137, + "loss/hidden": 0.8671875, + "loss/logits": 0.1408083736896515, + "loss/reg": 0.0030543410684913397, + "step": 1752 + }, + { + "epoch": 0.219125, + "grad_norm": 2.4764599800109863, + "grad_norm_var": 0.33854682568550765, + "learning_rate": 0.0001, + "loss": 1.0605, + "loss/crossentropy": 2.422456741333008, + "loss/hidden": 0.89453125, + "loss/logits": 0.1354484111070633, + "loss/reg": 0.0030527592170983553, + "step": 1753 + }, + { + "epoch": 0.21925, + "grad_norm": 2.322908639907837, + "grad_norm_var": 0.32784450880299965, + "learning_rate": 0.0001, + "loss": 1.1523, + "loss/crossentropy": 2.441586494445801, + "loss/hidden": 0.94921875, + "loss/logits": 0.17252781987190247, + "loss/reg": 0.003051069099456072, + "step": 1754 + }, + { + "epoch": 0.219375, + "grad_norm": 2.6239187717437744, + "grad_norm_var": 0.22527430071220297, + "learning_rate": 0.0001, + "loss": 1.1402, + "loss/crossentropy": 2.3880774974823, + "loss/hidden": 0.94140625, + "loss/logits": 0.16831141710281372, + "loss/reg": 0.0030493696685880423, + "step": 1755 + }, + { + "epoch": 0.2195, + "grad_norm": 17.505414962768555, + "grad_norm_var": 14.009084703934427, + "learning_rate": 0.0001, + "loss": 1.3282, + "loss/crossentropy": 2.6417012214660645, + "loss/hidden": 1.1328125, + "loss/logits": 0.16491027176380157, + "loss/reg": 0.003047748701646924, + "step": 1756 + }, + { + "epoch": 0.219625, + "grad_norm": 4.20535135269165, + "grad_norm_var": 13.873398017294948, + "learning_rate": 0.0001, + "loss": 1.131, + "loss/crossentropy": 2.521103858947754, + "loss/hidden": 0.96484375, + "loss/logits": 0.13569971919059753, + "loss/reg": 0.0030460914131253958, + "step": 1757 + }, + { + "epoch": 0.21975, + "grad_norm": 2.9682602882385254, + "grad_norm_var": 13.902211592229294, + "learning_rate": 0.0001, + "loss": 1.1961, + "loss/crossentropy": 2.4939701557159424, + "loss/hidden": 1.0, + "loss/logits": 0.165610671043396, + "loss/reg": 0.0030445046722888947, + "step": 1758 + }, + { + "epoch": 0.219875, + "grad_norm": 2.7488410472869873, + "grad_norm_var": 13.895018738483493, + "learning_rate": 0.0001, + "loss": 1.0071, + "loss/crossentropy": 2.5324008464813232, + "loss/hidden": 0.84765625, + "loss/logits": 0.12899169325828552, + "loss/reg": 0.003042889991775155, + "step": 1759 + }, + { + "epoch": 0.22, + "grad_norm": 2.1646311283111572, + "grad_norm_var": 13.968204624958085, + "learning_rate": 0.0001, + "loss": 1.1448, + "loss/crossentropy": 2.5288894176483154, + "loss/hidden": 0.94921875, + "loss/logits": 0.16518330574035645, + "loss/reg": 0.0030412436462938786, + "step": 1760 + }, + { + "epoch": 0.220125, + "grad_norm": 1.9247658252716064, + "grad_norm_var": 14.145314883597047, + "learning_rate": 0.0001, + "loss": 1.0406, + "loss/crossentropy": 2.5389163494110107, + "loss/hidden": 0.8671875, + "loss/logits": 0.1429884135723114, + "loss/reg": 0.003039830131456256, + "step": 1761 + }, + { + "epoch": 0.22025, + "grad_norm": 2.1075799465179443, + "grad_norm_var": 14.226356437632456, + "learning_rate": 0.0001, + "loss": 1.1968, + "loss/crossentropy": 2.422471046447754, + "loss/hidden": 1.0078125, + "loss/logits": 0.15863800048828125, + "loss/reg": 0.0030385232530534267, + "step": 1762 + }, + { + "epoch": 0.220375, + "grad_norm": 2.6114046573638916, + "grad_norm_var": 14.243361987467381, + "learning_rate": 0.0001, + "loss": 0.9892, + "loss/crossentropy": 2.574613571166992, + "loss/hidden": 0.83203125, + "loss/logits": 0.1268356591463089, + "loss/reg": 0.0030372380279004574, + "step": 1763 + }, + { + "epoch": 0.2205, + "grad_norm": 2.0650744438171387, + "grad_norm_var": 14.275914518581828, + "learning_rate": 0.0001, + "loss": 1.1595, + "loss/crossentropy": 2.4691414833068848, + "loss/hidden": 0.98046875, + "loss/logits": 0.1486453115940094, + "loss/reg": 0.003035652916878462, + "step": 1764 + }, + { + "epoch": 0.220625, + "grad_norm": 2.160597085952759, + "grad_norm_var": 14.314622026235172, + "learning_rate": 0.0001, + "loss": 1.1022, + "loss/crossentropy": 2.609285593032837, + "loss/hidden": 0.921875, + "loss/logits": 0.15001603960990906, + "loss/reg": 0.0030343951657414436, + "step": 1765 + }, + { + "epoch": 0.22075, + "grad_norm": 3.069577217102051, + "grad_norm_var": 14.261074973549244, + "learning_rate": 0.0001, + "loss": 1.1575, + "loss/crossentropy": 2.647573947906494, + "loss/hidden": 0.96875, + "loss/logits": 0.1584310233592987, + "loss/reg": 0.003032844513654709, + "step": 1766 + }, + { + "epoch": 0.220875, + "grad_norm": 2.568370819091797, + "grad_norm_var": 14.208317261947524, + "learning_rate": 0.0001, + "loss": 1.0488, + "loss/crossentropy": 2.6948702335357666, + "loss/hidden": 0.87109375, + "loss/logits": 0.1474071443080902, + "loss/reg": 0.0030314817558974028, + "step": 1767 + }, + { + "epoch": 0.221, + "grad_norm": 2.68941068649292, + "grad_norm_var": 14.21668663304105, + "learning_rate": 0.0001, + "loss": 1.0456, + "loss/crossentropy": 2.5210416316986084, + "loss/hidden": 0.87109375, + "loss/logits": 0.14420348405838013, + "loss/reg": 0.0030302261002361774, + "step": 1768 + }, + { + "epoch": 0.221125, + "grad_norm": 1.9921388626098633, + "grad_norm_var": 14.298301261709696, + "learning_rate": 0.0001, + "loss": 1.0236, + "loss/crossentropy": 2.522998332977295, + "loss/hidden": 0.84765625, + "loss/logits": 0.14565014839172363, + "loss/reg": 0.00302865426056087, + "step": 1769 + }, + { + "epoch": 0.22125, + "grad_norm": 1.8499289751052856, + "grad_norm_var": 14.38544404016637, + "learning_rate": 0.0001, + "loss": 1.0027, + "loss/crossentropy": 2.466783046722412, + "loss/hidden": 0.828125, + "loss/logits": 0.14433151483535767, + "loss/reg": 0.0030273436568677425, + "step": 1770 + }, + { + "epoch": 0.221375, + "grad_norm": 2.4636669158935547, + "grad_norm_var": 14.404773691988826, + "learning_rate": 0.0001, + "loss": 1.0178, + "loss/crossentropy": 2.4166789054870605, + "loss/hidden": 0.84765625, + "loss/logits": 0.13983449339866638, + "loss/reg": 0.0030260428320616484, + "step": 1771 + }, + { + "epoch": 0.2215, + "grad_norm": 2.67075514793396, + "grad_norm_var": 0.3450175902124835, + "learning_rate": 0.0001, + "loss": 1.0199, + "loss/crossentropy": 2.695432662963867, + "loss/hidden": 0.84765625, + "loss/logits": 0.14194995164871216, + "loss/reg": 0.0030244754161685705, + "step": 1772 + }, + { + "epoch": 0.221625, + "grad_norm": 2.4573214054107666, + "grad_norm_var": 0.14231832979352052, + "learning_rate": 0.0001, + "loss": 0.9742, + "loss/crossentropy": 2.5035762786865234, + "loss/hidden": 0.8203125, + "loss/logits": 0.12364549934864044, + "loss/reg": 0.003022938035428524, + "step": 1773 + }, + { + "epoch": 0.22175, + "grad_norm": 2.343374729156494, + "grad_norm_var": 0.11996201542798221, + "learning_rate": 0.0001, + "loss": 1.0924, + "loss/crossentropy": 2.43853497505188, + "loss/hidden": 0.9140625, + "loss/logits": 0.14812292158603668, + "loss/reg": 0.003021500539034605, + "step": 1774 + }, + { + "epoch": 0.221875, + "grad_norm": 3.566265344619751, + "grad_norm_var": 0.2032350727933528, + "learning_rate": 0.0001, + "loss": 1.1878, + "loss/crossentropy": 2.46608567237854, + "loss/hidden": 0.97265625, + "loss/logits": 0.18495416641235352, + "loss/reg": 0.00301993521861732, + "step": 1775 + }, + { + "epoch": 0.222, + "grad_norm": 2.747910976409912, + "grad_norm_var": 0.20471190685867377, + "learning_rate": 0.0001, + "loss": 1.0738, + "loss/crossentropy": 2.6310603618621826, + "loss/hidden": 0.87890625, + "loss/logits": 0.16469183564186096, + "loss/reg": 0.003018364543095231, + "step": 1776 + }, + { + "epoch": 0.222125, + "grad_norm": 2.1164255142211914, + "grad_norm_var": 0.1934448052628894, + "learning_rate": 0.0001, + "loss": 0.9762, + "loss/crossentropy": 2.8867831230163574, + "loss/hidden": 0.8046875, + "loss/logits": 0.1413034349679947, + "loss/reg": 0.0030167822260409594, + "step": 1777 + }, + { + "epoch": 0.22225, + "grad_norm": 3.035059690475464, + "grad_norm_var": 0.20270085598930473, + "learning_rate": 0.0001, + "loss": 1.1703, + "loss/crossentropy": 2.1823151111602783, + "loss/hidden": 0.9765625, + "loss/logits": 0.16358500719070435, + "loss/reg": 0.0030153028201311827, + "step": 1778 + }, + { + "epoch": 0.222375, + "grad_norm": 2.5045015811920166, + "grad_norm_var": 0.20219002055305796, + "learning_rate": 0.0001, + "loss": 1.2038, + "loss/crossentropy": 2.401313543319702, + "loss/hidden": 1.0078125, + "loss/logits": 0.16582559049129486, + "loss/reg": 0.0030142655596137047, + "step": 1779 + }, + { + "epoch": 0.2225, + "grad_norm": 2.473170042037964, + "grad_norm_var": 0.1879118733867992, + "learning_rate": 0.0001, + "loss": 1.0329, + "loss/crossentropy": 2.824697494506836, + "loss/hidden": 0.859375, + "loss/logits": 0.14334872364997864, + "loss/reg": 0.003013218054547906, + "step": 1780 + }, + { + "epoch": 0.222625, + "grad_norm": 2.2666685581207275, + "grad_norm_var": 0.18318870026567513, + "learning_rate": 0.0001, + "loss": 0.91, + "loss/crossentropy": 2.4266369342803955, + "loss/hidden": 0.7734375, + "loss/logits": 0.10641002655029297, + "loss/reg": 0.0030120951123535633, + "step": 1781 + }, + { + "epoch": 0.22275, + "grad_norm": 2.411001205444336, + "grad_norm_var": 0.16475203538871402, + "learning_rate": 0.0001, + "loss": 0.9902, + "loss/crossentropy": 2.7164502143859863, + "loss/hidden": 0.82421875, + "loss/logits": 0.13585732877254486, + "loss/reg": 0.0030110396910458803, + "step": 1782 + }, + { + "epoch": 0.222875, + "grad_norm": 2.555788040161133, + "grad_norm_var": 0.16466357931168235, + "learning_rate": 0.0001, + "loss": 0.9362, + "loss/crossentropy": 2.4590888023376465, + "loss/hidden": 0.78515625, + "loss/logits": 0.12098520994186401, + "loss/reg": 0.0030094829853624105, + "step": 1783 + }, + { + "epoch": 0.223, + "grad_norm": 2.0648751258850098, + "grad_norm_var": 0.17401513224721246, + "learning_rate": 0.0001, + "loss": 0.9556, + "loss/crossentropy": 2.3822426795959473, + "loss/hidden": 0.78515625, + "loss/logits": 0.1403384804725647, + "loss/reg": 0.003007945604622364, + "step": 1784 + }, + { + "epoch": 0.223125, + "grad_norm": 4.551196098327637, + "grad_norm_var": 0.4202881155883109, + "learning_rate": 0.0001, + "loss": 1.8349, + "loss/crossentropy": 1.9226468801498413, + "loss/hidden": 1.4609375, + "loss/logits": 0.3438549041748047, + "loss/reg": 0.003006393788382411, + "step": 1785 + }, + { + "epoch": 0.22325, + "grad_norm": 3.032957077026367, + "grad_norm_var": 0.38473481866021925, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.3850667476654053, + "loss/hidden": 0.90625, + "loss/logits": 0.15473738312721252, + "loss/reg": 0.0030050212517380714, + "step": 1786 + }, + { + "epoch": 0.223375, + "grad_norm": 2.6902732849121094, + "grad_norm_var": 0.3806885371660407, + "learning_rate": 0.0001, + "loss": 1.1889, + "loss/crossentropy": 2.324967861175537, + "loss/hidden": 0.984375, + "loss/logits": 0.17453113198280334, + "loss/reg": 0.0030037900432944298, + "step": 1787 + }, + { + "epoch": 0.2235, + "grad_norm": 2.168304443359375, + "grad_norm_var": 0.3996302660743254, + "learning_rate": 0.0001, + "loss": 1.0764, + "loss/crossentropy": 2.2956700325012207, + "loss/hidden": 0.8984375, + "loss/logits": 0.14791284501552582, + "loss/reg": 0.003002553479745984, + "step": 1788 + }, + { + "epoch": 0.223625, + "grad_norm": 2.74092435836792, + "grad_norm_var": 0.3959885005070151, + "learning_rate": 0.0001, + "loss": 0.9535, + "loss/crossentropy": 2.631945848464966, + "loss/hidden": 0.79296875, + "loss/logits": 0.1305209994316101, + "loss/reg": 0.0030011215712875128, + "step": 1789 + }, + { + "epoch": 0.22375, + "grad_norm": 2.1927857398986816, + "grad_norm_var": 0.4046525348789257, + "learning_rate": 0.0001, + "loss": 1.0108, + "loss/crossentropy": 2.404738664627075, + "loss/hidden": 0.83984375, + "loss/logits": 0.14092186093330383, + "loss/reg": 0.002999563468620181, + "step": 1790 + }, + { + "epoch": 0.223875, + "grad_norm": 2.6189329624176025, + "grad_norm_var": 0.35067712323398886, + "learning_rate": 0.0001, + "loss": 0.9651, + "loss/crossentropy": 2.389113664627075, + "loss/hidden": 0.78515625, + "loss/logits": 0.15001046657562256, + "loss/reg": 0.002998023759573698, + "step": 1791 + }, + { + "epoch": 0.224, + "grad_norm": 2.298084020614624, + "grad_norm_var": 0.35659197751071947, + "learning_rate": 0.0001, + "loss": 1.0233, + "loss/crossentropy": 2.5539371967315674, + "loss/hidden": 0.84765625, + "loss/logits": 0.14568671584129333, + "loss/reg": 0.002996444469317794, + "step": 1792 + }, + { + "epoch": 0.224125, + "grad_norm": 2.6242432594299316, + "grad_norm_var": 0.3394552173241588, + "learning_rate": 0.0001, + "loss": 1.1973, + "loss/crossentropy": 2.5641584396362305, + "loss/hidden": 0.984375, + "loss/logits": 0.18293890357017517, + "loss/reg": 0.002994769951328635, + "step": 1793 + }, + { + "epoch": 0.22425, + "grad_norm": 3.7077572345733643, + "grad_norm_var": 0.4032349111532985, + "learning_rate": 0.0001, + "loss": 1.26, + "loss/crossentropy": 1.6599891185760498, + "loss/hidden": 1.09375, + "loss/logits": 0.13635557889938354, + "loss/reg": 0.0029932132456451654, + "step": 1794 + }, + { + "epoch": 0.224375, + "grad_norm": 3.1581003665924072, + "grad_norm_var": 0.41452339637513186, + "learning_rate": 0.0001, + "loss": 0.9995, + "loss/crossentropy": 2.8043997287750244, + "loss/hidden": 0.83203125, + "loss/logits": 0.1375463604927063, + "loss/reg": 0.0029914977494627237, + "step": 1795 + }, + { + "epoch": 0.2245, + "grad_norm": 3.2209360599517822, + "grad_norm_var": 0.42464256487447377, + "learning_rate": 0.0001, + "loss": 1.1752, + "loss/crossentropy": 2.321751832962036, + "loss/hidden": 0.9921875, + "loss/logits": 0.15309128165245056, + "loss/reg": 0.0029897540807724, + "step": 1796 + }, + { + "epoch": 0.224625, + "grad_norm": 3.6839895248413086, + "grad_norm_var": 0.45527767818373166, + "learning_rate": 0.0001, + "loss": 1.2765, + "loss/crossentropy": 2.8542239665985107, + "loss/hidden": 1.0234375, + "loss/logits": 0.2231515347957611, + "loss/reg": 0.002988190157338977, + "step": 1797 + }, + { + "epoch": 0.22475, + "grad_norm": 3.53092622756958, + "grad_norm_var": 0.4669931032592082, + "learning_rate": 0.0001, + "loss": 1.2964, + "loss/crossentropy": 2.4456238746643066, + "loss/hidden": 1.0703125, + "loss/logits": 0.19626130163669586, + "loss/reg": 0.002986533334478736, + "step": 1798 + }, + { + "epoch": 0.224875, + "grad_norm": 3.053359031677246, + "grad_norm_var": 0.4578059410900018, + "learning_rate": 0.0001, + "loss": 0.9795, + "loss/crossentropy": 2.8894784450531006, + "loss/hidden": 0.8203125, + "loss/logits": 0.12932872772216797, + "loss/reg": 0.0029848606791347265, + "step": 1799 + }, + { + "epoch": 0.225, + "grad_norm": 3.2195730209350586, + "grad_norm_var": 0.4035408308703057, + "learning_rate": 0.0001, + "loss": 1.4028, + "loss/crossentropy": 2.3305225372314453, + "loss/hidden": 1.1875, + "loss/logits": 0.18548178672790527, + "loss/reg": 0.002983321435749531, + "step": 1800 + }, + { + "epoch": 0.225125, + "grad_norm": 2.86997652053833, + "grad_norm_var": 0.23937467026572654, + "learning_rate": 0.0001, + "loss": 1.1938, + "loss/crossentropy": 2.579697847366333, + "loss/hidden": 0.98828125, + "loss/logits": 0.17568713426589966, + "loss/reg": 0.0029817591421306133, + "step": 1801 + }, + { + "epoch": 0.22525, + "grad_norm": 3.553030252456665, + "grad_norm_var": 0.26371729729337307, + "learning_rate": 0.0001, + "loss": 1.1028, + "loss/crossentropy": 2.8150346279144287, + "loss/hidden": 0.93359375, + "loss/logits": 0.13943785429000854, + "loss/reg": 0.002980235731229186, + "step": 1802 + }, + { + "epoch": 0.225375, + "grad_norm": 3.208162307739258, + "grad_norm_var": 0.26197953760215476, + "learning_rate": 0.0001, + "loss": 1.1901, + "loss/crossentropy": 2.499155044555664, + "loss/hidden": 0.94921875, + "loss/logits": 0.21110975742340088, + "loss/reg": 0.002978700678795576, + "step": 1803 + }, + { + "epoch": 0.2255, + "grad_norm": 4.348999500274658, + "grad_norm_var": 0.3201132095155003, + "learning_rate": 0.0001, + "loss": 1.1629, + "loss/crossentropy": 2.4431777000427246, + "loss/hidden": 0.984375, + "loss/logits": 0.14872056245803833, + "loss/reg": 0.002977173076942563, + "step": 1804 + }, + { + "epoch": 0.225625, + "grad_norm": 2.438546895980835, + "grad_norm_var": 0.34138753432729513, + "learning_rate": 0.0001, + "loss": 1.0855, + "loss/crossentropy": 2.518237352371216, + "loss/hidden": 0.89453125, + "loss/logits": 0.16119888424873352, + "loss/reg": 0.002975636161863804, + "step": 1805 + }, + { + "epoch": 0.22575, + "grad_norm": 3.8823599815368652, + "grad_norm_var": 0.31363593562404785, + "learning_rate": 0.0001, + "loss": 1.1725, + "loss/crossentropy": 2.419356107711792, + "loss/hidden": 0.96875, + "loss/logits": 0.17398422956466675, + "loss/reg": 0.002974169095978141, + "step": 1806 + }, + { + "epoch": 0.225875, + "grad_norm": 5.898488521575928, + "grad_norm_var": 0.725838270489255, + "learning_rate": 0.0001, + "loss": 1.3984, + "loss/crossentropy": 2.538686990737915, + "loss/hidden": 1.140625, + "loss/logits": 0.22802415490150452, + "loss/reg": 0.002972749760374427, + "step": 1807 + }, + { + "epoch": 0.226, + "grad_norm": 2.616062879562378, + "grad_norm_var": 0.6846537892399804, + "learning_rate": 0.0001, + "loss": 0.9967, + "loss/crossentropy": 2.91412091255188, + "loss/hidden": 0.828125, + "loss/logits": 0.13887512683868408, + "loss/reg": 0.0029713378753513098, + "step": 1808 + }, + { + "epoch": 0.226125, + "grad_norm": 2.6786835193634033, + "grad_norm_var": 0.678929251874992, + "learning_rate": 0.0001, + "loss": 1.3028, + "loss/crossentropy": 2.530329465866089, + "loss/hidden": 1.0859375, + "loss/logits": 0.18713583052158356, + "loss/reg": 0.0029699981678277254, + "step": 1809 + }, + { + "epoch": 0.22625, + "grad_norm": 2.3305532932281494, + "grad_norm_var": 0.7486371828354244, + "learning_rate": 0.0001, + "loss": 1.1417, + "loss/crossentropy": 2.5374956130981445, + "loss/hidden": 0.9453125, + "loss/logits": 0.16669398546218872, + "loss/reg": 0.0029686312191188335, + "step": 1810 + }, + { + "epoch": 0.226375, + "grad_norm": 2.9841620922088623, + "grad_norm_var": 0.7551115699539401, + "learning_rate": 0.0001, + "loss": 1.1804, + "loss/crossentropy": 2.4051711559295654, + "loss/hidden": 0.97265625, + "loss/logits": 0.17810006439685822, + "loss/reg": 0.002967282198369503, + "step": 1811 + }, + { + "epoch": 0.2265, + "grad_norm": 1.9030566215515137, + "grad_norm_var": 0.885438078387665, + "learning_rate": 0.0001, + "loss": 1.0451, + "loss/crossentropy": 2.449751615524292, + "loss/hidden": 0.87109375, + "loss/logits": 0.14433653652668, + "loss/reg": 0.0029659466817975044, + "step": 1812 + }, + { + "epoch": 0.226625, + "grad_norm": 2.075324773788452, + "grad_norm_var": 0.9567700729341877, + "learning_rate": 0.0001, + "loss": 1.0281, + "loss/crossentropy": 2.4204511642456055, + "loss/hidden": 0.87109375, + "loss/logits": 0.12739630043506622, + "loss/reg": 0.0029646598268300295, + "step": 1813 + }, + { + "epoch": 0.22675, + "grad_norm": 2.0552451610565186, + "grad_norm_var": 1.0202742097321296, + "learning_rate": 0.0001, + "loss": 0.9888, + "loss/crossentropy": 2.424468755722046, + "loss/hidden": 0.81640625, + "loss/logits": 0.14275333285331726, + "loss/reg": 0.002963270992040634, + "step": 1814 + }, + { + "epoch": 0.226875, + "grad_norm": 2.0605251789093018, + "grad_norm_var": 1.0840480132956123, + "learning_rate": 0.0001, + "loss": 1.0656, + "loss/crossentropy": 2.625704050064087, + "loss/hidden": 0.89453125, + "loss/logits": 0.1414736658334732, + "loss/reg": 0.0029617997352033854, + "step": 1815 + }, + { + "epoch": 0.227, + "grad_norm": 2.0984017848968506, + "grad_norm_var": 1.1309350809822944, + "learning_rate": 0.0001, + "loss": 1.1418, + "loss/crossentropy": 2.2266194820404053, + "loss/hidden": 0.9453125, + "loss/logits": 0.16688337922096252, + "loss/reg": 0.002960240002721548, + "step": 1816 + }, + { + "epoch": 0.227125, + "grad_norm": 2.257150411605835, + "grad_norm_var": 1.1599327396837886, + "learning_rate": 0.0001, + "loss": 1.2117, + "loss/crossentropy": 2.2862818241119385, + "loss/hidden": 1.0078125, + "loss/logits": 0.1742808222770691, + "loss/reg": 0.0029587687458842993, + "step": 1817 + }, + { + "epoch": 0.22725, + "grad_norm": 2.0277411937713623, + "grad_norm_var": 1.1723884671928733, + "learning_rate": 0.0001, + "loss": 1.0665, + "loss/crossentropy": 2.4400901794433594, + "loss/hidden": 0.89453125, + "loss/logits": 0.14237791299819946, + "loss/reg": 0.0029572048224508762, + "step": 1818 + }, + { + "epoch": 0.227375, + "grad_norm": 2.6541121006011963, + "grad_norm_var": 1.161714891934859, + "learning_rate": 0.0001, + "loss": 1.0231, + "loss/crossentropy": 2.5243773460388184, + "loss/hidden": 0.85546875, + "loss/logits": 0.13802778720855713, + "loss/reg": 0.0029556897934526205, + "step": 1819 + }, + { + "epoch": 0.2275, + "grad_norm": 2.3991587162017822, + "grad_norm_var": 0.9886539748965065, + "learning_rate": 0.0001, + "loss": 1.3316, + "loss/crossentropy": 2.3047027587890625, + "loss/hidden": 1.1328125, + "loss/logits": 0.16926254332065582, + "loss/reg": 0.0029541929252445698, + "step": 1820 + }, + { + "epoch": 0.227625, + "grad_norm": 2.27185320854187, + "grad_norm_var": 0.9950342111305434, + "learning_rate": 0.0001, + "loss": 1.0846, + "loss/crossentropy": 2.3836841583251953, + "loss/hidden": 0.90625, + "loss/logits": 0.14881691336631775, + "loss/reg": 0.002952732378616929, + "step": 1821 + }, + { + "epoch": 0.22775, + "grad_norm": 2.8672397136688232, + "grad_norm_var": 0.8908872852448818, + "learning_rate": 0.0001, + "loss": 1.2423, + "loss/crossentropy": 2.2023377418518066, + "loss/hidden": 0.9921875, + "loss/logits": 0.22060072422027588, + "loss/reg": 0.0029511942993849516, + "step": 1822 + }, + { + "epoch": 0.227875, + "grad_norm": 2.2803525924682617, + "grad_norm_var": 0.10508732681826037, + "learning_rate": 0.0001, + "loss": 1.0304, + "loss/crossentropy": 2.372196912765503, + "loss/hidden": 0.8515625, + "loss/logits": 0.14933274686336517, + "loss/reg": 0.00294972350820899, + "step": 1823 + }, + { + "epoch": 0.228, + "grad_norm": 2.6830813884735107, + "grad_norm_var": 0.10776807926507198, + "learning_rate": 0.0001, + "loss": 1.1309, + "loss/crossentropy": 2.475736618041992, + "loss/hidden": 0.9296875, + "loss/logits": 0.17176774144172668, + "loss/reg": 0.002948229666799307, + "step": 1824 + }, + { + "epoch": 0.228125, + "grad_norm": 2.7077255249023438, + "grad_norm_var": 0.1090870968752443, + "learning_rate": 0.0001, + "loss": 1.1178, + "loss/crossentropy": 2.6619277000427246, + "loss/hidden": 0.92578125, + "loss/logits": 0.16253264248371124, + "loss/reg": 0.002946724882349372, + "step": 1825 + }, + { + "epoch": 0.22825, + "grad_norm": 2.7542712688446045, + "grad_norm_var": 0.11901288025463994, + "learning_rate": 0.0001, + "loss": 1.023, + "loss/crossentropy": 2.3113021850585938, + "loss/hidden": 0.84375, + "loss/logits": 0.14976277947425842, + "loss/reg": 0.00294527318328619, + "step": 1826 + }, + { + "epoch": 0.228375, + "grad_norm": 2.1749322414398193, + "grad_norm_var": 0.0947496886136868, + "learning_rate": 0.0001, + "loss": 1.076, + "loss/crossentropy": 2.3755033016204834, + "loss/hidden": 0.890625, + "loss/logits": 0.1559135913848877, + "loss/reg": 0.0029437614139169455, + "step": 1827 + }, + { + "epoch": 0.2285, + "grad_norm": 2.2916691303253174, + "grad_norm_var": 0.08209817483413247, + "learning_rate": 0.0001, + "loss": 1.2318, + "loss/crossentropy": 2.409095287322998, + "loss/hidden": 1.03125, + "loss/logits": 0.17112311720848083, + "loss/reg": 0.0029421907383948565, + "step": 1828 + }, + { + "epoch": 0.228625, + "grad_norm": 3.1720941066741943, + "grad_norm_var": 0.11657495418614777, + "learning_rate": 0.0001, + "loss": 1.0408, + "loss/crossentropy": 2.571678400039673, + "loss/hidden": 0.85546875, + "loss/logits": 0.15594975650310516, + "loss/reg": 0.0029407108668237925, + "step": 1829 + }, + { + "epoch": 0.22875, + "grad_norm": 3.709214448928833, + "grad_norm_var": 0.20662170797658444, + "learning_rate": 0.0001, + "loss": 1.4423, + "loss/crossentropy": 2.3802380561828613, + "loss/hidden": 1.2265625, + "loss/logits": 0.18629974126815796, + "loss/reg": 0.0029392328578978777, + "step": 1830 + }, + { + "epoch": 0.228875, + "grad_norm": 2.151437520980835, + "grad_norm_var": 0.20150086001236314, + "learning_rate": 0.0001, + "loss": 1.1299, + "loss/crossentropy": 2.409843921661377, + "loss/hidden": 0.9296875, + "loss/logits": 0.17083843052387238, + "loss/reg": 0.0029376852326095104, + "step": 1831 + }, + { + "epoch": 0.229, + "grad_norm": 2.108267307281494, + "grad_norm_var": 0.20093753742009024, + "learning_rate": 0.0001, + "loss": 1.0452, + "loss/crossentropy": 2.7673611640930176, + "loss/hidden": 0.86328125, + "loss/logits": 0.15255650877952576, + "loss/reg": 0.0029360908083617687, + "step": 1832 + }, + { + "epoch": 0.229125, + "grad_norm": 2.295905828475952, + "grad_norm_var": 0.19961170535207368, + "learning_rate": 0.0001, + "loss": 1.1137, + "loss/crossentropy": 2.4432260990142822, + "loss/hidden": 0.93359375, + "loss/logits": 0.15079358220100403, + "loss/reg": 0.002934559714049101, + "step": 1833 + }, + { + "epoch": 0.22925, + "grad_norm": 2.0841267108917236, + "grad_norm_var": 0.19600194880262786, + "learning_rate": 0.0001, + "loss": 1.1015, + "loss/crossentropy": 2.406099319458008, + "loss/hidden": 0.91015625, + "loss/logits": 0.1619967818260193, + "loss/reg": 0.002933042123913765, + "step": 1834 + }, + { + "epoch": 0.229375, + "grad_norm": 2.2038230895996094, + "grad_norm_var": 0.20169366112067166, + "learning_rate": 0.0001, + "loss": 1.1717, + "loss/crossentropy": 2.36592435836792, + "loss/hidden": 0.9765625, + "loss/logits": 0.1658266931772232, + "loss/reg": 0.0029314891435205936, + "step": 1835 + }, + { + "epoch": 0.2295, + "grad_norm": 4.329677581787109, + "grad_norm_var": 0.40617225913744975, + "learning_rate": 0.0001, + "loss": 1.0861, + "loss/crossentropy": 2.428100824356079, + "loss/hidden": 0.90625, + "loss/logits": 0.15050062537193298, + "loss/reg": 0.002929947804659605, + "step": 1836 + }, + { + "epoch": 0.229625, + "grad_norm": 2.962228536605835, + "grad_norm_var": 0.4029608323643475, + "learning_rate": 0.0001, + "loss": 1.2032, + "loss/crossentropy": 2.661451816558838, + "loss/hidden": 1.0078125, + "loss/logits": 0.166073739528656, + "loss/reg": 0.0029283969197422266, + "step": 1837 + }, + { + "epoch": 0.22975, + "grad_norm": 2.1242473125457764, + "grad_norm_var": 0.418270528733823, + "learning_rate": 0.0001, + "loss": 1.3265, + "loss/crossentropy": 2.3514444828033447, + "loss/hidden": 1.09375, + "loss/logits": 0.20345856249332428, + "loss/reg": 0.002926844172179699, + "step": 1838 + }, + { + "epoch": 0.229875, + "grad_norm": 2.6637930870056152, + "grad_norm_var": 0.4097338351488264, + "learning_rate": 0.0001, + "loss": 1.0729, + "loss/crossentropy": 2.8255255222320557, + "loss/hidden": 0.890625, + "loss/logits": 0.15303359925746918, + "loss/reg": 0.0029252341482788324, + "step": 1839 + }, + { + "epoch": 0.23, + "grad_norm": 2.8248424530029297, + "grad_norm_var": 0.4115956483187119, + "learning_rate": 0.0001, + "loss": 1.2172, + "loss/crossentropy": 2.730475902557373, + "loss/hidden": 0.9921875, + "loss/logits": 0.19581902027130127, + "loss/reg": 0.0029237025883048773, + "step": 1840 + }, + { + "epoch": 0.230125, + "grad_norm": 2.1636877059936523, + "grad_norm_var": 0.42662438202454495, + "learning_rate": 0.0001, + "loss": 1.2799, + "loss/crossentropy": 2.4928698539733887, + "loss/hidden": 1.078125, + "loss/logits": 0.1725054681301117, + "loss/reg": 0.002922156360000372, + "step": 1841 + }, + { + "epoch": 0.23025, + "grad_norm": 2.155371904373169, + "grad_norm_var": 0.4387901405468398, + "learning_rate": 0.0001, + "loss": 1.0823, + "loss/crossentropy": 2.8904709815979004, + "loss/hidden": 0.89453125, + "loss/logits": 0.15856406092643738, + "loss/reg": 0.002920587779954076, + "step": 1842 + }, + { + "epoch": 0.230375, + "grad_norm": 2.688028573989868, + "grad_norm_var": 0.4269539462286275, + "learning_rate": 0.0001, + "loss": 0.9496, + "loss/crossentropy": 2.479196548461914, + "loss/hidden": 0.80859375, + "loss/logits": 0.1118479073047638, + "loss/reg": 0.0029189754277467728, + "step": 1843 + }, + { + "epoch": 0.2305, + "grad_norm": 3.4530677795410156, + "grad_norm_var": 0.46033235618827817, + "learning_rate": 0.0001, + "loss": 1.3334, + "loss/crossentropy": 2.3263473510742188, + "loss/hidden": 1.109375, + "loss/logits": 0.19483482837677002, + "loss/reg": 0.0029173565562814474, + "step": 1844 + }, + { + "epoch": 0.230625, + "grad_norm": 2.624260425567627, + "grad_norm_var": 0.44410306117912624, + "learning_rate": 0.0001, + "loss": 1.1256, + "loss/crossentropy": 2.4004266262054443, + "loss/hidden": 0.921875, + "loss/logits": 0.1746046245098114, + "loss/reg": 0.00291584269143641, + "step": 1845 + }, + { + "epoch": 0.23075, + "grad_norm": 2.484194755554199, + "grad_norm_var": 0.36633673651388654, + "learning_rate": 0.0001, + "loss": 1.1565, + "loss/crossentropy": 2.500600814819336, + "loss/hidden": 0.96484375, + "loss/logits": 0.1624952256679535, + "loss/reg": 0.0029144061263650656, + "step": 1846 + }, + { + "epoch": 0.230875, + "grad_norm": 2.1600430011749268, + "grad_norm_var": 0.36584698292128315, + "learning_rate": 0.0001, + "loss": 1.0968, + "loss/crossentropy": 2.7809672355651855, + "loss/hidden": 0.9140625, + "loss/logits": 0.1535949558019638, + "loss/reg": 0.00291292741894722, + "step": 1847 + }, + { + "epoch": 0.231, + "grad_norm": 1.9736144542694092, + "grad_norm_var": 0.37550067856469693, + "learning_rate": 0.0001, + "loss": 1.0361, + "loss/crossentropy": 2.4563684463500977, + "loss/hidden": 0.86328125, + "loss/logits": 0.14372900128364563, + "loss/reg": 0.002911404939368367, + "step": 1848 + }, + { + "epoch": 0.231125, + "grad_norm": 2.2191903591156006, + "grad_norm_var": 0.3787174770815568, + "learning_rate": 0.0001, + "loss": 1.0114, + "loss/crossentropy": 2.5433239936828613, + "loss/hidden": 0.83984375, + "loss/logits": 0.14246006309986115, + "loss/reg": 0.002909915754571557, + "step": 1849 + }, + { + "epoch": 0.23125, + "grad_norm": 2.5526938438415527, + "grad_norm_var": 0.3621070968588713, + "learning_rate": 0.0001, + "loss": 0.9695, + "loss/crossentropy": 2.5453128814697266, + "loss/hidden": 0.8125, + "loss/logits": 0.1279653012752533, + "loss/reg": 0.0029083597473800182, + "step": 1850 + }, + { + "epoch": 0.231375, + "grad_norm": 1.9157322645187378, + "grad_norm_var": 0.38247098077205705, + "learning_rate": 0.0001, + "loss": 0.9632, + "loss/crossentropy": 2.626375913619995, + "loss/hidden": 0.796875, + "loss/logits": 0.13728055357933044, + "loss/reg": 0.0029068856965750456, + "step": 1851 + }, + { + "epoch": 0.2315, + "grad_norm": 2.353100538253784, + "grad_norm_var": 0.16577489550661723, + "learning_rate": 0.0001, + "loss": 1.1287, + "loss/crossentropy": 2.479382038116455, + "loss/hidden": 0.9296875, + "loss/logits": 0.16994883120059967, + "loss/reg": 0.0029053473845124245, + "step": 1852 + }, + { + "epoch": 0.231625, + "grad_norm": 2.6295785903930664, + "grad_norm_var": 0.1502992299825194, + "learning_rate": 0.0001, + "loss": 1.2396, + "loss/crossentropy": 2.2818477153778076, + "loss/hidden": 1.0390625, + "loss/logits": 0.17154169082641602, + "loss/reg": 0.002903790445998311, + "step": 1853 + }, + { + "epoch": 0.23175, + "grad_norm": 2.1487951278686523, + "grad_norm_var": 0.14931457999495443, + "learning_rate": 0.0001, + "loss": 1.0425, + "loss/crossentropy": 2.6548938751220703, + "loss/hidden": 0.86328125, + "loss/logits": 0.15023818612098694, + "loss/reg": 0.0029021373484283686, + "step": 1854 + }, + { + "epoch": 0.231875, + "grad_norm": 2.2083189487457275, + "grad_norm_var": 0.14857580667151063, + "learning_rate": 0.0001, + "loss": 1.0317, + "loss/crossentropy": 2.3384876251220703, + "loss/hidden": 0.8671875, + "loss/logits": 0.13553163409233093, + "loss/reg": 0.0029005296528339386, + "step": 1855 + }, + { + "epoch": 0.232, + "grad_norm": 1.848423957824707, + "grad_norm_var": 0.1541103110008151, + "learning_rate": 0.0001, + "loss": 0.9426, + "loss/crossentropy": 2.4241435527801514, + "loss/hidden": 0.8046875, + "loss/logits": 0.1089288517832756, + "loss/reg": 0.002899044193327427, + "step": 1856 + }, + { + "epoch": 0.232125, + "grad_norm": 2.036503553390503, + "grad_norm_var": 0.15825755313067677, + "learning_rate": 0.0001, + "loss": 0.9461, + "loss/crossentropy": 2.3014883995056152, + "loss/hidden": 0.78515625, + "loss/logits": 0.1319369375705719, + "loss/reg": 0.0028974406886845827, + "step": 1857 + }, + { + "epoch": 0.23225, + "grad_norm": 2.712230682373047, + "grad_norm_var": 0.16387938230163232, + "learning_rate": 0.0001, + "loss": 1.2105, + "loss/crossentropy": 2.365384101867676, + "loss/hidden": 1.0078125, + "loss/logits": 0.17370465397834778, + "loss/reg": 0.0028959375340491533, + "step": 1858 + }, + { + "epoch": 0.232375, + "grad_norm": 2.5608208179473877, + "grad_norm_var": 0.15958970126699837, + "learning_rate": 0.0001, + "loss": 1.1594, + "loss/crossentropy": 2.623105049133301, + "loss/hidden": 0.94140625, + "loss/logits": 0.1890484094619751, + "loss/reg": 0.002894355682656169, + "step": 1859 + }, + { + "epoch": 0.2325, + "grad_norm": 1.9694890975952148, + "grad_norm_var": 0.08242289833427534, + "learning_rate": 0.0001, + "loss": 0.8934, + "loss/crossentropy": 2.1551883220672607, + "loss/hidden": 0.74609375, + "loss/logits": 0.11839769035577774, + "loss/reg": 0.0028928006067872047, + "step": 1860 + }, + { + "epoch": 0.232625, + "grad_norm": 2.146641492843628, + "grad_norm_var": 0.0744266244705272, + "learning_rate": 0.0001, + "loss": 1.0166, + "loss/crossentropy": 2.694187641143799, + "loss/hidden": 0.85546875, + "loss/logits": 0.1322515606880188, + "loss/reg": 0.002891267416998744, + "step": 1861 + }, + { + "epoch": 0.23275, + "grad_norm": 2.412816286087036, + "grad_norm_var": 0.07246823357878984, + "learning_rate": 0.0001, + "loss": 1.182, + "loss/crossentropy": 2.708153009414673, + "loss/hidden": 0.96875, + "loss/logits": 0.1843324601650238, + "loss/reg": 0.0028897603042423725, + "step": 1862 + }, + { + "epoch": 0.232875, + "grad_norm": 3.3550024032592773, + "grad_norm_var": 0.14889475511776779, + "learning_rate": 0.0001, + "loss": 1.0168, + "loss/crossentropy": 2.240648031234741, + "loss/hidden": 0.8671875, + "loss/logits": 0.12075857073068619, + "loss/reg": 0.0028883127961307764, + "step": 1863 + }, + { + "epoch": 0.233, + "grad_norm": 2.539043664932251, + "grad_norm_var": 0.14312548265110311, + "learning_rate": 0.0001, + "loss": 0.9847, + "loss/crossentropy": 2.3216147422790527, + "loss/hidden": 0.828125, + "loss/logits": 0.12774166464805603, + "loss/reg": 0.0028867912478744984, + "step": 1864 + }, + { + "epoch": 0.233125, + "grad_norm": 2.358919382095337, + "grad_norm_var": 0.14189893172667017, + "learning_rate": 0.0001, + "loss": 1.0164, + "loss/crossentropy": 2.5843939781188965, + "loss/hidden": 0.83984375, + "loss/logits": 0.14774055778980255, + "loss/reg": 0.0028853206895291805, + "step": 1865 + }, + { + "epoch": 0.23325, + "grad_norm": 2.560542106628418, + "grad_norm_var": 0.14210520060771656, + "learning_rate": 0.0001, + "loss": 1.3539, + "loss/crossentropy": 2.2003164291381836, + "loss/hidden": 1.1015625, + "loss/logits": 0.22345313429832458, + "loss/reg": 0.002883851993829012, + "step": 1866 + }, + { + "epoch": 0.233375, + "grad_norm": 2.2669901847839355, + "grad_norm_var": 0.12902140426953868, + "learning_rate": 0.0001, + "loss": 1.0907, + "loss/crossentropy": 2.550511360168457, + "loss/hidden": 0.89453125, + "loss/logits": 0.16732323169708252, + "loss/reg": 0.0028824072796851397, + "step": 1867 + }, + { + "epoch": 0.2335, + "grad_norm": 1.9912503957748413, + "grad_norm_var": 0.1385847546259417, + "learning_rate": 0.0001, + "loss": 1.1119, + "loss/crossentropy": 2.4620280265808105, + "loss/hidden": 0.92578125, + "loss/logits": 0.15735240280628204, + "loss/reg": 0.0028808878269046545, + "step": 1868 + }, + { + "epoch": 0.233625, + "grad_norm": 2.5993523597717285, + "grad_norm_var": 0.13755172432265106, + "learning_rate": 0.0001, + "loss": 1.4922, + "loss/crossentropy": 2.358572244644165, + "loss/hidden": 1.2578125, + "loss/logits": 0.20562607049942017, + "loss/reg": 0.0028794193640351295, + "step": 1869 + }, + { + "epoch": 0.23375, + "grad_norm": 3.30731463432312, + "grad_norm_var": 0.18924561660283715, + "learning_rate": 0.0001, + "loss": 1.376, + "loss/crossentropy": 2.7302675247192383, + "loss/hidden": 1.140625, + "loss/logits": 0.20656052231788635, + "loss/reg": 0.0028778668493032455, + "step": 1870 + }, + { + "epoch": 0.233875, + "grad_norm": 2.8096041679382324, + "grad_norm_var": 0.19410140740735352, + "learning_rate": 0.0001, + "loss": 1.2133, + "loss/crossentropy": 2.322666645050049, + "loss/hidden": 1.0078125, + "loss/logits": 0.17673031985759735, + "loss/reg": 0.0028763674199581146, + "step": 1871 + }, + { + "epoch": 0.234, + "grad_norm": 2.766726493835449, + "grad_norm_var": 0.17104518125896953, + "learning_rate": 0.0001, + "loss": 1.194, + "loss/crossentropy": 2.258401393890381, + "loss/hidden": 0.99609375, + "loss/logits": 0.169195756316185, + "loss/reg": 0.0028748363256454468, + "step": 1872 + }, + { + "epoch": 0.234125, + "grad_norm": 2.1711134910583496, + "grad_norm_var": 0.16341771516509496, + "learning_rate": 0.0001, + "loss": 0.9274, + "loss/crossentropy": 2.3150506019592285, + "loss/hidden": 0.77734375, + "loss/logits": 0.12128670513629913, + "loss/reg": 0.002873300574719906, + "step": 1873 + }, + { + "epoch": 0.23425, + "grad_norm": 2.1112637519836426, + "grad_norm_var": 0.17162801880261083, + "learning_rate": 0.0001, + "loss": 0.9732, + "loss/crossentropy": 2.506507396697998, + "loss/hidden": 0.8046875, + "loss/logits": 0.13975682854652405, + "loss/reg": 0.0028718682006001472, + "step": 1874 + }, + { + "epoch": 0.234375, + "grad_norm": 2.183361053466797, + "grad_norm_var": 0.17724180763689687, + "learning_rate": 0.0001, + "loss": 1.1296, + "loss/crossentropy": 2.4888036251068115, + "loss/hidden": 0.93359375, + "loss/logits": 0.16730833053588867, + "loss/reg": 0.002870464464649558, + "step": 1875 + }, + { + "epoch": 0.2345, + "grad_norm": 2.735436201095581, + "grad_norm_var": 0.16260582148087882, + "learning_rate": 0.0001, + "loss": 1.1365, + "loss/crossentropy": 2.1528542041778564, + "loss/hidden": 0.96484375, + "loss/logits": 0.14295265078544617, + "loss/reg": 0.002869043732061982, + "step": 1876 + }, + { + "epoch": 0.234625, + "grad_norm": 3.999509572982788, + "grad_norm_var": 0.28500931963959975, + "learning_rate": 0.0001, + "loss": 1.3381, + "loss/crossentropy": 3.042414903640747, + "loss/hidden": 1.0703125, + "loss/logits": 0.23910346627235413, + "loss/reg": 0.0028676455840468407, + "step": 1877 + }, + { + "epoch": 0.23475, + "grad_norm": 2.6461031436920166, + "grad_norm_var": 0.2814837056327926, + "learning_rate": 0.0001, + "loss": 0.9627, + "loss/crossentropy": 2.62857985496521, + "loss/hidden": 0.7890625, + "loss/logits": 0.14501667022705078, + "loss/reg": 0.002866254420951009, + "step": 1878 + }, + { + "epoch": 0.234875, + "grad_norm": 2.356269359588623, + "grad_norm_var": 0.2499569691597026, + "learning_rate": 0.0001, + "loss": 1.1219, + "loss/crossentropy": 2.535068988800049, + "loss/hidden": 0.9453125, + "loss/logits": 0.14790667593479156, + "loss/reg": 0.002864871872588992, + "step": 1879 + }, + { + "epoch": 0.235, + "grad_norm": 2.321554660797119, + "grad_norm_var": 0.25432354819466757, + "learning_rate": 0.0001, + "loss": 1.0411, + "loss/crossentropy": 2.5561270713806152, + "loss/hidden": 0.86328125, + "loss/logits": 0.14915838837623596, + "loss/reg": 0.0028635053895413876, + "step": 1880 + }, + { + "epoch": 0.235125, + "grad_norm": 2.6328799724578857, + "grad_norm_var": 0.2511549738430517, + "learning_rate": 0.0001, + "loss": 1.2299, + "loss/crossentropy": 2.174438238143921, + "loss/hidden": 1.015625, + "loss/logits": 0.18564572930335999, + "loss/reg": 0.0028620159719139338, + "step": 1881 + }, + { + "epoch": 0.23525, + "grad_norm": 2.7941927909851074, + "grad_norm_var": 0.25361177630329473, + "learning_rate": 0.0001, + "loss": 1.0417, + "loss/crossentropy": 2.698789596557617, + "loss/hidden": 0.87109375, + "loss/logits": 0.14195041358470917, + "loss/reg": 0.0028606669511646032, + "step": 1882 + }, + { + "epoch": 0.235375, + "grad_norm": 2.9317786693573, + "grad_norm_var": 0.251201000396558, + "learning_rate": 0.0001, + "loss": 1.222, + "loss/crossentropy": 2.6760761737823486, + "loss/hidden": 1.015625, + "loss/logits": 0.17773698270320892, + "loss/reg": 0.0028592266608029604, + "step": 1883 + }, + { + "epoch": 0.2355, + "grad_norm": 2.170546293258667, + "grad_norm_var": 0.23752522799550563, + "learning_rate": 0.0001, + "loss": 1.0143, + "loss/crossentropy": 2.390109062194824, + "loss/hidden": 0.83984375, + "loss/logits": 0.14584662020206451, + "loss/reg": 0.0028577372431755066, + "step": 1884 + }, + { + "epoch": 0.235625, + "grad_norm": 3.2109017372131348, + "grad_norm_var": 0.25607174442198255, + "learning_rate": 0.0001, + "loss": 1.0177, + "loss/crossentropy": 2.848381519317627, + "loss/hidden": 0.83203125, + "loss/logits": 0.1571502387523651, + "loss/reg": 0.0028563509695231915, + "step": 1885 + }, + { + "epoch": 0.23575, + "grad_norm": 3.3898189067840576, + "grad_norm_var": 0.2632133556348774, + "learning_rate": 0.0001, + "loss": 0.9715, + "loss/crossentropy": 2.551968574523926, + "loss/hidden": 0.8046875, + "loss/logits": 0.13831113278865814, + "loss/reg": 0.0028548440895974636, + "step": 1886 + }, + { + "epoch": 0.235875, + "grad_norm": 2.2735543251037598, + "grad_norm_var": 0.27347767108519155, + "learning_rate": 0.0001, + "loss": 1.0096, + "loss/crossentropy": 2.356642484664917, + "loss/hidden": 0.84765625, + "loss/logits": 0.1334502100944519, + "loss/reg": 0.0028533469885587692, + "step": 1887 + }, + { + "epoch": 0.236, + "grad_norm": 2.8403327465057373, + "grad_norm_var": 0.274780906820475, + "learning_rate": 0.0001, + "loss": 1.1044, + "loss/crossentropy": 2.49477481842041, + "loss/hidden": 0.89453125, + "loss/logits": 0.18135184049606323, + "loss/reg": 0.002851872704923153, + "step": 1888 + }, + { + "epoch": 0.236125, + "grad_norm": 2.4770002365112305, + "grad_norm_var": 0.26015786291882914, + "learning_rate": 0.0001, + "loss": 1.1539, + "loss/crossentropy": 2.3961706161499023, + "loss/hidden": 0.9765625, + "loss/logits": 0.14887994527816772, + "loss/reg": 0.002850309479981661, + "step": 1889 + }, + { + "epoch": 0.23625, + "grad_norm": 97.88726806640625, + "grad_norm_var": 566.1572677979839, + "learning_rate": 0.0001, + "loss": 1.2242, + "loss/crossentropy": 2.2282721996307373, + "loss/hidden": 1.0234375, + "loss/logits": 0.17226368188858032, + "loss/reg": 0.002848886651918292, + "step": 1890 + }, + { + "epoch": 0.236375, + "grad_norm": 2.3098387718200684, + "grad_norm_var": 566.048741327807, + "learning_rate": 0.0001, + "loss": 1.0645, + "loss/crossentropy": 2.4957542419433594, + "loss/hidden": 0.88671875, + "loss/logits": 0.14935660362243652, + "loss/reg": 0.0028473958373069763, + "step": 1891 + }, + { + "epoch": 0.2365, + "grad_norm": 3.0709023475646973, + "grad_norm_var": 565.7896104746242, + "learning_rate": 0.0001, + "loss": 1.0863, + "loss/crossentropy": 2.701103925704956, + "loss/hidden": 0.90234375, + "loss/logits": 0.15548059344291687, + "loss/reg": 0.0028457811567932367, + "step": 1892 + }, + { + "epoch": 0.236625, + "grad_norm": 4.502190113067627, + "grad_norm_var": 565.4898863883287, + "learning_rate": 0.0001, + "loss": 1.4863, + "loss/crossentropy": 2.418233871459961, + "loss/hidden": 1.265625, + "loss/logits": 0.1922321766614914, + "loss/reg": 0.002844167174771428, + "step": 1893 + }, + { + "epoch": 0.23675, + "grad_norm": 3.704207420349121, + "grad_norm_var": 564.7003492594727, + "learning_rate": 0.0001, + "loss": 1.2931, + "loss/crossentropy": 2.34171462059021, + "loss/hidden": 1.078125, + "loss/logits": 0.18653494119644165, + "loss/reg": 0.0028426784556359053, + "step": 1894 + }, + { + "epoch": 0.236875, + "grad_norm": 4.64210319519043, + "grad_norm_var": 563.0616126406595, + "learning_rate": 0.0001, + "loss": 1.3692, + "loss/crossentropy": 2.5464377403259277, + "loss/hidden": 1.140625, + "loss/logits": 0.2001732736825943, + "loss/reg": 0.0028411895036697388, + "step": 1895 + }, + { + "epoch": 0.237, + "grad_norm": 2.472402572631836, + "grad_norm_var": 562.9297680002472, + "learning_rate": 0.0001, + "loss": 1.1043, + "loss/crossentropy": 2.488330602645874, + "loss/hidden": 0.9140625, + "loss/logits": 0.1618151217699051, + "loss/reg": 0.002839608583599329, + "step": 1896 + }, + { + "epoch": 0.237125, + "grad_norm": 3.018164873123169, + "grad_norm_var": 562.6141740686131, + "learning_rate": 0.0001, + "loss": 1.0354, + "loss/crossentropy": 2.5816335678100586, + "loss/hidden": 0.8671875, + "loss/logits": 0.13983488082885742, + "loss/reg": 0.0028381715528666973, + "step": 1897 + }, + { + "epoch": 0.23725, + "grad_norm": 3.750331401824951, + "grad_norm_var": 561.8825919502568, + "learning_rate": 0.0001, + "loss": 1.1906, + "loss/crossentropy": 2.758951187133789, + "loss/hidden": 0.98828125, + "loss/logits": 0.17395856976509094, + "loss/reg": 0.002836685860529542, + "step": 1898 + }, + { + "epoch": 0.237375, + "grad_norm": 2.309964895248413, + "grad_norm_var": 562.4132399812779, + "learning_rate": 0.0001, + "loss": 1.1765, + "loss/crossentropy": 2.7613322734832764, + "loss/hidden": 0.96875, + "loss/logits": 0.17939506471157074, + "loss/reg": 0.0028351792134344578, + "step": 1899 + }, + { + "epoch": 0.2375, + "grad_norm": 3.629230499267578, + "grad_norm_var": 561.2175971903463, + "learning_rate": 0.0001, + "loss": 1.1221, + "loss/crossentropy": 2.351802110671997, + "loss/hidden": 0.94921875, + "loss/logits": 0.1445481777191162, + "loss/reg": 0.002833602949976921, + "step": 1900 + }, + { + "epoch": 0.237625, + "grad_norm": 2.0922610759735107, + "grad_norm_var": 562.1731362143731, + "learning_rate": 0.0001, + "loss": 0.9687, + "loss/crossentropy": 2.6160526275634766, + "loss/hidden": 0.80859375, + "loss/logits": 0.13180279731750488, + "loss/reg": 0.0028320997953414917, + "step": 1901 + }, + { + "epoch": 0.23775, + "grad_norm": 2.7156622409820557, + "grad_norm_var": 562.7079033711701, + "learning_rate": 0.0001, + "loss": 1.1645, + "loss/crossentropy": 2.891047239303589, + "loss/hidden": 0.9453125, + "loss/logits": 0.19086427986621857, + "loss/reg": 0.0028305284213274717, + "step": 1902 + }, + { + "epoch": 0.237875, + "grad_norm": 2.3164191246032715, + "grad_norm_var": 562.6696833086194, + "learning_rate": 0.0001, + "loss": 1.1205, + "loss/crossentropy": 2.3813233375549316, + "loss/hidden": 0.9453125, + "loss/logits": 0.1469373106956482, + "loss/reg": 0.0028289342299103737, + "step": 1903 + }, + { + "epoch": 0.238, + "grad_norm": 2.695955991744995, + "grad_norm_var": 562.7892462486658, + "learning_rate": 0.0001, + "loss": 0.9631, + "loss/crossentropy": 2.7349748611450195, + "loss/hidden": 0.80078125, + "loss/logits": 0.13401402533054352, + "loss/reg": 0.0028272622730582952, + "step": 1904 + }, + { + "epoch": 0.238125, + "grad_norm": 2.295044422149658, + "grad_norm_var": 562.9489527602544, + "learning_rate": 0.0001, + "loss": 1.0814, + "loss/crossentropy": 2.499460458755493, + "loss/hidden": 0.91796875, + "loss/logits": 0.1352241486310959, + "loss/reg": 0.0028255698271095753, + "step": 1905 + }, + { + "epoch": 0.23825, + "grad_norm": 2.244823455810547, + "grad_norm_var": 0.6781732251602759, + "learning_rate": 0.0001, + "loss": 1.0582, + "loss/crossentropy": 2.587311029434204, + "loss/hidden": 0.8828125, + "loss/logits": 0.14716576039791107, + "loss/reg": 0.0028240818064659834, + "step": 1906 + }, + { + "epoch": 0.238375, + "grad_norm": 2.270704507827759, + "grad_norm_var": 0.6817949672684023, + "learning_rate": 0.0001, + "loss": 1.1391, + "loss/crossentropy": 2.707805633544922, + "loss/hidden": 0.93359375, + "loss/logits": 0.1773141622543335, + "loss/reg": 0.0028225905261933804, + "step": 1907 + }, + { + "epoch": 0.2385, + "grad_norm": 3.1557998657226562, + "grad_norm_var": 0.6832387916335013, + "learning_rate": 0.0001, + "loss": 1.1169, + "loss/crossentropy": 2.439981460571289, + "loss/hidden": 0.93359375, + "loss/logits": 0.15508322417736053, + "loss/reg": 0.0028210440650582314, + "step": 1908 + }, + { + "epoch": 0.238625, + "grad_norm": 2.238996982574463, + "grad_norm_var": 0.54658289647939, + "learning_rate": 0.0001, + "loss": 1.0766, + "loss/crossentropy": 2.543613910675049, + "loss/hidden": 0.890625, + "loss/logits": 0.1577475666999817, + "loss/reg": 0.0028194894548505545, + "step": 1909 + }, + { + "epoch": 0.23875, + "grad_norm": 2.3347530364990234, + "grad_norm_var": 0.5072756946952609, + "learning_rate": 0.0001, + "loss": 1.1956, + "loss/crossentropy": 2.391685962677002, + "loss/hidden": 1.0, + "loss/logits": 0.16745620965957642, + "loss/reg": 0.002817926462739706, + "step": 1910 + }, + { + "epoch": 0.238875, + "grad_norm": 3.038079261779785, + "grad_norm_var": 0.26585868434679016, + "learning_rate": 0.0001, + "loss": 1.1833, + "loss/crossentropy": 2.6581690311431885, + "loss/hidden": 0.9609375, + "loss/logits": 0.19423067569732666, + "loss/reg": 0.00281645474024117, + "step": 1911 + }, + { + "epoch": 0.239, + "grad_norm": 2.801513671875, + "grad_norm_var": 0.26434526750178977, + "learning_rate": 0.0001, + "loss": 1.2958, + "loss/crossentropy": 2.2399282455444336, + "loss/hidden": 1.0859375, + "loss/logits": 0.18169432878494263, + "loss/reg": 0.002814988372847438, + "step": 1912 + }, + { + "epoch": 0.239125, + "grad_norm": 2.835973024368286, + "grad_norm_var": 0.2582471639147646, + "learning_rate": 0.0001, + "loss": 1.2229, + "loss/crossentropy": 2.8033790588378906, + "loss/hidden": 1.0, + "loss/logits": 0.19473232328891754, + "loss/reg": 0.0028135550674051046, + "step": 1913 + }, + { + "epoch": 0.23925, + "grad_norm": 2.284269094467163, + "grad_norm_var": 0.18147043790236214, + "learning_rate": 0.0001, + "loss": 1.0563, + "loss/crossentropy": 2.4895052909851074, + "loss/hidden": 0.875, + "loss/logits": 0.15318460762500763, + "loss/reg": 0.0028121541254222393, + "step": 1914 + }, + { + "epoch": 0.239375, + "grad_norm": 2.1640424728393555, + "grad_norm_var": 0.18803017488825446, + "learning_rate": 0.0001, + "loss": 1.0567, + "loss/crossentropy": 2.5001635551452637, + "loss/hidden": 0.8828125, + "loss/logits": 0.14573311805725098, + "loss/reg": 0.0028106593526899815, + "step": 1915 + }, + { + "epoch": 0.2395, + "grad_norm": 1.9871270656585693, + "grad_norm_var": 0.12455762918461702, + "learning_rate": 0.0001, + "loss": 1.0899, + "loss/crossentropy": 2.402726888656616, + "loss/hidden": 0.9140625, + "loss/logits": 0.14770221710205078, + "loss/reg": 0.0028093019500374794, + "step": 1916 + }, + { + "epoch": 0.239625, + "grad_norm": 1.9124521017074585, + "grad_norm_var": 0.13556166178302545, + "learning_rate": 0.0001, + "loss": 1.0272, + "loss/crossentropy": 2.475602388381958, + "loss/hidden": 0.8515625, + "loss/logits": 0.14752693474292755, + "loss/reg": 0.002807790180668235, + "step": 1917 + }, + { + "epoch": 0.23975, + "grad_norm": 2.6607353687286377, + "grad_norm_var": 0.1338465573837538, + "learning_rate": 0.0001, + "loss": 1.0073, + "loss/crossentropy": 2.5274460315704346, + "loss/hidden": 0.8359375, + "loss/logits": 0.14331699907779694, + "loss/reg": 0.0028062344063073397, + "step": 1918 + }, + { + "epoch": 0.239875, + "grad_norm": 2.725597381591797, + "grad_norm_var": 0.13689784558561602, + "learning_rate": 0.0001, + "loss": 1.3288, + "loss/crossentropy": 2.164072275161743, + "loss/hidden": 1.109375, + "loss/logits": 0.19133153557777405, + "loss/reg": 0.002804698422551155, + "step": 1919 + }, + { + "epoch": 0.24, + "grad_norm": 2.5143990516662598, + "grad_norm_var": 0.13367861240944112, + "learning_rate": 0.0001, + "loss": 1.1068, + "loss/crossentropy": 2.6517221927642822, + "loss/hidden": 0.91015625, + "loss/logits": 0.16858091950416565, + "loss/reg": 0.002803155919536948, + "step": 1920 + }, + { + "epoch": 0.240125, + "grad_norm": 4.250274658203125, + "grad_norm_var": 0.32790836135060264, + "learning_rate": 0.0001, + "loss": 1.2321, + "loss/crossentropy": 2.756089925765991, + "loss/hidden": 0.9375, + "loss/logits": 0.2666040062904358, + "loss/reg": 0.00280156172811985, + "step": 1921 + }, + { + "epoch": 0.24025, + "grad_norm": 3.579580307006836, + "grad_norm_var": 0.3780541826973238, + "learning_rate": 0.0001, + "loss": 1.1615, + "loss/crossentropy": 2.4776012897491455, + "loss/hidden": 0.95703125, + "loss/logits": 0.17651261389255524, + "loss/reg": 0.002799983136355877, + "step": 1922 + }, + { + "epoch": 0.240375, + "grad_norm": 2.552597761154175, + "grad_norm_var": 0.3679322737687584, + "learning_rate": 0.0001, + "loss": 1.1309, + "loss/crossentropy": 2.535444498062134, + "loss/hidden": 0.9453125, + "loss/logits": 0.1575796902179718, + "loss/reg": 0.0027983970940113068, + "step": 1923 + }, + { + "epoch": 0.2405, + "grad_norm": 2.1624505519866943, + "grad_norm_var": 0.3678785758486583, + "learning_rate": 0.0001, + "loss": 1.2067, + "loss/crossentropy": 2.214770555496216, + "loss/hidden": 1.0234375, + "loss/logits": 0.15530873835086823, + "loss/reg": 0.0027969072107225657, + "step": 1924 + }, + { + "epoch": 0.240625, + "grad_norm": 2.1517112255096436, + "grad_norm_var": 0.3728782554598296, + "learning_rate": 0.0001, + "loss": 1.0171, + "loss/crossentropy": 2.520151138305664, + "loss/hidden": 0.84765625, + "loss/logits": 0.14149996638298035, + "loss/reg": 0.002795466920360923, + "step": 1925 + }, + { + "epoch": 0.24075, + "grad_norm": 2.09903883934021, + "grad_norm_var": 0.3853855727658185, + "learning_rate": 0.0001, + "loss": 1.0477, + "loss/crossentropy": 2.3022382259368896, + "loss/hidden": 0.875, + "loss/logits": 0.14477625489234924, + "loss/reg": 0.0027940254658460617, + "step": 1926 + }, + { + "epoch": 0.240875, + "grad_norm": 2.2137794494628906, + "grad_norm_var": 0.3805278519877115, + "learning_rate": 0.0001, + "loss": 1.0049, + "loss/crossentropy": 2.397075653076172, + "loss/hidden": 0.84375, + "loss/logits": 0.13318374752998352, + "loss/reg": 0.00279267062433064, + "step": 1927 + }, + { + "epoch": 0.241, + "grad_norm": 2.1640615463256836, + "grad_norm_var": 0.3850549000224494, + "learning_rate": 0.0001, + "loss": 1.2428, + "loss/crossentropy": 2.292894124984741, + "loss/hidden": 1.0390625, + "loss/logits": 0.17587056756019592, + "loss/reg": 0.002791155595332384, + "step": 1928 + }, + { + "epoch": 0.241125, + "grad_norm": 2.0617549419403076, + "grad_norm_var": 0.38950121594236903, + "learning_rate": 0.0001, + "loss": 1.091, + "loss/crossentropy": 2.3114614486694336, + "loss/hidden": 0.9140625, + "loss/logits": 0.14906063675880432, + "loss/reg": 0.0027897644322365522, + "step": 1929 + }, + { + "epoch": 0.24125, + "grad_norm": 2.2500698566436768, + "grad_norm_var": 0.3904109329361792, + "learning_rate": 0.0001, + "loss": 1.0731, + "loss/crossentropy": 2.530860662460327, + "loss/hidden": 0.8984375, + "loss/logits": 0.14675912261009216, + "loss/reg": 0.0027884161099791527, + "step": 1930 + }, + { + "epoch": 0.241375, + "grad_norm": 3.118149518966675, + "grad_norm_var": 0.40894295029893557, + "learning_rate": 0.0001, + "loss": 1.0974, + "loss/crossentropy": 2.5993683338165283, + "loss/hidden": 0.90625, + "loss/logits": 0.16328555345535278, + "loss/reg": 0.0027870717458426952, + "step": 1931 + }, + { + "epoch": 0.2415, + "grad_norm": 2.231571912765503, + "grad_norm_var": 0.39513912896007114, + "learning_rate": 0.0001, + "loss": 1.0225, + "loss/crossentropy": 2.673400402069092, + "loss/hidden": 0.85546875, + "loss/logits": 0.13916508853435516, + "loss/reg": 0.0027857308741658926, + "step": 1932 + }, + { + "epoch": 0.241625, + "grad_norm": 2.2554473876953125, + "grad_norm_var": 0.3737690186064941, + "learning_rate": 0.0001, + "loss": 1.1379, + "loss/crossentropy": 2.538700819015503, + "loss/hidden": 0.9453125, + "loss/logits": 0.16477283835411072, + "loss/reg": 0.002784265670925379, + "step": 1933 + }, + { + "epoch": 0.24175, + "grad_norm": 2.140296459197998, + "grad_norm_var": 0.3838427455168045, + "learning_rate": 0.0001, + "loss": 1.0912, + "loss/crossentropy": 2.588484287261963, + "loss/hidden": 0.8984375, + "loss/logits": 0.1649562418460846, + "loss/reg": 0.002782786963507533, + "step": 1934 + }, + { + "epoch": 0.241875, + "grad_norm": 2.2381234169006348, + "grad_norm_var": 0.38594407304694867, + "learning_rate": 0.0001, + "loss": 1.0233, + "loss/crossentropy": 2.5911705493927, + "loss/hidden": 0.859375, + "loss/logits": 0.13616088032722473, + "loss/reg": 0.0027812945190817118, + "step": 1935 + }, + { + "epoch": 0.242, + "grad_norm": 1.9112108945846558, + "grad_norm_var": 0.40744186602944354, + "learning_rate": 0.0001, + "loss": 0.9393, + "loss/crossentropy": 2.5094900131225586, + "loss/hidden": 0.78515625, + "loss/logits": 0.12632881104946136, + "loss/reg": 0.002779774833470583, + "step": 1936 + }, + { + "epoch": 0.242125, + "grad_norm": 5.132593631744385, + "grad_norm_var": 0.6665618029327437, + "learning_rate": 0.0001, + "loss": 1.2427, + "loss/crossentropy": 2.5614383220672607, + "loss/hidden": 1.0703125, + "loss/logits": 0.144596129655838, + "loss/reg": 0.002778239781036973, + "step": 1937 + }, + { + "epoch": 0.24225, + "grad_norm": 3.1231844425201416, + "grad_norm_var": 0.6148830410155909, + "learning_rate": 0.0001, + "loss": 0.8716, + "loss/crossentropy": 2.501601457595825, + "loss/hidden": 0.734375, + "loss/logits": 0.1094457358121872, + "loss/reg": 0.002776721026748419, + "step": 1938 + }, + { + "epoch": 0.242375, + "grad_norm": 2.5960471630096436, + "grad_norm_var": 0.6153759718928247, + "learning_rate": 0.0001, + "loss": 1.1199, + "loss/crossentropy": 2.5568716526031494, + "loss/hidden": 0.91796875, + "loss/logits": 0.17422887682914734, + "loss/reg": 0.0027751729357987642, + "step": 1939 + }, + { + "epoch": 0.2425, + "grad_norm": 2.3119328022003174, + "grad_norm_var": 0.6102323306014952, + "learning_rate": 0.0001, + "loss": 0.9679, + "loss/crossentropy": 2.3882250785827637, + "loss/hidden": 0.8046875, + "loss/logits": 0.13546867668628693, + "loss/reg": 0.0027736674528568983, + "step": 1940 + }, + { + "epoch": 0.242625, + "grad_norm": 4.160861015319824, + "grad_norm_var": 0.7692402881847016, + "learning_rate": 0.0001, + "loss": 1.2181, + "loss/crossentropy": 2.2808122634887695, + "loss/hidden": 0.984375, + "loss/logits": 0.2060256004333496, + "loss/reg": 0.0027721913065761328, + "step": 1941 + }, + { + "epoch": 0.24275, + "grad_norm": 2.8490424156188965, + "grad_norm_var": 0.7517497358643694, + "learning_rate": 0.0001, + "loss": 1.2525, + "loss/crossentropy": 2.345285415649414, + "loss/hidden": 1.03125, + "loss/logits": 0.19358152151107788, + "loss/reg": 0.0027707451954483986, + "step": 1942 + }, + { + "epoch": 0.242875, + "grad_norm": 7.469876289367676, + "grad_norm_var": 2.157014120724763, + "learning_rate": 0.0001, + "loss": 1.4803, + "loss/crossentropy": 2.0654871463775635, + "loss/hidden": 1.25, + "loss/logits": 0.20259954035282135, + "loss/reg": 0.002769321436062455, + "step": 1943 + }, + { + "epoch": 0.243, + "grad_norm": 2.437960624694824, + "grad_norm_var": 2.131142079716708, + "learning_rate": 0.0001, + "loss": 1.1002, + "loss/crossentropy": 2.231019973754883, + "loss/hidden": 0.921875, + "loss/logits": 0.1506144404411316, + "loss/reg": 0.002767904195934534, + "step": 1944 + }, + { + "epoch": 0.243125, + "grad_norm": 3.1553590297698975, + "grad_norm_var": 2.066455279052258, + "learning_rate": 0.0001, + "loss": 1.1744, + "loss/crossentropy": 2.465205669403076, + "loss/hidden": 0.95703125, + "loss/logits": 0.18965642154216766, + "loss/reg": 0.0027665095403790474, + "step": 1945 + }, + { + "epoch": 0.24325, + "grad_norm": 2.4227399826049805, + "grad_norm_var": 2.0490651192590494, + "learning_rate": 0.0001, + "loss": 1.2199, + "loss/crossentropy": 2.8017942905426025, + "loss/hidden": 0.9921875, + "loss/logits": 0.20010419189929962, + "loss/reg": 0.0027651283890008926, + "step": 1946 + }, + { + "epoch": 0.243375, + "grad_norm": 1.9351319074630737, + "grad_norm_var": 2.1332233829394576, + "learning_rate": 0.0001, + "loss": 1.1051, + "loss/crossentropy": 2.385049819946289, + "loss/hidden": 0.921875, + "loss/logits": 0.15559425950050354, + "loss/reg": 0.002763670403510332, + "step": 1947 + }, + { + "epoch": 0.2435, + "grad_norm": 2.439415454864502, + "grad_norm_var": 2.1139850344569364, + "learning_rate": 0.0001, + "loss": 1.0697, + "loss/crossentropy": 2.5390353202819824, + "loss/hidden": 0.859375, + "loss/logits": 0.18273773789405823, + "loss/reg": 0.002762230345979333, + "step": 1948 + }, + { + "epoch": 0.243625, + "grad_norm": 3.9348716735839844, + "grad_norm_var": 2.1154351813563985, + "learning_rate": 0.0001, + "loss": 1.5285, + "loss/crossentropy": 2.544994354248047, + "loss/hidden": 1.2265625, + "loss/logits": 0.27432870864868164, + "loss/reg": 0.0027608247473835945, + "step": 1949 + }, + { + "epoch": 0.24375, + "grad_norm": 2.6699066162109375, + "grad_norm_var": 2.0622895626261593, + "learning_rate": 0.0001, + "loss": 1.1582, + "loss/crossentropy": 2.3712096214294434, + "loss/hidden": 0.9609375, + "loss/logits": 0.16963256895542145, + "loss/reg": 0.002759524155408144, + "step": 1950 + }, + { + "epoch": 0.243875, + "grad_norm": 2.510438919067383, + "grad_norm_var": 2.032934141151713, + "learning_rate": 0.0001, + "loss": 1.1192, + "loss/crossentropy": 2.6832380294799805, + "loss/hidden": 0.9140625, + "loss/logits": 0.17755383253097534, + "loss/reg": 0.002758244751021266, + "step": 1951 + }, + { + "epoch": 0.244, + "grad_norm": 2.703141689300537, + "grad_norm_var": 1.9369671914291076, + "learning_rate": 0.0001, + "loss": 1.2835, + "loss/crossentropy": 2.1617352962493896, + "loss/hidden": 1.0625, + "loss/logits": 0.1934407502412796, + "loss/reg": 0.002756967907771468, + "step": 1952 + }, + { + "epoch": 0.244125, + "grad_norm": 2.3727617263793945, + "grad_norm_var": 1.7168647286460454, + "learning_rate": 0.0001, + "loss": 1.1945, + "loss/crossentropy": 2.636251211166382, + "loss/hidden": 0.9765625, + "loss/logits": 0.19042374193668365, + "loss/reg": 0.0027555141132324934, + "step": 1953 + }, + { + "epoch": 0.24425, + "grad_norm": 2.1022789478302, + "grad_norm_var": 1.774533228862542, + "learning_rate": 0.0001, + "loss": 1.0134, + "loss/crossentropy": 2.517247200012207, + "loss/hidden": 0.85546875, + "loss/logits": 0.13039694726467133, + "loss/reg": 0.0027542109601199627, + "step": 1954 + }, + { + "epoch": 0.244375, + "grad_norm": 2.1997644901275635, + "grad_norm_var": 1.8059291585278145, + "learning_rate": 0.0001, + "loss": 1.0473, + "loss/crossentropy": 2.655820608139038, + "loss/hidden": 0.87109375, + "loss/logits": 0.1486739218235016, + "loss/reg": 0.002752919914200902, + "step": 1955 + }, + { + "epoch": 0.2445, + "grad_norm": 2.6467058658599854, + "grad_norm_var": 1.7831262007346442, + "learning_rate": 0.0001, + "loss": 1.3302, + "loss/crossentropy": 2.114410400390625, + "loss/hidden": 1.125, + "loss/logits": 0.17764800786972046, + "loss/reg": 0.0027516759000718594, + "step": 1956 + }, + { + "epoch": 0.244625, + "grad_norm": 2.7396280765533447, + "grad_norm_var": 1.68951109645124, + "learning_rate": 0.0001, + "loss": 1.1134, + "loss/crossentropy": 2.5355465412139893, + "loss/hidden": 0.921875, + "loss/logits": 0.16403117775917053, + "loss/reg": 0.002750229090452194, + "step": 1957 + }, + { + "epoch": 0.24475, + "grad_norm": 2.410583019256592, + "grad_norm_var": 1.7051962159964043, + "learning_rate": 0.0001, + "loss": 1.1862, + "loss/crossentropy": 2.0772080421447754, + "loss/hidden": 1.015625, + "loss/logits": 0.1430494487285614, + "loss/reg": 0.0027487878687679768, + "step": 1958 + }, + { + "epoch": 0.244875, + "grad_norm": 2.3453474044799805, + "grad_norm_var": 0.21338224169162548, + "learning_rate": 0.0001, + "loss": 0.9842, + "loss/crossentropy": 2.786540985107422, + "loss/hidden": 0.80859375, + "loss/logits": 0.14812374114990234, + "loss/reg": 0.002747328719124198, + "step": 1959 + }, + { + "epoch": 0.245, + "grad_norm": 3.010045051574707, + "grad_norm_var": 0.22421355318186417, + "learning_rate": 0.0001, + "loss": 1.2039, + "loss/crossentropy": 2.6426658630371094, + "loss/hidden": 0.953125, + "loss/logits": 0.22334754467010498, + "loss/reg": 0.0027458607219159603, + "step": 1960 + }, + { + "epoch": 0.245125, + "grad_norm": 2.431356191635132, + "grad_norm_var": 0.20335259794886623, + "learning_rate": 0.0001, + "loss": 1.0896, + "loss/crossentropy": 2.223994255065918, + "loss/hidden": 0.921875, + "loss/logits": 0.14024955034255981, + "loss/reg": 0.0027444439474493265, + "step": 1961 + }, + { + "epoch": 0.24525, + "grad_norm": 2.264490842819214, + "grad_norm_var": 0.20770068539455805, + "learning_rate": 0.0001, + "loss": 0.9975, + "loss/crossentropy": 2.5207021236419678, + "loss/hidden": 0.83203125, + "loss/logits": 0.13799090683460236, + "loss/reg": 0.0027429983019828796, + "step": 1962 + }, + { + "epoch": 0.245375, + "grad_norm": 2.420308828353882, + "grad_norm_var": 0.18297715933091632, + "learning_rate": 0.0001, + "loss": 1.2007, + "loss/crossentropy": 2.4860877990722656, + "loss/hidden": 0.9921875, + "loss/logits": 0.1810786873102188, + "loss/reg": 0.00274151680059731, + "step": 1963 + }, + { + "epoch": 0.2455, + "grad_norm": 2.414461851119995, + "grad_norm_var": 0.18346740397452094, + "learning_rate": 0.0001, + "loss": 1.071, + "loss/crossentropy": 2.4090845584869385, + "loss/hidden": 0.90234375, + "loss/logits": 0.14120522141456604, + "loss/reg": 0.0027401153929531574, + "step": 1964 + }, + { + "epoch": 0.245625, + "grad_norm": 2.4071054458618164, + "grad_norm_var": 0.05203356240904213, + "learning_rate": 0.0001, + "loss": 1.0379, + "loss/crossentropy": 2.6016945838928223, + "loss/hidden": 0.86328125, + "loss/logits": 0.14726971089839935, + "loss/reg": 0.0027385957073420286, + "step": 1965 + }, + { + "epoch": 0.24575, + "grad_norm": 2.428882360458374, + "grad_norm_var": 0.04949778844412904, + "learning_rate": 0.0001, + "loss": 0.9793, + "loss/crossentropy": 2.6819698810577393, + "loss/hidden": 0.8203125, + "loss/logits": 0.13164296746253967, + "loss/reg": 0.002737129107117653, + "step": 1966 + }, + { + "epoch": 0.245875, + "grad_norm": 2.2721593379974365, + "grad_norm_var": 0.05153780887834784, + "learning_rate": 0.0001, + "loss": 1.0456, + "loss/crossentropy": 2.2785394191741943, + "loss/hidden": 0.86328125, + "loss/logits": 0.1549597829580307, + "loss/reg": 0.002735583111643791, + "step": 1967 + }, + { + "epoch": 0.246, + "grad_norm": 2.5156705379486084, + "grad_norm_var": 0.04735843285122859, + "learning_rate": 0.0001, + "loss": 1.1212, + "loss/crossentropy": 2.4128031730651855, + "loss/hidden": 0.9375, + "loss/logits": 0.15634964406490326, + "loss/reg": 0.002734163776040077, + "step": 1968 + }, + { + "epoch": 0.246125, + "grad_norm": 2.315765380859375, + "grad_norm_var": 0.04804468545032871, + "learning_rate": 0.0001, + "loss": 1.1106, + "loss/crossentropy": 2.5273008346557617, + "loss/hidden": 0.93359375, + "loss/logits": 0.1496410369873047, + "loss/reg": 0.0027327670250087976, + "step": 1969 + }, + { + "epoch": 0.24625, + "grad_norm": 3.655541181564331, + "grad_norm_var": 0.13038539827467327, + "learning_rate": 0.0001, + "loss": 1.1545, + "loss/crossentropy": 2.816701889038086, + "loss/hidden": 0.9453125, + "loss/logits": 0.18190214037895203, + "loss/reg": 0.002731376327574253, + "step": 1970 + }, + { + "epoch": 0.246375, + "grad_norm": 2.0330469608306885, + "grad_norm_var": 0.13946034117999087, + "learning_rate": 0.0001, + "loss": 0.9543, + "loss/crossentropy": 2.5322234630584717, + "loss/hidden": 0.8046875, + "loss/logits": 0.12230876088142395, + "loss/reg": 0.0027299553621560335, + "step": 1971 + }, + { + "epoch": 0.2465, + "grad_norm": 2.3114256858825684, + "grad_norm_var": 0.1407970077955942, + "learning_rate": 0.0001, + "loss": 1.0766, + "loss/crossentropy": 2.403012752532959, + "loss/hidden": 0.90625, + "loss/logits": 0.14305217564105988, + "loss/reg": 0.0027285972610116005, + "step": 1972 + }, + { + "epoch": 0.246625, + "grad_norm": 2.872380256652832, + "grad_norm_var": 0.14616669234115964, + "learning_rate": 0.0001, + "loss": 1.0601, + "loss/crossentropy": 2.620347261428833, + "loss/hidden": 0.875, + "loss/logits": 0.15787017345428467, + "loss/reg": 0.0027271404396742582, + "step": 1973 + }, + { + "epoch": 0.24675, + "grad_norm": 2.204484462738037, + "grad_norm_var": 0.15146511044817218, + "learning_rate": 0.0001, + "loss": 1.0066, + "loss/crossentropy": 2.5610415935516357, + "loss/hidden": 0.828125, + "loss/logits": 0.15118342638015747, + "loss/reg": 0.002725655445829034, + "step": 1974 + }, + { + "epoch": 0.246875, + "grad_norm": 2.2441866397857666, + "grad_norm_var": 0.15410845728410152, + "learning_rate": 0.0001, + "loss": 0.9559, + "loss/crossentropy": 2.513578414916992, + "loss/hidden": 0.8046875, + "loss/logits": 0.12393586337566376, + "loss/reg": 0.002724139718338847, + "step": 1975 + }, + { + "epoch": 0.247, + "grad_norm": 2.8840277194976807, + "grad_norm_var": 0.14632239260073235, + "learning_rate": 0.0001, + "loss": 1.0875, + "loss/crossentropy": 2.491835832595825, + "loss/hidden": 0.90625, + "loss/logits": 0.15406697988510132, + "loss/reg": 0.0027226670645177364, + "step": 1976 + }, + { + "epoch": 0.247125, + "grad_norm": 2.744213342666626, + "grad_norm_var": 0.15042299567527168, + "learning_rate": 0.0001, + "loss": 1.2954, + "loss/crossentropy": 2.233274459838867, + "loss/hidden": 1.0859375, + "loss/logits": 0.18220359086990356, + "loss/reg": 0.0027212114073336124, + "step": 1977 + }, + { + "epoch": 0.24725, + "grad_norm": 2.3532700538635254, + "grad_norm_var": 0.1481365956517531, + "learning_rate": 0.0001, + "loss": 1.1782, + "loss/crossentropy": 2.70269775390625, + "loss/hidden": 0.96484375, + "loss/logits": 0.18613766133785248, + "loss/reg": 0.002719811163842678, + "step": 1978 + }, + { + "epoch": 0.247375, + "grad_norm": 2.016599416732788, + "grad_norm_var": 0.16287134788210172, + "learning_rate": 0.0001, + "loss": 1.0368, + "loss/crossentropy": 2.7286431789398193, + "loss/hidden": 0.859375, + "loss/logits": 0.1502460092306137, + "loss/reg": 0.0027184404898434877, + "step": 1979 + }, + { + "epoch": 0.2475, + "grad_norm": 2.468029499053955, + "grad_norm_var": 0.16258562087950257, + "learning_rate": 0.0001, + "loss": 1.0856, + "loss/crossentropy": 2.4531679153442383, + "loss/hidden": 0.9140625, + "loss/logits": 0.14435534179210663, + "loss/reg": 0.0027170274406671524, + "step": 1980 + }, + { + "epoch": 0.247625, + "grad_norm": 2.8880743980407715, + "grad_norm_var": 0.1721816167867452, + "learning_rate": 0.0001, + "loss": 1.0137, + "loss/crossentropy": 2.8085103034973145, + "loss/hidden": 0.8203125, + "loss/logits": 0.16619496047496796, + "loss/reg": 0.0027155885472893715, + "step": 1981 + }, + { + "epoch": 0.24775, + "grad_norm": 2.2269508838653564, + "grad_norm_var": 0.1769945282356974, + "learning_rate": 0.0001, + "loss": 1.0995, + "loss/crossentropy": 2.7113091945648193, + "loss/hidden": 0.92578125, + "loss/logits": 0.1465437412261963, + "loss/reg": 0.0027142076287418604, + "step": 1982 + }, + { + "epoch": 0.247875, + "grad_norm": 2.3174233436584473, + "grad_norm_var": 0.17574531851225003, + "learning_rate": 0.0001, + "loss": 1.4501, + "loss/crossentropy": 1.9189180135726929, + "loss/hidden": 1.1953125, + "loss/logits": 0.22766916453838348, + "loss/reg": 0.0027127759531140327, + "step": 1983 + }, + { + "epoch": 0.248, + "grad_norm": 1.7954802513122559, + "grad_norm_var": 0.20696429693966606, + "learning_rate": 0.0001, + "loss": 1.0248, + "loss/crossentropy": 2.484433174133301, + "loss/hidden": 0.859375, + "loss/logits": 0.1383214145898819, + "loss/reg": 0.0027115046977996826, + "step": 1984 + }, + { + "epoch": 0.248125, + "grad_norm": 1.98469078540802, + "grad_norm_var": 0.22010164823287107, + "learning_rate": 0.0001, + "loss": 1.0636, + "loss/crossentropy": 2.355754852294922, + "loss/hidden": 0.875, + "loss/logits": 0.16146710515022278, + "loss/reg": 0.0027100895531475544, + "step": 1985 + }, + { + "epoch": 0.24825, + "grad_norm": 3.0588693618774414, + "grad_norm_var": 0.14544907650537522, + "learning_rate": 0.0001, + "loss": 1.2953, + "loss/crossentropy": 2.55202317237854, + "loss/hidden": 1.0625, + "loss/logits": 0.20571765303611755, + "loss/reg": 0.002708751941099763, + "step": 1986 + }, + { + "epoch": 0.248375, + "grad_norm": 1.9647020101547241, + "grad_norm_var": 0.14908673013686075, + "learning_rate": 0.0001, + "loss": 1.0645, + "loss/crossentropy": 2.5482850074768066, + "loss/hidden": 0.8828125, + "loss/logits": 0.15463218092918396, + "loss/reg": 0.0027072790544480085, + "step": 1987 + }, + { + "epoch": 0.2485, + "grad_norm": 2.367072582244873, + "grad_norm_var": 0.1486533124992943, + "learning_rate": 0.0001, + "loss": 1.087, + "loss/crossentropy": 2.4881367683410645, + "loss/hidden": 0.90625, + "loss/logits": 0.15367110073566437, + "loss/reg": 0.0027056580875068903, + "step": 1988 + }, + { + "epoch": 0.248625, + "grad_norm": 2.4453704357147217, + "grad_norm_var": 0.1331206329775275, + "learning_rate": 0.0001, + "loss": 1.0873, + "loss/crossentropy": 2.556847333908081, + "loss/hidden": 0.88671875, + "loss/logits": 0.17354083061218262, + "loss/reg": 0.0027040427085012197, + "step": 1989 + }, + { + "epoch": 0.24875, + "grad_norm": 9.09829044342041, + "grad_norm_var": 2.9487722333659785, + "learning_rate": 0.0001, + "loss": 1.2835, + "loss/crossentropy": 2.4852135181427, + "loss/hidden": 1.09375, + "loss/logits": 0.16272324323654175, + "loss/reg": 0.0027026128955185413, + "step": 1990 + }, + { + "epoch": 0.248875, + "grad_norm": 2.424456834793091, + "grad_norm_var": 2.9373577672795688, + "learning_rate": 0.0001, + "loss": 1.0357, + "loss/crossentropy": 2.4856038093566895, + "loss/hidden": 0.8515625, + "loss/logits": 0.15710175037384033, + "loss/reg": 0.0027011926285922527, + "step": 1991 + }, + { + "epoch": 0.249, + "grad_norm": 2.624159336090088, + "grad_norm_var": 2.939181373576417, + "learning_rate": 0.0001, + "loss": 1.3347, + "loss/crossentropy": 2.166581869125366, + "loss/hidden": 1.109375, + "loss/logits": 0.19837325811386108, + "loss/reg": 0.0026996470987796783, + "step": 1992 + }, + { + "epoch": 0.249125, + "grad_norm": 2.957742691040039, + "grad_norm_var": 2.9404825335519735, + "learning_rate": 0.0001, + "loss": 1.3991, + "loss/crossentropy": 2.6836252212524414, + "loss/hidden": 1.140625, + "loss/logits": 0.23152483999729156, + "loss/reg": 0.002698224736377597, + "step": 1993 + }, + { + "epoch": 0.24925, + "grad_norm": 2.1770012378692627, + "grad_norm_var": 2.9532045555307374, + "learning_rate": 0.0001, + "loss": 1.099, + "loss/crossentropy": 2.48860239982605, + "loss/hidden": 0.9296875, + "loss/logits": 0.14239296317100525, + "loss/reg": 0.002696766285225749, + "step": 1994 + }, + { + "epoch": 0.249375, + "grad_norm": 2.590374708175659, + "grad_norm_var": 2.9137765910812217, + "learning_rate": 0.0001, + "loss": 1.1914, + "loss/crossentropy": 2.4991402626037598, + "loss/hidden": 0.98828125, + "loss/logits": 0.17614206671714783, + "loss/reg": 0.0026953339111059904, + "step": 1995 + }, + { + "epoch": 0.2495, + "grad_norm": 2.9202098846435547, + "grad_norm_var": 2.904322765602717, + "learning_rate": 0.0001, + "loss": 1.1382, + "loss/crossentropy": 2.286001682281494, + "loss/hidden": 0.9453125, + "loss/logits": 0.1659601628780365, + "loss/reg": 0.0026938265655189753, + "step": 1996 + }, + { + "epoch": 0.249625, + "grad_norm": 2.133800745010376, + "grad_norm_var": 2.93756568739632, + "learning_rate": 0.0001, + "loss": 1.1125, + "loss/crossentropy": 2.492587089538574, + "loss/hidden": 0.91015625, + "loss/logits": 0.17543631792068481, + "loss/reg": 0.002692408859729767, + "step": 1997 + }, + { + "epoch": 0.24975, + "grad_norm": 2.7813234329223633, + "grad_norm_var": 2.9130920460482055, + "learning_rate": 0.0001, + "loss": 1.0991, + "loss/crossentropy": 2.636305332183838, + "loss/hidden": 0.921875, + "loss/logits": 0.15036045014858246, + "loss/reg": 0.0026909259613603354, + "step": 1998 + }, + { + "epoch": 0.249875, + "grad_norm": 2.2112627029418945, + "grad_norm_var": 2.9213711600102763, + "learning_rate": 0.0001, + "loss": 0.885, + "loss/crossentropy": 2.672691822052002, + "loss/hidden": 0.73828125, + "loss/logits": 0.11980107426643372, + "loss/reg": 0.002689523156732321, + "step": 1999 + }, + { + "epoch": 0.25, + "grad_norm": 2.095968246459961, + "grad_norm_var": 2.8849283178664864, + "learning_rate": 0.0001, + "loss": 0.9991, + "loss/crossentropy": 2.556314706802368, + "loss/hidden": 0.83203125, + "loss/logits": 0.14016053080558777, + "loss/reg": 0.002688055392354727, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.28811723128832e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}