{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.75, "eval_steps": 250, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 4.097814559936523, "learning_rate": 1.0000000000000002e-06, "loss": 1.1655, "loss/crossentropy": 2.343535900115967, "loss/hidden": 0.9296875, "loss/logits": 0.17379230260849, "loss/reg": 0.006198255345225334, "step": 1 }, { "epoch": 0.00025, "grad_norm": 3.662576913833618, "learning_rate": 2.0000000000000003e-06, "loss": 1.4973, "loss/crossentropy": 2.318769931793213, "loss/hidden": 1.1875, "loss/logits": 0.24786217510700226, "loss/reg": 0.006198255345225334, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.8296749591827393, "learning_rate": 3e-06, "loss": 1.2258, "loss/crossentropy": 2.4907937049865723, "loss/hidden": 0.97265625, "loss/logits": 0.19112952053546906, "loss/reg": 0.006198245566338301, "step": 3 }, { "epoch": 0.0005, "grad_norm": 3.057624578475952, "learning_rate": 4.000000000000001e-06, "loss": 1.1136, "loss/crossentropy": 2.744520902633667, "loss/hidden": 0.890625, "loss/logits": 0.16101403534412384, "loss/reg": 0.006198232993483543, "step": 4 }, { "epoch": 0.000625, "grad_norm": 2.7055587768554688, "learning_rate": 5e-06, "loss": 1.1943, "loss/crossentropy": 2.5722062587738037, "loss/hidden": 0.94921875, "loss/logits": 0.18310005962848663, "loss/reg": 0.0061982134357094765, "step": 5 }, { "epoch": 0.00075, "grad_norm": 3.789276361465454, "learning_rate": 6e-06, "loss": 1.247, "loss/crossentropy": 2.613312005996704, "loss/hidden": 1.0078125, "loss/logits": 0.17725251615047455, "loss/reg": 0.006198191549628973, "step": 6 }, { "epoch": 0.000875, "grad_norm": 3.997910499572754, "learning_rate": 7.000000000000001e-06, "loss": 1.4206, "loss/crossentropy": 2.4207534790039062, "loss/hidden": 1.125, "loss/logits": 0.2336406409740448, "loss/reg": 0.006198164541274309, "step": 7 }, { "epoch": 0.001, "grad_norm": 2.5986244678497314, "learning_rate": 8.000000000000001e-06, "loss": 1.0878, "loss/crossentropy": 2.536424160003662, "loss/hidden": 0.8671875, "loss/logits": 0.1585812270641327, "loss/reg": 0.006198132876306772, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.2757976055145264, "learning_rate": 9e-06, "loss": 1.1175, "loss/crossentropy": 2.745281219482422, "loss/hidden": 0.89453125, "loss/logits": 0.16094230115413666, "loss/reg": 0.006198094692081213, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.261094808578491, "learning_rate": 1e-05, "loss": 1.0803, "loss/crossentropy": 2.3173577785491943, "loss/hidden": 0.8671875, "loss/logits": 0.15108685195446014, "loss/reg": 0.0061980499885976315, "step": 10 }, { "epoch": 0.001375, "grad_norm": 21.777265548706055, "learning_rate": 1.1000000000000001e-05, "loss": 2.0501, "loss/crossentropy": 3.2122714519500732, "loss/hidden": 1.7109375, "loss/logits": 0.27713608741760254, "loss/reg": 0.006198008079081774, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.5655505657196045, "learning_rate": 1.2e-05, "loss": 1.151, "loss/crossentropy": 2.706430196762085, "loss/hidden": 0.8984375, "loss/logits": 0.19056561589241028, "loss/reg": 0.0061979577876627445, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.403053045272827, "learning_rate": 1.3000000000000001e-05, "loss": 1.0719, "loss/crossentropy": 2.0466296672821045, "loss/hidden": 0.88671875, "loss/logits": 0.12316589802503586, "loss/reg": 0.0061978911980986595, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.840881586074829, "learning_rate": 1.4000000000000001e-05, "loss": 1.5441, "loss/crossentropy": 2.3191423416137695, "loss/hidden": 1.234375, "loss/logits": 0.24779079854488373, "loss/reg": 0.00619781669229269, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.557331085205078, "learning_rate": 1.5e-05, "loss": 0.9444, "loss/crossentropy": 2.6370084285736084, "loss/hidden": 0.76953125, "loss/logits": 0.11287336051464081, "loss/reg": 0.006197733338922262, "step": 15 }, { "epoch": 0.002, "grad_norm": 3.1850404739379883, "grad_norm_var": 22.31061335402559, "learning_rate": 1.6000000000000003e-05, "loss": 1.3213, "loss/crossentropy": 2.676577091217041, "loss/hidden": 1.0546875, "loss/logits": 0.2046227753162384, "loss/reg": 0.006197639741003513, "step": 16 }, { "epoch": 0.002125, "grad_norm": 2.2587289810180664, "grad_norm_var": 22.553268201402446, "learning_rate": 1.7000000000000003e-05, "loss": 1.0312, "loss/crossentropy": 2.4961040019989014, "loss/hidden": 0.8203125, "loss/logits": 0.148894801735878, "loss/reg": 0.006197560112923384, "step": 17 }, { "epoch": 0.00225, "grad_norm": 3.3259811401367188, "grad_norm_var": 22.58044614452358, "learning_rate": 1.8e-05, "loss": 1.3626, "loss/crossentropy": 2.5914387702941895, "loss/hidden": 1.046875, "loss/logits": 0.25370728969573975, "loss/reg": 0.006197475362569094, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.468914747238159, "grad_norm_var": 22.649171856957494, "learning_rate": 1.9e-05, "loss": 1.1683, "loss/crossentropy": 2.6096584796905518, "loss/hidden": 0.921875, "loss/logits": 0.18447336554527283, "loss/reg": 0.00619738781824708, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.3097646236419678, "grad_norm_var": 22.784756315801523, "learning_rate": 2e-05, "loss": 1.1605, "loss/crossentropy": 2.299048662185669, "loss/hidden": 0.9375, "loss/logits": 0.16106057167053223, "loss/reg": 0.006197274662554264, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.1111207008361816, "grad_norm_var": 22.911025462198744, "learning_rate": 2.1e-05, "loss": 0.939, "loss/crossentropy": 2.547258138656616, "loss/hidden": 0.75, "loss/logits": 0.12698382139205933, "loss/reg": 0.006197154987603426, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.4918222427368164, "grad_norm_var": 23.049732177187614, "learning_rate": 2.2000000000000003e-05, "loss": 1.2047, "loss/crossentropy": 2.2802374362945557, "loss/hidden": 0.953125, "loss/logits": 0.18965375423431396, "loss/reg": 0.006197045091539621, "step": 22 }, { "epoch": 0.002875, "grad_norm": 3.3273494243621826, "grad_norm_var": 23.069242834486193, "learning_rate": 2.3000000000000003e-05, "loss": 1.2554, "loss/crossentropy": 2.3062734603881836, "loss/hidden": 1.0078125, "loss/logits": 0.18566077947616577, "loss/reg": 0.006196921691298485, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.5644068717956543, "grad_norm_var": 23.075070365271714, "learning_rate": 2.4e-05, "loss": 1.2266, "loss/crossentropy": 2.460878372192383, "loss/hidden": 0.98046875, "loss/logits": 0.18418912589550018, "loss/reg": 0.006196786183863878, "step": 24 }, { "epoch": 0.003125, "grad_norm": 2.3506264686584473, "grad_norm_var": 23.059636834121356, "learning_rate": 2.5e-05, "loss": 1.0205, "loss/crossentropy": 2.4281811714172363, "loss/hidden": 0.82421875, "loss/logits": 0.13434948027133942, "loss/reg": 0.0061966474168002605, "step": 25 }, { "epoch": 0.00325, "grad_norm": 2.25004506111145, "grad_norm_var": 23.062003716592635, "learning_rate": 2.6000000000000002e-05, "loss": 1.1133, "loss/crossentropy": 2.326843500137329, "loss/hidden": 0.9140625, "loss/logits": 0.13725802302360535, "loss/reg": 0.006196498870849609, "step": 26 }, { "epoch": 0.003375, "grad_norm": 2.283770799636841, "grad_norm_var": 0.2469546323472817, "learning_rate": 2.7000000000000002e-05, "loss": 1.1459, "loss/crossentropy": 2.3002493381500244, "loss/hidden": 0.9140625, "loss/logits": 0.16987068951129913, "loss/reg": 0.006196335889399052, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.805088758468628, "grad_norm_var": 0.24805442740468303, "learning_rate": 2.8000000000000003e-05, "loss": 1.0272, "loss/crossentropy": 2.510472536087036, "loss/hidden": 0.8359375, "loss/logits": 0.12927240133285522, "loss/reg": 0.006196176633238792, "step": 28 }, { "epoch": 0.003625, "grad_norm": 2.0331132411956787, "grad_norm_var": 0.2692014993258605, "learning_rate": 2.9e-05, "loss": 1.0913, "loss/crossentropy": 2.51584529876709, "loss/hidden": 0.87109375, "loss/logits": 0.15820594131946564, "loss/reg": 0.006195997819304466, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.1523566246032715, "grad_norm_var": 0.17596421900176604, "learning_rate": 3e-05, "loss": 1.0026, "loss/crossentropy": 2.704220771789551, "loss/hidden": 0.796875, "loss/logits": 0.14372289180755615, "loss/reg": 0.0061958180740475655, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.6658694744110107, "grad_norm_var": 0.1771001402109505, "learning_rate": 3.1e-05, "loss": 1.122, "loss/crossentropy": 2.4840426445007324, "loss/hidden": 0.89453125, "loss/logits": 0.1655040979385376, "loss/reg": 0.006195634603500366, "step": 31 }, { "epoch": 0.004, "grad_norm": 2.813079595565796, "grad_norm_var": 0.153583095436327, "learning_rate": 3.2000000000000005e-05, "loss": 1.0653, "loss/crossentropy": 2.442962646484375, "loss/hidden": 0.859375, "loss/logits": 0.14400474727153778, "loss/reg": 0.00619542459025979, "step": 32 }, { "epoch": 0.004125, "grad_norm": 2.4273953437805176, "grad_norm_var": 0.1496371777315666, "learning_rate": 3.3e-05, "loss": 1.1025, "loss/crossentropy": 2.515721559524536, "loss/hidden": 0.89453125, "loss/logits": 0.1460331827402115, "loss/reg": 0.006195210851728916, "step": 33 }, { "epoch": 0.00425, "grad_norm": 2.0594100952148438, "grad_norm_var": 0.11442956053255457, "learning_rate": 3.4000000000000007e-05, "loss": 1.118, "loss/crossentropy": 2.5347506999969482, "loss/hidden": 0.8984375, "loss/logits": 0.15760375559329987, "loss/reg": 0.006195001769810915, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.497893810272217, "grad_norm_var": 0.11457586733464495, "learning_rate": 3.5e-05, "loss": 1.2359, "loss/crossentropy": 1.7681002616882324, "loss/hidden": 1.0390625, "loss/logits": 0.13490143418312073, "loss/reg": 0.006194803398102522, "step": 35 }, { "epoch": 0.0045, "grad_norm": 3.3231709003448486, "grad_norm_var": 0.16029457606237638, "learning_rate": 3.6e-05, "loss": 1.3588, "loss/crossentropy": 2.729518175125122, "loss/hidden": 1.09375, "loss/logits": 0.20313453674316406, "loss/reg": 0.00619460316374898, "step": 36 }, { "epoch": 0.004625, "grad_norm": 2.5542962551116943, "grad_norm_var": 0.14901290879942408, "learning_rate": 3.7e-05, "loss": 1.1671, "loss/crossentropy": 2.3359429836273193, "loss/hidden": 0.9296875, "loss/logits": 0.17546769976615906, "loss/reg": 0.006194361485540867, "step": 37 }, { "epoch": 0.00475, "grad_norm": 3.5138309001922607, "grad_norm_var": 0.2080724542279834, "learning_rate": 3.8e-05, "loss": 1.2044, "loss/crossentropy": 2.447890520095825, "loss/hidden": 0.96484375, "loss/logits": 0.17756858468055725, "loss/reg": 0.0061941081658005714, "step": 38 }, { "epoch": 0.004875, "grad_norm": 3.813410758972168, "grad_norm_var": 0.2698887106917669, "learning_rate": 3.9000000000000006e-05, "loss": 1.0819, "loss/crossentropy": 2.766765832901001, "loss/hidden": 0.88671875, "loss/logits": 0.13325469195842743, "loss/reg": 0.006193886045366526, "step": 39 }, { "epoch": 0.005, "grad_norm": 3.1502718925476074, "grad_norm_var": 0.2860816910243668, "learning_rate": 4e-05, "loss": 1.3622, "loss/crossentropy": 2.3325388431549072, "loss/hidden": 1.109375, "loss/logits": 0.19087004661560059, "loss/reg": 0.006193609442561865, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.422366142272949, "grad_norm_var": 0.28336421674108553, "learning_rate": 4.1e-05, "loss": 1.2212, "loss/crossentropy": 2.3002498149871826, "loss/hidden": 0.96875, "loss/logits": 0.19054222106933594, "loss/reg": 0.00619333703070879, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.7353622913360596, "grad_norm_var": 0.2707266796228128, "learning_rate": 4.2e-05, "loss": 1.0549, "loss/crossentropy": 2.0319221019744873, "loss/hidden": 0.87890625, "loss/logits": 0.1140664741396904, "loss/reg": 0.006193041335791349, "step": 42 }, { "epoch": 0.005375, "grad_norm": 1.9425387382507324, "grad_norm_var": 0.2970857034274398, "learning_rate": 4.3e-05, "loss": 1.0366, "loss/crossentropy": 2.431666374206543, "loss/hidden": 0.83203125, "loss/logits": 0.1426728069782257, "loss/reg": 0.006192733999341726, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.7009642124176025, "grad_norm_var": 0.2960522402202514, "learning_rate": 4.4000000000000006e-05, "loss": 0.9824, "loss/crossentropy": 2.391608476638794, "loss/hidden": 0.78515625, "loss/logits": 0.13533324003219604, "loss/reg": 0.006192411296069622, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.6632983684539795, "grad_norm_var": 0.2669107471214488, "learning_rate": 4.5e-05, "loss": 1.1067, "loss/crossentropy": 2.7733116149902344, "loss/hidden": 0.87109375, "loss/logits": 0.1736893653869629, "loss/reg": 0.006192059256136417, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.1037468910217285, "grad_norm_var": 0.2707032714108967, "learning_rate": 4.600000000000001e-05, "loss": 0.9831, "loss/crossentropy": 2.4606895446777344, "loss/hidden": 0.7890625, "loss/logits": 0.13213258981704712, "loss/reg": 0.006191718857735395, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.1911983489990234, "grad_norm_var": 0.28768473978113296, "learning_rate": 4.7e-05, "loss": 0.9509, "loss/crossentropy": 2.6825270652770996, "loss/hidden": 0.76953125, "loss/logits": 0.11942489445209503, "loss/reg": 0.006191306747496128, "step": 47 }, { "epoch": 0.006, "grad_norm": 3.2640700340270996, "grad_norm_var": 0.30827796768009724, "learning_rate": 4.8e-05, "loss": 1.0346, "loss/crossentropy": 2.3665199279785156, "loss/hidden": 0.83203125, "loss/logits": 0.14068934321403503, "loss/reg": 0.0061909533105790615, "step": 48 }, { "epoch": 0.006125, "grad_norm": 2.259894847869873, "grad_norm_var": 0.3163475179157634, "learning_rate": 4.9e-05, "loss": 0.9647, "loss/crossentropy": 2.4414587020874023, "loss/hidden": 0.79296875, "loss/logits": 0.10987477004528046, "loss/reg": 0.0061905342154204845, "step": 49 }, { "epoch": 0.00625, "grad_norm": 2.7616565227508545, "grad_norm_var": 0.28721415330329, "learning_rate": 5e-05, "loss": 1.019, "loss/crossentropy": 2.0829460620880127, "loss/hidden": 0.83984375, "loss/logits": 0.11724002659320831, "loss/reg": 0.0061900559812784195, "step": 50 }, { "epoch": 0.006375, "grad_norm": 2.7897861003875732, "grad_norm_var": 0.28297568806904866, "learning_rate": 5.1000000000000006e-05, "loss": 0.853, "loss/crossentropy": 2.5636909008026123, "loss/hidden": 0.6953125, "loss/logits": 0.09577471762895584, "loss/reg": 0.00618965458124876, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.3134403228759766, "grad_norm_var": 0.2711290924819705, "learning_rate": 5.2000000000000004e-05, "loss": 1.0497, "loss/crossentropy": 2.440258026123047, "loss/hidden": 0.83984375, "loss/logits": 0.14791719615459442, "loss/reg": 0.006189141888171434, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.2032997608184814, "grad_norm_var": 0.2855897568404882, "learning_rate": 5.300000000000001e-05, "loss": 0.9934, "loss/crossentropy": 2.4747955799102783, "loss/hidden": 0.796875, "loss/logits": 0.13461169600486755, "loss/reg": 0.006188610102981329, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.267400026321411, "grad_norm_var": 0.24358579758792467, "learning_rate": 5.4000000000000005e-05, "loss": 1.1149, "loss/crossentropy": 2.705127477645874, "loss/hidden": 0.89453125, "loss/logits": 0.1585235595703125, "loss/reg": 0.0061880191788077354, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.281036853790283, "grad_norm_var": 0.14220569464836952, "learning_rate": 5.500000000000001e-05, "loss": 0.9642, "loss/crossentropy": 2.545010805130005, "loss/hidden": 0.78515625, "loss/logits": 0.11717304587364197, "loss/reg": 0.006187579594552517, "step": 55 }, { "epoch": 0.007, "grad_norm": 4.942420959472656, "grad_norm_var": 0.4975759650139497, "learning_rate": 5.6000000000000006e-05, "loss": 1.1237, "loss/crossentropy": 2.7698795795440674, "loss/hidden": 0.91796875, "loss/logits": 0.14385326206684113, "loss/reg": 0.006187067367136478, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.4213955402374268, "grad_norm_var": 0.4976009733976563, "learning_rate": 5.6999999999999996e-05, "loss": 1.0386, "loss/crossentropy": 2.572023868560791, "loss/hidden": 0.84765625, "loss/logits": 0.12909512221813202, "loss/reg": 0.006186594720929861, "step": 57 }, { "epoch": 0.00725, "grad_norm": 2.15891695022583, "grad_norm_var": 0.5091253321428854, "learning_rate": 5.8e-05, "loss": 0.961, "loss/crossentropy": 2.283557415008545, "loss/hidden": 0.7734375, "loss/logits": 0.12568500638008118, "loss/reg": 0.006185955833643675, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.36811900138855, "grad_norm_var": 0.48432608682591366, "learning_rate": 5.9e-05, "loss": 0.8386, "loss/crossentropy": 2.453810453414917, "loss/hidden": 0.6796875, "loss/logits": 0.09709502756595612, "loss/reg": 0.0061853062361478806, "step": 59 }, { "epoch": 0.0075, "grad_norm": 2.591327667236328, "grad_norm_var": 0.4836842483889178, "learning_rate": 6e-05, "loss": 1.033, "loss/crossentropy": 2.8110511302948, "loss/hidden": 0.81640625, "loss/logits": 0.1547423005104065, "loss/reg": 0.006184632424265146, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.0103816986083984, "grad_norm_var": 0.5047142009615214, "learning_rate": 6.1e-05, "loss": 0.9296, "loss/crossentropy": 2.15134334564209, "loss/hidden": 0.7578125, "loss/logits": 0.1099701076745987, "loss/reg": 0.0061841062270104885, "step": 61 }, { "epoch": 0.00775, "grad_norm": 1.80124831199646, "grad_norm_var": 0.5287549745746596, "learning_rate": 6.2e-05, "loss": 0.9266, "loss/crossentropy": 2.7054479122161865, "loss/hidden": 0.7421875, "loss/logits": 0.12253857403993607, "loss/reg": 0.0061835781671106815, "step": 62 }, { "epoch": 0.007875, "grad_norm": 2.277440309524536, "grad_norm_var": 0.5252193383179133, "learning_rate": 6.3e-05, "loss": 0.914, "loss/crossentropy": 2.6631381511688232, "loss/hidden": 0.734375, "loss/logits": 0.1177992895245552, "loss/reg": 0.0061830319464206696, "step": 63 }, { "epoch": 0.008, "grad_norm": 3.3314151763916016, "grad_norm_var": 0.531964164332922, "learning_rate": 6.400000000000001e-05, "loss": 1.29, "loss/crossentropy": 2.1269633769989014, "loss/hidden": 1.0625, "loss/logits": 0.16565865278244019, "loss/reg": 0.006182366982102394, "step": 64 }, { "epoch": 0.008125, "grad_norm": 4.333358287811279, "grad_norm_var": 0.7208240839518936, "learning_rate": 6.500000000000001e-05, "loss": 1.1615, "loss/crossentropy": 2.714442491531372, "loss/hidden": 0.94140625, "loss/logits": 0.15825161337852478, "loss/reg": 0.006181675940752029, "step": 65 }, { "epoch": 0.00825, "grad_norm": 2.853740930557251, "grad_norm_var": 0.7223776199927481, "learning_rate": 6.6e-05, "loss": 1.062, "loss/crossentropy": 2.2147135734558105, "loss/hidden": 0.8515625, "loss/logits": 0.14859826862812042, "loss/reg": 0.006180979777127504, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.8853657245635986, "grad_norm_var": 0.7242961395218184, "learning_rate": 6.7e-05, "loss": 0.9533, "loss/crossentropy": 2.619598388671875, "loss/hidden": 0.7734375, "loss/logits": 0.11804014444351196, "loss/reg": 0.006180332973599434, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.725229501724243, "grad_norm_var": 0.7142181363616674, "learning_rate": 6.800000000000001e-05, "loss": 1.1308, "loss/crossentropy": 2.4091367721557617, "loss/hidden": 0.90234375, "loss/logits": 0.16662752628326416, "loss/reg": 0.006179714575409889, "step": 68 }, { "epoch": 0.008625, "grad_norm": 2.93643856048584, "grad_norm_var": 0.6977178730278022, "learning_rate": 6.9e-05, "loss": 1.1414, "loss/crossentropy": 2.509793281555176, "loss/hidden": 0.90234375, "loss/logits": 0.17730477452278137, "loss/reg": 0.0061789220198988914, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.4086973667144775, "grad_norm_var": 0.6896555586144653, "learning_rate": 7e-05, "loss": 0.9852, "loss/crossentropy": 2.7080371379852295, "loss/hidden": 0.7890625, "loss/logits": 0.1343374401330948, "loss/reg": 0.0061781019903719425, "step": 70 }, { "epoch": 0.008875, "grad_norm": 1.9355547428131104, "grad_norm_var": 0.7196579708330165, "learning_rate": 7.1e-05, "loss": 0.9176, "loss/crossentropy": 2.451488494873047, "loss/hidden": 0.7421875, "loss/logits": 0.11365102231502533, "loss/reg": 0.006177456583827734, "step": 71 }, { "epoch": 0.009, "grad_norm": 2.273902654647827, "grad_norm_var": 0.38422972669649574, "learning_rate": 7.2e-05, "loss": 1.0112, "loss/crossentropy": 2.4479947090148926, "loss/hidden": 0.8125, "loss/logits": 0.13690924644470215, "loss/reg": 0.006176764145493507, "step": 72 }, { "epoch": 0.009125, "grad_norm": 3.385849952697754, "grad_norm_var": 0.4217084598233742, "learning_rate": 7.3e-05, "loss": 1.3992, "loss/crossentropy": 2.3916804790496826, "loss/hidden": 1.1484375, "loss/logits": 0.18896484375, "loss/reg": 0.006176079623401165, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.893932580947876, "grad_norm_var": 0.44317594415441114, "learning_rate": 7.4e-05, "loss": 0.9357, "loss/crossentropy": 2.3809518814086914, "loss/hidden": 0.74609375, "loss/logits": 0.12787015736103058, "loss/reg": 0.00617539556697011, "step": 74 }, { "epoch": 0.009375, "grad_norm": 2.431032657623291, "grad_norm_var": 0.4412621914582907, "learning_rate": 7.500000000000001e-05, "loss": 1.0796, "loss/crossentropy": 2.5346295833587646, "loss/hidden": 0.86328125, "loss/logits": 0.1545613557100296, "loss/reg": 0.006174764130264521, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.2421321868896484, "grad_norm_var": 0.45066905079875685, "learning_rate": 7.6e-05, "loss": 0.9869, "loss/crossentropy": 2.756843090057373, "loss/hidden": 0.796875, "loss/logits": 0.1282375454902649, "loss/reg": 0.006174163427203894, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.7022979259490967, "grad_norm_var": 0.4254703741989109, "learning_rate": 7.7e-05, "loss": 1.2503, "loss/crossentropy": 2.0696699619293213, "loss/hidden": 1.015625, "loss/logits": 0.1729813814163208, "loss/reg": 0.006173421163111925, "step": 77 }, { "epoch": 0.00975, "grad_norm": 2.501106023788452, "grad_norm_var": 0.37677934250983375, "learning_rate": 7.800000000000001e-05, "loss": 1.0516, "loss/crossentropy": 2.629380941390991, "loss/hidden": 0.83984375, "loss/logits": 0.15003597736358643, "loss/reg": 0.006172672379761934, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.137601137161255, "grad_norm_var": 0.3857841035513881, "learning_rate": 7.900000000000001e-05, "loss": 0.9388, "loss/crossentropy": 2.6841280460357666, "loss/hidden": 0.75, "loss/logits": 0.12706515192985535, "loss/reg": 0.006171974819153547, "step": 79 }, { "epoch": 0.01, "grad_norm": 4.655951976776123, "grad_norm_var": 0.6093991769416703, "learning_rate": 8e-05, "loss": 1.2659, "loss/crossentropy": 2.4634439945220947, "loss/hidden": 1.0390625, "loss/logits": 0.16511483490467072, "loss/reg": 0.006171175744384527, "step": 80 }, { "epoch": 0.010125, "grad_norm": 2.2418179512023926, "grad_norm_var": 0.44652068466097317, "learning_rate": 8.1e-05, "loss": 1.0773, "loss/crossentropy": 2.479743480682373, "loss/hidden": 0.87890625, "loss/logits": 0.1366729438304901, "loss/reg": 0.006170437205582857, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.0470192432403564, "grad_norm_var": 0.4640077865797357, "learning_rate": 8.2e-05, "loss": 0.8599, "loss/crossentropy": 2.440803050994873, "loss/hidden": 0.68359375, "loss/logits": 0.11458206921815872, "loss/reg": 0.0061697582714259624, "step": 82 }, { "epoch": 0.010375, "grad_norm": 2.0131125450134277, "grad_norm_var": 0.47694604476552793, "learning_rate": 8.3e-05, "loss": 0.8585, "loss/crossentropy": 2.480877637863159, "loss/hidden": 0.6875, "loss/logits": 0.10927767306566238, "loss/reg": 0.006169027183204889, "step": 83 }, { "epoch": 0.0105, "grad_norm": 2.2644267082214355, "grad_norm_var": 0.47842071328175656, "learning_rate": 8.4e-05, "loss": 0.8351, "loss/crossentropy": 2.693246841430664, "loss/hidden": 0.67578125, "loss/logits": 0.09764716029167175, "loss/reg": 0.006168315652757883, "step": 84 }, { "epoch": 0.010625, "grad_norm": 3.1729207038879395, "grad_norm_var": 0.4955376038232837, "learning_rate": 8.5e-05, "loss": 1.2314, "loss/crossentropy": 2.3339309692382812, "loss/hidden": 1.015625, "loss/logits": 0.15408015251159668, "loss/reg": 0.006167604587972164, "step": 85 }, { "epoch": 0.01075, "grad_norm": 2.281872510910034, "grad_norm_var": 0.4984116504809473, "learning_rate": 8.6e-05, "loss": 1.1113, "loss/crossentropy": 2.410794258117676, "loss/hidden": 0.8828125, "loss/logits": 0.16686803102493286, "loss/reg": 0.0061669000424444675, "step": 86 }, { "epoch": 0.010875, "grad_norm": 2.701244354248047, "grad_norm_var": 0.4762769450482454, "learning_rate": 8.7e-05, "loss": 0.9115, "loss/crossentropy": 2.5270962715148926, "loss/hidden": 0.73046875, "loss/logits": 0.11935658752918243, "loss/reg": 0.0061660343781113625, "step": 87 }, { "epoch": 0.011, "grad_norm": 2.0738677978515625, "grad_norm_var": 0.4863854399313406, "learning_rate": 8.800000000000001e-05, "loss": 0.9634, "loss/crossentropy": 2.625903844833374, "loss/hidden": 0.7734375, "loss/logits": 0.12826378643512726, "loss/reg": 0.006165289785712957, "step": 88 }, { "epoch": 0.011125, "grad_norm": 2.827744245529175, "grad_norm_var": 0.44340376520124375, "learning_rate": 8.900000000000001e-05, "loss": 1.0134, "loss/crossentropy": 2.2436654567718506, "loss/hidden": 0.80078125, "loss/logits": 0.15097512304782867, "loss/reg": 0.006164397578686476, "step": 89 }, { "epoch": 0.01125, "grad_norm": 2.412203788757324, "grad_norm_var": 0.4174983019540292, "learning_rate": 9e-05, "loss": 0.9541, "loss/crossentropy": 2.4847052097320557, "loss/hidden": 0.78515625, "loss/logits": 0.10735376924276352, "loss/reg": 0.006163434591144323, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.385309934616089, "grad_norm_var": 0.41831854842319344, "learning_rate": 9.1e-05, "loss": 1.0455, "loss/crossentropy": 2.1011688709259033, "loss/hidden": 0.828125, "loss/logits": 0.15577414631843567, "loss/reg": 0.0061626131646335125, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.779266595840454, "grad_norm_var": 0.4149256226543306, "learning_rate": 9.200000000000001e-05, "loss": 0.9782, "loss/crossentropy": 2.770954132080078, "loss/hidden": 0.78125, "loss/logits": 0.13530117273330688, "loss/reg": 0.006161784287542105, "step": 92 }, { "epoch": 0.011625, "grad_norm": 2.816206216812134, "grad_norm_var": 0.41767206123470924, "learning_rate": 9.300000000000001e-05, "loss": 1.2584, "loss/crossentropy": 2.4919488430023193, "loss/hidden": 1.0234375, "loss/logits": 0.17335021495819092, "loss/reg": 0.006160792429000139, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.1000349521636963, "grad_norm_var": 0.4320504871954351, "learning_rate": 9.4e-05, "loss": 0.9293, "loss/crossentropy": 2.6951355934143066, "loss/hidden": 0.7421875, "loss/logits": 0.12551091611385345, "loss/reg": 0.006159830838441849, "step": 94 }, { "epoch": 0.011875, "grad_norm": 2.6696228981018066, "grad_norm_var": 0.4199965621062515, "learning_rate": 9.5e-05, "loss": 1.0491, "loss/crossentropy": 2.6532485485076904, "loss/hidden": 0.83984375, "loss/logits": 0.14771661162376404, "loss/reg": 0.006158801261335611, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.308758020401001, "grad_norm_var": 0.11782165750081125, "learning_rate": 9.6e-05, "loss": 1.1178, "loss/crossentropy": 2.38185977935791, "loss/hidden": 0.90625, "loss/logits": 0.1499352604150772, "loss/reg": 0.006157839670777321, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.4204304218292236, "grad_norm_var": 0.11501335190634426, "learning_rate": 9.7e-05, "loss": 1.092, "loss/crossentropy": 2.4358534812927246, "loss/hidden": 0.86328125, "loss/logits": 0.16712763905525208, "loss/reg": 0.006156752817332745, "step": 97 }, { "epoch": 0.01225, "grad_norm": 3.7184524536132812, "grad_norm_var": 0.198780236272727, "learning_rate": 9.8e-05, "loss": 1.4311, "loss/crossentropy": 2.1283679008483887, "loss/hidden": 1.171875, "loss/logits": 0.1976230889558792, "loss/reg": 0.006155804730951786, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.2656571865081787, "grad_norm_var": 0.20565265002658914, "learning_rate": 9.900000000000001e-05, "loss": 1.017, "loss/crossentropy": 2.6715664863586426, "loss/hidden": 0.80078125, "loss/logits": 0.15465494990348816, "loss/reg": 0.006154791917651892, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.915663719177246, "grad_norm_var": 0.19977570339779593, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.5455305576324463, "loss/hidden": 0.77734375, "loss/logits": 0.1410846710205078, "loss/reg": 0.0061536673456430435, "step": 100 }, { "epoch": 0.012625, "grad_norm": 3.3153059482574463, "grad_norm_var": 0.2104372314148539, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.455479621887207, "loss/hidden": 0.90625, "loss/logits": 0.13615351915359497, "loss/reg": 0.0061526307836174965, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.40315318107605, "grad_norm_var": 0.20480568897691, "learning_rate": 0.0001, "loss": 0.9588, "loss/crossentropy": 2.6359853744506836, "loss/hidden": 0.76953125, "loss/logits": 0.1277719885110855, "loss/reg": 0.006151493173092604, "step": 102 }, { "epoch": 0.012875, "grad_norm": 3.625624895095825, "grad_norm_var": 0.25903479701245613, "learning_rate": 0.0001, "loss": 1.2481, "loss/crossentropy": 2.0148656368255615, "loss/hidden": 1.046875, "loss/logits": 0.13969773054122925, "loss/reg": 0.006150420755147934, "step": 103 }, { "epoch": 0.013, "grad_norm": 2.497906446456909, "grad_norm_var": 0.23191354079432358, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.3493525981903076, "loss/hidden": 0.86328125, "loss/logits": 0.13548779487609863, "loss/reg": 0.006149281747639179, "step": 104 }, { "epoch": 0.013125, "grad_norm": 3.258059501647949, "grad_norm_var": 0.24629299643454275, "learning_rate": 0.0001, "loss": 0.9497, "loss/crossentropy": 2.6988418102264404, "loss/hidden": 0.7734375, "loss/logits": 0.11473990976810455, "loss/reg": 0.006148339249193668, "step": 105 }, { "epoch": 0.01325, "grad_norm": 3.1279666423797607, "grad_norm_var": 0.24075672502018505, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.578716278076172, "loss/hidden": 0.875, "loss/logits": 0.18304204940795898, "loss/reg": 0.006147205363959074, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.760901927947998, "grad_norm_var": 0.22627915570051277, "learning_rate": 0.0001, "loss": 0.9369, "loss/crossentropy": 2.5835328102111816, "loss/hidden": 0.75, "loss/logits": 0.12544697523117065, "loss/reg": 0.006146106868982315, "step": 107 }, { "epoch": 0.0135, "grad_norm": 3.2917559146881104, "grad_norm_var": 0.23622539643692994, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.6001460552215576, "loss/hidden": 0.91796875, "loss/logits": 0.16428819298744202, "loss/reg": 0.006144997663795948, "step": 108 }, { "epoch": 0.013625, "grad_norm": 3.3908517360687256, "grad_norm_var": 0.2499864352593607, "learning_rate": 0.0001, "loss": 1.0747, "loss/crossentropy": 2.6003377437591553, "loss/hidden": 0.87109375, "loss/logits": 0.14213082194328308, "loss/reg": 0.00614393362775445, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.7455620765686035, "grad_norm_var": 0.2035723185991922, "learning_rate": 0.0001, "loss": 1.1844, "loss/crossentropy": 2.446432113647461, "loss/hidden": 0.94921875, "loss/logits": 0.17372827231884003, "loss/reg": 0.00614282488822937, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.899392604827881, "grad_norm_var": 0.1972949454934593, "learning_rate": 0.0001, "loss": 1.0314, "loss/crossentropy": 2.4233920574188232, "loss/hidden": 0.83984375, "loss/logits": 0.13018067181110382, "loss/reg": 0.00614172825589776, "step": 111 }, { "epoch": 0.014, "grad_norm": 2.204866647720337, "grad_norm_var": 0.20749751086427656, "learning_rate": 0.0001, "loss": 0.9867, "loss/crossentropy": 2.4006736278533936, "loss/hidden": 0.79296875, "loss/logits": 0.13233302533626556, "loss/reg": 0.006140332669019699, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.5094263553619385, "grad_norm_var": 0.20123279411857975, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.2730560302734375, "loss/hidden": 1.0078125, "loss/logits": 0.1737476885318756, "loss/reg": 0.006138913799077272, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.590543031692505, "grad_norm_var": 0.17204464736018749, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.5709896087646484, "loss/hidden": 0.79296875, "loss/logits": 0.1542350947856903, "loss/reg": 0.0061377594247460365, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.5024876594543457, "grad_norm_var": 0.17379926494707643, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.539165496826172, "loss/hidden": 0.828125, "loss/logits": 0.14142319560050964, "loss/reg": 0.006136584095656872, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.2216732501983643, "grad_norm_var": 0.18121036366206128, "learning_rate": 0.0001, "loss": 0.9404, "loss/crossentropy": 2.7685325145721436, "loss/hidden": 0.765625, "loss/logits": 0.1133967787027359, "loss/reg": 0.006135319825261831, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.3834009170532227, "grad_norm_var": 0.18346146088524526, "learning_rate": 0.0001, "loss": 1.1432, "loss/crossentropy": 2.4507999420166016, "loss/hidden": 0.92578125, "loss/logits": 0.1561031937599182, "loss/reg": 0.006133983377367258, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.4703636169433594, "grad_norm_var": 0.17984383474256424, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.3506076335906982, "loss/hidden": 0.84765625, "loss/logits": 0.14511807262897491, "loss/reg": 0.006132753100246191, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.5960817337036133, "grad_norm_var": 0.13859654880591943, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.427006244659424, "loss/hidden": 0.96875, "loss/logits": 0.1855170726776123, "loss/reg": 0.006131566129624844, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.908734083175659, "grad_norm_var": 0.13379147574996655, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.4075210094451904, "loss/hidden": 0.81640625, "loss/logits": 0.13592825829982758, "loss/reg": 0.006130332592874765, "step": 120 }, { "epoch": 0.015125, "grad_norm": 3.450002670288086, "grad_norm_var": 0.147717685364636, "learning_rate": 0.0001, "loss": 1.1584, "loss/crossentropy": 2.446925640106201, "loss/hidden": 0.92578125, "loss/logits": 0.17129938304424286, "loss/reg": 0.0061291721649467945, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.941195011138916, "grad_norm_var": 0.14212594790061886, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.5499086380004883, "loss/hidden": 0.87109375, "loss/logits": 0.1672220528125763, "loss/reg": 0.006127914879471064, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.951799154281616, "grad_norm_var": 0.14330143067309015, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.654383420944214, "loss/hidden": 0.87109375, "loss/logits": 0.15379250049591064, "loss/reg": 0.006126696243882179, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.5093131065368652, "grad_norm_var": 0.13194533540905293, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.4646618366241455, "loss/hidden": 0.87890625, "loss/logits": 0.15029752254486084, "loss/reg": 0.006125394720584154, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.357142448425293, "grad_norm_var": 0.11277765633995311, "learning_rate": 0.0001, "loss": 1.0794, "loss/crossentropy": 2.4590322971343994, "loss/hidden": 0.87109375, "loss/logits": 0.1471107453107834, "loss/reg": 0.0061240773648023605, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.0443954467773438, "grad_norm_var": 0.13949059079901172, "learning_rate": 0.0001, "loss": 1.0064, "loss/crossentropy": 2.6105568408966064, "loss/hidden": 0.80859375, "loss/logits": 0.13658249378204346, "loss/reg": 0.006122750695794821, "step": 126 }, { "epoch": 0.015875, "grad_norm": 2.334003448486328, "grad_norm_var": 0.1413326038540049, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.3226428031921387, "loss/hidden": 0.8984375, "loss/logits": 0.16836631298065186, "loss/reg": 0.006121381651610136, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.6693766117095947, "grad_norm_var": 0.12889249481462456, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.5844597816467285, "loss/hidden": 0.84765625, "loss/logits": 0.1388963758945465, "loss/reg": 0.006120136007666588, "step": 128 }, { "epoch": 0.016125, "grad_norm": 3.935439348220825, "grad_norm_var": 0.22878447427120438, "learning_rate": 0.0001, "loss": 1.1726, "loss/crossentropy": 2.7213780879974365, "loss/hidden": 0.9375, "loss/logits": 0.1738772690296173, "loss/reg": 0.006118897348642349, "step": 129 }, { "epoch": 0.01625, "grad_norm": 3.463432788848877, "grad_norm_var": 0.25882213944617144, "learning_rate": 0.0001, "loss": 1.0898, "loss/crossentropy": 2.3635873794555664, "loss/hidden": 0.8828125, "loss/logits": 0.1457763910293579, "loss/reg": 0.006117486394941807, "step": 130 }, { "epoch": 0.016375, "grad_norm": 3.779526948928833, "grad_norm_var": 0.31074183113488135, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.316762924194336, "loss/hidden": 0.98046875, "loss/logits": 0.16614478826522827, "loss/reg": 0.006116243079304695, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.7554008960723877, "grad_norm_var": 0.3028391023812749, "learning_rate": 0.0001, "loss": 0.9769, "loss/crossentropy": 2.458954095840454, "loss/hidden": 0.7890625, "loss/logits": 0.12667913734912872, "loss/reg": 0.006114880088716745, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.342526435852051, "grad_norm_var": 0.30546929082944035, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.6329517364501953, "loss/hidden": 0.890625, "loss/logits": 0.161947563290596, "loss/reg": 0.0061136274598538876, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.2754058837890625, "grad_norm_var": 0.31756495416411024, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.2747550010681152, "loss/hidden": 0.94921875, "loss/logits": 0.15994513034820557, "loss/reg": 0.006112351547926664, "step": 134 }, { "epoch": 0.016875, "grad_norm": 3.1313912868499756, "grad_norm_var": 0.3186282278045513, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.4932894706726074, "loss/hidden": 0.99609375, "loss/logits": 0.17612434923648834, "loss/reg": 0.006111042574048042, "step": 135 }, { "epoch": 0.017, "grad_norm": 3.960482358932495, "grad_norm_var": 0.39381746513703864, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.581660747528076, "loss/hidden": 1.0625, "loss/logits": 0.18646802008152008, "loss/reg": 0.006109676789492369, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.7605810165405273, "grad_norm_var": 0.37584340109069647, "learning_rate": 0.0001, "loss": 0.8792, "loss/crossentropy": 2.6490936279296875, "loss/hidden": 0.703125, "loss/logits": 0.1150316372513771, "loss/reg": 0.006108277477324009, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.6196203231811523, "grad_norm_var": 0.38003486499210315, "learning_rate": 0.0001, "loss": 0.955, "loss/crossentropy": 2.633441209793091, "loss/hidden": 0.76953125, "loss/logits": 0.1244344562292099, "loss/reg": 0.006106934975832701, "step": 138 }, { "epoch": 0.017375, "grad_norm": 4.534512519836426, "grad_norm_var": 0.554255985026353, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.2204151153564453, "loss/hidden": 1.1796875, "loss/logits": 0.1696874350309372, "loss/reg": 0.0061056241393089294, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.192370653152466, "grad_norm_var": 0.5798771099829023, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.375506639480591, "loss/hidden": 0.921875, "loss/logits": 0.14694982767105103, "loss/reg": 0.0061043244786560535, "step": 140 }, { "epoch": 0.017625, "grad_norm": 4.368403911590576, "grad_norm_var": 0.6744588881998081, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.3692545890808105, "loss/hidden": 1.03125, "loss/logits": 0.18568292260169983, "loss/reg": 0.006102937273681164, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.2753779888153076, "grad_norm_var": 0.6461169960118004, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.470676898956299, "loss/hidden": 0.82421875, "loss/logits": 0.14231771230697632, "loss/reg": 0.006101653911173344, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.6550562381744385, "grad_norm_var": 0.6203099666067883, "learning_rate": 0.0001, "loss": 0.8712, "loss/crossentropy": 2.8198063373565674, "loss/hidden": 0.69921875, "loss/logits": 0.11099085956811905, "loss/reg": 0.006100376136600971, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.8701858520507812, "grad_norm_var": 0.6111015072729884, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.413463830947876, "loss/hidden": 0.96484375, "loss/logits": 0.15351834893226624, "loss/reg": 0.006099053658545017, "step": 144 }, { "epoch": 0.018125, "grad_norm": 2.2347958087921143, "grad_norm_var": 0.6069563505613275, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.446056604385376, "loss/hidden": 0.8671875, "loss/logits": 0.1550455242395401, "loss/reg": 0.006097796373069286, "step": 145 }, { "epoch": 0.01825, "grad_norm": 2.60143780708313, "grad_norm_var": 0.6017061449507364, "learning_rate": 0.0001, "loss": 1.1216, "loss/crossentropy": 2.2890260219573975, "loss/hidden": 0.8984375, "loss/logits": 0.16223573684692383, "loss/reg": 0.006096460856497288, "step": 146 }, { "epoch": 0.018375, "grad_norm": 3.656100273132324, "grad_norm_var": 0.5891684064627459, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.2077646255493164, "loss/hidden": 1.0546875, "loss/logits": 0.16024138033390045, "loss/reg": 0.006095105782151222, "step": 147 }, { "epoch": 0.0185, "grad_norm": 2.8190999031066895, "grad_norm_var": 0.5877513730221795, "learning_rate": 0.0001, "loss": 1.1416, "loss/crossentropy": 2.4892842769622803, "loss/hidden": 0.9140625, "loss/logits": 0.1665700376033783, "loss/reg": 0.0060938019305467606, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.6578848361968994, "grad_norm_var": 0.568168306773175, "learning_rate": 0.0001, "loss": 1.1443, "loss/crossentropy": 2.3138527870178223, "loss/hidden": 0.93359375, "loss/logits": 0.14977282285690308, "loss/reg": 0.006092346739023924, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.656559944152832, "grad_norm_var": 0.5416540961853636, "learning_rate": 0.0001, "loss": 0.9868, "loss/crossentropy": 2.7701377868652344, "loss/hidden": 0.796875, "loss/logits": 0.12901648879051208, "loss/reg": 0.006090943701565266, "step": 150 }, { "epoch": 0.018875, "grad_norm": 1.9359983205795288, "grad_norm_var": 0.6099613145708634, "learning_rate": 0.0001, "loss": 0.9127, "loss/crossentropy": 2.55560040473938, "loss/hidden": 0.73828125, "loss/logits": 0.11351295560598373, "loss/reg": 0.00608965614810586, "step": 151 }, { "epoch": 0.019, "grad_norm": 3.7978732585906982, "grad_norm_var": 0.5891613317586338, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.4227731227874756, "loss/hidden": 0.98828125, "loss/logits": 0.17836451530456543, "loss/reg": 0.006088252179324627, "step": 152 }, { "epoch": 0.019125, "grad_norm": 2.8193647861480713, "grad_norm_var": 0.588169020521083, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.474368095397949, "loss/hidden": 0.80078125, "loss/logits": 0.11225409805774689, "loss/reg": 0.006086937617510557, "step": 153 }, { "epoch": 0.01925, "grad_norm": 2.2882325649261475, "grad_norm_var": 0.6082348956957436, "learning_rate": 0.0001, "loss": 1.0395, "loss/crossentropy": 2.3776350021362305, "loss/hidden": 0.82421875, "loss/logits": 0.15443992614746094, "loss/reg": 0.0060854703187942505, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.006150245666504, "grad_norm_var": 0.4559805309993303, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.7556076049804688, "loss/hidden": 0.78515625, "loss/logits": 0.13019207119941711, "loss/reg": 0.006084186024963856, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.8143231868743896, "grad_norm_var": 0.43477030174237014, "learning_rate": 0.0001, "loss": 1.1927, "loss/crossentropy": 2.652045249938965, "loss/hidden": 0.94140625, "loss/logits": 0.19042611122131348, "loss/reg": 0.00608274107798934, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.957540988922119, "grad_norm_var": 0.2601037584282233, "learning_rate": 0.0001, "loss": 1.0641, "loss/crossentropy": 2.546213150024414, "loss/hidden": 0.86328125, "loss/logits": 0.14000022411346436, "loss/reg": 0.006081291940063238, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.625493288040161, "grad_norm_var": 0.24839219907499052, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.5120432376861572, "loss/hidden": 0.81640625, "loss/logits": 0.13474689424037933, "loss/reg": 0.006079958751797676, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.6614878177642822, "grad_norm_var": 0.2483457330217589, "learning_rate": 0.0001, "loss": 0.9873, "loss/crossentropy": 2.312061071395874, "loss/hidden": 0.80859375, "loss/logits": 0.11790065467357635, "loss/reg": 0.006078665144741535, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.6204919815063477, "grad_norm_var": 0.24699792562249925, "learning_rate": 0.0001, "loss": 1.0488, "loss/crossentropy": 2.505072593688965, "loss/hidden": 0.84375, "loss/logits": 0.14428117871284485, "loss/reg": 0.006077310536056757, "step": 160 }, { "epoch": 0.020125, "grad_norm": 3.107072591781616, "grad_norm_var": 0.24079003208151678, "learning_rate": 0.0001, "loss": 1.1736, "loss/crossentropy": 2.6514599323272705, "loss/hidden": 0.96484375, "loss/logits": 0.1480400413274765, "loss/reg": 0.006076075602322817, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.669001817703247, "grad_norm_var": 0.23972287159530806, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.4616479873657227, "loss/hidden": 0.9765625, "loss/logits": 0.15933012962341309, "loss/reg": 0.006074720993638039, "step": 162 }, { "epoch": 0.020375, "grad_norm": 2.5872421264648438, "grad_norm_var": 0.1828196031273113, "learning_rate": 0.0001, "loss": 1.0551, "loss/crossentropy": 2.5483999252319336, "loss/hidden": 0.83984375, "loss/logits": 0.1544739305973053, "loss/reg": 0.006073469761759043, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.3342509269714355, "grad_norm_var": 0.1891007671877621, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.610344171524048, "loss/hidden": 0.90234375, "loss/logits": 0.17876723408699036, "loss/reg": 0.006072178483009338, "step": 164 }, { "epoch": 0.020625, "grad_norm": 2.548274278640747, "grad_norm_var": 0.18986337395058156, "learning_rate": 0.0001, "loss": 0.9512, "loss/crossentropy": 2.747725009918213, "loss/hidden": 0.7734375, "loss/logits": 0.11706214398145676, "loss/reg": 0.00607073912397027, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.666066884994507, "grad_norm_var": 0.18987501227134793, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.3086578845977783, "loss/hidden": 0.83984375, "loss/logits": 0.1551416665315628, "loss/reg": 0.006069260183721781, "step": 166 }, { "epoch": 0.020875, "grad_norm": 3.363084554672241, "grad_norm_var": 0.18083982986582872, "learning_rate": 0.0001, "loss": 0.9886, "loss/crossentropy": 2.7422661781311035, "loss/hidden": 0.79296875, "loss/logits": 0.13497118651866913, "loss/reg": 0.006067754700779915, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.717400550842285, "grad_norm_var": 0.10163689874761227, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.341296672821045, "loss/hidden": 1.0078125, "loss/logits": 0.17277640104293823, "loss/reg": 0.006066245958209038, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.2773897647857666, "grad_norm_var": 0.10949759007257095, "learning_rate": 0.0001, "loss": 0.9531, "loss/crossentropy": 2.492532968521118, "loss/hidden": 0.76953125, "loss/logits": 0.12295819818973541, "loss/reg": 0.006064848508685827, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.7625067234039307, "grad_norm_var": 0.1012976809853086, "learning_rate": 0.0001, "loss": 1.0102, "loss/crossentropy": 2.3799381256103516, "loss/hidden": 0.80859375, "loss/logits": 0.140989288687706, "loss/reg": 0.0060633583925664425, "step": 170 }, { "epoch": 0.021375, "grad_norm": 3.713162899017334, "grad_norm_var": 0.1323542313667114, "learning_rate": 0.0001, "loss": 1.0173, "loss/crossentropy": 2.7296385765075684, "loss/hidden": 0.80078125, "loss/logits": 0.1559314727783203, "loss/reg": 0.006062004715204239, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.8448026180267334, "grad_norm_var": 0.13256580340874963, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.211848497390747, "loss/hidden": 0.87890625, "loss/logits": 0.15503031015396118, "loss/reg": 0.006060663145035505, "step": 172 }, { "epoch": 0.021625, "grad_norm": 2.951566696166992, "grad_norm_var": 0.13242537871232402, "learning_rate": 0.0001, "loss": 1.243, "loss/crossentropy": 2.6379833221435547, "loss/hidden": 0.96484375, "loss/logits": 0.21754613518714905, "loss/reg": 0.00605935649946332, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.6862404346466064, "grad_norm_var": 0.13142011502921586, "learning_rate": 0.0001, "loss": 1.0053, "loss/crossentropy": 2.3807766437530518, "loss/hidden": 0.80078125, "loss/logits": 0.14393460750579834, "loss/reg": 0.006058130878955126, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.5145609378814697, "grad_norm_var": 0.13512780159794507, "learning_rate": 0.0001, "loss": 1.0609, "loss/crossentropy": 2.4608380794525146, "loss/hidden": 0.85546875, "loss/logits": 0.14485566318035126, "loss/reg": 0.006056922487914562, "step": 175 }, { "epoch": 0.022, "grad_norm": 3.23178768157959, "grad_norm_var": 0.14607750168249728, "learning_rate": 0.0001, "loss": 1.1294, "loss/crossentropy": 2.9791719913482666, "loss/hidden": 0.91796875, "loss/logits": 0.1508345603942871, "loss/reg": 0.006055623292922974, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.7397234439849854, "grad_norm_var": 0.14000512423072375, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.4559919834136963, "loss/hidden": 0.86328125, "loss/logits": 0.1340080350637436, "loss/reg": 0.0060544307343661785, "step": 177 }, { "epoch": 0.02225, "grad_norm": 2.6637048721313477, "grad_norm_var": 0.14009088002925954, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.3794586658477783, "loss/hidden": 0.86328125, "loss/logits": 0.15214313566684723, "loss/reg": 0.0060530174523591995, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.0105221271514893, "grad_norm_var": 0.17628626628935157, "learning_rate": 0.0001, "loss": 0.9703, "loss/crossentropy": 2.3926336765289307, "loss/hidden": 0.77734375, "loss/logits": 0.13244566321372986, "loss/reg": 0.0060517978854477406, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.571902275085449, "grad_norm_var": 0.16659277386996318, "learning_rate": 0.0001, "loss": 1.0739, "loss/crossentropy": 2.7502923011779785, "loss/hidden": 0.8515625, "loss/logits": 0.16181406378746033, "loss/reg": 0.006050686351954937, "step": 180 }, { "epoch": 0.022625, "grad_norm": 2.700366973876953, "grad_norm_var": 0.1636147823311904, "learning_rate": 0.0001, "loss": 1.0113, "loss/crossentropy": 2.502389669418335, "loss/hidden": 0.8125, "loss/logits": 0.138347327709198, "loss/reg": 0.006049246061593294, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.7259435653686523, "grad_norm_var": 0.1629618050893432, "learning_rate": 0.0001, "loss": 1.0192, "loss/crossentropy": 2.2493560314178467, "loss/hidden": 0.82421875, "loss/logits": 0.1344609260559082, "loss/reg": 0.006048021838068962, "step": 182 }, { "epoch": 0.022875, "grad_norm": 4.930091857910156, "grad_norm_var": 0.43832731745023895, "learning_rate": 0.0001, "loss": 1.1874, "loss/crossentropy": 2.649231433868408, "loss/hidden": 0.94140625, "loss/logits": 0.1855432242155075, "loss/reg": 0.006046844646334648, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.288604259490967, "grad_norm_var": 0.4589782783160859, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 3.0482568740844727, "loss/hidden": 0.8203125, "loss/logits": 0.15461647510528564, "loss/reg": 0.006045445334166288, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.7902991771698, "grad_norm_var": 0.4362058684835667, "learning_rate": 0.0001, "loss": 1.0744, "loss/crossentropy": 2.726069211959839, "loss/hidden": 0.8359375, "loss/logits": 0.17799492180347443, "loss/reg": 0.006044231820851564, "step": 185 }, { "epoch": 0.02325, "grad_norm": 3.597017526626587, "grad_norm_var": 0.46633972017124825, "learning_rate": 0.0001, "loss": 1.0985, "loss/crossentropy": 2.200692892074585, "loss/hidden": 0.8984375, "loss/logits": 0.13961729407310486, "loss/reg": 0.006042772904038429, "step": 186 }, { "epoch": 0.023375, "grad_norm": 2.969062566757202, "grad_norm_var": 0.42374272593361867, "learning_rate": 0.0001, "loss": 1.2314, "loss/crossentropy": 2.3744540214538574, "loss/hidden": 0.96875, "loss/logits": 0.20225511491298676, "loss/reg": 0.006041594315320253, "step": 187 }, { "epoch": 0.0235, "grad_norm": 3.2257020473480225, "grad_norm_var": 0.4305906329857976, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.442505121231079, "loss/hidden": 0.875, "loss/logits": 0.16284233331680298, "loss/reg": 0.006040407810360193, "step": 188 }, { "epoch": 0.023625, "grad_norm": 3.670443058013916, "grad_norm_var": 0.4666515285365591, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.533158540725708, "loss/hidden": 0.98046875, "loss/logits": 0.19827201962471008, "loss/reg": 0.0060392809100449085, "step": 189 }, { "epoch": 0.02375, "grad_norm": 7.53206729888916, "grad_norm_var": 1.7591779439754056, "learning_rate": 0.0001, "loss": 1.1689, "loss/crossentropy": 2.3104734420776367, "loss/hidden": 0.96875, "loss/logits": 0.13976144790649414, "loss/reg": 0.006038178689777851, "step": 190 }, { "epoch": 0.023875, "grad_norm": 4.658889293670654, "grad_norm_var": 1.833400975261701, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.286229133605957, "loss/hidden": 1.1015625, "loss/logits": 0.16465552151203156, "loss/reg": 0.006036726757884026, "step": 191 }, { "epoch": 0.024, "grad_norm": 3.2109904289245605, "grad_norm_var": 1.8338781863373583, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.5849151611328125, "loss/hidden": 1.0078125, "loss/logits": 0.20983844995498657, "loss/reg": 0.006035543512552977, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.556408643722534, "grad_norm_var": 1.8519417466969637, "learning_rate": 0.0001, "loss": 1.0335, "loss/crossentropy": 2.635669231414795, "loss/hidden": 0.8359375, "loss/logits": 0.13721294701099396, "loss/reg": 0.006034051068127155, "step": 193 }, { "epoch": 0.02425, "grad_norm": 3.4185855388641357, "grad_norm_var": 1.8153229069184569, "learning_rate": 0.0001, "loss": 1.0115, "loss/crossentropy": 2.3127341270446777, "loss/hidden": 0.828125, "loss/logits": 0.12303752452135086, "loss/reg": 0.00603274954482913, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.639681816101074, "grad_norm_var": 1.6731808292397734, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.4363749027252197, "loss/hidden": 0.98046875, "loss/logits": 0.19659578800201416, "loss/reg": 0.006031363736838102, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.266385078430176, "grad_norm_var": 1.614572274352353, "learning_rate": 0.0001, "loss": 1.19, "loss/crossentropy": 2.2824337482452393, "loss/hidden": 0.9609375, "loss/logits": 0.16878634691238403, "loss/reg": 0.006029782351106405, "step": 196 }, { "epoch": 0.024625, "grad_norm": 3.0692105293273926, "grad_norm_var": 1.5801212385016838, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.518056631088257, "loss/hidden": 0.921875, "loss/logits": 0.16731634736061096, "loss/reg": 0.006028252653777599, "step": 197 }, { "epoch": 0.02475, "grad_norm": 3.390202283859253, "grad_norm_var": 1.530565626963321, "learning_rate": 0.0001, "loss": 1.1783, "loss/crossentropy": 2.3565316200256348, "loss/hidden": 0.9375, "loss/logits": 0.18055224418640137, "loss/reg": 0.006026738323271275, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.524461030960083, "grad_norm_var": 1.4779304822181976, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.3489255905151367, "loss/hidden": 0.88671875, "loss/logits": 0.1480264812707901, "loss/reg": 0.006025230046361685, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.8753433227539062, "grad_norm_var": 1.4056158732497617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.379971504211426, "loss/hidden": 0.90625, "loss/logits": 0.17312359809875488, "loss/reg": 0.0060236188583076, "step": 200 }, { "epoch": 0.025125, "grad_norm": 2.2297983169555664, "grad_norm_var": 1.4801331513155804, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.401499032974243, "loss/hidden": 0.9296875, "loss/logits": 0.1743072271347046, "loss/reg": 0.006021994166076183, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.7430193424224854, "grad_norm_var": 1.5134885749372204, "learning_rate": 0.0001, "loss": 1.3503, "loss/crossentropy": 2.3397345542907715, "loss/hidden": 1.09375, "loss/logits": 0.1963859498500824, "loss/reg": 0.006020485423505306, "step": 202 }, { "epoch": 0.025375, "grad_norm": 3.3862688541412354, "grad_norm_var": 1.4983780502999742, "learning_rate": 0.0001, "loss": 1.3154, "loss/crossentropy": 2.3259048461914062, "loss/hidden": 1.09375, "loss/logits": 0.1614416241645813, "loss/reg": 0.0060190120711922646, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.554938316345215, "grad_norm_var": 1.547662147741073, "learning_rate": 0.0001, "loss": 1.1147, "loss/crossentropy": 2.559544801712036, "loss/hidden": 0.890625, "loss/logits": 0.16388913989067078, "loss/reg": 0.006017730105668306, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.6290361881256104, "grad_norm_var": 1.5807281675134672, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.7080090045928955, "loss/hidden": 0.828125, "loss/logits": 0.16068041324615479, "loss/reg": 0.006016433704644442, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.234259605407715, "grad_norm_var": 0.38456120947827777, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.3816347122192383, "loss/hidden": 0.8359375, "loss/logits": 0.14315146207809448, "loss/reg": 0.0060149249620735645, "step": 206 }, { "epoch": 0.025875, "grad_norm": 2.810352325439453, "grad_norm_var": 0.19522907990381644, "learning_rate": 0.0001, "loss": 1.1385, "loss/crossentropy": 2.6245384216308594, "loss/hidden": 0.90625, "loss/logits": 0.17206540703773499, "loss/reg": 0.006013684440404177, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.198707342147827, "grad_norm_var": 0.21847125065788287, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.3812787532806396, "loss/hidden": 0.796875, "loss/logits": 0.119233138859272, "loss/reg": 0.006012204568833113, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.5001378059387207, "grad_norm_var": 0.22083751043745087, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5999109745025635, "loss/hidden": 0.984375, "loss/logits": 0.20815744996070862, "loss/reg": 0.006010920740664005, "step": 209 }, { "epoch": 0.02625, "grad_norm": 3.175185203552246, "grad_norm_var": 0.20582482438127556, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.3893682956695557, "loss/hidden": 1.0234375, "loss/logits": 0.15550163388252258, "loss/reg": 0.006009369157254696, "step": 210 }, { "epoch": 0.026375, "grad_norm": 3.482342481613159, "grad_norm_var": 0.19031657232839597, "learning_rate": 0.0001, "loss": 1.1572, "loss/crossentropy": 2.382542848587036, "loss/hidden": 0.94921875, "loss/logits": 0.14788678288459778, "loss/reg": 0.006007815711200237, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.285135507583618, "grad_norm_var": 0.19168098803167197, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.552724838256836, "loss/hidden": 0.78125, "loss/logits": 0.1254206746816635, "loss/reg": 0.006006232462823391, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.991971969604492, "grad_norm_var": 0.1888233667670041, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.472437620162964, "loss/hidden": 0.9296875, "loss/logits": 0.15750399231910706, "loss/reg": 0.0060045006684958935, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.3775179386138916, "grad_norm_var": 0.1665701003974154, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.294337749481201, "loss/hidden": 0.95703125, "loss/logits": 0.17671090364456177, "loss/reg": 0.006002978887408972, "step": 214 }, { "epoch": 0.026875, "grad_norm": 2.2992701530456543, "grad_norm_var": 0.17463199132661936, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.3843300342559814, "loss/hidden": 0.9609375, "loss/logits": 0.18876615166664124, "loss/reg": 0.006001432426273823, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.4926228523254395, "grad_norm_var": 0.17347807328228151, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.326836585998535, "loss/hidden": 1.0625, "loss/logits": 0.19308596849441528, "loss/reg": 0.005999880842864513, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.552459478378296, "grad_norm_var": 0.16193263198218044, "learning_rate": 0.0001, "loss": 1.1424, "loss/crossentropy": 2.6629388332366943, "loss/hidden": 0.91015625, "loss/logits": 0.1722826063632965, "loss/reg": 0.005998372100293636, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.866387128829956, "grad_norm_var": 0.16409192036900605, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.8154890537261963, "loss/hidden": 0.80078125, "loss/logits": 0.15349115431308746, "loss/reg": 0.005996840540319681, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.77524471282959, "grad_norm_var": 0.12966566207502767, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.4509928226470947, "loss/hidden": 1.1015625, "loss/logits": 0.249616801738739, "loss/reg": 0.005995343904942274, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.887923240661621, "grad_norm_var": 0.13285907347625023, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.4280507564544678, "loss/hidden": 1.0234375, "loss/logits": 0.20519307255744934, "loss/reg": 0.005993579979985952, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.5383920669555664, "grad_norm_var": 0.1337457284607846, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.0803585052490234, "loss/hidden": 1.09375, "loss/logits": 0.17551109194755554, "loss/reg": 0.005991705227643251, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.639490842819214, "grad_norm_var": 0.12131687494494538, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.293325901031494, "loss/hidden": 0.8515625, "loss/logits": 0.14782238006591797, "loss/reg": 0.005989882629364729, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.4396984577178955, "grad_norm_var": 0.12344012810124999, "learning_rate": 0.0001, "loss": 1.0587, "loss/crossentropy": 2.7268667221069336, "loss/hidden": 0.84765625, "loss/logits": 0.15114662051200867, "loss/reg": 0.0059883627109229565, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.227886438369751, "grad_norm_var": 0.12171264621671582, "learning_rate": 0.0001, "loss": 1.0087, "loss/crossentropy": 2.4431943893432617, "loss/hidden": 0.81640625, "loss/logits": 0.13243696093559265, "loss/reg": 0.005986812058836222, "step": 224 }, { "epoch": 0.028125, "grad_norm": 3.690627098083496, "grad_norm_var": 0.18519755428341872, "learning_rate": 0.0001, "loss": 1.0732, "loss/crossentropy": 2.4630942344665527, "loss/hidden": 0.875, "loss/logits": 0.13830721378326416, "loss/reg": 0.005985158029943705, "step": 225 }, { "epoch": 0.02825, "grad_norm": 3.377890110015869, "grad_norm_var": 0.19972658805784155, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.2899203300476074, "loss/hidden": 0.9609375, "loss/logits": 0.16401749849319458, "loss/reg": 0.005983633920550346, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.7600386142730713, "grad_norm_var": 0.16135214723361363, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.8077659606933594, "loss/hidden": 0.8203125, "loss/logits": 0.14218226075172424, "loss/reg": 0.005982026923447847, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.3397345542907715, "grad_norm_var": 0.15851713921701366, "learning_rate": 0.0001, "loss": 1.077, "loss/crossentropy": 2.438030958175659, "loss/hidden": 0.875, "loss/logits": 0.14217695593833923, "loss/reg": 0.005980519577860832, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.744401216506958, "grad_norm_var": 0.15282793193407448, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.557457447052002, "loss/hidden": 0.97265625, "loss/logits": 0.16425767540931702, "loss/reg": 0.005979116074740887, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.4241418838500977, "grad_norm_var": 0.15103305834679168, "learning_rate": 0.0001, "loss": 1.0402, "loss/crossentropy": 2.743885040283203, "loss/hidden": 0.828125, "loss/logits": 0.15231972932815552, "loss/reg": 0.005977709777653217, "step": 230 }, { "epoch": 0.028875, "grad_norm": 2.0828442573547363, "grad_norm_var": 0.16526500993595217, "learning_rate": 0.0001, "loss": 0.9747, "loss/crossentropy": 2.719327688217163, "loss/hidden": 0.78125, "loss/logits": 0.133681058883667, "loss/reg": 0.005976095795631409, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.127495527267456, "grad_norm_var": 0.18259721536013085, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.8147058486938477, "loss/hidden": 0.85546875, "loss/logits": 0.14354225993156433, "loss/reg": 0.005974431522190571, "step": 232 }, { "epoch": 0.029125, "grad_norm": 4.263195991516113, "grad_norm_var": 0.34219781045772657, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.5414481163024902, "loss/hidden": 0.96484375, "loss/logits": 0.1478062868118286, "loss/reg": 0.005972826853394508, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.9974324703216553, "grad_norm_var": 0.34510225788824467, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.697648763656616, "loss/hidden": 1.0546875, "loss/logits": 0.20080995559692383, "loss/reg": 0.005971227772533894, "step": 234 }, { "epoch": 0.029375, "grad_norm": 3.4798855781555176, "grad_norm_var": 0.37664835069757197, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.3990559577941895, "loss/hidden": 0.95703125, "loss/logits": 0.19287389516830444, "loss/reg": 0.005969603545963764, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.43911075592041, "grad_norm_var": 0.3848032740432508, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 1.966374158859253, "loss/hidden": 0.875, "loss/logits": 0.13115233182907104, "loss/reg": 0.005967943929135799, "step": 236 }, { "epoch": 0.029625, "grad_norm": 3.7423646450042725, "grad_norm_var": 0.4356891905379257, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.718675374984741, "loss/hidden": 0.9921875, "loss/logits": 0.18789833784103394, "loss/reg": 0.00596608454361558, "step": 237 }, { "epoch": 0.02975, "grad_norm": 3.328033924102783, "grad_norm_var": 0.4449827328026664, "learning_rate": 0.0001, "loss": 1.5581, "loss/crossentropy": 2.272303819656372, "loss/hidden": 1.2421875, "loss/logits": 0.2562662661075592, "loss/reg": 0.005964066833257675, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.8761045932769775, "grad_norm_var": 0.42986649641521024, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.6973013877868652, "loss/hidden": 0.91796875, "loss/logits": 0.16159963607788086, "loss/reg": 0.005962541792541742, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.4458563327789307, "grad_norm_var": 0.4123921579785623, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.5731561183929443, "loss/hidden": 0.9375, "loss/logits": 0.18093177676200867, "loss/reg": 0.005961006972938776, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.4645614624023438, "grad_norm_var": 0.3844441578530656, "learning_rate": 0.0001, "loss": 1.0932, "loss/crossentropy": 2.648738145828247, "loss/hidden": 0.890625, "loss/logits": 0.14302745461463928, "loss/reg": 0.005959144793450832, "step": 241 }, { "epoch": 0.03025, "grad_norm": 3.0715034008026123, "grad_norm_var": 0.3694944025754277, "learning_rate": 0.0001, "loss": 1.1916, "loss/crossentropy": 2.4820139408111572, "loss/hidden": 0.94921875, "loss/logits": 0.18278783559799194, "loss/reg": 0.005957332905381918, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.479677677154541, "grad_norm_var": 0.37773887013444374, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.614309549331665, "loss/hidden": 0.87109375, "loss/logits": 0.14808647334575653, "loss/reg": 0.005955492611974478, "step": 243 }, { "epoch": 0.0305, "grad_norm": 3.0970399379730225, "grad_norm_var": 0.36391299171458796, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.2731809616088867, "loss/hidden": 0.95703125, "loss/logits": 0.18210504949092865, "loss/reg": 0.00595364673063159, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.388214588165283, "grad_norm_var": 0.37823356386532864, "learning_rate": 0.0001, "loss": 1.1283, "loss/crossentropy": 2.532259225845337, "loss/hidden": 0.91015625, "loss/logits": 0.15858401358127594, "loss/reg": 0.005952049978077412, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.97310733795166, "grad_norm_var": 0.36540629077152076, "learning_rate": 0.0001, "loss": 1.1177, "loss/crossentropy": 2.5206258296966553, "loss/hidden": 0.89453125, "loss/logits": 0.16365137696266174, "loss/reg": 0.005950110498815775, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.15498423576355, "grad_norm_var": 0.3579579158371985, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.478773832321167, "loss/hidden": 0.8828125, "loss/logits": 0.162343829870224, "loss/reg": 0.005948282778263092, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.3404128551483154, "grad_norm_var": 0.338987407645584, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.1949751377105713, "loss/hidden": 0.93359375, "loss/logits": 0.1624409407377243, "loss/reg": 0.005946675315499306, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.8813085556030273, "grad_norm_var": 0.20879640313171802, "learning_rate": 0.0001, "loss": 1.1599, "loss/crossentropy": 2.556128978729248, "loss/hidden": 0.9296875, "loss/logits": 0.1707805097103119, "loss/reg": 0.005944731179624796, "step": 249 }, { "epoch": 0.03125, "grad_norm": 3.309937000274658, "grad_norm_var": 0.22219010027481143, "learning_rate": 0.0001, "loss": 1.0939, "loss/crossentropy": 2.4590022563934326, "loss/hidden": 0.88671875, "loss/logits": 0.14774294197559357, "loss/reg": 0.005942681338638067, "step": 250 }, { "epoch": 0.031375, "grad_norm": 3.1676676273345947, "grad_norm_var": 0.201728293925846, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.419811487197876, "loss/hidden": 1.015625, "loss/logits": 0.24120670557022095, "loss/reg": 0.005940672475844622, "step": 251 }, { "epoch": 0.0315, "grad_norm": 2.6006832122802734, "grad_norm_var": 0.1951007002723287, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.170666456222534, "loss/hidden": 1.140625, "loss/logits": 0.19024603068828583, "loss/reg": 0.005938523914664984, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.4954755306243896, "grad_norm_var": 0.14101991304577552, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.262831449508667, "loss/hidden": 0.93359375, "loss/logits": 0.1535283327102661, "loss/reg": 0.00593681400641799, "step": 253 }, { "epoch": 0.03175, "grad_norm": 2.339406728744507, "grad_norm_var": 0.12652605714113535, "learning_rate": 0.0001, "loss": 0.984, "loss/crossentropy": 2.2793617248535156, "loss/hidden": 0.796875, "loss/logits": 0.12778240442276, "loss/reg": 0.005935273133218288, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.3391647338867188, "grad_norm_var": 0.131427049667937, "learning_rate": 0.0001, "loss": 1.0622, "loss/crossentropy": 2.4579379558563232, "loss/hidden": 0.83984375, "loss/logits": 0.16299216449260712, "loss/reg": 0.0059331608936190605, "step": 255 }, { "epoch": 0.032, "grad_norm": 2.3896231651306152, "grad_norm_var": 0.13322512800125588, "learning_rate": 0.0001, "loss": 1.057, "loss/crossentropy": 2.8022475242614746, "loss/hidden": 0.85546875, "loss/logits": 0.14219465851783752, "loss/reg": 0.005931555759161711, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.125249147415161, "grad_norm_var": 0.14907278605534582, "learning_rate": 0.0001, "loss": 1.0611, "loss/crossentropy": 2.33644700050354, "loss/hidden": 0.8515625, "loss/logits": 0.15020999312400818, "loss/reg": 0.005930029321461916, "step": 257 }, { "epoch": 0.03225, "grad_norm": 2.521933078765869, "grad_norm_var": 0.13593429417580463, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.512619733810425, "loss/hidden": 0.8203125, "loss/logits": 0.16396166384220123, "loss/reg": 0.00592817785218358, "step": 258 }, { "epoch": 0.032375, "grad_norm": 2.5966317653656006, "grad_norm_var": 0.13490910688263208, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.248013734817505, "loss/hidden": 0.91015625, "loss/logits": 0.16364812850952148, "loss/reg": 0.00592625979334116, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.2045137882232666, "grad_norm_var": 0.12644607438415487, "learning_rate": 0.0001, "loss": 1.0015, "loss/crossentropy": 2.3253698348999023, "loss/hidden": 0.796875, "loss/logits": 0.14540287852287292, "loss/reg": 0.005924653727561235, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.4450156688690186, "grad_norm_var": 0.1254090419850094, "learning_rate": 0.0001, "loss": 0.9932, "loss/crossentropy": 2.2374210357666016, "loss/hidden": 0.80078125, "loss/logits": 0.13316848874092102, "loss/reg": 0.005922792013734579, "step": 261 }, { "epoch": 0.03275, "grad_norm": 7.747511863708496, "grad_norm_var": 1.8160510254643325, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.8747429847717285, "loss/hidden": 1.0234375, "loss/logits": 0.17151576280593872, "loss/reg": 0.005921173375099897, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.1854233741760254, "grad_norm_var": 1.8132730792650582, "learning_rate": 0.0001, "loss": 1.0069, "loss/crossentropy": 2.4989960193634033, "loss/hidden": 0.8125, "loss/logits": 0.13518914580345154, "loss/reg": 0.005919379647821188, "step": 263 }, { "epoch": 0.033, "grad_norm": 3.5132219791412354, "grad_norm_var": 1.8186749991604263, "learning_rate": 0.0001, "loss": 1.054, "loss/crossentropy": 2.497178316116333, "loss/hidden": 0.84765625, "loss/logits": 0.1471494734287262, "loss/reg": 0.005917761009186506, "step": 264 }, { "epoch": 0.033125, "grad_norm": 4.302145481109619, "grad_norm_var": 1.9358282916849012, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.1725542545318604, "loss/hidden": 1.0859375, "loss/logits": 0.16722658276557922, "loss/reg": 0.0059160212986171246, "step": 265 }, { "epoch": 0.03325, "grad_norm": 2.3225510120391846, "grad_norm_var": 1.9582913809461102, "learning_rate": 0.0001, "loss": 1.0153, "loss/crossentropy": 2.6670029163360596, "loss/hidden": 0.80859375, "loss/logits": 0.1475904881954193, "loss/reg": 0.0059142098762094975, "step": 266 }, { "epoch": 0.033375, "grad_norm": 5.196990013122559, "grad_norm_var": 2.27294427304937, "learning_rate": 0.0001, "loss": 1.1665, "loss/crossentropy": 2.6792731285095215, "loss/hidden": 0.94140625, "loss/logits": 0.1659836769104004, "loss/reg": 0.00591221172362566, "step": 267 }, { "epoch": 0.0335, "grad_norm": 3.5144336223602295, "grad_norm_var": 2.26638445070385, "learning_rate": 0.0001, "loss": 1.2502, "loss/crossentropy": 2.2949023246765137, "loss/hidden": 1.0234375, "loss/logits": 0.1677004098892212, "loss/reg": 0.005910532083362341, "step": 268 }, { "epoch": 0.033625, "grad_norm": 2.861222267150879, "grad_norm_var": 2.2433162495019436, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.5955142974853516, "loss/hidden": 1.0703125, "loss/logits": 0.2013990730047226, "loss/reg": 0.005908492021262646, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.964390754699707, "grad_norm_var": 2.1991134738974947, "learning_rate": 0.0001, "loss": 1.0975, "loss/crossentropy": 2.483924150466919, "loss/hidden": 0.8828125, "loss/logits": 0.15562227368354797, "loss/reg": 0.005906403064727783, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.75604510307312, "grad_norm_var": 2.1620222961988325, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.39125394821167, "loss/hidden": 0.9765625, "loss/logits": 0.18403753638267517, "loss/reg": 0.00590470340102911, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.360309362411499, "grad_norm_var": 2.165352535939727, "learning_rate": 0.0001, "loss": 1.0194, "loss/crossentropy": 2.530670404434204, "loss/hidden": 0.8046875, "loss/logits": 0.15565866231918335, "loss/reg": 0.005902664735913277, "step": 272 }, { "epoch": 0.034125, "grad_norm": 2.496027946472168, "grad_norm_var": 2.1195219252368287, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.7535252571105957, "loss/hidden": 0.9609375, "loss/logits": 0.20284873247146606, "loss/reg": 0.005900639574974775, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.854250431060791, "grad_norm_var": 2.0941964139517344, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.134964942932129, "loss/hidden": 0.9296875, "loss/logits": 0.15002194046974182, "loss/reg": 0.005898929201066494, "step": 274 }, { "epoch": 0.034375, "grad_norm": 4.497798442840576, "grad_norm_var": 2.149396374832277, "learning_rate": 0.0001, "loss": 1.2312, "loss/crossentropy": 2.3270835876464844, "loss/hidden": 0.99609375, "loss/logits": 0.17617599666118622, "loss/reg": 0.0058972095139324665, "step": 275 }, { "epoch": 0.0345, "grad_norm": 2.321152448654175, "grad_norm_var": 2.1318278315927155, "learning_rate": 0.0001, "loss": 1.1523, "loss/crossentropy": 1.858445644378662, "loss/hidden": 0.94921875, "loss/logits": 0.14408603310585022, "loss/reg": 0.005895303096622229, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.4426257610321045, "grad_norm_var": 2.1321312734782243, "learning_rate": 0.0001, "loss": 1.0267, "loss/crossentropy": 2.4483628273010254, "loss/hidden": 0.82421875, "loss/logits": 0.1435263752937317, "loss/reg": 0.005893299821764231, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.144637107849121, "grad_norm_var": 0.843351985629086, "learning_rate": 0.0001, "loss": 1.0517, "loss/crossentropy": 2.237915277481079, "loss/hidden": 0.8515625, "loss/logits": 0.14119011163711548, "loss/reg": 0.005891298409551382, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.32000732421875, "grad_norm_var": 0.8290445100225684, "learning_rate": 0.0001, "loss": 1.0462, "loss/crossentropy": 2.6588850021362305, "loss/hidden": 0.83203125, "loss/logits": 0.1552983820438385, "loss/reg": 0.0058892290107905865, "step": 279 }, { "epoch": 0.035, "grad_norm": 3.3390939235687256, "grad_norm_var": 0.820283282746707, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.5243186950683594, "loss/hidden": 0.953125, "loss/logits": 0.1817275732755661, "loss/reg": 0.00588742271065712, "step": 280 }, { "epoch": 0.035125, "grad_norm": 3.1800894737243652, "grad_norm_var": 0.7106469411621028, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.556126832962036, "loss/hidden": 0.953125, "loss/logits": 0.18167603015899658, "loss/reg": 0.005885709077119827, "step": 281 }, { "epoch": 0.03525, "grad_norm": 4.466390132904053, "grad_norm_var": 0.8119073339313209, "learning_rate": 0.0001, "loss": 1.27, "loss/crossentropy": 2.5671539306640625, "loss/hidden": 0.984375, "loss/logits": 0.2267427146434784, "loss/reg": 0.0058837407268583775, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.2809953689575195, "grad_norm_var": 0.5074810718943117, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.1554338932037354, "loss/hidden": 0.9140625, "loss/logits": 0.1516391634941101, "loss/reg": 0.005881770513951778, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.9982316493988037, "grad_norm_var": 0.48786559613454966, "learning_rate": 0.0001, "loss": 1.1286, "loss/crossentropy": 2.6773006916046143, "loss/hidden": 0.90625, "loss/logits": 0.1635606288909912, "loss/reg": 0.005880062934011221, "step": 284 }, { "epoch": 0.035625, "grad_norm": 2.387657880783081, "grad_norm_var": 0.5078162485774572, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.4741320610046387, "loss/hidden": 0.8984375, "loss/logits": 0.1641697734594345, "loss/reg": 0.0058782072737813, "step": 285 }, { "epoch": 0.03575, "grad_norm": 271.6628112792969, "grad_norm_var": 4514.324895160767, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.5766143798828125, "loss/hidden": 1.375, "loss/logits": 0.1833469420671463, "loss/reg": 0.005876271054148674, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.545677900314331, "grad_norm_var": 4512.577903953303, "learning_rate": 0.0001, "loss": 1.1466, "loss/crossentropy": 2.5389881134033203, "loss/hidden": 0.88671875, "loss/logits": 0.20117658376693726, "loss/reg": 0.005874336697161198, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.9219233989715576, "grad_norm_var": 4511.294050983276, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.3270509243011475, "loss/hidden": 0.8828125, "loss/logits": 0.17058232426643372, "loss/reg": 0.005872361361980438, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.831878423690796, "grad_norm_var": 4510.526061571783, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.4853744506835938, "loss/hidden": 0.91796875, "loss/logits": 0.17128118872642517, "loss/reg": 0.005870639346539974, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.284134864807129, "grad_norm_var": 4511.83639181831, "learning_rate": 0.0001, "loss": 1.0599, "loss/crossentropy": 2.3107759952545166, "loss/hidden": 0.8515625, "loss/logits": 0.14969472587108612, "loss/reg": 0.005868903826922178, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.2008161544799805, "grad_norm_var": 4516.84932017332, "learning_rate": 0.0001, "loss": 1.0902, "loss/crossentropy": 2.4265358448028564, "loss/hidden": 0.86328125, "loss/logits": 0.1682073473930359, "loss/reg": 0.0058671231381595135, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.6285743713378906, "grad_norm_var": 4516.145108725088, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.372230291366577, "loss/hidden": 0.98046875, "loss/logits": 0.2102714478969574, "loss/reg": 0.005865375977009535, "step": 292 }, { "epoch": 0.036625, "grad_norm": 2.6784040927886963, "grad_norm_var": 4515.607170253259, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.6276440620422363, "loss/hidden": 0.875, "loss/logits": 0.14159329235553741, "loss/reg": 0.005863656289875507, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.6373047828674316, "grad_norm_var": 4514.470495103465, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.70892333984375, "loss/hidden": 0.9453125, "loss/logits": 0.16546514630317688, "loss/reg": 0.005862091202288866, "step": 294 }, { "epoch": 0.036875, "grad_norm": 2.384430170059204, "grad_norm_var": 4514.321377312488, "learning_rate": 0.0001, "loss": 1.2472, "loss/crossentropy": 2.1273090839385986, "loss/hidden": 1.0, "loss/logits": 0.18860690295696259, "loss/reg": 0.005860424134880304, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.5959692001342773, "grad_norm_var": 4515.978398966678, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.7293522357940674, "loss/hidden": 0.8203125, "loss/logits": 0.1587076485157013, "loss/reg": 0.0058588446117937565, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.2753238677978516, "grad_norm_var": 4518.0185669920775, "learning_rate": 0.0001, "loss": 1.0063, "loss/crossentropy": 2.4602949619293213, "loss/hidden": 0.8125, "loss/logits": 0.13525693118572235, "loss/reg": 0.005857320036739111, "step": 297 }, { "epoch": 0.03725, "grad_norm": 3.009300708770752, "grad_norm_var": 4521.093589717446, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.8883349895477295, "loss/hidden": 0.9921875, "loss/logits": 0.20657645165920258, "loss/reg": 0.005855792202055454, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.700221538543701, "grad_norm_var": 4522.372179334166, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.5446314811706543, "loss/hidden": 0.90234375, "loss/logits": 0.19479964673519135, "loss/reg": 0.005854278337210417, "step": 299 }, { "epoch": 0.0375, "grad_norm": 2.3786559104919434, "grad_norm_var": 4523.758055495688, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.469960927963257, "loss/hidden": 0.90234375, "loss/logits": 0.16156738996505737, "loss/reg": 0.00585273839533329, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.7032158374786377, "grad_norm_var": 4523.046593599144, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.7451162338256836, "loss/hidden": 0.94140625, "loss/logits": 0.19476984441280365, "loss/reg": 0.0058509958907961845, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.507664442062378, "grad_norm_var": 0.11250867537391755, "learning_rate": 0.0001, "loss": 0.9899, "loss/crossentropy": 2.53341007232666, "loss/hidden": 0.796875, "loss/logits": 0.1345081329345703, "loss/reg": 0.005849248263984919, "step": 302 }, { "epoch": 0.037875, "grad_norm": 3.027892589569092, "grad_norm_var": 0.06692647718721882, "learning_rate": 0.0001, "loss": 1.0973, "loss/crossentropy": 2.7899296283721924, "loss/hidden": 0.890625, "loss/logits": 0.1482122391462326, "loss/reg": 0.005847662687301636, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.1617183685302734, "grad_norm_var": 0.07146536810277529, "learning_rate": 0.0001, "loss": 0.969, "loss/crossentropy": 2.4700305461883545, "loss/hidden": 0.78125, "loss/logits": 0.12925508618354797, "loss/reg": 0.005846073850989342, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.3791332244873047, "grad_norm_var": 0.06803597239225306, "learning_rate": 0.0001, "loss": 1.1912, "loss/crossentropy": 2.4171202182769775, "loss/hidden": 0.9453125, "loss/logits": 0.18739524483680725, "loss/reg": 0.005844476167112589, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.7622976303100586, "grad_norm_var": 0.06636088237049004, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.5030367374420166, "loss/hidden": 0.8359375, "loss/logits": 0.18643516302108765, "loss/reg": 0.005842759273946285, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.4079246520996094, "grad_norm_var": 0.059000676657357566, "learning_rate": 0.0001, "loss": 1.0359, "loss/crossentropy": 2.381542682647705, "loss/hidden": 0.828125, "loss/logits": 0.1493588387966156, "loss/reg": 0.0058412267826497555, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.5356478691101074, "grad_norm_var": 0.058906038923372726, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4928808212280273, "loss/hidden": 0.875, "loss/logits": 0.15363982319831848, "loss/reg": 0.0058394852094352245, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.4036688804626465, "grad_norm_var": 0.0597099908353601, "learning_rate": 0.0001, "loss": 0.986, "loss/crossentropy": 2.5816946029663086, "loss/hidden": 0.7890625, "loss/logits": 0.13851355016231537, "loss/reg": 0.005837727338075638, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.630572557449341, "grad_norm_var": 0.05963840398777146, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.140015125274658, "loss/hidden": 0.828125, "loss/logits": 0.14680367708206177, "loss/reg": 0.005835913587361574, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.3641905784606934, "grad_norm_var": 0.06012154861927167, "learning_rate": 0.0001, "loss": 1.0947, "loss/crossentropy": 2.3300833702087402, "loss/hidden": 0.8828125, "loss/logits": 0.15358075499534607, "loss/reg": 0.005834224168211222, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.215728759765625, "grad_norm_var": 0.06696490679455162, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.4583277702331543, "loss/hidden": 0.9140625, "loss/logits": 0.1687404215335846, "loss/reg": 0.005832599475979805, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.8934550285339355, "grad_norm_var": 0.06994228066174794, "learning_rate": 0.0001, "loss": 1.2763, "loss/crossentropy": 2.409702777862549, "loss/hidden": 1.0390625, "loss/logits": 0.17889352142810822, "loss/reg": 0.005831001792103052, "step": 313 }, { "epoch": 0.03925, "grad_norm": 8.741681098937988, "grad_norm_var": 2.4613182467650705, "learning_rate": 0.0001, "loss": 1.1972, "loss/crossentropy": 2.3858492374420166, "loss/hidden": 0.96875, "loss/logits": 0.1701970100402832, "loss/reg": 0.005829236935824156, "step": 314 }, { "epoch": 0.039375, "grad_norm": 7.412417411804199, "grad_norm_var": 3.707354176329111, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.3804125785827637, "loss/hidden": 1.1015625, "loss/logits": 0.149795800447464, "loss/reg": 0.005827469285577536, "step": 315 }, { "epoch": 0.0395, "grad_norm": 3.1443870067596436, "grad_norm_var": 3.6580641482995806, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.481820583343506, "loss/hidden": 0.90234375, "loss/logits": 0.1759084165096283, "loss/reg": 0.005825776606798172, "step": 316 }, { "epoch": 0.039625, "grad_norm": 2.8567562103271484, "grad_norm_var": 3.6479706732170993, "learning_rate": 0.0001, "loss": 1.0023, "loss/crossentropy": 2.5141823291778564, "loss/hidden": 0.80078125, "loss/logits": 0.14331723749637604, "loss/reg": 0.005824015475809574, "step": 317 }, { "epoch": 0.03975, "grad_norm": 2.2817444801330566, "grad_norm_var": 3.674359828489624, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.184128999710083, "loss/hidden": 0.875, "loss/logits": 0.15605026483535767, "loss/reg": 0.00582248717546463, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.249969005584717, "grad_norm_var": 3.736641439481692, "learning_rate": 0.0001, "loss": 1.008, "loss/crossentropy": 2.768484354019165, "loss/hidden": 0.80078125, "loss/logits": 0.14897163212299347, "loss/reg": 0.00582079216837883, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.6358306407928467, "grad_norm_var": 3.684102068428194, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.301954507827759, "loss/hidden": 1.015625, "loss/logits": 0.24987459182739258, "loss/reg": 0.005819002632051706, "step": 320 }, { "epoch": 0.040125, "grad_norm": 2.353457450866699, "grad_norm_var": 3.6871065280104496, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.379765272140503, "loss/hidden": 0.89453125, "loss/logits": 0.15680107474327087, "loss/reg": 0.005817302968353033, "step": 321 }, { "epoch": 0.04025, "grad_norm": 2.4568967819213867, "grad_norm_var": 3.712514538750317, "learning_rate": 0.0001, "loss": 0.9706, "loss/crossentropy": 2.380795955657959, "loss/hidden": 0.77734375, "loss/logits": 0.13508911430835724, "loss/reg": 0.005815597716718912, "step": 322 }, { "epoch": 0.040375, "grad_norm": 3.207794189453125, "grad_norm_var": 3.6654654630236734, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 1.949703574180603, "loss/hidden": 1.1171875, "loss/logits": 0.19150257110595703, "loss/reg": 0.005813860800117254, "step": 323 }, { "epoch": 0.0405, "grad_norm": 3.156318187713623, "grad_norm_var": 3.6284383166396252, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.1970410346984863, "loss/hidden": 1.0, "loss/logits": 0.21606677770614624, "loss/reg": 0.005812041461467743, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.556889533996582, "grad_norm_var": 3.611332493108523, "learning_rate": 0.0001, "loss": 0.9529, "loss/crossentropy": 2.7647974491119385, "loss/hidden": 0.7578125, "loss/logits": 0.1369488537311554, "loss/reg": 0.00581031059846282, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.2634167671203613, "grad_norm_var": 3.653624545749698, "learning_rate": 0.0001, "loss": 1.0757, "loss/crossentropy": 2.334134340286255, "loss/hidden": 0.859375, "loss/logits": 0.1581987738609314, "loss/reg": 0.005808570422232151, "step": 326 }, { "epoch": 0.040875, "grad_norm": 2.3521125316619873, "grad_norm_var": 3.6551397839485555, "learning_rate": 0.0001, "loss": 0.9965, "loss/crossentropy": 2.78828763961792, "loss/hidden": 0.79296875, "loss/logits": 0.1454332172870636, "loss/reg": 0.005806888919323683, "step": 327 }, { "epoch": 0.041, "grad_norm": 3.0836093425750732, "grad_norm_var": 3.5768996944618254, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.2781612873077393, "loss/hidden": 0.9609375, "loss/logits": 0.1747758537530899, "loss/reg": 0.005805303808301687, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.6110970973968506, "grad_norm_var": 3.5651235487558246, "learning_rate": 0.0001, "loss": 1.1693, "loss/crossentropy": 2.812913417816162, "loss/hidden": 0.9375, "loss/logits": 0.17377659678459167, "loss/reg": 0.005803780164569616, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.5020155906677246, "grad_norm_var": 1.552569952590708, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6585140228271484, "loss/hidden": 0.86328125, "loss/logits": 0.16489718854427338, "loss/reg": 0.005802258383482695, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.383924961090088, "grad_norm_var": 0.17978007457456116, "learning_rate": 0.0001, "loss": 1.1592, "loss/crossentropy": 2.4862210750579834, "loss/hidden": 0.94921875, "loss/logits": 0.15199331939220428, "loss/reg": 0.005800731014460325, "step": 331 }, { "epoch": 0.0415, "grad_norm": 2.187321424484253, "grad_norm_var": 0.17949311071790794, "learning_rate": 0.0001, "loss": 1.0507, "loss/crossentropy": 2.6380603313446045, "loss/hidden": 0.84765625, "loss/logits": 0.14507073163986206, "loss/reg": 0.005798923317342997, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.21768856048584, "grad_norm_var": 0.18601193201957902, "learning_rate": 0.0001, "loss": 1.1027, "loss/crossentropy": 2.3925793170928955, "loss/hidden": 0.875, "loss/logits": 0.16972869634628296, "loss/reg": 0.00579707371070981, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.682497262954712, "grad_norm_var": 0.17937770683656615, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.3586106300354004, "loss/hidden": 1.078125, "loss/logits": 0.1911502480506897, "loss/reg": 0.005795224104076624, "step": 334 }, { "epoch": 0.041875, "grad_norm": 3.0983307361602783, "grad_norm_var": 0.1826395003188658, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.436326265335083, "loss/hidden": 0.91796875, "loss/logits": 0.1915540099143982, "loss/reg": 0.005793258547782898, "step": 335 }, { "epoch": 0.042, "grad_norm": 6.251674652099609, "grad_norm_var": 0.982431631272856, "learning_rate": 0.0001, "loss": 1.6879, "loss/crossentropy": 2.3841142654418945, "loss/hidden": 1.265625, "loss/logits": 0.3643344044685364, "loss/reg": 0.0057912725023925304, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.0111782550811768, "grad_norm_var": 0.9617308564996427, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.430532217025757, "loss/hidden": 1.0703125, "loss/logits": 0.2214677333831787, "loss/reg": 0.00578899122774601, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.4221205711364746, "grad_norm_var": 0.9640415151512265, "learning_rate": 0.0001, "loss": 1.0955, "loss/crossentropy": 2.4376015663146973, "loss/hidden": 0.890625, "loss/logits": 0.1470467746257782, "loss/reg": 0.005786662455648184, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.615758180618286, "grad_norm_var": 0.9645524062068328, "learning_rate": 0.0001, "loss": 1.0887, "loss/crossentropy": 2.5318005084991455, "loss/hidden": 0.875, "loss/logits": 0.15580901503562927, "loss/reg": 0.0057848175056278706, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.857177972793579, "grad_norm_var": 0.9599117798964886, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.4260058403015137, "loss/hidden": 0.89453125, "loss/logits": 0.16291844844818115, "loss/reg": 0.005782809574157, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.4030630588531494, "grad_norm_var": 0.9680393035693963, "learning_rate": 0.0001, "loss": 1.2054, "loss/crossentropy": 2.3009443283081055, "loss/hidden": 0.953125, "loss/logits": 0.194431871175766, "loss/reg": 0.005780525505542755, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.264251470565796, "grad_norm_var": 0.9679716782722624, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.597288131713867, "loss/hidden": 0.8203125, "loss/logits": 0.14457917213439941, "loss/reg": 0.005778233055025339, "step": 342 }, { "epoch": 0.042875, "grad_norm": 2.2368180751800537, "grad_norm_var": 0.9767866404468121, "learning_rate": 0.0001, "loss": 0.943, "loss/crossentropy": 2.4534237384796143, "loss/hidden": 0.7578125, "loss/logits": 0.12742644548416138, "loss/reg": 0.005776000674813986, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.469120979309082, "grad_norm_var": 0.9824165851632264, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.793834686279297, "loss/hidden": 0.83984375, "loss/logits": 0.15554235875606537, "loss/reg": 0.005774145945906639, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.8334686756134033, "grad_norm_var": 0.9387961568478952, "learning_rate": 0.0001, "loss": 0.9467, "loss/crossentropy": 2.678666830062866, "loss/hidden": 0.7578125, "loss/logits": 0.13116785883903503, "loss/reg": 0.005771928001195192, "step": 345 }, { "epoch": 0.04325, "grad_norm": 7.863356590270996, "grad_norm_var": 2.5385263322105893, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.613318920135498, "loss/hidden": 1.2734375, "loss/logits": 0.13832132518291473, "loss/reg": 0.005770097486674786, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.763582468032837, "grad_norm_var": 2.510660987467067, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.846453905105591, "loss/hidden": 0.90625, "loss/logits": 0.166295126080513, "loss/reg": 0.0057678911834955215, "step": 347 }, { "epoch": 0.0435, "grad_norm": 3.600456714630127, "grad_norm_var": 2.4567056984087676, "learning_rate": 0.0001, "loss": 1.2108, "loss/crossentropy": 2.515092372894287, "loss/hidden": 0.96875, "loss/logits": 0.18436874449253082, "loss/reg": 0.005765695124864578, "step": 348 }, { "epoch": 0.043625, "grad_norm": 4.2698073387146, "grad_norm_var": 2.4444505062987636, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.3673834800720215, "loss/hidden": 0.8984375, "loss/logits": 0.16628439724445343, "loss/reg": 0.005763507913798094, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.962045192718506, "grad_norm_var": 2.42435544256402, "learning_rate": 0.0001, "loss": 1.079, "loss/crossentropy": 2.9470205307006836, "loss/hidden": 0.83203125, "loss/logits": 0.1893935650587082, "loss/reg": 0.005761242005974054, "step": 350 }, { "epoch": 0.043875, "grad_norm": 3.0306880474090576, "grad_norm_var": 2.427092851603572, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.3637542724609375, "loss/hidden": 0.83203125, "loss/logits": 0.13047108054161072, "loss/reg": 0.0057592191733419895, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.599585771560669, "grad_norm_var": 1.855493477227511, "learning_rate": 0.0001, "loss": 0.9429, "loss/crossentropy": 2.9222559928894043, "loss/hidden": 0.7578125, "loss/logits": 0.12747693061828613, "loss/reg": 0.005757040809839964, "step": 352 }, { "epoch": 0.044125, "grad_norm": 2.4723081588745117, "grad_norm_var": 1.882729557078295, "learning_rate": 0.0001, "loss": 1.2276, "loss/crossentropy": 2.5835001468658447, "loss/hidden": 0.94921875, "loss/logits": 0.220790833234787, "loss/reg": 0.005754764657467604, "step": 353 }, { "epoch": 0.04425, "grad_norm": 2.5266165733337402, "grad_norm_var": 1.873911870827686, "learning_rate": 0.0001, "loss": 1.1879, "loss/crossentropy": 2.4273722171783447, "loss/hidden": 0.97265625, "loss/logits": 0.15772980451583862, "loss/reg": 0.005752884317189455, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.8139867782592773, "grad_norm_var": 1.8632913443851133, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.591078996658325, "loss/hidden": 1.0234375, "loss/logits": 0.19931599497795105, "loss/reg": 0.0057507967576384544, "step": 355 }, { "epoch": 0.0445, "grad_norm": 2.0173490047454834, "grad_norm_var": 1.9371277324683585, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.415416955947876, "loss/hidden": 0.80859375, "loss/logits": 0.14050991833209991, "loss/reg": 0.005748571362346411, "step": 356 }, { "epoch": 0.044625, "grad_norm": 3.5304269790649414, "grad_norm_var": 1.916250206343263, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.7149741649627686, "loss/hidden": 1.0390625, "loss/logits": 0.16997796297073364, "loss/reg": 0.005746254697442055, "step": 357 }, { "epoch": 0.04475, "grad_norm": 47.96537399291992, "grad_norm_var": 127.11164707702224, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7637100219726562, "loss/hidden": 1.2265625, "loss/logits": 0.17390823364257812, "loss/reg": 0.005744417663663626, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.253833055496216, "grad_norm_var": 127.10313415769795, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.3016419410705566, "loss/hidden": 0.9140625, "loss/logits": 0.16676074266433716, "loss/reg": 0.005742207169532776, "step": 359 }, { "epoch": 0.045, "grad_norm": 3.2059576511383057, "grad_norm_var": 126.79034824550331, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.624589204788208, "loss/hidden": 1.0, "loss/logits": 0.18154433369636536, "loss/reg": 0.005740353371948004, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.456129789352417, "grad_norm_var": 126.9607902891753, "learning_rate": 0.0001, "loss": 1.0342, "loss/crossentropy": 2.500290870666504, "loss/hidden": 0.83203125, "loss/logits": 0.14475134015083313, "loss/reg": 0.005738324951380491, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.081372022628784, "grad_norm_var": 127.21513938268541, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.3305118083953857, "loss/hidden": 0.8984375, "loss/logits": 0.15346962213516235, "loss/reg": 0.0057361493818461895, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.2634801864624023, "grad_norm_var": 127.4280286195785, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.4553990364074707, "loss/hidden": 0.875, "loss/logits": 0.16324618458747864, "loss/reg": 0.005734298378229141, "step": 363 }, { "epoch": 0.0455, "grad_norm": 3.9597907066345215, "grad_norm_var": 127.3359579534097, "learning_rate": 0.0001, "loss": 1.3557, "loss/crossentropy": 2.6449685096740723, "loss/hidden": 1.078125, "loss/logits": 0.2202637791633606, "loss/reg": 0.005732398014515638, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.7794013023376465, "grad_norm_var": 127.76159157574789, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.3118059635162354, "loss/hidden": 0.86328125, "loss/logits": 0.1581302285194397, "loss/reg": 0.005730301141738892, "step": 365 }, { "epoch": 0.04575, "grad_norm": 4.7589192390441895, "grad_norm_var": 127.32661229099328, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.5914306640625, "loss/hidden": 1.078125, "loss/logits": 0.18898184597492218, "loss/reg": 0.005728167947381735, "step": 366 }, { "epoch": 0.045875, "grad_norm": 4.024761199951172, "grad_norm_var": 127.03030673720949, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.083667755126953, "loss/hidden": 1.1640625, "loss/logits": 0.1997053027153015, "loss/reg": 0.005726283416152, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.9291043281555176, "grad_norm_var": 126.89672944049376, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.7017500400543213, "loss/hidden": 0.90625, "loss/logits": 0.1686232089996338, "loss/reg": 0.005724436603486538, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.289379119873047, "grad_norm_var": 126.98034912166224, "learning_rate": 0.0001, "loss": 1.0433, "loss/crossentropy": 2.404045581817627, "loss/hidden": 0.8359375, "loss/logits": 0.1501048356294632, "loss/reg": 0.005722455680370331, "step": 369 }, { "epoch": 0.04625, "grad_norm": 2.5955307483673096, "grad_norm_var": 126.95053618311779, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.555497407913208, "loss/hidden": 0.87890625, "loss/logits": 0.16912290453910828, "loss/reg": 0.0057206167839467525, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.5631515979766846, "grad_norm_var": 127.05459572518181, "learning_rate": 0.0001, "loss": 1.0105, "loss/crossentropy": 2.3253824710845947, "loss/hidden": 0.80859375, "loss/logits": 0.14470672607421875, "loss/reg": 0.005718756001442671, "step": 371 }, { "epoch": 0.0465, "grad_norm": 2.8995003700256348, "grad_norm_var": 126.65924311218065, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.5171523094177246, "loss/hidden": 0.859375, "loss/logits": 0.15616215765476227, "loss/reg": 0.005716769490391016, "step": 372 }, { "epoch": 0.046625, "grad_norm": 2.4674322605133057, "grad_norm_var": 127.0582358856119, "learning_rate": 0.0001, "loss": 0.9544, "loss/crossentropy": 2.426679849624634, "loss/hidden": 0.765625, "loss/logits": 0.13166998326778412, "loss/reg": 0.005714884493499994, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.1486146450042725, "grad_norm_var": 0.5554253140062239, "learning_rate": 0.0001, "loss": 1.0123, "loss/crossentropy": 2.3567564487457275, "loss/hidden": 0.8203125, "loss/logits": 0.1348218023777008, "loss/reg": 0.0057129692286252975, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.4249770641326904, "grad_norm_var": 0.5421168003854054, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.575383424758911, "loss/hidden": 0.80078125, "loss/logits": 0.1425924003124237, "loss/reg": 0.005710979457944632, "step": 375 }, { "epoch": 0.047, "grad_norm": 3.9449760913848877, "grad_norm_var": 0.6036429091311817, "learning_rate": 0.0001, "loss": 1.1428, "loss/crossentropy": 2.5839173793792725, "loss/hidden": 0.94921875, "loss/logits": 0.13653349876403809, "loss/reg": 0.0057089440524578094, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.3119592666625977, "grad_norm_var": 0.6148998912723904, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.492663860321045, "loss/hidden": 0.859375, "loss/logits": 0.1715661883354187, "loss/reg": 0.005707095842808485, "step": 377 }, { "epoch": 0.04725, "grad_norm": 3.586817979812622, "grad_norm_var": 0.6386998540868449, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.8210177421569824, "loss/hidden": 0.87890625, "loss/logits": 0.15476316213607788, "loss/reg": 0.005705154500901699, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.805647850036621, "grad_norm_var": 0.6040650287121667, "learning_rate": 0.0001, "loss": 1.0792, "loss/crossentropy": 2.54019832611084, "loss/hidden": 0.859375, "loss/logits": 0.16280022263526917, "loss/reg": 0.005703243892639875, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.7932748794555664, "grad_norm_var": 0.5445939245804574, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.2343437671661377, "loss/hidden": 1.1953125, "loss/logits": 0.20978981256484985, "loss/reg": 0.005701290909200907, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.661917209625244, "grad_norm_var": 0.5482497924242672, "learning_rate": 0.0001, "loss": 0.9746, "loss/crossentropy": 2.782052516937256, "loss/hidden": 0.78125, "loss/logits": 0.13640211522579193, "loss/reg": 0.0056994096376001835, "step": 381 }, { "epoch": 0.04775, "grad_norm": 2.4914302825927734, "grad_norm_var": 0.3228126995822395, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.166295051574707, "loss/hidden": 0.91015625, "loss/logits": 0.1589164137840271, "loss/reg": 0.005697426851838827, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.961653709411621, "grad_norm_var": 0.22106978564282992, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.5477302074432373, "loss/hidden": 0.8828125, "loss/logits": 0.16730068624019623, "loss/reg": 0.005695413798093796, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.9396286010742188, "grad_norm_var": 0.22133896443579198, "learning_rate": 0.0001, "loss": 1.0254, "loss/crossentropy": 2.555258274078369, "loss/hidden": 0.828125, "loss/logits": 0.1403425633907318, "loss/reg": 0.005693417973816395, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.8298912048339844, "grad_norm_var": 0.20691636732209961, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.472844362258911, "loss/hidden": 0.984375, "loss/logits": 0.15367946028709412, "loss/reg": 0.005691539496183395, "step": 385 }, { "epoch": 0.04825, "grad_norm": 15.47062873840332, "grad_norm_var": 10.256501481265339, "learning_rate": 0.0001, "loss": 1.4448, "loss/crossentropy": 2.521524667739868, "loss/hidden": 1.203125, "loss/logits": 0.1847420334815979, "loss/reg": 0.005689616315066814, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.455294370651245, "grad_norm_var": 10.271871141002237, "learning_rate": 0.0001, "loss": 1.1018, "loss/crossentropy": 2.309390068054199, "loss/hidden": 0.89453125, "loss/logits": 0.15039557218551636, "loss/reg": 0.005687698721885681, "step": 387 }, { "epoch": 0.0485, "grad_norm": 3.23420786857605, "grad_norm_var": 10.248744715041969, "learning_rate": 0.0001, "loss": 1.2879, "loss/crossentropy": 2.4902544021606445, "loss/hidden": 1.015625, "loss/logits": 0.2154603898525238, "loss/reg": 0.005685731768608093, "step": 388 }, { "epoch": 0.048625, "grad_norm": 2.660858631134033, "grad_norm_var": 10.221989434520331, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.31535267829895, "loss/hidden": 0.8359375, "loss/logits": 0.13224059343338013, "loss/reg": 0.005683773662894964, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4209847450256348, "grad_norm_var": 10.173641089965429, "learning_rate": 0.0001, "loss": 0.9974, "loss/crossentropy": 2.1761093139648438, "loss/hidden": 0.8125, "loss/logits": 0.12805956602096558, "loss/reg": 0.005681932438164949, "step": 390 }, { "epoch": 0.048875, "grad_norm": 3.108008623123169, "grad_norm_var": 10.09354551501582, "learning_rate": 0.0001, "loss": 0.979, "loss/crossentropy": 2.721165657043457, "loss/hidden": 0.78125, "loss/logits": 0.14099523425102234, "loss/reg": 0.005679869093000889, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.6531527042388916, "grad_norm_var": 10.150022289467502, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.518146514892578, "loss/hidden": 0.9375, "loss/logits": 0.17805764079093933, "loss/reg": 0.005677856504917145, "step": 392 }, { "epoch": 0.049125, "grad_norm": 2.2534499168395996, "grad_norm_var": 10.160179916565673, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.633385181427002, "loss/hidden": 0.91015625, "loss/logits": 0.16230204701423645, "loss/reg": 0.005675735417753458, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.9424333572387695, "grad_norm_var": 10.185797665159741, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.62923002243042, "loss/hidden": 1.15625, "loss/logits": 0.20838308334350586, "loss/reg": 0.00567356962710619, "step": 394 }, { "epoch": 0.049375, "grad_norm": 2.622178792953491, "grad_norm_var": 10.20593051221178, "learning_rate": 0.0001, "loss": 0.9697, "loss/crossentropy": 2.5544826984405518, "loss/hidden": 0.78125, "loss/logits": 0.13172510266304016, "loss/reg": 0.005671407096087933, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.635505199432373, "grad_norm_var": 10.223008906743342, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.5959105491638184, "loss/hidden": 0.75390625, "loss/logits": 0.12239634245634079, "loss/reg": 0.0056692929938435555, "step": 396 }, { "epoch": 0.049625, "grad_norm": 2.6063406467437744, "grad_norm_var": 10.229570355797922, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.719916343688965, "loss/hidden": 0.83984375, "loss/logits": 0.15127256512641907, "loss/reg": 0.0056673381477594376, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.589893102645874, "grad_norm_var": 10.216701025853546, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.3730130195617676, "loss/hidden": 0.90234375, "loss/logits": 0.16749918460845947, "loss/reg": 0.0056652189232409, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.1503751277923584, "grad_norm_var": 10.318666846324161, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.2147741317749023, "loss/hidden": 0.92578125, "loss/logits": 0.1860472559928894, "loss/reg": 0.005663097370415926, "step": 399 }, { "epoch": 0.05, "grad_norm": 3.6945109367370605, "grad_norm_var": 10.300567557859127, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.4212143421173096, "loss/hidden": 0.921875, "loss/logits": 0.1487593650817871, "loss/reg": 0.005661314353346825, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.7444777488708496, "grad_norm_var": 10.268632820538057, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.5369904041290283, "loss/hidden": 0.90625, "loss/logits": 0.15929211676120758, "loss/reg": 0.005659462418407202, "step": 401 }, { "epoch": 0.05025, "grad_norm": 5.121776580810547, "grad_norm_var": 0.5518050614602837, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.2371129989624023, "loss/hidden": 1.2109375, "loss/logits": 0.19960111379623413, "loss/reg": 0.005657529458403587, "step": 402 }, { "epoch": 0.050375, "grad_norm": 28.607572555541992, "grad_norm_var": 41.63994308721723, "learning_rate": 0.0001, "loss": 1.1515, "loss/crossentropy": 2.84385347366333, "loss/hidden": 0.90234375, "loss/logits": 0.19263674318790436, "loss/reg": 0.005655454937368631, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.38948655128479, "grad_norm_var": 41.834466994087045, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.2518088817596436, "loss/hidden": 0.8984375, "loss/logits": 0.13791221380233765, "loss/reg": 0.005653408356010914, "step": 404 }, { "epoch": 0.050625, "grad_norm": 6.887917518615723, "grad_norm_var": 41.907583648135414, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.8729405403137207, "loss/hidden": 1.046875, "loss/logits": 0.14880970120429993, "loss/reg": 0.005651514511555433, "step": 405 }, { "epoch": 0.05075, "grad_norm": 3.2420449256896973, "grad_norm_var": 41.69182027548524, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.598705530166626, "loss/hidden": 0.98046875, "loss/logits": 0.16617505252361298, "loss/reg": 0.005649634636938572, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.3294692039489746, "grad_norm_var": 41.9082544413822, "learning_rate": 0.0001, "loss": 1.0316, "loss/crossentropy": 2.7743589878082275, "loss/hidden": 0.84375, "loss/logits": 0.13134868443012238, "loss/reg": 0.005647764541208744, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.3849406242370605, "grad_norm_var": 41.988788990047645, "learning_rate": 0.0001, "loss": 1.1579, "loss/crossentropy": 2.2934722900390625, "loss/hidden": 0.9375, "loss/logits": 0.16397064924240112, "loss/reg": 0.00564591446891427, "step": 408 }, { "epoch": 0.051125, "grad_norm": 2.616523504257202, "grad_norm_var": 41.875558070811756, "learning_rate": 0.0001, "loss": 0.9281, "loss/crossentropy": 2.617312431335449, "loss/hidden": 0.7734375, "loss/logits": 0.09819567203521729, "loss/reg": 0.005644225515425205, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.302281141281128, "grad_norm_var": 42.058469053043055, "learning_rate": 0.0001, "loss": 1.0583, "loss/crossentropy": 2.8029561042785645, "loss/hidden": 0.859375, "loss/logits": 0.14253735542297363, "loss/reg": 0.005642317235469818, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.1521739959716797, "grad_norm_var": 42.20532780726832, "learning_rate": 0.0001, "loss": 0.996, "loss/crossentropy": 2.5798304080963135, "loss/hidden": 0.80078125, "loss/logits": 0.13881272077560425, "loss/reg": 0.005640234332531691, "step": 411 }, { "epoch": 0.0515, "grad_norm": 4.3292155265808105, "grad_norm_var": 41.914794683811124, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.4219868183135986, "loss/hidden": 1.0390625, "loss/logits": 0.2562292516231537, "loss/reg": 0.005638125352561474, "step": 412 }, { "epoch": 0.051625, "grad_norm": 19.01975440979004, "grad_norm_var": 53.903843358167165, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.2926077842712402, "loss/hidden": 1.078125, "loss/logits": 0.19380658864974976, "loss/reg": 0.005636140704154968, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.859027862548828, "grad_norm_var": 53.791467006877085, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 2.429117441177368, "loss/hidden": 0.90234375, "loss/logits": 0.1528070569038391, "loss/reg": 0.005634027067571878, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.385204792022705, "grad_norm_var": 53.67862289213027, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.710325002670288, "loss/hidden": 0.81640625, "loss/logits": 0.1458669900894165, "loss/reg": 0.005631967913359404, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.3011677265167236, "grad_norm_var": 54.20582073402194, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.485734701156616, "loss/hidden": 0.87109375, "loss/logits": 0.1569264829158783, "loss/reg": 0.0056300037540495396, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.7714357376098633, "grad_norm_var": 54.53064815195892, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.6249403953552246, "loss/hidden": 0.85546875, "loss/logits": 0.1623522937297821, "loss/reg": 0.0056281075812876225, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.376473903656006, "grad_norm_var": 55.22478277620113, "learning_rate": 0.0001, "loss": 1.2116, "loss/crossentropy": 2.5150105953216553, "loss/hidden": 0.95703125, "loss/logits": 0.19830524921417236, "loss/reg": 0.005626222584396601, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.6247470378875732, "grad_norm_var": 17.572360223815615, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.7201685905456543, "loss/hidden": 0.9453125, "loss/logits": 0.17042091488838196, "loss/reg": 0.005624283570796251, "step": 419 }, { "epoch": 0.0525, "grad_norm": 49.02815628051758, "grad_norm_var": 143.90483482694842, "learning_rate": 0.0001, "loss": 5.3824, "loss/crossentropy": 2.692047357559204, "loss/hidden": 4.84375, "loss/logits": 0.48245739936828613, "loss/reg": 0.005622203927487135, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.6867082118988037, "grad_norm_var": 144.9870986829453, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.404517412185669, "loss/hidden": 1.0, "loss/logits": 0.19445687532424927, "loss/reg": 0.005620268173515797, "step": 421 }, { "epoch": 0.05275, "grad_norm": 4.397704124450684, "grad_norm_var": 144.55498651709914, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.1510226726531982, "loss/hidden": 1.2109375, "loss/logits": 0.19246640801429749, "loss/reg": 0.005618296563625336, "step": 422 }, { "epoch": 0.052875, "grad_norm": 4.239573955535889, "grad_norm_var": 143.68003611616095, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.686849355697632, "loss/hidden": 1.09375, "loss/logits": 0.17758557200431824, "loss/reg": 0.005616751033812761, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.749202251434326, "grad_norm_var": 143.4748837350726, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.8104846477508545, "loss/hidden": 0.8828125, "loss/logits": 0.1437493860721588, "loss/reg": 0.005615332629531622, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.459291458129883, "grad_norm_var": 143.5641839570371, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.5806379318237305, "loss/hidden": 0.8515625, "loss/logits": 0.14714661240577698, "loss/reg": 0.005613364279270172, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.294171094894409, "grad_norm_var": 143.56904366210821, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.6366002559661865, "loss/hidden": 0.90234375, "loss/logits": 0.19014191627502441, "loss/reg": 0.005611394997686148, "step": 426 }, { "epoch": 0.053375, "grad_norm": 2.2255382537841797, "grad_norm_var": 143.52399251007708, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.542306661605835, "loss/hidden": 0.875, "loss/logits": 0.14408408105373383, "loss/reg": 0.005609368905425072, "step": 427 }, { "epoch": 0.0535, "grad_norm": 3.5708723068237305, "grad_norm_var": 143.80942972780392, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.2636356353759766, "loss/hidden": 0.8828125, "loss/logits": 0.14744916558265686, "loss/reg": 0.005607361439615488, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.9189610481262207, "grad_norm_var": 133.66980873374825, "learning_rate": 0.0001, "loss": 0.9895, "loss/crossentropy": 2.7651426792144775, "loss/hidden": 0.78515625, "loss/logits": 0.1482805609703064, "loss/reg": 0.005605428479611874, "step": 429 }, { "epoch": 0.05375, "grad_norm": 3.2735564708709717, "grad_norm_var": 133.5211490137515, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.248082399368286, "loss/hidden": 0.98046875, "loss/logits": 0.19977417588233948, "loss/reg": 0.0056034415028989315, "step": 430 }, { "epoch": 0.053875, "grad_norm": 3.5670769214630127, "grad_norm_var": 133.0752341056661, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.500338554382324, "loss/hidden": 1.0234375, "loss/logits": 0.19719059765338898, "loss/reg": 0.005601502023637295, "step": 431 }, { "epoch": 0.054, "grad_norm": 2.2697787284851074, "grad_norm_var": 133.0901180807591, "learning_rate": 0.0001, "loss": 0.9931, "loss/crossentropy": 2.6418793201446533, "loss/hidden": 0.7890625, "loss/logits": 0.14799568057060242, "loss/reg": 0.005599519703537226, "step": 432 }, { "epoch": 0.054125, "grad_norm": 3.220383405685425, "grad_norm_var": 132.91898234062202, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.5073025226593018, "loss/hidden": 1.0390625, "loss/logits": 0.15643876791000366, "loss/reg": 0.005597477313131094, "step": 433 }, { "epoch": 0.05425, "grad_norm": 3.2845206260681152, "grad_norm_var": 132.5476800488924, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.509037971496582, "loss/hidden": 0.9296875, "loss/logits": 0.15849418938159943, "loss/reg": 0.005595567170530558, "step": 434 }, { "epoch": 0.054375, "grad_norm": 2.254239320755005, "grad_norm_var": 132.71932731242507, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.567584991455078, "loss/hidden": 0.78125, "loss/logits": 0.14433184266090393, "loss/reg": 0.005593593697994947, "step": 435 }, { "epoch": 0.0545, "grad_norm": 3.2273480892181396, "grad_norm_var": 0.4676980414191933, "learning_rate": 0.0001, "loss": 1.1645, "loss/crossentropy": 2.3639349937438965, "loss/hidden": 0.94921875, "loss/logits": 0.15934088826179504, "loss/reg": 0.0055916691198945045, "step": 436 }, { "epoch": 0.054625, "grad_norm": 2.6044058799743652, "grad_norm_var": 0.47199755801328347, "learning_rate": 0.0001, "loss": 1.1033, "loss/crossentropy": 2.539247989654541, "loss/hidden": 0.8984375, "loss/logits": 0.14898554980754852, "loss/reg": 0.005589775741100311, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.9674391746520996, "grad_norm_var": 0.3399405404704983, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.5642499923706055, "loss/hidden": 0.9921875, "loss/logits": 0.20391228795051575, "loss/reg": 0.005587900057435036, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.4164047241210938, "grad_norm_var": 0.23308679379454797, "learning_rate": 0.0001, "loss": 1.1939, "loss/crossentropy": 2.3462696075439453, "loss/hidden": 0.93359375, "loss/logits": 0.2044137418270111, "loss/reg": 0.005585688166320324, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.7590599060058594, "grad_norm_var": 0.2329847653181711, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.775485038757324, "loss/hidden": 0.84375, "loss/logits": 0.13808496296405792, "loss/reg": 0.0055835009552538395, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.7251267433166504, "grad_norm_var": 0.224188675724659, "learning_rate": 0.0001, "loss": 1.0001, "loss/crossentropy": 2.4934420585632324, "loss/hidden": 0.80859375, "loss/logits": 0.1357189267873764, "loss/reg": 0.005581483710557222, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.4774584770202637, "grad_norm_var": 0.21273704839308963, "learning_rate": 0.0001, "loss": 1.2166, "loss/crossentropy": 2.426271438598633, "loss/hidden": 0.95703125, "loss/logits": 0.20375394821166992, "loss/reg": 0.0055792308412492275, "step": 442 }, { "epoch": 0.055375, "grad_norm": 3.2236833572387695, "grad_norm_var": 0.1905493662305197, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.9799797534942627, "loss/hidden": 0.92578125, "loss/logits": 0.19083930552005768, "loss/reg": 0.005577271804213524, "step": 443 }, { "epoch": 0.0555, "grad_norm": 2.5997183322906494, "grad_norm_var": 0.16554225723918894, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.2098257541656494, "loss/hidden": 0.92578125, "loss/logits": 0.14447355270385742, "loss/reg": 0.005575183313339949, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.5179152488708496, "grad_norm_var": 0.1725392629592297, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.0029213428497314, "loss/hidden": 0.98046875, "loss/logits": 0.1655960977077484, "loss/reg": 0.005572900176048279, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.5075204372406006, "grad_norm_var": 0.16460110044899826, "learning_rate": 0.0001, "loss": 1.0614, "loss/crossentropy": 2.3672924041748047, "loss/hidden": 0.85546875, "loss/logits": 0.15021467208862305, "loss/reg": 0.005570439621806145, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.441183567047119, "grad_norm_var": 0.12700610259855102, "learning_rate": 0.0001, "loss": 0.9323, "loss/crossentropy": 2.311056137084961, "loss/hidden": 0.7578125, "loss/logits": 0.11881721019744873, "loss/reg": 0.00556844100356102, "step": 447 }, { "epoch": 0.056, "grad_norm": 2.6724319458007812, "grad_norm_var": 0.11304803744365562, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.562101364135742, "loss/hidden": 0.8671875, "loss/logits": 0.1708334982395172, "loss/reg": 0.005566492676734924, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.196300506591797, "grad_norm_var": 0.11350312697665288, "learning_rate": 0.0001, "loss": 0.9882, "loss/crossentropy": 2.4227116107940674, "loss/hidden": 0.80078125, "loss/logits": 0.13182450830936432, "loss/reg": 0.00556437112390995, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.912667989730835, "grad_norm_var": 0.0921566818687341, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 1.9439491033554077, "loss/hidden": 1.109375, "loss/logits": 0.2070913016796112, "loss/reg": 0.0055623650550842285, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.011991500854492, "grad_norm_var": 0.10881512213368959, "learning_rate": 0.0001, "loss": 1.0172, "loss/crossentropy": 2.498812675476074, "loss/hidden": 0.81640625, "loss/logits": 0.14521706104278564, "loss/reg": 0.005560221150517464, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.2709267139434814, "grad_norm_var": 0.0912508163184422, "learning_rate": 0.0001, "loss": 1.1384, "loss/crossentropy": 2.320579767227173, "loss/hidden": 0.9140625, "loss/logits": 0.16879746317863464, "loss/reg": 0.005558326840400696, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.954127788543701, "grad_norm_var": 0.09996231296479816, "learning_rate": 0.0001, "loss": 1.2415, "loss/crossentropy": 2.483376979827881, "loss/hidden": 0.99609375, "loss/logits": 0.18988527357578278, "loss/reg": 0.005556488875299692, "step": 453 }, { "epoch": 0.05675, "grad_norm": 2.442729949951172, "grad_norm_var": 0.0916992305907788, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.414472818374634, "loss/hidden": 0.84765625, "loss/logits": 0.1501239389181137, "loss/reg": 0.005554646719247103, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.598292589187622, "grad_norm_var": 0.09002796513685567, "learning_rate": 0.0001, "loss": 0.9797, "loss/crossentropy": 2.8175811767578125, "loss/hidden": 0.78515625, "loss/logits": 0.13899990916252136, "loss/reg": 0.005552831571549177, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.284618616104126, "grad_norm_var": 0.09289234998963139, "learning_rate": 0.0001, "loss": 1.1767, "loss/crossentropy": 2.5178730487823486, "loss/hidden": 0.953125, "loss/logits": 0.1680239588022232, "loss/reg": 0.005550856236368418, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.9749691486358643, "grad_norm_var": 0.10255115779464533, "learning_rate": 0.0001, "loss": 1.146, "loss/crossentropy": 2.6965036392211914, "loss/hidden": 0.89453125, "loss/logits": 0.19602364301681519, "loss/reg": 0.005548745859414339, "step": 457 }, { "epoch": 0.05725, "grad_norm": 2.4419991970062256, "grad_norm_var": 0.10305738190390912, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.507200241088867, "loss/hidden": 0.87890625, "loss/logits": 0.14385411143302917, "loss/reg": 0.005546758882701397, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.41898250579834, "grad_norm_var": 0.07293072023693033, "learning_rate": 0.0001, "loss": 1.0665, "loss/crossentropy": 2.4068796634674072, "loss/hidden": 0.87109375, "loss/logits": 0.13996180891990662, "loss/reg": 0.005544655025005341, "step": 459 }, { "epoch": 0.0575, "grad_norm": 3.584895372390747, "grad_norm_var": 0.1446675774892469, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.4029970169067383, "loss/hidden": 1.15625, "loss/logits": 0.20734865963459015, "loss/reg": 0.005542535334825516, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.5190699100494385, "grad_norm_var": 0.14465856873481447, "learning_rate": 0.0001, "loss": 1.0687, "loss/crossentropy": 2.632817268371582, "loss/hidden": 0.84375, "loss/logits": 0.16959112882614136, "loss/reg": 0.005540382582694292, "step": 461 }, { "epoch": 0.05775, "grad_norm": 3.293412446975708, "grad_norm_var": 0.1759751166057581, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 1.8526346683502197, "loss/hidden": 0.984375, "loss/logits": 0.16817334294319153, "loss/reg": 0.005538390018045902, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.090097665786743, "grad_norm_var": 0.1923380804679141, "learning_rate": 0.0001, "loss": 1.0403, "loss/crossentropy": 2.7256767749786377, "loss/hidden": 0.83984375, "loss/logits": 0.14509689807891846, "loss/reg": 0.005536381620913744, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.367372751235962, "grad_norm_var": 0.19537989350592183, "learning_rate": 0.0001, "loss": 0.967, "loss/crossentropy": 2.440683603286743, "loss/hidden": 0.78125, "loss/logits": 0.13041679561138153, "loss/reg": 0.005534291733056307, "step": 464 }, { "epoch": 0.058125, "grad_norm": 2.5434730052948, "grad_norm_var": 0.18491306851457617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.811406373977661, "loss/hidden": 0.91015625, "loss/logits": 0.1740744560956955, "loss/reg": 0.005532294511795044, "step": 465 }, { "epoch": 0.05825, "grad_norm": 2.613758087158203, "grad_norm_var": 0.17830906169392974, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.5138356685638428, "loss/hidden": 0.828125, "loss/logits": 0.1479034125804901, "loss/reg": 0.005530340131372213, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.6053991317749023, "grad_norm_var": 0.21458171164135606, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 1.9949983358383179, "loss/hidden": 1.0, "loss/logits": 0.155661940574646, "loss/reg": 0.0055284383706748486, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.2574644088745117, "grad_norm_var": 0.21534123971961966, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.514662981033325, "loss/hidden": 0.859375, "loss/logits": 0.16538314521312714, "loss/reg": 0.005526562221348286, "step": 468 }, { "epoch": 0.058625, "grad_norm": 2.2614095211029053, "grad_norm_var": 0.2206521084247221, "learning_rate": 0.0001, "loss": 1.2297, "loss/crossentropy": 2.4910507202148438, "loss/hidden": 0.98046875, "loss/logits": 0.19400066137313843, "loss/reg": 0.005524714011698961, "step": 469 }, { "epoch": 0.05875, "grad_norm": 3.083524465560913, "grad_norm_var": 0.22915168035201153, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.5548853874206543, "loss/hidden": 0.92578125, "loss/logits": 0.19151920080184937, "loss/reg": 0.005522689316421747, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.6530709266662598, "grad_norm_var": 0.2287156357176549, "learning_rate": 0.0001, "loss": 0.9819, "loss/crossentropy": 2.5769848823547363, "loss/hidden": 0.79296875, "loss/logits": 0.1337730437517166, "loss/reg": 0.00552078802138567, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.857489585876465, "grad_norm_var": 0.21848469951039154, "learning_rate": 0.0001, "loss": 1.2335, "loss/crossentropy": 2.6933629512786865, "loss/hidden": 0.98828125, "loss/logits": 0.19003306329250336, "loss/reg": 0.005518974736332893, "step": 472 }, { "epoch": 0.059125, "grad_norm": 1.960106372833252, "grad_norm_var": 0.24874750636482734, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.534855365753174, "loss/hidden": 0.7890625, "loss/logits": 0.13338381052017212, "loss/reg": 0.005517229437828064, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.787822961807251, "grad_norm_var": 0.24619457779295406, "learning_rate": 0.0001, "loss": 1.0858, "loss/crossentropy": 2.396390438079834, "loss/hidden": 0.88671875, "loss/logits": 0.14397624135017395, "loss/reg": 0.005515479948371649, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.3396122455596924, "grad_norm_var": 0.24936205040752385, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.6306259632110596, "loss/hidden": 0.83984375, "loss/logits": 0.14426180720329285, "loss/reg": 0.005513759795576334, "step": 475 }, { "epoch": 0.0595, "grad_norm": 2.367551803588867, "grad_norm_var": 0.19447740210993794, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.342672348022461, "loss/hidden": 0.890625, "loss/logits": 0.16136375069618225, "loss/reg": 0.0055120959877967834, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.3029873371124268, "grad_norm_var": 0.19972845357339655, "learning_rate": 0.0001, "loss": 0.9785, "loss/crossentropy": 2.725276231765747, "loss/hidden": 0.796875, "loss/logits": 0.12647491693496704, "loss/reg": 0.0055101178586483, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.3109138011932373, "grad_norm_var": 0.1674590503375268, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.6665799617767334, "loss/hidden": 0.81640625, "loss/logits": 0.14054208993911743, "loss/reg": 0.005508116912096739, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.8778023719787598, "grad_norm_var": 0.1605488706137739, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.599010705947876, "loss/hidden": 0.80078125, "loss/logits": 0.14697444438934326, "loss/reg": 0.0055063748732209206, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.7762978076934814, "grad_norm_var": 0.15971446982347573, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.6345436573028564, "loss/hidden": 0.9296875, "loss/logits": 0.1645045280456543, "loss/reg": 0.005504653323441744, "step": 480 }, { "epoch": 0.060125, "grad_norm": 3.0745112895965576, "grad_norm_var": 0.1733429982183973, "learning_rate": 0.0001, "loss": 1.2914, "loss/crossentropy": 2.1021008491516113, "loss/hidden": 1.0546875, "loss/logits": 0.18168240785598755, "loss/reg": 0.005502650048583746, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.5635828971862793, "grad_norm_var": 0.17362979402171655, "learning_rate": 0.0001, "loss": 1.1746, "loss/crossentropy": 2.599754810333252, "loss/hidden": 0.9453125, "loss/logits": 0.1743006557226181, "loss/reg": 0.005500909872353077, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.982170343399048, "grad_norm_var": 0.11685041441696337, "learning_rate": 0.0001, "loss": 1.084, "loss/crossentropy": 2.780411958694458, "loss/hidden": 0.875, "loss/logits": 0.15399503707885742, "loss/reg": 0.005499421618878841, "step": 483 }, { "epoch": 0.0605, "grad_norm": 6.475743770599365, "grad_norm_var": 1.0413639393420129, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.3867931365966797, "loss/hidden": 1.703125, "loss/logits": 0.38922837376594543, "loss/reg": 0.005497433710843325, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.522434711456299, "grad_norm_var": 1.024975132918582, "learning_rate": 0.0001, "loss": 1.0915, "loss/crossentropy": 2.741684675216675, "loss/hidden": 0.88671875, "loss/logits": 0.14987404644489288, "loss/reg": 0.0054954588413238525, "step": 485 }, { "epoch": 0.06075, "grad_norm": 2.6852359771728516, "grad_norm_var": 1.0236023483547378, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.2552525997161865, "loss/hidden": 0.8984375, "loss/logits": 0.13711076974868774, "loss/reg": 0.005493887234479189, "step": 486 }, { "epoch": 0.060875, "grad_norm": 6.048346996307373, "grad_norm_var": 1.65671866064532, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 3.1526873111724854, "loss/hidden": 1.0625, "loss/logits": 0.2884060740470886, "loss/reg": 0.005492268595844507, "step": 487 }, { "epoch": 0.061, "grad_norm": 5.24729061126709, "grad_norm_var": 1.9496829900519608, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.391798496246338, "loss/hidden": 1.234375, "loss/logits": 0.2594112157821655, "loss/reg": 0.0054903156124055386, "step": 488 }, { "epoch": 0.061125, "grad_norm": 3.4879932403564453, "grad_norm_var": 1.8414378354073275, "learning_rate": 0.0001, "loss": 1.2408, "loss/crossentropy": 2.3853161334991455, "loss/hidden": 1.015625, "loss/logits": 0.1702655553817749, "loss/reg": 0.005488729570060968, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.416243076324463, "grad_norm_var": 1.875598350696971, "learning_rate": 0.0001, "loss": 1.0646, "loss/crossentropy": 2.310605049133301, "loss/hidden": 0.86328125, "loss/logits": 0.146418958902359, "loss/reg": 0.005487216170877218, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.9619152545928955, "grad_norm_var": 1.8217813283025472, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.3735132217407227, "loss/hidden": 1.015625, "loss/logits": 0.18721503019332886, "loss/reg": 0.005485245026648045, "step": 491 }, { "epoch": 0.0615, "grad_norm": 2.9602112770080566, "grad_norm_var": 1.7685642295810833, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.6420083045959473, "loss/hidden": 0.90234375, "loss/logits": 0.17025524377822876, "loss/reg": 0.005483296699821949, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.5772223472595215, "grad_norm_var": 1.7347667738241757, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.4166319370269775, "loss/hidden": 0.890625, "loss/logits": 0.15491390228271484, "loss/reg": 0.005481342785060406, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.6494603157043457, "grad_norm_var": 1.693988292922673, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.7021005153656006, "loss/hidden": 0.8671875, "loss/logits": 0.1542307734489441, "loss/reg": 0.005479689687490463, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.065351963043213, "grad_norm_var": 1.7911776893626628, "learning_rate": 0.0001, "loss": 1.015, "loss/crossentropy": 2.4842755794525146, "loss/hidden": 0.8203125, "loss/logits": 0.13995476067066193, "loss/reg": 0.005478002596646547, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.650660753250122, "grad_norm_var": 1.8016636980513454, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.3899097442626953, "loss/hidden": 0.94921875, "loss/logits": 0.16591498255729675, "loss/reg": 0.005476430524140596, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.412050724029541, "grad_norm_var": 1.7970375838694677, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4459383487701416, "loss/hidden": 0.94140625, "loss/logits": 0.20212361216545105, "loss/reg": 0.005474465899169445, "step": 497 }, { "epoch": 0.06225, "grad_norm": 2.7389674186706543, "grad_norm_var": 1.7804152177025587, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.6794888973236084, "loss/hidden": 0.90625, "loss/logits": 0.1465749740600586, "loss/reg": 0.005472847726196051, "step": 498 }, { "epoch": 0.062375, "grad_norm": 20.56003761291504, "grad_norm_var": 20.18846043733062, "learning_rate": 0.0001, "loss": 1.0568, "loss/crossentropy": 2.527268409729004, "loss/hidden": 0.859375, "loss/logits": 0.14275437593460083, "loss/reg": 0.005471326876431704, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.9909119606018066, "grad_norm_var": 20.013739807194945, "learning_rate": 0.0001, "loss": 1.0002, "loss/crossentropy": 2.311053991317749, "loss/hidden": 0.80859375, "loss/logits": 0.13688521087169647, "loss/reg": 0.005469587165862322, "step": 500 }, { "epoch": 0.062625, "grad_norm": 2.6013972759246826, "grad_norm_var": 19.995957990662593, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.4651801586151123, "loss/hidden": 0.94921875, "loss/logits": 0.17647257447242737, "loss/reg": 0.005467594135552645, "step": 501 }, { "epoch": 0.06275, "grad_norm": 3.800658702850342, "grad_norm_var": 19.840506630980723, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.3829755783081055, "loss/hidden": 0.8125, "loss/logits": 0.1257120817899704, "loss/reg": 0.005465896334499121, "step": 502 }, { "epoch": 0.062875, "grad_norm": 2.5520195960998535, "grad_norm_var": 19.800229612103674, "learning_rate": 0.0001, "loss": 0.9335, "loss/crossentropy": 2.318957567214966, "loss/hidden": 0.73828125, "loss/logits": 0.14062564074993134, "loss/reg": 0.005463926587253809, "step": 503 }, { "epoch": 0.063, "grad_norm": 2.2392287254333496, "grad_norm_var": 19.907422060176042, "learning_rate": 0.0001, "loss": 0.9748, "loss/crossentropy": 2.6403889656066895, "loss/hidden": 0.79296875, "loss/logits": 0.12718063592910767, "loss/reg": 0.005461950786411762, "step": 504 }, { "epoch": 0.063125, "grad_norm": 59.44195556640625, "grad_norm_var": 212.38825001063205, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.3318405151367188, "loss/hidden": 0.859375, "loss/logits": 0.12373203039169312, "loss/reg": 0.005460206884890795, "step": 505 }, { "epoch": 0.06325, "grad_norm": 2.711045265197754, "grad_norm_var": 212.19724917857505, "learning_rate": 0.0001, "loss": 1.0909, "loss/crossentropy": 2.5284109115600586, "loss/hidden": 0.8984375, "loss/logits": 0.13792011141777039, "loss/reg": 0.0054582892917096615, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.2481954097747803, "grad_norm_var": 212.65447803299938, "learning_rate": 0.0001, "loss": 0.9413, "loss/crossentropy": 2.5293643474578857, "loss/hidden": 0.7578125, "loss/logits": 0.12894126772880554, "loss/reg": 0.005456262268126011, "step": 507 }, { "epoch": 0.0635, "grad_norm": 2.7053897380828857, "grad_norm_var": 212.80895755175322, "learning_rate": 0.0001, "loss": 1.0773, "loss/crossentropy": 2.506075143814087, "loss/hidden": 0.87109375, "loss/logits": 0.15166552364826202, "loss/reg": 0.005454184953123331, "step": 508 }, { "epoch": 0.063625, "grad_norm": 2.2455244064331055, "grad_norm_var": 213.0278691549671, "learning_rate": 0.0001, "loss": 0.9768, "loss/crossentropy": 2.6757209300994873, "loss/hidden": 0.7890625, "loss/logits": 0.13317805528640747, "loss/reg": 0.005452104844152927, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.599388360977173, "grad_norm_var": 213.05941324718256, "learning_rate": 0.0001, "loss": 0.9486, "loss/crossentropy": 2.5610482692718506, "loss/hidden": 0.76953125, "loss/logits": 0.1245487853884697, "loss/reg": 0.005449967924505472, "step": 510 }, { "epoch": 0.063875, "grad_norm": 2.648411989212036, "grad_norm_var": 212.67000591016287, "learning_rate": 0.0001, "loss": 1.0344, "loss/crossentropy": 2.4520514011383057, "loss/hidden": 0.86328125, "loss/logits": 0.11664330959320068, "loss/reg": 0.005447922740131617, "step": 511 }, { "epoch": 0.064, "grad_norm": 2.380727767944336, "grad_norm_var": 212.84492196466834, "learning_rate": 0.0001, "loss": 1.0266, "loss/crossentropy": 2.1030967235565186, "loss/hidden": 0.83984375, "loss/logits": 0.13228739798069, "loss/reg": 0.0054458137601614, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.198631763458252, "grad_norm_var": 213.5768536641417, "learning_rate": 0.0001, "loss": 1.1228, "loss/crossentropy": 2.3845298290252686, "loss/hidden": 0.90625, "loss/logits": 0.16212098300457, "loss/reg": 0.0054436735808849335, "step": 513 }, { "epoch": 0.06425, "grad_norm": 2.8499302864074707, "grad_norm_var": 213.51026966359936, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.8064866065979004, "loss/hidden": 0.89453125, "loss/logits": 0.16508570313453674, "loss/reg": 0.0054416959173977375, "step": 514 }, { "epoch": 0.064375, "grad_norm": 2.5610392093658447, "grad_norm_var": 201.9317150765609, "learning_rate": 0.0001, "loss": 0.9877, "loss/crossentropy": 2.331996202468872, "loss/hidden": 0.79296875, "loss/logits": 0.14031504094600677, "loss/reg": 0.005439713131636381, "step": 515 }, { "epoch": 0.0645, "grad_norm": 2.0442378520965576, "grad_norm_var": 202.38943138060174, "learning_rate": 0.0001, "loss": 1.0106, "loss/crossentropy": 2.4245524406433105, "loss/hidden": 0.8203125, "loss/logits": 0.13589094579219818, "loss/reg": 0.0054377601481974125, "step": 516 }, { "epoch": 0.064625, "grad_norm": 2.7959375381469727, "grad_norm_var": 202.30067826507621, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.4928247928619385, "loss/hidden": 0.9375, "loss/logits": 0.154384583234787, "loss/reg": 0.005435979925096035, "step": 517 }, { "epoch": 0.06475, "grad_norm": 2.5521795749664307, "grad_norm_var": 202.78524814255965, "learning_rate": 0.0001, "loss": 1.1979, "loss/crossentropy": 2.4683828353881836, "loss/hidden": 0.96484375, "loss/logits": 0.17875435948371887, "loss/reg": 0.005434305872768164, "step": 518 }, { "epoch": 0.064875, "grad_norm": 3.04142165184021, "grad_norm_var": 202.5720686279479, "learning_rate": 0.0001, "loss": 1.0734, "loss/crossentropy": 2.5951480865478516, "loss/hidden": 0.87890625, "loss/logits": 0.14021140336990356, "loss/reg": 0.005432285368442535, "step": 519 }, { "epoch": 0.065, "grad_norm": 2.7776551246643066, "grad_norm_var": 202.31453305561996, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.4622325897216797, "loss/hidden": 0.85546875, "loss/logits": 0.14482024312019348, "loss/reg": 0.0054303682409226894, "step": 520 }, { "epoch": 0.065125, "grad_norm": 3.091510057449341, "grad_norm_var": 0.0909682998746592, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.1909449100494385, "loss/hidden": 1.0703125, "loss/logits": 0.1655557006597519, "loss/reg": 0.005428609903901815, "step": 521 }, { "epoch": 0.06525, "grad_norm": 2.5140562057495117, "grad_norm_var": 0.09023274223209772, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.4252545833587646, "loss/hidden": 0.8125, "loss/logits": 0.163617342710495, "loss/reg": 0.005426718853414059, "step": 522 }, { "epoch": 0.065375, "grad_norm": 2.4871199131011963, "grad_norm_var": 0.08328167859519695, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.8315296173095703, "loss/hidden": 0.80859375, "loss/logits": 0.15072785317897797, "loss/reg": 0.0054249088279902935, "step": 523 }, { "epoch": 0.0655, "grad_norm": 2.5452873706817627, "grad_norm_var": 0.082491431169228, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.4235100746154785, "loss/hidden": 0.953125, "loss/logits": 0.1673499345779419, "loss/reg": 0.005423161666840315, "step": 524 }, { "epoch": 0.065625, "grad_norm": 2.5849223136901855, "grad_norm_var": 0.0744047548688132, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.517444133758545, "loss/hidden": 0.9375, "loss/logits": 0.16202498972415924, "loss/reg": 0.005421151407063007, "step": 525 }, { "epoch": 0.06575, "grad_norm": 2.5873005390167236, "grad_norm_var": 0.07442217159387093, "learning_rate": 0.0001, "loss": 1.0671, "loss/crossentropy": 2.467041254043579, "loss/hidden": 0.8671875, "loss/logits": 0.1457323431968689, "loss/reg": 0.0054191285744309425, "step": 526 }, { "epoch": 0.065875, "grad_norm": 2.362294912338257, "grad_norm_var": 0.07783568042826777, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.1884958744049072, "loss/hidden": 0.8359375, "loss/logits": 0.14114870131015778, "loss/reg": 0.005417390260845423, "step": 527 }, { "epoch": 0.066, "grad_norm": 2.9457032680511475, "grad_norm_var": 0.08233057116115011, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.4786124229431152, "loss/hidden": 0.9140625, "loss/logits": 0.14386197924613953, "loss/reg": 0.005415752530097961, "step": 528 }, { "epoch": 0.066125, "grad_norm": 2.302025556564331, "grad_norm_var": 0.07717323196560505, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.3902788162231445, "loss/hidden": 0.81640625, "loss/logits": 0.15102702379226685, "loss/reg": 0.005413680803030729, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.2427210807800293, "grad_norm_var": 0.08222220602995639, "learning_rate": 0.0001, "loss": 1.0503, "loss/crossentropy": 2.5187177658081055, "loss/hidden": 0.85546875, "loss/logits": 0.1406846046447754, "loss/reg": 0.005411935038864613, "step": 530 }, { "epoch": 0.066375, "grad_norm": 2.215160846710205, "grad_norm_var": 0.09102156065525453, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.7218358516693115, "loss/hidden": 0.8515625, "loss/logits": 0.15077106654644012, "loss/reg": 0.00541025260463357, "step": 531 }, { "epoch": 0.0665, "grad_norm": 49.282718658447266, "grad_norm_var": 136.25864998814177, "learning_rate": 0.0001, "loss": 1.0971, "loss/crossentropy": 2.2880303859710693, "loss/hidden": 0.90625, "loss/logits": 0.136735200881958, "loss/reg": 0.005408551078289747, "step": 532 }, { "epoch": 0.066625, "grad_norm": 2.9009287357330322, "grad_norm_var": 136.22119824556128, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.069505214691162, "loss/hidden": 0.96875, "loss/logits": 0.18224555253982544, "loss/reg": 0.00540671544149518, "step": 533 }, { "epoch": 0.06675, "grad_norm": 3.366948127746582, "grad_norm_var": 135.93950988587417, "learning_rate": 0.0001, "loss": 1.2696, "loss/crossentropy": 2.5054562091827393, "loss/hidden": 1.0234375, "loss/logits": 0.19209496676921844, "loss/reg": 0.005404717288911343, "step": 534 }, { "epoch": 0.066875, "grad_norm": 2.380380868911743, "grad_norm_var": 136.19039047350162, "learning_rate": 0.0001, "loss": 1.0532, "loss/crossentropy": 2.4644546508789062, "loss/hidden": 0.85546875, "loss/logits": 0.14370107650756836, "loss/reg": 0.005402736831456423, "step": 535 }, { "epoch": 0.067, "grad_norm": 2.2630538940429688, "grad_norm_var": 136.3962470934156, "learning_rate": 0.0001, "loss": 1.106, "loss/crossentropy": 2.362761974334717, "loss/hidden": 0.88671875, "loss/logits": 0.16529247164726257, "loss/reg": 0.005401079077273607, "step": 536 }, { "epoch": 0.067125, "grad_norm": 2.2985637187957764, "grad_norm_var": 136.69066191681563, "learning_rate": 0.0001, "loss": 1.1188, "loss/crossentropy": 2.2513458728790283, "loss/hidden": 0.90625, "loss/logits": 0.158562570810318, "loss/reg": 0.005399197805672884, "step": 537 }, { "epoch": 0.06725, "grad_norm": 2.9178526401519775, "grad_norm_var": 136.54251636267432, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.4714279174804688, "loss/hidden": 1.0, "loss/logits": 0.1983010470867157, "loss/reg": 0.005397453438490629, "step": 538 }, { "epoch": 0.067375, "grad_norm": 2.1185367107391357, "grad_norm_var": 136.69809974879476, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.5675318241119385, "loss/hidden": 0.83203125, "loss/logits": 0.135833740234375, "loss/reg": 0.005395461805164814, "step": 539 }, { "epoch": 0.0675, "grad_norm": 2.471010684967041, "grad_norm_var": 136.72728236316837, "learning_rate": 0.0001, "loss": 0.9973, "loss/crossentropy": 2.434710741043091, "loss/hidden": 0.80078125, "loss/logits": 0.1425883173942566, "loss/reg": 0.005393547471612692, "step": 540 }, { "epoch": 0.067625, "grad_norm": 2.24953556060791, "grad_norm_var": 136.86254598209106, "learning_rate": 0.0001, "loss": 1.031, "loss/crossentropy": 2.2015278339385986, "loss/hidden": 0.83984375, "loss/logits": 0.13719773292541504, "loss/reg": 0.005391509272158146, "step": 541 }, { "epoch": 0.06775, "grad_norm": 3.6081595420837402, "grad_norm_var": 136.54053740800046, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.6074843406677246, "loss/hidden": 1.1328125, "loss/logits": 0.244051992893219, "loss/reg": 0.0053895004093647, "step": 542 }, { "epoch": 0.067875, "grad_norm": 2.293153762817383, "grad_norm_var": 136.5697192568711, "learning_rate": 0.0001, "loss": 1.0019, "loss/crossentropy": 2.604656934738159, "loss/hidden": 0.8125, "loss/logits": 0.13555657863616943, "loss/reg": 0.005387555807828903, "step": 543 }, { "epoch": 0.068, "grad_norm": 3.9054906368255615, "grad_norm_var": 136.30156429508213, "learning_rate": 0.0001, "loss": 1.2067, "loss/crossentropy": 2.723475456237793, "loss/hidden": 1.0078125, "loss/logits": 0.14498497545719147, "loss/reg": 0.005385412368923426, "step": 544 }, { "epoch": 0.068125, "grad_norm": 2.571354389190674, "grad_norm_var": 136.18942504783266, "learning_rate": 0.0001, "loss": 1.1473, "loss/crossentropy": 2.686601400375366, "loss/hidden": 0.87109375, "loss/logits": 0.22237557172775269, "loss/reg": 0.005383248440921307, "step": 545 }, { "epoch": 0.06825, "grad_norm": 2.71347975730896, "grad_norm_var": 135.99456491905767, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.3059206008911133, "loss/hidden": 0.99609375, "loss/logits": 0.17658907175064087, "loss/reg": 0.00538119999691844, "step": 546 }, { "epoch": 0.068375, "grad_norm": 2.2055041790008545, "grad_norm_var": 135.99892540184646, "learning_rate": 0.0001, "loss": 1.227, "loss/crossentropy": 2.1022770404815674, "loss/hidden": 1.0078125, "loss/logits": 0.165423184633255, "loss/reg": 0.00537898438051343, "step": 547 }, { "epoch": 0.0685, "grad_norm": 2.9577088356018066, "grad_norm_var": 0.2900975003793434, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.7894651889801025, "loss/hidden": 0.8359375, "loss/logits": 0.13254427909851074, "loss/reg": 0.0053769489750266075, "step": 548 }, { "epoch": 0.068625, "grad_norm": 2.289632797241211, "grad_norm_var": 0.2971860973100412, "learning_rate": 0.0001, "loss": 0.9706, "loss/crossentropy": 1.9662660360336304, "loss/hidden": 0.796875, "loss/logits": 0.11997567117214203, "loss/reg": 0.005374929867684841, "step": 549 }, { "epoch": 0.06875, "grad_norm": 2.7140934467315674, "grad_norm_var": 0.26256089477725764, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.600255012512207, "loss/hidden": 0.875, "loss/logits": 0.15900644659996033, "loss/reg": 0.005373071413487196, "step": 550 }, { "epoch": 0.068875, "grad_norm": 2.7848618030548096, "grad_norm_var": 0.25973690827483153, "learning_rate": 0.0001, "loss": 1.1564, "loss/crossentropy": 2.347534656524658, "loss/hidden": 0.9296875, "loss/logits": 0.1730039119720459, "loss/reg": 0.005370850209146738, "step": 551 }, { "epoch": 0.069, "grad_norm": 2.1625566482543945, "grad_norm_var": 0.26552124449597064, "learning_rate": 0.0001, "loss": 1.0454, "loss/crossentropy": 2.766214609146118, "loss/hidden": 0.84765625, "loss/logits": 0.1440483182668686, "loss/reg": 0.005368667654693127, "step": 552 }, { "epoch": 0.069125, "grad_norm": 2.8881192207336426, "grad_norm_var": 0.2602997020069113, "learning_rate": 0.0001, "loss": 1.0958, "loss/crossentropy": 2.808046817779541, "loss/hidden": 0.89453125, "loss/logits": 0.14757747948169708, "loss/reg": 0.005366665776818991, "step": 553 }, { "epoch": 0.06925, "grad_norm": 2.6187915802001953, "grad_norm_var": 0.2563330715515538, "learning_rate": 0.0001, "loss": 1.2873, "loss/crossentropy": 2.5937459468841553, "loss/hidden": 1.03125, "loss/logits": 0.20239150524139404, "loss/reg": 0.005364455748349428, "step": 554 }, { "epoch": 0.069375, "grad_norm": 2.6418044567108154, "grad_norm_var": 0.23570370249948383, "learning_rate": 0.0001, "loss": 1.1877, "loss/crossentropy": 2.5635995864868164, "loss/hidden": 0.9609375, "loss/logits": 0.17318235337734222, "loss/reg": 0.005362290423363447, "step": 555 }, { "epoch": 0.0695, "grad_norm": 2.79367733001709, "grad_norm_var": 0.2326946034348102, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.13350510597229, "loss/hidden": 1.0, "loss/logits": 0.14820048213005066, "loss/reg": 0.005360215436667204, "step": 556 }, { "epoch": 0.069625, "grad_norm": 3.387333869934082, "grad_norm_var": 0.2433911623755942, "learning_rate": 0.0001, "loss": 1.0866, "loss/crossentropy": 2.4682462215423584, "loss/hidden": 0.859375, "loss/logits": 0.17359653115272522, "loss/reg": 0.005357977002859116, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.9143950939178467, "grad_norm_var": 0.19718877969401258, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.6049532890319824, "loss/hidden": 0.87890625, "loss/logits": 0.1768435537815094, "loss/reg": 0.005355944857001305, "step": 558 }, { "epoch": 0.069875, "grad_norm": 2.495455741882324, "grad_norm_var": 0.18769030937939207, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.2471513748168945, "loss/hidden": 0.8203125, "loss/logits": 0.17833727598190308, "loss/reg": 0.005353772081434727, "step": 559 }, { "epoch": 0.07, "grad_norm": 3.548495054244995, "grad_norm_var": 0.140786672247814, "learning_rate": 0.0001, "loss": 1.2284, "loss/crossentropy": 2.0618865489959717, "loss/hidden": 0.984375, "loss/logits": 0.1905221790075302, "loss/reg": 0.005351651925593615, "step": 560 }, { "epoch": 0.070125, "grad_norm": 2.8271002769470215, "grad_norm_var": 0.1394493347625937, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.392343282699585, "loss/hidden": 0.87109375, "loss/logits": 0.1387328803539276, "loss/reg": 0.005349620711058378, "step": 561 }, { "epoch": 0.07025, "grad_norm": 2.02376389503479, "grad_norm_var": 0.17221200465598158, "learning_rate": 0.0001, "loss": 1.0841, "loss/crossentropy": 2.26953125, "loss/hidden": 0.87890625, "loss/logits": 0.15167732536792755, "loss/reg": 0.005347614176571369, "step": 562 }, { "epoch": 0.070375, "grad_norm": 3.3085341453552246, "grad_norm_var": 0.17503849488183504, "learning_rate": 0.0001, "loss": 0.9333, "loss/crossentropy": 2.278923511505127, "loss/hidden": 0.76171875, "loss/logits": 0.11809547245502472, "loss/reg": 0.005345623474568129, "step": 563 }, { "epoch": 0.0705, "grad_norm": 6.067057132720947, "grad_norm_var": 0.8561705035714908, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.513504981994629, "loss/hidden": 1.421875, "loss/logits": 0.20107370615005493, "loss/reg": 0.00534354243427515, "step": 564 }, { "epoch": 0.070625, "grad_norm": 2.2686562538146973, "grad_norm_var": 0.8580914081280743, "learning_rate": 0.0001, "loss": 1.0927, "loss/crossentropy": 2.2088499069213867, "loss/hidden": 0.88671875, "loss/logits": 0.15257079899311066, "loss/reg": 0.005341436248272657, "step": 565 }, { "epoch": 0.07075, "grad_norm": 3.1334891319274902, "grad_norm_var": 0.8550377421403706, "learning_rate": 0.0001, "loss": 1.3017, "loss/crossentropy": 2.117647886276245, "loss/hidden": 1.0703125, "loss/logits": 0.17795339226722717, "loss/reg": 0.005339318886399269, "step": 566 }, { "epoch": 0.070875, "grad_norm": 4.6122727394104, "grad_norm_var": 1.0134023805364716, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.7603936195373535, "loss/hidden": 1.140625, "loss/logits": 0.27013444900512695, "loss/reg": 0.005337177775800228, "step": 567 }, { "epoch": 0.071, "grad_norm": 2.583162307739258, "grad_norm_var": 0.9715659491999304, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.380053758621216, "loss/hidden": 0.9765625, "loss/logits": 0.1751583367586136, "loss/reg": 0.005335117690265179, "step": 568 }, { "epoch": 0.071125, "grad_norm": 2.2990646362304688, "grad_norm_var": 1.0124076074311148, "learning_rate": 0.0001, "loss": 1.1188, "loss/crossentropy": 2.587369203567505, "loss/hidden": 0.8984375, "loss/logits": 0.16701750457286835, "loss/reg": 0.005332810804247856, "step": 569 }, { "epoch": 0.07125, "grad_norm": 3.1470165252685547, "grad_norm_var": 0.9962936596825245, "learning_rate": 0.0001, "loss": 1.0239, "loss/crossentropy": 2.492009162902832, "loss/hidden": 0.79296875, "loss/logits": 0.17766177654266357, "loss/reg": 0.0053307050839066505, "step": 570 }, { "epoch": 0.071375, "grad_norm": 2.8765156269073486, "grad_norm_var": 0.9845149270166076, "learning_rate": 0.0001, "loss": 1.1206, "loss/crossentropy": 2.714184045791626, "loss/hidden": 0.91015625, "loss/logits": 0.15715843439102173, "loss/reg": 0.005328655708581209, "step": 571 }, { "epoch": 0.0715, "grad_norm": 3.696258068084717, "grad_norm_var": 0.9934068745617035, "learning_rate": 0.0001, "loss": 1.7557, "loss/crossentropy": 2.322871685028076, "loss/hidden": 1.40625, "loss/logits": 0.2961430847644806, "loss/reg": 0.005326449871063232, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.3756725788116455, "grad_norm_var": 1.0320075552341808, "learning_rate": 0.0001, "loss": 1.0069, "loss/crossentropy": 2.446782350540161, "loss/hidden": 0.80859375, "loss/logits": 0.14505374431610107, "loss/reg": 0.005324224475771189, "step": 573 }, { "epoch": 0.07175, "grad_norm": 3.0114002227783203, "grad_norm_var": 1.0297287032782758, "learning_rate": 0.0001, "loss": 1.0473, "loss/crossentropy": 2.815708637237549, "loss/hidden": 0.84765625, "loss/logits": 0.146395742893219, "loss/reg": 0.005321910604834557, "step": 574 }, { "epoch": 0.071875, "grad_norm": 2.7700350284576416, "grad_norm_var": 1.0107660796879199, "learning_rate": 0.0001, "loss": 0.9849, "loss/crossentropy": 2.579951286315918, "loss/hidden": 0.79296875, "loss/logits": 0.13869163393974304, "loss/reg": 0.005319789983332157, "step": 575 }, { "epoch": 0.072, "grad_norm": 2.1934142112731934, "grad_norm_var": 1.0552091073780958, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.7635080814361572, "loss/hidden": 0.83203125, "loss/logits": 0.1523815095424652, "loss/reg": 0.005317789036780596, "step": 576 }, { "epoch": 0.072125, "grad_norm": 2.203432321548462, "grad_norm_var": 1.1000992612664597, "learning_rate": 0.0001, "loss": 1.0214, "loss/crossentropy": 2.2990267276763916, "loss/hidden": 0.81640625, "loss/logits": 0.15180249512195587, "loss/reg": 0.005315590649843216, "step": 577 }, { "epoch": 0.07225, "grad_norm": 2.7597663402557373, "grad_norm_var": 1.0346594183063509, "learning_rate": 0.0001, "loss": 1.2895, "loss/crossentropy": 2.7941789627075195, "loss/hidden": 1.0390625, "loss/logits": 0.1973191797733307, "loss/reg": 0.00531340204179287, "step": 578 }, { "epoch": 0.072375, "grad_norm": 2.151498794555664, "grad_norm_var": 1.0833220696738444, "learning_rate": 0.0001, "loss": 0.9999, "loss/crossentropy": 2.4545745849609375, "loss/hidden": 0.8125, "loss/logits": 0.1343034952878952, "loss/reg": 0.005311093758791685, "step": 579 }, { "epoch": 0.0725, "grad_norm": 2.758521318435669, "grad_norm_var": 0.4185770203540026, "learning_rate": 0.0001, "loss": 1.1404, "loss/crossentropy": 2.3098721504211426, "loss/hidden": 0.93359375, "loss/logits": 0.1536703109741211, "loss/reg": 0.00530878035351634, "step": 580 }, { "epoch": 0.072625, "grad_norm": 3.102933406829834, "grad_norm_var": 0.40269379192041677, "learning_rate": 0.0001, "loss": 1.4046, "loss/crossentropy": 2.4322452545166016, "loss/hidden": 1.125, "loss/logits": 0.22653597593307495, "loss/reg": 0.005306490696966648, "step": 581 }, { "epoch": 0.07275, "grad_norm": 2.831894636154175, "grad_norm_var": 0.39716603194751554, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.5340616703033447, "loss/hidden": 0.859375, "loss/logits": 0.14325933158397675, "loss/reg": 0.005304399877786636, "step": 582 }, { "epoch": 0.072875, "grad_norm": 2.5802910327911377, "grad_norm_var": 0.17392503265121587, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.5839056968688965, "loss/hidden": 0.86328125, "loss/logits": 0.16314582526683807, "loss/reg": 0.005302343517541885, "step": 583 }, { "epoch": 0.073, "grad_norm": 2.650399684906006, "grad_norm_var": 0.17308120367785024, "learning_rate": 0.0001, "loss": 1.1285, "loss/crossentropy": 2.487835645675659, "loss/hidden": 0.91796875, "loss/logits": 0.1575045883655548, "loss/reg": 0.005300293210893869, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.5146095752716064, "grad_norm_var": 0.16408850139509898, "learning_rate": 0.0001, "loss": 0.8801, "loss/crossentropy": 2.710824728012085, "loss/hidden": 0.71875, "loss/logits": 0.10839369148015976, "loss/reg": 0.005298234056681395, "step": 585 }, { "epoch": 0.07325, "grad_norm": 3.5579047203063965, "grad_norm_var": 0.19767952383566936, "learning_rate": 0.0001, "loss": 1.0659, "loss/crossentropy": 2.5979011058807373, "loss/hidden": 0.86328125, "loss/logits": 0.1496235430240631, "loss/reg": 0.005296017974615097, "step": 586 }, { "epoch": 0.073375, "grad_norm": 3.229036569595337, "grad_norm_var": 0.21129156050842327, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.0404820442199707, "loss/hidden": 1.109375, "loss/logits": 0.2113049328327179, "loss/reg": 0.005293776281177998, "step": 587 }, { "epoch": 0.0735, "grad_norm": 3.219778537750244, "grad_norm_var": 0.1669016788033178, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.5946946144104004, "loss/hidden": 0.96875, "loss/logits": 0.14563477039337158, "loss/reg": 0.005291698966175318, "step": 588 }, { "epoch": 0.073625, "grad_norm": 2.622143030166626, "grad_norm_var": 0.15858063234232014, "learning_rate": 0.0001, "loss": 1.2658, "loss/crossentropy": 2.2183616161346436, "loss/hidden": 1.0078125, "loss/logits": 0.20505878329277039, "loss/reg": 0.0052896649576723576, "step": 589 }, { "epoch": 0.07375, "grad_norm": 2.496985673904419, "grad_norm_var": 0.15786373129452994, "learning_rate": 0.0001, "loss": 1.008, "loss/crossentropy": 2.4871459007263184, "loss/hidden": 0.80078125, "loss/logits": 0.15438680350780487, "loss/reg": 0.005287437699735165, "step": 590 }, { "epoch": 0.073875, "grad_norm": 3.0701065063476562, "grad_norm_var": 0.1651866047673136, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.414074420928955, "loss/hidden": 0.9140625, "loss/logits": 0.16581328213214874, "loss/reg": 0.005285393912345171, "step": 591 }, { "epoch": 0.074, "grad_norm": 3.9464497566223145, "grad_norm_var": 0.22799900213871613, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.3605363368988037, "loss/hidden": 1.265625, "loss/logits": 0.25435870885849, "loss/reg": 0.0052834744565188885, "step": 592 }, { "epoch": 0.074125, "grad_norm": 2.5776402950286865, "grad_norm_var": 0.20419228079197158, "learning_rate": 0.0001, "loss": 1.2113, "loss/crossentropy": 2.3519833087921143, "loss/hidden": 0.9765625, "loss/logits": 0.18191702663898468, "loss/reg": 0.0052813272923231125, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.2281813621520996, "grad_norm_var": 0.23033113710586123, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.38511323928833, "loss/hidden": 0.875, "loss/logits": 0.14420956373214722, "loss/reg": 0.0052796173840761185, "step": 594 }, { "epoch": 0.074375, "grad_norm": 3.1069376468658447, "grad_norm_var": 0.19889239941204717, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.413785696029663, "loss/hidden": 0.984375, "loss/logits": 0.14399147033691406, "loss/reg": 0.005277944263070822, "step": 595 }, { "epoch": 0.0745, "grad_norm": 3.3450682163238525, "grad_norm_var": 0.2088716594217845, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.434509038925171, "loss/hidden": 1.0390625, "loss/logits": 0.1903418004512787, "loss/reg": 0.005276298616081476, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.371161937713623, "grad_norm_var": 0.22668853941960734, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.396265745162964, "loss/hidden": 0.85546875, "loss/logits": 0.14461319148540497, "loss/reg": 0.005274245049804449, "step": 597 }, { "epoch": 0.07475, "grad_norm": 3.314265251159668, "grad_norm_var": 0.2370575162921483, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.7262494564056396, "loss/hidden": 0.9765625, "loss/logits": 0.16941672563552856, "loss/reg": 0.0052725388668477535, "step": 598 }, { "epoch": 0.074875, "grad_norm": 3.7527589797973633, "grad_norm_var": 0.2687845607887461, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.6718039512634277, "loss/hidden": 0.98046875, "loss/logits": 0.18241068720817566, "loss/reg": 0.005270869936794043, "step": 599 }, { "epoch": 0.075, "grad_norm": 2.5073466300964355, "grad_norm_var": 0.2767358438012515, "learning_rate": 0.0001, "loss": 1.0856, "loss/crossentropy": 2.3735952377319336, "loss/hidden": 0.875, "loss/logits": 0.15789943933486938, "loss/reg": 0.005268939305096865, "step": 600 }, { "epoch": 0.075125, "grad_norm": 4.061317443847656, "grad_norm_var": 0.3279536252115766, "learning_rate": 0.0001, "loss": 1.3519, "loss/crossentropy": 2.5530035495758057, "loss/hidden": 1.1015625, "loss/logits": 0.19769783318042755, "loss/reg": 0.005267218686640263, "step": 601 }, { "epoch": 0.07525, "grad_norm": 2.4795703887939453, "grad_norm_var": 0.33305877012988483, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.4662020206451416, "loss/hidden": 0.93359375, "loss/logits": 0.17794585227966309, "loss/reg": 0.005265380721539259, "step": 602 }, { "epoch": 0.075375, "grad_norm": 3.2844204902648926, "grad_norm_var": 0.33479007900948143, "learning_rate": 0.0001, "loss": 1.0854, "loss/crossentropy": 2.5502281188964844, "loss/hidden": 0.87890625, "loss/logits": 0.15388712286949158, "loss/reg": 0.005263412371277809, "step": 603 }, { "epoch": 0.0755, "grad_norm": 2.4871444702148438, "grad_norm_var": 0.3492133912507728, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.5286808013916016, "loss/hidden": 0.86328125, "loss/logits": 0.17630262672901154, "loss/reg": 0.005261610262095928, "step": 604 }, { "epoch": 0.075625, "grad_norm": 2.9881107807159424, "grad_norm_var": 0.3402092077326716, "learning_rate": 0.0001, "loss": 1.1525, "loss/crossentropy": 2.522861957550049, "loss/hidden": 0.9375, "loss/logits": 0.16244357824325562, "loss/reg": 0.005259564146399498, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.3586983680725098, "grad_norm_var": 0.35069927923201, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.2875781059265137, "loss/hidden": 0.921875, "loss/logits": 0.15690375864505768, "loss/reg": 0.0052574859000742435, "step": 606 }, { "epoch": 0.075875, "grad_norm": 2.6491522789001465, "grad_norm_var": 0.35741571312755316, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.2390084266662598, "loss/hidden": 0.95703125, "loss/logits": 0.1875428408384323, "loss/reg": 0.005255614407360554, "step": 607 }, { "epoch": 0.076, "grad_norm": 2.073080539703369, "grad_norm_var": 0.33189569909254335, "learning_rate": 0.0001, "loss": 0.9664, "loss/crossentropy": 2.3911304473876953, "loss/hidden": 0.7890625, "loss/logits": 0.12475378811359406, "loss/reg": 0.005253734532743692, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.265080451965332, "grad_norm_var": 0.34931259933064945, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.5041847229003906, "loss/hidden": 0.9375, "loss/logits": 0.16390517354011536, "loss/reg": 0.005251840688288212, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.0803020000457764, "grad_norm_var": 0.3625360811458743, "learning_rate": 0.0001, "loss": 1.0503, "loss/crossentropy": 2.504490375518799, "loss/hidden": 0.84765625, "loss/logits": 0.15017710626125336, "loss/reg": 0.0052499608136713505, "step": 610 }, { "epoch": 0.076375, "grad_norm": 2.5562875270843506, "grad_norm_var": 0.3604403500297356, "learning_rate": 0.0001, "loss": 1.0444, "loss/crossentropy": 2.462942361831665, "loss/hidden": 0.82421875, "loss/logits": 0.16772450506687164, "loss/reg": 0.005247869063168764, "step": 611 }, { "epoch": 0.0765, "grad_norm": 2.2976746559143066, "grad_norm_var": 0.3509101683634808, "learning_rate": 0.0001, "loss": 1.0751, "loss/crossentropy": 2.423419713973999, "loss/hidden": 0.87890625, "loss/logits": 0.1437685340642929, "loss/reg": 0.005245808511972427, "step": 612 }, { "epoch": 0.076625, "grad_norm": 2.0925452709198, "grad_norm_var": 0.368735612720054, "learning_rate": 0.0001, "loss": 1.0097, "loss/crossentropy": 2.422513961791992, "loss/hidden": 0.80078125, "loss/logits": 0.15644872188568115, "loss/reg": 0.0052436222322285175, "step": 613 }, { "epoch": 0.07675, "grad_norm": 3.361826181411743, "grad_norm_var": 0.3727533997750771, "learning_rate": 0.0001, "loss": 1.0847, "loss/crossentropy": 2.6778650283813477, "loss/hidden": 0.88671875, "loss/logits": 0.14552772045135498, "loss/reg": 0.005241374485194683, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.418203353881836, "grad_norm_var": 0.29779963975303353, "learning_rate": 0.0001, "loss": 0.9395, "loss/crossentropy": 2.5650947093963623, "loss/hidden": 0.765625, "loss/logits": 0.12145733833312988, "loss/reg": 0.005239336285740137, "step": 615 }, { "epoch": 0.077, "grad_norm": 2.0879790782928467, "grad_norm_var": 0.3152329983661199, "learning_rate": 0.0001, "loss": 1.0952, "loss/crossentropy": 2.393650531768799, "loss/hidden": 0.8828125, "loss/logits": 0.15999376773834229, "loss/reg": 0.005237067583948374, "step": 616 }, { "epoch": 0.077125, "grad_norm": 2.236255168914795, "grad_norm_var": 0.1669205481679067, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.7451670169830322, "loss/hidden": 0.796875, "loss/logits": 0.1478239744901657, "loss/reg": 0.005235039163380861, "step": 617 }, { "epoch": 0.07725, "grad_norm": 3.4291763305664062, "grad_norm_var": 0.2229381174587303, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.2041330337524414, "loss/hidden": 1.1484375, "loss/logits": 0.2055673450231552, "loss/reg": 0.005233013071119785, "step": 618 }, { "epoch": 0.077375, "grad_norm": 2.1689059734344482, "grad_norm_var": 0.19023093415819758, "learning_rate": 0.0001, "loss": 1.1399, "loss/crossentropy": 2.3984005451202393, "loss/hidden": 0.9296875, "loss/logits": 0.1579177975654602, "loss/reg": 0.0052308449521660805, "step": 619 }, { "epoch": 0.0775, "grad_norm": 2.2414422035217285, "grad_norm_var": 0.1935046668737487, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.448946475982666, "loss/hidden": 0.81640625, "loss/logits": 0.12831541895866394, "loss/reg": 0.005228678695857525, "step": 620 }, { "epoch": 0.077625, "grad_norm": 2.2024965286254883, "grad_norm_var": 0.17639827374390885, "learning_rate": 0.0001, "loss": 1.1341, "loss/crossentropy": 2.475011110305786, "loss/hidden": 0.9140625, "loss/logits": 0.16773179173469543, "loss/reg": 0.005226653069257736, "step": 621 }, { "epoch": 0.07775, "grad_norm": 3.0845813751220703, "grad_norm_var": 0.20461207914334617, "learning_rate": 0.0001, "loss": 1.2647, "loss/crossentropy": 2.6116580963134766, "loss/hidden": 1.0234375, "loss/logits": 0.188987135887146, "loss/reg": 0.005224402993917465, "step": 622 }, { "epoch": 0.077875, "grad_norm": 1.887999415397644, "grad_norm_var": 0.2208956692966997, "learning_rate": 0.0001, "loss": 1.0901, "loss/crossentropy": 2.1740164756774902, "loss/hidden": 0.87890625, "loss/logits": 0.15901124477386475, "loss/reg": 0.005222304258495569, "step": 623 }, { "epoch": 0.078, "grad_norm": 2.436877489089966, "grad_norm_var": 0.2130556319156203, "learning_rate": 0.0001, "loss": 1.1171, "loss/crossentropy": 2.405428647994995, "loss/hidden": 0.9140625, "loss/logits": 0.1507887840270996, "loss/reg": 0.005220047663897276, "step": 624 }, { "epoch": 0.078125, "grad_norm": 2.1241559982299805, "grad_norm_var": 0.21735767872165226, "learning_rate": 0.0001, "loss": 1.0586, "loss/crossentropy": 2.534162759780884, "loss/hidden": 0.84765625, "loss/logits": 0.1587330847978592, "loss/reg": 0.005217918660491705, "step": 625 }, { "epoch": 0.07825, "grad_norm": 3.6194941997528076, "grad_norm_var": 0.2958829671732156, "learning_rate": 0.0001, "loss": 1.1975, "loss/crossentropy": 2.4529123306274414, "loss/hidden": 0.953125, "loss/logits": 0.1922522485256195, "loss/reg": 0.005215668119490147, "step": 626 }, { "epoch": 0.078375, "grad_norm": 2.9666078090667725, "grad_norm_var": 0.3086442760245996, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.2351934909820557, "loss/hidden": 1.1015625, "loss/logits": 0.2085917890071869, "loss/reg": 0.005213598720729351, "step": 627 }, { "epoch": 0.0785, "grad_norm": 3.424123764038086, "grad_norm_var": 0.35140186017642155, "learning_rate": 0.0001, "loss": 1.0209, "loss/crossentropy": 2.593291997909546, "loss/hidden": 0.8359375, "loss/logits": 0.13284316658973694, "loss/reg": 0.005211306270211935, "step": 628 }, { "epoch": 0.078625, "grad_norm": 3.1431374549865723, "grad_norm_var": 0.34770286145400553, "learning_rate": 0.0001, "loss": 1.1602, "loss/crossentropy": 2.5281360149383545, "loss/hidden": 0.9375, "loss/logits": 0.1706121563911438, "loss/reg": 0.005208863411098719, "step": 629 }, { "epoch": 0.07875, "grad_norm": 2.7911853790283203, "grad_norm_var": 0.315955495515612, "learning_rate": 0.0001, "loss": 1.2247, "loss/crossentropy": 1.7649813890457153, "loss/hidden": 0.99609375, "loss/logits": 0.17649176716804504, "loss/reg": 0.005206458270549774, "step": 630 }, { "epoch": 0.078875, "grad_norm": 2.941204309463501, "grad_norm_var": 0.3174858804582356, "learning_rate": 0.0001, "loss": 1.1613, "loss/crossentropy": 2.402109146118164, "loss/hidden": 0.921875, "loss/logits": 0.1873561143875122, "loss/reg": 0.0052040074951946735, "step": 631 }, { "epoch": 0.079, "grad_norm": 2.391481876373291, "grad_norm_var": 0.2995243667524064, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.353907346725464, "loss/hidden": 0.9609375, "loss/logits": 0.165082648396492, "loss/reg": 0.005201911553740501, "step": 632 }, { "epoch": 0.079125, "grad_norm": 2.87488055229187, "grad_norm_var": 0.2861166812267043, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.653827667236328, "loss/hidden": 0.91796875, "loss/logits": 0.17622330784797668, "loss/reg": 0.00519942119717598, "step": 633 }, { "epoch": 0.07925, "grad_norm": 2.4115476608276367, "grad_norm_var": 0.2563777078404484, "learning_rate": 0.0001, "loss": 1.1225, "loss/crossentropy": 2.4889907836914062, "loss/hidden": 0.8984375, "loss/logits": 0.1720612496137619, "loss/reg": 0.005197320133447647, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.4616434574127197, "grad_norm_var": 0.24219922325532028, "learning_rate": 0.0001, "loss": 1.1805, "loss/crossentropy": 2.3522822856903076, "loss/hidden": 0.94140625, "loss/logits": 0.1871228963136673, "loss/reg": 0.005195194855332375, "step": 635 }, { "epoch": 0.0795, "grad_norm": 2.386276960372925, "grad_norm_var": 0.23489288483797985, "learning_rate": 0.0001, "loss": 0.9463, "loss/crossentropy": 2.584338426589966, "loss/hidden": 0.765625, "loss/logits": 0.12873858213424683, "loss/reg": 0.005192761775106192, "step": 636 }, { "epoch": 0.079625, "grad_norm": 2.454456090927124, "grad_norm_var": 0.22225700139133117, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.4049668312072754, "loss/hidden": 0.80859375, "loss/logits": 0.15936976671218872, "loss/reg": 0.005190614145249128, "step": 637 }, { "epoch": 0.07975, "grad_norm": 2.1882073879241943, "grad_norm_var": 0.22800243516591642, "learning_rate": 0.0001, "loss": 1.1182, "loss/crossentropy": 2.5147759914398193, "loss/hidden": 0.92578125, "loss/logits": 0.14053833484649658, "loss/reg": 0.005188319832086563, "step": 638 }, { "epoch": 0.079875, "grad_norm": 2.214505434036255, "grad_norm_var": 0.20121127216839246, "learning_rate": 0.0001, "loss": 0.9821, "loss/crossentropy": 2.6784555912017822, "loss/hidden": 0.79296875, "loss/logits": 0.13726986944675446, "loss/reg": 0.0051859593950212, "step": 639 }, { "epoch": 0.08, "grad_norm": 2.8519279956817627, "grad_norm_var": 0.19869721717550323, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.4976460933685303, "loss/hidden": 1.1171875, "loss/logits": 0.21343019604682922, "loss/reg": 0.005183514207601547, "step": 640 }, { "epoch": 0.080125, "grad_norm": 3.2747607231140137, "grad_norm_var": 0.1926680012221444, "learning_rate": 0.0001, "loss": 1.0585, "loss/crossentropy": 2.265321969985962, "loss/hidden": 0.83203125, "loss/logits": 0.174637109041214, "loss/reg": 0.005180996377021074, "step": 641 }, { "epoch": 0.08025, "grad_norm": 2.433096408843994, "grad_norm_var": 0.14700668719525894, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.4713757038116455, "loss/hidden": 0.83984375, "loss/logits": 0.15003418922424316, "loss/reg": 0.005178460851311684, "step": 642 }, { "epoch": 0.080375, "grad_norm": 3.037181854248047, "grad_norm_var": 0.149821407729875, "learning_rate": 0.0001, "loss": 1.2828, "loss/crossentropy": 2.387122631072998, "loss/hidden": 1.0625, "loss/logits": 0.16852089762687683, "loss/reg": 0.005175705999135971, "step": 643 }, { "epoch": 0.0805, "grad_norm": 4.6729350090026855, "grad_norm_var": 0.3670359647179557, "learning_rate": 0.0001, "loss": 1.4674, "loss/crossentropy": 1.862856149673462, "loss/hidden": 1.21875, "loss/logits": 0.196872740983963, "loss/reg": 0.005172953009605408, "step": 644 }, { "epoch": 0.080625, "grad_norm": 3.1887784004211426, "grad_norm_var": 0.3693575970723629, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.418116569519043, "loss/hidden": 1.0703125, "loss/logits": 0.26818597316741943, "loss/reg": 0.005170162301510572, "step": 645 }, { "epoch": 0.08075, "grad_norm": 2.502976179122925, "grad_norm_var": 0.37434523124654007, "learning_rate": 0.0001, "loss": 0.9914, "loss/crossentropy": 2.666684865951538, "loss/hidden": 0.796875, "loss/logits": 0.14282414317131042, "loss/reg": 0.005168135743588209, "step": 646 }, { "epoch": 0.080875, "grad_norm": 3.457416534423828, "grad_norm_var": 0.4029304846601008, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.6663055419921875, "loss/hidden": 0.8671875, "loss/logits": 0.18181806802749634, "loss/reg": 0.005165606737136841, "step": 647 }, { "epoch": 0.081, "grad_norm": 3.3614838123321533, "grad_norm_var": 0.40888510034567177, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.758859157562256, "loss/hidden": 0.97265625, "loss/logits": 0.15888546407222748, "loss/reg": 0.005163096822798252, "step": 648 }, { "epoch": 0.081125, "grad_norm": 3.84016752243042, "grad_norm_var": 0.46893935653157826, "learning_rate": 0.0001, "loss": 1.1826, "loss/crossentropy": 2.6556475162506104, "loss/hidden": 0.91796875, "loss/logits": 0.21306458115577698, "loss/reg": 0.00516059435904026, "step": 649 }, { "epoch": 0.08125, "grad_norm": 3.2409677505493164, "grad_norm_var": 0.45558605122396995, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.2988944053649902, "loss/hidden": 1.078125, "loss/logits": 0.25512105226516724, "loss/reg": 0.005158509127795696, "step": 650 }, { "epoch": 0.081375, "grad_norm": 5.813977241516113, "grad_norm_var": 0.9294389115085245, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.1522276401519775, "loss/hidden": 1.09375, "loss/logits": 0.22606953978538513, "loss/reg": 0.005156443454325199, "step": 651 }, { "epoch": 0.0815, "grad_norm": 3.5314903259277344, "grad_norm_var": 0.8898375889113737, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.523782253265381, "loss/hidden": 1.015625, "loss/logits": 0.2011091113090515, "loss/reg": 0.00515406858175993, "step": 652 }, { "epoch": 0.081625, "grad_norm": 2.5944650173187256, "grad_norm_var": 0.8761365904132077, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.3588359355926514, "loss/hidden": 0.921875, "loss/logits": 0.16150620579719543, "loss/reg": 0.005151691380888224, "step": 653 }, { "epoch": 0.08175, "grad_norm": 3.0321786403656006, "grad_norm_var": 0.7997344400314499, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.35185170173645, "loss/hidden": 1.046875, "loss/logits": 0.17669130861759186, "loss/reg": 0.005149615928530693, "step": 654 }, { "epoch": 0.081875, "grad_norm": 3.2158820629119873, "grad_norm_var": 0.7154026962141908, "learning_rate": 0.0001, "loss": 1.0974, "loss/crossentropy": 2.809157133102417, "loss/hidden": 0.8984375, "loss/logits": 0.14747856557369232, "loss/reg": 0.005147217772901058, "step": 655 }, { "epoch": 0.082, "grad_norm": 2.1148674488067627, "grad_norm_var": 0.8010662785463902, "learning_rate": 0.0001, "loss": 1.056, "loss/crossentropy": 2.5246095657348633, "loss/hidden": 0.84765625, "loss/logits": 0.1568629890680313, "loss/reg": 0.005145091563463211, "step": 656 }, { "epoch": 0.082125, "grad_norm": 2.541887044906616, "grad_norm_var": 0.8402323056924367, "learning_rate": 0.0001, "loss": 1.2087, "loss/crossentropy": 2.3598315715789795, "loss/hidden": 1.0, "loss/logits": 0.15723757445812225, "loss/reg": 0.005142755340784788, "step": 657 }, { "epoch": 0.08225, "grad_norm": 2.292616605758667, "grad_norm_var": 0.8574455385669723, "learning_rate": 0.0001, "loss": 1.0515, "loss/crossentropy": 2.5242886543273926, "loss/hidden": 0.84375, "loss/logits": 0.15634778141975403, "loss/reg": 0.005140629597008228, "step": 658 }, { "epoch": 0.082375, "grad_norm": 3.6106507778167725, "grad_norm_var": 0.8596278513525417, "learning_rate": 0.0001, "loss": 1.3178, "loss/crossentropy": 2.6528077125549316, "loss/hidden": 1.09375, "loss/logits": 0.17268945276737213, "loss/reg": 0.005138530861586332, "step": 659 }, { "epoch": 0.0825, "grad_norm": 2.4270260334014893, "grad_norm_var": 0.7677345681069748, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.5859150886535645, "loss/hidden": 0.875, "loss/logits": 0.14567336440086365, "loss/reg": 0.005136391613632441, "step": 660 }, { "epoch": 0.082625, "grad_norm": 10.746210098266602, "grad_norm_var": 4.3533807562107985, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.6105966567993164, "loss/hidden": 1.09375, "loss/logits": 0.21044138073921204, "loss/reg": 0.005134167615324259, "step": 661 }, { "epoch": 0.08275, "grad_norm": 2.277845621109009, "grad_norm_var": 4.3908370843377496, "learning_rate": 0.0001, "loss": 1.1244, "loss/crossentropy": 2.624403715133667, "loss/hidden": 0.9140625, "loss/logits": 0.15903490781784058, "loss/reg": 0.005131968762725592, "step": 662 }, { "epoch": 0.082875, "grad_norm": 2.2439072132110596, "grad_norm_var": 4.510992587376572, "learning_rate": 0.0001, "loss": 1.1974, "loss/crossentropy": 2.325004816055298, "loss/hidden": 0.96875, "loss/logits": 0.1773640513420105, "loss/reg": 0.005129888188093901, "step": 663 }, { "epoch": 0.083, "grad_norm": 2.5274457931518555, "grad_norm_var": 4.57602786514919, "learning_rate": 0.0001, "loss": 1.0481, "loss/crossentropy": 2.7819478511810303, "loss/hidden": 0.8515625, "loss/logits": 0.1452445089817047, "loss/reg": 0.005127874203026295, "step": 664 }, { "epoch": 0.083125, "grad_norm": 2.381653308868408, "grad_norm_var": 4.643456939433319, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.476351737976074, "loss/hidden": 0.859375, "loss/logits": 0.14875781536102295, "loss/reg": 0.005125833675265312, "step": 665 }, { "epoch": 0.08325, "grad_norm": 2.565531015396118, "grad_norm_var": 4.687379253455119, "learning_rate": 0.0001, "loss": 1.092, "loss/crossentropy": 2.164882183074951, "loss/hidden": 0.88671875, "loss/logits": 0.15401628613471985, "loss/reg": 0.005123757291585207, "step": 666 }, { "epoch": 0.083375, "grad_norm": 2.686464309692383, "grad_norm_var": 4.279508443254811, "learning_rate": 0.0001, "loss": 1.1857, "loss/crossentropy": 2.49807071685791, "loss/hidden": 0.9609375, "loss/logits": 0.17354023456573486, "loss/reg": 0.005121580790728331, "step": 667 }, { "epoch": 0.0835, "grad_norm": 2.211970806121826, "grad_norm_var": 4.325501093334068, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.3563051223754883, "loss/hidden": 0.80078125, "loss/logits": 0.14245735108852386, "loss/reg": 0.005119378212839365, "step": 668 }, { "epoch": 0.083625, "grad_norm": 40.70305252075195, "grad_norm_var": 92.56442532718542, "learning_rate": 0.0001, "loss": 1.1762, "loss/crossentropy": 2.488436460494995, "loss/hidden": 0.9453125, "loss/logits": 0.1797066330909729, "loss/reg": 0.0051171439699828625, "step": 669 }, { "epoch": 0.08375, "grad_norm": 2.748046875, "grad_norm_var": 92.66196615048598, "learning_rate": 0.0001, "loss": 1.1097, "loss/crossentropy": 2.310842990875244, "loss/hidden": 0.91796875, "loss/logits": 0.14054188132286072, "loss/reg": 0.0051149362698197365, "step": 670 }, { "epoch": 0.083875, "grad_norm": 3.018019199371338, "grad_norm_var": 92.72350960683757, "learning_rate": 0.0001, "loss": 1.1882, "loss/crossentropy": 2.5257725715637207, "loss/hidden": 0.9921875, "loss/logits": 0.144926518201828, "loss/reg": 0.005112735088914633, "step": 671 }, { "epoch": 0.084, "grad_norm": 3.2833805084228516, "grad_norm_var": 92.29023014918404, "learning_rate": 0.0001, "loss": 1.3506, "loss/crossentropy": 2.737273693084717, "loss/hidden": 1.0703125, "loss/logits": 0.22916561365127563, "loss/reg": 0.005110514350235462, "step": 672 }, { "epoch": 0.084125, "grad_norm": 3.5235512256622314, "grad_norm_var": 91.96110241564834, "learning_rate": 0.0001, "loss": 1.1667, "loss/crossentropy": 2.603550434112549, "loss/hidden": 0.97265625, "loss/logits": 0.14292669296264648, "loss/reg": 0.00510829733684659, "step": 673 }, { "epoch": 0.08425, "grad_norm": 8.889531135559082, "grad_norm_var": 91.7913062331738, "learning_rate": 0.0001, "loss": 1.6779, "loss/crossentropy": 2.7345638275146484, "loss/hidden": 1.25, "loss/logits": 0.37679582834243774, "loss/reg": 0.005105969030410051, "step": 674 }, { "epoch": 0.084375, "grad_norm": 3.833319664001465, "grad_norm_var": 91.72375618009856, "learning_rate": 0.0001, "loss": 1.1576, "loss/crossentropy": 2.670295000076294, "loss/hidden": 0.91015625, "loss/logits": 0.19638602435588837, "loss/reg": 0.005103633739054203, "step": 675 }, { "epoch": 0.0845, "grad_norm": 13.332308769226074, "grad_norm_var": 93.95525708687954, "learning_rate": 0.0001, "loss": 1.2802, "loss/crossentropy": 2.4972240924835205, "loss/hidden": 1.046875, "loss/logits": 0.18228942155838013, "loss/reg": 0.005101518705487251, "step": 676 }, { "epoch": 0.084625, "grad_norm": 3.3041481971740723, "grad_norm_var": 93.38769696489507, "learning_rate": 0.0001, "loss": 1.1323, "loss/crossentropy": 2.318303108215332, "loss/hidden": 0.92578125, "loss/logits": 0.15550082921981812, "loss/reg": 0.005099330097436905, "step": 677 }, { "epoch": 0.08475, "grad_norm": 3.0213379859924316, "grad_norm_var": 93.03138783085473, "learning_rate": 0.0001, "loss": 1.2046, "loss/crossentropy": 2.330695629119873, "loss/hidden": 0.921875, "loss/logits": 0.23171411454677582, "loss/reg": 0.005097060929983854, "step": 678 }, { "epoch": 0.084875, "grad_norm": 2.5998549461364746, "grad_norm_var": 92.84836678832802, "learning_rate": 0.0001, "loss": 1.0649, "loss/crossentropy": 2.699117422103882, "loss/hidden": 0.859375, "loss/logits": 0.15458270907402039, "loss/reg": 0.005094949621707201, "step": 679 }, { "epoch": 0.085, "grad_norm": 2.244635581970215, "grad_norm_var": 92.99521966737969, "learning_rate": 0.0001, "loss": 1.1168, "loss/crossentropy": 2.572654962539673, "loss/hidden": 0.90234375, "loss/logits": 0.1635233759880066, "loss/reg": 0.005092862527817488, "step": 680 }, { "epoch": 0.085125, "grad_norm": 2.6163716316223145, "grad_norm_var": 92.87692169982786, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.735013961791992, "loss/hidden": 0.93359375, "loss/logits": 0.18059232831001282, "loss/reg": 0.005090588703751564, "step": 681 }, { "epoch": 0.08525, "grad_norm": 2.5760252475738525, "grad_norm_var": 92.8717223043897, "learning_rate": 0.0001, "loss": 0.9637, "loss/crossentropy": 2.4715609550476074, "loss/hidden": 0.7890625, "loss/logits": 0.1237054169178009, "loss/reg": 0.0050884694792330265, "step": 682 }, { "epoch": 0.085375, "grad_norm": 2.3052964210510254, "grad_norm_var": 93.06379073504952, "learning_rate": 0.0001, "loss": 1.1353, "loss/crossentropy": 2.2887136936187744, "loss/hidden": 0.92578125, "loss/logits": 0.15868628025054932, "loss/reg": 0.005086386110633612, "step": 683 }, { "epoch": 0.0855, "grad_norm": 2.239668130874634, "grad_norm_var": 93.04887766727983, "learning_rate": 0.0001, "loss": 1.033, "loss/crossentropy": 2.420260429382324, "loss/hidden": 0.84765625, "loss/logits": 0.13450753688812256, "loss/reg": 0.005084337200969458, "step": 684 }, { "epoch": 0.085625, "grad_norm": 2.7788403034210205, "grad_norm_var": 8.800650863003963, "learning_rate": 0.0001, "loss": 1.2414, "loss/crossentropy": 2.505138397216797, "loss/hidden": 1.03125, "loss/logits": 0.15931686758995056, "loss/reg": 0.005082385148853064, "step": 685 }, { "epoch": 0.08575, "grad_norm": 2.274430513381958, "grad_norm_var": 8.887076805039055, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.24831485748291, "loss/hidden": 0.83984375, "loss/logits": 0.14174425601959229, "loss/reg": 0.00508028594776988, "step": 686 }, { "epoch": 0.085875, "grad_norm": 2.8570923805236816, "grad_norm_var": 8.906869950057784, "learning_rate": 0.0001, "loss": 1.2426, "loss/crossentropy": 2.1708295345306396, "loss/hidden": 1.0078125, "loss/logits": 0.18402233719825745, "loss/reg": 0.005078236572444439, "step": 687 }, { "epoch": 0.086, "grad_norm": 2.6168949604034424, "grad_norm_var": 8.985428302339566, "learning_rate": 0.0001, "loss": 1.1911, "loss/crossentropy": 2.4759016036987305, "loss/hidden": 0.97265625, "loss/logits": 0.1677204668521881, "loss/reg": 0.005076236091554165, "step": 688 }, { "epoch": 0.086125, "grad_norm": 2.738102674484253, "grad_norm_var": 9.054334077972502, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.4160187244415283, "loss/hidden": 0.94140625, "loss/logits": 0.1464519500732422, "loss/reg": 0.005074144806712866, "step": 689 }, { "epoch": 0.08625, "grad_norm": 2.7573044300079346, "grad_norm_var": 7.214004841898908, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.342536449432373, "loss/hidden": 0.9296875, "loss/logits": 0.15226896107196808, "loss/reg": 0.0050718653947114944, "step": 690 }, { "epoch": 0.086375, "grad_norm": 2.4906835556030273, "grad_norm_var": 7.245694276683736, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.6861307621002197, "loss/hidden": 0.84765625, "loss/logits": 0.14462026953697205, "loss/reg": 0.005069798789918423, "step": 691 }, { "epoch": 0.0865, "grad_norm": 2.3750221729278564, "grad_norm_var": 0.08836772564593408, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.5460920333862305, "loss/hidden": 0.91796875, "loss/logits": 0.1644265055656433, "loss/reg": 0.0050675952807068825, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.2382612228393555, "grad_norm_var": 0.06104096205675281, "learning_rate": 0.0001, "loss": 1.1182, "loss/crossentropy": 2.5386240482330322, "loss/hidden": 0.9140625, "loss/logits": 0.15351390838623047, "loss/reg": 0.005065726116299629, "step": 693 }, { "epoch": 0.08675, "grad_norm": 2.582509994506836, "grad_norm_var": 0.04524178053572901, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.5054309368133545, "loss/hidden": 0.92578125, "loss/logits": 0.15845400094985962, "loss/reg": 0.00506393238902092, "step": 694 }, { "epoch": 0.086875, "grad_norm": 3.3852474689483643, "grad_norm_var": 0.09234654068112012, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.463137626647949, "loss/hidden": 1.09375, "loss/logits": 0.23065921664237976, "loss/reg": 0.005062177777290344, "step": 695 }, { "epoch": 0.087, "grad_norm": 2.7022039890289307, "grad_norm_var": 0.085748197103725, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.2784640789031982, "loss/hidden": 0.953125, "loss/logits": 0.17874625325202942, "loss/reg": 0.005060084629803896, "step": 696 }, { "epoch": 0.087125, "grad_norm": 3.218095064163208, "grad_norm_var": 0.11002230581329756, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.372589111328125, "loss/hidden": 1.1875, "loss/logits": 0.21512824296951294, "loss/reg": 0.005058267153799534, "step": 697 }, { "epoch": 0.08725, "grad_norm": 2.2941925525665283, "grad_norm_var": 0.11714567363764346, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.2349698543548584, "loss/hidden": 0.99609375, "loss/logits": 0.16368569433689117, "loss/reg": 0.005056225229054689, "step": 698 }, { "epoch": 0.087375, "grad_norm": 2.4463765621185303, "grad_norm_var": 0.11254763430842919, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.49548077583313, "loss/hidden": 0.84375, "loss/logits": 0.13410091400146484, "loss/reg": 0.005054513458162546, "step": 699 }, { "epoch": 0.0875, "grad_norm": 2.5363550186157227, "grad_norm_var": 0.1028185685472406, "learning_rate": 0.0001, "loss": 0.9756, "loss/crossentropy": 2.900705099105835, "loss/hidden": 0.7890625, "loss/logits": 0.13605856895446777, "loss/reg": 0.005052678752690554, "step": 700 }, { "epoch": 0.087625, "grad_norm": 3.4383292198181152, "grad_norm_var": 0.1419262550473822, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.239861488342285, "loss/hidden": 1.0390625, "loss/logits": 0.18931907415390015, "loss/reg": 0.005051023792475462, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.1923646926879883, "grad_norm_var": 0.14683359089866196, "learning_rate": 0.0001, "loss": 1.0723, "loss/crossentropy": 2.5594210624694824, "loss/hidden": 0.8515625, "loss/logits": 0.17024339735507965, "loss/reg": 0.005049179773777723, "step": 702 }, { "epoch": 0.087875, "grad_norm": 2.492584466934204, "grad_norm_var": 0.1464975365420211, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.549513339996338, "loss/hidden": 0.890625, "loss/logits": 0.1421511173248291, "loss/reg": 0.005047108978033066, "step": 703 }, { "epoch": 0.088, "grad_norm": 3.04917311668396, "grad_norm_var": 0.15589194049567348, "learning_rate": 0.0001, "loss": 0.9286, "loss/crossentropy": 2.3838376998901367, "loss/hidden": 0.7578125, "loss/logits": 0.12035049498081207, "loss/reg": 0.005045315716415644, "step": 704 }, { "epoch": 0.088125, "grad_norm": 3.7284188270568848, "grad_norm_var": 0.22439052206896856, "learning_rate": 0.0001, "loss": 1.2283, "loss/crossentropy": 2.4002139568328857, "loss/hidden": 1.0, "loss/logits": 0.17786875367164612, "loss/reg": 0.005043353885412216, "step": 705 }, { "epoch": 0.08825, "grad_norm": 2.3665406703948975, "grad_norm_var": 0.2333161514143832, "learning_rate": 0.0001, "loss": 1.0864, "loss/crossentropy": 2.2537431716918945, "loss/hidden": 0.88671875, "loss/logits": 0.14927825331687927, "loss/reg": 0.005041591357439756, "step": 706 }, { "epoch": 0.088375, "grad_norm": 2.461461067199707, "grad_norm_var": 0.23426700013735413, "learning_rate": 0.0001, "loss": 0.9665, "loss/crossentropy": 2.267686605453491, "loss/hidden": 0.78125, "loss/logits": 0.1348324567079544, "loss/reg": 0.005039647221565247, "step": 707 }, { "epoch": 0.0885, "grad_norm": 2.219465494155884, "grad_norm_var": 0.24291783945608714, "learning_rate": 0.0001, "loss": 1.0682, "loss/crossentropy": 2.5170199871063232, "loss/hidden": 0.8828125, "loss/logits": 0.13497616350650787, "loss/reg": 0.0050375694409012794, "step": 708 }, { "epoch": 0.088625, "grad_norm": 2.5682785511016846, "grad_norm_var": 0.22899036593855726, "learning_rate": 0.0001, "loss": 1.1712, "loss/crossentropy": 2.3696398735046387, "loss/hidden": 0.95703125, "loss/logits": 0.16378942131996155, "loss/reg": 0.00503552844747901, "step": 709 }, { "epoch": 0.08875, "grad_norm": 2.2680654525756836, "grad_norm_var": 0.2413579176160397, "learning_rate": 0.0001, "loss": 1.1427, "loss/crossentropy": 2.4121482372283936, "loss/hidden": 0.9296875, "loss/logits": 0.16271916031837463, "loss/reg": 0.005033775232732296, "step": 710 }, { "epoch": 0.088875, "grad_norm": 7.707209587097168, "grad_norm_var": 1.7976793028734939, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.64532470703125, "loss/hidden": 1.1953125, "loss/logits": 0.23005220293998718, "loss/reg": 0.005031922832131386, "step": 711 }, { "epoch": 0.089, "grad_norm": 2.4962596893310547, "grad_norm_var": 1.8079738281494115, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.481624126434326, "loss/hidden": 0.96875, "loss/logits": 0.17317567765712738, "loss/reg": 0.005030201282352209, "step": 712 }, { "epoch": 0.089125, "grad_norm": 2.164900779724121, "grad_norm_var": 1.842137749294079, "learning_rate": 0.0001, "loss": 1.064, "loss/crossentropy": 2.228675365447998, "loss/hidden": 0.85546875, "loss/logits": 0.1582651436328888, "loss/reg": 0.005028109531849623, "step": 713 }, { "epoch": 0.08925, "grad_norm": 2.5871829986572266, "grad_norm_var": 1.8237636675870703, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.4661970138549805, "loss/hidden": 0.9140625, "loss/logits": 0.16222231090068817, "loss/reg": 0.005026375409215689, "step": 714 }, { "epoch": 0.089375, "grad_norm": 2.6158599853515625, "grad_norm_var": 1.814851924792763, "learning_rate": 0.0001, "loss": 1.2786, "loss/crossentropy": 2.4959585666656494, "loss/hidden": 1.0390625, "loss/logits": 0.18929770588874817, "loss/reg": 0.005024294834583998, "step": 715 }, { "epoch": 0.0895, "grad_norm": 1.9927250146865845, "grad_norm_var": 1.8619121365324653, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.471590518951416, "loss/hidden": 0.828125, "loss/logits": 0.15405681729316711, "loss/reg": 0.005022158846259117, "step": 716 }, { "epoch": 0.089625, "grad_norm": 2.2087814807891846, "grad_norm_var": 1.8676209281098621, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.4889180660247803, "loss/hidden": 0.83984375, "loss/logits": 0.14751726388931274, "loss/reg": 0.005020026583224535, "step": 717 }, { "epoch": 0.08975, "grad_norm": 2.185058116912842, "grad_norm_var": 1.8682356690613582, "learning_rate": 0.0001, "loss": 1.1229, "loss/crossentropy": 2.4008331298828125, "loss/hidden": 0.921875, "loss/logits": 0.15089130401611328, "loss/reg": 0.005017881281673908, "step": 718 }, { "epoch": 0.089875, "grad_norm": 2.22664737701416, "grad_norm_var": 1.8842476127138506, "learning_rate": 0.0001, "loss": 0.9793, "loss/crossentropy": 2.6462786197662354, "loss/hidden": 0.7890625, "loss/logits": 0.14010348916053772, "loss/reg": 0.0050159962847828865, "step": 719 }, { "epoch": 0.09, "grad_norm": 7.475221633911133, "grad_norm_var": 3.2539659864593964, "learning_rate": 0.0001, "loss": 1.2258, "loss/crossentropy": 2.450388193130493, "loss/hidden": 1.015625, "loss/logits": 0.16006067395210266, "loss/reg": 0.005013884510844946, "step": 720 }, { "epoch": 0.090125, "grad_norm": 2.1259288787841797, "grad_norm_var": 3.275813935195105, "learning_rate": 0.0001, "loss": 1.1558, "loss/crossentropy": 2.3211934566497803, "loss/hidden": 0.9375, "loss/logits": 0.1681801825761795, "loss/reg": 0.0050118486396968365, "step": 721 }, { "epoch": 0.09025, "grad_norm": 3.284715414047241, "grad_norm_var": 3.2534822002252284, "learning_rate": 0.0001, "loss": 1.227, "loss/crossentropy": 2.3303604125976562, "loss/hidden": 1.0, "loss/logits": 0.17691665887832642, "loss/reg": 0.005009867250919342, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.481712818145752, "grad_norm_var": 3.2519544593852943, "learning_rate": 0.0001, "loss": 1.0745, "loss/crossentropy": 2.516172409057617, "loss/hidden": 0.86328125, "loss/logits": 0.1610938012599945, "loss/reg": 0.005008057691156864, "step": 723 }, { "epoch": 0.0905, "grad_norm": 2.6934256553649902, "grad_norm_var": 3.2142672637689955, "learning_rate": 0.0001, "loss": 1.0241, "loss/crossentropy": 2.4445412158966064, "loss/hidden": 0.83203125, "loss/logits": 0.14205417037010193, "loss/reg": 0.0050062634982168674, "step": 724 }, { "epoch": 0.090625, "grad_norm": 2.8393290042877197, "grad_norm_var": 3.2008126847008653, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.155627489089966, "loss/hidden": 1.203125, "loss/logits": 0.22749567031860352, "loss/reg": 0.005004186183214188, "step": 725 }, { "epoch": 0.09075, "grad_norm": 2.7673983573913574, "grad_norm_var": 3.16203540734179, "learning_rate": 0.0001, "loss": 1.0853, "loss/crossentropy": 2.841604709625244, "loss/hidden": 0.859375, "loss/logits": 0.17592039704322815, "loss/reg": 0.005002181977033615, "step": 726 }, { "epoch": 0.090875, "grad_norm": 2.4957730770111084, "grad_norm_var": 1.6690794582387851, "learning_rate": 0.0001, "loss": 1.0616, "loss/crossentropy": 2.4109010696411133, "loss/hidden": 0.85546875, "loss/logits": 0.15614046156406403, "loss/reg": 0.005000332836061716, "step": 727 }, { "epoch": 0.091, "grad_norm": 2.2517614364624023, "grad_norm_var": 1.6823934112280023, "learning_rate": 0.0001, "loss": 0.9522, "loss/crossentropy": 2.5019116401672363, "loss/hidden": 0.77734375, "loss/logits": 0.12488029897212982, "loss/reg": 0.004998230375349522, "step": 728 }, { "epoch": 0.091125, "grad_norm": 3.1151885986328125, "grad_norm_var": 1.6615595314428224, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.0731663703918457, "loss/hidden": 1.140625, "loss/logits": 0.1435013860464096, "loss/reg": 0.00499630905687809, "step": 729 }, { "epoch": 0.09125, "grad_norm": 2.27622127532959, "grad_norm_var": 1.6778435468635766, "learning_rate": 0.0001, "loss": 0.9945, "loss/crossentropy": 2.5911362171173096, "loss/hidden": 0.8046875, "loss/logits": 0.13988272845745087, "loss/reg": 0.004994215443730354, "step": 730 }, { "epoch": 0.091375, "grad_norm": 2.634037971496582, "grad_norm_var": 1.6773821814765582, "learning_rate": 0.0001, "loss": 1.0821, "loss/crossentropy": 2.217550754547119, "loss/hidden": 0.8828125, "loss/logits": 0.14941135048866272, "loss/reg": 0.004992038011550903, "step": 731 }, { "epoch": 0.0915, "grad_norm": 3.696157693862915, "grad_norm_var": 1.6717809998289452, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.527979850769043, "loss/hidden": 1.1796875, "loss/logits": 0.23702046275138855, "loss/reg": 0.004989837761968374, "step": 732 }, { "epoch": 0.091625, "grad_norm": 2.2505931854248047, "grad_norm_var": 1.6679122787177638, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.539890766143799, "loss/hidden": 0.93359375, "loss/logits": 0.17588791251182556, "loss/reg": 0.004987762775272131, "step": 733 }, { "epoch": 0.09175, "grad_norm": 2.468350887298584, "grad_norm_var": 1.6449808034713405, "learning_rate": 0.0001, "loss": 0.9491, "loss/crossentropy": 2.3645944595336914, "loss/hidden": 0.77734375, "loss/logits": 0.12187166512012482, "loss/reg": 0.004985733889043331, "step": 734 }, { "epoch": 0.091875, "grad_norm": 2.4574854373931885, "grad_norm_var": 1.6262736490095788, "learning_rate": 0.0001, "loss": 0.9553, "loss/crossentropy": 2.5622363090515137, "loss/hidden": 0.78515625, "loss/logits": 0.12034176290035248, "loss/reg": 0.004983709193766117, "step": 735 }, { "epoch": 0.092, "grad_norm": 2.2961714267730713, "grad_norm_var": 0.18272698620191838, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.65541934967041, "loss/hidden": 0.8046875, "loss/logits": 0.13119381666183472, "loss/reg": 0.004981704521924257, "step": 736 }, { "epoch": 0.092125, "grad_norm": 1.9556196928024292, "grad_norm_var": 0.19606320022039506, "learning_rate": 0.0001, "loss": 1.1183, "loss/crossentropy": 2.4507250785827637, "loss/hidden": 0.90625, "loss/logits": 0.16227680444717407, "loss/reg": 0.004979623947292566, "step": 737 }, { "epoch": 0.09225, "grad_norm": 4.161533832550049, "grad_norm_var": 0.32150407886374516, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.07208514213562, "loss/hidden": 1.0703125, "loss/logits": 0.17855030298233032, "loss/reg": 0.0049775131046772, "step": 738 }, { "epoch": 0.092375, "grad_norm": 3.0142910480499268, "grad_norm_var": 0.3253252453994368, "learning_rate": 0.0001, "loss": 0.9659, "loss/crossentropy": 2.569051742553711, "loss/hidden": 0.78515625, "loss/logits": 0.13097813725471497, "loss/reg": 0.004975371062755585, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.2635934352874756, "grad_norm_var": 0.33787014856401254, "learning_rate": 0.0001, "loss": 1.0329, "loss/crossentropy": 2.4809088706970215, "loss/hidden": 0.83203125, "loss/logits": 0.15113815665245056, "loss/reg": 0.004973322618752718, "step": 740 }, { "epoch": 0.092625, "grad_norm": 2.633608818054199, "grad_norm_var": 0.33625377709689125, "learning_rate": 0.0001, "loss": 1.1759, "loss/crossentropy": 2.336703062057495, "loss/hidden": 0.953125, "loss/logits": 0.17310968041419983, "loss/reg": 0.004971369635313749, "step": 741 }, { "epoch": 0.09275, "grad_norm": 2.431776285171509, "grad_norm_var": 0.3389851198561174, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.4961469173431396, "loss/hidden": 0.8203125, "loss/logits": 0.15009689331054688, "loss/reg": 0.004969437140971422, "step": 742 }, { "epoch": 0.092875, "grad_norm": 2.756232976913452, "grad_norm_var": 0.3378643921182785, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.4552102088928223, "loss/hidden": 0.8515625, "loss/logits": 0.12634404003620148, "loss/reg": 0.0049674008041620255, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.5648443698883057, "grad_norm_var": 0.32668128102186145, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.811657667160034, "loss/hidden": 0.93359375, "loss/logits": 0.15678739547729492, "loss/reg": 0.0049654701724648476, "step": 744 }, { "epoch": 0.093125, "grad_norm": 2.283196210861206, "grad_norm_var": 0.32233157119037936, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.3052849769592285, "loss/hidden": 0.8828125, "loss/logits": 0.1552238166332245, "loss/reg": 0.004963380750268698, "step": 745 }, { "epoch": 0.09325, "grad_norm": 2.500383138656616, "grad_norm_var": 0.3147792588205774, "learning_rate": 0.0001, "loss": 1.0794, "loss/crossentropy": 2.6310482025146484, "loss/hidden": 0.88671875, "loss/logits": 0.14305052161216736, "loss/reg": 0.0049613784067332745, "step": 746 }, { "epoch": 0.093375, "grad_norm": 2.7470545768737793, "grad_norm_var": 0.3153672801439085, "learning_rate": 0.0001, "loss": 1.1538, "loss/crossentropy": 2.645195722579956, "loss/hidden": 0.921875, "loss/logits": 0.18234500288963318, "loss/reg": 0.004959197249263525, "step": 747 }, { "epoch": 0.0935, "grad_norm": 2.790817975997925, "grad_norm_var": 0.24092132942115865, "learning_rate": 0.0001, "loss": 1.015, "loss/crossentropy": 2.0862972736358643, "loss/hidden": 0.8359375, "loss/logits": 0.12952454388141632, "loss/reg": 0.004957180004566908, "step": 748 }, { "epoch": 0.093625, "grad_norm": 2.898916482925415, "grad_norm_var": 0.23711979067914365, "learning_rate": 0.0001, "loss": 1.0498, "loss/crossentropy": 2.522505760192871, "loss/hidden": 0.8515625, "loss/logits": 0.14867964386940002, "loss/reg": 0.004955058917403221, "step": 749 }, { "epoch": 0.09375, "grad_norm": 3.0828936100006104, "grad_norm_var": 0.24674152232076801, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.078137159347534, "loss/hidden": 1.0703125, "loss/logits": 0.2288488745689392, "loss/reg": 0.004952888935804367, "step": 750 }, { "epoch": 0.093875, "grad_norm": 4.464056491851807, "grad_norm_var": 0.439550102142455, "learning_rate": 0.0001, "loss": 1.125, "loss/crossentropy": 2.6811301708221436, "loss/hidden": 0.9140625, "loss/logits": 0.16146372258663177, "loss/reg": 0.004950782749801874, "step": 751 }, { "epoch": 0.094, "grad_norm": 2.9327120780944824, "grad_norm_var": 0.4218744680947139, "learning_rate": 0.0001, "loss": 1.1226, "loss/crossentropy": 2.502683639526367, "loss/hidden": 0.91015625, "loss/logits": 0.16296005249023438, "loss/reg": 0.004948711488395929, "step": 752 }, { "epoch": 0.094125, "grad_norm": 2.4569215774536133, "grad_norm_var": 0.3782952433457505, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.5091257095336914, "loss/hidden": 0.90234375, "loss/logits": 0.15988323092460632, "loss/reg": 0.004946760833263397, "step": 753 }, { "epoch": 0.09425, "grad_norm": 5.152282238006592, "grad_norm_var": 0.6097367248532388, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.2930901050567627, "loss/hidden": 1.1484375, "loss/logits": 0.19504866003990173, "loss/reg": 0.004944849293678999, "step": 754 }, { "epoch": 0.094375, "grad_norm": 2.604393243789673, "grad_norm_var": 0.6159506323654304, "learning_rate": 0.0001, "loss": 1.0457, "loss/crossentropy": 2.59228777885437, "loss/hidden": 0.8359375, "loss/logits": 0.16035211086273193, "loss/reg": 0.004942973144352436, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.197974443435669, "grad_norm_var": 0.6218773019698792, "learning_rate": 0.0001, "loss": 1.0755, "loss/crossentropy": 2.3389058113098145, "loss/hidden": 0.8828125, "loss/logits": 0.14329570531845093, "loss/reg": 0.004941044840961695, "step": 756 }, { "epoch": 0.094625, "grad_norm": 3.0741820335388184, "grad_norm_var": 0.6180001684099087, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.2101480960845947, "loss/hidden": 1.1796875, "loss/logits": 0.2130812704563141, "loss/reg": 0.004938756115734577, "step": 757 }, { "epoch": 0.09475, "grad_norm": 2.604829788208008, "grad_norm_var": 0.6082914113291829, "learning_rate": 0.0001, "loss": 1.1533, "loss/crossentropy": 2.431821584701538, "loss/hidden": 0.94140625, "loss/logits": 0.16252049803733826, "loss/reg": 0.0049363370053470135, "step": 758 }, { "epoch": 0.094875, "grad_norm": 2.824411630630493, "grad_norm_var": 0.606870668349819, "learning_rate": 0.0001, "loss": 1.2267, "loss/crossentropy": 2.3011207580566406, "loss/hidden": 0.97265625, "loss/logits": 0.20468328893184662, "loss/reg": 0.004933919291943312, "step": 759 }, { "epoch": 0.095, "grad_norm": 2.5809361934661865, "grad_norm_var": 0.606063171082104, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.5506176948547363, "loss/hidden": 0.8359375, "loss/logits": 0.13629919290542603, "loss/reg": 0.004931787494570017, "step": 760 }, { "epoch": 0.095125, "grad_norm": 4.444363117218018, "grad_norm_var": 0.7059078117077803, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.8235206604003906, "loss/hidden": 1.109375, "loss/logits": 0.191168874502182, "loss/reg": 0.0049294959753751755, "step": 761 }, { "epoch": 0.09525, "grad_norm": 2.169010639190674, "grad_norm_var": 0.7385929926525419, "learning_rate": 0.0001, "loss": 0.9763, "loss/crossentropy": 2.706693172454834, "loss/hidden": 0.78515625, "loss/logits": 0.1419064998626709, "loss/reg": 0.004927367437630892, "step": 762 }, { "epoch": 0.095375, "grad_norm": 2.4050183296203613, "grad_norm_var": 0.7603640408605048, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.622105836868286, "loss/hidden": 0.84765625, "loss/logits": 0.1461138278245926, "loss/reg": 0.004925237502902746, "step": 763 }, { "epoch": 0.0955, "grad_norm": 2.4372246265411377, "grad_norm_var": 0.7800550132454624, "learning_rate": 0.0001, "loss": 1.058, "loss/crossentropy": 2.4866652488708496, "loss/hidden": 0.86328125, "loss/logits": 0.14548787474632263, "loss/reg": 0.004923122003674507, "step": 764 }, { "epoch": 0.095625, "grad_norm": 2.517997980117798, "grad_norm_var": 0.7953055666315338, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.3578498363494873, "loss/hidden": 1.015625, "loss/logits": 0.1564468890428543, "loss/reg": 0.004920901730656624, "step": 765 }, { "epoch": 0.09575, "grad_norm": 2.278327226638794, "grad_norm_var": 0.8265305072858796, "learning_rate": 0.0001, "loss": 1.0466, "loss/crossentropy": 2.55232310295105, "loss/hidden": 0.84765625, "loss/logits": 0.14976537227630615, "loss/reg": 0.0049185301177203655, "step": 766 }, { "epoch": 0.095875, "grad_norm": 2.337240219116211, "grad_norm_var": 0.6789092499013821, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.672149658203125, "loss/hidden": 0.94921875, "loss/logits": 0.16301041841506958, "loss/reg": 0.004916144534945488, "step": 767 }, { "epoch": 0.096, "grad_norm": 8.908835411071777, "grad_norm_var": 3.005936619726153, "learning_rate": 0.0001, "loss": 0.9922, "loss/crossentropy": 2.3541464805603027, "loss/hidden": 0.8359375, "loss/logits": 0.10713944584131241, "loss/reg": 0.0049139889888465405, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.1774449348449707, "grad_norm_var": 3.0380281733161834, "learning_rate": 0.0001, "loss": 0.9611, "loss/crossentropy": 2.5367493629455566, "loss/hidden": 0.80078125, "loss/logits": 0.11117491126060486, "loss/reg": 0.00491185300052166, "step": 769 }, { "epoch": 0.09625, "grad_norm": 4.010209083557129, "grad_norm_var": 2.8176414116633506, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.4206178188323975, "loss/hidden": 1.2734375, "loss/logits": 0.28903743624687195, "loss/reg": 0.00490949209779501, "step": 770 }, { "epoch": 0.096375, "grad_norm": 2.3479456901550293, "grad_norm_var": 2.838639045972002, "learning_rate": 0.0001, "loss": 1.0775, "loss/crossentropy": 2.3782246112823486, "loss/hidden": 0.87109375, "loss/logits": 0.15734228491783142, "loss/reg": 0.004907363560050726, "step": 771 }, { "epoch": 0.0965, "grad_norm": 2.5986974239349365, "grad_norm_var": 2.801428785253172, "learning_rate": 0.0001, "loss": 1.0758, "loss/crossentropy": 2.4752702713012695, "loss/hidden": 0.890625, "loss/logits": 0.13615593314170837, "loss/reg": 0.0049048978835344315, "step": 772 }, { "epoch": 0.096625, "grad_norm": 2.3621203899383545, "grad_norm_var": 2.8362617972026043, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.6385865211486816, "loss/hidden": 0.80859375, "loss/logits": 0.13527554273605347, "loss/reg": 0.004902740474790335, "step": 773 }, { "epoch": 0.09675, "grad_norm": 2.2466084957122803, "grad_norm_var": 2.866155351424041, "learning_rate": 0.0001, "loss": 1.1506, "loss/crossentropy": 2.5522897243499756, "loss/hidden": 0.93359375, "loss/logits": 0.1680143177509308, "loss/reg": 0.004900622647255659, "step": 774 }, { "epoch": 0.096875, "grad_norm": 2.6764907836914062, "grad_norm_var": 2.871782767876315, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.5808451175689697, "loss/hidden": 0.90625, "loss/logits": 0.16133888065814972, "loss/reg": 0.004898467101156712, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.1325623989105225, "grad_norm_var": 2.911263182397389, "learning_rate": 0.0001, "loss": 1.1775, "loss/crossentropy": 2.3073110580444336, "loss/hidden": 0.953125, "loss/logits": 0.17539924383163452, "loss/reg": 0.004896300844848156, "step": 776 }, { "epoch": 0.097125, "grad_norm": 2.845750570297241, "grad_norm_var": 2.7637895893424673, "learning_rate": 0.0001, "loss": 0.9791, "loss/crossentropy": 2.452115774154663, "loss/hidden": 0.796875, "loss/logits": 0.13326548039913177, "loss/reg": 0.004894034005701542, "step": 777 }, { "epoch": 0.09725, "grad_norm": 6.139473915100098, "grad_norm_var": 3.360390097314651, "learning_rate": 0.0001, "loss": 1.1467, "loss/crossentropy": 2.44069242477417, "loss/hidden": 0.94140625, "loss/logits": 0.15639880299568176, "loss/reg": 0.0048917257227003574, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.3391048908233643, "grad_norm_var": 3.3672209294330075, "learning_rate": 0.0001, "loss": 1.137, "loss/crossentropy": 2.5613744258880615, "loss/hidden": 0.92578125, "loss/logits": 0.16234168410301208, "loss/reg": 0.004889402538537979, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.2655279636383057, "grad_norm_var": 3.3853179937680844, "learning_rate": 0.0001, "loss": 1.1061, "loss/crossentropy": 2.3578813076019287, "loss/hidden": 0.90234375, "loss/logits": 0.15491583943367004, "loss/reg": 0.004887087736278772, "step": 780 }, { "epoch": 0.097625, "grad_norm": 2.2749481201171875, "grad_norm_var": 3.4090543314963564, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.4622697830200195, "loss/hidden": 0.828125, "loss/logits": 0.14463937282562256, "loss/reg": 0.004884790629148483, "step": 781 }, { "epoch": 0.09775, "grad_norm": 2.891165256500244, "grad_norm_var": 3.363644225109554, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.5788161754608154, "loss/hidden": 0.953125, "loss/logits": 0.1727641224861145, "loss/reg": 0.0048824455589056015, "step": 782 }, { "epoch": 0.097875, "grad_norm": 2.347449541091919, "grad_norm_var": 3.362531263350426, "learning_rate": 0.0001, "loss": 0.995, "loss/crossentropy": 2.5813019275665283, "loss/hidden": 0.8046875, "loss/logits": 0.14150115847587585, "loss/reg": 0.004880187567323446, "step": 783 }, { "epoch": 0.098, "grad_norm": 3.219748020172119, "grad_norm_var": 1.0248437110061321, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.4393255710601807, "loss/hidden": 1.078125, "loss/logits": 0.20414334535598755, "loss/reg": 0.004877839703112841, "step": 784 }, { "epoch": 0.098125, "grad_norm": 2.7481374740600586, "grad_norm_var": 0.997469803821544, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.856651544570923, "loss/hidden": 0.93359375, "loss/logits": 0.18401256203651428, "loss/reg": 0.0048755621537566185, "step": 785 }, { "epoch": 0.09825, "grad_norm": 3.8722689151763916, "grad_norm_var": 0.9771433382716601, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.7695257663726807, "loss/hidden": 0.92578125, "loss/logits": 0.18866363167762756, "loss/reg": 0.004873441066592932, "step": 786 }, { "epoch": 0.098375, "grad_norm": 2.877189874649048, "grad_norm_var": 0.9605094695400339, "learning_rate": 0.0001, "loss": 1.1935, "loss/crossentropy": 2.569603443145752, "loss/hidden": 0.92578125, "loss/logits": 0.21895866096019745, "loss/reg": 0.004871242213994265, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.430058717727661, "grad_norm_var": 0.9682708910971911, "learning_rate": 0.0001, "loss": 0.9791, "loss/crossentropy": 2.767273187637329, "loss/hidden": 0.80078125, "loss/logits": 0.12963160872459412, "loss/reg": 0.004869125317782164, "step": 788 }, { "epoch": 0.098625, "grad_norm": 2.63755202293396, "grad_norm_var": 0.9549378382865437, "learning_rate": 0.0001, "loss": 0.9765, "loss/crossentropy": 2.550255537033081, "loss/hidden": 0.79296875, "loss/logits": 0.13482339680194855, "loss/reg": 0.004866961855441332, "step": 789 }, { "epoch": 0.09875, "grad_norm": 2.9017562866210938, "grad_norm_var": 0.9271776289312942, "learning_rate": 0.0001, "loss": 1.152, "loss/crossentropy": 2.3682990074157715, "loss/hidden": 0.91796875, "loss/logits": 0.18535348773002625, "loss/reg": 0.004864787682890892, "step": 790 }, { "epoch": 0.098875, "grad_norm": 2.388214349746704, "grad_norm_var": 0.9414410795556288, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.2244787216186523, "loss/hidden": 1.1640625, "loss/logits": 0.2279416173696518, "loss/reg": 0.0048627713695168495, "step": 791 }, { "epoch": 0.099, "grad_norm": 2.3403820991516113, "grad_norm_var": 0.9230295318881718, "learning_rate": 0.0001, "loss": 1.1661, "loss/crossentropy": 2.3729617595672607, "loss/hidden": 0.95703125, "loss/logits": 0.16043922305107117, "loss/reg": 0.004860777873545885, "step": 792 }, { "epoch": 0.099125, "grad_norm": 2.9416215419769287, "grad_norm_var": 0.9228156704300846, "learning_rate": 0.0001, "loss": 1.0463, "loss/crossentropy": 2.6738815307617188, "loss/hidden": 0.85546875, "loss/logits": 0.14225679636001587, "loss/reg": 0.0048586721532046795, "step": 793 }, { "epoch": 0.09925, "grad_norm": 3.0802805423736572, "grad_norm_var": 0.1918460569244303, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.368016242980957, "loss/hidden": 0.890625, "loss/logits": 0.15585456788539886, "loss/reg": 0.0048565310426056385, "step": 794 }, { "epoch": 0.099375, "grad_norm": 2.2744922637939453, "grad_norm_var": 0.19540746296378658, "learning_rate": 0.0001, "loss": 1.056, "loss/crossentropy": 2.4861905574798584, "loss/hidden": 0.85546875, "loss/logits": 0.15199777483940125, "loss/reg": 0.004854561761021614, "step": 795 }, { "epoch": 0.0995, "grad_norm": 2.4716484546661377, "grad_norm_var": 0.1856228513035127, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.315619468688965, "loss/hidden": 0.95703125, "loss/logits": 0.1491631120443344, "loss/reg": 0.004852783400565386, "step": 796 }, { "epoch": 0.099625, "grad_norm": 1.8618899583816528, "grad_norm_var": 0.22140635444161577, "learning_rate": 0.0001, "loss": 0.9489, "loss/crossentropy": 2.5243325233459473, "loss/hidden": 0.77734375, "loss/logits": 0.12307024002075195, "loss/reg": 0.004850673023611307, "step": 797 }, { "epoch": 0.09975, "grad_norm": 2.3018720149993896, "grad_norm_var": 0.22850198783917974, "learning_rate": 0.0001, "loss": 1.1395, "loss/crossentropy": 2.437481164932251, "loss/hidden": 0.93359375, "loss/logits": 0.1574450135231018, "loss/reg": 0.0048488411121070385, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.351703643798828, "grad_norm_var": 0.22832106568478797, "learning_rate": 0.0001, "loss": 1.1459, "loss/crossentropy": 2.7124335765838623, "loss/hidden": 0.91796875, "loss/logits": 0.17945504188537598, "loss/reg": 0.004846641793847084, "step": 799 }, { "epoch": 0.1, "grad_norm": 3.841269016265869, "grad_norm_var": 0.29813113065747426, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.2013328075408936, "loss/hidden": 1.0703125, "loss/logits": 0.2869468331336975, "loss/reg": 0.00484456866979599, "step": 800 }, { "epoch": 0.100125, "grad_norm": 2.1647138595581055, "grad_norm_var": 0.3162455329850776, "learning_rate": 0.0001, "loss": 1.1089, "loss/crossentropy": 2.5788028240203857, "loss/hidden": 0.91015625, "loss/logits": 0.15036620199680328, "loss/reg": 0.004842570051550865, "step": 801 }, { "epoch": 0.10025, "grad_norm": 7.012945652008057, "grad_norm_var": 1.4357519156716128, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.785094738006592, "loss/hidden": 1.078125, "loss/logits": 0.18520238995552063, "loss/reg": 0.004840615671128035, "step": 802 }, { "epoch": 0.100375, "grad_norm": 2.403449535369873, "grad_norm_var": 1.4491572072812606, "learning_rate": 0.0001, "loss": 1.0264, "loss/crossentropy": 2.464657783508301, "loss/hidden": 0.83984375, "loss/logits": 0.1381913125514984, "loss/reg": 0.004838695749640465, "step": 803 }, { "epoch": 0.1005, "grad_norm": 2.554766893386841, "grad_norm_var": 1.4433503798033878, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.506692409515381, "loss/hidden": 0.84765625, "loss/logits": 0.1289561688899994, "loss/reg": 0.0048366026021540165, "step": 804 }, { "epoch": 0.100625, "grad_norm": 2.103414535522461, "grad_norm_var": 1.4759940006075438, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.4695632457733154, "loss/hidden": 0.86328125, "loss/logits": 0.15149196982383728, "loss/reg": 0.004834519233554602, "step": 805 }, { "epoch": 0.10075, "grad_norm": 2.1140382289886475, "grad_norm_var": 1.5053641044502488, "learning_rate": 0.0001, "loss": 1.0509, "loss/crossentropy": 2.536261796951294, "loss/hidden": 0.84375, "loss/logits": 0.15881776809692383, "loss/reg": 0.004832423757761717, "step": 806 }, { "epoch": 0.100875, "grad_norm": 2.4623467922210693, "grad_norm_var": 1.5020038787198757, "learning_rate": 0.0001, "loss": 1.0163, "loss/crossentropy": 2.6407546997070312, "loss/hidden": 0.828125, "loss/logits": 0.1398705244064331, "loss/reg": 0.004830438643693924, "step": 807 }, { "epoch": 0.101, "grad_norm": 2.196262836456299, "grad_norm_var": 1.5115104848001217, "learning_rate": 0.0001, "loss": 0.9786, "loss/crossentropy": 2.7860398292541504, "loss/hidden": 0.78515625, "loss/logits": 0.1451636254787445, "loss/reg": 0.0048283860087394714, "step": 808 }, { "epoch": 0.101125, "grad_norm": 1.9615752696990967, "grad_norm_var": 1.547617987738648, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.4804906845092773, "loss/hidden": 0.796875, "loss/logits": 0.13634686172008514, "loss/reg": 0.004826539196074009, "step": 809 }, { "epoch": 0.10125, "grad_norm": 2.813462257385254, "grad_norm_var": 1.5384423691934326, "learning_rate": 0.0001, "loss": 1.0387, "loss/crossentropy": 2.4615039825439453, "loss/hidden": 0.84375, "loss/logits": 0.1467183232307434, "loss/reg": 0.004824436269700527, "step": 810 }, { "epoch": 0.101375, "grad_norm": 2.155719518661499, "grad_norm_var": 1.5457555739015585, "learning_rate": 0.0001, "loss": 1.0501, "loss/crossentropy": 2.5954880714416504, "loss/hidden": 0.85546875, "loss/logits": 0.1464114934206009, "loss/reg": 0.004822410177439451, "step": 811 }, { "epoch": 0.1015, "grad_norm": 9.283843040466309, "grad_norm_var": 4.263069385825235, "learning_rate": 0.0001, "loss": 2.7627, "loss/crossentropy": 2.139838218688965, "loss/hidden": 2.375, "loss/logits": 0.3395351767539978, "loss/reg": 0.0048205070197582245, "step": 812 }, { "epoch": 0.101625, "grad_norm": 2.0784647464752197, "grad_norm_var": 4.230278658390638, "learning_rate": 0.0001, "loss": 0.9664, "loss/crossentropy": 2.3511369228363037, "loss/hidden": 0.7890625, "loss/logits": 0.12912708520889282, "loss/reg": 0.0048186322674155235, "step": 813 }, { "epoch": 0.10175, "grad_norm": 2.328005313873291, "grad_norm_var": 4.227496791404921, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.512450933456421, "loss/hidden": 0.96875, "loss/logits": 0.22958813607692719, "loss/reg": 0.004816535394638777, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.315840721130371, "grad_norm_var": 4.231222857846664, "learning_rate": 0.0001, "loss": 1.1597, "loss/crossentropy": 2.5316786766052246, "loss/hidden": 0.94140625, "loss/logits": 0.1701403111219406, "loss/reg": 0.004814418964087963, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.384153127670288, "grad_norm_var": 4.222215248181442, "learning_rate": 0.0001, "loss": 1.0444, "loss/crossentropy": 2.540562629699707, "loss/hidden": 0.86328125, "loss/logits": 0.13295237720012665, "loss/reg": 0.004812437575310469, "step": 816 }, { "epoch": 0.102125, "grad_norm": 2.4128434658050537, "grad_norm_var": 4.197740139734587, "learning_rate": 0.0001, "loss": 1.1886, "loss/crossentropy": 2.070441246032715, "loss/hidden": 0.96875, "loss/logits": 0.17178985476493835, "loss/reg": 0.0048103369772434235, "step": 817 }, { "epoch": 0.10225, "grad_norm": 2.2246007919311523, "grad_norm_var": 3.0918953553368502, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.6368751525878906, "loss/hidden": 0.8984375, "loss/logits": 0.16057901084423065, "loss/reg": 0.004808461759239435, "step": 818 }, { "epoch": 0.102375, "grad_norm": 2.450014591217041, "grad_norm_var": 3.089959662810087, "learning_rate": 0.0001, "loss": 0.9846, "loss/crossentropy": 2.521289587020874, "loss/hidden": 0.8046875, "loss/logits": 0.13186746835708618, "loss/reg": 0.004806382581591606, "step": 819 }, { "epoch": 0.1025, "grad_norm": 2.4022092819213867, "grad_norm_var": 3.095181282590954, "learning_rate": 0.0001, "loss": 1.0461, "loss/crossentropy": 2.563452959060669, "loss/hidden": 0.8515625, "loss/logits": 0.146540105342865, "loss/reg": 0.00480444822460413, "step": 820 }, { "epoch": 0.102625, "grad_norm": 2.3734519481658936, "grad_norm_var": 3.0771633032177723, "learning_rate": 0.0001, "loss": 1.0145, "loss/crossentropy": 2.565798759460449, "loss/hidden": 0.8125, "loss/logits": 0.15395045280456543, "loss/reg": 0.004802408628165722, "step": 821 }, { "epoch": 0.10275, "grad_norm": 2.9813950061798096, "grad_norm_var": 3.0509471234209884, "learning_rate": 0.0001, "loss": 1.1867, "loss/crossentropy": 2.463094711303711, "loss/hidden": 0.96875, "loss/logits": 0.169905886054039, "loss/reg": 0.004800358321517706, "step": 822 }, { "epoch": 0.102875, "grad_norm": 2.231248617172241, "grad_norm_var": 3.06473574306492, "learning_rate": 0.0001, "loss": 0.95, "loss/crossentropy": 2.479421615600586, "loss/hidden": 0.78125, "loss/logits": 0.12073921412229538, "loss/reg": 0.00479841185733676, "step": 823 }, { "epoch": 0.103, "grad_norm": 2.4154672622680664, "grad_norm_var": 3.050471285485306, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.3831257820129395, "loss/hidden": 0.85546875, "loss/logits": 0.1808249056339264, "loss/reg": 0.004796158988028765, "step": 824 }, { "epoch": 0.103125, "grad_norm": 2.0772101879119873, "grad_norm_var": 3.0383683290584145, "learning_rate": 0.0001, "loss": 1.0639, "loss/crossentropy": 2.4004011154174805, "loss/hidden": 0.859375, "loss/logits": 0.15656441450119019, "loss/reg": 0.004794239532202482, "step": 825 }, { "epoch": 0.10325, "grad_norm": 2.839860200881958, "grad_norm_var": 3.0384311233460473, "learning_rate": 0.0001, "loss": 1.0861, "loss/crossentropy": 2.6619362831115723, "loss/hidden": 0.87890625, "loss/logits": 0.15924152731895447, "loss/reg": 0.004792260471731424, "step": 826 }, { "epoch": 0.103375, "grad_norm": 1.972954511642456, "grad_norm_var": 3.056454118437356, "learning_rate": 0.0001, "loss": 0.9135, "loss/crossentropy": 2.7229561805725098, "loss/hidden": 0.7421875, "loss/logits": 0.1234164908528328, "loss/reg": 0.004790398757904768, "step": 827 }, { "epoch": 0.1035, "grad_norm": 2.682563066482544, "grad_norm_var": 0.07155742185727737, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.5051097869873047, "loss/hidden": 0.89453125, "loss/logits": 0.16926732659339905, "loss/reg": 0.00478832283988595, "step": 828 }, { "epoch": 0.103625, "grad_norm": 3.4523675441741943, "grad_norm_var": 0.13326196210074007, "learning_rate": 0.0001, "loss": 0.9601, "loss/crossentropy": 2.658190965652466, "loss/hidden": 0.78125, "loss/logits": 0.13097809255123138, "loss/reg": 0.004786360543221235, "step": 829 }, { "epoch": 0.10375, "grad_norm": 3.48335599899292, "grad_norm_var": 0.19458248394843167, "learning_rate": 0.0001, "loss": 1.2642, "loss/crossentropy": 2.538571357727051, "loss/hidden": 1.0234375, "loss/logits": 0.1929442286491394, "loss/reg": 0.00478436890989542, "step": 830 }, { "epoch": 0.103875, "grad_norm": 2.62361216545105, "grad_norm_var": 0.19115134798182468, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.067270040512085, "loss/hidden": 0.83984375, "loss/logits": 0.13410484790802002, "loss/reg": 0.004782302770763636, "step": 831 }, { "epoch": 0.104, "grad_norm": 2.9096603393554688, "grad_norm_var": 0.1958828676095777, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.664367198944092, "loss/hidden": 0.859375, "loss/logits": 0.1468919813632965, "loss/reg": 0.004780208226293325, "step": 832 }, { "epoch": 0.104125, "grad_norm": 2.942896604537964, "grad_norm_var": 0.20051234736722562, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 3.0427563190460205, "loss/hidden": 0.89453125, "loss/logits": 0.18868675827980042, "loss/reg": 0.004778183531016111, "step": 833 }, { "epoch": 0.10425, "grad_norm": 2.589113712310791, "grad_norm_var": 0.18916564172237998, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.8398096561431885, "loss/hidden": 0.8828125, "loss/logits": 0.16195189952850342, "loss/reg": 0.004776162561029196, "step": 834 }, { "epoch": 0.104375, "grad_norm": 3.4404890537261963, "grad_norm_var": 0.22384389332806312, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.627953290939331, "loss/hidden": 1.0234375, "loss/logits": 0.252508282661438, "loss/reg": 0.004774080123752356, "step": 835 }, { "epoch": 0.1045, "grad_norm": 3.8319575786590576, "grad_norm_var": 0.2922407313041238, "learning_rate": 0.0001, "loss": 1.1363, "loss/crossentropy": 2.598100185394287, "loss/hidden": 0.94140625, "loss/logits": 0.14722198247909546, "loss/reg": 0.004771828651428223, "step": 836 }, { "epoch": 0.104625, "grad_norm": 2.5748939514160156, "grad_norm_var": 0.28324037377024425, "learning_rate": 0.0001, "loss": 1.154, "loss/crossentropy": 2.4594998359680176, "loss/hidden": 0.92578125, "loss/logits": 0.1805158108472824, "loss/reg": 0.004769548308104277, "step": 837 }, { "epoch": 0.10475, "grad_norm": 2.4466588497161865, "grad_norm_var": 0.2892884485845587, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.3166677951812744, "loss/hidden": 0.94921875, "loss/logits": 0.18628260493278503, "loss/reg": 0.004767347127199173, "step": 838 }, { "epoch": 0.104875, "grad_norm": 2.3794689178466797, "grad_norm_var": 0.2797743363037663, "learning_rate": 0.0001, "loss": 1.0935, "loss/crossentropy": 2.5804922580718994, "loss/hidden": 0.8984375, "loss/logits": 0.14742916822433472, "loss/reg": 0.004765105899423361, "step": 839 }, { "epoch": 0.105, "grad_norm": 3.077510118484497, "grad_norm_var": 0.27398293806763996, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.3323357105255127, "loss/hidden": 1.1015625, "loss/logits": 0.20499414205551147, "loss/reg": 0.004762987140566111, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.878331422805786, "grad_norm_var": 0.23338745113117412, "learning_rate": 0.0001, "loss": 1.1036, "loss/crossentropy": 2.450366258621216, "loss/hidden": 0.88671875, "loss/logits": 0.16925078630447388, "loss/reg": 0.004760903771966696, "step": 841 }, { "epoch": 0.10525, "grad_norm": 3.3161306381225586, "grad_norm_var": 0.24483420410498696, "learning_rate": 0.0001, "loss": 1.2537, "loss/crossentropy": 2.5501890182495117, "loss/hidden": 1.0234375, "loss/logits": 0.18264800310134888, "loss/reg": 0.004758887458592653, "step": 842 }, { "epoch": 0.105375, "grad_norm": 3.6941001415252686, "grad_norm_var": 0.21433980549929005, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.984022378921509, "loss/hidden": 0.890625, "loss/logits": 0.16192708909511566, "loss/reg": 0.004756839480251074, "step": 843 }, { "epoch": 0.1055, "grad_norm": 2.4516656398773193, "grad_norm_var": 0.22806633375319052, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.304518699645996, "loss/hidden": 0.96484375, "loss/logits": 0.18419940769672394, "loss/reg": 0.0047547114081680775, "step": 844 }, { "epoch": 0.105625, "grad_norm": 2.1033735275268555, "grad_norm_var": 0.2614740255031826, "learning_rate": 0.0001, "loss": 1.1366, "loss/crossentropy": 2.54909086227417, "loss/hidden": 0.93359375, "loss/logits": 0.15546000003814697, "loss/reg": 0.004752539098262787, "step": 845 }, { "epoch": 0.10575, "grad_norm": 2.3745031356811523, "grad_norm_var": 0.2552452215100343, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.629120349884033, "loss/hidden": 0.828125, "loss/logits": 0.16704407334327698, "loss/reg": 0.004750436637550592, "step": 846 }, { "epoch": 0.105875, "grad_norm": 2.338932514190674, "grad_norm_var": 0.26898497299798596, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.5220935344696045, "loss/hidden": 0.828125, "loss/logits": 0.12721288204193115, "loss/reg": 0.004748245235532522, "step": 847 }, { "epoch": 0.106, "grad_norm": 2.051365375518799, "grad_norm_var": 0.3064090147954744, "learning_rate": 0.0001, "loss": 1.0144, "loss/crossentropy": 2.5917866230010986, "loss/hidden": 0.828125, "loss/logits": 0.13877776265144348, "loss/reg": 0.0047462498769164085, "step": 848 }, { "epoch": 0.106125, "grad_norm": 2.7592124938964844, "grad_norm_var": 0.3045456563454231, "learning_rate": 0.0001, "loss": 1.0817, "loss/crossentropy": 2.400892972946167, "loss/hidden": 0.88671875, "loss/logits": 0.14757487177848816, "loss/reg": 0.004744186066091061, "step": 849 }, { "epoch": 0.10625, "grad_norm": 2.843409538269043, "grad_norm_var": 0.3024802042353009, "learning_rate": 0.0001, "loss": 1.1031, "loss/crossentropy": 2.4847569465637207, "loss/hidden": 0.90234375, "loss/logits": 0.153322234749794, "loss/reg": 0.004742183722555637, "step": 850 }, { "epoch": 0.106375, "grad_norm": 2.1840505599975586, "grad_norm_var": 0.291355140168911, "learning_rate": 0.0001, "loss": 0.9921, "loss/crossentropy": 2.5778043270111084, "loss/hidden": 0.80859375, "loss/logits": 0.13607874512672424, "loss/reg": 0.004740222357213497, "step": 851 }, { "epoch": 0.1065, "grad_norm": 2.0575978755950928, "grad_norm_var": 0.2218880841874949, "learning_rate": 0.0001, "loss": 0.97, "loss/crossentropy": 2.3618547916412354, "loss/hidden": 0.79296875, "loss/logits": 0.12968632578849792, "loss/reg": 0.004738117568194866, "step": 852 }, { "epoch": 0.106625, "grad_norm": 1.9274516105651855, "grad_norm_var": 0.2498830541667985, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.616579532623291, "loss/hidden": 0.80859375, "loss/logits": 0.13845369219779968, "loss/reg": 0.004735942464321852, "step": 853 }, { "epoch": 0.10675, "grad_norm": 4.250617980957031, "grad_norm_var": 0.42715921119526734, "learning_rate": 0.0001, "loss": 1.0122, "loss/crossentropy": 2.658142328262329, "loss/hidden": 0.828125, "loss/logits": 0.13677959144115448, "loss/reg": 0.004733935464173555, "step": 854 }, { "epoch": 0.106875, "grad_norm": 2.366472005844116, "grad_norm_var": 0.42766974025784443, "learning_rate": 0.0001, "loss": 0.951, "loss/crossentropy": 2.4926974773406982, "loss/hidden": 0.78125, "loss/logits": 0.12242163717746735, "loss/reg": 0.004731933120638132, "step": 855 }, { "epoch": 0.107, "grad_norm": 2.754833221435547, "grad_norm_var": 0.41652297282437467, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.3678700923919678, "loss/hidden": 0.93359375, "loss/logits": 0.17560826241970062, "loss/reg": 0.004729805048555136, "step": 856 }, { "epoch": 0.107125, "grad_norm": 3.3536951541900635, "grad_norm_var": 0.44530816036993104, "learning_rate": 0.0001, "loss": 1.3138, "loss/crossentropy": 2.4103269577026367, "loss/hidden": 1.0703125, "loss/logits": 0.19625738263130188, "loss/reg": 0.004727587569504976, "step": 857 }, { "epoch": 0.10725, "grad_norm": 2.150568962097168, "grad_norm_var": 0.43084581061497124, "learning_rate": 0.0001, "loss": 1.1975, "loss/crossentropy": 2.288213014602661, "loss/hidden": 0.984375, "loss/logits": 0.16585032641887665, "loss/reg": 0.004725386388599873, "step": 858 }, { "epoch": 0.107375, "grad_norm": 2.526709794998169, "grad_norm_var": 0.3463235885418951, "learning_rate": 0.0001, "loss": 1.0485, "loss/crossentropy": 2.668027639389038, "loss/hidden": 0.8515625, "loss/logits": 0.14968323707580566, "loss/reg": 0.0047230906784534454, "step": 859 }, { "epoch": 0.1075, "grad_norm": 2.573915958404541, "grad_norm_var": 0.3459660758761667, "learning_rate": 0.0001, "loss": 1.285, "loss/crossentropy": 2.4578278064727783, "loss/hidden": 1.046875, "loss/logits": 0.19090218842029572, "loss/reg": 0.004720703698694706, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.0752198696136475, "grad_norm_var": 0.3476491685761097, "learning_rate": 0.0001, "loss": 1.0997, "loss/crossentropy": 2.5527398586273193, "loss/hidden": 0.8984375, "loss/logits": 0.15407393872737885, "loss/reg": 0.004718627315014601, "step": 861 }, { "epoch": 0.10775, "grad_norm": 2.0546956062316895, "grad_norm_var": 0.3609613231593753, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.371561050415039, "loss/hidden": 0.796875, "loss/logits": 0.13113868236541748, "loss/reg": 0.0047163935378193855, "step": 862 }, { "epoch": 0.107875, "grad_norm": 2.4428114891052246, "grad_norm_var": 0.35917223636502416, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.4627277851104736, "loss/hidden": 0.90625, "loss/logits": 0.15898552536964417, "loss/reg": 0.004714163951575756, "step": 863 }, { "epoch": 0.108, "grad_norm": 9.508520126342773, "grad_norm_var": 3.365516663733607, "learning_rate": 0.0001, "loss": 2.2474, "loss/crossentropy": 2.571873426437378, "loss/hidden": 1.8203125, "loss/logits": 0.3799425959587097, "loss/reg": 0.00471192691475153, "step": 864 }, { "epoch": 0.108125, "grad_norm": 1.8954740762710571, "grad_norm_var": 3.4386495429465245, "learning_rate": 0.0001, "loss": 0.9501, "loss/crossentropy": 2.6599161624908447, "loss/hidden": 0.7734375, "loss/logits": 0.12957873940467834, "loss/reg": 0.00470972154289484, "step": 865 }, { "epoch": 0.10825, "grad_norm": 2.4904675483703613, "grad_norm_var": 3.4507629712816508, "learning_rate": 0.0001, "loss": 1.0515, "loss/crossentropy": 2.4854214191436768, "loss/hidden": 0.8671875, "loss/logits": 0.13725632429122925, "loss/reg": 0.004707681480795145, "step": 866 }, { "epoch": 0.108375, "grad_norm": 2.947456121444702, "grad_norm_var": 3.4129568938410895, "learning_rate": 0.0001, "loss": 1.2279, "loss/crossentropy": 2.4842288494110107, "loss/hidden": 0.9921875, "loss/logits": 0.1886221170425415, "loss/reg": 0.004705703817307949, "step": 867 }, { "epoch": 0.1085, "grad_norm": 2.2447702884674072, "grad_norm_var": 3.3926001028643817, "learning_rate": 0.0001, "loss": 1.0195, "loss/crossentropy": 2.148575782775879, "loss/hidden": 0.84375, "loss/logits": 0.1287107914686203, "loss/reg": 0.0047035738825798035, "step": 868 }, { "epoch": 0.108625, "grad_norm": 2.057748317718506, "grad_norm_var": 3.3755016690991604, "learning_rate": 0.0001, "loss": 1.0202, "loss/crossentropy": 2.483203172683716, "loss/hidden": 0.83203125, "loss/logits": 0.14113682508468628, "loss/reg": 0.004701647907495499, "step": 869 }, { "epoch": 0.10875, "grad_norm": 2.5249640941619873, "grad_norm_var": 3.2694673269882863, "learning_rate": 0.0001, "loss": 1.113, "loss/crossentropy": 2.5260114669799805, "loss/hidden": 0.91015625, "loss/logits": 0.1558540314435959, "loss/reg": 0.004699505399912596, "step": 870 }, { "epoch": 0.108875, "grad_norm": 2.737452507019043, "grad_norm_var": 3.2530130532767125, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.0429792404174805, "loss/hidden": 1.0625, "loss/logits": 0.1715242862701416, "loss/reg": 0.004697592929005623, "step": 871 }, { "epoch": 0.109, "grad_norm": 3.320223569869995, "grad_norm_var": 3.2623347194326366, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.4485509395599365, "loss/hidden": 0.91015625, "loss/logits": 0.14355334639549255, "loss/reg": 0.004695762414485216, "step": 872 }, { "epoch": 0.109125, "grad_norm": 2.352004051208496, "grad_norm_var": 3.268664190896945, "learning_rate": 0.0001, "loss": 0.9754, "loss/crossentropy": 2.566997528076172, "loss/hidden": 0.81640625, "loss/logits": 0.11203782260417938, "loss/reg": 0.004693967290222645, "step": 873 }, { "epoch": 0.10925, "grad_norm": 2.532027244567871, "grad_norm_var": 3.2412215675029845, "learning_rate": 0.0001, "loss": 1.019, "loss/crossentropy": 2.5623605251312256, "loss/hidden": 0.82421875, "loss/logits": 0.14787867665290833, "loss/reg": 0.0046923235058784485, "step": 874 }, { "epoch": 0.109375, "grad_norm": 2.850015878677368, "grad_norm_var": 3.231974182838817, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.292745351791382, "loss/hidden": 1.09375, "loss/logits": 0.17765681445598602, "loss/reg": 0.004690241534262896, "step": 875 }, { "epoch": 0.1095, "grad_norm": 2.694929361343384, "grad_norm_var": 3.2274185214577464, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.5017001628875732, "loss/hidden": 1.1171875, "loss/logits": 0.21630127727985382, "loss/reg": 0.004688601475208998, "step": 876 }, { "epoch": 0.109625, "grad_norm": 3.1318111419677734, "grad_norm_var": 3.1781036409629513, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.467827796936035, "loss/hidden": 0.80859375, "loss/logits": 0.15873777866363525, "loss/reg": 0.004686909727752209, "step": 877 }, { "epoch": 0.10975, "grad_norm": 2.534363269805908, "grad_norm_var": 3.132884034258479, "learning_rate": 0.0001, "loss": 1.0536, "loss/crossentropy": 2.559304714202881, "loss/hidden": 0.84765625, "loss/logits": 0.15912304818630219, "loss/reg": 0.004684917628765106, "step": 878 }, { "epoch": 0.109875, "grad_norm": 2.3481605052948, "grad_norm_var": 3.1406848036532913, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.436594247817993, "loss/hidden": 0.91796875, "loss/logits": 0.15359237790107727, "loss/reg": 0.004682839848101139, "step": 879 }, { "epoch": 0.11, "grad_norm": 2.825532913208008, "grad_norm_var": 0.1420546261528272, "learning_rate": 0.0001, "loss": 1.05, "loss/crossentropy": 2.62540602684021, "loss/hidden": 0.859375, "loss/logits": 0.14383457601070404, "loss/reg": 0.004681065212935209, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.246893882751465, "grad_norm_var": 0.11709161648718099, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.3017749786376953, "loss/hidden": 0.98046875, "loss/logits": 0.1649210900068283, "loss/reg": 0.004679176490753889, "step": 881 }, { "epoch": 0.11025, "grad_norm": 2.405453681945801, "grad_norm_var": 0.11895408888104815, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.4893195629119873, "loss/hidden": 0.87109375, "loss/logits": 0.15834550559520721, "loss/reg": 0.004677077289670706, "step": 882 }, { "epoch": 0.110375, "grad_norm": 2.809741258621216, "grad_norm_var": 0.11393595478610692, "learning_rate": 0.0001, "loss": 1.0382, "loss/crossentropy": 2.4846301078796387, "loss/hidden": 0.84375, "loss/logits": 0.1476888507604599, "loss/reg": 0.004674948286265135, "step": 883 }, { "epoch": 0.1105, "grad_norm": 2.79677152633667, "grad_norm_var": 0.10676105158749939, "learning_rate": 0.0001, "loss": 1.0628, "loss/crossentropy": 2.4412240982055664, "loss/hidden": 0.859375, "loss/logits": 0.15672242641448975, "loss/reg": 0.0046728490851819515, "step": 884 }, { "epoch": 0.110625, "grad_norm": 2.2540183067321777, "grad_norm_var": 0.09404914291929553, "learning_rate": 0.0001, "loss": 1.0763, "loss/crossentropy": 2.42918062210083, "loss/hidden": 0.87890625, "loss/logits": 0.15069469809532166, "loss/reg": 0.004670663271099329, "step": 885 }, { "epoch": 0.11075, "grad_norm": 2.4896061420440674, "grad_norm_var": 0.09470624757332567, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.2188639640808105, "loss/hidden": 0.91796875, "loss/logits": 0.13358637690544128, "loss/reg": 0.004668715409934521, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.1856141090393066, "grad_norm_var": 0.10697799820098434, "learning_rate": 0.0001, "loss": 1.0706, "loss/crossentropy": 2.508702039718628, "loss/hidden": 0.87109375, "loss/logits": 0.15285125374794006, "loss/reg": 0.0046665905974805355, "step": 887 }, { "epoch": 0.111, "grad_norm": 2.0393009185791016, "grad_norm_var": 0.08841005951741536, "learning_rate": 0.0001, "loss": 1.1924, "loss/crossentropy": 2.38267183303833, "loss/hidden": 0.97265625, "loss/logits": 0.17313829064369202, "loss/reg": 0.004664612468332052, "step": 888 }, { "epoch": 0.111125, "grad_norm": 2.3410797119140625, "grad_norm_var": 0.08867826223563284, "learning_rate": 0.0001, "loss": 1.074, "loss/crossentropy": 2.1943254470825195, "loss/hidden": 0.87109375, "loss/logits": 0.1563197374343872, "loss/reg": 0.004662699997425079, "step": 889 }, { "epoch": 0.11125, "grad_norm": 2.1273703575134277, "grad_norm_var": 0.09882102282956354, "learning_rate": 0.0001, "loss": 1.1126, "loss/crossentropy": 2.685147762298584, "loss/hidden": 0.8984375, "loss/logits": 0.16760051250457764, "loss/reg": 0.00466081453487277, "step": 890 }, { "epoch": 0.111375, "grad_norm": 1.990721583366394, "grad_norm_var": 0.1054455812123658, "learning_rate": 0.0001, "loss": 1.0555, "loss/crossentropy": 2.5216498374938965, "loss/hidden": 0.86328125, "loss/logits": 0.14559441804885864, "loss/reg": 0.004659009166061878, "step": 891 }, { "epoch": 0.1115, "grad_norm": 2.070897340774536, "grad_norm_var": 0.10951603310177038, "learning_rate": 0.0001, "loss": 1.1023, "loss/crossentropy": 2.40985369682312, "loss/hidden": 0.890625, "loss/logits": 0.16513003408908844, "loss/reg": 0.0046569365076720715, "step": 892 }, { "epoch": 0.111625, "grad_norm": 4.464876651763916, "grad_norm_var": 0.3484639481637311, "learning_rate": 0.0001, "loss": 0.9826, "loss/crossentropy": 2.7498655319213867, "loss/hidden": 0.80859375, "loss/logits": 0.12744669616222382, "loss/reg": 0.0046548242680728436, "step": 893 }, { "epoch": 0.11175, "grad_norm": 3.390195608139038, "grad_norm_var": 0.39865960381583576, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.4615988731384277, "loss/hidden": 0.9453125, "loss/logits": 0.22916388511657715, "loss/reg": 0.004652821458876133, "step": 894 }, { "epoch": 0.111875, "grad_norm": 5.686069488525391, "grad_norm_var": 1.005565195852795, "learning_rate": 0.0001, "loss": 1.6881, "loss/crossentropy": 2.871785879135132, "loss/hidden": 1.3515625, "loss/logits": 0.2899933457374573, "loss/reg": 0.0046508111990988255, "step": 895 }, { "epoch": 0.112, "grad_norm": 2.610992193222046, "grad_norm_var": 1.006503225573829, "learning_rate": 0.0001, "loss": 1.2867, "loss/crossentropy": 2.433950901031494, "loss/hidden": 1.046875, "loss/logits": 0.19335989654064178, "loss/reg": 0.004648844711482525, "step": 896 }, { "epoch": 0.112125, "grad_norm": 2.4823808670043945, "grad_norm_var": 0.9943498438598022, "learning_rate": 0.0001, "loss": 1.0789, "loss/crossentropy": 2.469367742538452, "loss/hidden": 0.86328125, "loss/logits": 0.16918183863162994, "loss/reg": 0.004646934103220701, "step": 897 }, { "epoch": 0.11225, "grad_norm": 2.520416736602783, "grad_norm_var": 0.9897555293936913, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.4925904273986816, "loss/hidden": 0.828125, "loss/logits": 0.13399645686149597, "loss/reg": 0.004645092878490686, "step": 898 }, { "epoch": 0.112375, "grad_norm": 3.859619140625, "grad_norm_var": 1.064733358455831, "learning_rate": 0.0001, "loss": 1.392, "loss/crossentropy": 2.2787818908691406, "loss/hidden": 1.125, "loss/logits": 0.22061912715435028, "loss/reg": 0.004643063060939312, "step": 899 }, { "epoch": 0.1125, "grad_norm": 2.3090193271636963, "grad_norm_var": 1.081884870890947, "learning_rate": 0.0001, "loss": 1.0228, "loss/crossentropy": 2.5625975131988525, "loss/hidden": 0.83984375, "loss/logits": 0.13652384281158447, "loss/reg": 0.004641035571694374, "step": 900 }, { "epoch": 0.112625, "grad_norm": 3.0584754943847656, "grad_norm_var": 1.063620631316445, "learning_rate": 0.0001, "loss": 1.1209, "loss/crossentropy": 2.5125515460968018, "loss/hidden": 0.91015625, "loss/logits": 0.16438385844230652, "loss/reg": 0.004638944752514362, "step": 901 }, { "epoch": 0.11275, "grad_norm": 2.350011110305786, "grad_norm_var": 1.0715774319545346, "learning_rate": 0.0001, "loss": 1.1854, "loss/crossentropy": 2.548809766769409, "loss/hidden": 0.9765625, "loss/logits": 0.16251316666603088, "loss/reg": 0.004636852536350489, "step": 902 }, { "epoch": 0.112875, "grad_norm": 2.3605165481567383, "grad_norm_var": 1.0581603064239917, "learning_rate": 0.0001, "loss": 1.0876, "loss/crossentropy": 2.338804006576538, "loss/hidden": 0.90234375, "loss/logits": 0.13895326852798462, "loss/reg": 0.004634756129235029, "step": 903 }, { "epoch": 0.113, "grad_norm": 2.982060432434082, "grad_norm_var": 1.0113174770986684, "learning_rate": 0.0001, "loss": 1.122, "loss/crossentropy": 2.604545831680298, "loss/hidden": 0.91796875, "loss/logits": 0.15767651796340942, "loss/reg": 0.004632753320038319, "step": 904 }, { "epoch": 0.113125, "grad_norm": 2.4179961681365967, "grad_norm_var": 1.0058240052270713, "learning_rate": 0.0001, "loss": 1.0464, "loss/crossentropy": 2.3685691356658936, "loss/hidden": 0.85546875, "loss/logits": 0.14459514617919922, "loss/reg": 0.0046308403834700584, "step": 905 }, { "epoch": 0.11325, "grad_norm": 2.9855105876922607, "grad_norm_var": 0.9614321136201335, "learning_rate": 0.0001, "loss": 1.0734, "loss/crossentropy": 2.4018514156341553, "loss/hidden": 0.875, "loss/logits": 0.15208232402801514, "loss/reg": 0.00462888041511178, "step": 906 }, { "epoch": 0.113375, "grad_norm": 3.7471723556518555, "grad_norm_var": 0.9246222750158322, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.3634986877441406, "loss/hidden": 1.2265625, "loss/logits": 0.26200127601623535, "loss/reg": 0.0046269698068499565, "step": 907 }, { "epoch": 0.1135, "grad_norm": 2.5677998065948486, "grad_norm_var": 0.8731304087305998, "learning_rate": 0.0001, "loss": 1.2106, "loss/crossentropy": 2.3270931243896484, "loss/hidden": 0.984375, "loss/logits": 0.17999057471752167, "loss/reg": 0.004624930210411549, "step": 908 }, { "epoch": 0.113625, "grad_norm": 2.649965286254883, "grad_norm_var": 0.7516360272379186, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.627746343612671, "loss/hidden": 0.9140625, "loss/logits": 0.16723725199699402, "loss/reg": 0.0046230582520365715, "step": 909 }, { "epoch": 0.11375, "grad_norm": 2.4064176082611084, "grad_norm_var": 0.7607639000768924, "learning_rate": 0.0001, "loss": 1.0182, "loss/crossentropy": 2.47015380859375, "loss/hidden": 0.81640625, "loss/logits": 0.15559975802898407, "loss/reg": 0.00462103309109807, "step": 910 }, { "epoch": 0.113875, "grad_norm": 2.1189992427825928, "grad_norm_var": 0.24860211648851874, "learning_rate": 0.0001, "loss": 1.1855, "loss/crossentropy": 2.2618770599365234, "loss/hidden": 0.96484375, "loss/logits": 0.1744484156370163, "loss/reg": 0.004618941340595484, "step": 911 }, { "epoch": 0.114, "grad_norm": 2.220656633377075, "grad_norm_var": 0.26349665304340514, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.232293128967285, "loss/hidden": 1.0, "loss/logits": 0.19716452062129974, "loss/reg": 0.004616775084286928, "step": 912 }, { "epoch": 0.114125, "grad_norm": 2.525851011276245, "grad_norm_var": 0.2624124723651427, "learning_rate": 0.0001, "loss": 1.2848, "loss/crossentropy": 2.2225029468536377, "loss/hidden": 1.0703125, "loss/logits": 0.16834387183189392, "loss/reg": 0.004614519886672497, "step": 913 }, { "epoch": 0.11425, "grad_norm": 2.6499905586242676, "grad_norm_var": 0.2604882837896163, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.7070581912994385, "loss/hidden": 0.953125, "loss/logits": 0.16290049254894257, "loss/reg": 0.004612345714122057, "step": 914 }, { "epoch": 0.114375, "grad_norm": 2.417423963546753, "grad_norm_var": 0.1676183523849204, "learning_rate": 0.0001, "loss": 1.2021, "loss/crossentropy": 1.8924601078033447, "loss/hidden": 0.97265625, "loss/logits": 0.183339461684227, "loss/reg": 0.004610271658748388, "step": 915 }, { "epoch": 0.1145, "grad_norm": 2.935338258743286, "grad_norm_var": 0.16695985677138575, "learning_rate": 0.0001, "loss": 1.2591, "loss/crossentropy": 2.2739417552948, "loss/hidden": 1.046875, "loss/logits": 0.1661624014377594, "loss/reg": 0.00460821995511651, "step": 916 }, { "epoch": 0.114625, "grad_norm": 2.1026499271392822, "grad_norm_var": 0.17195618728893744, "learning_rate": 0.0001, "loss": 0.9901, "loss/crossentropy": 2.655440330505371, "loss/hidden": 0.796875, "loss/logits": 0.14714528620243073, "loss/reg": 0.004606001079082489, "step": 917 }, { "epoch": 0.11475, "grad_norm": 2.9179234504699707, "grad_norm_var": 0.17394937416602924, "learning_rate": 0.0001, "loss": 1.014, "loss/crossentropy": 2.5501797199249268, "loss/hidden": 0.8125, "loss/logits": 0.15548643469810486, "loss/reg": 0.004603679291903973, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.9957125186920166, "grad_norm_var": 0.17673345245174207, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.3883516788482666, "loss/hidden": 0.8828125, "loss/logits": 0.16936160624027252, "loss/reg": 0.004601585678756237, "step": 919 }, { "epoch": 0.115, "grad_norm": 2.67694354057312, "grad_norm_var": 0.16965697193046006, "learning_rate": 0.0001, "loss": 1.1013, "loss/crossentropy": 2.3806746006011963, "loss/hidden": 0.91015625, "loss/logits": 0.14516542851924896, "loss/reg": 0.004599516745656729, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.1424858570098877, "grad_norm_var": 0.1827775525511394, "learning_rate": 0.0001, "loss": 1.0246, "loss/crossentropy": 2.2575674057006836, "loss/hidden": 0.828125, "loss/logits": 0.15050096809864044, "loss/reg": 0.004597416613250971, "step": 921 }, { "epoch": 0.11525, "grad_norm": 5.457708358764648, "grad_norm_var": 0.682343045667132, "learning_rate": 0.0001, "loss": 1.7819, "loss/crossentropy": 2.941784381866455, "loss/hidden": 1.40625, "loss/logits": 0.3296935558319092, "loss/reg": 0.004595189820975065, "step": 922 }, { "epoch": 0.115375, "grad_norm": 2.692840814590454, "grad_norm_var": 0.6163222739990933, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.374514102935791, "loss/hidden": 1.1171875, "loss/logits": 0.21266797184944153, "loss/reg": 0.004593092482537031, "step": 923 }, { "epoch": 0.1155, "grad_norm": 3.2622177600860596, "grad_norm_var": 0.6326076754218235, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.4111597537994385, "loss/hidden": 0.89453125, "loss/logits": 0.19610172510147095, "loss/reg": 0.004590968135744333, "step": 924 }, { "epoch": 0.115625, "grad_norm": 3.9593584537506104, "grad_norm_var": 0.7204108733775624, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.770081043243408, "loss/hidden": 1.1796875, "loss/logits": 0.19811320304870605, "loss/reg": 0.004588917829096317, "step": 925 }, { "epoch": 0.11575, "grad_norm": 2.504918336868286, "grad_norm_var": 0.7152879483588092, "learning_rate": 0.0001, "loss": 1.0618, "loss/crossentropy": 2.4681365489959717, "loss/hidden": 0.86328125, "loss/logits": 0.1526341289281845, "loss/reg": 0.004586971364915371, "step": 926 }, { "epoch": 0.115875, "grad_norm": 3.0209288597106934, "grad_norm_var": 0.6783647636612234, "learning_rate": 0.0001, "loss": 1.138, "loss/crossentropy": 2.3891263008117676, "loss/hidden": 0.91796875, "loss/logits": 0.17415405809879303, "loss/reg": 0.004584896378219128, "step": 927 }, { "epoch": 0.116, "grad_norm": 2.3569037914276123, "grad_norm_var": 0.6670896431721521, "learning_rate": 0.0001, "loss": 1.1342, "loss/crossentropy": 2.373786687850952, "loss/hidden": 0.93359375, "loss/logits": 0.15477266907691956, "loss/reg": 0.004582802765071392, "step": 928 }, { "epoch": 0.116125, "grad_norm": 2.584897041320801, "grad_norm_var": 0.6642540884373107, "learning_rate": 0.0001, "loss": 1.187, "loss/crossentropy": 2.5142221450805664, "loss/hidden": 0.95703125, "loss/logits": 0.18413202464580536, "loss/reg": 0.004580747336149216, "step": 929 }, { "epoch": 0.11625, "grad_norm": 3.1578471660614014, "grad_norm_var": 0.6622672348995062, "learning_rate": 0.0001, "loss": 1.1606, "loss/crossentropy": 2.6126952171325684, "loss/hidden": 0.91796875, "loss/logits": 0.19688570499420166, "loss/reg": 0.004578826949000359, "step": 930 }, { "epoch": 0.116375, "grad_norm": 2.6005330085754395, "grad_norm_var": 0.6513814069878736, "learning_rate": 0.0001, "loss": 1.0783, "loss/crossentropy": 2.405707597732544, "loss/hidden": 0.87109375, "loss/logits": 0.16143286228179932, "loss/reg": 0.004576742183417082, "step": 931 }, { "epoch": 0.1165, "grad_norm": 2.879091501235962, "grad_norm_var": 0.6517684060932252, "learning_rate": 0.0001, "loss": 1.0623, "loss/crossentropy": 2.5898799896240234, "loss/hidden": 0.84765625, "loss/logits": 0.16887424886226654, "loss/reg": 0.004575024824589491, "step": 932 }, { "epoch": 0.116625, "grad_norm": 3.1379029750823975, "grad_norm_var": 0.6008152897241831, "learning_rate": 0.0001, "loss": 1.2402, "loss/crossentropy": 2.3071014881134033, "loss/hidden": 1.0234375, "loss/logits": 0.1710711419582367, "loss/reg": 0.004573314916342497, "step": 933 }, { "epoch": 0.11675, "grad_norm": 4.292084217071533, "grad_norm_var": 0.6998094594428453, "learning_rate": 0.0001, "loss": 1.2793, "loss/crossentropy": 2.428403854370117, "loss/hidden": 1.0078125, "loss/logits": 0.22576534748077393, "loss/reg": 0.004571723286062479, "step": 934 }, { "epoch": 0.116875, "grad_norm": 3.4453883171081543, "grad_norm_var": 0.7057361661794924, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.603928327560425, "loss/hidden": 1.09375, "loss/logits": 0.1756502389907837, "loss/reg": 0.004570134915411472, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.8401777744293213, "grad_norm_var": 0.6974157138244702, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.304800033569336, "loss/hidden": 0.91015625, "loss/logits": 0.15392211079597473, "loss/reg": 0.004568077158182859, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.5680997371673584, "grad_norm_var": 0.6517920111715199, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.341132402420044, "loss/hidden": 0.94140625, "loss/logits": 0.1625438630580902, "loss/reg": 0.004566343035548925, "step": 937 }, { "epoch": 0.11725, "grad_norm": 2.748947858810425, "grad_norm_var": 0.2850544648160998, "learning_rate": 0.0001, "loss": 1.2239, "loss/crossentropy": 2.4194369316101074, "loss/hidden": 0.99609375, "loss/logits": 0.18219077587127686, "loss/reg": 0.004564360249787569, "step": 938 }, { "epoch": 0.117375, "grad_norm": 3.1744375228881836, "grad_norm_var": 0.2796176021162296, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.328961133956909, "loss/hidden": 0.86328125, "loss/logits": 0.14012068510055542, "loss/reg": 0.0045626200735569, "step": 939 }, { "epoch": 0.1175, "grad_norm": 2.8904807567596436, "grad_norm_var": 0.276910977824096, "learning_rate": 0.0001, "loss": 1.0396, "loss/crossentropy": 2.96333646774292, "loss/hidden": 0.84765625, "loss/logits": 0.146368145942688, "loss/reg": 0.004560848698019981, "step": 940 }, { "epoch": 0.117625, "grad_norm": 2.7542107105255127, "grad_norm_var": 0.2151558946350927, "learning_rate": 0.0001, "loss": 1.0154, "loss/crossentropy": 2.326488494873047, "loss/hidden": 0.828125, "loss/logits": 0.14169706404209137, "loss/reg": 0.0045591117814183235, "step": 941 }, { "epoch": 0.11775, "grad_norm": 2.8061575889587402, "grad_norm_var": 0.20356104069779402, "learning_rate": 0.0001, "loss": 1.067, "loss/crossentropy": 2.483823537826538, "loss/hidden": 0.85546875, "loss/logits": 0.16598659753799438, "loss/reg": 0.004557049833238125, "step": 942 }, { "epoch": 0.117875, "grad_norm": 2.4513025283813477, "grad_norm_var": 0.2187293570918861, "learning_rate": 0.0001, "loss": 1.1831, "loss/crossentropy": 2.3770830631256104, "loss/hidden": 0.9609375, "loss/logits": 0.176588237285614, "loss/reg": 0.0045554060488939285, "step": 943 }, { "epoch": 0.118, "grad_norm": 2.757690906524658, "grad_norm_var": 0.19878318945221735, "learning_rate": 0.0001, "loss": 1.2487, "loss/crossentropy": 2.335298538208008, "loss/hidden": 1.0078125, "loss/logits": 0.19532084465026855, "loss/reg": 0.00455334922298789, "step": 944 }, { "epoch": 0.118125, "grad_norm": 3.0741937160491943, "grad_norm_var": 0.19037881818987876, "learning_rate": 0.0001, "loss": 1.1111, "loss/crossentropy": 2.5113272666931152, "loss/hidden": 0.8984375, "loss/logits": 0.16711819171905518, "loss/reg": 0.004551599267870188, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.156649589538574, "grad_norm_var": 0.22844079123048383, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.654160976409912, "loss/hidden": 0.8203125, "loss/logits": 0.14654606580734253, "loss/reg": 0.004549470264464617, "step": 946 }, { "epoch": 0.118375, "grad_norm": 3.1699886322021484, "grad_norm_var": 0.22512891612332073, "learning_rate": 0.0001, "loss": 1.2032, "loss/crossentropy": 2.6897716522216797, "loss/hidden": 0.9921875, "loss/logits": 0.16554811596870422, "loss/reg": 0.0045473333448171616, "step": 947 }, { "epoch": 0.1185, "grad_norm": 54.68584442138672, "grad_norm_var": 167.50451750408678, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.5791327953338623, "loss/hidden": 0.85546875, "loss/logits": 0.12866336107254028, "loss/reg": 0.004545523319393396, "step": 948 }, { "epoch": 0.118625, "grad_norm": 3.2524545192718506, "grad_norm_var": 167.45880382689273, "learning_rate": 0.0001, "loss": 1.1858, "loss/crossentropy": 2.6711151599884033, "loss/hidden": 0.95703125, "loss/logits": 0.1833563894033432, "loss/reg": 0.00454343156889081, "step": 949 }, { "epoch": 0.11875, "grad_norm": 5.301136016845703, "grad_norm_var": 167.26685801766013, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.7770638465881348, "loss/hidden": 1.28125, "loss/logits": 0.22637835144996643, "loss/reg": 0.004541344009339809, "step": 950 }, { "epoch": 0.118875, "grad_norm": 2.384737730026245, "grad_norm_var": 167.73447965423813, "learning_rate": 0.0001, "loss": 1.0308, "loss/crossentropy": 2.795858144760132, "loss/hidden": 0.83984375, "loss/logits": 0.14552150666713715, "loss/reg": 0.004539397079497576, "step": 951 }, { "epoch": 0.119, "grad_norm": 3.921651601791382, "grad_norm_var": 167.32475778000344, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.6767590045928955, "loss/hidden": 1.140625, "loss/logits": 0.1861056089401245, "loss/reg": 0.004537293687462807, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.567948818206787, "grad_norm_var": 167.32483199379854, "learning_rate": 0.0001, "loss": 1.022, "loss/crossentropy": 2.5112462043762207, "loss/hidden": 0.82421875, "loss/logits": 0.1524544656276703, "loss/reg": 0.004535375162959099, "step": 953 }, { "epoch": 0.11925, "grad_norm": 3.9113171100616455, "grad_norm_var": 166.86572618880632, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.447330951690674, "loss/hidden": 1.1640625, "loss/logits": 0.20876702666282654, "loss/reg": 0.004533402621746063, "step": 954 }, { "epoch": 0.119375, "grad_norm": 2.407547950744629, "grad_norm_var": 167.22501953627514, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.4667739868164062, "loss/hidden": 0.82421875, "loss/logits": 0.1411362886428833, "loss/reg": 0.004531473852694035, "step": 955 }, { "epoch": 0.1195, "grad_norm": 3.312300205230713, "grad_norm_var": 167.0454581165803, "learning_rate": 0.0001, "loss": 1.1527, "loss/crossentropy": 2.6299078464508057, "loss/hidden": 0.9375, "loss/logits": 0.16988055408000946, "loss/reg": 0.0045296428725123405, "step": 956 }, { "epoch": 0.119625, "grad_norm": 2.991645097732544, "grad_norm_var": 166.93650144942362, "learning_rate": 0.0001, "loss": 1.145, "loss/crossentropy": 2.484005928039551, "loss/hidden": 0.94140625, "loss/logits": 0.158270001411438, "loss/reg": 0.0045278542675077915, "step": 957 }, { "epoch": 0.11975, "grad_norm": 2.3066608905792236, "grad_norm_var": 167.18625092351098, "learning_rate": 0.0001, "loss": 1.0326, "loss/crossentropy": 2.3425681591033936, "loss/hidden": 0.828125, "loss/logits": 0.1592123955488205, "loss/reg": 0.004525760654360056, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.4849298000335693, "grad_norm_var": 167.16910661257995, "learning_rate": 0.0001, "loss": 1.016, "loss/crossentropy": 2.577565908432007, "loss/hidden": 0.8203125, "loss/logits": 0.15045437216758728, "loss/reg": 0.00452386075630784, "step": 959 }, { "epoch": 0.12, "grad_norm": 3.1377410888671875, "grad_norm_var": 166.99899214100878, "learning_rate": 0.0001, "loss": 1.1344, "loss/crossentropy": 2.570946455001831, "loss/hidden": 0.921875, "loss/logits": 0.16726532578468323, "loss/reg": 0.004521827679127455, "step": 960 }, { "epoch": 0.120125, "grad_norm": 4.738165378570557, "grad_norm_var": 166.45265671613615, "learning_rate": 0.0001, "loss": 1.4551, "loss/crossentropy": 2.71376371383667, "loss/hidden": 1.203125, "loss/logits": 0.20679137110710144, "loss/reg": 0.0045196013525128365, "step": 961 }, { "epoch": 0.12025, "grad_norm": 3.7636489868164062, "grad_norm_var": 165.70042257567124, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.3803255558013916, "loss/hidden": 1.1015625, "loss/logits": 0.2515062689781189, "loss/reg": 0.004517595283687115, "step": 962 }, { "epoch": 0.120375, "grad_norm": 2.638967752456665, "grad_norm_var": 165.95531506158162, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.2372846603393555, "loss/hidden": 1.0234375, "loss/logits": 0.16512709856033325, "loss/reg": 0.004515463951975107, "step": 963 }, { "epoch": 0.1205, "grad_norm": 45.81782531738281, "grad_norm_var": 113.88107496030446, "learning_rate": 0.0001, "loss": 1.1605, "loss/crossentropy": 2.742631435394287, "loss/hidden": 0.9453125, "loss/logits": 0.17006908357143402, "loss/reg": 0.004513174295425415, "step": 964 }, { "epoch": 0.120625, "grad_norm": 2.8806581497192383, "grad_norm_var": 114.02262985566777, "learning_rate": 0.0001, "loss": 1.2776, "loss/crossentropy": 2.4858973026275635, "loss/hidden": 1.0234375, "loss/logits": 0.20908400416374207, "loss/reg": 0.004510868340730667, "step": 965 }, { "epoch": 0.12075, "grad_norm": 2.2246646881103516, "grad_norm_var": 114.86410220669458, "learning_rate": 0.0001, "loss": 1.1264, "loss/crossentropy": 2.499250888824463, "loss/hidden": 0.91015625, "loss/logits": 0.17112451791763306, "loss/reg": 0.004508919548243284, "step": 966 }, { "epoch": 0.120875, "grad_norm": 2.6645731925964355, "grad_norm_var": 114.74462216300226, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.5269155502319336, "loss/hidden": 1.03125, "loss/logits": 0.17614489793777466, "loss/reg": 0.004506917670369148, "step": 967 }, { "epoch": 0.121, "grad_norm": 2.5289218425750732, "grad_norm_var": 115.20270599436985, "learning_rate": 0.0001, "loss": 0.9631, "loss/crossentropy": 2.5555944442749023, "loss/hidden": 0.78515625, "loss/logits": 0.13286443054676056, "loss/reg": 0.004505137912929058, "step": 968 }, { "epoch": 0.121125, "grad_norm": 2.0906901359558105, "grad_norm_var": 115.41297732177252, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.4322237968444824, "loss/hidden": 0.8515625, "loss/logits": 0.1318206787109375, "loss/reg": 0.0045034573413431644, "step": 969 }, { "epoch": 0.12125, "grad_norm": 2.5444202423095703, "grad_norm_var": 115.84094031889703, "learning_rate": 0.0001, "loss": 1.1368, "loss/crossentropy": 2.548710584640503, "loss/hidden": 0.9296875, "loss/logits": 0.16206462681293488, "loss/reg": 0.004501515068113804, "step": 970 }, { "epoch": 0.121375, "grad_norm": 2.175011157989502, "grad_norm_var": 115.94123463799326, "learning_rate": 0.0001, "loss": 1.0439, "loss/crossentropy": 2.5955300331115723, "loss/hidden": 0.8515625, "loss/logits": 0.14735567569732666, "loss/reg": 0.004499473143368959, "step": 971 }, { "epoch": 0.1215, "grad_norm": 2.360872507095337, "grad_norm_var": 116.2777207470046, "learning_rate": 0.0001, "loss": 1.078, "loss/crossentropy": 2.328791379928589, "loss/hidden": 0.88671875, "loss/logits": 0.14633293449878693, "loss/reg": 0.00449743214994669, "step": 972 }, { "epoch": 0.121625, "grad_norm": 2.2626869678497314, "grad_norm_var": 116.55077789644868, "learning_rate": 0.0001, "loss": 1.0265, "loss/crossentropy": 2.6862330436706543, "loss/hidden": 0.83984375, "loss/logits": 0.1417045295238495, "loss/reg": 0.004495698027312756, "step": 973 }, { "epoch": 0.12175, "grad_norm": 2.239927291870117, "grad_norm_var": 116.57870277427698, "learning_rate": 0.0001, "loss": 0.9865, "loss/crossentropy": 2.6493136882781982, "loss/hidden": 0.8046875, "loss/logits": 0.13682736456394196, "loss/reg": 0.004493638873100281, "step": 974 }, { "epoch": 0.121875, "grad_norm": 3.526413917541504, "grad_norm_var": 116.24036193195982, "learning_rate": 0.0001, "loss": 1.4212, "loss/crossentropy": 2.3598437309265137, "loss/hidden": 1.1640625, "loss/logits": 0.21225669980049133, "loss/reg": 0.004491583444178104, "step": 975 }, { "epoch": 0.122, "grad_norm": 2.4012386798858643, "grad_norm_var": 116.5037542152016, "learning_rate": 0.0001, "loss": 1.0692, "loss/crossentropy": 2.8136839866638184, "loss/hidden": 0.8828125, "loss/logits": 0.14152291417121887, "loss/reg": 0.004489597398787737, "step": 976 }, { "epoch": 0.122125, "grad_norm": 3.48690128326416, "grad_norm_var": 116.71680821300758, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.5272679328918457, "loss/hidden": 0.921875, "loss/logits": 0.1498267650604248, "loss/reg": 0.004487714730203152, "step": 977 }, { "epoch": 0.12225, "grad_norm": 2.596402406692505, "grad_norm_var": 117.0489228171561, "learning_rate": 0.0001, "loss": 1.0618, "loss/crossentropy": 2.5853097438812256, "loss/hidden": 0.875, "loss/logits": 0.14198589324951172, "loss/reg": 0.0044856141321361065, "step": 978 }, { "epoch": 0.122375, "grad_norm": 2.4075088500976562, "grad_norm_var": 117.1336997192439, "learning_rate": 0.0001, "loss": 1.0708, "loss/crossentropy": 2.4603147506713867, "loss/hidden": 0.875, "loss/logits": 0.1509513258934021, "loss/reg": 0.004483620636165142, "step": 979 }, { "epoch": 0.1225, "grad_norm": 2.511711597442627, "grad_norm_var": 0.17809257377307758, "learning_rate": 0.0001, "loss": 1.0112, "loss/crossentropy": 2.369588613510132, "loss/hidden": 0.80859375, "loss/logits": 0.15779206156730652, "loss/reg": 0.00448161456733942, "step": 980 }, { "epoch": 0.122625, "grad_norm": 2.4200518131256104, "grad_norm_var": 0.1714391921620101, "learning_rate": 0.0001, "loss": 1.1021, "loss/crossentropy": 2.5299947261810303, "loss/hidden": 0.8828125, "loss/logits": 0.17450904846191406, "loss/reg": 0.004479666240513325, "step": 981 }, { "epoch": 0.12275, "grad_norm": 2.180694580078125, "grad_norm_var": 0.17333618624260225, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.4115102291107178, "loss/hidden": 0.80859375, "loss/logits": 0.13229887187480927, "loss/reg": 0.004477777983993292, "step": 982 }, { "epoch": 0.122875, "grad_norm": 2.063762664794922, "grad_norm_var": 0.1847061967544647, "learning_rate": 0.0001, "loss": 1.0019, "loss/crossentropy": 2.617342948913574, "loss/hidden": 0.8046875, "loss/logits": 0.1524919718503952, "loss/reg": 0.004475918132811785, "step": 983 }, { "epoch": 0.123, "grad_norm": 2.1369118690490723, "grad_norm_var": 0.19213655390988696, "learning_rate": 0.0001, "loss": 0.9803, "loss/crossentropy": 2.2203562259674072, "loss/hidden": 0.796875, "loss/logits": 0.13871444761753082, "loss/reg": 0.004473875742405653, "step": 984 }, { "epoch": 0.123125, "grad_norm": 2.5142624378204346, "grad_norm_var": 0.18233307349070932, "learning_rate": 0.0001, "loss": 1.2147, "loss/crossentropy": 2.369795322418213, "loss/hidden": 0.98046875, "loss/logits": 0.1895258128643036, "loss/reg": 0.00447199959307909, "step": 985 }, { "epoch": 0.12325, "grad_norm": 2.2619707584381104, "grad_norm_var": 0.18524330473807685, "learning_rate": 0.0001, "loss": 1.0356, "loss/crossentropy": 2.594536781311035, "loss/hidden": 0.84765625, "loss/logits": 0.1432015299797058, "loss/reg": 0.0044701374135911465, "step": 986 }, { "epoch": 0.123375, "grad_norm": 2.548429012298584, "grad_norm_var": 0.1791892169034893, "learning_rate": 0.0001, "loss": 1.1041, "loss/crossentropy": 2.4283149242401123, "loss/hidden": 0.88671875, "loss/logits": 0.17271637916564941, "loss/reg": 0.004468323662877083, "step": 987 }, { "epoch": 0.1235, "grad_norm": 1.967695951461792, "grad_norm_var": 0.19588156260189724, "learning_rate": 0.0001, "loss": 1.1141, "loss/crossentropy": 2.6532421112060547, "loss/hidden": 0.9140625, "loss/logits": 0.15532562136650085, "loss/reg": 0.004466407001018524, "step": 988 }, { "epoch": 0.123625, "grad_norm": 1.9731650352478027, "grad_norm_var": 0.2091392377621749, "learning_rate": 0.0001, "loss": 0.9699, "loss/crossentropy": 2.6200947761535645, "loss/hidden": 0.796875, "loss/logits": 0.1283724009990692, "loss/reg": 0.004464692436158657, "step": 989 }, { "epoch": 0.12375, "grad_norm": 2.0132744312286377, "grad_norm_var": 0.21876841065467237, "learning_rate": 0.0001, "loss": 1.0469, "loss/crossentropy": 2.4755969047546387, "loss/hidden": 0.84375, "loss/logits": 0.1585705578327179, "loss/reg": 0.00446262676268816, "step": 990 }, { "epoch": 0.123875, "grad_norm": 2.1972060203552246, "grad_norm_var": 0.13632242813181178, "learning_rate": 0.0001, "loss": 1.0328, "loss/crossentropy": 2.3555867671966553, "loss/hidden": 0.85546875, "loss/logits": 0.13275080919265747, "loss/reg": 0.004460789728909731, "step": 991 }, { "epoch": 0.124, "grad_norm": 3.8369944095611572, "grad_norm_var": 0.2739970385829828, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.2441718578338623, "loss/hidden": 1.3984375, "loss/logits": 0.2908269166946411, "loss/reg": 0.00445876969024539, "step": 992 }, { "epoch": 0.124125, "grad_norm": 2.786052703857422, "grad_norm_var": 0.20731647630768535, "learning_rate": 0.0001, "loss": 1.1283, "loss/crossentropy": 2.646028995513916, "loss/hidden": 0.9140625, "loss/logits": 0.1697021722793579, "loss/reg": 0.004456843715161085, "step": 993 }, { "epoch": 0.12425, "grad_norm": 2.5664174556732178, "grad_norm_var": 0.20659147596586322, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.3940696716308594, "loss/hidden": 0.94921875, "loss/logits": 0.16191905736923218, "loss/reg": 0.004455073736608028, "step": 994 }, { "epoch": 0.124375, "grad_norm": 2.2383058071136475, "grad_norm_var": 0.20819184179118794, "learning_rate": 0.0001, "loss": 0.9266, "loss/crossentropy": 2.499830722808838, "loss/hidden": 0.76171875, "loss/logits": 0.12036766111850739, "loss/reg": 0.004453308880329132, "step": 995 }, { "epoch": 0.1245, "grad_norm": 2.340665340423584, "grad_norm_var": 0.207211701006554, "learning_rate": 0.0001, "loss": 1.1757, "loss/crossentropy": 2.1450212001800537, "loss/hidden": 0.96484375, "loss/logits": 0.16630741953849792, "loss/reg": 0.004451683722436428, "step": 996 }, { "epoch": 0.124625, "grad_norm": 2.18617582321167, "grad_norm_var": 0.20931483319411062, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.145817518234253, "loss/hidden": 0.94140625, "loss/logits": 0.18221250176429749, "loss/reg": 0.004450384993106127, "step": 997 }, { "epoch": 0.12475, "grad_norm": 2.809575319290161, "grad_norm_var": 0.218725690321934, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.4370713233947754, "loss/hidden": 0.81640625, "loss/logits": 0.14186908304691315, "loss/reg": 0.004448299296200275, "step": 998 }, { "epoch": 0.124875, "grad_norm": 2.1984119415283203, "grad_norm_var": 0.21377643978811706, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.4677696228027344, "loss/hidden": 1.078125, "loss/logits": 0.1787460744380951, "loss/reg": 0.004446576349437237, "step": 999 }, { "epoch": 0.125, "grad_norm": 2.6378896236419678, "grad_norm_var": 0.2111563626515095, "learning_rate": 0.0001, "loss": 1.1774, "loss/crossentropy": 2.496150255203247, "loss/hidden": 0.93359375, "loss/logits": 0.19933247566223145, "loss/reg": 0.004444715566933155, "step": 1000 }, { "epoch": 0.125125, "grad_norm": 2.227482795715332, "grad_norm_var": 0.213544138660752, "learning_rate": 0.0001, "loss": 1.0916, "loss/crossentropy": 2.3874671459198, "loss/hidden": 0.89453125, "loss/logits": 0.15264838933944702, "loss/reg": 0.004442666191607714, "step": 1001 }, { "epoch": 0.12525, "grad_norm": 2.6360232830047607, "grad_norm_var": 0.21419004520447135, "learning_rate": 0.0001, "loss": 1.2063, "loss/crossentropy": 2.181767225265503, "loss/hidden": 1.0234375, "loss/logits": 0.1384468972682953, "loss/reg": 0.004440974909812212, "step": 1002 }, { "epoch": 0.125375, "grad_norm": 2.564113140106201, "grad_norm_var": 0.2144159920830437, "learning_rate": 0.0001, "loss": 1.2358, "loss/crossentropy": 2.3964531421661377, "loss/hidden": 1.0234375, "loss/logits": 0.16797608137130737, "loss/reg": 0.00443902425467968, "step": 1003 }, { "epoch": 0.1255, "grad_norm": 2.2647745609283447, "grad_norm_var": 0.20087855485435188, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.4876301288604736, "loss/hidden": 0.90625, "loss/logits": 0.1588534414768219, "loss/reg": 0.004437169525772333, "step": 1004 }, { "epoch": 0.125625, "grad_norm": 2.9978485107421875, "grad_norm_var": 0.19899346976312698, "learning_rate": 0.0001, "loss": 1.0965, "loss/crossentropy": 2.5442988872528076, "loss/hidden": 0.8984375, "loss/logits": 0.15375682711601257, "loss/reg": 0.00443507032468915, "step": 1005 }, { "epoch": 0.12575, "grad_norm": 3.734666585922241, "grad_norm_var": 0.26529031932980823, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.6442348957061768, "loss/hidden": 1.109375, "loss/logits": 0.24400165677070618, "loss/reg": 0.0044328500516712666, "step": 1006 }, { "epoch": 0.125875, "grad_norm": 2.0463345050811768, "grad_norm_var": 0.27559841867322327, "learning_rate": 0.0001, "loss": 0.9624, "loss/crossentropy": 2.744716167449951, "loss/hidden": 0.77734375, "loss/logits": 0.14069810509681702, "loss/reg": 0.004430860280990601, "step": 1007 }, { "epoch": 0.126, "grad_norm": 2.5981385707855225, "grad_norm_var": 0.1720635201097591, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.3077027797698975, "loss/hidden": 0.95703125, "loss/logits": 0.18115082383155823, "loss/reg": 0.004428706131875515, "step": 1008 }, { "epoch": 0.126125, "grad_norm": 2.8853800296783447, "grad_norm_var": 0.17577912545770383, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 3.0455784797668457, "loss/hidden": 0.9375, "loss/logits": 0.19088850915431976, "loss/reg": 0.004426531493663788, "step": 1009 }, { "epoch": 0.12625, "grad_norm": 3.3546810150146484, "grad_norm_var": 0.2154711693487454, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.360203504562378, "loss/hidden": 1.125, "loss/logits": 0.18671754002571106, "loss/reg": 0.004424425307661295, "step": 1010 }, { "epoch": 0.126375, "grad_norm": 3.1136724948883057, "grad_norm_var": 0.2202687348010554, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 1.7926069498062134, "loss/hidden": 1.2265625, "loss/logits": 0.22981694340705872, "loss/reg": 0.004422247409820557, "step": 1011 }, { "epoch": 0.1265, "grad_norm": 2.9608895778656006, "grad_norm_var": 0.2177180299988663, "learning_rate": 0.0001, "loss": 0.9969, "loss/crossentropy": 2.716583251953125, "loss/hidden": 0.80859375, "loss/logits": 0.14412102103233337, "loss/reg": 0.004419958218932152, "step": 1012 }, { "epoch": 0.126625, "grad_norm": 2.627195358276367, "grad_norm_var": 0.1996009545068233, "learning_rate": 0.0001, "loss": 1.2014, "loss/crossentropy": 2.4077627658843994, "loss/hidden": 0.96484375, "loss/logits": 0.19235679507255554, "loss/reg": 0.004417847376316786, "step": 1013 }, { "epoch": 0.12675, "grad_norm": 2.6041698455810547, "grad_norm_var": 0.20001931967983994, "learning_rate": 0.0001, "loss": 1.0357, "loss/crossentropy": 2.6691579818725586, "loss/hidden": 0.85546875, "loss/logits": 0.13609513640403748, "loss/reg": 0.0044156271032989025, "step": 1014 }, { "epoch": 0.126875, "grad_norm": 2.6999282836914062, "grad_norm_var": 0.18114680748988857, "learning_rate": 0.0001, "loss": 1.1957, "loss/crossentropy": 2.2058401107788086, "loss/hidden": 0.98828125, "loss/logits": 0.16330450773239136, "loss/reg": 0.004413560498505831, "step": 1015 }, { "epoch": 0.127, "grad_norm": 2.9605417251586914, "grad_norm_var": 0.1829561774470515, "learning_rate": 0.0001, "loss": 1.1698, "loss/crossentropy": 2.5311779975891113, "loss/hidden": 0.9296875, "loss/logits": 0.19603696465492249, "loss/reg": 0.0044115264900028706, "step": 1016 }, { "epoch": 0.127125, "grad_norm": 3.1632213592529297, "grad_norm_var": 0.17033870731749232, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.628852367401123, "loss/hidden": 0.9453125, "loss/logits": 0.1477031111717224, "loss/reg": 0.0044094715267419815, "step": 1017 }, { "epoch": 0.12725, "grad_norm": 2.1563079357147217, "grad_norm_var": 0.19685525865983494, "learning_rate": 0.0001, "loss": 1.156, "loss/crossentropy": 2.51649808883667, "loss/hidden": 0.93359375, "loss/logits": 0.17829856276512146, "loss/reg": 0.004407336004078388, "step": 1018 }, { "epoch": 0.127375, "grad_norm": 2.838027238845825, "grad_norm_var": 0.19308506502144737, "learning_rate": 0.0001, "loss": 1.1716, "loss/crossentropy": 1.851514458656311, "loss/hidden": 0.984375, "loss/logits": 0.1431439369916916, "loss/reg": 0.004405440296977758, "step": 1019 }, { "epoch": 0.1275, "grad_norm": 3.7514472007751465, "grad_norm_var": 0.2225789179284817, "learning_rate": 0.0001, "loss": 1.2725, "loss/crossentropy": 2.4725522994995117, "loss/hidden": 0.99609375, "loss/logits": 0.23235675692558289, "loss/reg": 0.004403635859489441, "step": 1020 }, { "epoch": 0.127625, "grad_norm": 2.899569034576416, "grad_norm_var": 0.22197611268337217, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.393155336380005, "loss/hidden": 1.109375, "loss/logits": 0.187089741230011, "loss/reg": 0.004401590209454298, "step": 1021 }, { "epoch": 0.12775, "grad_norm": 3.2884371280670166, "grad_norm_var": 0.18473910601510302, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.3898444175720215, "loss/hidden": 1.3203125, "loss/logits": 0.24515338242053986, "loss/reg": 0.004399486817419529, "step": 1022 }, { "epoch": 0.127875, "grad_norm": 3.160599708557129, "grad_norm_var": 0.13970793310639895, "learning_rate": 0.0001, "loss": 1.1549, "loss/crossentropy": 2.3978278636932373, "loss/hidden": 0.96484375, "loss/logits": 0.14608745276927948, "loss/reg": 0.004397205542773008, "step": 1023 }, { "epoch": 0.128, "grad_norm": 3.4500718116760254, "grad_norm_var": 0.14607975431924464, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 1.8279949426651, "loss/hidden": 1.1640625, "loss/logits": 0.18429754674434662, "loss/reg": 0.00439491355791688, "step": 1024 }, { "epoch": 0.128125, "grad_norm": 3.99407696723938, "grad_norm_var": 0.20675474417581274, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.702716588973999, "loss/hidden": 1.2265625, "loss/logits": 0.23438766598701477, "loss/reg": 0.004392672795802355, "step": 1025 }, { "epoch": 0.12825, "grad_norm": 2.2928214073181152, "grad_norm_var": 0.23606107387848377, "learning_rate": 0.0001, "loss": 1.2118, "loss/crossentropy": 2.398599624633789, "loss/hidden": 0.9921875, "loss/logits": 0.17572686076164246, "loss/reg": 0.004390507936477661, "step": 1026 }, { "epoch": 0.128375, "grad_norm": 3.0106916427612305, "grad_norm_var": 0.2351295893724907, "learning_rate": 0.0001, "loss": 1.2084, "loss/crossentropy": 2.242279529571533, "loss/hidden": 0.98046875, "loss/logits": 0.18401223421096802, "loss/reg": 0.004388165660202503, "step": 1027 }, { "epoch": 0.1285, "grad_norm": 3.3926002979278564, "grad_norm_var": 0.24503759295084418, "learning_rate": 0.0001, "loss": 1.2829, "loss/crossentropy": 2.689535617828369, "loss/hidden": 1.0625, "loss/logits": 0.1765148937702179, "loss/reg": 0.0043861158192157745, "step": 1028 }, { "epoch": 0.128625, "grad_norm": 4.1095452308654785, "grad_norm_var": 0.3051103506304455, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.3078904151916504, "loss/hidden": 1.1484375, "loss/logits": 0.224439799785614, "loss/reg": 0.004383730702102184, "step": 1029 }, { "epoch": 0.12875, "grad_norm": 2.599076747894287, "grad_norm_var": 0.30545598256471346, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.7188191413879395, "loss/hidden": 0.890625, "loss/logits": 0.20472605526447296, "loss/reg": 0.004381467588245869, "step": 1030 }, { "epoch": 0.128875, "grad_norm": 2.078481435775757, "grad_norm_var": 0.36360767736667393, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.741642713546753, "loss/hidden": 0.82421875, "loss/logits": 0.15186432003974915, "loss/reg": 0.00437910296022892, "step": 1031 }, { "epoch": 0.129, "grad_norm": 5.224343776702881, "grad_norm_var": 0.6503873685492408, "learning_rate": 0.0001, "loss": 1.3017, "loss/crossentropy": 2.672182559967041, "loss/hidden": 1.0390625, "loss/logits": 0.21886497735977173, "loss/reg": 0.0043770503252744675, "step": 1032 }, { "epoch": 0.129125, "grad_norm": 3.197111129760742, "grad_norm_var": 0.6502338467882434, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 2.442309617996216, "loss/hidden": 1.1015625, "loss/logits": 0.19233301281929016, "loss/reg": 0.004375019110739231, "step": 1033 }, { "epoch": 0.12925, "grad_norm": 2.78690767288208, "grad_norm_var": 0.5860556952241872, "learning_rate": 0.0001, "loss": 1.1761, "loss/crossentropy": 2.430758237838745, "loss/hidden": 0.9609375, "loss/logits": 0.17138496041297913, "loss/reg": 0.00437304237857461, "step": 1034 }, { "epoch": 0.129375, "grad_norm": 2.5011260509490967, "grad_norm_var": 0.6118626954588627, "learning_rate": 0.0001, "loss": 1.0916, "loss/crossentropy": 2.541623115539551, "loss/hidden": 0.89453125, "loss/logits": 0.15334823727607727, "loss/reg": 0.004370801616460085, "step": 1035 }, { "epoch": 0.1295, "grad_norm": 2.1834847927093506, "grad_norm_var": 0.6572482832048148, "learning_rate": 0.0001, "loss": 1.002, "loss/crossentropy": 2.252703905105591, "loss/hidden": 0.81640625, "loss/logits": 0.1418866515159607, "loss/reg": 0.004368768073618412, "step": 1036 }, { "epoch": 0.129625, "grad_norm": 3.4379196166992188, "grad_norm_var": 0.6584227357505256, "learning_rate": 0.0001, "loss": 1.2578, "loss/crossentropy": 2.5080173015594482, "loss/hidden": 1.03125, "loss/logits": 0.18284769356250763, "loss/reg": 0.004366564564406872, "step": 1037 }, { "epoch": 0.12975, "grad_norm": 2.793656587600708, "grad_norm_var": 0.6658574542034102, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.028933048248291, "loss/hidden": 1.265625, "loss/logits": 0.22508540749549866, "loss/reg": 0.004364544991403818, "step": 1038 }, { "epoch": 0.129875, "grad_norm": 2.5851385593414307, "grad_norm_var": 0.6848422923307773, "learning_rate": 0.0001, "loss": 1.1177, "loss/crossentropy": 2.3001692295074463, "loss/hidden": 0.9140625, "loss/logits": 0.16001108288764954, "loss/reg": 0.0043626632541418076, "step": 1039 }, { "epoch": 0.13, "grad_norm": 3.0864601135253906, "grad_norm_var": 0.6762458829728395, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 2.3810689449310303, "loss/hidden": 0.98828125, "loss/logits": 0.17900380492210388, "loss/reg": 0.0043608080595731735, "step": 1040 }, { "epoch": 0.130125, "grad_norm": 2.6737496852874756, "grad_norm_var": 0.6242103012797673, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.0557336807250977, "loss/hidden": 1.1328125, "loss/logits": 0.2154931128025055, "loss/reg": 0.004358771722763777, "step": 1041 }, { "epoch": 0.13025, "grad_norm": 2.583439350128174, "grad_norm_var": 0.6022000179942284, "learning_rate": 0.0001, "loss": 1.0657, "loss/crossentropy": 2.530609607696533, "loss/hidden": 0.875, "loss/logits": 0.14716514945030212, "loss/reg": 0.004356934688985348, "step": 1042 }, { "epoch": 0.130375, "grad_norm": 2.3127689361572266, "grad_norm_var": 0.6330661539787814, "learning_rate": 0.0001, "loss": 1.0062, "loss/crossentropy": 2.5741043090820312, "loss/hidden": 0.83203125, "loss/logits": 0.1306590735912323, "loss/reg": 0.004354908596724272, "step": 1043 }, { "epoch": 0.1305, "grad_norm": 2.1028034687042236, "grad_norm_var": 0.6646412556630875, "learning_rate": 0.0001, "loss": 1.005, "loss/crossentropy": 2.2496836185455322, "loss/hidden": 0.8359375, "loss/logits": 0.12550613284111023, "loss/reg": 0.004353053402155638, "step": 1044 }, { "epoch": 0.130625, "grad_norm": 2.9018990993499756, "grad_norm_var": 0.5595824371855532, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.459836959838867, "loss/hidden": 0.88671875, "loss/logits": 0.16267293691635132, "loss/reg": 0.004351151175796986, "step": 1045 }, { "epoch": 0.13075, "grad_norm": 2.1798477172851562, "grad_norm_var": 0.582665735357125, "learning_rate": 0.0001, "loss": 1.0568, "loss/crossentropy": 2.393702983856201, "loss/hidden": 0.85546875, "loss/logits": 0.15785646438598633, "loss/reg": 0.004349268972873688, "step": 1046 }, { "epoch": 0.130875, "grad_norm": 3.7185163497924805, "grad_norm_var": 0.5953326384247966, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.5875675678253174, "loss/hidden": 0.91796875, "loss/logits": 0.1525106430053711, "loss/reg": 0.004347451031208038, "step": 1047 }, { "epoch": 0.131, "grad_norm": 2.5062150955200195, "grad_norm_var": 0.21175117035612606, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.432136297225952, "loss/hidden": 1.0390625, "loss/logits": 0.18555600941181183, "loss/reg": 0.004345426335930824, "step": 1048 }, { "epoch": 0.131125, "grad_norm": 2.811901807785034, "grad_norm_var": 0.19661994295048071, "learning_rate": 0.0001, "loss": 1.041, "loss/crossentropy": 2.586029291152954, "loss/hidden": 0.85546875, "loss/logits": 0.14211300015449524, "loss/reg": 0.004343352280557156, "step": 1049 }, { "epoch": 0.13125, "grad_norm": 2.6836400032043457, "grad_norm_var": 0.19606042121245745, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.2877132892608643, "loss/hidden": 0.92578125, "loss/logits": 0.14965856075286865, "loss/reg": 0.004341335967183113, "step": 1050 }, { "epoch": 0.131375, "grad_norm": 3.004340171813965, "grad_norm_var": 0.19911977640688458, "learning_rate": 0.0001, "loss": 0.9704, "loss/crossentropy": 2.545414686203003, "loss/hidden": 0.79296875, "loss/logits": 0.13407136499881744, "loss/reg": 0.004339275881648064, "step": 1051 }, { "epoch": 0.1315, "grad_norm": 2.3175387382507324, "grad_norm_var": 0.19060218969874068, "learning_rate": 0.0001, "loss": 1.2081, "loss/crossentropy": 2.1056127548217773, "loss/hidden": 1.0, "loss/logits": 0.16477006673812866, "loss/reg": 0.004337204620242119, "step": 1052 }, { "epoch": 0.131625, "grad_norm": 2.9183707237243652, "grad_norm_var": 0.15851891177443728, "learning_rate": 0.0001, "loss": 1.1596, "loss/crossentropy": 2.5549087524414062, "loss/hidden": 0.93359375, "loss/logits": 0.18266820907592773, "loss/reg": 0.004335105884820223, "step": 1053 }, { "epoch": 0.13175, "grad_norm": 2.005140781402588, "grad_norm_var": 0.18740257136220345, "learning_rate": 0.0001, "loss": 1.0446, "loss/crossentropy": 2.5802786350250244, "loss/hidden": 0.84765625, "loss/logits": 0.1536553055047989, "loss/reg": 0.00433309143409133, "step": 1054 }, { "epoch": 0.131875, "grad_norm": 2.5984582901000977, "grad_norm_var": 0.18729938166855695, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.447845458984375, "loss/hidden": 0.95703125, "loss/logits": 0.17910084128379822, "loss/reg": 0.0043309698812663555, "step": 1055 }, { "epoch": 0.132, "grad_norm": 2.2338852882385254, "grad_norm_var": 0.1831504662831539, "learning_rate": 0.0001, "loss": 1.0552, "loss/crossentropy": 2.322484254837036, "loss/hidden": 0.859375, "loss/logits": 0.15252941846847534, "loss/reg": 0.004328942392021418, "step": 1056 }, { "epoch": 0.132125, "grad_norm": 2.383525848388672, "grad_norm_var": 0.18544613518572697, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.3245205879211426, "loss/hidden": 1.0078125, "loss/logits": 0.16857783496379852, "loss/reg": 0.004326963797211647, "step": 1057 }, { "epoch": 0.13225, "grad_norm": 3.00066876411438, "grad_norm_var": 0.19657906255275273, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.3864874839782715, "loss/hidden": 1.109375, "loss/logits": 0.23875172436237335, "loss/reg": 0.004325190093368292, "step": 1058 }, { "epoch": 0.132375, "grad_norm": 3.0184719562530518, "grad_norm_var": 0.20021081345078998, "learning_rate": 0.0001, "loss": 1.1478, "loss/crossentropy": 2.5393733978271484, "loss/hidden": 0.91015625, "loss/logits": 0.1943821907043457, "loss/reg": 0.004323435481637716, "step": 1059 }, { "epoch": 0.1325, "grad_norm": 4.587968826293945, "grad_norm_var": 0.4052032312871603, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.2922558784484863, "loss/hidden": 1.015625, "loss/logits": 0.17564015090465546, "loss/reg": 0.00432176748290658, "step": 1060 }, { "epoch": 0.132625, "grad_norm": 8.61639404296875, "grad_norm_var": 2.5204572599607027, "learning_rate": 0.0001, "loss": 1.7566, "loss/crossentropy": 2.5616378784179688, "loss/hidden": 1.5, "loss/logits": 0.21341325342655182, "loss/reg": 0.0043197330087423325, "step": 1061 }, { "epoch": 0.13275, "grad_norm": 3.9714202880859375, "grad_norm_var": 2.486558816101914, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.4953112602233887, "loss/hidden": 1.1875, "loss/logits": 0.21827445924282074, "loss/reg": 0.0043179914355278015, "step": 1062 }, { "epoch": 0.132875, "grad_norm": 4.598790168762207, "grad_norm_var": 2.587217087573132, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.4032106399536133, "loss/hidden": 1.1796875, "loss/logits": 0.23717749118804932, "loss/reg": 0.004315928090363741, "step": 1063 }, { "epoch": 0.133, "grad_norm": 2.2761240005493164, "grad_norm_var": 2.6157540828571424, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.2180681228637695, "loss/hidden": 0.91015625, "loss/logits": 0.15992170572280884, "loss/reg": 0.004314035642892122, "step": 1064 }, { "epoch": 0.133125, "grad_norm": 5.624876022338867, "grad_norm_var": 2.921925131142435, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.552769899368286, "loss/hidden": 1.2734375, "loss/logits": 0.22636112570762634, "loss/reg": 0.004311975557357073, "step": 1065 }, { "epoch": 0.13325, "grad_norm": 2.610703945159912, "grad_norm_var": 2.9300990717021325, "learning_rate": 0.0001, "loss": 1.152, "loss/crossentropy": 2.693028450012207, "loss/hidden": 0.92578125, "loss/logits": 0.18307983875274658, "loss/reg": 0.00430967565625906, "step": 1066 }, { "epoch": 0.133375, "grad_norm": 2.063502311706543, "grad_norm_var": 3.0457713158671065, "learning_rate": 0.0001, "loss": 0.9481, "loss/crossentropy": 2.3977253437042236, "loss/hidden": 0.7890625, "loss/logits": 0.11597828567028046, "loss/reg": 0.004307459108531475, "step": 1067 }, { "epoch": 0.1335, "grad_norm": 2.2253830432891846, "grad_norm_var": 3.059929800360324, "learning_rate": 0.0001, "loss": 1.1051, "loss/crossentropy": 2.43769907951355, "loss/hidden": 0.90234375, "loss/logits": 0.1597452163696289, "loss/reg": 0.0043051764369010925, "step": 1068 }, { "epoch": 0.133625, "grad_norm": 2.5021073818206787, "grad_norm_var": 3.0986482846073766, "learning_rate": 0.0001, "loss": 1.213, "loss/crossentropy": 2.4540703296661377, "loss/hidden": 0.99609375, "loss/logits": 0.1738748550415039, "loss/reg": 0.004303151275962591, "step": 1069 }, { "epoch": 0.13375, "grad_norm": 2.6333425045013428, "grad_norm_var": 3.006911696263074, "learning_rate": 0.0001, "loss": 1.1423, "loss/crossentropy": 2.481794595718384, "loss/hidden": 0.9375, "loss/logits": 0.16177596151828766, "loss/reg": 0.00430120388045907, "step": 1070 }, { "epoch": 0.133875, "grad_norm": 4.2749247550964355, "grad_norm_var": 2.995780076937819, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.376904249191284, "loss/hidden": 1.046875, "loss/logits": 0.1420845091342926, "loss/reg": 0.004299336113035679, "step": 1071 }, { "epoch": 0.134, "grad_norm": 3.749925374984741, "grad_norm_var": 2.8756386517730372, "learning_rate": 0.0001, "loss": 1.6449, "loss/crossentropy": 2.1668522357940674, "loss/hidden": 1.3515625, "loss/logits": 0.2503596842288971, "loss/reg": 0.004297502338886261, "step": 1072 }, { "epoch": 0.134125, "grad_norm": 2.394890069961548, "grad_norm_var": 2.873752523963793, "learning_rate": 0.0001, "loss": 1.0081, "loss/crossentropy": 2.4769225120544434, "loss/hidden": 0.8203125, "loss/logits": 0.14485791325569153, "loss/reg": 0.004295617341995239, "step": 1073 }, { "epoch": 0.13425, "grad_norm": 2.521232843399048, "grad_norm_var": 2.9286262129865914, "learning_rate": 0.0001, "loss": 1.1533, "loss/crossentropy": 2.302316665649414, "loss/hidden": 0.9375, "loss/logits": 0.1728420853614807, "loss/reg": 0.004293751437216997, "step": 1074 }, { "epoch": 0.134375, "grad_norm": 3.0982587337493896, "grad_norm_var": 2.92279106991038, "learning_rate": 0.0001, "loss": 1.0719, "loss/crossentropy": 2.5007832050323486, "loss/hidden": 0.8828125, "loss/logits": 0.14618419110774994, "loss/reg": 0.004291870631277561, "step": 1075 }, { "epoch": 0.1345, "grad_norm": 2.1802334785461426, "grad_norm_var": 2.9709529639568184, "learning_rate": 0.0001, "loss": 1.0165, "loss/crossentropy": 2.736433982849121, "loss/hidden": 0.83984375, "loss/logits": 0.1337248980998993, "loss/reg": 0.00428979704156518, "step": 1076 }, { "epoch": 0.134625, "grad_norm": 2.309565782546997, "grad_norm_var": 1.1199522794543773, "learning_rate": 0.0001, "loss": 1.1145, "loss/crossentropy": 2.277212619781494, "loss/hidden": 0.91796875, "loss/logits": 0.1536218523979187, "loss/reg": 0.004287887830287218, "step": 1077 }, { "epoch": 0.13475, "grad_norm": 2.270759105682373, "grad_norm_var": 1.0951157521634287, "learning_rate": 0.0001, "loss": 0.9921, "loss/crossentropy": 2.6023480892181396, "loss/hidden": 0.8046875, "loss/logits": 0.14457595348358154, "loss/reg": 0.00428583100438118, "step": 1078 }, { "epoch": 0.134875, "grad_norm": 2.0448989868164062, "grad_norm_var": 0.9441842031089095, "learning_rate": 0.0001, "loss": 1.0484, "loss/crossentropy": 2.619910955429077, "loss/hidden": 0.84765625, "loss/logits": 0.15788725018501282, "loss/reg": 0.0042838454246521, "step": 1079 }, { "epoch": 0.135, "grad_norm": 2.8515126705169678, "grad_norm_var": 0.9247776412201837, "learning_rate": 0.0001, "loss": 0.9959, "loss/crossentropy": 2.4871740341186523, "loss/hidden": 0.8203125, "loss/logits": 0.13277903199195862, "loss/reg": 0.004281722474843264, "step": 1080 }, { "epoch": 0.135125, "grad_norm": 2.5293588638305664, "grad_norm_var": 0.3720854176506897, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.8959267139434814, "loss/hidden": 0.828125, "loss/logits": 0.15750843286514282, "loss/reg": 0.004279691725969315, "step": 1081 }, { "epoch": 0.13525, "grad_norm": 2.639998197555542, "grad_norm_var": 0.37201959594675976, "learning_rate": 0.0001, "loss": 1.1434, "loss/crossentropy": 2.4725499153137207, "loss/hidden": 0.93359375, "loss/logits": 0.16706131398677826, "loss/reg": 0.0042777759954333305, "step": 1082 }, { "epoch": 0.135375, "grad_norm": 2.474238157272339, "grad_norm_var": 0.35082104566976846, "learning_rate": 0.0001, "loss": 0.9969, "loss/crossentropy": 2.4064362049102783, "loss/hidden": 0.81640625, "loss/logits": 0.13769997656345367, "loss/reg": 0.004275754559785128, "step": 1083 }, { "epoch": 0.1355, "grad_norm": 5.083343982696533, "grad_norm_var": 0.6923522790607459, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.635760545730591, "loss/hidden": 0.98828125, "loss/logits": 0.16631773114204407, "loss/reg": 0.0042738220654428005, "step": 1084 }, { "epoch": 0.135625, "grad_norm": 2.9828197956085205, "grad_norm_var": 0.6846627645265992, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.64700984954834, "loss/hidden": 1.171875, "loss/logits": 0.20732998847961426, "loss/reg": 0.0042719184421002865, "step": 1085 }, { "epoch": 0.13575, "grad_norm": 2.5733866691589355, "grad_norm_var": 0.6868389075343996, "learning_rate": 0.0001, "loss": 1.0162, "loss/crossentropy": 2.6855356693267822, "loss/hidden": 0.83203125, "loss/logits": 0.14148542284965515, "loss/reg": 0.004269769415259361, "step": 1086 }, { "epoch": 0.135875, "grad_norm": 3.7361340522766113, "grad_norm_var": 0.6043207840777595, "learning_rate": 0.0001, "loss": 1.6576, "loss/crossentropy": 2.38763689994812, "loss/hidden": 1.328125, "loss/logits": 0.286837637424469, "loss/reg": 0.004267562180757523, "step": 1087 }, { "epoch": 0.136, "grad_norm": 2.4554789066314697, "grad_norm_var": 0.5520046435601859, "learning_rate": 0.0001, "loss": 1.0165, "loss/crossentropy": 2.697847366333008, "loss/hidden": 0.8203125, "loss/logits": 0.15349683165550232, "loss/reg": 0.004265283700078726, "step": 1088 }, { "epoch": 0.136125, "grad_norm": 2.2833757400512695, "grad_norm_var": 0.5581976166383347, "learning_rate": 0.0001, "loss": 1.1297, "loss/crossentropy": 2.6681339740753174, "loss/hidden": 0.91796875, "loss/logits": 0.16904997825622559, "loss/reg": 0.004263162147253752, "step": 1089 }, { "epoch": 0.13625, "grad_norm": 2.1826672554016113, "grad_norm_var": 0.5757864160069344, "learning_rate": 0.0001, "loss": 1.0232, "loss/crossentropy": 2.3187673091888428, "loss/hidden": 0.84375, "loss/logits": 0.136864572763443, "loss/reg": 0.004261130001395941, "step": 1090 }, { "epoch": 0.136375, "grad_norm": 3.739326238632202, "grad_norm_var": 0.632863410677898, "learning_rate": 0.0001, "loss": 1.1344, "loss/crossentropy": 2.590574264526367, "loss/hidden": 0.9140625, "loss/logits": 0.1777951866388321, "loss/reg": 0.004259143024682999, "step": 1091 }, { "epoch": 0.1365, "grad_norm": 2.1081202030181885, "grad_norm_var": 0.6388693719171433, "learning_rate": 0.0001, "loss": 1.0255, "loss/crossentropy": 2.8422060012817383, "loss/hidden": 0.84375, "loss/logits": 0.13917985558509827, "loss/reg": 0.0042572119273245335, "step": 1092 }, { "epoch": 0.136625, "grad_norm": 1.9327036142349243, "grad_norm_var": 0.6707091951265027, "learning_rate": 0.0001, "loss": 0.9773, "loss/crossentropy": 2.6250529289245605, "loss/hidden": 0.796875, "loss/logits": 0.13788098096847534, "loss/reg": 0.004255138337612152, "step": 1093 }, { "epoch": 0.13675, "grad_norm": 2.4659841060638428, "grad_norm_var": 0.6607986154781931, "learning_rate": 0.0001, "loss": 1.159, "loss/crossentropy": 2.2569518089294434, "loss/hidden": 0.94140625, "loss/logits": 0.1750330626964569, "loss/reg": 0.004253007471561432, "step": 1094 }, { "epoch": 0.136875, "grad_norm": 2.6554629802703857, "grad_norm_var": 0.6262725765926574, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.323014974594116, "loss/hidden": 1.1015625, "loss/logits": 0.2430596649646759, "loss/reg": 0.004250808618962765, "step": 1095 }, { "epoch": 0.137, "grad_norm": 2.722032070159912, "grad_norm_var": 0.6263166142478738, "learning_rate": 0.0001, "loss": 1.0409, "loss/crossentropy": 2.4675052165985107, "loss/hidden": 0.86328125, "loss/logits": 0.13518026471138, "loss/reg": 0.004248757380992174, "step": 1096 }, { "epoch": 0.137125, "grad_norm": 2.510000228881836, "grad_norm_var": 0.6270005997929298, "learning_rate": 0.0001, "loss": 1.0616, "loss/crossentropy": 2.3804080486297607, "loss/hidden": 0.87109375, "loss/logits": 0.14798954129219055, "loss/reg": 0.004246733151376247, "step": 1097 }, { "epoch": 0.13725, "grad_norm": 3.058847188949585, "grad_norm_var": 0.6299195109389221, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.7722012996673584, "loss/hidden": 0.83984375, "loss/logits": 0.1473253071308136, "loss/reg": 0.004244515672326088, "step": 1098 }, { "epoch": 0.137375, "grad_norm": 4.170520782470703, "grad_norm_var": 0.7337604064260393, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 3.227797508239746, "loss/hidden": 0.8828125, "loss/logits": 0.18627075850963593, "loss/reg": 0.004242491442710161, "step": 1099 }, { "epoch": 0.1375, "grad_norm": 2.20058536529541, "grad_norm_var": 0.4201977001022351, "learning_rate": 0.0001, "loss": 1.0558, "loss/crossentropy": 2.337442636489868, "loss/hidden": 0.8671875, "loss/logits": 0.14616578817367554, "loss/reg": 0.004240325652062893, "step": 1100 }, { "epoch": 0.137625, "grad_norm": 2.378139019012451, "grad_norm_var": 0.42315778530047454, "learning_rate": 0.0001, "loss": 0.9849, "loss/crossentropy": 2.418541431427002, "loss/hidden": 0.82421875, "loss/logits": 0.1183251217007637, "loss/reg": 0.00423810537904501, "step": 1101 }, { "epoch": 0.13775, "grad_norm": 2.4013891220092773, "grad_norm_var": 0.42787131976948645, "learning_rate": 0.0001, "loss": 1.044, "loss/crossentropy": 2.642296552658081, "loss/hidden": 0.8359375, "loss/logits": 0.16568773984909058, "loss/reg": 0.004236077889800072, "step": 1102 }, { "epoch": 0.137875, "grad_norm": 2.395822286605835, "grad_norm_var": 0.35275757091918025, "learning_rate": 0.0001, "loss": 1.0828, "loss/crossentropy": 2.457338571548462, "loss/hidden": 0.8828125, "loss/logits": 0.15768851339817047, "loss/reg": 0.004234058782458305, "step": 1103 }, { "epoch": 0.138, "grad_norm": 2.5931308269500732, "grad_norm_var": 0.35121999529942605, "learning_rate": 0.0001, "loss": 1.2445, "loss/crossentropy": 2.370553731918335, "loss/hidden": 1.015625, "loss/logits": 0.1865496039390564, "loss/reg": 0.004232100211083889, "step": 1104 }, { "epoch": 0.138125, "grad_norm": 2.5963783264160156, "grad_norm_var": 0.3436125305875331, "learning_rate": 0.0001, "loss": 0.9801, "loss/crossentropy": 2.5301551818847656, "loss/hidden": 0.80078125, "loss/logits": 0.1369716078042984, "loss/reg": 0.004230163525789976, "step": 1105 }, { "epoch": 0.13825, "grad_norm": 2.951883316040039, "grad_norm_var": 0.3345145438296379, "learning_rate": 0.0001, "loss": 0.9827, "loss/crossentropy": 3.0806362628936768, "loss/hidden": 0.8046875, "loss/logits": 0.13572098314762115, "loss/reg": 0.0042281243950128555, "step": 1106 }, { "epoch": 0.138375, "grad_norm": 2.111954927444458, "grad_norm_var": 0.2701844296523925, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.3894336223602295, "loss/hidden": 0.984375, "loss/logits": 0.1716623604297638, "loss/reg": 0.004226126708090305, "step": 1107 }, { "epoch": 0.1385, "grad_norm": 3.0929460525512695, "grad_norm_var": 0.2690614225265311, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.3786656856536865, "loss/hidden": 1.0, "loss/logits": 0.15606652200222015, "loss/reg": 0.004224089439958334, "step": 1108 }, { "epoch": 0.138625, "grad_norm": 3.325866460800171, "grad_norm_var": 0.25900974055676873, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.668332099914551, "loss/hidden": 0.8671875, "loss/logits": 0.1874256730079651, "loss/reg": 0.00422210618853569, "step": 1109 }, { "epoch": 0.13875, "grad_norm": 3.774113178253174, "grad_norm_var": 0.32044570279647266, "learning_rate": 0.0001, "loss": 0.9994, "loss/crossentropy": 2.6192150115966797, "loss/hidden": 0.8203125, "loss/logits": 0.13685137033462524, "loss/reg": 0.004220074508339167, "step": 1110 }, { "epoch": 0.138875, "grad_norm": 3.656229019165039, "grad_norm_var": 0.362595306683378, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.3791658878326416, "loss/hidden": 0.91796875, "loss/logits": 0.1557137668132782, "loss/reg": 0.004218076355755329, "step": 1111 }, { "epoch": 0.139, "grad_norm": 2.500005006790161, "grad_norm_var": 0.37009339748612907, "learning_rate": 0.0001, "loss": 1.1123, "loss/crossentropy": 2.2807750701904297, "loss/hidden": 0.89453125, "loss/logits": 0.17557448148727417, "loss/reg": 0.004216110333800316, "step": 1112 }, { "epoch": 0.139125, "grad_norm": 2.6660091876983643, "grad_norm_var": 0.36438900758074033, "learning_rate": 0.0001, "loss": 1.0914, "loss/crossentropy": 2.6014909744262695, "loss/hidden": 0.89453125, "loss/logits": 0.1547611951828003, "loss/reg": 0.004214086104184389, "step": 1113 }, { "epoch": 0.13925, "grad_norm": 2.3377296924591064, "grad_norm_var": 0.37845468238227015, "learning_rate": 0.0001, "loss": 1.1139, "loss/crossentropy": 2.7464191913604736, "loss/hidden": 0.90234375, "loss/logits": 0.16947275400161743, "loss/reg": 0.004212013445794582, "step": 1114 }, { "epoch": 0.139375, "grad_norm": 2.4301981925964355, "grad_norm_var": 0.2548452172505712, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.5051262378692627, "loss/hidden": 0.8828125, "loss/logits": 0.14710885286331177, "loss/reg": 0.0042099012061953545, "step": 1115 }, { "epoch": 0.1395, "grad_norm": 2.815537452697754, "grad_norm_var": 0.23644342440034408, "learning_rate": 0.0001, "loss": 1.1469, "loss/crossentropy": 2.6059226989746094, "loss/hidden": 0.92578125, "loss/logits": 0.17900194227695465, "loss/reg": 0.0042078145779669285, "step": 1116 }, { "epoch": 0.139625, "grad_norm": 2.9746217727661133, "grad_norm_var": 0.22897005663627562, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.892439842224121, "loss/hidden": 0.91015625, "loss/logits": 0.15726403892040253, "loss/reg": 0.004205791745334864, "step": 1117 }, { "epoch": 0.13975, "grad_norm": 2.1311089992523193, "grad_norm_var": 0.24750381735717988, "learning_rate": 0.0001, "loss": 1.0609, "loss/crossentropy": 2.477149248123169, "loss/hidden": 0.8671875, "loss/logits": 0.15162619948387146, "loss/reg": 0.004203863441944122, "step": 1118 }, { "epoch": 0.139875, "grad_norm": 2.1927154064178467, "grad_norm_var": 0.2602719277895896, "learning_rate": 0.0001, "loss": 1.0056, "loss/crossentropy": 2.3840315341949463, "loss/hidden": 0.8046875, "loss/logits": 0.15886801481246948, "loss/reg": 0.004202014300972223, "step": 1119 }, { "epoch": 0.14, "grad_norm": 2.850994348526001, "grad_norm_var": 0.25871108381456814, "learning_rate": 0.0001, "loss": 1.2744, "loss/crossentropy": 2.935702085494995, "loss/hidden": 1.0390625, "loss/logits": 0.1933366060256958, "loss/reg": 0.004200007766485214, "step": 1120 }, { "epoch": 0.140125, "grad_norm": 2.7692806720733643, "grad_norm_var": 0.2564497076881023, "learning_rate": 0.0001, "loss": 1.1429, "loss/crossentropy": 2.305704355239868, "loss/hidden": 0.89453125, "loss/logits": 0.20640595257282257, "loss/reg": 0.0041981167159974575, "step": 1121 }, { "epoch": 0.14025, "grad_norm": 2.4898860454559326, "grad_norm_var": 0.2595914437365072, "learning_rate": 0.0001, "loss": 0.9756, "loss/crossentropy": 2.6060853004455566, "loss/hidden": 0.7890625, "loss/logits": 0.14455264806747437, "loss/reg": 0.004196107853204012, "step": 1122 }, { "epoch": 0.140375, "grad_norm": 31.69025421142578, "grad_norm_var": 52.39364291838152, "learning_rate": 0.0001, "loss": 1.1367, "loss/crossentropy": 2.816709518432617, "loss/hidden": 0.9453125, "loss/logits": 0.14948531985282898, "loss/reg": 0.004194286651909351, "step": 1123 }, { "epoch": 0.1405, "grad_norm": 2.6551873683929443, "grad_norm_var": 52.493939083618166, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.7032201290130615, "loss/hidden": 0.88671875, "loss/logits": 0.17413891851902008, "loss/reg": 0.004192298278212547, "step": 1124 }, { "epoch": 0.140625, "grad_norm": 2.712380886077881, "grad_norm_var": 52.61994398728479, "learning_rate": 0.0001, "loss": 1.1739, "loss/crossentropy": 2.622096300125122, "loss/hidden": 0.95703125, "loss/logits": 0.17497727274894714, "loss/reg": 0.0041902982629835606, "step": 1125 }, { "epoch": 0.14075, "grad_norm": 2.355632781982422, "grad_norm_var": 52.890626023811876, "learning_rate": 0.0001, "loss": 0.9784, "loss/crossentropy": 2.3552989959716797, "loss/hidden": 0.79296875, "loss/logits": 0.143496572971344, "loss/reg": 0.004188500810414553, "step": 1126 }, { "epoch": 0.140875, "grad_norm": 2.7159671783447266, "grad_norm_var": 53.045613069983446, "learning_rate": 0.0001, "loss": 1.1407, "loss/crossentropy": 2.45786190032959, "loss/hidden": 0.93359375, "loss/logits": 0.16524501144886017, "loss/reg": 0.004186683334410191, "step": 1127 }, { "epoch": 0.141, "grad_norm": 3.830094575881958, "grad_norm_var": 52.820475932071744, "learning_rate": 0.0001, "loss": 1.0811, "loss/crossentropy": 2.1976234912872314, "loss/hidden": 0.88671875, "loss/logits": 0.1525820791721344, "loss/reg": 0.004184682387858629, "step": 1128 }, { "epoch": 0.141125, "grad_norm": 2.6621882915496826, "grad_norm_var": 52.82139900035407, "learning_rate": 0.0001, "loss": 1.0459, "loss/crossentropy": 2.539736270904541, "loss/hidden": 0.81640625, "loss/logits": 0.1876693218946457, "loss/reg": 0.004182685166597366, "step": 1129 }, { "epoch": 0.14125, "grad_norm": 2.478451728820801, "grad_norm_var": 52.78251904082665, "learning_rate": 0.0001, "loss": 1.0815, "loss/crossentropy": 2.4196884632110596, "loss/hidden": 0.88671875, "loss/logits": 0.1529858261346817, "loss/reg": 0.00418076990172267, "step": 1130 }, { "epoch": 0.141375, "grad_norm": 3.299400568008423, "grad_norm_var": 52.59163994639377, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.5420329570770264, "loss/hidden": 1.0625, "loss/logits": 0.17164292931556702, "loss/reg": 0.004178792238235474, "step": 1131 }, { "epoch": 0.1415, "grad_norm": 2.3637001514434814, "grad_norm_var": 52.70822859008106, "learning_rate": 0.0001, "loss": 1.1613, "loss/crossentropy": 2.6717777252197266, "loss/hidden": 0.9453125, "loss/logits": 0.17417655885219574, "loss/reg": 0.004176879767328501, "step": 1132 }, { "epoch": 0.141625, "grad_norm": 2.500570058822632, "grad_norm_var": 52.819367266798494, "learning_rate": 0.0001, "loss": 0.9633, "loss/crossentropy": 2.6206371784210205, "loss/hidden": 0.78515625, "loss/logits": 0.13640643656253815, "loss/reg": 0.004175043664872646, "step": 1133 }, { "epoch": 0.14175, "grad_norm": 2.323843479156494, "grad_norm_var": 52.76129867971668, "learning_rate": 0.0001, "loss": 1.0942, "loss/crossentropy": 2.5218851566314697, "loss/hidden": 0.8984375, "loss/logits": 0.15399572253227234, "loss/reg": 0.004173224791884422, "step": 1134 }, { "epoch": 0.141875, "grad_norm": 2.0677201747894287, "grad_norm_var": 52.80061443559793, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.580735683441162, "loss/hidden": 0.87109375, "loss/logits": 0.14630158245563507, "loss/reg": 0.0041715288534760475, "step": 1135 }, { "epoch": 0.142, "grad_norm": 2.8460326194763184, "grad_norm_var": 52.80169720296209, "learning_rate": 0.0001, "loss": 1.2675, "loss/crossentropy": 2.315237283706665, "loss/hidden": 1.0625, "loss/logits": 0.16331645846366882, "loss/reg": 0.0041695088148117065, "step": 1136 }, { "epoch": 0.142125, "grad_norm": 2.169602632522583, "grad_norm_var": 52.96135990851253, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.5278215408325195, "loss/hidden": 0.8828125, "loss/logits": 0.1388036012649536, "loss/reg": 0.004167445003986359, "step": 1137 }, { "epoch": 0.14225, "grad_norm": 2.01615047454834, "grad_norm_var": 53.099042280735006, "learning_rate": 0.0001, "loss": 0.9819, "loss/crossentropy": 2.4882190227508545, "loss/hidden": 0.8046875, "loss/logits": 0.13553820550441742, "loss/reg": 0.004165465943515301, "step": 1138 }, { "epoch": 0.142375, "grad_norm": 2.4339070320129395, "grad_norm_var": 0.2098356411642726, "learning_rate": 0.0001, "loss": 0.9859, "loss/crossentropy": 2.644569158554077, "loss/hidden": 0.80859375, "loss/logits": 0.13571619987487793, "loss/reg": 0.00416343891993165, "step": 1139 }, { "epoch": 0.1425, "grad_norm": 2.3045334815979004, "grad_norm_var": 0.2144459690924632, "learning_rate": 0.0001, "loss": 1.061, "loss/crossentropy": 2.491943597793579, "loss/hidden": 0.859375, "loss/logits": 0.16004428267478943, "loss/reg": 0.004161354620009661, "step": 1140 }, { "epoch": 0.142625, "grad_norm": 2.173408031463623, "grad_norm_var": 0.22219091176189235, "learning_rate": 0.0001, "loss": 1.0067, "loss/crossentropy": 2.374779224395752, "loss/hidden": 0.8203125, "loss/logits": 0.14481596648693085, "loss/reg": 0.004159385804086924, "step": 1141 }, { "epoch": 0.14275, "grad_norm": 3.0710461139678955, "grad_norm_var": 0.23718192859111392, "learning_rate": 0.0001, "loss": 1.0164, "loss/crossentropy": 2.6076908111572266, "loss/hidden": 0.83984375, "loss/logits": 0.13495643436908722, "loss/reg": 0.004157309886068106, "step": 1142 }, { "epoch": 0.142875, "grad_norm": 3.763521432876587, "grad_norm_var": 0.3249627427406568, "learning_rate": 0.0001, "loss": 1.1313, "loss/crossentropy": 2.3016669750213623, "loss/hidden": 0.92578125, "loss/logits": 0.16395646333694458, "loss/reg": 0.004155360162258148, "step": 1143 }, { "epoch": 0.143, "grad_norm": 8.867222785949707, "grad_norm_var": 2.70734825211357, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.549853563308716, "loss/hidden": 1.1171875, "loss/logits": 0.23840667307376862, "loss/reg": 0.0041534146293997765, "step": 1144 }, { "epoch": 0.143125, "grad_norm": 4.079037189483643, "grad_norm_var": 2.7767747967197427, "learning_rate": 0.0001, "loss": 1.2853, "loss/crossentropy": 2.298107862472534, "loss/hidden": 1.0859375, "loss/logits": 0.15788918733596802, "loss/reg": 0.004151403903961182, "step": 1145 }, { "epoch": 0.14325, "grad_norm": 2.5469048023223877, "grad_norm_var": 2.7718749700746383, "learning_rate": 0.0001, "loss": 1.1133, "loss/crossentropy": 2.588602066040039, "loss/hidden": 0.8984375, "loss/logits": 0.17332546412944794, "loss/reg": 0.004149466287344694, "step": 1146 }, { "epoch": 0.143375, "grad_norm": 2.2044925689697266, "grad_norm_var": 2.8106347308786437, "learning_rate": 0.0001, "loss": 0.9728, "loss/crossentropy": 2.4486637115478516, "loss/hidden": 0.79296875, "loss/logits": 0.13837072253227234, "loss/reg": 0.004147485829889774, "step": 1147 }, { "epoch": 0.1435, "grad_norm": 2.0629138946533203, "grad_norm_var": 2.841135428686917, "learning_rate": 0.0001, "loss": 1.0498, "loss/crossentropy": 2.611081123352051, "loss/hidden": 0.8515625, "loss/logits": 0.1567818820476532, "loss/reg": 0.004145504906773567, "step": 1148 }, { "epoch": 0.143625, "grad_norm": 2.686124324798584, "grad_norm_var": 2.8318111276034035, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.3890397548675537, "loss/hidden": 0.9453125, "loss/logits": 0.1597655862569809, "loss/reg": 0.004143500700592995, "step": 1149 }, { "epoch": 0.14375, "grad_norm": 2.1094777584075928, "grad_norm_var": 2.8533239929343903, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.349745273590088, "loss/hidden": 0.91796875, "loss/logits": 0.17312946915626526, "loss/reg": 0.0041414061561226845, "step": 1150 }, { "epoch": 0.143875, "grad_norm": 2.2789037227630615, "grad_norm_var": 2.830912674059921, "learning_rate": 0.0001, "loss": 1.0119, "loss/crossentropy": 2.482297897338867, "loss/hidden": 0.828125, "loss/logits": 0.14236654341220856, "loss/reg": 0.004139502998441458, "step": 1151 }, { "epoch": 0.144, "grad_norm": 2.3409323692321777, "grad_norm_var": 2.855599485961874, "learning_rate": 0.0001, "loss": 1.0217, "loss/crossentropy": 2.4027068614959717, "loss/hidden": 0.8359375, "loss/logits": 0.1443997174501419, "loss/reg": 0.004137733485549688, "step": 1152 }, { "epoch": 0.144125, "grad_norm": 3.2646842002868652, "grad_norm_var": 2.8174411429914015, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.5013201236724854, "loss/hidden": 0.91015625, "loss/logits": 0.15787330269813538, "loss/reg": 0.004135794471949339, "step": 1153 }, { "epoch": 0.14425, "grad_norm": 3.01409912109375, "grad_norm_var": 2.747083786295129, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.341641426086426, "loss/hidden": 1.171875, "loss/logits": 0.1966913640499115, "loss/reg": 0.004133842419832945, "step": 1154 }, { "epoch": 0.144375, "grad_norm": 2.1998982429504395, "grad_norm_var": 2.770511502568856, "learning_rate": 0.0001, "loss": 1.027, "loss/crossentropy": 2.67824649810791, "loss/hidden": 0.8359375, "loss/logits": 0.1497696489095688, "loss/reg": 0.004131934605538845, "step": 1155 }, { "epoch": 0.1445, "grad_norm": 2.5142602920532227, "grad_norm_var": 2.7521224578865087, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.4931130409240723, "loss/hidden": 0.921875, "loss/logits": 0.18238465487957, "loss/reg": 0.004129941575229168, "step": 1156 }, { "epoch": 0.144625, "grad_norm": 2.1878349781036377, "grad_norm_var": 2.750403944498737, "learning_rate": 0.0001, "loss": 1.0802, "loss/crossentropy": 2.387873888015747, "loss/hidden": 0.890625, "loss/logits": 0.1483183354139328, "loss/reg": 0.004127953667193651, "step": 1157 }, { "epoch": 0.14475, "grad_norm": 2.688089609146118, "grad_norm_var": 2.759744220974267, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 2.6154494285583496, "loss/hidden": 0.85546875, "loss/logits": 0.13863371312618256, "loss/reg": 0.004126036539673805, "step": 1158 }, { "epoch": 0.144875, "grad_norm": 3.1651244163513184, "grad_norm_var": 2.7252368192156586, "learning_rate": 0.0001, "loss": 1.2689, "loss/crossentropy": 2.3369271755218506, "loss/hidden": 1.0234375, "loss/logits": 0.20426858961582184, "loss/reg": 0.004124056547880173, "step": 1159 }, { "epoch": 0.145, "grad_norm": 2.437448740005493, "grad_norm_var": 0.290374675783868, "learning_rate": 0.0001, "loss": 1.1068, "loss/crossentropy": 2.446481227874756, "loss/hidden": 0.91015625, "loss/logits": 0.15547212958335876, "loss/reg": 0.0041221086867153645, "step": 1160 }, { "epoch": 0.145125, "grad_norm": 3.550361394882202, "grad_norm_var": 0.20437982896584472, "learning_rate": 0.0001, "loss": 1.6847, "loss/crossentropy": 2.6945204734802246, "loss/hidden": 1.28125, "loss/logits": 0.36227577924728394, "loss/reg": 0.004120130091905594, "step": 1161 }, { "epoch": 0.14525, "grad_norm": 2.150529384613037, "grad_norm_var": 0.21585453142654387, "learning_rate": 0.0001, "loss": 1.0391, "loss/crossentropy": 2.3554482460021973, "loss/hidden": 0.84765625, "loss/logits": 0.15022103488445282, "loss/reg": 0.0041182260029017925, "step": 1162 }, { "epoch": 0.145375, "grad_norm": 4.6839704513549805, "grad_norm_var": 0.48472891056564626, "learning_rate": 0.0001, "loss": 1.5962, "loss/crossentropy": 2.456437587738037, "loss/hidden": 1.2890625, "loss/logits": 0.2660132944583893, "loss/reg": 0.004116271156817675, "step": 1163 }, { "epoch": 0.1455, "grad_norm": 2.3992183208465576, "grad_norm_var": 0.4628530155911977, "learning_rate": 0.0001, "loss": 1.0986, "loss/crossentropy": 2.535423755645752, "loss/hidden": 0.9140625, "loss/logits": 0.14340469241142273, "loss/reg": 0.0041144127026200294, "step": 1164 }, { "epoch": 0.145625, "grad_norm": 2.455538034439087, "grad_norm_var": 0.4675077175097224, "learning_rate": 0.0001, "loss": 1.1422, "loss/crossentropy": 2.5659542083740234, "loss/hidden": 0.9296875, "loss/logits": 0.17135412991046906, "loss/reg": 0.004112581722438335, "step": 1165 }, { "epoch": 0.14575, "grad_norm": 3.7746567726135254, "grad_norm_var": 0.5063635000809102, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.807483196258545, "loss/hidden": 1.1953125, "loss/logits": 0.22239741683006287, "loss/reg": 0.004110958427190781, "step": 1166 }, { "epoch": 0.145875, "grad_norm": 2.2929201126098633, "grad_norm_var": 0.5053662377320952, "learning_rate": 0.0001, "loss": 1.1351, "loss/crossentropy": 2.5097239017486572, "loss/hidden": 0.921875, "loss/logits": 0.17209036648273468, "loss/reg": 0.004109338391572237, "step": 1167 }, { "epoch": 0.146, "grad_norm": 4.034673690795898, "grad_norm_var": 0.5764809506272021, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.6598453521728516, "loss/hidden": 1.1953125, "loss/logits": 0.24519138038158417, "loss/reg": 0.0041076927445828915, "step": 1168 }, { "epoch": 0.146125, "grad_norm": 2.2866017818450928, "grad_norm_var": 0.5920811915580522, "learning_rate": 0.0001, "loss": 1.177, "loss/crossentropy": 2.3927576541900635, "loss/hidden": 0.953125, "loss/logits": 0.18283367156982422, "loss/reg": 0.004106137901544571, "step": 1169 }, { "epoch": 0.14625, "grad_norm": 2.899941921234131, "grad_norm_var": 0.5906217092668515, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.4218740463256836, "loss/hidden": 0.92578125, "loss/logits": 0.17189282178878784, "loss/reg": 0.004104320891201496, "step": 1170 }, { "epoch": 0.146375, "grad_norm": 2.9730427265167236, "grad_norm_var": 0.5601848624373048, "learning_rate": 0.0001, "loss": 1.0993, "loss/crossentropy": 2.669074296951294, "loss/hidden": 0.90234375, "loss/logits": 0.15591827034950256, "loss/reg": 0.004102461040019989, "step": 1171 }, { "epoch": 0.1465, "grad_norm": 3.55232572555542, "grad_norm_var": 0.5733288711493515, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.384329319000244, "loss/hidden": 1.0234375, "loss/logits": 0.22678744792938232, "loss/reg": 0.004100624471902847, "step": 1172 }, { "epoch": 0.146625, "grad_norm": 2.591209650039673, "grad_norm_var": 0.541389636484242, "learning_rate": 0.0001, "loss": 1.1135, "loss/crossentropy": 2.367767810821533, "loss/hidden": 0.9375, "loss/logits": 0.13497930765151978, "loss/reg": 0.004098633769899607, "step": 1173 }, { "epoch": 0.14675, "grad_norm": 2.9488017559051514, "grad_norm_var": 0.534935103556153, "learning_rate": 0.0001, "loss": 1.4669, "loss/crossentropy": 1.9882688522338867, "loss/hidden": 1.25, "loss/logits": 0.17595870792865753, "loss/reg": 0.004096675664186478, "step": 1174 }, { "epoch": 0.146875, "grad_norm": 2.837010145187378, "grad_norm_var": 0.5349767501482856, "learning_rate": 0.0001, "loss": 1.1548, "loss/crossentropy": 2.8100757598876953, "loss/hidden": 0.97265625, "loss/logits": 0.14116618037223816, "loss/reg": 0.004094698466360569, "step": 1175 }, { "epoch": 0.147, "grad_norm": 2.5989038944244385, "grad_norm_var": 0.5246730089916675, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.367872953414917, "loss/hidden": 1.0703125, "loss/logits": 0.16331195831298828, "loss/reg": 0.004092712886631489, "step": 1176 }, { "epoch": 0.147125, "grad_norm": 3.459014892578125, "grad_norm_var": 0.5185139879820743, "learning_rate": 0.0001, "loss": 1.1223, "loss/crossentropy": 2.533998727798462, "loss/hidden": 0.92578125, "loss/logits": 0.15561142563819885, "loss/reg": 0.004090711008757353, "step": 1177 }, { "epoch": 0.14725, "grad_norm": 5.084310054779053, "grad_norm_var": 0.7256747423481064, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.582167863845825, "loss/hidden": 1.0625, "loss/logits": 0.1748521625995636, "loss/reg": 0.004088713321834803, "step": 1178 }, { "epoch": 0.147375, "grad_norm": 2.778517961502075, "grad_norm_var": 0.5703725263929961, "learning_rate": 0.0001, "loss": 1.0471, "loss/crossentropy": 2.336113929748535, "loss/hidden": 0.8515625, "loss/logits": 0.15465718507766724, "loss/reg": 0.004086779430508614, "step": 1179 }, { "epoch": 0.1475, "grad_norm": 2.8686232566833496, "grad_norm_var": 0.5427611216294432, "learning_rate": 0.0001, "loss": 1.1435, "loss/crossentropy": 2.776197910308838, "loss/hidden": 0.9453125, "loss/logits": 0.1573391556739807, "loss/reg": 0.0040848455391824245, "step": 1180 }, { "epoch": 0.147625, "grad_norm": 2.511221170425415, "grad_norm_var": 0.5382462121749844, "learning_rate": 0.0001, "loss": 1.0821, "loss/crossentropy": 2.7198173999786377, "loss/hidden": 0.87890625, "loss/logits": 0.16234460473060608, "loss/reg": 0.004082926083356142, "step": 1181 }, { "epoch": 0.14775, "grad_norm": 2.1064679622650146, "grad_norm_var": 0.5606094401847085, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.6126627922058105, "loss/hidden": 0.8046875, "loss/logits": 0.1345212161540985, "loss/reg": 0.004080874379724264, "step": 1182 }, { "epoch": 0.147875, "grad_norm": 4.242970943450928, "grad_norm_var": 0.6172993082607185, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.121168375015259, "loss/hidden": 1.0703125, "loss/logits": 0.1731598824262619, "loss/reg": 0.0040789819322526455, "step": 1183 }, { "epoch": 0.148, "grad_norm": 2.6352462768554688, "grad_norm_var": 0.5673230040929977, "learning_rate": 0.0001, "loss": 1.0252, "loss/crossentropy": 2.692413091659546, "loss/hidden": 0.84375, "loss/logits": 0.14067476987838745, "loss/reg": 0.004077126272022724, "step": 1184 }, { "epoch": 0.148125, "grad_norm": 2.905735731124878, "grad_norm_var": 0.5304583396362827, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.290217399597168, "loss/hidden": 0.98046875, "loss/logits": 0.17918136715888977, "loss/reg": 0.004075322765856981, "step": 1185 }, { "epoch": 0.14825, "grad_norm": 2.0793650150299072, "grad_norm_var": 0.5902824998400082, "learning_rate": 0.0001, "loss": 0.949, "loss/crossentropy": 2.605687379837036, "loss/hidden": 0.77734375, "loss/logits": 0.1309652477502823, "loss/reg": 0.00407352764159441, "step": 1186 }, { "epoch": 0.148375, "grad_norm": 2.730095624923706, "grad_norm_var": 0.5951944585982081, "learning_rate": 0.0001, "loss": 1.018, "loss/crossentropy": 2.399637222290039, "loss/hidden": 0.84375, "loss/logits": 0.13348934054374695, "loss/reg": 0.004071622621268034, "step": 1187 }, { "epoch": 0.1485, "grad_norm": 2.3118958473205566, "grad_norm_var": 0.5992861461620653, "learning_rate": 0.0001, "loss": 0.969, "loss/crossentropy": 2.583070993423462, "loss/hidden": 0.7890625, "loss/logits": 0.13926547765731812, "loss/reg": 0.004069886170327663, "step": 1188 }, { "epoch": 0.148625, "grad_norm": 2.281296491622925, "grad_norm_var": 0.6187961724202968, "learning_rate": 0.0001, "loss": 1.1247, "loss/crossentropy": 2.4077699184417725, "loss/hidden": 0.91796875, "loss/logits": 0.16607880592346191, "loss/reg": 0.00406790804117918, "step": 1189 }, { "epoch": 0.14875, "grad_norm": 3.76084041595459, "grad_norm_var": 0.6654318302540614, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.7298569679260254, "loss/hidden": 1.0546875, "loss/logits": 0.28656214475631714, "loss/reg": 0.004065926186740398, "step": 1190 }, { "epoch": 0.148875, "grad_norm": 3.076002597808838, "grad_norm_var": 0.6654180683387788, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 1.9846049547195435, "loss/hidden": 1.3125, "loss/logits": 0.2518633008003235, "loss/reg": 0.004063920117914677, "step": 1191 }, { "epoch": 0.149, "grad_norm": 2.316555976867676, "grad_norm_var": 0.6841604530042008, "learning_rate": 0.0001, "loss": 1.0979, "loss/crossentropy": 2.3328921794891357, "loss/hidden": 0.89453125, "loss/logits": 0.16276977956295013, "loss/reg": 0.004061955027282238, "step": 1192 }, { "epoch": 0.149125, "grad_norm": 2.814150810241699, "grad_norm_var": 0.6661064219784556, "learning_rate": 0.0001, "loss": 1.203, "loss/crossentropy": 2.3084676265716553, "loss/hidden": 0.984375, "loss/logits": 0.17800584435462952, "loss/reg": 0.004059869330376387, "step": 1193 }, { "epoch": 0.14925, "grad_norm": 4.5728559494018555, "grad_norm_var": 0.5339391843004021, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.786238431930542, "loss/hidden": 1.0078125, "loss/logits": 0.18664950132369995, "loss/reg": 0.004058040212839842, "step": 1194 }, { "epoch": 0.149375, "grad_norm": 2.267996311187744, "grad_norm_var": 0.556761488955063, "learning_rate": 0.0001, "loss": 1.1708, "loss/crossentropy": 2.1988203525543213, "loss/hidden": 0.96484375, "loss/logits": 0.1653607189655304, "loss/reg": 0.004056154750287533, "step": 1195 }, { "epoch": 0.1495, "grad_norm": 2.1123440265655518, "grad_norm_var": 0.5898830056876873, "learning_rate": 0.0001, "loss": 0.9765, "loss/crossentropy": 2.301168441772461, "loss/hidden": 0.80859375, "loss/logits": 0.12740209698677063, "loss/reg": 0.004054322373121977, "step": 1196 }, { "epoch": 0.149625, "grad_norm": 3.071465253829956, "grad_norm_var": 0.5882785049222033, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.6485557556152344, "loss/hidden": 1.0234375, "loss/logits": 0.21422387659549713, "loss/reg": 0.004052514210343361, "step": 1197 }, { "epoch": 0.14975, "grad_norm": 2.4168970584869385, "grad_norm_var": 0.5643403352790185, "learning_rate": 0.0001, "loss": 1.0625, "loss/crossentropy": 2.5202739238739014, "loss/hidden": 0.87109375, "loss/logits": 0.15086334943771362, "loss/reg": 0.004050557967275381, "step": 1198 }, { "epoch": 0.149875, "grad_norm": 2.1462841033935547, "grad_norm_var": 0.44960492320894496, "learning_rate": 0.0001, "loss": 1.0988, "loss/crossentropy": 2.435528039932251, "loss/hidden": 0.90234375, "loss/logits": 0.1560034155845642, "loss/reg": 0.004048600792884827, "step": 1199 }, { "epoch": 0.15, "grad_norm": 2.1528284549713135, "grad_norm_var": 0.46951760615473387, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.4528579711914062, "loss/hidden": 1.078125, "loss/logits": 0.2022087574005127, "loss/reg": 0.0040466394275426865, "step": 1200 }, { "epoch": 0.150125, "grad_norm": 2.827105760574341, "grad_norm_var": 0.46762692410470286, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.634087562561035, "loss/hidden": 0.88671875, "loss/logits": 0.17087715864181519, "loss/reg": 0.004044875968247652, "step": 1201 }, { "epoch": 0.15025, "grad_norm": 2.268160104751587, "grad_norm_var": 0.45464383775398576, "learning_rate": 0.0001, "loss": 1.0992, "loss/crossentropy": 2.574035167694092, "loss/hidden": 0.89453125, "loss/logits": 0.16423508524894714, "loss/reg": 0.004043125547468662, "step": 1202 }, { "epoch": 0.150375, "grad_norm": 2.10304594039917, "grad_norm_var": 0.4763194687664772, "learning_rate": 0.0001, "loss": 1.0253, "loss/crossentropy": 2.686739444732666, "loss/hidden": 0.83984375, "loss/logits": 0.14499551057815552, "loss/reg": 0.004041461274027824, "step": 1203 }, { "epoch": 0.1505, "grad_norm": 2.8614940643310547, "grad_norm_var": 0.46996517485336897, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.327242136001587, "loss/hidden": 1.09375, "loss/logits": 0.1920863389968872, "loss/reg": 0.004039805382490158, "step": 1204 }, { "epoch": 0.150625, "grad_norm": 2.0004308223724365, "grad_norm_var": 0.4902227797061412, "learning_rate": 0.0001, "loss": 0.9759, "loss/crossentropy": 2.55458664894104, "loss/hidden": 0.80078125, "loss/logits": 0.13473068177700043, "loss/reg": 0.00403786962851882, "step": 1205 }, { "epoch": 0.15075, "grad_norm": 2.0800395011901855, "grad_norm_var": 0.42300499990140544, "learning_rate": 0.0001, "loss": 1.0823, "loss/crossentropy": 2.2755930423736572, "loss/hidden": 0.89453125, "loss/logits": 0.14743617177009583, "loss/reg": 0.00403629383072257, "step": 1206 }, { "epoch": 0.150875, "grad_norm": 2.8714046478271484, "grad_norm_var": 0.41176251270089, "learning_rate": 0.0001, "loss": 1.0406, "loss/crossentropy": 2.389319896697998, "loss/hidden": 0.83984375, "loss/logits": 0.16044044494628906, "loss/reg": 0.004034355282783508, "step": 1207 }, { "epoch": 0.151, "grad_norm": 2.136133909225464, "grad_norm_var": 0.41953769445076433, "learning_rate": 0.0001, "loss": 1.0709, "loss/crossentropy": 2.745492458343506, "loss/hidden": 0.8828125, "loss/logits": 0.14779764413833618, "loss/reg": 0.004032687284052372, "step": 1208 }, { "epoch": 0.151125, "grad_norm": 2.6204607486724854, "grad_norm_var": 0.4149034970549467, "learning_rate": 0.0001, "loss": 1.0942, "loss/crossentropy": 2.473696708679199, "loss/hidden": 0.91015625, "loss/logits": 0.14370602369308472, "loss/reg": 0.004030975513160229, "step": 1209 }, { "epoch": 0.15125, "grad_norm": 2.2523136138916016, "grad_norm_var": 0.11994939680660437, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.7228078842163086, "loss/hidden": 0.9296875, "loss/logits": 0.16212627291679382, "loss/reg": 0.004029178526252508, "step": 1210 }, { "epoch": 0.151375, "grad_norm": 3.1284432411193848, "grad_norm_var": 0.15259538885302745, "learning_rate": 0.0001, "loss": 1.0887, "loss/crossentropy": 2.81272029876709, "loss/hidden": 0.88671875, "loss/logits": 0.16168195009231567, "loss/reg": 0.004027185495942831, "step": 1211 }, { "epoch": 0.1515, "grad_norm": 2.160048246383667, "grad_norm_var": 0.15065002461184704, "learning_rate": 0.0001, "loss": 0.9783, "loss/crossentropy": 2.311624526977539, "loss/hidden": 0.8046875, "loss/logits": 0.13339203596115112, "loss/reg": 0.0040252963081002235, "step": 1212 }, { "epoch": 0.151625, "grad_norm": 2.9693639278411865, "grad_norm_var": 0.14275322843417157, "learning_rate": 0.0001, "loss": 0.9642, "loss/crossentropy": 2.664057970046997, "loss/hidden": 0.7890625, "loss/logits": 0.13490962982177734, "loss/reg": 0.0040232837200164795, "step": 1213 }, { "epoch": 0.15175, "grad_norm": 3.021174669265747, "grad_norm_var": 0.16394313365960494, "learning_rate": 0.0001, "loss": 1.2676, "loss/crossentropy": 2.6081109046936035, "loss/hidden": 1.046875, "loss/logits": 0.18048575520515442, "loss/reg": 0.004021205008029938, "step": 1214 }, { "epoch": 0.151875, "grad_norm": 2.3486392498016357, "grad_norm_var": 0.15763551716868943, "learning_rate": 0.0001, "loss": 0.9701, "loss/crossentropy": 2.4917783737182617, "loss/hidden": 0.79296875, "loss/logits": 0.13695400953292847, "loss/reg": 0.004019314423203468, "step": 1215 }, { "epoch": 0.152, "grad_norm": 2.184790849685669, "grad_norm_var": 0.15627282346626145, "learning_rate": 0.0001, "loss": 1.0597, "loss/crossentropy": 2.4108786582946777, "loss/hidden": 0.86328125, "loss/logits": 0.15620392560958862, "loss/reg": 0.004017516039311886, "step": 1216 }, { "epoch": 0.152125, "grad_norm": 2.46875262260437, "grad_norm_var": 0.1481710731830276, "learning_rate": 0.0001, "loss": 1.067, "loss/crossentropy": 2.67449688911438, "loss/hidden": 0.86328125, "loss/logits": 0.1635635495185852, "loss/reg": 0.004015681799501181, "step": 1217 }, { "epoch": 0.15225, "grad_norm": 2.1573519706726074, "grad_norm_var": 0.15187870918378293, "learning_rate": 0.0001, "loss": 1.1704, "loss/crossentropy": 2.501345634460449, "loss/hidden": 0.94140625, "loss/logits": 0.18887701630592346, "loss/reg": 0.0040138899348676205, "step": 1218 }, { "epoch": 0.152375, "grad_norm": 2.381070137023926, "grad_norm_var": 0.14346854325687347, "learning_rate": 0.0001, "loss": 1.0696, "loss/crossentropy": 2.485503673553467, "loss/hidden": 0.859375, "loss/logits": 0.1700785756111145, "loss/reg": 0.004012054763734341, "step": 1219 }, { "epoch": 0.1525, "grad_norm": 2.212719678878784, "grad_norm_var": 0.13656890921584572, "learning_rate": 0.0001, "loss": 1.0381, "loss/crossentropy": 2.6198275089263916, "loss/hidden": 0.84765625, "loss/logits": 0.15031346678733826, "loss/reg": 0.004010040778666735, "step": 1220 }, { "epoch": 0.152625, "grad_norm": 2.1536829471588135, "grad_norm_var": 0.12911465723150664, "learning_rate": 0.0001, "loss": 0.9917, "loss/crossentropy": 2.542593240737915, "loss/hidden": 0.8203125, "loss/logits": 0.13133659958839417, "loss/reg": 0.0040080067701637745, "step": 1221 }, { "epoch": 0.15275, "grad_norm": 2.5055301189422607, "grad_norm_var": 0.11963125742360768, "learning_rate": 0.0001, "loss": 1.1078, "loss/crossentropy": 2.522934675216675, "loss/hidden": 0.8671875, "loss/logits": 0.20054848492145538, "loss/reg": 0.004005954600870609, "step": 1222 }, { "epoch": 0.152875, "grad_norm": 2.0575757026672363, "grad_norm_var": 0.1178213242465496, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.6986868381500244, "loss/hidden": 0.875, "loss/logits": 0.14429835975170135, "loss/reg": 0.004004053305834532, "step": 1223 }, { "epoch": 0.153, "grad_norm": 6.1870503425598145, "grad_norm_var": 0.9888346629412268, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4771060943603516, "loss/hidden": 0.99609375, "loss/logits": 0.1621606945991516, "loss/reg": 0.004002041183412075, "step": 1224 }, { "epoch": 0.153125, "grad_norm": 2.094531774520874, "grad_norm_var": 1.009986051026931, "learning_rate": 0.0001, "loss": 1.0328, "loss/crossentropy": 2.4067788124084473, "loss/hidden": 0.8515625, "loss/logits": 0.141241192817688, "loss/reg": 0.004000107757747173, "step": 1225 }, { "epoch": 0.15325, "grad_norm": 2.305340528488159, "grad_norm_var": 1.007401731577304, "learning_rate": 0.0001, "loss": 1.1055, "loss/crossentropy": 2.624967336654663, "loss/hidden": 0.91796875, "loss/logits": 0.1475597620010376, "loss/reg": 0.003998105879873037, "step": 1226 }, { "epoch": 0.153375, "grad_norm": 9.206901550292969, "grad_norm_var": 3.7076283352537036, "learning_rate": 0.0001, "loss": 1.7336, "loss/crossentropy": 2.340308666229248, "loss/hidden": 1.265625, "loss/logits": 0.4280090034008026, "loss/reg": 0.003996132407337427, "step": 1227 }, { "epoch": 0.1535, "grad_norm": 7.1393914222717285, "grad_norm_var": 4.682389594648043, "learning_rate": 0.0001, "loss": 2.1514, "loss/crossentropy": 2.1572506427764893, "loss/hidden": 1.7578125, "loss/logits": 0.3536328077316284, "loss/reg": 0.0039941999129951, "step": 1228 }, { "epoch": 0.153625, "grad_norm": 2.1037001609802246, "grad_norm_var": 4.771672156590602, "learning_rate": 0.0001, "loss": 0.9941, "loss/crossentropy": 2.7013680934906006, "loss/hidden": 0.80859375, "loss/logits": 0.14557519555091858, "loss/reg": 0.003992319572716951, "step": 1229 }, { "epoch": 0.15375, "grad_norm": 2.157334804534912, "grad_norm_var": 4.84846901790952, "learning_rate": 0.0001, "loss": 1.0878, "loss/crossentropy": 2.556549310684204, "loss/hidden": 0.88671875, "loss/logits": 0.16115835309028625, "loss/reg": 0.003990530967712402, "step": 1230 }, { "epoch": 0.153875, "grad_norm": 2.0853333473205566, "grad_norm_var": 4.883710165437185, "learning_rate": 0.0001, "loss": 1.0472, "loss/crossentropy": 2.114297866821289, "loss/hidden": 0.86328125, "loss/logits": 0.14400479197502136, "loss/reg": 0.003988809883594513, "step": 1231 }, { "epoch": 0.154, "grad_norm": 9.08906364440918, "grad_norm_var": 6.916882811324148, "learning_rate": 0.0001, "loss": 1.2453, "loss/crossentropy": 2.644493818283081, "loss/hidden": 1.046875, "loss/logits": 0.15859541296958923, "loss/reg": 0.003986929077655077, "step": 1232 }, { "epoch": 0.154125, "grad_norm": 2.883406639099121, "grad_norm_var": 6.862648195671316, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.49361252784729, "loss/hidden": 1.140625, "loss/logits": 0.22993981838226318, "loss/reg": 0.003985217306762934, "step": 1233 }, { "epoch": 0.15425, "grad_norm": 2.5608956813812256, "grad_norm_var": 6.7914369374581725, "learning_rate": 0.0001, "loss": 1.0612, "loss/crossentropy": 2.4652509689331055, "loss/hidden": 0.8671875, "loss/logits": 0.15421175956726074, "loss/reg": 0.003983261063694954, "step": 1234 }, { "epoch": 0.154375, "grad_norm": 2.1633126735687256, "grad_norm_var": 6.832556056171203, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.364274024963379, "loss/hidden": 0.8515625, "loss/logits": 0.14098191261291504, "loss/reg": 0.003981346730142832, "step": 1235 }, { "epoch": 0.1545, "grad_norm": 2.7880406379699707, "grad_norm_var": 6.740565356111725, "learning_rate": 0.0001, "loss": 1.2871, "loss/crossentropy": 2.7725203037261963, "loss/hidden": 1.0390625, "loss/logits": 0.20828330516815186, "loss/reg": 0.003979409113526344, "step": 1236 }, { "epoch": 0.154625, "grad_norm": 2.0032100677490234, "grad_norm_var": 6.773356796491393, "learning_rate": 0.0001, "loss": 1.0215, "loss/crossentropy": 2.426025390625, "loss/hidden": 0.84375, "loss/logits": 0.13792634010314941, "loss/reg": 0.003977475222200155, "step": 1237 }, { "epoch": 0.15475, "grad_norm": 2.803041696548462, "grad_norm_var": 6.731182546058613, "learning_rate": 0.0001, "loss": 1.1383, "loss/crossentropy": 2.7533788681030273, "loss/hidden": 0.93359375, "loss/logits": 0.1649492084980011, "loss/reg": 0.0039755916222929955, "step": 1238 }, { "epoch": 0.154875, "grad_norm": 2.6718337535858154, "grad_norm_var": 6.618056769993946, "learning_rate": 0.0001, "loss": 1.1889, "loss/crossentropy": 2.4473633766174316, "loss/hidden": 0.96484375, "loss/logits": 0.1843622773885727, "loss/reg": 0.003973691258579493, "step": 1239 }, { "epoch": 0.155, "grad_norm": 2.8771820068359375, "grad_norm_var": 6.233935399852195, "learning_rate": 0.0001, "loss": 1.1242, "loss/crossentropy": 2.7280538082122803, "loss/hidden": 0.9140625, "loss/logits": 0.17046083509922028, "loss/reg": 0.0039716921746730804, "step": 1240 }, { "epoch": 0.155125, "grad_norm": 2.2324299812316895, "grad_norm_var": 6.208210747435883, "learning_rate": 0.0001, "loss": 1.001, "loss/crossentropy": 2.7301042079925537, "loss/hidden": 0.82421875, "loss/logits": 0.1370982974767685, "loss/reg": 0.003969752229750156, "step": 1241 }, { "epoch": 0.15525, "grad_norm": 2.414759874343872, "grad_norm_var": 6.1905538159398015, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.161578893661499, "loss/hidden": 0.83203125, "loss/logits": 0.13684576749801636, "loss/reg": 0.00396784907206893, "step": 1242 }, { "epoch": 0.155375, "grad_norm": 2.278144359588623, "grad_norm_var": 3.986925647036762, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.1866931915283203, "loss/hidden": 0.8828125, "loss/logits": 0.13629356026649475, "loss/reg": 0.0039659528993070126, "step": 1243 }, { "epoch": 0.1555, "grad_norm": 2.3172078132629395, "grad_norm_var": 2.8692718796241876, "learning_rate": 0.0001, "loss": 1.0269, "loss/crossentropy": 2.7066776752471924, "loss/hidden": 0.84375, "loss/logits": 0.14347760379314423, "loss/reg": 0.003964039962738752, "step": 1244 }, { "epoch": 0.155625, "grad_norm": 2.5149173736572266, "grad_norm_var": 2.8395080960927275, "learning_rate": 0.0001, "loss": 1.147, "loss/crossentropy": 2.6849911212921143, "loss/hidden": 0.93359375, "loss/logits": 0.17379310727119446, "loss/reg": 0.00396218616515398, "step": 1245 }, { "epoch": 0.15575, "grad_norm": 3.1449217796325684, "grad_norm_var": 2.807281033079679, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.340134620666504, "loss/hidden": 1.09375, "loss/logits": 0.20130982995033264, "loss/reg": 0.003960458096116781, "step": 1246 }, { "epoch": 0.155875, "grad_norm": 2.4941136837005615, "grad_norm_var": 2.771865274736687, "learning_rate": 0.0001, "loss": 1.0381, "loss/crossentropy": 2.569666862487793, "loss/hidden": 0.84375, "loss/logits": 0.15474390983581543, "loss/reg": 0.003958826884627342, "step": 1247 }, { "epoch": 0.156, "grad_norm": 2.905941963195801, "grad_norm_var": 0.10203846777971345, "learning_rate": 0.0001, "loss": 1.0627, "loss/crossentropy": 2.7658021450042725, "loss/hidden": 0.87109375, "loss/logits": 0.1519913673400879, "loss/reg": 0.003957261331379414, "step": 1248 }, { "epoch": 0.156125, "grad_norm": 2.955136299133301, "grad_norm_var": 0.10539728005771849, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.783196449279785, "loss/hidden": 0.94140625, "loss/logits": 0.16862890124320984, "loss/reg": 0.0039556450210511684, "step": 1249 }, { "epoch": 0.15625, "grad_norm": 2.683770179748535, "grad_norm_var": 0.10618654391323404, "learning_rate": 0.0001, "loss": 1.2244, "loss/crossentropy": 2.306131362915039, "loss/hidden": 1.0, "loss/logits": 0.18487581610679626, "loss/reg": 0.0039537097327411175, "step": 1250 }, { "epoch": 0.156375, "grad_norm": 2.1440393924713135, "grad_norm_var": 0.10727540575258346, "learning_rate": 0.0001, "loss": 1.0298, "loss/crossentropy": 2.5097758769989014, "loss/hidden": 0.8515625, "loss/logits": 0.13874852657318115, "loss/reg": 0.003951748367398977, "step": 1251 }, { "epoch": 0.1565, "grad_norm": 5.823795795440674, "grad_norm_var": 0.7687695668695557, "learning_rate": 0.0001, "loss": 1.229, "loss/crossentropy": 2.7339322566986084, "loss/hidden": 1.046875, "loss/logits": 0.14258000254631042, "loss/reg": 0.003950015641748905, "step": 1252 }, { "epoch": 0.156625, "grad_norm": 3.07033109664917, "grad_norm_var": 0.7313342744887732, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.9465441703796387, "loss/hidden": 1.03125, "loss/logits": 0.16319361329078674, "loss/reg": 0.003948097582906485, "step": 1253 }, { "epoch": 0.15675, "grad_norm": 2.2648935317993164, "grad_norm_var": 0.7516000874171217, "learning_rate": 0.0001, "loss": 1.0369, "loss/crossentropy": 2.462608814239502, "loss/hidden": 0.83984375, "loss/logits": 0.157545804977417, "loss/reg": 0.003946339711546898, "step": 1254 }, { "epoch": 0.156875, "grad_norm": 8.53079605102539, "grad_norm_var": 2.7972635310947167, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.634783983230591, "loss/hidden": 1.1328125, "loss/logits": 0.1411134898662567, "loss/reg": 0.003944624215364456, "step": 1255 }, { "epoch": 0.157, "grad_norm": 2.3669466972351074, "grad_norm_var": 2.833168083556588, "learning_rate": 0.0001, "loss": 1.1473, "loss/crossentropy": 2.3000428676605225, "loss/hidden": 0.95703125, "loss/logits": 0.15085983276367188, "loss/reg": 0.003942654933780432, "step": 1256 }, { "epoch": 0.157125, "grad_norm": 3.7946250438690186, "grad_norm_var": 2.797930128567604, "learning_rate": 0.0001, "loss": 1.257, "loss/crossentropy": 2.2643165588378906, "loss/hidden": 1.0546875, "loss/logits": 0.16289997100830078, "loss/reg": 0.003940712660551071, "step": 1257 }, { "epoch": 0.15725, "grad_norm": 2.7023470401763916, "grad_norm_var": 2.7717805963911966, "learning_rate": 0.0001, "loss": 1.0415, "loss/crossentropy": 2.6158485412597656, "loss/hidden": 0.8515625, "loss/logits": 0.15053331851959229, "loss/reg": 0.003938745241612196, "step": 1258 }, { "epoch": 0.157375, "grad_norm": 2.1059186458587646, "grad_norm_var": 2.7959400050235406, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.2793667316436768, "loss/hidden": 0.828125, "loss/logits": 0.12795627117156982, "loss/reg": 0.003936750814318657, "step": 1259 }, { "epoch": 0.1575, "grad_norm": 3.712144136428833, "grad_norm_var": 2.74615990110932, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.348719835281372, "loss/hidden": 1.265625, "loss/logits": 0.2434225231409073, "loss/reg": 0.003934717271476984, "step": 1260 }, { "epoch": 0.157625, "grad_norm": 2.565690755844116, "grad_norm_var": 2.7408307436849646, "learning_rate": 0.0001, "loss": 0.9844, "loss/crossentropy": 2.635305404663086, "loss/hidden": 0.80078125, "loss/logits": 0.14425452053546906, "loss/reg": 0.003932674881070852, "step": 1261 }, { "epoch": 0.15775, "grad_norm": 2.1676135063171387, "grad_norm_var": 2.824524782775095, "learning_rate": 0.0001, "loss": 1.0319, "loss/crossentropy": 2.582833766937256, "loss/hidden": 0.84375, "loss/logits": 0.1488630175590515, "loss/reg": 0.003930607810616493, "step": 1262 }, { "epoch": 0.157875, "grad_norm": 2.2690494060516357, "grad_norm_var": 2.8509140700262754, "learning_rate": 0.0001, "loss": 0.984, "loss/crossentropy": 2.694347620010376, "loss/hidden": 0.8125, "loss/logits": 0.13225148618221283, "loss/reg": 0.003928603138774633, "step": 1263 }, { "epoch": 0.158, "grad_norm": 3.369201183319092, "grad_norm_var": 2.842832034310379, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.450927972793579, "loss/hidden": 0.94140625, "loss/logits": 0.1599656641483307, "loss/reg": 0.003926686476916075, "step": 1264 }, { "epoch": 0.158125, "grad_norm": 1.9869253635406494, "grad_norm_var": 2.9437333300576247, "learning_rate": 0.0001, "loss": 1.0556, "loss/crossentropy": 2.3994815349578857, "loss/hidden": 0.87890625, "loss/logits": 0.1374390572309494, "loss/reg": 0.003924795426428318, "step": 1265 }, { "epoch": 0.15825, "grad_norm": 2.7239654064178467, "grad_norm_var": 2.9409477001102284, "learning_rate": 0.0001, "loss": 1.1347, "loss/crossentropy": 2.549830913543701, "loss/hidden": 0.9453125, "loss/logits": 0.15013578534126282, "loss/reg": 0.003923265729099512, "step": 1266 }, { "epoch": 0.158375, "grad_norm": 3.253939151763916, "grad_norm_var": 2.857988200257295, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.297355890274048, "loss/hidden": 1.234375, "loss/logits": 0.2668222188949585, "loss/reg": 0.003921460825949907, "step": 1267 }, { "epoch": 0.1585, "grad_norm": 4.972027778625488, "grad_norm_var": 2.6160556342714547, "learning_rate": 0.0001, "loss": 1.1048, "loss/crossentropy": 2.5779943466186523, "loss/hidden": 0.8828125, "loss/logits": 0.18274636566638947, "loss/reg": 0.003919865936040878, "step": 1268 }, { "epoch": 0.158625, "grad_norm": 3.8399877548217773, "grad_norm_var": 2.635561990200084, "learning_rate": 0.0001, "loss": 1.1315, "loss/crossentropy": 2.8448750972747803, "loss/hidden": 0.9296875, "loss/logits": 0.16261914372444153, "loss/reg": 0.003917965106666088, "step": 1269 }, { "epoch": 0.15875, "grad_norm": 4.097829341888428, "grad_norm_var": 2.595225849251786, "learning_rate": 0.0001, "loss": 1.2206, "loss/crossentropy": 2.9463918209075928, "loss/hidden": 0.98828125, "loss/logits": 0.19318252801895142, "loss/reg": 0.0039165741764009, "step": 1270 }, { "epoch": 0.158875, "grad_norm": 4.908977508544922, "grad_norm_var": 0.9391465897119967, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.5089914798736572, "loss/hidden": 0.96875, "loss/logits": 0.16315871477127075, "loss/reg": 0.003915099892765284, "step": 1271 }, { "epoch": 0.159, "grad_norm": 2.4336774349212646, "grad_norm_var": 0.9322146223506894, "learning_rate": 0.0001, "loss": 1.0449, "loss/crossentropy": 2.3010151386260986, "loss/hidden": 0.85546875, "loss/logits": 0.15030014514923096, "loss/reg": 0.0039135850965976715, "step": 1272 }, { "epoch": 0.159125, "grad_norm": 2.9904537200927734, "grad_norm_var": 0.9068912920584419, "learning_rate": 0.0001, "loss": 1.1666, "loss/crossentropy": 2.428278923034668, "loss/hidden": 0.96484375, "loss/logits": 0.16259220242500305, "loss/reg": 0.0039116572588682175, "step": 1273 }, { "epoch": 0.15925, "grad_norm": 1.83692467212677, "grad_norm_var": 1.0031901798578553, "learning_rate": 0.0001, "loss": 0.9189, "loss/crossentropy": 2.4613776206970215, "loss/hidden": 0.76171875, "loss/logits": 0.11810196936130524, "loss/reg": 0.003909708932042122, "step": 1274 }, { "epoch": 0.159375, "grad_norm": 2.4019277095794678, "grad_norm_var": 0.9703342604960014, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.298656702041626, "loss/hidden": 0.9296875, "loss/logits": 0.14063113927841187, "loss/reg": 0.003907748498022556, "step": 1275 }, { "epoch": 0.1595, "grad_norm": 2.206735372543335, "grad_norm_var": 0.9882309911375784, "learning_rate": 0.0001, "loss": 1.0258, "loss/crossentropy": 2.4418487548828125, "loss/hidden": 0.84375, "loss/logits": 0.1430271565914154, "loss/reg": 0.003905918914824724, "step": 1276 }, { "epoch": 0.159625, "grad_norm": 2.794393539428711, "grad_norm_var": 0.9782088480890478, "learning_rate": 0.0001, "loss": 1.1423, "loss/crossentropy": 2.5516860485076904, "loss/hidden": 0.92578125, "loss/logits": 0.1774648129940033, "loss/reg": 0.003904127748683095, "step": 1277 }, { "epoch": 0.15975, "grad_norm": 2.9287729263305664, "grad_norm_var": 0.928333134335493, "learning_rate": 0.0001, "loss": 1.2932, "loss/crossentropy": 2.533024311065674, "loss/hidden": 1.0625, "loss/logits": 0.19170062243938446, "loss/reg": 0.0039022008422762156, "step": 1278 }, { "epoch": 0.159875, "grad_norm": 2.364227056503296, "grad_norm_var": 0.918818410696285, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.4888012409210205, "loss/hidden": 0.859375, "loss/logits": 0.15442782640457153, "loss/reg": 0.0039004147984087467, "step": 1279 }, { "epoch": 0.16, "grad_norm": 2.9671173095703125, "grad_norm_var": 0.9128487251694849, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.2953407764434814, "loss/hidden": 1.15625, "loss/logits": 0.21034780144691467, "loss/reg": 0.0038986552972346544, "step": 1280 }, { "epoch": 0.160125, "grad_norm": 2.1895735263824463, "grad_norm_var": 0.8868469140494813, "learning_rate": 0.0001, "loss": 1.1023, "loss/crossentropy": 2.418267011642456, "loss/hidden": 0.91015625, "loss/logits": 0.15315604209899902, "loss/reg": 0.0038967591244727373, "step": 1281 }, { "epoch": 0.16025, "grad_norm": 2.0041797161102295, "grad_norm_var": 0.9511806175749209, "learning_rate": 0.0001, "loss": 1.0711, "loss/crossentropy": 2.6270132064819336, "loss/hidden": 0.87890625, "loss/logits": 0.15329018235206604, "loss/reg": 0.0038950201123952866, "step": 1282 }, { "epoch": 0.160375, "grad_norm": 2.7158069610595703, "grad_norm_var": 0.9519147622693618, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.49900221824646, "loss/hidden": 0.84765625, "loss/logits": 0.14673107862472534, "loss/reg": 0.0038934045005589724, "step": 1283 }, { "epoch": 0.1605, "grad_norm": 2.5366370677948, "grad_norm_var": 0.6752056332094691, "learning_rate": 0.0001, "loss": 1.1833, "loss/crossentropy": 2.5442113876342773, "loss/hidden": 0.98046875, "loss/logits": 0.1639258712530136, "loss/reg": 0.003891737898811698, "step": 1284 }, { "epoch": 0.160625, "grad_norm": 3.805074453353882, "grad_norm_var": 0.6705619509398928, "learning_rate": 0.0001, "loss": 1.2758, "loss/crossentropy": 2.0399181842803955, "loss/hidden": 1.078125, "loss/logits": 0.15876121819019318, "loss/reg": 0.003889812156558037, "step": 1285 }, { "epoch": 0.16075, "grad_norm": 2.670008420944214, "grad_norm_var": 0.5554521676123362, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.538208484649658, "loss/hidden": 1.0234375, "loss/logits": 0.18076473474502563, "loss/reg": 0.003887931350618601, "step": 1286 }, { "epoch": 0.160875, "grad_norm": 2.1082379817962646, "grad_norm_var": 0.23374974294705825, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.4281837940216064, "loss/hidden": 0.87109375, "loss/logits": 0.16639548540115356, "loss/reg": 0.003886270336806774, "step": 1287 }, { "epoch": 0.161, "grad_norm": 2.232421398162842, "grad_norm_var": 0.23966051398124927, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.5111913681030273, "loss/hidden": 0.8203125, "loss/logits": 0.1336727738380432, "loss/reg": 0.003884353907778859, "step": 1288 }, { "epoch": 0.161125, "grad_norm": 1.8333826065063477, "grad_norm_var": 0.25492677200505887, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.4361870288848877, "loss/hidden": 0.859375, "loss/logits": 0.13674689829349518, "loss/reg": 0.0038827096577733755, "step": 1289 }, { "epoch": 0.16125, "grad_norm": 2.227004051208496, "grad_norm_var": 0.23126510746358592, "learning_rate": 0.0001, "loss": 1.0776, "loss/crossentropy": 2.1035265922546387, "loss/hidden": 0.88671875, "loss/logits": 0.1520470380783081, "loss/reg": 0.0038811014965176582, "step": 1290 }, { "epoch": 0.161375, "grad_norm": 2.4726340770721436, "grad_norm_var": 0.23066153493828262, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.5521106719970703, "loss/hidden": 1.078125, "loss/logits": 0.20070654153823853, "loss/reg": 0.0038791669066995382, "step": 1291 }, { "epoch": 0.1615, "grad_norm": 2.505643606185913, "grad_norm_var": 0.22441776850007666, "learning_rate": 0.0001, "loss": 0.9609, "loss/crossentropy": 2.562110662460327, "loss/hidden": 0.7890625, "loss/logits": 0.1330765336751938, "loss/reg": 0.003877209033817053, "step": 1292 }, { "epoch": 0.161625, "grad_norm": 2.2074098587036133, "grad_norm_var": 0.22464862758212098, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.4465317726135254, "loss/hidden": 0.8359375, "loss/logits": 0.1476120948791504, "loss/reg": 0.0038752437103539705, "step": 1293 }, { "epoch": 0.16175, "grad_norm": 4.475348949432373, "grad_norm_var": 0.4655478968178611, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.6663005352020264, "loss/hidden": 1.0546875, "loss/logits": 0.1934768706560135, "loss/reg": 0.0038730644155293703, "step": 1294 }, { "epoch": 0.161875, "grad_norm": 3.0794217586517334, "grad_norm_var": 0.4767340552867606, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.463402271270752, "loss/hidden": 0.96484375, "loss/logits": 0.15831388533115387, "loss/reg": 0.0038711209781467915, "step": 1295 }, { "epoch": 0.162, "grad_norm": 2.185600757598877, "grad_norm_var": 0.47945242338888056, "learning_rate": 0.0001, "loss": 1.2293, "loss/crossentropy": 2.240428924560547, "loss/hidden": 1.0078125, "loss/logits": 0.18282610177993774, "loss/reg": 0.003869203384965658, "step": 1296 }, { "epoch": 0.162125, "grad_norm": 2.6001839637756348, "grad_norm_var": 0.46872306833601746, "learning_rate": 0.0001, "loss": 1.1285, "loss/crossentropy": 2.6634023189544678, "loss/hidden": 0.9296875, "loss/logits": 0.16017360985279083, "loss/reg": 0.0038670580834150314, "step": 1297 }, { "epoch": 0.16225, "grad_norm": 2.9891393184661865, "grad_norm_var": 0.4506250664032753, "learning_rate": 0.0001, "loss": 1.2209, "loss/crossentropy": 2.2928833961486816, "loss/hidden": 0.98046875, "loss/logits": 0.2018088847398758, "loss/reg": 0.0038651188369840384, "step": 1298 }, { "epoch": 0.162375, "grad_norm": 2.5674245357513428, "grad_norm_var": 0.45100085978748794, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.604619026184082, "loss/hidden": 0.87109375, "loss/logits": 0.18927830457687378, "loss/reg": 0.003863039892166853, "step": 1299 }, { "epoch": 0.1625, "grad_norm": 5.368757247924805, "grad_norm_var": 0.9072441308021212, "learning_rate": 0.0001, "loss": 1.5845, "loss/crossentropy": 1.9008941650390625, "loss/hidden": 1.234375, "loss/logits": 0.31154388189315796, "loss/reg": 0.0038609288167208433, "step": 1300 }, { "epoch": 0.162625, "grad_norm": 2.402621030807495, "grad_norm_var": 0.8483983819636706, "learning_rate": 0.0001, "loss": 1.224, "loss/crossentropy": 2.582455635070801, "loss/hidden": 1.0078125, "loss/logits": 0.17763689160346985, "loss/reg": 0.0038588044699281454, "step": 1301 }, { "epoch": 0.16275, "grad_norm": 2.1669483184814453, "grad_norm_var": 0.8692672249500539, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.2148444652557373, "loss/hidden": 0.98828125, "loss/logits": 0.1663748174905777, "loss/reg": 0.0038566740695387125, "step": 1302 }, { "epoch": 0.162875, "grad_norm": 2.7048940658569336, "grad_norm_var": 0.8433353029278644, "learning_rate": 0.0001, "loss": 1.0917, "loss/crossentropy": 2.576660394668579, "loss/hidden": 0.89453125, "loss/logits": 0.15860411524772644, "loss/reg": 0.0038547737058252096, "step": 1303 }, { "epoch": 0.163, "grad_norm": 7.6692304611206055, "grad_norm_var": 2.314715920521659, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.1765997409820557, "loss/hidden": 1.5078125, "loss/logits": 0.2035428285598755, "loss/reg": 0.0038528875447809696, "step": 1304 }, { "epoch": 0.163125, "grad_norm": 2.3525166511535645, "grad_norm_var": 2.2445116172134316, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.7200188636779785, "loss/hidden": 0.875, "loss/logits": 0.16918785870075226, "loss/reg": 0.003850834909826517, "step": 1305 }, { "epoch": 0.16325, "grad_norm": 2.1109790802001953, "grad_norm_var": 2.259220587304002, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.4676554203033447, "loss/hidden": 0.81640625, "loss/logits": 0.14051809906959534, "loss/reg": 0.003848861902952194, "step": 1306 }, { "epoch": 0.163375, "grad_norm": 2.2503230571746826, "grad_norm_var": 2.281384886865043, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.4450364112854004, "loss/hidden": 0.8515625, "loss/logits": 0.14489130675792694, "loss/reg": 0.0038469466380774975, "step": 1307 }, { "epoch": 0.1635, "grad_norm": 3.661198139190674, "grad_norm_var": 2.272915770254127, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 3.0266950130462646, "loss/hidden": 1.0390625, "loss/logits": 0.22195252776145935, "loss/reg": 0.0038448853883892298, "step": 1308 }, { "epoch": 0.163625, "grad_norm": 2.6725175380706787, "grad_norm_var": 2.226462629702375, "learning_rate": 0.0001, "loss": 1.1629, "loss/crossentropy": 2.4850494861602783, "loss/hidden": 0.9609375, "loss/logits": 0.16352644562721252, "loss/reg": 0.003842818085104227, "step": 1309 }, { "epoch": 0.16375, "grad_norm": 2.4281601905822754, "grad_norm_var": 2.1412558591766695, "learning_rate": 0.0001, "loss": 1.0183, "loss/crossentropy": 2.629995107650757, "loss/hidden": 0.83984375, "loss/logits": 0.14006099104881287, "loss/reg": 0.0038410108536481857, "step": 1310 }, { "epoch": 0.163875, "grad_norm": 2.779705762863159, "grad_norm_var": 2.146718277972097, "learning_rate": 0.0001, "loss": 1.0696, "loss/crossentropy": 2.845191240310669, "loss/hidden": 0.87890625, "loss/logits": 0.1523093283176422, "loss/reg": 0.0038392541464418173, "step": 1311 }, { "epoch": 0.164, "grad_norm": 2.1593542098999023, "grad_norm_var": 2.1498104356164496, "learning_rate": 0.0001, "loss": 1.1257, "loss/crossentropy": 2.5214879512786865, "loss/hidden": 0.93359375, "loss/logits": 0.15370512008666992, "loss/reg": 0.0038373004645109177, "step": 1312 }, { "epoch": 0.164125, "grad_norm": 2.261361837387085, "grad_norm_var": 2.177543523879501, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.370466947555542, "loss/hidden": 0.9921875, "loss/logits": 0.16246996819972992, "loss/reg": 0.0038352562114596367, "step": 1313 }, { "epoch": 0.16425, "grad_norm": 4.186817169189453, "grad_norm_var": 2.260020426671607, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.2702040672302246, "loss/hidden": 1.1328125, "loss/logits": 0.21508020162582397, "loss/reg": 0.0038333996199071407, "step": 1314 }, { "epoch": 0.164375, "grad_norm": 3.48109769821167, "grad_norm_var": 2.2462280124966996, "learning_rate": 0.0001, "loss": 1.0443, "loss/crossentropy": 2.6975324153900146, "loss/hidden": 0.85546875, "loss/logits": 0.15051524341106415, "loss/reg": 0.0038314287085086107, "step": 1315 }, { "epoch": 0.1645, "grad_norm": 2.3781416416168213, "grad_norm_var": 1.9268796990894694, "learning_rate": 0.0001, "loss": 1.069, "loss/crossentropy": 2.565978527069092, "loss/hidden": 0.8671875, "loss/logits": 0.1635463535785675, "loss/reg": 0.003829606808722019, "step": 1316 }, { "epoch": 0.164625, "grad_norm": 2.532841682434082, "grad_norm_var": 1.9179299858722438, "learning_rate": 0.0001, "loss": 1.1731, "loss/crossentropy": 2.379420757293701, "loss/hidden": 0.9453125, "loss/logits": 0.18955053389072418, "loss/reg": 0.003827982349321246, "step": 1317 }, { "epoch": 0.16475, "grad_norm": 2.824406862258911, "grad_norm_var": 1.8730366601404502, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.745607852935791, "loss/hidden": 0.98828125, "loss/logits": 0.19303223490715027, "loss/reg": 0.003826139261946082, "step": 1318 }, { "epoch": 0.164875, "grad_norm": 2.927591562271118, "grad_norm_var": 1.866532019300668, "learning_rate": 0.0001, "loss": 1.0242, "loss/crossentropy": 2.745173215866089, "loss/hidden": 0.8515625, "loss/logits": 0.13440603017807007, "loss/reg": 0.003824233775958419, "step": 1319 }, { "epoch": 0.165, "grad_norm": 2.249950408935547, "grad_norm_var": 0.3587598970037938, "learning_rate": 0.0001, "loss": 1.1167, "loss/crossentropy": 2.182314395904541, "loss/hidden": 0.9140625, "loss/logits": 0.16442248225212097, "loss/reg": 0.0038224998861551285, "step": 1320 }, { "epoch": 0.165125, "grad_norm": 2.136213779449463, "grad_norm_var": 0.3718083111594938, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.3340518474578857, "loss/hidden": 0.99609375, "loss/logits": 0.18647333979606628, "loss/reg": 0.0038206097669899464, "step": 1321 }, { "epoch": 0.16525, "grad_norm": 2.3353726863861084, "grad_norm_var": 0.3576302941917286, "learning_rate": 0.0001, "loss": 1.0998, "loss/crossentropy": 2.558197498321533, "loss/hidden": 0.890625, "loss/logits": 0.17098355293273926, "loss/reg": 0.003818872617557645, "step": 1322 }, { "epoch": 0.165375, "grad_norm": 2.0035364627838135, "grad_norm_var": 0.376367123736596, "learning_rate": 0.0001, "loss": 0.9672, "loss/crossentropy": 2.0580060482025146, "loss/hidden": 0.8046875, "loss/logits": 0.12435194849967957, "loss/reg": 0.003817170625552535, "step": 1323 }, { "epoch": 0.1655, "grad_norm": 2.704374074935913, "grad_norm_var": 0.3095112579833424, "learning_rate": 0.0001, "loss": 1.2064, "loss/crossentropy": 2.056586742401123, "loss/hidden": 1.03125, "loss/logits": 0.1370464414358139, "loss/reg": 0.003815267700701952, "step": 1324 }, { "epoch": 0.165625, "grad_norm": 2.861746072769165, "grad_norm_var": 0.3128512221250444, "learning_rate": 0.0001, "loss": 1.2048, "loss/crossentropy": 2.649263381958008, "loss/hidden": 0.9921875, "loss/logits": 0.1744484305381775, "loss/reg": 0.0038135608192533255, "step": 1325 }, { "epoch": 0.16575, "grad_norm": 4.303040027618408, "grad_norm_var": 0.4794263231115619, "learning_rate": 0.0001, "loss": 1.396, "loss/crossentropy": 2.5463757514953613, "loss/hidden": 1.0859375, "loss/logits": 0.27196431159973145, "loss/reg": 0.003811680944636464, "step": 1326 }, { "epoch": 0.165875, "grad_norm": 2.6042354106903076, "grad_norm_var": 0.48083927966075424, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.652782917022705, "loss/hidden": 1.0625, "loss/logits": 0.2298596352338791, "loss/reg": 0.003809748450294137, "step": 1327 }, { "epoch": 0.166, "grad_norm": 2.099306344985962, "grad_norm_var": 0.4857685954885028, "learning_rate": 0.0001, "loss": 1.1101, "loss/crossentropy": 2.54836368560791, "loss/hidden": 0.89453125, "loss/logits": 0.17748317122459412, "loss/reg": 0.0038078485522419214, "step": 1328 }, { "epoch": 0.166125, "grad_norm": 5.760173320770264, "grad_norm_var": 1.026126259782783, "learning_rate": 0.0001, "loss": 1.4286, "loss/crossentropy": 2.3031833171844482, "loss/hidden": 1.0703125, "loss/logits": 0.3202149271965027, "loss/reg": 0.003806003602221608, "step": 1329 }, { "epoch": 0.16625, "grad_norm": 2.2108118534088135, "grad_norm_var": 0.9474122587329703, "learning_rate": 0.0001, "loss": 1.0457, "loss/crossentropy": 2.5074656009674072, "loss/hidden": 0.859375, "loss/logits": 0.14826124906539917, "loss/reg": 0.003804128849878907, "step": 1330 }, { "epoch": 0.166375, "grad_norm": 3.3911850452423096, "grad_norm_var": 0.9402114702613211, "learning_rate": 0.0001, "loss": 1.2478, "loss/crossentropy": 2.4859514236450195, "loss/hidden": 1.015625, "loss/logits": 0.19414550065994263, "loss/reg": 0.0038022748194634914, "step": 1331 }, { "epoch": 0.1665, "grad_norm": 2.3483588695526123, "grad_norm_var": 0.9420719086390626, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.7186105251312256, "loss/hidden": 0.90234375, "loss/logits": 0.16681891679763794, "loss/reg": 0.003800415899604559, "step": 1332 }, { "epoch": 0.166625, "grad_norm": 2.478461503982544, "grad_norm_var": 0.9444172935081412, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.414044141769409, "loss/hidden": 1.046875, "loss/logits": 0.18985730409622192, "loss/reg": 0.003798494813963771, "step": 1333 }, { "epoch": 0.16675, "grad_norm": 3.826606273651123, "grad_norm_var": 1.0067895170922097, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.52571964263916, "loss/hidden": 1.25, "loss/logits": 0.2311021387577057, "loss/reg": 0.0037966351956129074, "step": 1334 }, { "epoch": 0.166875, "grad_norm": 2.2956812381744385, "grad_norm_var": 1.0285842417783648, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.333362579345703, "loss/hidden": 0.8046875, "loss/logits": 0.14301563799381256, "loss/reg": 0.003794773481786251, "step": 1335 }, { "epoch": 0.167, "grad_norm": 2.215144634246826, "grad_norm_var": 1.0314472749301025, "learning_rate": 0.0001, "loss": 1.0197, "loss/crossentropy": 2.6435439586639404, "loss/hidden": 0.83203125, "loss/logits": 0.14977750182151794, "loss/reg": 0.003792962059378624, "step": 1336 }, { "epoch": 0.167125, "grad_norm": 2.289198875427246, "grad_norm_var": 1.0183830630566924, "learning_rate": 0.0001, "loss": 1.0596, "loss/crossentropy": 2.4188473224639893, "loss/hidden": 0.875, "loss/logits": 0.14672745764255524, "loss/reg": 0.0037910572718828917, "step": 1337 }, { "epoch": 0.16725, "grad_norm": 2.585996627807617, "grad_norm_var": 1.0048460491356954, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.459104537963867, "loss/hidden": 0.95703125, "loss/logits": 0.17073991894721985, "loss/reg": 0.0037892020773142576, "step": 1338 }, { "epoch": 0.167375, "grad_norm": 2.2020533084869385, "grad_norm_var": 0.9842790473450236, "learning_rate": 0.0001, "loss": 1.0632, "loss/crossentropy": 2.602936029434204, "loss/hidden": 0.875, "loss/logits": 0.1503719538450241, "loss/reg": 0.00378743140026927, "step": 1339 }, { "epoch": 0.1675, "grad_norm": 2.0642025470733643, "grad_norm_var": 1.0253976633091109, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.5779831409454346, "loss/hidden": 0.8359375, "loss/logits": 0.14476892352104187, "loss/reg": 0.0037853880785405636, "step": 1340 }, { "epoch": 0.167625, "grad_norm": 2.0386452674865723, "grad_norm_var": 1.0660144013342174, "learning_rate": 0.0001, "loss": 1.0255, "loss/crossentropy": 2.599996328353882, "loss/hidden": 0.8359375, "loss/logits": 0.15170088410377502, "loss/reg": 0.0037834926042705774, "step": 1341 }, { "epoch": 0.16775, "grad_norm": 2.469545841217041, "grad_norm_var": 0.9073509513912974, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 2.4470956325531006, "loss/hidden": 0.96875, "loss/logits": 0.1660883128643036, "loss/reg": 0.003781634848564863, "step": 1342 }, { "epoch": 0.167875, "grad_norm": 2.972076892852783, "grad_norm_var": 0.9120929514276962, "learning_rate": 0.0001, "loss": 1.0577, "loss/crossentropy": 2.2371230125427246, "loss/hidden": 0.87890625, "loss/logits": 0.14097647368907928, "loss/reg": 0.0037799072451889515, "step": 1343 }, { "epoch": 0.168, "grad_norm": 2.47365403175354, "grad_norm_var": 0.8907210075164879, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.574859857559204, "loss/hidden": 0.91015625, "loss/logits": 0.1744510978460312, "loss/reg": 0.003778197569772601, "step": 1344 }, { "epoch": 0.168125, "grad_norm": 3.3522775173187256, "grad_norm_var": 0.27908018822892944, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.3602190017700195, "loss/hidden": 0.94921875, "loss/logits": 0.15688063204288483, "loss/reg": 0.0037766145542263985, "step": 1345 }, { "epoch": 0.16825, "grad_norm": 2.2945148944854736, "grad_norm_var": 0.27544389245511314, "learning_rate": 0.0001, "loss": 1.0967, "loss/crossentropy": 2.0383851528167725, "loss/hidden": 0.94140625, "loss/logits": 0.11755125969648361, "loss/reg": 0.0037747540045529604, "step": 1346 }, { "epoch": 0.168375, "grad_norm": 2.3891069889068604, "grad_norm_var": 0.2299681545095474, "learning_rate": 0.0001, "loss": 1.1306, "loss/crossentropy": 2.3858978748321533, "loss/hidden": 0.91796875, "loss/logits": 0.174921452999115, "loss/reg": 0.003772968426346779, "step": 1347 }, { "epoch": 0.1685, "grad_norm": 2.070483684539795, "grad_norm_var": 0.24109670204344696, "learning_rate": 0.0001, "loss": 0.9534, "loss/crossentropy": 2.616994619369507, "loss/hidden": 0.7890625, "loss/logits": 0.12660518288612366, "loss/reg": 0.003771234769374132, "step": 1348 }, { "epoch": 0.168625, "grad_norm": 2.5373406410217285, "grad_norm_var": 0.24113562481536305, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.5550339221954346, "loss/hidden": 0.9296875, "loss/logits": 0.18165861070156097, "loss/reg": 0.0037693637423217297, "step": 1349 }, { "epoch": 0.16875, "grad_norm": 2.4782979488372803, "grad_norm_var": 0.11712655452237944, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.424433469772339, "loss/hidden": 0.90625, "loss/logits": 0.16192512214183807, "loss/reg": 0.003767443122342229, "step": 1350 }, { "epoch": 0.168875, "grad_norm": 2.141442060470581, "grad_norm_var": 0.12118062200624896, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.4995741844177246, "loss/hidden": 0.828125, "loss/logits": 0.13474415242671967, "loss/reg": 0.0037655706983059645, "step": 1351 }, { "epoch": 0.169, "grad_norm": 3.0165703296661377, "grad_norm_var": 0.1404083277914895, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.753091812133789, "loss/hidden": 0.81640625, "loss/logits": 0.1566263735294342, "loss/reg": 0.0037636584602296352, "step": 1352 }, { "epoch": 0.169125, "grad_norm": 2.388427257537842, "grad_norm_var": 0.1387512034039254, "learning_rate": 0.0001, "loss": 0.9903, "loss/crossentropy": 2.3668482303619385, "loss/hidden": 0.8125, "loss/logits": 0.14017178118228912, "loss/reg": 0.0037617513444274664, "step": 1353 }, { "epoch": 0.16925, "grad_norm": 1.86200749874115, "grad_norm_var": 0.1600401535940278, "learning_rate": 0.0001, "loss": 0.9282, "loss/crossentropy": 2.6024041175842285, "loss/hidden": 0.765625, "loss/logits": 0.12501290440559387, "loss/reg": 0.003759781364351511, "step": 1354 }, { "epoch": 0.169375, "grad_norm": 2.6516194343566895, "grad_norm_var": 0.15949300228247545, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.66141939163208, "loss/hidden": 0.87109375, "loss/logits": 0.1546727865934372, "loss/reg": 0.0037579077761620283, "step": 1355 }, { "epoch": 0.1695, "grad_norm": 2.536879777908325, "grad_norm_var": 0.1491417929813607, "learning_rate": 0.0001, "loss": 1.0189, "loss/crossentropy": 2.509375810623169, "loss/hidden": 0.828125, "loss/logits": 0.1532239019870758, "loss/reg": 0.0037560255732387304, "step": 1356 }, { "epoch": 0.169625, "grad_norm": 2.869354724884033, "grad_norm_var": 0.14343589299983103, "learning_rate": 0.0001, "loss": 1.0562, "loss/crossentropy": 2.676845073699951, "loss/hidden": 0.84765625, "loss/logits": 0.1710355579853058, "loss/reg": 0.0037540122866630554, "step": 1357 }, { "epoch": 0.16975, "grad_norm": 3.0909178256988525, "grad_norm_var": 0.16243653600033944, "learning_rate": 0.0001, "loss": 0.9284, "loss/crossentropy": 2.4438745975494385, "loss/hidden": 0.76953125, "loss/logits": 0.12132885307073593, "loss/reg": 0.003752180142328143, "step": 1358 }, { "epoch": 0.169875, "grad_norm": 2.1628170013427734, "grad_norm_var": 0.16001678424908092, "learning_rate": 0.0001, "loss": 0.9578, "loss/crossentropy": 2.4705512523651123, "loss/hidden": 0.7890625, "loss/logits": 0.13126134872436523, "loss/reg": 0.0037503293715417385, "step": 1359 }, { "epoch": 0.17, "grad_norm": 2.031930923461914, "grad_norm_var": 0.17492556648024848, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.6681065559387207, "loss/hidden": 0.796875, "loss/logits": 0.1323142796754837, "loss/reg": 0.003748575458303094, "step": 1360 }, { "epoch": 0.170125, "grad_norm": 2.473466634750366, "grad_norm_var": 0.12240658206718862, "learning_rate": 0.0001, "loss": 1.1075, "loss/crossentropy": 2.744309902191162, "loss/hidden": 0.8984375, "loss/logits": 0.17157718539237976, "loss/reg": 0.003746669040992856, "step": 1361 }, { "epoch": 0.17025, "grad_norm": 2.9182069301605225, "grad_norm_var": 0.1348531412058311, "learning_rate": 0.0001, "loss": 0.9539, "loss/crossentropy": 2.7478702068328857, "loss/hidden": 0.77734375, "loss/logits": 0.13914981484413147, "loss/reg": 0.003744750050827861, "step": 1362 }, { "epoch": 0.170375, "grad_norm": 2.5078988075256348, "grad_norm_var": 0.13435597843808764, "learning_rate": 0.0001, "loss": 0.99, "loss/crossentropy": 2.5372326374053955, "loss/hidden": 0.8046875, "loss/logits": 0.14791148900985718, "loss/reg": 0.003742997534573078, "step": 1363 }, { "epoch": 0.1705, "grad_norm": 2.880740165710449, "grad_norm_var": 0.13075709652999348, "learning_rate": 0.0001, "loss": 1.2749, "loss/crossentropy": 2.247453451156616, "loss/hidden": 1.0546875, "loss/logits": 0.18284665048122406, "loss/reg": 0.003741198917850852, "step": 1364 }, { "epoch": 0.170625, "grad_norm": 2.802058219909668, "grad_norm_var": 0.13524607605756855, "learning_rate": 0.0001, "loss": 1.0738, "loss/crossentropy": 2.5267081260681152, "loss/hidden": 0.890625, "loss/logits": 0.1457740068435669, "loss/reg": 0.0037392526865005493, "step": 1365 }, { "epoch": 0.17075, "grad_norm": 2.8533761501312256, "grad_norm_var": 0.14041346014174008, "learning_rate": 0.0001, "loss": 1.1666, "loss/crossentropy": 2.620199680328369, "loss/hidden": 0.94921875, "loss/logits": 0.1800282597541809, "loss/reg": 0.0037372722290456295, "step": 1366 }, { "epoch": 0.170875, "grad_norm": 3.108058214187622, "grad_norm_var": 0.14303122083475486, "learning_rate": 0.0001, "loss": 1.223, "loss/crossentropy": 2.40995717048645, "loss/hidden": 1.015625, "loss/logits": 0.17001160979270935, "loss/reg": 0.0037354743108153343, "step": 1367 }, { "epoch": 0.171, "grad_norm": 2.5328550338745117, "grad_norm_var": 0.13302262467848933, "learning_rate": 0.0001, "loss": 1.1243, "loss/crossentropy": 2.48756742477417, "loss/hidden": 0.91015625, "loss/logits": 0.17680081725120544, "loss/reg": 0.0037336875684559345, "step": 1368 }, { "epoch": 0.171125, "grad_norm": 2.2643351554870605, "grad_norm_var": 0.13755867625505444, "learning_rate": 0.0001, "loss": 0.9683, "loss/crossentropy": 2.642735719680786, "loss/hidden": 0.80078125, "loss/logits": 0.13022208213806152, "loss/reg": 0.003731830744072795, "step": 1369 }, { "epoch": 0.17125, "grad_norm": 1.9663736820220947, "grad_norm_var": 0.12801642728851045, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.4266912937164307, "loss/hidden": 0.90625, "loss/logits": 0.13490843772888184, "loss/reg": 0.0037299375981092453, "step": 1370 }, { "epoch": 0.171375, "grad_norm": 2.1911253929138184, "grad_norm_var": 0.1382957404551554, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.4400012493133545, "loss/hidden": 0.90625, "loss/logits": 0.1613638699054718, "loss/reg": 0.003728190902620554, "step": 1371 }, { "epoch": 0.1715, "grad_norm": 2.3214714527130127, "grad_norm_var": 0.14227339992063978, "learning_rate": 0.0001, "loss": 0.9918, "loss/crossentropy": 2.5530331134796143, "loss/hidden": 0.81640625, "loss/logits": 0.13810178637504578, "loss/reg": 0.003726301481947303, "step": 1372 }, { "epoch": 0.171625, "grad_norm": 2.2973015308380127, "grad_norm_var": 0.13920199708697206, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.51094913482666, "loss/hidden": 0.82421875, "loss/logits": 0.13557234406471252, "loss/reg": 0.0037244099657982588, "step": 1373 }, { "epoch": 0.17175, "grad_norm": 2.0010879039764404, "grad_norm_var": 0.1312278234613122, "learning_rate": 0.0001, "loss": 1.0181, "loss/crossentropy": 2.556290864944458, "loss/hidden": 0.8359375, "loss/logits": 0.1449136734008789, "loss/reg": 0.00372238177806139, "step": 1374 }, { "epoch": 0.171875, "grad_norm": 2.9626214504241943, "grad_norm_var": 0.13982906840783826, "learning_rate": 0.0001, "loss": 1.1252, "loss/crossentropy": 2.6870641708374023, "loss/hidden": 0.9453125, "loss/logits": 0.14271126687526703, "loss/reg": 0.003720562905073166, "step": 1375 }, { "epoch": 0.172, "grad_norm": 2.2013375759124756, "grad_norm_var": 0.1308908021708324, "learning_rate": 0.0001, "loss": 0.9615, "loss/crossentropy": 2.4653432369232178, "loss/hidden": 0.79296875, "loss/logits": 0.13133826851844788, "loss/reg": 0.0037186951376497746, "step": 1376 }, { "epoch": 0.172125, "grad_norm": 1.8874136209487915, "grad_norm_var": 0.15580902298580662, "learning_rate": 0.0001, "loss": 0.9726, "loss/crossentropy": 2.7080636024475098, "loss/hidden": 0.80078125, "loss/logits": 0.13469372689723969, "loss/reg": 0.0037167875561863184, "step": 1377 }, { "epoch": 0.17225, "grad_norm": 2.2416839599609375, "grad_norm_var": 0.14497829998406733, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.523420572280884, "loss/hidden": 0.8515625, "loss/logits": 0.15487292408943176, "loss/reg": 0.003714931197464466, "step": 1378 }, { "epoch": 0.172375, "grad_norm": 2.8195207118988037, "grad_norm_var": 0.15392134715338787, "learning_rate": 0.0001, "loss": 1.1432, "loss/crossentropy": 2.4862663745880127, "loss/hidden": 0.99609375, "loss/logits": 0.10999385267496109, "loss/reg": 0.003713154001161456, "step": 1379 }, { "epoch": 0.1725, "grad_norm": 2.118751287460327, "grad_norm_var": 0.14728210095098948, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.33610200881958, "loss/hidden": 0.875, "loss/logits": 0.14247506856918335, "loss/reg": 0.0037111735437065363, "step": 1380 }, { "epoch": 0.172625, "grad_norm": 2.783078193664551, "grad_norm_var": 0.14631392823386963, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.6499156951904297, "loss/hidden": 0.96484375, "loss/logits": 0.17454375326633453, "loss/reg": 0.0037092994898557663, "step": 1381 }, { "epoch": 0.17275, "grad_norm": 2.4130048751831055, "grad_norm_var": 0.13236574600067233, "learning_rate": 0.0001, "loss": 0.9868, "loss/crossentropy": 2.5483062267303467, "loss/hidden": 0.80859375, "loss/logits": 0.14112114906311035, "loss/reg": 0.003707532538101077, "step": 1382 }, { "epoch": 0.172875, "grad_norm": 2.2584409713745117, "grad_norm_var": 0.09521777507376417, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.5460643768310547, "loss/hidden": 0.9609375, "loss/logits": 0.16338081657886505, "loss/reg": 0.0037057846784591675, "step": 1383 }, { "epoch": 0.173, "grad_norm": 3.7609903812408447, "grad_norm_var": 0.2229059105024603, "learning_rate": 0.0001, "loss": 1.0447, "loss/crossentropy": 2.775035858154297, "loss/hidden": 0.85546875, "loss/logits": 0.15221986174583435, "loss/reg": 0.0037039562594145536, "step": 1384 }, { "epoch": 0.173125, "grad_norm": 2.734567642211914, "grad_norm_var": 0.22787300757804296, "learning_rate": 0.0001, "loss": 1.0205, "loss/crossentropy": 2.524040937423706, "loss/hidden": 0.83984375, "loss/logits": 0.14358995854854584, "loss/reg": 0.003702066373080015, "step": 1385 }, { "epoch": 0.17325, "grad_norm": 25.204544067382812, "grad_norm_var": 32.52689382508577, "learning_rate": 0.0001, "loss": 1.2611, "loss/crossentropy": 2.7667076587677, "loss/hidden": 1.078125, "loss/logits": 0.1459766924381256, "loss/reg": 0.0037002949975430965, "step": 1386 }, { "epoch": 0.173375, "grad_norm": 6.416390419006348, "grad_norm_var": 32.687121260827944, "learning_rate": 0.0001, "loss": 1.2171, "loss/crossentropy": 2.594097137451172, "loss/hidden": 1.0546875, "loss/logits": 0.1254766285419464, "loss/reg": 0.0036984088364988565, "step": 1387 }, { "epoch": 0.1735, "grad_norm": 1.9167765378952026, "grad_norm_var": 32.796098433775775, "learning_rate": 0.0001, "loss": 0.9744, "loss/crossentropy": 2.2913591861724854, "loss/hidden": 0.8125, "loss/logits": 0.12497787177562714, "loss/reg": 0.003696783911436796, "step": 1388 }, { "epoch": 0.173625, "grad_norm": 2.2160396575927734, "grad_norm_var": 32.816325970432494, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.8192343711853027, "loss/hidden": 0.8046875, "loss/logits": 0.13359323143959045, "loss/reg": 0.0036951396614313126, "step": 1389 }, { "epoch": 0.17375, "grad_norm": 2.288888931274414, "grad_norm_var": 32.74015382821924, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.637406826019287, "loss/hidden": 0.97265625, "loss/logits": 0.17314687371253967, "loss/reg": 0.0036930718924850225, "step": 1390 }, { "epoch": 0.173875, "grad_norm": 2.411794662475586, "grad_norm_var": 32.8455146358098, "learning_rate": 0.0001, "loss": 1.0526, "loss/crossentropy": 2.189383029937744, "loss/hidden": 0.86328125, "loss/logits": 0.1524190902709961, "loss/reg": 0.0036910499911755323, "step": 1391 }, { "epoch": 0.174, "grad_norm": 4.155782222747803, "grad_norm_var": 32.58828549446247, "learning_rate": 0.0001, "loss": 1.0565, "loss/crossentropy": 2.463545560836792, "loss/hidden": 0.87890625, "loss/logits": 0.14067277312278748, "loss/reg": 0.0036890122573822737, "step": 1392 }, { "epoch": 0.174125, "grad_norm": 5.347140312194824, "grad_norm_var": 32.25727325951478, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.153430223464966, "loss/hidden": 1.1171875, "loss/logits": 0.1673976629972458, "loss/reg": 0.0036869607865810394, "step": 1393 }, { "epoch": 0.17425, "grad_norm": 2.7766270637512207, "grad_norm_var": 32.11815070371227, "learning_rate": 0.0001, "loss": 1.0493, "loss/crossentropy": 2.4740800857543945, "loss/hidden": 0.88671875, "loss/logits": 0.1257045567035675, "loss/reg": 0.003685306990519166, "step": 1394 }, { "epoch": 0.174375, "grad_norm": 4.125362396240234, "grad_norm_var": 31.936244846904135, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.216123342514038, "loss/hidden": 1.1328125, "loss/logits": 0.1926833689212799, "loss/reg": 0.003683644812554121, "step": 1395 }, { "epoch": 0.1745, "grad_norm": 2.5013086795806885, "grad_norm_var": 31.82097080900541, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.2907092571258545, "loss/hidden": 0.98046875, "loss/logits": 0.14458391070365906, "loss/reg": 0.00368195166811347, "step": 1396 }, { "epoch": 0.174625, "grad_norm": 2.9556281566619873, "grad_norm_var": 31.78144628269959, "learning_rate": 0.0001, "loss": 1.3064, "loss/crossentropy": 2.600534439086914, "loss/hidden": 0.98828125, "loss/logits": 0.2812826335430145, "loss/reg": 0.0036801020614802837, "step": 1397 }, { "epoch": 0.17475, "grad_norm": 2.312662363052368, "grad_norm_var": 31.811237788762753, "learning_rate": 0.0001, "loss": 1.2056, "loss/crossentropy": 2.346238613128662, "loss/hidden": 1.0078125, "loss/logits": 0.16105079650878906, "loss/reg": 0.0036781977396458387, "step": 1398 }, { "epoch": 0.174875, "grad_norm": 2.161569356918335, "grad_norm_var": 31.841893155076757, "learning_rate": 0.0001, "loss": 1.0891, "loss/crossentropy": 2.6312341690063477, "loss/hidden": 0.8984375, "loss/logits": 0.15386250615119934, "loss/reg": 0.0036762990057468414, "step": 1399 }, { "epoch": 0.175, "grad_norm": 2.7927346229553223, "grad_norm_var": 32.00627187711323, "learning_rate": 0.0001, "loss": 1.0835, "loss/crossentropy": 2.3059043884277344, "loss/hidden": 0.90234375, "loss/logits": 0.1444191038608551, "loss/reg": 0.0036745734978467226, "step": 1400 }, { "epoch": 0.175125, "grad_norm": 3.7161078453063965, "grad_norm_var": 31.832840403479917, "learning_rate": 0.0001, "loss": 1.2519, "loss/crossentropy": 2.26311993598938, "loss/hidden": 1.015625, "loss/logits": 0.19954730570316315, "loss/reg": 0.0036726652178913355, "step": 1401 }, { "epoch": 0.17525, "grad_norm": 2.494056224822998, "grad_norm_var": 1.6194340047826798, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.2084007263183594, "loss/hidden": 0.97265625, "loss/logits": 0.16313332319259644, "loss/reg": 0.00367086473852396, "step": 1402 }, { "epoch": 0.175375, "grad_norm": 8.896705627441406, "grad_norm_var": 3.0802516385388357, "learning_rate": 0.0001, "loss": 1.8271, "loss/crossentropy": 2.3072290420532227, "loss/hidden": 1.546875, "loss/logits": 0.24357590079307556, "loss/reg": 0.0036691350396722555, "step": 1403 }, { "epoch": 0.1755, "grad_norm": 2.695955753326416, "grad_norm_var": 2.972744932112194, "learning_rate": 0.0001, "loss": 1.2248, "loss/crossentropy": 2.350301504135132, "loss/hidden": 1.015625, "loss/logits": 0.1725081503391266, "loss/reg": 0.003667246550321579, "step": 1404 }, { "epoch": 0.175625, "grad_norm": 2.44710373878479, "grad_norm_var": 2.940667944838992, "learning_rate": 0.0001, "loss": 1.1112, "loss/crossentropy": 2.5243077278137207, "loss/hidden": 0.921875, "loss/logits": 0.15269093215465546, "loss/reg": 0.0036653466522693634, "step": 1405 }, { "epoch": 0.17575, "grad_norm": 2.1895623207092285, "grad_norm_var": 2.9557342642141196, "learning_rate": 0.0001, "loss": 1.035, "loss/crossentropy": 2.4991495609283447, "loss/hidden": 0.85546875, "loss/logits": 0.14287039637565613, "loss/reg": 0.003663522657006979, "step": 1406 }, { "epoch": 0.175875, "grad_norm": 6.32980489730835, "grad_norm_var": 3.412629436693123, "learning_rate": 0.0001, "loss": 1.4889, "loss/crossentropy": 2.193056583404541, "loss/hidden": 1.265625, "loss/logits": 0.18663693964481354, "loss/reg": 0.0036618507001549006, "step": 1407 }, { "epoch": 0.176, "grad_norm": 2.4646599292755127, "grad_norm_var": 3.4702546151327094, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.23488450050354, "loss/hidden": 0.984375, "loss/logits": 0.17051902413368225, "loss/reg": 0.003659995039924979, "step": 1408 }, { "epoch": 0.176125, "grad_norm": 2.211646318435669, "grad_norm_var": 3.317894410006067, "learning_rate": 0.0001, "loss": 0.9964, "loss/crossentropy": 2.5030038356781006, "loss/hidden": 0.8125, "loss/logits": 0.1473085880279541, "loss/reg": 0.0036581193562597036, "step": 1409 }, { "epoch": 0.17625, "grad_norm": 2.623199224472046, "grad_norm_var": 3.330419454675635, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.550668716430664, "loss/hidden": 1.0859375, "loss/logits": 0.18897610902786255, "loss/reg": 0.0036564578767865896, "step": 1410 }, { "epoch": 0.176375, "grad_norm": 2.587066173553467, "grad_norm_var": 3.3105432674443476, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.560816764831543, "loss/hidden": 0.8359375, "loss/logits": 0.14094150066375732, "loss/reg": 0.003654823638498783, "step": 1411 }, { "epoch": 0.1765, "grad_norm": 3.003955602645874, "grad_norm_var": 3.278755120231675, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.602020740509033, "loss/hidden": 0.9296875, "loss/logits": 0.14699101448059082, "loss/reg": 0.0036529425997287035, "step": 1412 }, { "epoch": 0.176625, "grad_norm": 2.4333064556121826, "grad_norm_var": 3.315795478379337, "learning_rate": 0.0001, "loss": 0.9971, "loss/crossentropy": 2.402600049972534, "loss/hidden": 0.82421875, "loss/logits": 0.13639256358146667, "loss/reg": 0.003651064820587635, "step": 1413 }, { "epoch": 0.17675, "grad_norm": 3.2620654106140137, "grad_norm_var": 3.2585387544687285, "learning_rate": 0.0001, "loss": 1.2037, "loss/crossentropy": 2.5123348236083984, "loss/hidden": 0.98828125, "loss/logits": 0.17892900109291077, "loss/reg": 0.0036491919308900833, "step": 1414 }, { "epoch": 0.176875, "grad_norm": 2.7178690433502197, "grad_norm_var": 3.195713317595645, "learning_rate": 0.0001, "loss": 1.1421, "loss/crossentropy": 2.5939574241638184, "loss/hidden": 0.94140625, "loss/logits": 0.16417454183101654, "loss/reg": 0.003647380042821169, "step": 1415 }, { "epoch": 0.177, "grad_norm": 2.7069778442382812, "grad_norm_var": 3.2020201720099597, "learning_rate": 0.0001, "loss": 1.0438, "loss/crossentropy": 2.4623637199401855, "loss/hidden": 0.859375, "loss/logits": 0.14793148636817932, "loss/reg": 0.0036454948130995035, "step": 1416 }, { "epoch": 0.177125, "grad_norm": 3.3774526119232178, "grad_norm_var": 3.1903428630054806, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.493734836578369, "loss/hidden": 1.03125, "loss/logits": 0.1627763956785202, "loss/reg": 0.003643598174676299, "step": 1417 }, { "epoch": 0.17725, "grad_norm": 3.0849242210388184, "grad_norm_var": 3.1504347640183825, "learning_rate": 0.0001, "loss": 1.1358, "loss/crossentropy": 2.631613254547119, "loss/hidden": 0.92578125, "loss/logits": 0.1735854148864746, "loss/reg": 0.0036417231895029545, "step": 1418 }, { "epoch": 0.177375, "grad_norm": 2.294229030609131, "grad_norm_var": 0.9608081109988983, "learning_rate": 0.0001, "loss": 1.0362, "loss/crossentropy": 2.3616018295288086, "loss/hidden": 0.8515625, "loss/logits": 0.1482659876346588, "loss/reg": 0.003640011651441455, "step": 1419 }, { "epoch": 0.1775, "grad_norm": 2.5457763671875, "grad_norm_var": 0.9663407595303662, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.293968439102173, "loss/hidden": 0.92578125, "loss/logits": 0.1531478762626648, "loss/reg": 0.003638186492025852, "step": 1420 }, { "epoch": 0.177625, "grad_norm": 2.2807705402374268, "grad_norm_var": 0.9779472660718359, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.3743157386779785, "loss/hidden": 0.859375, "loss/logits": 0.14730778336524963, "loss/reg": 0.003636348759755492, "step": 1421 }, { "epoch": 0.17775, "grad_norm": 3.111150026321411, "grad_norm_var": 0.9459346801334104, "learning_rate": 0.0001, "loss": 1.445, "loss/crossentropy": 2.40647292137146, "loss/hidden": 1.1875, "loss/logits": 0.22118544578552246, "loss/reg": 0.0036345720291137695, "step": 1422 }, { "epoch": 0.177875, "grad_norm": 2.957676410675049, "grad_norm_var": 0.13237886720594336, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.2662835121154785, "loss/hidden": 1.1953125, "loss/logits": 0.22237327694892883, "loss/reg": 0.0036327948328107595, "step": 1423 }, { "epoch": 0.178, "grad_norm": 2.8840086460113525, "grad_norm_var": 0.12859406693209482, "learning_rate": 0.0001, "loss": 1.1926, "loss/crossentropy": 2.29878830909729, "loss/hidden": 0.98828125, "loss/logits": 0.16803184151649475, "loss/reg": 0.003631110303103924, "step": 1424 }, { "epoch": 0.178125, "grad_norm": 2.56199049949646, "grad_norm_var": 0.11087788727616968, "learning_rate": 0.0001, "loss": 1.1011, "loss/crossentropy": 2.510556221008301, "loss/hidden": 0.88671875, "loss/logits": 0.1780451089143753, "loss/reg": 0.003629653248935938, "step": 1425 }, { "epoch": 0.17825, "grad_norm": 2.547821044921875, "grad_norm_var": 0.11277902977970579, "learning_rate": 0.0001, "loss": 1.1145, "loss/crossentropy": 2.4963526725769043, "loss/hidden": 0.9140625, "loss/logits": 0.1641579121351242, "loss/reg": 0.003628302598372102, "step": 1426 }, { "epoch": 0.178375, "grad_norm": 4.021576404571533, "grad_norm_var": 0.20596057757328007, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.6380093097686768, "loss/hidden": 0.875, "loss/logits": 0.13147366046905518, "loss/reg": 0.00362660875543952, "step": 1427 }, { "epoch": 0.1785, "grad_norm": 2.6613025665283203, "grad_norm_var": 0.2068119512618426, "learning_rate": 0.0001, "loss": 1.1507, "loss/crossentropy": 2.449720859527588, "loss/hidden": 0.94140625, "loss/logits": 0.17299975454807281, "loss/reg": 0.003625056240707636, "step": 1428 }, { "epoch": 0.178625, "grad_norm": 2.810811996459961, "grad_norm_var": 0.19522032187841584, "learning_rate": 0.0001, "loss": 1.2604, "loss/crossentropy": 2.5181782245635986, "loss/hidden": 1.0390625, "loss/logits": 0.18506762385368347, "loss/reg": 0.0036235651932656765, "step": 1429 }, { "epoch": 0.17875, "grad_norm": 2.3159515857696533, "grad_norm_var": 0.20096961733446103, "learning_rate": 0.0001, "loss": 1.0049, "loss/crossentropy": 2.4013493061065674, "loss/hidden": 0.83203125, "loss/logits": 0.13662859797477722, "loss/reg": 0.0036217791493982077, "step": 1430 }, { "epoch": 0.178875, "grad_norm": 2.760279893875122, "grad_norm_var": 0.20058922636977528, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.494328737258911, "loss/hidden": 0.90234375, "loss/logits": 0.17097754776477814, "loss/reg": 0.0036199174355715513, "step": 1431 }, { "epoch": 0.179, "grad_norm": 2.1179399490356445, "grad_norm_var": 0.23018267869760295, "learning_rate": 0.0001, "loss": 0.9199, "loss/crossentropy": 2.328307628631592, "loss/hidden": 0.76171875, "loss/logits": 0.12199117988348007, "loss/reg": 0.0036181595642119646, "step": 1432 }, { "epoch": 0.179125, "grad_norm": 1.972982406616211, "grad_norm_var": 0.23987289746597373, "learning_rate": 0.0001, "loss": 1.1017, "loss/crossentropy": 2.3198962211608887, "loss/hidden": 0.91015625, "loss/logits": 0.15536776185035706, "loss/reg": 0.0036163018085062504, "step": 1433 }, { "epoch": 0.17925, "grad_norm": 2.297431707382202, "grad_norm_var": 0.23643810387164474, "learning_rate": 0.0001, "loss": 1.0198, "loss/crossentropy": 2.3877716064453125, "loss/hidden": 0.84765625, "loss/logits": 0.13596263527870178, "loss/reg": 0.003614293411374092, "step": 1434 }, { "epoch": 0.179375, "grad_norm": 4.3237714767456055, "grad_norm_var": 0.4019732306137899, "learning_rate": 0.0001, "loss": 1.8528, "loss/crossentropy": 2.4731862545013428, "loss/hidden": 1.515625, "loss/logits": 0.3010145425796509, "loss/reg": 0.0036123625468462706, "step": 1435 }, { "epoch": 0.1795, "grad_norm": 2.4513933658599854, "grad_norm_var": 0.4052347077082838, "learning_rate": 0.0001, "loss": 1.0992, "loss/crossentropy": 2.420868396759033, "loss/hidden": 0.91796875, "loss/logits": 0.14515507221221924, "loss/reg": 0.003610546700656414, "step": 1436 }, { "epoch": 0.179625, "grad_norm": 3.088550329208374, "grad_norm_var": 0.39496121989805694, "learning_rate": 0.0001, "loss": 0.9986, "loss/crossentropy": 2.5977416038513184, "loss/hidden": 0.8046875, "loss/logits": 0.1577831655740738, "loss/reg": 0.0036085534375160933, "step": 1437 }, { "epoch": 0.17975, "grad_norm": 2.4659948348999023, "grad_norm_var": 0.3946649959456747, "learning_rate": 0.0001, "loss": 1.12, "loss/crossentropy": 2.3049263954162598, "loss/hidden": 0.9296875, "loss/logits": 0.1542476862668991, "loss/reg": 0.0036067250184714794, "step": 1438 }, { "epoch": 0.179875, "grad_norm": 3.8259365558624268, "grad_norm_var": 0.46409173226906736, "learning_rate": 0.0001, "loss": 1.0567, "loss/crossentropy": 2.495133876800537, "loss/hidden": 0.87890625, "loss/logits": 0.14177045226097107, "loss/reg": 0.0036046463064849377, "step": 1439 }, { "epoch": 0.18, "grad_norm": 4.203658103942871, "grad_norm_var": 0.5843312188094536, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.5536646842956543, "loss/hidden": 0.98828125, "loss/logits": 0.14505554735660553, "loss/reg": 0.0036028120666742325, "step": 1440 }, { "epoch": 0.180125, "grad_norm": 2.6005194187164307, "grad_norm_var": 0.5826787847955602, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.723029613494873, "loss/hidden": 0.87890625, "loss/logits": 0.16591691970825195, "loss/reg": 0.003600981319323182, "step": 1441 }, { "epoch": 0.18025, "grad_norm": 2.6143887042999268, "grad_norm_var": 0.5797933388848217, "learning_rate": 0.0001, "loss": 1.0984, "loss/crossentropy": 2.677304744720459, "loss/hidden": 0.890625, "loss/logits": 0.17178688943386078, "loss/reg": 0.003598999697715044, "step": 1442 }, { "epoch": 0.180375, "grad_norm": 3.0147054195404053, "grad_norm_var": 0.4936957943628516, "learning_rate": 0.0001, "loss": 1.1792, "loss/crossentropy": 2.567859649658203, "loss/hidden": 0.95703125, "loss/logits": 0.18622992932796478, "loss/reg": 0.00359702087007463, "step": 1443 }, { "epoch": 0.1805, "grad_norm": 2.5004806518554688, "grad_norm_var": 0.4992588141750974, "learning_rate": 0.0001, "loss": 1.0141, "loss/crossentropy": 2.335355043411255, "loss/hidden": 0.84765625, "loss/logits": 0.1304875910282135, "loss/reg": 0.0035950199235230684, "step": 1444 }, { "epoch": 0.180625, "grad_norm": 2.9756336212158203, "grad_norm_var": 0.5004185509481144, "learning_rate": 0.0001, "loss": 1.1931, "loss/crossentropy": 2.262080430984497, "loss/hidden": 1.0, "loss/logits": 0.15718932449817657, "loss/reg": 0.00359291210770607, "step": 1445 }, { "epoch": 0.18075, "grad_norm": 2.8728082180023193, "grad_norm_var": 0.48047395147950145, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.4912827014923096, "loss/hidden": 0.91796875, "loss/logits": 0.18385069072246552, "loss/reg": 0.0035907707642763853, "step": 1446 }, { "epoch": 0.180875, "grad_norm": 2.440415143966675, "grad_norm_var": 0.4919916999810859, "learning_rate": 0.0001, "loss": 1.1763, "loss/crossentropy": 2.539290189743042, "loss/hidden": 0.95703125, "loss/logits": 0.18333688378334045, "loss/reg": 0.003588638501241803, "step": 1447 }, { "epoch": 0.181, "grad_norm": 2.3080894947052, "grad_norm_var": 0.4754273782914159, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.552766799926758, "loss/hidden": 0.8515625, "loss/logits": 0.1501522660255432, "loss/reg": 0.0035868044942617416, "step": 1448 }, { "epoch": 0.181125, "grad_norm": 2.343794822692871, "grad_norm_var": 0.43955761846480074, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.1275408267974854, "loss/hidden": 1.109375, "loss/logits": 0.2026042938232422, "loss/reg": 0.0035850289277732372, "step": 1449 }, { "epoch": 0.18125, "grad_norm": 2.139036178588867, "grad_norm_var": 0.45375597061421685, "learning_rate": 0.0001, "loss": 1.0913, "loss/crossentropy": 2.6152234077453613, "loss/hidden": 0.89453125, "loss/logits": 0.16098400950431824, "loss/reg": 0.003583215642720461, "step": 1450 }, { "epoch": 0.181375, "grad_norm": 6.257389545440674, "grad_norm_var": 1.0582259715841054, "learning_rate": 0.0001, "loss": 1.7871, "loss/crossentropy": 2.396705150604248, "loss/hidden": 1.4140625, "loss/logits": 0.3372613787651062, "loss/reg": 0.003581451950594783, "step": 1451 }, { "epoch": 0.1815, "grad_norm": 2.660068988800049, "grad_norm_var": 1.0455046997651758, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.4636411666870117, "loss/hidden": 0.9375, "loss/logits": 0.15687166154384613, "loss/reg": 0.0035797141026705503, "step": 1452 }, { "epoch": 0.181625, "grad_norm": 2.0276317596435547, "grad_norm_var": 1.1060792073261607, "learning_rate": 0.0001, "loss": 1.0576, "loss/crossentropy": 2.645092010498047, "loss/hidden": 0.8671875, "loss/logits": 0.15465402603149414, "loss/reg": 0.003578024450689554, "step": 1453 }, { "epoch": 0.18175, "grad_norm": 2.706411123275757, "grad_norm_var": 1.0940753984711251, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.465183973312378, "loss/hidden": 1.0703125, "loss/logits": 0.1722460836172104, "loss/reg": 0.003576185554265976, "step": 1454 }, { "epoch": 0.181875, "grad_norm": 2.883852243423462, "grad_norm_var": 1.0418023995859433, "learning_rate": 0.0001, "loss": 1.1553, "loss/crossentropy": 2.62016224861145, "loss/hidden": 0.9609375, "loss/logits": 0.15863552689552307, "loss/reg": 0.0035745068453252316, "step": 1455 }, { "epoch": 0.182, "grad_norm": 2.712892532348633, "grad_norm_var": 0.9234243773258602, "learning_rate": 0.0001, "loss": 1.2436, "loss/crossentropy": 2.7399840354919434, "loss/hidden": 1.0078125, "loss/logits": 0.20005175471305847, "loss/reg": 0.0035726907663047314, "step": 1456 }, { "epoch": 0.182125, "grad_norm": 2.2389307022094727, "grad_norm_var": 0.9419911218676927, "learning_rate": 0.0001, "loss": 1.0805, "loss/crossentropy": 2.4630889892578125, "loss/hidden": 0.88671875, "loss/logits": 0.15808376669883728, "loss/reg": 0.003570869332179427, "step": 1457 }, { "epoch": 0.18225, "grad_norm": 2.272373676300049, "grad_norm_var": 0.9574713564477899, "learning_rate": 0.0001, "loss": 1.0423, "loss/crossentropy": 2.420292377471924, "loss/hidden": 0.8515625, "loss/logits": 0.1550384759902954, "loss/reg": 0.003569073276594281, "step": 1458 }, { "epoch": 0.182375, "grad_norm": 1.9881032705307007, "grad_norm_var": 0.9901407757083646, "learning_rate": 0.0001, "loss": 1.0698, "loss/crossentropy": 2.412853240966797, "loss/hidden": 0.890625, "loss/logits": 0.14347431063652039, "loss/reg": 0.0035672772210091352, "step": 1459 }, { "epoch": 0.1825, "grad_norm": 9.163015365600586, "grad_norm_var": 3.5801338990358595, "learning_rate": 0.0001, "loss": 1.2578, "loss/crossentropy": 2.492182731628418, "loss/hidden": 1.0390625, "loss/logits": 0.18306787312030792, "loss/reg": 0.003565459046512842, "step": 1460 }, { "epoch": 0.182625, "grad_norm": 2.5656776428222656, "grad_norm_var": 3.598769741394437, "learning_rate": 0.0001, "loss": 1.001, "loss/crossentropy": 2.5562655925750732, "loss/hidden": 0.82421875, "loss/logits": 0.14114579558372498, "loss/reg": 0.003563658567145467, "step": 1461 }, { "epoch": 0.18275, "grad_norm": 2.286931276321411, "grad_norm_var": 3.6378752514728876, "learning_rate": 0.0001, "loss": 0.9807, "loss/crossentropy": 2.698147773742676, "loss/hidden": 0.8125, "loss/logits": 0.13260522484779358, "loss/reg": 0.0035618396941572428, "step": 1462 }, { "epoch": 0.182875, "grad_norm": 2.371244430541992, "grad_norm_var": 3.643908523891269, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.432371139526367, "loss/hidden": 0.91015625, "loss/logits": 0.16662147641181946, "loss/reg": 0.0035599328111857176, "step": 1463 }, { "epoch": 0.183, "grad_norm": 2.6184229850769043, "grad_norm_var": 3.6189046702026477, "learning_rate": 0.0001, "loss": 1.0458, "loss/crossentropy": 2.947530508041382, "loss/hidden": 0.8671875, "loss/logits": 0.143006831407547, "loss/reg": 0.0035581255797296762, "step": 1464 }, { "epoch": 0.183125, "grad_norm": 2.143239736557007, "grad_norm_var": 3.641031281987516, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.6318516731262207, "loss/hidden": 0.88671875, "loss/logits": 0.14551308751106262, "loss/reg": 0.00355625175870955, "step": 1465 }, { "epoch": 0.18325, "grad_norm": 2.7335619926452637, "grad_norm_var": 3.589745013057074, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.375401020050049, "loss/hidden": 0.9765625, "loss/logits": 0.19566710293293, "loss/reg": 0.0035545255523175, "step": 1466 }, { "epoch": 0.183375, "grad_norm": 2.3009843826293945, "grad_norm_var": 2.903458838671109, "learning_rate": 0.0001, "loss": 1.1591, "loss/crossentropy": 2.5083110332489014, "loss/hidden": 0.94921875, "loss/logits": 0.1743055284023285, "loss/reg": 0.003552833804860711, "step": 1467 }, { "epoch": 0.1835, "grad_norm": 1.9567177295684814, "grad_norm_var": 2.952619415111201, "learning_rate": 0.0001, "loss": 0.9881, "loss/crossentropy": 2.5743629932403564, "loss/hidden": 0.80859375, "loss/logits": 0.14398899674415588, "loss/reg": 0.003551185131072998, "step": 1468 }, { "epoch": 0.183625, "grad_norm": 2.2741682529449463, "grad_norm_var": 2.9306800113679072, "learning_rate": 0.0001, "loss": 1.0048, "loss/crossentropy": 2.4656572341918945, "loss/hidden": 0.83203125, "loss/logits": 0.1372774988412857, "loss/reg": 0.003549505490809679, "step": 1469 }, { "epoch": 0.18375, "grad_norm": 2.9343631267547607, "grad_norm_var": 2.9302919053315675, "learning_rate": 0.0001, "loss": 1.1254, "loss/crossentropy": 2.4442336559295654, "loss/hidden": 0.8984375, "loss/logits": 0.19146251678466797, "loss/reg": 0.003547689877450466, "step": 1470 }, { "epoch": 0.183875, "grad_norm": 2.1364428997039795, "grad_norm_var": 2.9608635231208154, "learning_rate": 0.0001, "loss": 1.0281, "loss/crossentropy": 2.4912197589874268, "loss/hidden": 0.84765625, "loss/logits": 0.14495311677455902, "loss/reg": 0.003545962506905198, "step": 1471 }, { "epoch": 0.184, "grad_norm": 2.244561195373535, "grad_norm_var": 2.979609556239146, "learning_rate": 0.0001, "loss": 0.9575, "loss/crossentropy": 2.461921215057373, "loss/hidden": 0.80078125, "loss/logits": 0.12127329409122467, "loss/reg": 0.0035440947394818068, "step": 1472 }, { "epoch": 0.184125, "grad_norm": 2.6610682010650635, "grad_norm_var": 2.96117686540241, "learning_rate": 0.0001, "loss": 1.0065, "loss/crossentropy": 2.4418022632598877, "loss/hidden": 0.828125, "loss/logits": 0.14295433461666107, "loss/reg": 0.003542231861501932, "step": 1473 }, { "epoch": 0.18425, "grad_norm": 2.17217755317688, "grad_norm_var": 2.9687286207062233, "learning_rate": 0.0001, "loss": 1.0648, "loss/crossentropy": 2.4521894454956055, "loss/hidden": 0.8828125, "loss/logits": 0.146602600812912, "loss/reg": 0.003540375269949436, "step": 1474 }, { "epoch": 0.184375, "grad_norm": 2.5094003677368164, "grad_norm_var": 2.930364197494137, "learning_rate": 0.0001, "loss": 1.0535, "loss/crossentropy": 2.2975356578826904, "loss/hidden": 0.890625, "loss/logits": 0.12753836810588837, "loss/reg": 0.003538495395332575, "step": 1475 }, { "epoch": 0.1845, "grad_norm": 2.8615875244140625, "grad_norm_var": 0.08025149529701801, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.4363348484039307, "loss/hidden": 0.98046875, "loss/logits": 0.1754908561706543, "loss/reg": 0.0035365556832402945, "step": 1476 }, { "epoch": 0.184625, "grad_norm": 2.3295183181762695, "grad_norm_var": 0.07924959319393471, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.641104221343994, "loss/hidden": 0.89453125, "loss/logits": 0.15514951944351196, "loss/reg": 0.0035345894284546375, "step": 1477 }, { "epoch": 0.18475, "grad_norm": 3.0680177211761475, "grad_norm_var": 0.10473031746968976, "learning_rate": 0.0001, "loss": 1.0339, "loss/crossentropy": 2.1963143348693848, "loss/hidden": 0.84765625, "loss/logits": 0.15095466375350952, "loss/reg": 0.003532707691192627, "step": 1478 }, { "epoch": 0.184875, "grad_norm": 2.1066739559173584, "grad_norm_var": 0.11213794701280312, "learning_rate": 0.0001, "loss": 0.9353, "loss/crossentropy": 2.988297700881958, "loss/hidden": 0.78515625, "loss/logits": 0.11482476443052292, "loss/reg": 0.0035308676306158304, "step": 1479 }, { "epoch": 0.185, "grad_norm": 2.4566586017608643, "grad_norm_var": 0.10993979963402485, "learning_rate": 0.0001, "loss": 1.3199, "loss/crossentropy": 2.324228048324585, "loss/hidden": 1.09375, "loss/logits": 0.19089123606681824, "loss/reg": 0.0035288881044834852, "step": 1480 }, { "epoch": 0.185125, "grad_norm": 2.360886573791504, "grad_norm_var": 0.10456219156340367, "learning_rate": 0.0001, "loss": 1.11, "loss/crossentropy": 2.5515263080596924, "loss/hidden": 0.90625, "loss/logits": 0.16846542060375214, "loss/reg": 0.003527080873027444, "step": 1481 }, { "epoch": 0.18525, "grad_norm": 2.6080853939056396, "grad_norm_var": 0.10070469690842856, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.456618070602417, "loss/hidden": 0.9296875, "loss/logits": 0.18056106567382812, "loss/reg": 0.003525231732055545, "step": 1482 }, { "epoch": 0.185375, "grad_norm": 2.5533339977264404, "grad_norm_var": 0.10013072862828926, "learning_rate": 0.0001, "loss": 1.0745, "loss/crossentropy": 2.6200571060180664, "loss/hidden": 0.89453125, "loss/logits": 0.14470210671424866, "loss/reg": 0.003523309249430895, "step": 1483 }, { "epoch": 0.1855, "grad_norm": 2.5463483333587646, "grad_norm_var": 0.08291376946414909, "learning_rate": 0.0001, "loss": 1.1705, "loss/crossentropy": 2.4352331161499023, "loss/hidden": 0.95703125, "loss/logits": 0.1782476007938385, "loss/reg": 0.0035214636009186506, "step": 1484 }, { "epoch": 0.185625, "grad_norm": 2.1944315433502197, "grad_norm_var": 0.08559466734096356, "learning_rate": 0.0001, "loss": 0.9894, "loss/crossentropy": 2.350627899169922, "loss/hidden": 0.82421875, "loss/logits": 0.13000428676605225, "loss/reg": 0.0035196379758417606, "step": 1485 }, { "epoch": 0.18575, "grad_norm": 2.352766990661621, "grad_norm_var": 0.0718094639254095, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.625410556793213, "loss/hidden": 0.921875, "loss/logits": 0.1486971080303192, "loss/reg": 0.003517881967127323, "step": 1486 }, { "epoch": 0.185875, "grad_norm": 2.4021310806274414, "grad_norm_var": 0.06519778826045103, "learning_rate": 0.0001, "loss": 0.9818, "loss/crossentropy": 2.3567581176757812, "loss/hidden": 0.8203125, "loss/logits": 0.1262809932231903, "loss/reg": 0.0035160251427441835, "step": 1487 }, { "epoch": 0.186, "grad_norm": 2.510427951812744, "grad_norm_var": 0.06182866367774575, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.475712537765503, "loss/hidden": 0.921875, "loss/logits": 0.1811387538909912, "loss/reg": 0.003514372045174241, "step": 1488 }, { "epoch": 0.186125, "grad_norm": 2.0962274074554443, "grad_norm_var": 0.0681959672911449, "learning_rate": 0.0001, "loss": 0.9282, "loss/crossentropy": 2.600541830062866, "loss/hidden": 0.76953125, "loss/logits": 0.12352926284074783, "loss/reg": 0.0035127766896039248, "step": 1489 }, { "epoch": 0.18625, "grad_norm": 2.7332022190093994, "grad_norm_var": 0.06741919371529713, "learning_rate": 0.0001, "loss": 1.0116, "loss/crossentropy": 2.5983481407165527, "loss/hidden": 0.8359375, "loss/logits": 0.14054536819458008, "loss/reg": 0.003511229529976845, "step": 1490 }, { "epoch": 0.186375, "grad_norm": 2.262906789779663, "grad_norm_var": 0.07027029030217118, "learning_rate": 0.0001, "loss": 1.2372, "loss/crossentropy": 2.5684587955474854, "loss/hidden": 1.015625, "loss/logits": 0.1864907145500183, "loss/reg": 0.0035094181075692177, "step": 1491 }, { "epoch": 0.1865, "grad_norm": 2.142493963241577, "grad_norm_var": 0.06458349300590362, "learning_rate": 0.0001, "loss": 0.9329, "loss/crossentropy": 2.6954498291015625, "loss/hidden": 0.765625, "loss/logits": 0.13216395676136017, "loss/reg": 0.003507613204419613, "step": 1492 }, { "epoch": 0.186625, "grad_norm": 2.312235116958618, "grad_norm_var": 0.06481126280718001, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.7299952507019043, "loss/hidden": 0.8828125, "loss/logits": 0.1452367901802063, "loss/reg": 0.003506068605929613, "step": 1493 }, { "epoch": 0.18675, "grad_norm": 2.66845703125, "grad_norm_var": 0.040222462022599596, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.5572774410247803, "loss/hidden": 0.96875, "loss/logits": 0.17304158210754395, "loss/reg": 0.0035042495001107454, "step": 1494 }, { "epoch": 0.186875, "grad_norm": 2.0365381240844727, "grad_norm_var": 0.04321872460463207, "learning_rate": 0.0001, "loss": 1.0286, "loss/crossentropy": 2.845195770263672, "loss/hidden": 0.8515625, "loss/logits": 0.14202150702476501, "loss/reg": 0.0035027535632252693, "step": 1495 }, { "epoch": 0.187, "grad_norm": 2.2023096084594727, "grad_norm_var": 0.04499537551175739, "learning_rate": 0.0001, "loss": 1.1091, "loss/crossentropy": 2.4882686138153076, "loss/hidden": 0.91796875, "loss/logits": 0.15609663724899292, "loss/reg": 0.0035012420266866684, "step": 1496 }, { "epoch": 0.187125, "grad_norm": 2.208552122116089, "grad_norm_var": 0.04671054126144914, "learning_rate": 0.0001, "loss": 1.2829, "loss/crossentropy": 2.6602842807769775, "loss/hidden": 1.0546875, "loss/logits": 0.19319944083690643, "loss/reg": 0.0034994245506823063, "step": 1497 }, { "epoch": 0.18725, "grad_norm": 2.562239170074463, "grad_norm_var": 0.04535231939183457, "learning_rate": 0.0001, "loss": 1.0075, "loss/crossentropy": 2.238691806793213, "loss/hidden": 0.8359375, "loss/logits": 0.136610209941864, "loss/reg": 0.0034975947346538305, "step": 1498 }, { "epoch": 0.187375, "grad_norm": 2.401848316192627, "grad_norm_var": 0.04291264261425264, "learning_rate": 0.0001, "loss": 1.0662, "loss/crossentropy": 2.685908079147339, "loss/hidden": 0.890625, "loss/logits": 0.1406560093164444, "loss/reg": 0.0034957744646817446, "step": 1499 }, { "epoch": 0.1875, "grad_norm": 2.12306809425354, "grad_norm_var": 0.04314595548621488, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.2005772590637207, "loss/hidden": 0.9609375, "loss/logits": 0.1661786586046219, "loss/reg": 0.0034941888879984617, "step": 1500 }, { "epoch": 0.187625, "grad_norm": 3.0501010417938232, "grad_norm_var": 0.07394000618437152, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.623453140258789, "loss/hidden": 1.09375, "loss/logits": 0.21640917658805847, "loss/reg": 0.003492384683340788, "step": 1501 }, { "epoch": 0.18775, "grad_norm": 3.257550001144409, "grad_norm_var": 0.12192848616994235, "learning_rate": 0.0001, "loss": 1.3142, "loss/crossentropy": 2.670974016189575, "loss/hidden": 1.0859375, "loss/logits": 0.19331884384155273, "loss/reg": 0.0034905769862234592, "step": 1502 }, { "epoch": 0.187875, "grad_norm": 1.9728327989578247, "grad_norm_var": 0.13536526430902043, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.603712320327759, "loss/hidden": 0.8046875, "loss/logits": 0.1380743682384491, "loss/reg": 0.0034887471701949835, "step": 1503 }, { "epoch": 0.188, "grad_norm": 41.34659194946289, "grad_norm_var": 94.9270262878801, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.142944812774658, "loss/hidden": 1.0859375, "loss/logits": 0.1828157901763916, "loss/reg": 0.003486843081191182, "step": 1504 }, { "epoch": 0.188125, "grad_norm": 2.199233293533325, "grad_norm_var": 94.89006007533027, "learning_rate": 0.0001, "loss": 1.027, "loss/crossentropy": 2.5525684356689453, "loss/hidden": 0.8515625, "loss/logits": 0.14055398106575012, "loss/reg": 0.0034850805532187223, "step": 1505 }, { "epoch": 0.18825, "grad_norm": 2.0132434368133545, "grad_norm_var": 95.12493831851324, "learning_rate": 0.0001, "loss": 0.9913, "loss/crossentropy": 2.3946876525878906, "loss/hidden": 0.82421875, "loss/logits": 0.13224273920059204, "loss/reg": 0.0034832614473998547, "step": 1506 }, { "epoch": 0.188375, "grad_norm": 2.1540908813476562, "grad_norm_var": 95.16245243204516, "learning_rate": 0.0001, "loss": 1.0228, "loss/crossentropy": 2.4561750888824463, "loss/hidden": 0.83203125, "loss/logits": 0.15593823790550232, "loss/reg": 0.003481344785541296, "step": 1507 }, { "epoch": 0.1885, "grad_norm": 2.6965999603271484, "grad_norm_var": 94.98598958949965, "learning_rate": 0.0001, "loss": 1.2656, "loss/crossentropy": 2.38210391998291, "loss/hidden": 1.0390625, "loss/logits": 0.19175776839256287, "loss/reg": 0.003479481441900134, "step": 1508 }, { "epoch": 0.188625, "grad_norm": 3.4180994033813477, "grad_norm_var": 94.69186888365496, "learning_rate": 0.0001, "loss": 1.2663, "loss/crossentropy": 2.4697988033294678, "loss/hidden": 1.046875, "loss/logits": 0.18465159833431244, "loss/reg": 0.0034777566324919462, "step": 1509 }, { "epoch": 0.18875, "grad_norm": 2.273015022277832, "grad_norm_var": 94.81900961164247, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.623250722885132, "loss/hidden": 0.87890625, "loss/logits": 0.16774481534957886, "loss/reg": 0.003475895617157221, "step": 1510 }, { "epoch": 0.188875, "grad_norm": 2.247028112411499, "grad_norm_var": 94.74226385976161, "learning_rate": 0.0001, "loss": 1.117, "loss/crossentropy": 2.674126148223877, "loss/hidden": 0.921875, "loss/logits": 0.1603851318359375, "loss/reg": 0.0034741731360554695, "step": 1511 }, { "epoch": 0.189, "grad_norm": 4.356043338775635, "grad_norm_var": 94.26240397096606, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.6789023876190186, "loss/hidden": 1.0546875, "loss/logits": 0.19006717205047607, "loss/reg": 0.0034723973367363214, "step": 1512 }, { "epoch": 0.189125, "grad_norm": 2.381601572036743, "grad_norm_var": 94.19946382080788, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.090708017349243, "loss/hidden": 1.046875, "loss/logits": 0.1634030044078827, "loss/reg": 0.0034707069862633944, "step": 1513 }, { "epoch": 0.18925, "grad_norm": 4.63723087310791, "grad_norm_var": 93.78628244843514, "learning_rate": 0.0001, "loss": 1.0803, "loss/crossentropy": 2.9273617267608643, "loss/hidden": 0.90625, "loss/logits": 0.13935977220535278, "loss/reg": 0.0034691800829023123, "step": 1514 }, { "epoch": 0.189375, "grad_norm": 3.1558918952941895, "grad_norm_var": 93.54471655609011, "learning_rate": 0.0001, "loss": 1.1021, "loss/crossentropy": 2.6062216758728027, "loss/hidden": 0.8984375, "loss/logits": 0.16897618770599365, "loss/reg": 0.003467726521193981, "step": 1515 }, { "epoch": 0.1895, "grad_norm": 2.6901845932006836, "grad_norm_var": 93.331765452413, "learning_rate": 0.0001, "loss": 1.0927, "loss/crossentropy": 2.563309907913208, "loss/hidden": 0.9140625, "loss/logits": 0.14401212334632874, "loss/reg": 0.003465942805632949, "step": 1516 }, { "epoch": 0.189625, "grad_norm": 2.6145591735839844, "grad_norm_var": 93.47082774818871, "learning_rate": 0.0001, "loss": 1.1674, "loss/crossentropy": 2.80975604057312, "loss/hidden": 0.96484375, "loss/logits": 0.1679481714963913, "loss/reg": 0.0034644228871911764, "step": 1517 }, { "epoch": 0.18975, "grad_norm": 2.1897029876708984, "grad_norm_var": 93.82056409785089, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.4143311977386475, "loss/hidden": 0.94140625, "loss/logits": 0.17049774527549744, "loss/reg": 0.003462952096015215, "step": 1518 }, { "epoch": 0.189875, "grad_norm": 2.565324068069458, "grad_norm_var": 93.59177882800311, "learning_rate": 0.0001, "loss": 1.3113, "loss/crossentropy": 2.0998597145080566, "loss/hidden": 1.0703125, "loss/logits": 0.20640692114830017, "loss/reg": 0.0034615371841937304, "step": 1519 }, { "epoch": 0.19, "grad_norm": 2.4154701232910156, "grad_norm_var": 0.6036209187642118, "learning_rate": 0.0001, "loss": 0.9326, "loss/crossentropy": 2.6523633003234863, "loss/hidden": 0.76171875, "loss/logits": 0.1362442970275879, "loss/reg": 0.0034597725607454777, "step": 1520 }, { "epoch": 0.190125, "grad_norm": 2.2444441318511963, "grad_norm_var": 0.6004258293545394, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.0900661945343018, "loss/hidden": 0.95703125, "loss/logits": 0.136434406042099, "loss/reg": 0.0034582833759486675, "step": 1521 }, { "epoch": 0.19025, "grad_norm": 2.411386728286743, "grad_norm_var": 0.5710476325004736, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.381988286972046, "loss/hidden": 0.99609375, "loss/logits": 0.17528721690177917, "loss/reg": 0.0034567993134260178, "step": 1522 }, { "epoch": 0.190375, "grad_norm": 2.5889010429382324, "grad_norm_var": 0.5466832532559577, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.6577584743499756, "loss/hidden": 0.8984375, "loss/logits": 0.14523936808109283, "loss/reg": 0.003455315949395299, "step": 1523 }, { "epoch": 0.1905, "grad_norm": 2.219857692718506, "grad_norm_var": 0.56780075329749, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.7978203296661377, "loss/hidden": 0.8359375, "loss/logits": 0.16872358322143555, "loss/reg": 0.0034539303742349148, "step": 1524 }, { "epoch": 0.190625, "grad_norm": 2.3635013103485107, "grad_norm_var": 0.5469604537174282, "learning_rate": 0.0001, "loss": 1.0257, "loss/crossentropy": 2.6388540267944336, "loss/hidden": 0.8359375, "loss/logits": 0.15525703132152557, "loss/reg": 0.0034521608613431454, "step": 1525 }, { "epoch": 0.19075, "grad_norm": 2.082015037536621, "grad_norm_var": 0.560359742807311, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.446864128112793, "loss/hidden": 0.921875, "loss/logits": 0.15890395641326904, "loss/reg": 0.003450631396844983, "step": 1526 }, { "epoch": 0.190875, "grad_norm": 3.5639352798461914, "grad_norm_var": 0.5896182471249951, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.174250602722168, "loss/hidden": 1.0234375, "loss/logits": 0.13503766059875488, "loss/reg": 0.0034488984383642673, "step": 1527 }, { "epoch": 0.191, "grad_norm": 3.0047361850738525, "grad_norm_var": 0.4197832623445635, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.883190631866455, "loss/hidden": 1.0625, "loss/logits": 0.1883969008922577, "loss/reg": 0.003447153139859438, "step": 1528 }, { "epoch": 0.191125, "grad_norm": 2.433673858642578, "grad_norm_var": 0.4177730223981217, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.390415668487549, "loss/hidden": 0.9765625, "loss/logits": 0.17427203059196472, "loss/reg": 0.003445402719080448, "step": 1529 }, { "epoch": 0.19125, "grad_norm": 3.7508089542388916, "grad_norm_var": 0.23777977315326002, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.995112895965576, "loss/hidden": 1.015625, "loss/logits": 0.17482982575893402, "loss/reg": 0.003443735418841243, "step": 1530 }, { "epoch": 0.191375, "grad_norm": 2.349748373031616, "grad_norm_var": 0.22331083482360797, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.5757083892822266, "loss/hidden": 0.9453125, "loss/logits": 0.16579627990722656, "loss/reg": 0.00344208930619061, "step": 1531 }, { "epoch": 0.1915, "grad_norm": 2.0953431129455566, "grad_norm_var": 0.23771892232560557, "learning_rate": 0.0001, "loss": 1.0868, "loss/crossentropy": 2.493220567703247, "loss/hidden": 0.8828125, "loss/logits": 0.169611394405365, "loss/reg": 0.003440374741330743, "step": 1532 }, { "epoch": 0.191625, "grad_norm": 2.2157857418060303, "grad_norm_var": 0.24453549562240368, "learning_rate": 0.0001, "loss": 1.071, "loss/crossentropy": 2.383723735809326, "loss/hidden": 0.8984375, "loss/logits": 0.1382179707288742, "loss/reg": 0.0034387765917927027, "step": 1533 }, { "epoch": 0.19175, "grad_norm": 2.3576011657714844, "grad_norm_var": 0.23865884883084618, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.542468547821045, "loss/hidden": 0.828125, "loss/logits": 0.14409103989601135, "loss/reg": 0.0034371281508356333, "step": 1534 }, { "epoch": 0.191875, "grad_norm": 3.1461169719696045, "grad_norm_var": 0.2615933880776604, "learning_rate": 0.0001, "loss": 1.2554, "loss/crossentropy": 2.378319025039673, "loss/hidden": 1.046875, "loss/logits": 0.17414087057113647, "loss/reg": 0.0034355788957327604, "step": 1535 }, { "epoch": 0.192, "grad_norm": 2.0135512351989746, "grad_norm_var": 0.28038375054829506, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 1.9969367980957031, "loss/hidden": 0.90625, "loss/logits": 0.1424512416124344, "loss/reg": 0.0034339565318077803, "step": 1536 }, { "epoch": 0.192125, "grad_norm": 2.7598655223846436, "grad_norm_var": 0.27581093075343593, "learning_rate": 0.0001, "loss": 1.2095, "loss/crossentropy": 2.7056527137756348, "loss/hidden": 1.0, "loss/logits": 0.1751493215560913, "loss/reg": 0.0034322130959481, "step": 1537 }, { "epoch": 0.19225, "grad_norm": 2.116117000579834, "grad_norm_var": 0.28808717203202694, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.7393789291381836, "loss/hidden": 0.87890625, "loss/logits": 0.15042325854301453, "loss/reg": 0.003430649871006608, "step": 1538 }, { "epoch": 0.192375, "grad_norm": 3.571504592895508, "grad_norm_var": 0.35138636847546134, "learning_rate": 0.0001, "loss": 1.3128, "loss/crossentropy": 2.3102715015411377, "loss/hidden": 1.0859375, "loss/logits": 0.19261470437049866, "loss/reg": 0.0034290915355086327, "step": 1539 }, { "epoch": 0.1925, "grad_norm": 2.954730987548828, "grad_norm_var": 0.34517124347645045, "learning_rate": 0.0001, "loss": 1.2081, "loss/crossentropy": 2.3605659008026123, "loss/hidden": 1.0078125, "loss/logits": 0.16598157584667206, "loss/reg": 0.003427294548600912, "step": 1540 }, { "epoch": 0.192625, "grad_norm": 2.188232660293579, "grad_norm_var": 0.3543400274390722, "learning_rate": 0.0001, "loss": 1.0359, "loss/crossentropy": 2.4097092151641846, "loss/hidden": 0.8515625, "loss/logits": 0.15011939406394958, "loss/reg": 0.0034254533238708973, "step": 1541 }, { "epoch": 0.19275, "grad_norm": 1.8244503736495972, "grad_norm_var": 0.37842932295744913, "learning_rate": 0.0001, "loss": 1.1172, "loss/crossentropy": 2.4335646629333496, "loss/hidden": 0.92578125, "loss/logits": 0.15719163417816162, "loss/reg": 0.003423537826165557, "step": 1542 }, { "epoch": 0.192875, "grad_norm": 2.0547943115234375, "grad_norm_var": 0.33619594757236554, "learning_rate": 0.0001, "loss": 1.1579, "loss/crossentropy": 2.5234384536743164, "loss/hidden": 0.9453125, "loss/logits": 0.17836084961891174, "loss/reg": 0.0034218020737171173, "step": 1543 }, { "epoch": 0.193, "grad_norm": 2.1028945446014404, "grad_norm_var": 0.33262686711846395, "learning_rate": 0.0001, "loss": 1.024, "loss/crossentropy": 2.5168957710266113, "loss/hidden": 0.86328125, "loss/logits": 0.1265672892332077, "loss/reg": 0.003420063992962241, "step": 1544 }, { "epoch": 0.193125, "grad_norm": 3.1589317321777344, "grad_norm_var": 0.3594795180238894, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.19030499458313, "loss/hidden": 1.171875, "loss/logits": 0.19384470582008362, "loss/reg": 0.0034181931987404823, "step": 1545 }, { "epoch": 0.19325, "grad_norm": 2.3448944091796875, "grad_norm_var": 0.2562841379896558, "learning_rate": 0.0001, "loss": 1.1032, "loss/crossentropy": 2.6743974685668945, "loss/hidden": 0.91015625, "loss/logits": 0.15885460376739502, "loss/reg": 0.003416434396058321, "step": 1546 }, { "epoch": 0.193375, "grad_norm": 2.1164956092834473, "grad_norm_var": 0.2629084863422197, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.233337879180908, "loss/hidden": 0.96875, "loss/logits": 0.15075814723968506, "loss/reg": 0.0034145053941756487, "step": 1547 }, { "epoch": 0.1935, "grad_norm": 2.2009761333465576, "grad_norm_var": 0.2587680482498602, "learning_rate": 0.0001, "loss": 1.066, "loss/crossentropy": 2.364064931869507, "loss/hidden": 0.87890625, "loss/logits": 0.15300363302230835, "loss/reg": 0.003412702353671193, "step": 1548 }, { "epoch": 0.193625, "grad_norm": 2.375746250152588, "grad_norm_var": 0.2554693062414391, "learning_rate": 0.0001, "loss": 1.1073, "loss/crossentropy": 2.6182949542999268, "loss/hidden": 0.92578125, "loss/logits": 0.14743448793888092, "loss/reg": 0.0034108073450624943, "step": 1549 }, { "epoch": 0.19375, "grad_norm": 2.5327534675598145, "grad_norm_var": 0.25510200809180733, "learning_rate": 0.0001, "loss": 1.0888, "loss/crossentropy": 2.710033416748047, "loss/hidden": 0.88671875, "loss/logits": 0.16803640127182007, "loss/reg": 0.0034089069813489914, "step": 1550 }, { "epoch": 0.193875, "grad_norm": 2.195998430252075, "grad_norm_var": 0.22541138413578582, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.199061393737793, "loss/hidden": 0.9765625, "loss/logits": 0.17465950548648834, "loss/reg": 0.003407144919037819, "step": 1551 }, { "epoch": 0.194, "grad_norm": 2.297307014465332, "grad_norm_var": 0.2155580849353057, "learning_rate": 0.0001, "loss": 0.9825, "loss/crossentropy": 2.3787636756896973, "loss/hidden": 0.82421875, "loss/logits": 0.12426453083753586, "loss/reg": 0.0034053786657750607, "step": 1552 }, { "epoch": 0.194125, "grad_norm": 42.85431671142578, "grad_norm_var": 102.47997721663579, "learning_rate": 0.0001, "loss": 1.0865, "loss/crossentropy": 2.3751256465911865, "loss/hidden": 0.8984375, "loss/logits": 0.15405802428722382, "loss/reg": 0.0034037018194794655, "step": 1553 }, { "epoch": 0.19425, "grad_norm": 2.448805809020996, "grad_norm_var": 102.36204705695503, "learning_rate": 0.0001, "loss": 1.1293, "loss/crossentropy": 2.632035255432129, "loss/hidden": 0.91796875, "loss/logits": 0.17729425430297852, "loss/reg": 0.0034019986633211374, "step": 1554 }, { "epoch": 0.194375, "grad_norm": 2.3281965255737305, "grad_norm_var": 102.68741632356571, "learning_rate": 0.0001, "loss": 1.1374, "loss/crossentropy": 2.444570302963257, "loss/hidden": 0.9375, "loss/logits": 0.16586540639400482, "loss/reg": 0.003400270827114582, "step": 1555 }, { "epoch": 0.1945, "grad_norm": 2.293851852416992, "grad_norm_var": 102.8838099010742, "learning_rate": 0.0001, "loss": 1.0756, "loss/crossentropy": 2.545522928237915, "loss/hidden": 0.875, "loss/logits": 0.16658270359039307, "loss/reg": 0.0033986270427703857, "step": 1556 }, { "epoch": 0.194625, "grad_norm": 4.360113143920898, "grad_norm_var": 102.41291327849744, "learning_rate": 0.0001, "loss": 1.2058, "loss/crossentropy": 2.3416459560394287, "loss/hidden": 1.0234375, "loss/logits": 0.14838361740112305, "loss/reg": 0.003396830288693309, "step": 1557 }, { "epoch": 0.19475, "grad_norm": 2.38606858253479, "grad_norm_var": 102.19721826513539, "learning_rate": 0.0001, "loss": 1.0499, "loss/crossentropy": 2.7640974521636963, "loss/hidden": 0.875, "loss/logits": 0.1409429907798767, "loss/reg": 0.0033951113000512123, "step": 1558 }, { "epoch": 0.194875, "grad_norm": 2.7758493423461914, "grad_norm_var": 101.94624591139775, "learning_rate": 0.0001, "loss": 1.0704, "loss/crossentropy": 2.497638702392578, "loss/hidden": 0.890625, "loss/logits": 0.14583545923233032, "loss/reg": 0.0033934745006263256, "step": 1559 }, { "epoch": 0.195, "grad_norm": 3.479710340499878, "grad_norm_var": 101.52401358472737, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.457615375518799, "loss/hidden": 0.9609375, "loss/logits": 0.1785276234149933, "loss/reg": 0.0033916765823960304, "step": 1560 }, { "epoch": 0.195125, "grad_norm": 2.017589807510376, "grad_norm_var": 101.90605089709193, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.240902900695801, "loss/hidden": 0.90625, "loss/logits": 0.17671313881874084, "loss/reg": 0.0033898656256496906, "step": 1561 }, { "epoch": 0.19525, "grad_norm": 3.5527615547180176, "grad_norm_var": 101.55947999989262, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 1.965632677078247, "loss/hidden": 1.1484375, "loss/logits": 0.16078650951385498, "loss/reg": 0.0033880271948873997, "step": 1562 }, { "epoch": 0.195375, "grad_norm": 2.345229387283325, "grad_norm_var": 101.47058431829664, "learning_rate": 0.0001, "loss": 1.0961, "loss/crossentropy": 2.583942174911499, "loss/hidden": 0.8984375, "loss/logits": 0.1638316810131073, "loss/reg": 0.003386161755770445, "step": 1563 }, { "epoch": 0.1955, "grad_norm": 3.6912682056427, "grad_norm_var": 101.02284512008372, "learning_rate": 0.0001, "loss": 1.1561, "loss/crossentropy": 2.344486713409424, "loss/hidden": 0.9140625, "loss/logits": 0.2081700563430786, "loss/reg": 0.0033844145946204662, "step": 1564 }, { "epoch": 0.195625, "grad_norm": 2.0806732177734375, "grad_norm_var": 101.14121040687311, "learning_rate": 0.0001, "loss": 1.1902, "loss/crossentropy": 2.3066744804382324, "loss/hidden": 0.9765625, "loss/logits": 0.17980894446372986, "loss/reg": 0.0033826008439064026, "step": 1565 }, { "epoch": 0.19575, "grad_norm": 2.6736996173858643, "grad_norm_var": 101.09180955446247, "learning_rate": 0.0001, "loss": 1.1123, "loss/crossentropy": 2.6491916179656982, "loss/hidden": 0.83984375, "loss/logits": 0.23861975967884064, "loss/reg": 0.0033807456493377686, "step": 1566 }, { "epoch": 0.195875, "grad_norm": 2.320620059967041, "grad_norm_var": 101.0422612381744, "learning_rate": 0.0001, "loss": 0.9997, "loss/crossentropy": 2.432460069656372, "loss/hidden": 0.83984375, "loss/logits": 0.12610265612602234, "loss/reg": 0.003379035508260131, "step": 1567 }, { "epoch": 0.196, "grad_norm": 2.170931816101074, "grad_norm_var": 101.09291343176476, "learning_rate": 0.0001, "loss": 1.0037, "loss/crossentropy": 2.6432549953460693, "loss/hidden": 0.83203125, "loss/logits": 0.13785216212272644, "loss/reg": 0.0033771616872400045, "step": 1568 }, { "epoch": 0.196125, "grad_norm": 1.9352695941925049, "grad_norm_var": 0.5014398496234489, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.368238925933838, "loss/hidden": 0.8671875, "loss/logits": 0.15368467569351196, "loss/reg": 0.0033752431627362967, "step": 1569 }, { "epoch": 0.19625, "grad_norm": 1.7521263360977173, "grad_norm_var": 0.5531383546467253, "learning_rate": 0.0001, "loss": 0.964, "loss/crossentropy": 2.388716220855713, "loss/hidden": 0.796875, "loss/logits": 0.13340537250041962, "loss/reg": 0.0033735185861587524, "step": 1570 }, { "epoch": 0.196375, "grad_norm": 2.942868709564209, "grad_norm_var": 0.5515874670900174, "learning_rate": 0.0001, "loss": 1.246, "loss/crossentropy": 2.4898412227630615, "loss/hidden": 1.03125, "loss/logits": 0.18107619881629944, "loss/reg": 0.0033718394115567207, "step": 1571 }, { "epoch": 0.1965, "grad_norm": 2.828531503677368, "grad_norm_var": 0.5423780354131977, "learning_rate": 0.0001, "loss": 1.1128, "loss/crossentropy": 2.526690721511841, "loss/hidden": 0.9140625, "loss/logits": 0.1650082767009735, "loss/reg": 0.0033701006323099136, "step": 1572 }, { "epoch": 0.196625, "grad_norm": 1.8868728876113892, "grad_norm_var": 0.3795729319831989, "learning_rate": 0.0001, "loss": 0.9528, "loss/crossentropy": 2.4025442600250244, "loss/hidden": 0.78125, "loss/logits": 0.13787029683589935, "loss/reg": 0.0033683953806757927, "step": 1573 }, { "epoch": 0.19675, "grad_norm": 2.290252208709717, "grad_norm_var": 0.38227303455985767, "learning_rate": 0.0001, "loss": 1.0692, "loss/crossentropy": 2.349837064743042, "loss/hidden": 0.87109375, "loss/logits": 0.16445474326610565, "loss/reg": 0.0033667993266135454, "step": 1574 }, { "epoch": 0.196875, "grad_norm": 2.190880537033081, "grad_norm_var": 0.385772762292572, "learning_rate": 0.0001, "loss": 0.9696, "loss/crossentropy": 2.2366368770599365, "loss/hidden": 0.796875, "loss/logits": 0.13903136551380157, "loss/reg": 0.0033651133999228477, "step": 1575 }, { "epoch": 0.197, "grad_norm": 3.2513270378112793, "grad_norm_var": 0.35950258294762044, "learning_rate": 0.0001, "loss": 1.2407, "loss/crossentropy": 2.3932385444641113, "loss/hidden": 1.03125, "loss/logits": 0.1757686287164688, "loss/reg": 0.003363401163369417, "step": 1576 }, { "epoch": 0.197125, "grad_norm": 2.902137279510498, "grad_norm_var": 0.35201813546933614, "learning_rate": 0.0001, "loss": 1.3339, "loss/crossentropy": 2.5385305881500244, "loss/hidden": 1.1171875, "loss/logits": 0.18306049704551697, "loss/reg": 0.0033617597073316574, "step": 1577 }, { "epoch": 0.19725, "grad_norm": 2.4924919605255127, "grad_norm_var": 0.2806556923123916, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.913592576980591, "loss/hidden": 0.9140625, "loss/logits": 0.15331600606441498, "loss/reg": 0.003360015107318759, "step": 1578 }, { "epoch": 0.197375, "grad_norm": 2.6192233562469482, "grad_norm_var": 0.2802525663669532, "learning_rate": 0.0001, "loss": 1.2151, "loss/crossentropy": 2.543704032897949, "loss/hidden": 1.0234375, "loss/logits": 0.1581249237060547, "loss/reg": 0.003358310554176569, "step": 1579 }, { "epoch": 0.1975, "grad_norm": 2.3134047985076904, "grad_norm_var": 0.1803902922500375, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.3356986045837402, "loss/hidden": 1.0234375, "loss/logits": 0.15994945168495178, "loss/reg": 0.003356639062985778, "step": 1580 }, { "epoch": 0.197625, "grad_norm": 2.918966054916382, "grad_norm_var": 0.18686370719447395, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.0418221950531006, "loss/hidden": 1.2109375, "loss/logits": 0.2309650182723999, "loss/reg": 0.00335493846796453, "step": 1581 }, { "epoch": 0.19775, "grad_norm": 2.1771440505981445, "grad_norm_var": 0.1886619692371113, "learning_rate": 0.0001, "loss": 0.9935, "loss/crossentropy": 2.502734661102295, "loss/hidden": 0.82421875, "loss/logits": 0.1357189416885376, "loss/reg": 0.0033531710505485535, "step": 1582 }, { "epoch": 0.197875, "grad_norm": 2.2884721755981445, "grad_norm_var": 0.18922569213149815, "learning_rate": 0.0001, "loss": 0.9984, "loss/crossentropy": 2.4832651615142822, "loss/hidden": 0.8125, "loss/logits": 0.15235686302185059, "loss/reg": 0.00335147837176919, "step": 1583 }, { "epoch": 0.198, "grad_norm": 2.547328472137451, "grad_norm_var": 0.18482493667708866, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.575803279876709, "loss/hidden": 0.92578125, "loss/logits": 0.15261293947696686, "loss/reg": 0.003349804785102606, "step": 1584 }, { "epoch": 0.198125, "grad_norm": 3.402043581008911, "grad_norm_var": 0.21694510449547597, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.0987796783447266, "loss/hidden": 1.09375, "loss/logits": 0.17406561970710754, "loss/reg": 0.0033480448182672262, "step": 1585 }, { "epoch": 0.19825, "grad_norm": 2.545072317123413, "grad_norm_var": 0.17185981683361665, "learning_rate": 0.0001, "loss": 1.1464, "loss/crossentropy": 2.5816264152526855, "loss/hidden": 0.9375, "loss/logits": 0.17544244229793549, "loss/reg": 0.003346419893205166, "step": 1586 }, { "epoch": 0.198375, "grad_norm": 3.0395045280456543, "grad_norm_var": 0.17686366063397868, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.302675247192383, "loss/hidden": 1.015625, "loss/logits": 0.19897626340389252, "loss/reg": 0.0033445856533944607, "step": 1587 }, { "epoch": 0.1985, "grad_norm": 2.4645659923553467, "grad_norm_var": 0.17433679981741357, "learning_rate": 0.0001, "loss": 1.1567, "loss/crossentropy": 2.4756343364715576, "loss/hidden": 0.95703125, "loss/logits": 0.16626408696174622, "loss/reg": 0.0033427351154386997, "step": 1588 }, { "epoch": 0.198625, "grad_norm": 2.810246706008911, "grad_norm_var": 0.14190777744166377, "learning_rate": 0.0001, "loss": 1.1902, "loss/crossentropy": 2.5803637504577637, "loss/hidden": 0.9765625, "loss/logits": 0.18018130958080292, "loss/reg": 0.0033410657197237015, "step": 1589 }, { "epoch": 0.19875, "grad_norm": 2.4339115619659424, "grad_norm_var": 0.13648274466220206, "learning_rate": 0.0001, "loss": 1.028, "loss/crossentropy": 2.399829149246216, "loss/hidden": 0.828125, "loss/logits": 0.16653020679950714, "loss/reg": 0.0033393700141459703, "step": 1590 }, { "epoch": 0.198875, "grad_norm": 3.239158868789673, "grad_norm_var": 0.141020529033392, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.2257063388824463, "loss/hidden": 0.9609375, "loss/logits": 0.17314554750919342, "loss/reg": 0.0033375280909240246, "step": 1591 }, { "epoch": 0.199, "grad_norm": 2.437912940979004, "grad_norm_var": 0.12423960548648268, "learning_rate": 0.0001, "loss": 0.922, "loss/crossentropy": 2.4248645305633545, "loss/hidden": 0.765625, "loss/logits": 0.12298044562339783, "loss/reg": 0.00333569198846817, "step": 1592 }, { "epoch": 0.199125, "grad_norm": 2.8941097259521484, "grad_norm_var": 0.12398925250324358, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.6563851833343506, "loss/hidden": 0.93359375, "loss/logits": 0.17815393209457397, "loss/reg": 0.0033339851070195436, "step": 1593 }, { "epoch": 0.19925, "grad_norm": 2.803880214691162, "grad_norm_var": 0.12292982191307994, "learning_rate": 0.0001, "loss": 1.1357, "loss/crossentropy": 2.2759344577789307, "loss/hidden": 0.9609375, "loss/logits": 0.141413152217865, "loss/reg": 0.003332150634378195, "step": 1594 }, { "epoch": 0.199375, "grad_norm": 2.237532138824463, "grad_norm_var": 0.13530315628679926, "learning_rate": 0.0001, "loss": 1.2666, "loss/crossentropy": 2.077180862426758, "loss/hidden": 1.0625, "loss/logits": 0.1708061397075653, "loss/reg": 0.003330171573907137, "step": 1595 }, { "epoch": 0.1995, "grad_norm": 1.8717583417892456, "grad_norm_var": 0.16787872576412224, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.5325214862823486, "loss/hidden": 0.7734375, "loss/logits": 0.12631843984127045, "loss/reg": 0.0033281673677265644, "step": 1596 }, { "epoch": 0.199625, "grad_norm": 2.3740711212158203, "grad_norm_var": 0.16558500323165828, "learning_rate": 0.0001, "loss": 0.9823, "loss/crossentropy": 2.456437349319458, "loss/hidden": 0.81640625, "loss/logits": 0.13264372944831848, "loss/reg": 0.003326438134536147, "step": 1597 }, { "epoch": 0.19975, "grad_norm": 2.2797210216522217, "grad_norm_var": 0.16048771364269968, "learning_rate": 0.0001, "loss": 1.1277, "loss/crossentropy": 2.478566884994507, "loss/hidden": 0.92578125, "loss/logits": 0.16867585480213165, "loss/reg": 0.0033244146034121513, "step": 1598 }, { "epoch": 0.199875, "grad_norm": 2.273390293121338, "grad_norm_var": 0.1611370953897993, "learning_rate": 0.0001, "loss": 1.0944, "loss/crossentropy": 2.3444995880126953, "loss/hidden": 0.91015625, "loss/logits": 0.15098696947097778, "loss/reg": 0.0033223910722881556, "step": 1599 }, { "epoch": 0.2, "grad_norm": 2.5064868927001953, "grad_norm_var": 0.1615466221150351, "learning_rate": 0.0001, "loss": 1.1207, "loss/crossentropy": 2.6761395931243896, "loss/hidden": 0.91796875, "loss/logits": 0.16954849660396576, "loss/reg": 0.003320206655189395, "step": 1600 }, { "epoch": 0.200125, "grad_norm": 2.1371653079986572, "grad_norm_var": 0.12641732646446915, "learning_rate": 0.0001, "loss": 1.1086, "loss/crossentropy": 2.167865037918091, "loss/hidden": 0.921875, "loss/logits": 0.15358451008796692, "loss/reg": 0.0033184492494910955, "step": 1601 }, { "epoch": 0.20025, "grad_norm": 2.7362449169158936, "grad_norm_var": 0.12929521265355667, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 2.7397940158843994, "loss/hidden": 0.9609375, "loss/logits": 0.17860586941242218, "loss/reg": 0.0033167051151394844, "step": 1602 }, { "epoch": 0.200375, "grad_norm": 2.124201536178589, "grad_norm_var": 0.11993136224217782, "learning_rate": 0.0001, "loss": 1.0476, "loss/crossentropy": 2.3766019344329834, "loss/hidden": 0.875, "loss/logits": 0.13947069644927979, "loss/reg": 0.003314658999443054, "step": 1603 }, { "epoch": 0.2005, "grad_norm": 2.1093101501464844, "grad_norm_var": 0.12838562389595007, "learning_rate": 0.0001, "loss": 1.0742, "loss/crossentropy": 2.491436004638672, "loss/hidden": 0.87109375, "loss/logits": 0.16998031735420227, "loss/reg": 0.0033127006608992815, "step": 1604 }, { "epoch": 0.200625, "grad_norm": 2.2037789821624756, "grad_norm_var": 0.12259215079400192, "learning_rate": 0.0001, "loss": 1.1443, "loss/crossentropy": 2.429650068283081, "loss/hidden": 0.93359375, "loss/logits": 0.1775752156972885, "loss/reg": 0.0033109041396528482, "step": 1605 }, { "epoch": 0.20075, "grad_norm": 2.0681684017181396, "grad_norm_var": 0.13009940320814947, "learning_rate": 0.0001, "loss": 0.9251, "loss/crossentropy": 2.5921154022216797, "loss/hidden": 0.7734375, "loss/logits": 0.11855532228946686, "loss/reg": 0.0033087998162955046, "step": 1606 }, { "epoch": 0.200875, "grad_norm": 2.6388769149780273, "grad_norm_var": 0.08494051001129363, "learning_rate": 0.0001, "loss": 1.0706, "loss/crossentropy": 2.5055747032165527, "loss/hidden": 0.89453125, "loss/logits": 0.14303961396217346, "loss/reg": 0.0033070738427340984, "step": 1607 }, { "epoch": 0.201, "grad_norm": 2.181697368621826, "grad_norm_var": 0.08624639517303852, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.4525694847106934, "loss/hidden": 0.84375, "loss/logits": 0.14332106709480286, "loss/reg": 0.003305203514173627, "step": 1608 }, { "epoch": 0.201125, "grad_norm": 2.6697564125061035, "grad_norm_var": 0.07281751738566834, "learning_rate": 0.0001, "loss": 1.0102, "loss/crossentropy": 2.7797605991363525, "loss/hidden": 0.828125, "loss/logits": 0.14901109039783478, "loss/reg": 0.003303457982838154, "step": 1609 }, { "epoch": 0.20125, "grad_norm": 2.582929849624634, "grad_norm_var": 0.06179040816687683, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.530728578567505, "loss/hidden": 0.91015625, "loss/logits": 0.1564064621925354, "loss/reg": 0.0033016535453498363, "step": 1610 }, { "epoch": 0.201375, "grad_norm": 1.8796063661575317, "grad_norm_var": 0.07336041461658145, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.5042145252227783, "loss/hidden": 0.87890625, "loss/logits": 0.14722013473510742, "loss/reg": 0.0032997329253703356, "step": 1611 }, { "epoch": 0.2015, "grad_norm": 3.313866138458252, "grad_norm_var": 0.1229542381526608, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.173633575439453, "loss/hidden": 1.0390625, "loss/logits": 0.17622298002243042, "loss/reg": 0.00329802418127656, "step": 1612 }, { "epoch": 0.201625, "grad_norm": 2.6430091857910156, "grad_norm_var": 0.12726375044356592, "learning_rate": 0.0001, "loss": 1.0964, "loss/crossentropy": 2.1356022357940674, "loss/hidden": 0.91015625, "loss/logits": 0.15332993865013123, "loss/reg": 0.0032957610674202442, "step": 1613 }, { "epoch": 0.20175, "grad_norm": 2.283130645751953, "grad_norm_var": 0.12721126777018643, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.6446938514709473, "loss/hidden": 0.9140625, "loss/logits": 0.15773829817771912, "loss/reg": 0.003294040448963642, "step": 1614 }, { "epoch": 0.201875, "grad_norm": 2.009594678878784, "grad_norm_var": 0.1359073820378919, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.353555679321289, "loss/hidden": 1.0859375, "loss/logits": 0.1957610696554184, "loss/reg": 0.0032923046965152025, "step": 1615 }, { "epoch": 0.202, "grad_norm": 2.2532074451446533, "grad_norm_var": 0.1356617628627058, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.421433210372925, "loss/hidden": 0.890625, "loss/logits": 0.139800027012825, "loss/reg": 0.0032904213294386864, "step": 1616 }, { "epoch": 0.202125, "grad_norm": 2.2480552196502686, "grad_norm_var": 0.1330667309785141, "learning_rate": 0.0001, "loss": 1.1926, "loss/crossentropy": 2.4565775394439697, "loss/hidden": 0.99609375, "loss/logits": 0.16361382603645325, "loss/reg": 0.0032883703242987394, "step": 1617 }, { "epoch": 0.20225, "grad_norm": 4.123042106628418, "grad_norm_var": 0.3206941892301216, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.3067095279693604, "loss/hidden": 1.140625, "loss/logits": 0.2400142401456833, "loss/reg": 0.0032866299152374268, "step": 1618 }, { "epoch": 0.202375, "grad_norm": 2.0010764598846436, "grad_norm_var": 0.3271258788637292, "learning_rate": 0.0001, "loss": 1.0215, "loss/crossentropy": 2.4679181575775146, "loss/hidden": 0.83984375, "loss/logits": 0.14884260296821594, "loss/reg": 0.0032845879904925823, "step": 1619 }, { "epoch": 0.2025, "grad_norm": 2.1155405044555664, "grad_norm_var": 0.3268448163523752, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.7230353355407715, "loss/hidden": 0.9375, "loss/logits": 0.15715520083904266, "loss/reg": 0.0032828382682055235, "step": 1620 }, { "epoch": 0.202625, "grad_norm": 2.0337374210357666, "grad_norm_var": 0.3342560560773141, "learning_rate": 0.0001, "loss": 1.0079, "loss/crossentropy": 2.726454734802246, "loss/hidden": 0.8359375, "loss/logits": 0.13911572098731995, "loss/reg": 0.003280794247984886, "step": 1621 }, { "epoch": 0.20275, "grad_norm": 2.493293285369873, "grad_norm_var": 0.32445634627695763, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.4674136638641357, "loss/hidden": 1.1015625, "loss/logits": 0.1861811727285385, "loss/reg": 0.003278720658272505, "step": 1622 }, { "epoch": 0.202875, "grad_norm": 2.0618650913238525, "grad_norm_var": 0.3320343293044615, "learning_rate": 0.0001, "loss": 1.0074, "loss/crossentropy": 2.5865259170532227, "loss/hidden": 0.83984375, "loss/logits": 0.13479462265968323, "loss/reg": 0.0032766172662377357, "step": 1623 }, { "epoch": 0.203, "grad_norm": 4.24573278427124, "grad_norm_var": 0.5297347853177716, "learning_rate": 0.0001, "loss": 1.0286, "loss/crossentropy": 2.7966721057891846, "loss/hidden": 0.84765625, "loss/logits": 0.1481596827507019, "loss/reg": 0.0032749150414019823, "step": 1624 }, { "epoch": 0.203125, "grad_norm": 4.453259468078613, "grad_norm_var": 0.7546780963902354, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.324061155319214, "loss/hidden": 1.1875, "loss/logits": 0.21765878796577454, "loss/reg": 0.003273224225267768, "step": 1625 }, { "epoch": 0.20325, "grad_norm": 2.1763815879821777, "grad_norm_var": 0.7697989170952411, "learning_rate": 0.0001, "loss": 1.2117, "loss/crossentropy": 2.2315030097961426, "loss/hidden": 1.0234375, "loss/logits": 0.1554989069700241, "loss/reg": 0.0032715124543756247, "step": 1626 }, { "epoch": 0.203375, "grad_norm": 2.147721529006958, "grad_norm_var": 0.7468977871556014, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.5481529235839844, "loss/hidden": 0.953125, "loss/logits": 0.18407508730888367, "loss/reg": 0.0032695841509848833, "step": 1627 }, { "epoch": 0.2035, "grad_norm": 2.1995062828063965, "grad_norm_var": 0.727752660020807, "learning_rate": 0.0001, "loss": 1.0413, "loss/crossentropy": 2.62490177154541, "loss/hidden": 0.86328125, "loss/logits": 0.14532649517059326, "loss/reg": 0.0032678483985364437, "step": 1628 }, { "epoch": 0.203625, "grad_norm": 2.6025071144104004, "grad_norm_var": 0.7275851745925003, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.261453628540039, "loss/hidden": 1.0703125, "loss/logits": 0.2041451334953308, "loss/reg": 0.0032659387215971947, "step": 1629 }, { "epoch": 0.20375, "grad_norm": 2.3895230293273926, "grad_norm_var": 0.7239327077368195, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.437429904937744, "loss/hidden": 0.95703125, "loss/logits": 0.1874021738767624, "loss/reg": 0.003264203667640686, "step": 1630 }, { "epoch": 0.203875, "grad_norm": 2.5536670684814453, "grad_norm_var": 0.6998122275898206, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.308814287185669, "loss/hidden": 1.078125, "loss/logits": 0.17473715543746948, "loss/reg": 0.003262232756242156, "step": 1631 }, { "epoch": 0.204, "grad_norm": 1.893472671508789, "grad_norm_var": 0.7260273238761611, "learning_rate": 0.0001, "loss": 1.0432, "loss/crossentropy": 2.4681177139282227, "loss/hidden": 0.859375, "loss/logits": 0.15119843184947968, "loss/reg": 0.003260491183027625, "step": 1632 }, { "epoch": 0.204125, "grad_norm": 2.6406593322753906, "grad_norm_var": 0.7167848758234858, "learning_rate": 0.0001, "loss": 0.9315, "loss/crossentropy": 2.692917585372925, "loss/hidden": 0.76953125, "loss/logits": 0.12942397594451904, "loss/reg": 0.003258763812482357, "step": 1633 }, { "epoch": 0.20425, "grad_norm": 2.6017487049102783, "grad_norm_var": 0.5592297482073356, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 1.8698807954788208, "loss/hidden": 1.2109375, "loss/logits": 0.2331976294517517, "loss/reg": 0.0032570629846304655, "step": 1634 }, { "epoch": 0.204375, "grad_norm": 2.134735107421875, "grad_norm_var": 0.5507758063156113, "learning_rate": 0.0001, "loss": 0.9424, "loss/crossentropy": 2.657256841659546, "loss/hidden": 0.78515625, "loss/logits": 0.1246921718120575, "loss/reg": 0.003255224786698818, "step": 1635 }, { "epoch": 0.2045, "grad_norm": 2.775542736053467, "grad_norm_var": 0.5400799961918049, "learning_rate": 0.0001, "loss": 1.1952, "loss/crossentropy": 2.3927488327026367, "loss/hidden": 0.98046875, "loss/logits": 0.18224084377288818, "loss/reg": 0.003253570292145014, "step": 1636 }, { "epoch": 0.204625, "grad_norm": 3.4010424613952637, "grad_norm_var": 0.5559319990050954, "learning_rate": 0.0001, "loss": 0.9753, "loss/crossentropy": 2.674126386642456, "loss/hidden": 0.80859375, "loss/logits": 0.1341724693775177, "loss/reg": 0.0032518133521080017, "step": 1637 }, { "epoch": 0.20475, "grad_norm": 2.3625357151031494, "grad_norm_var": 0.5601365603978583, "learning_rate": 0.0001, "loss": 1.1191, "loss/crossentropy": 2.00201416015625, "loss/hidden": 0.93359375, "loss/logits": 0.15301145613193512, "loss/reg": 0.003249979577958584, "step": 1638 }, { "epoch": 0.204875, "grad_norm": 2.233456611633301, "grad_norm_var": 0.5481778857364833, "learning_rate": 0.0001, "loss": 1.0923, "loss/crossentropy": 2.260629892349243, "loss/hidden": 0.9140625, "loss/logits": 0.14572405815124512, "loss/reg": 0.003248338820412755, "step": 1639 }, { "epoch": 0.205, "grad_norm": 2.4094839096069336, "grad_norm_var": 0.37452435323996436, "learning_rate": 0.0001, "loss": 1.0426, "loss/crossentropy": 2.3188180923461914, "loss/hidden": 0.8515625, "loss/logits": 0.15855032205581665, "loss/reg": 0.0032465672120451927, "step": 1640 }, { "epoch": 0.205125, "grad_norm": 2.5767383575439453, "grad_norm_var": 0.12114709294457929, "learning_rate": 0.0001, "loss": 1.107, "loss/crossentropy": 2.5933563709259033, "loss/hidden": 0.92578125, "loss/logits": 0.14877736568450928, "loss/reg": 0.003244933672249317, "step": 1641 }, { "epoch": 0.20525, "grad_norm": 2.55334734916687, "grad_norm_var": 0.11659405774919757, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.26149582862854, "loss/hidden": 1.1953125, "loss/logits": 0.23644839227199554, "loss/reg": 0.0032434100285172462, "step": 1642 }, { "epoch": 0.205375, "grad_norm": 2.3793838024139404, "grad_norm_var": 0.11007918089816542, "learning_rate": 0.0001, "loss": 1.0786, "loss/crossentropy": 2.1200153827667236, "loss/hidden": 0.89453125, "loss/logits": 0.15168824791908264, "loss/reg": 0.0032420321367681026, "step": 1643 }, { "epoch": 0.2055, "grad_norm": 2.159534215927124, "grad_norm_var": 0.11168307348258182, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.5448434352874756, "loss/hidden": 1.078125, "loss/logits": 0.17339983582496643, "loss/reg": 0.003240725724026561, "step": 1644 }, { "epoch": 0.205625, "grad_norm": 2.512781858444214, "grad_norm_var": 0.1107112022419983, "learning_rate": 0.0001, "loss": 1.0038, "loss/crossentropy": 2.5887203216552734, "loss/hidden": 0.83984375, "loss/logits": 0.1315690279006958, "loss/reg": 0.0032390966080129147, "step": 1645 }, { "epoch": 0.20575, "grad_norm": 2.471038818359375, "grad_norm_var": 0.11021265436342276, "learning_rate": 0.0001, "loss": 1.0674, "loss/crossentropy": 2.4827725887298584, "loss/hidden": 0.8984375, "loss/logits": 0.1366003006696701, "loss/reg": 0.003237416036427021, "step": 1646 }, { "epoch": 0.205875, "grad_norm": 2.5491740703582764, "grad_norm_var": 0.11016900462870065, "learning_rate": 0.0001, "loss": 0.9898, "loss/crossentropy": 2.808387279510498, "loss/hidden": 0.8203125, "loss/logits": 0.1371297538280487, "loss/reg": 0.0032356702722609043, "step": 1647 }, { "epoch": 0.206, "grad_norm": 2.476713180541992, "grad_norm_var": 0.08594114936163895, "learning_rate": 0.0001, "loss": 1.1398, "loss/crossentropy": 2.4765665531158447, "loss/hidden": 0.9375, "loss/logits": 0.1700000762939453, "loss/reg": 0.003234060015529394, "step": 1648 }, { "epoch": 0.206125, "grad_norm": 2.1351890563964844, "grad_norm_var": 0.09343219350858452, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.2499215602874756, "loss/hidden": 0.98828125, "loss/logits": 0.176192969083786, "loss/reg": 0.0032324332278221846, "step": 1649 }, { "epoch": 0.20625, "grad_norm": 2.1201257705688477, "grad_norm_var": 0.10032196484461338, "learning_rate": 0.0001, "loss": 1.0841, "loss/crossentropy": 2.6280465126037598, "loss/hidden": 0.890625, "loss/logits": 0.16118907928466797, "loss/reg": 0.003230888629332185, "step": 1650 }, { "epoch": 0.206375, "grad_norm": 2.829747200012207, "grad_norm_var": 0.10100266775164073, "learning_rate": 0.0001, "loss": 1.0978, "loss/crossentropy": 2.610457420349121, "loss/hidden": 0.90234375, "loss/logits": 0.16314734518527985, "loss/reg": 0.003229183377698064, "step": 1651 }, { "epoch": 0.2065, "grad_norm": 2.5310165882110596, "grad_norm_var": 0.09564570596240832, "learning_rate": 0.0001, "loss": 0.9424, "loss/crossentropy": 2.548105478286743, "loss/hidden": 0.7890625, "loss/logits": 0.1211063414812088, "loss/reg": 0.0032274452969431877, "step": 1652 }, { "epoch": 0.206625, "grad_norm": 2.257979393005371, "grad_norm_var": 0.037136142432717394, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.516902446746826, "loss/hidden": 0.97265625, "loss/logits": 0.19221451878547668, "loss/reg": 0.003225695574656129, "step": 1653 }, { "epoch": 0.20675, "grad_norm": 2.3023855686187744, "grad_norm_var": 0.03774205518613461, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.71262788772583, "loss/hidden": 0.80859375, "loss/logits": 0.1330820769071579, "loss/reg": 0.003224144922569394, "step": 1654 }, { "epoch": 0.206875, "grad_norm": 2.521340847015381, "grad_norm_var": 0.0362938578599632, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.643906593322754, "loss/hidden": 0.90625, "loss/logits": 0.15947577357292175, "loss/reg": 0.003222482278943062, "step": 1655 }, { "epoch": 0.207, "grad_norm": 5.88134765625, "grad_norm_var": 0.7828817213139186, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.313748836517334, "loss/hidden": 1.2109375, "loss/logits": 0.17793220281600952, "loss/reg": 0.0032208384945988655, "step": 1656 }, { "epoch": 0.207125, "grad_norm": 2.5916645526885986, "grad_norm_var": 0.7827675255288795, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.4316515922546387, "loss/hidden": 1.03125, "loss/logits": 0.18862737715244293, "loss/reg": 0.0032191697973757982, "step": 1657 }, { "epoch": 0.20725, "grad_norm": 2.2206692695617676, "grad_norm_var": 0.7936192015383082, "learning_rate": 0.0001, "loss": 1.0245, "loss/crossentropy": 2.6382174491882324, "loss/hidden": 0.84765625, "loss/logits": 0.14466163516044617, "loss/reg": 0.00321741821244359, "step": 1658 }, { "epoch": 0.207375, "grad_norm": 2.8307604789733887, "grad_norm_var": 0.7917962945035991, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.4240641593933105, "loss/hidden": 1.15625, "loss/logits": 0.21726641058921814, "loss/reg": 0.0032156051602214575, "step": 1659 }, { "epoch": 0.2075, "grad_norm": 2.049522638320923, "grad_norm_var": 0.7997391376511624, "learning_rate": 0.0001, "loss": 0.9971, "loss/crossentropy": 2.6553359031677246, "loss/hidden": 0.83203125, "loss/logits": 0.1329321563243866, "loss/reg": 0.003213758347555995, "step": 1660 }, { "epoch": 0.207625, "grad_norm": 3.276784658432007, "grad_norm_var": 0.8229971260041339, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.5629796981811523, "loss/hidden": 1.171875, "loss/logits": 0.23601150512695312, "loss/reg": 0.0032118717208504677, "step": 1661 }, { "epoch": 0.20775, "grad_norm": 3.7711448669433594, "grad_norm_var": 0.8906238399602217, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.474085569381714, "loss/hidden": 1.1875, "loss/logits": 0.20507219433784485, "loss/reg": 0.0032101524993777275, "step": 1662 }, { "epoch": 0.207875, "grad_norm": 2.741032361984253, "grad_norm_var": 0.887234593717244, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.6519734859466553, "loss/hidden": 0.9296875, "loss/logits": 0.17065690457820892, "loss/reg": 0.0032084728591144085, "step": 1663 }, { "epoch": 0.208, "grad_norm": 3.385438919067383, "grad_norm_var": 0.9016638698725956, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.8452768325805664, "loss/hidden": 1.21875, "loss/logits": 0.2123820185661316, "loss/reg": 0.0032068106811493635, "step": 1664 }, { "epoch": 0.208125, "grad_norm": 2.4284839630126953, "grad_norm_var": 0.8794628798393888, "learning_rate": 0.0001, "loss": 1.1069, "loss/crossentropy": 2.3913896083831787, "loss/hidden": 0.91796875, "loss/logits": 0.15689219534397125, "loss/reg": 0.0032050481531769037, "step": 1665 }, { "epoch": 0.20825, "grad_norm": 1.9970413446426392, "grad_norm_var": 0.8925309231944419, "learning_rate": 0.0001, "loss": 1.0111, "loss/crossentropy": 2.1553092002868652, "loss/hidden": 0.8359375, "loss/logits": 0.14312410354614258, "loss/reg": 0.003203297033905983, "step": 1666 }, { "epoch": 0.208375, "grad_norm": 2.629917621612549, "grad_norm_var": 0.8955935228773692, "learning_rate": 0.0001, "loss": 1.0293, "loss/crossentropy": 2.4769906997680664, "loss/hidden": 0.828125, "loss/logits": 0.16919545829296112, "loss/reg": 0.003201601095497608, "step": 1667 }, { "epoch": 0.2085, "grad_norm": 2.4040727615356445, "grad_norm_var": 0.9018056713863368, "learning_rate": 0.0001, "loss": 0.9916, "loss/crossentropy": 2.4735305309295654, "loss/hidden": 0.8046875, "loss/logits": 0.15492868423461914, "loss/reg": 0.003199809929355979, "step": 1668 }, { "epoch": 0.208625, "grad_norm": 2.666597843170166, "grad_norm_var": 0.8810435015232821, "learning_rate": 0.0001, "loss": 0.9514, "loss/crossentropy": 2.219966173171997, "loss/hidden": 0.78515625, "loss/logits": 0.13430990278720856, "loss/reg": 0.003198012476786971, "step": 1669 }, { "epoch": 0.20875, "grad_norm": 2.6586005687713623, "grad_norm_var": 0.8626734234562614, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.411811351776123, "loss/hidden": 0.9765625, "loss/logits": 0.15343676507472992, "loss/reg": 0.0031961523927748203, "step": 1670 }, { "epoch": 0.208875, "grad_norm": 3.573573350906372, "grad_norm_var": 0.8817782564271193, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.475907802581787, "loss/hidden": 0.88671875, "loss/logits": 0.14815643429756165, "loss/reg": 0.003194718388840556, "step": 1671 }, { "epoch": 0.209, "grad_norm": 2.6452560424804688, "grad_norm_var": 0.26896437314466315, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.2734243869781494, "loss/hidden": 0.8828125, "loss/logits": 0.16163568198680878, "loss/reg": 0.0031934131402522326, "step": 1672 }, { "epoch": 0.209125, "grad_norm": 2.2780601978302, "grad_norm_var": 0.28139345731230725, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.6828384399414062, "loss/hidden": 0.90625, "loss/logits": 0.17327217757701874, "loss/reg": 0.003191707655787468, "step": 1673 }, { "epoch": 0.20925, "grad_norm": 3.0598275661468506, "grad_norm_var": 0.2692776803865986, "learning_rate": 0.0001, "loss": 1.1438, "loss/crossentropy": 2.626523017883301, "loss/hidden": 0.94921875, "loss/logits": 0.16267018020153046, "loss/reg": 0.0031899947207421064, "step": 1674 }, { "epoch": 0.209375, "grad_norm": 2.3135793209075928, "grad_norm_var": 0.28213310678472675, "learning_rate": 0.0001, "loss": 1.2395, "loss/crossentropy": 2.501697063446045, "loss/hidden": 1.015625, "loss/logits": 0.19198425114154816, "loss/reg": 0.003188441740348935, "step": 1675 }, { "epoch": 0.2095, "grad_norm": 3.061948537826538, "grad_norm_var": 0.25265989074379286, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.630337715148926, "loss/hidden": 0.8984375, "loss/logits": 0.14819283783435822, "loss/reg": 0.003186658024787903, "step": 1676 }, { "epoch": 0.209625, "grad_norm": 2.0030603408813477, "grad_norm_var": 0.27405567589367946, "learning_rate": 0.0001, "loss": 1.0526, "loss/crossentropy": 2.529303789138794, "loss/hidden": 0.87109375, "loss/logits": 0.1496969759464264, "loss/reg": 0.0031848950311541557, "step": 1677 }, { "epoch": 0.20975, "grad_norm": 5.00962495803833, "grad_norm_var": 0.5424888351687083, "learning_rate": 0.0001, "loss": 1.1409, "loss/crossentropy": 2.745102882385254, "loss/hidden": 0.95703125, "loss/logits": 0.15204139053821564, "loss/reg": 0.0031832093372941017, "step": 1678 }, { "epoch": 0.209875, "grad_norm": 2.7985355854034424, "grad_norm_var": 0.5422164981145214, "learning_rate": 0.0001, "loss": 1.2753, "loss/crossentropy": 2.2465641498565674, "loss/hidden": 1.0625, "loss/logits": 0.1809433400630951, "loss/reg": 0.0031814700923860073, "step": 1679 }, { "epoch": 0.21, "grad_norm": 4.798881530761719, "grad_norm_var": 0.7760732092314867, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.329308271408081, "loss/hidden": 1.2421875, "loss/logits": 0.1920267790555954, "loss/reg": 0.003179659601300955, "step": 1680 }, { "epoch": 0.210125, "grad_norm": 2.5762155055999756, "grad_norm_var": 0.7682393360080743, "learning_rate": 0.0001, "loss": 1.0703, "loss/crossentropy": 2.5117287635803223, "loss/hidden": 0.8828125, "loss/logits": 0.1556781381368637, "loss/reg": 0.003177785314619541, "step": 1681 }, { "epoch": 0.21025, "grad_norm": 2.2130141258239746, "grad_norm_var": 0.7450180582948017, "learning_rate": 0.0001, "loss": 1.0062, "loss/crossentropy": 2.4735710620880127, "loss/hidden": 0.83203125, "loss/logits": 0.14245635271072388, "loss/reg": 0.0031759634148329496, "step": 1682 }, { "epoch": 0.210375, "grad_norm": 2.1119225025177, "grad_norm_var": 0.7816966335511644, "learning_rate": 0.0001, "loss": 1.1029, "loss/crossentropy": 2.5301568508148193, "loss/hidden": 0.90234375, "loss/logits": 0.1688534915447235, "loss/reg": 0.0031742649152874947, "step": 1683 }, { "epoch": 0.2105, "grad_norm": 2.5934348106384277, "grad_norm_var": 0.7717750228974445, "learning_rate": 0.0001, "loss": 1.3001, "loss/crossentropy": 2.7114899158477783, "loss/hidden": 1.078125, "loss/logits": 0.19023996591567993, "loss/reg": 0.0031723512802273035, "step": 1684 }, { "epoch": 0.210625, "grad_norm": 2.822317361831665, "grad_norm_var": 0.7684936610933231, "learning_rate": 0.0001, "loss": 1.1469, "loss/crossentropy": 2.4820547103881836, "loss/hidden": 0.95703125, "loss/logits": 0.15812493860721588, "loss/reg": 0.0031706641893833876, "step": 1685 }, { "epoch": 0.21075, "grad_norm": 2.204843044281006, "grad_norm_var": 0.7964126984830915, "learning_rate": 0.0001, "loss": 1.0662, "loss/crossentropy": 2.5764718055725098, "loss/hidden": 0.8828125, "loss/logits": 0.15168514847755432, "loss/reg": 0.0031687715090811253, "step": 1686 }, { "epoch": 0.210875, "grad_norm": 1.9805725812911987, "grad_norm_var": 0.8074897214563511, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.5879499912261963, "loss/hidden": 0.81640625, "loss/logits": 0.12714409828186035, "loss/reg": 0.0031668762676417828, "step": 1687 }, { "epoch": 0.211, "grad_norm": 2.5287587642669678, "grad_norm_var": 0.8104222753256015, "learning_rate": 0.0001, "loss": 1.0874, "loss/crossentropy": 2.712538957595825, "loss/hidden": 0.90234375, "loss/logits": 0.1533837765455246, "loss/reg": 0.0031649123411625624, "step": 1688 }, { "epoch": 0.211125, "grad_norm": 3.2568001747131348, "grad_norm_var": 0.8058133582529289, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.5451202392578125, "loss/hidden": 0.90625, "loss/logits": 0.12538479268550873, "loss/reg": 0.0031630489975214005, "step": 1689 }, { "epoch": 0.21125, "grad_norm": 2.1492838859558105, "grad_norm_var": 0.8301337770059201, "learning_rate": 0.0001, "loss": 1.0673, "loss/crossentropy": 2.457829236984253, "loss/hidden": 0.89453125, "loss/logits": 0.14114046096801758, "loss/reg": 0.0031612419988960028, "step": 1690 }, { "epoch": 0.211375, "grad_norm": 2.0851340293884277, "grad_norm_var": 0.8474934557513625, "learning_rate": 0.0001, "loss": 1.0295, "loss/crossentropy": 2.593254327774048, "loss/hidden": 0.8515625, "loss/logits": 0.14630341529846191, "loss/reg": 0.0031592664308845997, "step": 1691 }, { "epoch": 0.2115, "grad_norm": 2.692077159881592, "grad_norm_var": 0.8412586771616655, "learning_rate": 0.0001, "loss": 1.1157, "loss/crossentropy": 2.491499900817871, "loss/hidden": 0.9375, "loss/logits": 0.14665257930755615, "loss/reg": 0.003157339058816433, "step": 1692 }, { "epoch": 0.211625, "grad_norm": 1.9899176359176636, "grad_norm_var": 0.8425591567104391, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.550938367843628, "loss/hidden": 0.8671875, "loss/logits": 0.14427784085273743, "loss/reg": 0.0031556852627545595, "step": 1693 }, { "epoch": 0.21175, "grad_norm": 1.861344575881958, "grad_norm_var": 0.508564313907548, "learning_rate": 0.0001, "loss": 0.9407, "loss/crossentropy": 2.7122299671173096, "loss/hidden": 0.78515625, "loss/logits": 0.12400847673416138, "loss/reg": 0.0031540419440716505, "step": 1694 }, { "epoch": 0.211875, "grad_norm": 2.5342886447906494, "grad_norm_var": 0.5038702664044002, "learning_rate": 0.0001, "loss": 1.2289, "loss/crossentropy": 2.5080294609069824, "loss/hidden": 0.9921875, "loss/logits": 0.20523084700107574, "loss/reg": 0.00315248966217041, "step": 1695 }, { "epoch": 0.212, "grad_norm": 2.043546438217163, "grad_norm_var": 0.14296074842546982, "learning_rate": 0.0001, "loss": 0.9838, "loss/crossentropy": 2.683417320251465, "loss/hidden": 0.80078125, "loss/logits": 0.15149196982383728, "loss/reg": 0.003150953445583582, "step": 1696 }, { "epoch": 0.212125, "grad_norm": 2.110501289367676, "grad_norm_var": 0.14263816283126, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.24113392829895, "loss/hidden": 0.9453125, "loss/logits": 0.155296191573143, "loss/reg": 0.003149296622723341, "step": 1697 }, { "epoch": 0.21225, "grad_norm": 2.7359836101531982, "grad_norm_var": 0.1520199744222225, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.5510122776031494, "loss/hidden": 0.9765625, "loss/logits": 0.1851990818977356, "loss/reg": 0.003147589974105358, "step": 1698 }, { "epoch": 0.212375, "grad_norm": 2.346010208129883, "grad_norm_var": 0.1478174979612748, "learning_rate": 0.0001, "loss": 1.0448, "loss/crossentropy": 2.4974029064178467, "loss/hidden": 0.87890625, "loss/logits": 0.13440260291099548, "loss/reg": 0.0031458197627216578, "step": 1699 }, { "epoch": 0.2125, "grad_norm": 2.1124165058135986, "grad_norm_var": 0.14800787911656718, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.265637159347534, "loss/hidden": 0.9765625, "loss/logits": 0.16191065311431885, "loss/reg": 0.003143977839499712, "step": 1700 }, { "epoch": 0.212625, "grad_norm": 2.4635298252105713, "grad_norm_var": 0.13302139739859153, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.3409652709960938, "loss/hidden": 1.03125, "loss/logits": 0.18988975882530212, "loss/reg": 0.0031423657201230526, "step": 1701 }, { "epoch": 0.21275, "grad_norm": 2.1505675315856934, "grad_norm_var": 0.1340275686171452, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.48573899269104, "loss/hidden": 0.95703125, "loss/logits": 0.15877564251422882, "loss/reg": 0.0031405584886670113, "step": 1702 }, { "epoch": 0.212875, "grad_norm": 2.3113481998443604, "grad_norm_var": 0.12611443887347676, "learning_rate": 0.0001, "loss": 1.0576, "loss/crossentropy": 2.165038824081421, "loss/hidden": 0.88671875, "loss/logits": 0.139505535364151, "loss/reg": 0.0031387642957270145, "step": 1703 }, { "epoch": 0.213, "grad_norm": 2.0424857139587402, "grad_norm_var": 0.12837729482331822, "learning_rate": 0.0001, "loss": 1.189, "loss/crossentropy": 2.2224714756011963, "loss/hidden": 0.9609375, "loss/logits": 0.19673088192939758, "loss/reg": 0.0031370187643915415, "step": 1704 }, { "epoch": 0.213125, "grad_norm": 2.016124725341797, "grad_norm_var": 0.06718613229377196, "learning_rate": 0.0001, "loss": 0.9392, "loss/crossentropy": 2.3173725605010986, "loss/hidden": 0.78125, "loss/logits": 0.1265917271375656, "loss/reg": 0.0031352161895483732, "step": 1705 }, { "epoch": 0.21325, "grad_norm": 2.1489810943603516, "grad_norm_var": 0.06718930728756754, "learning_rate": 0.0001, "loss": 0.9914, "loss/crossentropy": 2.3617665767669678, "loss/hidden": 0.828125, "loss/logits": 0.13194003701210022, "loss/reg": 0.0031336136162281036, "step": 1706 }, { "epoch": 0.213375, "grad_norm": 2.2248895168304443, "grad_norm_var": 0.06575221726070399, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.3003880977630615, "loss/hidden": 0.83984375, "loss/logits": 0.12159569561481476, "loss/reg": 0.0031318794935941696, "step": 1707 }, { "epoch": 0.2135, "grad_norm": 2.7656562328338623, "grad_norm_var": 0.07056003633158045, "learning_rate": 0.0001, "loss": 1.0895, "loss/crossentropy": 2.4966132640838623, "loss/hidden": 0.90625, "loss/logits": 0.1519893854856491, "loss/reg": 0.0031300997361540794, "step": 1708 }, { "epoch": 0.213625, "grad_norm": 2.4991726875305176, "grad_norm_var": 0.06971341387025494, "learning_rate": 0.0001, "loss": 1.0979, "loss/crossentropy": 2.0994441509246826, "loss/hidden": 0.90234375, "loss/logits": 0.16422367095947266, "loss/reg": 0.003128266194835305, "step": 1709 }, { "epoch": 0.21375, "grad_norm": 2.4266979694366455, "grad_norm_var": 0.05866460350893187, "learning_rate": 0.0001, "loss": 1.0559, "loss/crossentropy": 2.5089967250823975, "loss/hidden": 0.87890625, "loss/logits": 0.14569693803787231, "loss/reg": 0.003126643830910325, "step": 1710 }, { "epoch": 0.213875, "grad_norm": 2.5570080280303955, "grad_norm_var": 0.05938155406816629, "learning_rate": 0.0001, "loss": 1.1512, "loss/crossentropy": 2.390232801437378, "loss/hidden": 0.98046875, "loss/logits": 0.13949596881866455, "loss/reg": 0.0031250508036464453, "step": 1711 }, { "epoch": 0.214, "grad_norm": 2.4763524532318115, "grad_norm_var": 0.05573108256271908, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 2.4115543365478516, "loss/hidden": 1.0, "loss/logits": 0.17969435453414917, "loss/reg": 0.003123391419649124, "step": 1712 }, { "epoch": 0.214125, "grad_norm": 1.9605647325515747, "grad_norm_var": 0.061658860743410496, "learning_rate": 0.0001, "loss": 1.004, "loss/crossentropy": 2.1503031253814697, "loss/hidden": 0.83203125, "loss/logits": 0.14071419835090637, "loss/reg": 0.0031219625379890203, "step": 1713 }, { "epoch": 0.21425, "grad_norm": 2.206909656524658, "grad_norm_var": 0.05032832725278471, "learning_rate": 0.0001, "loss": 0.9881, "loss/crossentropy": 2.740894079208374, "loss/hidden": 0.81640625, "loss/logits": 0.14050860702991486, "loss/reg": 0.0031206535641103983, "step": 1714 }, { "epoch": 0.214375, "grad_norm": 2.084909677505493, "grad_norm_var": 0.05278877705570242, "learning_rate": 0.0001, "loss": 1.1267, "loss/crossentropy": 2.4331443309783936, "loss/hidden": 0.9375, "loss/logits": 0.1579878181219101, "loss/reg": 0.003117976011708379, "step": 1715 }, { "epoch": 0.2145, "grad_norm": 2.2570641040802, "grad_norm_var": 0.050903424022511426, "learning_rate": 0.0001, "loss": 0.9784, "loss/crossentropy": 2.688154935836792, "loss/hidden": 0.80078125, "loss/logits": 0.14644555747509003, "loss/reg": 0.0031148470006883144, "step": 1716 }, { "epoch": 0.214625, "grad_norm": 2.1008963584899902, "grad_norm_var": 0.05058773933843851, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.5256693363189697, "loss/hidden": 0.90234375, "loss/logits": 0.14845484495162964, "loss/reg": 0.003113400423899293, "step": 1717 }, { "epoch": 0.21475, "grad_norm": 2.4614713191986084, "grad_norm_var": 0.05191226779637403, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.5726735591888428, "loss/hidden": 0.87109375, "loss/logits": 0.15416929125785828, "loss/reg": 0.0031117405742406845, "step": 1718 }, { "epoch": 0.214875, "grad_norm": 2.3064160346984863, "grad_norm_var": 0.05189566088851283, "learning_rate": 0.0001, "loss": 1.1013, "loss/crossentropy": 2.7065393924713135, "loss/hidden": 0.90234375, "loss/logits": 0.16784769296646118, "loss/reg": 0.0031088702380657196, "step": 1719 }, { "epoch": 0.215, "grad_norm": 9.079465866088867, "grad_norm_var": 2.9207271705017384, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.3284268379211426, "loss/hidden": 1.1328125, "loss/logits": 0.22156238555908203, "loss/reg": 0.003107408294454217, "step": 1720 }, { "epoch": 0.215125, "grad_norm": 2.1745572090148926, "grad_norm_var": 2.9073576589134493, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.095036745071411, "loss/hidden": 0.93359375, "loss/logits": 0.15225940942764282, "loss/reg": 0.003105347976088524, "step": 1721 }, { "epoch": 0.21525, "grad_norm": 2.243680238723755, "grad_norm_var": 2.9005416312984256, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.2994542121887207, "loss/hidden": 0.84375, "loss/logits": 0.1359337866306305, "loss/reg": 0.003103848546743393, "step": 1722 }, { "epoch": 0.215375, "grad_norm": 2.6446003913879395, "grad_norm_var": 2.882775101197618, "learning_rate": 0.0001, "loss": 1.2141, "loss/crossentropy": 2.498286008834839, "loss/hidden": 1.0234375, "loss/logits": 0.1595914363861084, "loss/reg": 0.003102482995018363, "step": 1723 }, { "epoch": 0.2155, "grad_norm": 4.21211576461792, "grad_norm_var": 3.0136016192372757, "learning_rate": 0.0001, "loss": 1.0643, "loss/crossentropy": 2.2280969619750977, "loss/hidden": 0.8828125, "loss/logits": 0.15044564008712769, "loss/reg": 0.0031010708771646023, "step": 1724 }, { "epoch": 0.215625, "grad_norm": 2.093863010406494, "grad_norm_var": 3.0431383662912457, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.27380108833313, "loss/hidden": 0.82421875, "loss/logits": 0.12631164491176605, "loss/reg": 0.0030996648129075766, "step": 1725 }, { "epoch": 0.21575, "grad_norm": 2.16963267326355, "grad_norm_var": 3.061105934508265, "learning_rate": 0.0001, "loss": 1.1956, "loss/crossentropy": 2.180711269378662, "loss/hidden": 0.9921875, "loss/logits": 0.17242172360420227, "loss/reg": 0.00309771322645247, "step": 1726 }, { "epoch": 0.215875, "grad_norm": 3.120957136154175, "grad_norm_var": 3.0616334113432386, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.4755892753601074, "loss/hidden": 0.92578125, "loss/logits": 0.18183037638664246, "loss/reg": 0.0030961879529058933, "step": 1727 }, { "epoch": 0.216, "grad_norm": 2.810375213623047, "grad_norm_var": 3.051983920589513, "learning_rate": 0.0001, "loss": 1.0356, "loss/crossentropy": 2.844090461730957, "loss/hidden": 0.85546875, "loss/logits": 0.14921194314956665, "loss/reg": 0.0030942922458052635, "step": 1728 }, { "epoch": 0.216125, "grad_norm": 2.351074695587158, "grad_norm_var": 3.01413823672746, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.617952823638916, "loss/hidden": 0.9375, "loss/logits": 0.17222052812576294, "loss/reg": 0.00309238163754344, "step": 1729 }, { "epoch": 0.21625, "grad_norm": 3.7283058166503906, "grad_norm_var": 3.0192480530971872, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 2.559141159057617, "loss/hidden": 0.99609375, "loss/logits": 0.2968454360961914, "loss/reg": 0.0030908475164324045, "step": 1730 }, { "epoch": 0.216375, "grad_norm": 2.9617199897766113, "grad_norm_var": 2.961489976152211, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.598041534423828, "loss/hidden": 1.078125, "loss/logits": 0.2031644731760025, "loss/reg": 0.00308916624635458, "step": 1731 }, { "epoch": 0.2165, "grad_norm": 2.236485481262207, "grad_norm_var": 2.9636777426758716, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.625614881515503, "loss/hidden": 0.83984375, "loss/logits": 0.15201476216316223, "loss/reg": 0.003087525488808751, "step": 1732 }, { "epoch": 0.216625, "grad_norm": 2.3135018348693848, "grad_norm_var": 2.9397831294271772, "learning_rate": 0.0001, "loss": 0.9559, "loss/crossentropy": 2.3203794956207275, "loss/hidden": 0.7890625, "loss/logits": 0.13600721955299377, "loss/reg": 0.003086032345890999, "step": 1733 }, { "epoch": 0.21675, "grad_norm": 2.0412943363189697, "grad_norm_var": 2.984167856020982, "learning_rate": 0.0001, "loss": 1.0881, "loss/crossentropy": 2.715367317199707, "loss/hidden": 0.89453125, "loss/logits": 0.1627357304096222, "loss/reg": 0.0030844947323203087, "step": 1734 }, { "epoch": 0.216875, "grad_norm": 3.1161649227142334, "grad_norm_var": 2.946971551780882, "learning_rate": 0.0001, "loss": 1.0572, "loss/crossentropy": 2.4200503826141357, "loss/hidden": 0.8828125, "loss/logits": 0.14354635775089264, "loss/reg": 0.0030825661960989237, "step": 1735 }, { "epoch": 0.217, "grad_norm": 3.2668862342834473, "grad_norm_var": 0.4098138660932987, "learning_rate": 0.0001, "loss": 1.1414, "loss/crossentropy": 2.4813458919525146, "loss/hidden": 0.953125, "loss/logits": 0.15744319558143616, "loss/reg": 0.003080642083659768, "step": 1736 }, { "epoch": 0.217125, "grad_norm": 1.9718143939971924, "grad_norm_var": 0.42706875074818434, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.2978994846343994, "loss/hidden": 0.8671875, "loss/logits": 0.12966391444206238, "loss/reg": 0.0030790595337748528, "step": 1737 }, { "epoch": 0.21725, "grad_norm": 2.1405398845672607, "grad_norm_var": 0.4340798374863008, "learning_rate": 0.0001, "loss": 1.1405, "loss/crossentropy": 2.091478109359741, "loss/hidden": 0.94140625, "loss/logits": 0.16836108267307281, "loss/reg": 0.0030774776823818684, "step": 1738 }, { "epoch": 0.217375, "grad_norm": 3.8788137435913086, "grad_norm_var": 0.5203809166356061, "learning_rate": 0.0001, "loss": 1.751, "loss/crossentropy": 2.3941147327423096, "loss/hidden": 1.4453125, "loss/logits": 0.274971067905426, "loss/reg": 0.003075655549764633, "step": 1739 }, { "epoch": 0.2175, "grad_norm": 1.9825366735458374, "grad_norm_var": 0.4040997474989869, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.233893632888794, "loss/hidden": 0.91796875, "loss/logits": 0.15135906636714935, "loss/reg": 0.0030739654321223497, "step": 1740 }, { "epoch": 0.217625, "grad_norm": 2.0987682342529297, "grad_norm_var": 0.4037463519266102, "learning_rate": 0.0001, "loss": 1.0675, "loss/crossentropy": 2.4455137252807617, "loss/hidden": 0.875, "loss/logits": 0.1617324948310852, "loss/reg": 0.0030723894014954567, "step": 1741 }, { "epoch": 0.21775, "grad_norm": 3.920135498046875, "grad_norm_var": 0.48622454106490515, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.6298489570617676, "loss/hidden": 0.94140625, "loss/logits": 0.18376988172531128, "loss/reg": 0.0030707602854818106, "step": 1742 }, { "epoch": 0.217875, "grad_norm": 2.690436363220215, "grad_norm_var": 0.4762973265463903, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.1424267292022705, "loss/hidden": 1.0625, "loss/logits": 0.15353702008724213, "loss/reg": 0.0030691707506775856, "step": 1743 }, { "epoch": 0.218, "grad_norm": 2.604951858520508, "grad_norm_var": 0.47644030986631136, "learning_rate": 0.0001, "loss": 1.1014, "loss/crossentropy": 2.54174542427063, "loss/hidden": 0.91015625, "loss/logits": 0.1606130450963974, "loss/reg": 0.00306738936342299, "step": 1744 }, { "epoch": 0.218125, "grad_norm": 3.3228073120117188, "grad_norm_var": 0.48941099514094233, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.3931949138641357, "loss/hidden": 0.890625, "loss/logits": 0.13647425174713135, "loss/reg": 0.003065774915739894, "step": 1745 }, { "epoch": 0.21825, "grad_norm": 2.641117811203003, "grad_norm_var": 0.42396390393689143, "learning_rate": 0.0001, "loss": 0.971, "loss/crossentropy": 2.4205129146575928, "loss/hidden": 0.83203125, "loss/logits": 0.10831993818283081, "loss/reg": 0.003064037999138236, "step": 1746 }, { "epoch": 0.218375, "grad_norm": 2.7706568241119385, "grad_norm_var": 0.4195589879953497, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.719468355178833, "loss/hidden": 0.9140625, "loss/logits": 0.16571125388145447, "loss/reg": 0.0030624454375356436, "step": 1747 }, { "epoch": 0.2185, "grad_norm": 2.2494888305664062, "grad_norm_var": 0.41878793071204773, "learning_rate": 0.0001, "loss": 1.0321, "loss/crossentropy": 2.7627296447753906, "loss/hidden": 0.84765625, "loss/logits": 0.15384793281555176, "loss/reg": 0.003060864983126521, "step": 1748 }, { "epoch": 0.218625, "grad_norm": 2.4056599140167236, "grad_norm_var": 0.41471554214321593, "learning_rate": 0.0001, "loss": 1.0339, "loss/crossentropy": 2.546539068222046, "loss/hidden": 0.8671875, "loss/logits": 0.13611117005348206, "loss/reg": 0.0030592146795243025, "step": 1749 }, { "epoch": 0.21875, "grad_norm": 2.493166923522949, "grad_norm_var": 0.38815929501957475, "learning_rate": 0.0001, "loss": 1.0489, "loss/crossentropy": 2.243028402328491, "loss/hidden": 0.87109375, "loss/logits": 0.1472695916891098, "loss/reg": 0.0030575725249946117, "step": 1750 }, { "epoch": 0.218875, "grad_norm": 2.213844060897827, "grad_norm_var": 0.3916385925474808, "learning_rate": 0.0001, "loss": 1.0008, "loss/crossentropy": 2.419025182723999, "loss/hidden": 0.83984375, "loss/logits": 0.13044574856758118, "loss/reg": 0.003055924316868186, "step": 1751 }, { "epoch": 0.219, "grad_norm": 2.769224166870117, "grad_norm_var": 0.3672278962107095, "learning_rate": 0.0001, "loss": 1.0385, "loss/crossentropy": 2.7889208793640137, "loss/hidden": 0.8671875, "loss/logits": 0.1408083736896515, "loss/reg": 0.0030543410684913397, "step": 1752 }, { "epoch": 0.219125, "grad_norm": 2.4764599800109863, "grad_norm_var": 0.33854682568550765, "learning_rate": 0.0001, "loss": 1.0605, "loss/crossentropy": 2.422456741333008, "loss/hidden": 0.89453125, "loss/logits": 0.1354484111070633, "loss/reg": 0.0030527592170983553, "step": 1753 }, { "epoch": 0.21925, "grad_norm": 2.322908639907837, "grad_norm_var": 0.32784450880299965, "learning_rate": 0.0001, "loss": 1.1523, "loss/crossentropy": 2.441586494445801, "loss/hidden": 0.94921875, "loss/logits": 0.17252781987190247, "loss/reg": 0.003051069099456072, "step": 1754 }, { "epoch": 0.219375, "grad_norm": 2.6239187717437744, "grad_norm_var": 0.22527430071220297, "learning_rate": 0.0001, "loss": 1.1402, "loss/crossentropy": 2.3880774974823, "loss/hidden": 0.94140625, "loss/logits": 0.16831141710281372, "loss/reg": 0.0030493696685880423, "step": 1755 }, { "epoch": 0.2195, "grad_norm": 17.505414962768555, "grad_norm_var": 14.009084703934427, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.6417012214660645, "loss/hidden": 1.1328125, "loss/logits": 0.16491027176380157, "loss/reg": 0.003047748701646924, "step": 1756 }, { "epoch": 0.219625, "grad_norm": 4.20535135269165, "grad_norm_var": 13.873398017294948, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.521103858947754, "loss/hidden": 0.96484375, "loss/logits": 0.13569971919059753, "loss/reg": 0.0030460914131253958, "step": 1757 }, { "epoch": 0.21975, "grad_norm": 2.9682602882385254, "grad_norm_var": 13.902211592229294, "learning_rate": 0.0001, "loss": 1.1961, "loss/crossentropy": 2.4939701557159424, "loss/hidden": 1.0, "loss/logits": 0.165610671043396, "loss/reg": 0.0030445046722888947, "step": 1758 }, { "epoch": 0.219875, "grad_norm": 2.7488410472869873, "grad_norm_var": 13.895018738483493, "learning_rate": 0.0001, "loss": 1.0071, "loss/crossentropy": 2.5324008464813232, "loss/hidden": 0.84765625, "loss/logits": 0.12899169325828552, "loss/reg": 0.003042889991775155, "step": 1759 }, { "epoch": 0.22, "grad_norm": 2.1646311283111572, "grad_norm_var": 13.968204624958085, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.5288894176483154, "loss/hidden": 0.94921875, "loss/logits": 0.16518330574035645, "loss/reg": 0.0030412436462938786, "step": 1760 }, { "epoch": 0.220125, "grad_norm": 1.9247658252716064, "grad_norm_var": 14.145314883597047, "learning_rate": 0.0001, "loss": 1.0406, "loss/crossentropy": 2.5389163494110107, "loss/hidden": 0.8671875, "loss/logits": 0.1429884135723114, "loss/reg": 0.003039830131456256, "step": 1761 }, { "epoch": 0.22025, "grad_norm": 2.1075799465179443, "grad_norm_var": 14.226356437632456, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.422471046447754, "loss/hidden": 1.0078125, "loss/logits": 0.15863800048828125, "loss/reg": 0.0030385232530534267, "step": 1762 }, { "epoch": 0.220375, "grad_norm": 2.6114046573638916, "grad_norm_var": 14.243361987467381, "learning_rate": 0.0001, "loss": 0.9892, "loss/crossentropy": 2.574613571166992, "loss/hidden": 0.83203125, "loss/logits": 0.1268356591463089, "loss/reg": 0.0030372380279004574, "step": 1763 }, { "epoch": 0.2205, "grad_norm": 2.0650744438171387, "grad_norm_var": 14.275914518581828, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.4691414833068848, "loss/hidden": 0.98046875, "loss/logits": 0.1486453115940094, "loss/reg": 0.003035652916878462, "step": 1764 }, { "epoch": 0.220625, "grad_norm": 2.160597085952759, "grad_norm_var": 14.314622026235172, "learning_rate": 0.0001, "loss": 1.1022, "loss/crossentropy": 2.609285593032837, "loss/hidden": 0.921875, "loss/logits": 0.15001603960990906, "loss/reg": 0.0030343951657414436, "step": 1765 }, { "epoch": 0.22075, "grad_norm": 3.069577217102051, "grad_norm_var": 14.261074973549244, "learning_rate": 0.0001, "loss": 1.1575, "loss/crossentropy": 2.647573947906494, "loss/hidden": 0.96875, "loss/logits": 0.1584310233592987, "loss/reg": 0.003032844513654709, "step": 1766 }, { "epoch": 0.220875, "grad_norm": 2.568370819091797, "grad_norm_var": 14.208317261947524, "learning_rate": 0.0001, "loss": 1.0488, "loss/crossentropy": 2.6948702335357666, "loss/hidden": 0.87109375, "loss/logits": 0.1474071443080902, "loss/reg": 0.0030314817558974028, "step": 1767 }, { "epoch": 0.221, "grad_norm": 2.68941068649292, "grad_norm_var": 14.21668663304105, "learning_rate": 0.0001, "loss": 1.0456, "loss/crossentropy": 2.5210416316986084, "loss/hidden": 0.87109375, "loss/logits": 0.14420348405838013, "loss/reg": 0.0030302261002361774, "step": 1768 }, { "epoch": 0.221125, "grad_norm": 1.9921388626098633, "grad_norm_var": 14.298301261709696, "learning_rate": 0.0001, "loss": 1.0236, "loss/crossentropy": 2.522998332977295, "loss/hidden": 0.84765625, "loss/logits": 0.14565014839172363, "loss/reg": 0.00302865426056087, "step": 1769 }, { "epoch": 0.22125, "grad_norm": 1.8499289751052856, "grad_norm_var": 14.38544404016637, "learning_rate": 0.0001, "loss": 1.0027, "loss/crossentropy": 2.466783046722412, "loss/hidden": 0.828125, "loss/logits": 0.14433151483535767, "loss/reg": 0.0030273436568677425, "step": 1770 }, { "epoch": 0.221375, "grad_norm": 2.4636669158935547, "grad_norm_var": 14.404773691988826, "learning_rate": 0.0001, "loss": 1.0178, "loss/crossentropy": 2.4166789054870605, "loss/hidden": 0.84765625, "loss/logits": 0.13983449339866638, "loss/reg": 0.0030260428320616484, "step": 1771 }, { "epoch": 0.2215, "grad_norm": 2.67075514793396, "grad_norm_var": 0.3450175902124835, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.695432662963867, "loss/hidden": 0.84765625, "loss/logits": 0.14194995164871216, "loss/reg": 0.0030244754161685705, "step": 1772 }, { "epoch": 0.221625, "grad_norm": 2.4573214054107666, "grad_norm_var": 0.14231832979352052, "learning_rate": 0.0001, "loss": 0.9742, "loss/crossentropy": 2.5035762786865234, "loss/hidden": 0.8203125, "loss/logits": 0.12364549934864044, "loss/reg": 0.003022938035428524, "step": 1773 }, { "epoch": 0.22175, "grad_norm": 2.343374729156494, "grad_norm_var": 0.11996201542798221, "learning_rate": 0.0001, "loss": 1.0924, "loss/crossentropy": 2.43853497505188, "loss/hidden": 0.9140625, "loss/logits": 0.14812292158603668, "loss/reg": 0.003021500539034605, "step": 1774 }, { "epoch": 0.221875, "grad_norm": 3.566265344619751, "grad_norm_var": 0.2032350727933528, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.46608567237854, "loss/hidden": 0.97265625, "loss/logits": 0.18495416641235352, "loss/reg": 0.00301993521861732, "step": 1775 }, { "epoch": 0.222, "grad_norm": 2.747910976409912, "grad_norm_var": 0.20471190685867377, "learning_rate": 0.0001, "loss": 1.0738, "loss/crossentropy": 2.6310603618621826, "loss/hidden": 0.87890625, "loss/logits": 0.16469183564186096, "loss/reg": 0.003018364543095231, "step": 1776 }, { "epoch": 0.222125, "grad_norm": 2.1164255142211914, "grad_norm_var": 0.1934448052628894, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.8867831230163574, "loss/hidden": 0.8046875, "loss/logits": 0.1413034349679947, "loss/reg": 0.0030167822260409594, "step": 1777 }, { "epoch": 0.22225, "grad_norm": 3.035059690475464, "grad_norm_var": 0.20270085598930473, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.1823151111602783, "loss/hidden": 0.9765625, "loss/logits": 0.16358500719070435, "loss/reg": 0.0030153028201311827, "step": 1778 }, { "epoch": 0.222375, "grad_norm": 2.5045015811920166, "grad_norm_var": 0.20219002055305796, "learning_rate": 0.0001, "loss": 1.2038, "loss/crossentropy": 2.401313543319702, "loss/hidden": 1.0078125, "loss/logits": 0.16582559049129486, "loss/reg": 0.0030142655596137047, "step": 1779 }, { "epoch": 0.2225, "grad_norm": 2.473170042037964, "grad_norm_var": 0.1879118733867992, "learning_rate": 0.0001, "loss": 1.0329, "loss/crossentropy": 2.824697494506836, "loss/hidden": 0.859375, "loss/logits": 0.14334872364997864, "loss/reg": 0.003013218054547906, "step": 1780 }, { "epoch": 0.222625, "grad_norm": 2.2666685581207275, "grad_norm_var": 0.18318870026567513, "learning_rate": 0.0001, "loss": 0.91, "loss/crossentropy": 2.4266369342803955, "loss/hidden": 0.7734375, "loss/logits": 0.10641002655029297, "loss/reg": 0.0030120951123535633, "step": 1781 }, { "epoch": 0.22275, "grad_norm": 2.411001205444336, "grad_norm_var": 0.16475203538871402, "learning_rate": 0.0001, "loss": 0.9902, "loss/crossentropy": 2.7164502143859863, "loss/hidden": 0.82421875, "loss/logits": 0.13585732877254486, "loss/reg": 0.0030110396910458803, "step": 1782 }, { "epoch": 0.222875, "grad_norm": 2.555788040161133, "grad_norm_var": 0.16466357931168235, "learning_rate": 0.0001, "loss": 0.9362, "loss/crossentropy": 2.4590888023376465, "loss/hidden": 0.78515625, "loss/logits": 0.12098520994186401, "loss/reg": 0.0030094829853624105, "step": 1783 }, { "epoch": 0.223, "grad_norm": 2.0648751258850098, "grad_norm_var": 0.17401513224721246, "learning_rate": 0.0001, "loss": 0.9556, "loss/crossentropy": 2.3822426795959473, "loss/hidden": 0.78515625, "loss/logits": 0.1403384804725647, "loss/reg": 0.003007945604622364, "step": 1784 }, { "epoch": 0.223125, "grad_norm": 4.551196098327637, "grad_norm_var": 0.4202881155883109, "learning_rate": 0.0001, "loss": 1.8349, "loss/crossentropy": 1.9226468801498413, "loss/hidden": 1.4609375, "loss/logits": 0.3438549041748047, "loss/reg": 0.003006393788382411, "step": 1785 }, { "epoch": 0.22325, "grad_norm": 3.032957077026367, "grad_norm_var": 0.38473481866021925, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.3850667476654053, "loss/hidden": 0.90625, "loss/logits": 0.15473738312721252, "loss/reg": 0.0030050212517380714, "step": 1786 }, { "epoch": 0.223375, "grad_norm": 2.6902732849121094, "grad_norm_var": 0.3806885371660407, "learning_rate": 0.0001, "loss": 1.1889, "loss/crossentropy": 2.324967861175537, "loss/hidden": 0.984375, "loss/logits": 0.17453113198280334, "loss/reg": 0.0030037900432944298, "step": 1787 }, { "epoch": 0.2235, "grad_norm": 2.168304443359375, "grad_norm_var": 0.3996302660743254, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.2956700325012207, "loss/hidden": 0.8984375, "loss/logits": 0.14791284501552582, "loss/reg": 0.003002553479745984, "step": 1788 }, { "epoch": 0.223625, "grad_norm": 2.74092435836792, "grad_norm_var": 0.3959885005070151, "learning_rate": 0.0001, "loss": 0.9535, "loss/crossentropy": 2.631945848464966, "loss/hidden": 0.79296875, "loss/logits": 0.1305209994316101, "loss/reg": 0.0030011215712875128, "step": 1789 }, { "epoch": 0.22375, "grad_norm": 2.1927857398986816, "grad_norm_var": 0.4046525348789257, "learning_rate": 0.0001, "loss": 1.0108, "loss/crossentropy": 2.404738664627075, "loss/hidden": 0.83984375, "loss/logits": 0.14092186093330383, "loss/reg": 0.002999563468620181, "step": 1790 }, { "epoch": 0.223875, "grad_norm": 2.6189329624176025, "grad_norm_var": 0.35067712323398886, "learning_rate": 0.0001, "loss": 0.9651, "loss/crossentropy": 2.389113664627075, "loss/hidden": 0.78515625, "loss/logits": 0.15001046657562256, "loss/reg": 0.002998023759573698, "step": 1791 }, { "epoch": 0.224, "grad_norm": 2.298084020614624, "grad_norm_var": 0.35659197751071947, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.5539371967315674, "loss/hidden": 0.84765625, "loss/logits": 0.14568671584129333, "loss/reg": 0.002996444469317794, "step": 1792 }, { "epoch": 0.224125, "grad_norm": 2.6242432594299316, "grad_norm_var": 0.3394552173241588, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.5641584396362305, "loss/hidden": 0.984375, "loss/logits": 0.18293890357017517, "loss/reg": 0.002994769951328635, "step": 1793 }, { "epoch": 0.22425, "grad_norm": 3.7077572345733643, "grad_norm_var": 0.4032349111532985, "learning_rate": 0.0001, "loss": 1.26, "loss/crossentropy": 1.6599891185760498, "loss/hidden": 1.09375, "loss/logits": 0.13635557889938354, "loss/reg": 0.0029932132456451654, "step": 1794 }, { "epoch": 0.224375, "grad_norm": 3.1581003665924072, "grad_norm_var": 0.41452339637513186, "learning_rate": 0.0001, "loss": 0.9995, "loss/crossentropy": 2.8043997287750244, "loss/hidden": 0.83203125, "loss/logits": 0.1375463604927063, "loss/reg": 0.0029914977494627237, "step": 1795 }, { "epoch": 0.2245, "grad_norm": 3.2209360599517822, "grad_norm_var": 0.42464256487447377, "learning_rate": 0.0001, "loss": 1.1752, "loss/crossentropy": 2.321751832962036, "loss/hidden": 0.9921875, "loss/logits": 0.15309128165245056, "loss/reg": 0.0029897540807724, "step": 1796 }, { "epoch": 0.224625, "grad_norm": 3.6839895248413086, "grad_norm_var": 0.45527767818373166, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.8542239665985107, "loss/hidden": 1.0234375, "loss/logits": 0.2231515347957611, "loss/reg": 0.002988190157338977, "step": 1797 }, { "epoch": 0.22475, "grad_norm": 3.53092622756958, "grad_norm_var": 0.4669931032592082, "learning_rate": 0.0001, "loss": 1.2964, "loss/crossentropy": 2.4456238746643066, "loss/hidden": 1.0703125, "loss/logits": 0.19626130163669586, "loss/reg": 0.002986533334478736, "step": 1798 }, { "epoch": 0.224875, "grad_norm": 3.053359031677246, "grad_norm_var": 0.4578059410900018, "learning_rate": 0.0001, "loss": 0.9795, "loss/crossentropy": 2.8894784450531006, "loss/hidden": 0.8203125, "loss/logits": 0.12932872772216797, "loss/reg": 0.0029848606791347265, "step": 1799 }, { "epoch": 0.225, "grad_norm": 3.2195730209350586, "grad_norm_var": 0.4035408308703057, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.3305225372314453, "loss/hidden": 1.1875, "loss/logits": 0.18548178672790527, "loss/reg": 0.002983321435749531, "step": 1800 }, { "epoch": 0.225125, "grad_norm": 2.86997652053833, "grad_norm_var": 0.23937467026572654, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.579697847366333, "loss/hidden": 0.98828125, "loss/logits": 0.17568713426589966, "loss/reg": 0.0029817591421306133, "step": 1801 }, { "epoch": 0.22525, "grad_norm": 3.553030252456665, "grad_norm_var": 0.26371729729337307, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.8150346279144287, "loss/hidden": 0.93359375, "loss/logits": 0.13943785429000854, "loss/reg": 0.002980235731229186, "step": 1802 }, { "epoch": 0.225375, "grad_norm": 3.208162307739258, "grad_norm_var": 0.26197953760215476, "learning_rate": 0.0001, "loss": 1.1901, "loss/crossentropy": 2.499155044555664, "loss/hidden": 0.94921875, "loss/logits": 0.21110975742340088, "loss/reg": 0.002978700678795576, "step": 1803 }, { "epoch": 0.2255, "grad_norm": 4.348999500274658, "grad_norm_var": 0.3201132095155003, "learning_rate": 0.0001, "loss": 1.1629, "loss/crossentropy": 2.4431777000427246, "loss/hidden": 0.984375, "loss/logits": 0.14872056245803833, "loss/reg": 0.002977173076942563, "step": 1804 }, { "epoch": 0.225625, "grad_norm": 2.438546895980835, "grad_norm_var": 0.34138753432729513, "learning_rate": 0.0001, "loss": 1.0855, "loss/crossentropy": 2.518237352371216, "loss/hidden": 0.89453125, "loss/logits": 0.16119888424873352, "loss/reg": 0.002975636161863804, "step": 1805 }, { "epoch": 0.22575, "grad_norm": 3.8823599815368652, "grad_norm_var": 0.31363593562404785, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.419356107711792, "loss/hidden": 0.96875, "loss/logits": 0.17398422956466675, "loss/reg": 0.002974169095978141, "step": 1806 }, { "epoch": 0.225875, "grad_norm": 5.898488521575928, "grad_norm_var": 0.725838270489255, "learning_rate": 0.0001, "loss": 1.3984, "loss/crossentropy": 2.538686990737915, "loss/hidden": 1.140625, "loss/logits": 0.22802415490150452, "loss/reg": 0.002972749760374427, "step": 1807 }, { "epoch": 0.226, "grad_norm": 2.616062879562378, "grad_norm_var": 0.6846537892399804, "learning_rate": 0.0001, "loss": 0.9967, "loss/crossentropy": 2.91412091255188, "loss/hidden": 0.828125, "loss/logits": 0.13887512683868408, "loss/reg": 0.0029713378753513098, "step": 1808 }, { "epoch": 0.226125, "grad_norm": 2.6786835193634033, "grad_norm_var": 0.678929251874992, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.530329465866089, "loss/hidden": 1.0859375, "loss/logits": 0.18713583052158356, "loss/reg": 0.0029699981678277254, "step": 1809 }, { "epoch": 0.22625, "grad_norm": 2.3305532932281494, "grad_norm_var": 0.7486371828354244, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.5374956130981445, "loss/hidden": 0.9453125, "loss/logits": 0.16669398546218872, "loss/reg": 0.0029686312191188335, "step": 1810 }, { "epoch": 0.226375, "grad_norm": 2.9841620922088623, "grad_norm_var": 0.7551115699539401, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.4051711559295654, "loss/hidden": 0.97265625, "loss/logits": 0.17810006439685822, "loss/reg": 0.002967282198369503, "step": 1811 }, { "epoch": 0.2265, "grad_norm": 1.9030566215515137, "grad_norm_var": 0.885438078387665, "learning_rate": 0.0001, "loss": 1.0451, "loss/crossentropy": 2.449751615524292, "loss/hidden": 0.87109375, "loss/logits": 0.14433653652668, "loss/reg": 0.0029659466817975044, "step": 1812 }, { "epoch": 0.226625, "grad_norm": 2.075324773788452, "grad_norm_var": 0.9567700729341877, "learning_rate": 0.0001, "loss": 1.0281, "loss/crossentropy": 2.4204511642456055, "loss/hidden": 0.87109375, "loss/logits": 0.12739630043506622, "loss/reg": 0.0029646598268300295, "step": 1813 }, { "epoch": 0.22675, "grad_norm": 2.0552451610565186, "grad_norm_var": 1.0202742097321296, "learning_rate": 0.0001, "loss": 0.9888, "loss/crossentropy": 2.424468755722046, "loss/hidden": 0.81640625, "loss/logits": 0.14275333285331726, "loss/reg": 0.002963270992040634, "step": 1814 }, { "epoch": 0.226875, "grad_norm": 2.0605251789093018, "grad_norm_var": 1.0840480132956123, "learning_rate": 0.0001, "loss": 1.0656, "loss/crossentropy": 2.625704050064087, "loss/hidden": 0.89453125, "loss/logits": 0.1414736658334732, "loss/reg": 0.0029617997352033854, "step": 1815 }, { "epoch": 0.227, "grad_norm": 2.0984017848968506, "grad_norm_var": 1.1309350809822944, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.2266194820404053, "loss/hidden": 0.9453125, "loss/logits": 0.16688337922096252, "loss/reg": 0.002960240002721548, "step": 1816 }, { "epoch": 0.227125, "grad_norm": 2.257150411605835, "grad_norm_var": 1.1599327396837886, "learning_rate": 0.0001, "loss": 1.2117, "loss/crossentropy": 2.2862818241119385, "loss/hidden": 1.0078125, "loss/logits": 0.1742808222770691, "loss/reg": 0.0029587687458842993, "step": 1817 }, { "epoch": 0.22725, "grad_norm": 2.0277411937713623, "grad_norm_var": 1.1723884671928733, "learning_rate": 0.0001, "loss": 1.0665, "loss/crossentropy": 2.4400901794433594, "loss/hidden": 0.89453125, "loss/logits": 0.14237791299819946, "loss/reg": 0.0029572048224508762, "step": 1818 }, { "epoch": 0.227375, "grad_norm": 2.6541121006011963, "grad_norm_var": 1.161714891934859, "learning_rate": 0.0001, "loss": 1.0231, "loss/crossentropy": 2.5243773460388184, "loss/hidden": 0.85546875, "loss/logits": 0.13802778720855713, "loss/reg": 0.0029556897934526205, "step": 1819 }, { "epoch": 0.2275, "grad_norm": 2.3991587162017822, "grad_norm_var": 0.9886539748965065, "learning_rate": 0.0001, "loss": 1.3316, "loss/crossentropy": 2.3047027587890625, "loss/hidden": 1.1328125, "loss/logits": 0.16926254332065582, "loss/reg": 0.0029541929252445698, "step": 1820 }, { "epoch": 0.227625, "grad_norm": 2.27185320854187, "grad_norm_var": 0.9950342111305434, "learning_rate": 0.0001, "loss": 1.0846, "loss/crossentropy": 2.3836841583251953, "loss/hidden": 0.90625, "loss/logits": 0.14881691336631775, "loss/reg": 0.002952732378616929, "step": 1821 }, { "epoch": 0.22775, "grad_norm": 2.8672397136688232, "grad_norm_var": 0.8908872852448818, "learning_rate": 0.0001, "loss": 1.2423, "loss/crossentropy": 2.2023377418518066, "loss/hidden": 0.9921875, "loss/logits": 0.22060072422027588, "loss/reg": 0.0029511942993849516, "step": 1822 }, { "epoch": 0.227875, "grad_norm": 2.2803525924682617, "grad_norm_var": 0.10508732681826037, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.372196912765503, "loss/hidden": 0.8515625, "loss/logits": 0.14933274686336517, "loss/reg": 0.00294972350820899, "step": 1823 }, { "epoch": 0.228, "grad_norm": 2.6830813884735107, "grad_norm_var": 0.10776807926507198, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.475736618041992, "loss/hidden": 0.9296875, "loss/logits": 0.17176774144172668, "loss/reg": 0.002948229666799307, "step": 1824 }, { "epoch": 0.228125, "grad_norm": 2.7077255249023438, "grad_norm_var": 0.1090870968752443, "learning_rate": 0.0001, "loss": 1.1178, "loss/crossentropy": 2.6619277000427246, "loss/hidden": 0.92578125, "loss/logits": 0.16253264248371124, "loss/reg": 0.002946724882349372, "step": 1825 }, { "epoch": 0.22825, "grad_norm": 2.7542712688446045, "grad_norm_var": 0.11901288025463994, "learning_rate": 0.0001, "loss": 1.023, "loss/crossentropy": 2.3113021850585938, "loss/hidden": 0.84375, "loss/logits": 0.14976277947425842, "loss/reg": 0.00294527318328619, "step": 1826 }, { "epoch": 0.228375, "grad_norm": 2.1749322414398193, "grad_norm_var": 0.0947496886136868, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.3755033016204834, "loss/hidden": 0.890625, "loss/logits": 0.1559135913848877, "loss/reg": 0.0029437614139169455, "step": 1827 }, { "epoch": 0.2285, "grad_norm": 2.2916691303253174, "grad_norm_var": 0.08209817483413247, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.409095287322998, "loss/hidden": 1.03125, "loss/logits": 0.17112311720848083, "loss/reg": 0.0029421907383948565, "step": 1828 }, { "epoch": 0.228625, "grad_norm": 3.1720941066741943, "grad_norm_var": 0.11657495418614777, "learning_rate": 0.0001, "loss": 1.0408, "loss/crossentropy": 2.571678400039673, "loss/hidden": 0.85546875, "loss/logits": 0.15594975650310516, "loss/reg": 0.0029407108668237925, "step": 1829 }, { "epoch": 0.22875, "grad_norm": 3.709214448928833, "grad_norm_var": 0.20662170797658444, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.3802380561828613, "loss/hidden": 1.2265625, "loss/logits": 0.18629974126815796, "loss/reg": 0.0029392328578978777, "step": 1830 }, { "epoch": 0.228875, "grad_norm": 2.151437520980835, "grad_norm_var": 0.20150086001236314, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.409843921661377, "loss/hidden": 0.9296875, "loss/logits": 0.17083843052387238, "loss/reg": 0.0029376852326095104, "step": 1831 }, { "epoch": 0.229, "grad_norm": 2.108267307281494, "grad_norm_var": 0.20093753742009024, "learning_rate": 0.0001, "loss": 1.0452, "loss/crossentropy": 2.7673611640930176, "loss/hidden": 0.86328125, "loss/logits": 0.15255650877952576, "loss/reg": 0.0029360908083617687, "step": 1832 }, { "epoch": 0.229125, "grad_norm": 2.295905828475952, "grad_norm_var": 0.19961170535207368, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.4432260990142822, "loss/hidden": 0.93359375, "loss/logits": 0.15079358220100403, "loss/reg": 0.002934559714049101, "step": 1833 }, { "epoch": 0.22925, "grad_norm": 2.0841267108917236, "grad_norm_var": 0.19600194880262786, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.406099319458008, "loss/hidden": 0.91015625, "loss/logits": 0.1619967818260193, "loss/reg": 0.002933042123913765, "step": 1834 }, { "epoch": 0.229375, "grad_norm": 2.2038230895996094, "grad_norm_var": 0.20169366112067166, "learning_rate": 0.0001, "loss": 1.1717, "loss/crossentropy": 2.36592435836792, "loss/hidden": 0.9765625, "loss/logits": 0.1658266931772232, "loss/reg": 0.0029314891435205936, "step": 1835 }, { "epoch": 0.2295, "grad_norm": 4.329677581787109, "grad_norm_var": 0.40617225913744975, "learning_rate": 0.0001, "loss": 1.0861, "loss/crossentropy": 2.428100824356079, "loss/hidden": 0.90625, "loss/logits": 0.15050062537193298, "loss/reg": 0.002929947804659605, "step": 1836 }, { "epoch": 0.229625, "grad_norm": 2.962228536605835, "grad_norm_var": 0.4029608323643475, "learning_rate": 0.0001, "loss": 1.2032, "loss/crossentropy": 2.661451816558838, "loss/hidden": 1.0078125, "loss/logits": 0.166073739528656, "loss/reg": 0.0029283969197422266, "step": 1837 }, { "epoch": 0.22975, "grad_norm": 2.1242473125457764, "grad_norm_var": 0.418270528733823, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.3514444828033447, "loss/hidden": 1.09375, "loss/logits": 0.20345856249332428, "loss/reg": 0.002926844172179699, "step": 1838 }, { "epoch": 0.229875, "grad_norm": 2.6637930870056152, "grad_norm_var": 0.4097338351488264, "learning_rate": 0.0001, "loss": 1.0729, "loss/crossentropy": 2.8255255222320557, "loss/hidden": 0.890625, "loss/logits": 0.15303359925746918, "loss/reg": 0.0029252341482788324, "step": 1839 }, { "epoch": 0.23, "grad_norm": 2.8248424530029297, "grad_norm_var": 0.4115956483187119, "learning_rate": 0.0001, "loss": 1.2172, "loss/crossentropy": 2.730475902557373, "loss/hidden": 0.9921875, "loss/logits": 0.19581902027130127, "loss/reg": 0.0029237025883048773, "step": 1840 }, { "epoch": 0.230125, "grad_norm": 2.1636877059936523, "grad_norm_var": 0.42662438202454495, "learning_rate": 0.0001, "loss": 1.2799, "loss/crossentropy": 2.4928698539733887, "loss/hidden": 1.078125, "loss/logits": 0.1725054681301117, "loss/reg": 0.002922156360000372, "step": 1841 }, { "epoch": 0.23025, "grad_norm": 2.155371904373169, "grad_norm_var": 0.4387901405468398, "learning_rate": 0.0001, "loss": 1.0823, "loss/crossentropy": 2.8904709815979004, "loss/hidden": 0.89453125, "loss/logits": 0.15856406092643738, "loss/reg": 0.002920587779954076, "step": 1842 }, { "epoch": 0.230375, "grad_norm": 2.688028573989868, "grad_norm_var": 0.4269539462286275, "learning_rate": 0.0001, "loss": 0.9496, "loss/crossentropy": 2.479196548461914, "loss/hidden": 0.80859375, "loss/logits": 0.1118479073047638, "loss/reg": 0.0029189754277467728, "step": 1843 }, { "epoch": 0.2305, "grad_norm": 3.4530677795410156, "grad_norm_var": 0.46033235618827817, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.3263473510742188, "loss/hidden": 1.109375, "loss/logits": 0.19483482837677002, "loss/reg": 0.0029173565562814474, "step": 1844 }, { "epoch": 0.230625, "grad_norm": 2.624260425567627, "grad_norm_var": 0.44410306117912624, "learning_rate": 0.0001, "loss": 1.1256, "loss/crossentropy": 2.4004266262054443, "loss/hidden": 0.921875, "loss/logits": 0.1746046245098114, "loss/reg": 0.00291584269143641, "step": 1845 }, { "epoch": 0.23075, "grad_norm": 2.484194755554199, "grad_norm_var": 0.36633673651388654, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.500600814819336, "loss/hidden": 0.96484375, "loss/logits": 0.1624952256679535, "loss/reg": 0.0029144061263650656, "step": 1846 }, { "epoch": 0.230875, "grad_norm": 2.1600430011749268, "grad_norm_var": 0.36584698292128315, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.7809672355651855, "loss/hidden": 0.9140625, "loss/logits": 0.1535949558019638, "loss/reg": 0.00291292741894722, "step": 1847 }, { "epoch": 0.231, "grad_norm": 1.9736144542694092, "grad_norm_var": 0.37550067856469693, "learning_rate": 0.0001, "loss": 1.0361, "loss/crossentropy": 2.4563684463500977, "loss/hidden": 0.86328125, "loss/logits": 0.14372900128364563, "loss/reg": 0.002911404939368367, "step": 1848 }, { "epoch": 0.231125, "grad_norm": 2.2191903591156006, "grad_norm_var": 0.3787174770815568, "learning_rate": 0.0001, "loss": 1.0114, "loss/crossentropy": 2.5433239936828613, "loss/hidden": 0.83984375, "loss/logits": 0.14246006309986115, "loss/reg": 0.002909915754571557, "step": 1849 }, { "epoch": 0.23125, "grad_norm": 2.5526938438415527, "grad_norm_var": 0.3621070968588713, "learning_rate": 0.0001, "loss": 0.9695, "loss/crossentropy": 2.5453128814697266, "loss/hidden": 0.8125, "loss/logits": 0.1279653012752533, "loss/reg": 0.0029083597473800182, "step": 1850 }, { "epoch": 0.231375, "grad_norm": 1.9157322645187378, "grad_norm_var": 0.38247098077205705, "learning_rate": 0.0001, "loss": 0.9632, "loss/crossentropy": 2.626375913619995, "loss/hidden": 0.796875, "loss/logits": 0.13728055357933044, "loss/reg": 0.0029068856965750456, "step": 1851 }, { "epoch": 0.2315, "grad_norm": 2.353100538253784, "grad_norm_var": 0.16577489550661723, "learning_rate": 0.0001, "loss": 1.1287, "loss/crossentropy": 2.479382038116455, "loss/hidden": 0.9296875, "loss/logits": 0.16994883120059967, "loss/reg": 0.0029053473845124245, "step": 1852 }, { "epoch": 0.231625, "grad_norm": 2.6295785903930664, "grad_norm_var": 0.1502992299825194, "learning_rate": 0.0001, "loss": 1.2396, "loss/crossentropy": 2.2818477153778076, "loss/hidden": 1.0390625, "loss/logits": 0.17154169082641602, "loss/reg": 0.002903790445998311, "step": 1853 }, { "epoch": 0.23175, "grad_norm": 2.1487951278686523, "grad_norm_var": 0.14931457999495443, "learning_rate": 0.0001, "loss": 1.0425, "loss/crossentropy": 2.6548938751220703, "loss/hidden": 0.86328125, "loss/logits": 0.15023818612098694, "loss/reg": 0.0029021373484283686, "step": 1854 }, { "epoch": 0.231875, "grad_norm": 2.2083189487457275, "grad_norm_var": 0.14857580667151063, "learning_rate": 0.0001, "loss": 1.0317, "loss/crossentropy": 2.3384876251220703, "loss/hidden": 0.8671875, "loss/logits": 0.13553163409233093, "loss/reg": 0.0029005296528339386, "step": 1855 }, { "epoch": 0.232, "grad_norm": 1.848423957824707, "grad_norm_var": 0.1541103110008151, "learning_rate": 0.0001, "loss": 0.9426, "loss/crossentropy": 2.4241435527801514, "loss/hidden": 0.8046875, "loss/logits": 0.1089288517832756, "loss/reg": 0.002899044193327427, "step": 1856 }, { "epoch": 0.232125, "grad_norm": 2.036503553390503, "grad_norm_var": 0.15825755313067677, "learning_rate": 0.0001, "loss": 0.9461, "loss/crossentropy": 2.3014883995056152, "loss/hidden": 0.78515625, "loss/logits": 0.1319369375705719, "loss/reg": 0.0028974406886845827, "step": 1857 }, { "epoch": 0.23225, "grad_norm": 2.712230682373047, "grad_norm_var": 0.16387938230163232, "learning_rate": 0.0001, "loss": 1.2105, "loss/crossentropy": 2.365384101867676, "loss/hidden": 1.0078125, "loss/logits": 0.17370465397834778, "loss/reg": 0.0028959375340491533, "step": 1858 }, { "epoch": 0.232375, "grad_norm": 2.5608208179473877, "grad_norm_var": 0.15958970126699837, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.623105049133301, "loss/hidden": 0.94140625, "loss/logits": 0.1890484094619751, "loss/reg": 0.002894355682656169, "step": 1859 }, { "epoch": 0.2325, "grad_norm": 1.9694890975952148, "grad_norm_var": 0.08242289833427534, "learning_rate": 0.0001, "loss": 0.8934, "loss/crossentropy": 2.1551883220672607, "loss/hidden": 0.74609375, "loss/logits": 0.11839769035577774, "loss/reg": 0.0028928006067872047, "step": 1860 }, { "epoch": 0.232625, "grad_norm": 2.146641492843628, "grad_norm_var": 0.0744266244705272, "learning_rate": 0.0001, "loss": 1.0166, "loss/crossentropy": 2.694187641143799, "loss/hidden": 0.85546875, "loss/logits": 0.1322515606880188, "loss/reg": 0.002891267416998744, "step": 1861 }, { "epoch": 0.23275, "grad_norm": 2.412816286087036, "grad_norm_var": 0.07246823357878984, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.708153009414673, "loss/hidden": 0.96875, "loss/logits": 0.1843324601650238, "loss/reg": 0.0028897603042423725, "step": 1862 }, { "epoch": 0.232875, "grad_norm": 3.3550024032592773, "grad_norm_var": 0.14889475511776779, "learning_rate": 0.0001, "loss": 1.0168, "loss/crossentropy": 2.240648031234741, "loss/hidden": 0.8671875, "loss/logits": 0.12075857073068619, "loss/reg": 0.0028883127961307764, "step": 1863 }, { "epoch": 0.233, "grad_norm": 2.539043664932251, "grad_norm_var": 0.14312548265110311, "learning_rate": 0.0001, "loss": 0.9847, "loss/crossentropy": 2.3216147422790527, "loss/hidden": 0.828125, "loss/logits": 0.12774166464805603, "loss/reg": 0.0028867912478744984, "step": 1864 }, { "epoch": 0.233125, "grad_norm": 2.358919382095337, "grad_norm_var": 0.14189893172667017, "learning_rate": 0.0001, "loss": 1.0164, "loss/crossentropy": 2.5843939781188965, "loss/hidden": 0.83984375, "loss/logits": 0.14774055778980255, "loss/reg": 0.0028853206895291805, "step": 1865 }, { "epoch": 0.23325, "grad_norm": 2.560542106628418, "grad_norm_var": 0.14210520060771656, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.2003164291381836, "loss/hidden": 1.1015625, "loss/logits": 0.22345313429832458, "loss/reg": 0.002883851993829012, "step": 1866 }, { "epoch": 0.233375, "grad_norm": 2.2669901847839355, "grad_norm_var": 0.12902140426953868, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.550511360168457, "loss/hidden": 0.89453125, "loss/logits": 0.16732323169708252, "loss/reg": 0.0028824072796851397, "step": 1867 }, { "epoch": 0.2335, "grad_norm": 1.9912503957748413, "grad_norm_var": 0.1385847546259417, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.4620280265808105, "loss/hidden": 0.92578125, "loss/logits": 0.15735240280628204, "loss/reg": 0.0028808878269046545, "step": 1868 }, { "epoch": 0.233625, "grad_norm": 2.5993523597717285, "grad_norm_var": 0.13755172432265106, "learning_rate": 0.0001, "loss": 1.4922, "loss/crossentropy": 2.358572244644165, "loss/hidden": 1.2578125, "loss/logits": 0.20562607049942017, "loss/reg": 0.0028794193640351295, "step": 1869 }, { "epoch": 0.23375, "grad_norm": 3.30731463432312, "grad_norm_var": 0.18924561660283715, "learning_rate": 0.0001, "loss": 1.376, "loss/crossentropy": 2.7302675247192383, "loss/hidden": 1.140625, "loss/logits": 0.20656052231788635, "loss/reg": 0.0028778668493032455, "step": 1870 }, { "epoch": 0.233875, "grad_norm": 2.8096041679382324, "grad_norm_var": 0.19410140740735352, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.322666645050049, "loss/hidden": 1.0078125, "loss/logits": 0.17673031985759735, "loss/reg": 0.0028763674199581146, "step": 1871 }, { "epoch": 0.234, "grad_norm": 2.766726493835449, "grad_norm_var": 0.17104518125896953, "learning_rate": 0.0001, "loss": 1.194, "loss/crossentropy": 2.258401393890381, "loss/hidden": 0.99609375, "loss/logits": 0.169195756316185, "loss/reg": 0.0028748363256454468, "step": 1872 }, { "epoch": 0.234125, "grad_norm": 2.1711134910583496, "grad_norm_var": 0.16341771516509496, "learning_rate": 0.0001, "loss": 0.9274, "loss/crossentropy": 2.3150506019592285, "loss/hidden": 0.77734375, "loss/logits": 0.12128670513629913, "loss/reg": 0.002873300574719906, "step": 1873 }, { "epoch": 0.23425, "grad_norm": 2.1112637519836426, "grad_norm_var": 0.17162801880261083, "learning_rate": 0.0001, "loss": 0.9732, "loss/crossentropy": 2.506507396697998, "loss/hidden": 0.8046875, "loss/logits": 0.13975682854652405, "loss/reg": 0.0028718682006001472, "step": 1874 }, { "epoch": 0.234375, "grad_norm": 2.183361053466797, "grad_norm_var": 0.17724180763689687, "learning_rate": 0.0001, "loss": 1.1296, "loss/crossentropy": 2.4888036251068115, "loss/hidden": 0.93359375, "loss/logits": 0.16730833053588867, "loss/reg": 0.002870464464649558, "step": 1875 }, { "epoch": 0.2345, "grad_norm": 2.735436201095581, "grad_norm_var": 0.16260582148087882, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.1528542041778564, "loss/hidden": 0.96484375, "loss/logits": 0.14295265078544617, "loss/reg": 0.002869043732061982, "step": 1876 }, { "epoch": 0.234625, "grad_norm": 3.999509572982788, "grad_norm_var": 0.28500931963959975, "learning_rate": 0.0001, "loss": 1.3381, "loss/crossentropy": 3.042414903640747, "loss/hidden": 1.0703125, "loss/logits": 0.23910346627235413, "loss/reg": 0.0028676455840468407, "step": 1877 }, { "epoch": 0.23475, "grad_norm": 2.6461031436920166, "grad_norm_var": 0.2814837056327926, "learning_rate": 0.0001, "loss": 0.9627, "loss/crossentropy": 2.62857985496521, "loss/hidden": 0.7890625, "loss/logits": 0.14501667022705078, "loss/reg": 0.002866254420951009, "step": 1878 }, { "epoch": 0.234875, "grad_norm": 2.356269359588623, "grad_norm_var": 0.2499569691597026, "learning_rate": 0.0001, "loss": 1.1219, "loss/crossentropy": 2.535068988800049, "loss/hidden": 0.9453125, "loss/logits": 0.14790667593479156, "loss/reg": 0.002864871872588992, "step": 1879 }, { "epoch": 0.235, "grad_norm": 2.321554660797119, "grad_norm_var": 0.25432354819466757, "learning_rate": 0.0001, "loss": 1.0411, "loss/crossentropy": 2.5561270713806152, "loss/hidden": 0.86328125, "loss/logits": 0.14915838837623596, "loss/reg": 0.0028635053895413876, "step": 1880 }, { "epoch": 0.235125, "grad_norm": 2.6328799724578857, "grad_norm_var": 0.2511549738430517, "learning_rate": 0.0001, "loss": 1.2299, "loss/crossentropy": 2.174438238143921, "loss/hidden": 1.015625, "loss/logits": 0.18564572930335999, "loss/reg": 0.0028620159719139338, "step": 1881 }, { "epoch": 0.23525, "grad_norm": 2.7941927909851074, "grad_norm_var": 0.25361177630329473, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.698789596557617, "loss/hidden": 0.87109375, "loss/logits": 0.14195041358470917, "loss/reg": 0.0028606669511646032, "step": 1882 }, { "epoch": 0.235375, "grad_norm": 2.9317786693573, "grad_norm_var": 0.251201000396558, "learning_rate": 0.0001, "loss": 1.222, "loss/crossentropy": 2.6760761737823486, "loss/hidden": 1.015625, "loss/logits": 0.17773698270320892, "loss/reg": 0.0028592266608029604, "step": 1883 }, { "epoch": 0.2355, "grad_norm": 2.170546293258667, "grad_norm_var": 0.23752522799550563, "learning_rate": 0.0001, "loss": 1.0143, "loss/crossentropy": 2.390109062194824, "loss/hidden": 0.83984375, "loss/logits": 0.14584662020206451, "loss/reg": 0.0028577372431755066, "step": 1884 }, { "epoch": 0.235625, "grad_norm": 3.2109017372131348, "grad_norm_var": 0.25607174442198255, "learning_rate": 0.0001, "loss": 1.0177, "loss/crossentropy": 2.848381519317627, "loss/hidden": 0.83203125, "loss/logits": 0.1571502387523651, "loss/reg": 0.0028563509695231915, "step": 1885 }, { "epoch": 0.23575, "grad_norm": 3.3898189067840576, "grad_norm_var": 0.2632133556348774, "learning_rate": 0.0001, "loss": 0.9715, "loss/crossentropy": 2.551968574523926, "loss/hidden": 0.8046875, "loss/logits": 0.13831113278865814, "loss/reg": 0.0028548440895974636, "step": 1886 }, { "epoch": 0.235875, "grad_norm": 2.2735543251037598, "grad_norm_var": 0.27347767108519155, "learning_rate": 0.0001, "loss": 1.0096, "loss/crossentropy": 2.356642484664917, "loss/hidden": 0.84765625, "loss/logits": 0.1334502100944519, "loss/reg": 0.0028533469885587692, "step": 1887 }, { "epoch": 0.236, "grad_norm": 2.8403327465057373, "grad_norm_var": 0.274780906820475, "learning_rate": 0.0001, "loss": 1.1044, "loss/crossentropy": 2.49477481842041, "loss/hidden": 0.89453125, "loss/logits": 0.18135184049606323, "loss/reg": 0.002851872704923153, "step": 1888 }, { "epoch": 0.236125, "grad_norm": 2.4770002365112305, "grad_norm_var": 0.26015786291882914, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.3961706161499023, "loss/hidden": 0.9765625, "loss/logits": 0.14887994527816772, "loss/reg": 0.002850309479981661, "step": 1889 }, { "epoch": 0.23625, "grad_norm": 97.88726806640625, "grad_norm_var": 566.1572677979839, "learning_rate": 0.0001, "loss": 1.2242, "loss/crossentropy": 2.2282721996307373, "loss/hidden": 1.0234375, "loss/logits": 0.17226368188858032, "loss/reg": 0.002848886651918292, "step": 1890 }, { "epoch": 0.236375, "grad_norm": 2.3098387718200684, "grad_norm_var": 566.048741327807, "learning_rate": 0.0001, "loss": 1.0645, "loss/crossentropy": 2.4957542419433594, "loss/hidden": 0.88671875, "loss/logits": 0.14935660362243652, "loss/reg": 0.0028473958373069763, "step": 1891 }, { "epoch": 0.2365, "grad_norm": 3.0709023475646973, "grad_norm_var": 565.7896104746242, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.701103925704956, "loss/hidden": 0.90234375, "loss/logits": 0.15548059344291687, "loss/reg": 0.0028457811567932367, "step": 1892 }, { "epoch": 0.236625, "grad_norm": 4.502190113067627, "grad_norm_var": 565.4898863883287, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.418233871459961, "loss/hidden": 1.265625, "loss/logits": 0.1922321766614914, "loss/reg": 0.002844167174771428, "step": 1893 }, { "epoch": 0.23675, "grad_norm": 3.704207420349121, "grad_norm_var": 564.7003492594727, "learning_rate": 0.0001, "loss": 1.2931, "loss/crossentropy": 2.34171462059021, "loss/hidden": 1.078125, "loss/logits": 0.18653494119644165, "loss/reg": 0.0028426784556359053, "step": 1894 }, { "epoch": 0.236875, "grad_norm": 4.64210319519043, "grad_norm_var": 563.0616126406595, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.5464377403259277, "loss/hidden": 1.140625, "loss/logits": 0.2001732736825943, "loss/reg": 0.0028411895036697388, "step": 1895 }, { "epoch": 0.237, "grad_norm": 2.472402572631836, "grad_norm_var": 562.9297680002472, "learning_rate": 0.0001, "loss": 1.1043, "loss/crossentropy": 2.488330602645874, "loss/hidden": 0.9140625, "loss/logits": 0.1618151217699051, "loss/reg": 0.002839608583599329, "step": 1896 }, { "epoch": 0.237125, "grad_norm": 3.018164873123169, "grad_norm_var": 562.6141740686131, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 2.5816335678100586, "loss/hidden": 0.8671875, "loss/logits": 0.13983488082885742, "loss/reg": 0.0028381715528666973, "step": 1897 }, { "epoch": 0.23725, "grad_norm": 3.750331401824951, "grad_norm_var": 561.8825919502568, "learning_rate": 0.0001, "loss": 1.1906, "loss/crossentropy": 2.758951187133789, "loss/hidden": 0.98828125, "loss/logits": 0.17395856976509094, "loss/reg": 0.002836685860529542, "step": 1898 }, { "epoch": 0.237375, "grad_norm": 2.309964895248413, "grad_norm_var": 562.4132399812779, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.7613322734832764, "loss/hidden": 0.96875, "loss/logits": 0.17939506471157074, "loss/reg": 0.0028351792134344578, "step": 1899 }, { "epoch": 0.2375, "grad_norm": 3.629230499267578, "grad_norm_var": 561.2175971903463, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.351802110671997, "loss/hidden": 0.94921875, "loss/logits": 0.1445481777191162, "loss/reg": 0.002833602949976921, "step": 1900 }, { "epoch": 0.237625, "grad_norm": 2.0922610759735107, "grad_norm_var": 562.1731362143731, "learning_rate": 0.0001, "loss": 0.9687, "loss/crossentropy": 2.6160526275634766, "loss/hidden": 0.80859375, "loss/logits": 0.13180279731750488, "loss/reg": 0.0028320997953414917, "step": 1901 }, { "epoch": 0.23775, "grad_norm": 2.7156622409820557, "grad_norm_var": 562.7079033711701, "learning_rate": 0.0001, "loss": 1.1645, "loss/crossentropy": 2.891047239303589, "loss/hidden": 0.9453125, "loss/logits": 0.19086427986621857, "loss/reg": 0.0028305284213274717, "step": 1902 }, { "epoch": 0.237875, "grad_norm": 2.3164191246032715, "grad_norm_var": 562.6696833086194, "learning_rate": 0.0001, "loss": 1.1205, "loss/crossentropy": 2.3813233375549316, "loss/hidden": 0.9453125, "loss/logits": 0.1469373106956482, "loss/reg": 0.0028289342299103737, "step": 1903 }, { "epoch": 0.238, "grad_norm": 2.695955991744995, "grad_norm_var": 562.7892462486658, "learning_rate": 0.0001, "loss": 0.9631, "loss/crossentropy": 2.7349748611450195, "loss/hidden": 0.80078125, "loss/logits": 0.13401402533054352, "loss/reg": 0.0028272622730582952, "step": 1904 }, { "epoch": 0.238125, "grad_norm": 2.295044422149658, "grad_norm_var": 562.9489527602544, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.499460458755493, "loss/hidden": 0.91796875, "loss/logits": 0.1352241486310959, "loss/reg": 0.0028255698271095753, "step": 1905 }, { "epoch": 0.23825, "grad_norm": 2.244823455810547, "grad_norm_var": 0.6781732251602759, "learning_rate": 0.0001, "loss": 1.0582, "loss/crossentropy": 2.587311029434204, "loss/hidden": 0.8828125, "loss/logits": 0.14716576039791107, "loss/reg": 0.0028240818064659834, "step": 1906 }, { "epoch": 0.238375, "grad_norm": 2.270704507827759, "grad_norm_var": 0.6817949672684023, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.707805633544922, "loss/hidden": 0.93359375, "loss/logits": 0.1773141622543335, "loss/reg": 0.0028225905261933804, "step": 1907 }, { "epoch": 0.2385, "grad_norm": 3.1557998657226562, "grad_norm_var": 0.6832387916335013, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.439981460571289, "loss/hidden": 0.93359375, "loss/logits": 0.15508322417736053, "loss/reg": 0.0028210440650582314, "step": 1908 }, { "epoch": 0.238625, "grad_norm": 2.238996982574463, "grad_norm_var": 0.54658289647939, "learning_rate": 0.0001, "loss": 1.0766, "loss/crossentropy": 2.543613910675049, "loss/hidden": 0.890625, "loss/logits": 0.1577475666999817, "loss/reg": 0.0028194894548505545, "step": 1909 }, { "epoch": 0.23875, "grad_norm": 2.3347530364990234, "grad_norm_var": 0.5072756946952609, "learning_rate": 0.0001, "loss": 1.1956, "loss/crossentropy": 2.391685962677002, "loss/hidden": 1.0, "loss/logits": 0.16745620965957642, "loss/reg": 0.002817926462739706, "step": 1910 }, { "epoch": 0.238875, "grad_norm": 3.038079261779785, "grad_norm_var": 0.26585868434679016, "learning_rate": 0.0001, "loss": 1.1833, "loss/crossentropy": 2.6581690311431885, "loss/hidden": 0.9609375, "loss/logits": 0.19423067569732666, "loss/reg": 0.00281645474024117, "step": 1911 }, { "epoch": 0.239, "grad_norm": 2.801513671875, "grad_norm_var": 0.26434526750178977, "learning_rate": 0.0001, "loss": 1.2958, "loss/crossentropy": 2.2399282455444336, "loss/hidden": 1.0859375, "loss/logits": 0.18169432878494263, "loss/reg": 0.002814988372847438, "step": 1912 }, { "epoch": 0.239125, "grad_norm": 2.835973024368286, "grad_norm_var": 0.2582471639147646, "learning_rate": 0.0001, "loss": 1.2229, "loss/crossentropy": 2.8033790588378906, "loss/hidden": 1.0, "loss/logits": 0.19473232328891754, "loss/reg": 0.0028135550674051046, "step": 1913 }, { "epoch": 0.23925, "grad_norm": 2.284269094467163, "grad_norm_var": 0.18147043790236214, "learning_rate": 0.0001, "loss": 1.0563, "loss/crossentropy": 2.4895052909851074, "loss/hidden": 0.875, "loss/logits": 0.15318460762500763, "loss/reg": 0.0028121541254222393, "step": 1914 }, { "epoch": 0.239375, "grad_norm": 2.1640424728393555, "grad_norm_var": 0.18803017488825446, "learning_rate": 0.0001, "loss": 1.0567, "loss/crossentropy": 2.5001635551452637, "loss/hidden": 0.8828125, "loss/logits": 0.14573311805725098, "loss/reg": 0.0028106593526899815, "step": 1915 }, { "epoch": 0.2395, "grad_norm": 1.9871270656585693, "grad_norm_var": 0.12455762918461702, "learning_rate": 0.0001, "loss": 1.0899, "loss/crossentropy": 2.402726888656616, "loss/hidden": 0.9140625, "loss/logits": 0.14770221710205078, "loss/reg": 0.0028093019500374794, "step": 1916 }, { "epoch": 0.239625, "grad_norm": 1.9124521017074585, "grad_norm_var": 0.13556166178302545, "learning_rate": 0.0001, "loss": 1.0272, "loss/crossentropy": 2.475602388381958, "loss/hidden": 0.8515625, "loss/logits": 0.14752693474292755, "loss/reg": 0.002807790180668235, "step": 1917 }, { "epoch": 0.23975, "grad_norm": 2.6607353687286377, "grad_norm_var": 0.1338465573837538, "learning_rate": 0.0001, "loss": 1.0073, "loss/crossentropy": 2.5274460315704346, "loss/hidden": 0.8359375, "loss/logits": 0.14331699907779694, "loss/reg": 0.0028062344063073397, "step": 1918 }, { "epoch": 0.239875, "grad_norm": 2.725597381591797, "grad_norm_var": 0.13689784558561602, "learning_rate": 0.0001, "loss": 1.3288, "loss/crossentropy": 2.164072275161743, "loss/hidden": 1.109375, "loss/logits": 0.19133153557777405, "loss/reg": 0.002804698422551155, "step": 1919 }, { "epoch": 0.24, "grad_norm": 2.5143990516662598, "grad_norm_var": 0.13367861240944112, "learning_rate": 0.0001, "loss": 1.1068, "loss/crossentropy": 2.6517221927642822, "loss/hidden": 0.91015625, "loss/logits": 0.16858091950416565, "loss/reg": 0.002803155919536948, "step": 1920 }, { "epoch": 0.240125, "grad_norm": 4.250274658203125, "grad_norm_var": 0.32790836135060264, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.756089925765991, "loss/hidden": 0.9375, "loss/logits": 0.2666040062904358, "loss/reg": 0.00280156172811985, "step": 1921 }, { "epoch": 0.24025, "grad_norm": 3.579580307006836, "grad_norm_var": 0.3780541826973238, "learning_rate": 0.0001, "loss": 1.1615, "loss/crossentropy": 2.4776012897491455, "loss/hidden": 0.95703125, "loss/logits": 0.17651261389255524, "loss/reg": 0.002799983136355877, "step": 1922 }, { "epoch": 0.240375, "grad_norm": 2.552597761154175, "grad_norm_var": 0.3679322737687584, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.535444498062134, "loss/hidden": 0.9453125, "loss/logits": 0.1575796902179718, "loss/reg": 0.0027983970940113068, "step": 1923 }, { "epoch": 0.2405, "grad_norm": 2.1624505519866943, "grad_norm_var": 0.3678785758486583, "learning_rate": 0.0001, "loss": 1.2067, "loss/crossentropy": 2.214770555496216, "loss/hidden": 1.0234375, "loss/logits": 0.15530873835086823, "loss/reg": 0.0027969072107225657, "step": 1924 }, { "epoch": 0.240625, "grad_norm": 2.1517112255096436, "grad_norm_var": 0.3728782554598296, "learning_rate": 0.0001, "loss": 1.0171, "loss/crossentropy": 2.520151138305664, "loss/hidden": 0.84765625, "loss/logits": 0.14149996638298035, "loss/reg": 0.002795466920360923, "step": 1925 }, { "epoch": 0.24075, "grad_norm": 2.09903883934021, "grad_norm_var": 0.3853855727658185, "learning_rate": 0.0001, "loss": 1.0477, "loss/crossentropy": 2.3022382259368896, "loss/hidden": 0.875, "loss/logits": 0.14477625489234924, "loss/reg": 0.0027940254658460617, "step": 1926 }, { "epoch": 0.240875, "grad_norm": 2.2137794494628906, "grad_norm_var": 0.3805278519877115, "learning_rate": 0.0001, "loss": 1.0049, "loss/crossentropy": 2.397075653076172, "loss/hidden": 0.84375, "loss/logits": 0.13318374752998352, "loss/reg": 0.00279267062433064, "step": 1927 }, { "epoch": 0.241, "grad_norm": 2.1640615463256836, "grad_norm_var": 0.3850549000224494, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.292894124984741, "loss/hidden": 1.0390625, "loss/logits": 0.17587056756019592, "loss/reg": 0.002791155595332384, "step": 1928 }, { "epoch": 0.241125, "grad_norm": 2.0617549419403076, "grad_norm_var": 0.38950121594236903, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.3114614486694336, "loss/hidden": 0.9140625, "loss/logits": 0.14906063675880432, "loss/reg": 0.0027897644322365522, "step": 1929 }, { "epoch": 0.24125, "grad_norm": 2.2500698566436768, "grad_norm_var": 0.3904109329361792, "learning_rate": 0.0001, "loss": 1.0731, "loss/crossentropy": 2.530860662460327, "loss/hidden": 0.8984375, "loss/logits": 0.14675912261009216, "loss/reg": 0.0027884161099791527, "step": 1930 }, { "epoch": 0.241375, "grad_norm": 3.118149518966675, "grad_norm_var": 0.40894295029893557, "learning_rate": 0.0001, "loss": 1.0974, "loss/crossentropy": 2.5993683338165283, "loss/hidden": 0.90625, "loss/logits": 0.16328555345535278, "loss/reg": 0.0027870717458426952, "step": 1931 }, { "epoch": 0.2415, "grad_norm": 2.231571912765503, "grad_norm_var": 0.39513912896007114, "learning_rate": 0.0001, "loss": 1.0225, "loss/crossentropy": 2.673400402069092, "loss/hidden": 0.85546875, "loss/logits": 0.13916508853435516, "loss/reg": 0.0027857308741658926, "step": 1932 }, { "epoch": 0.241625, "grad_norm": 2.2554473876953125, "grad_norm_var": 0.3737690186064941, "learning_rate": 0.0001, "loss": 1.1379, "loss/crossentropy": 2.538700819015503, "loss/hidden": 0.9453125, "loss/logits": 0.16477283835411072, "loss/reg": 0.002784265670925379, "step": 1933 }, { "epoch": 0.24175, "grad_norm": 2.140296459197998, "grad_norm_var": 0.3838427455168045, "learning_rate": 0.0001, "loss": 1.0912, "loss/crossentropy": 2.588484287261963, "loss/hidden": 0.8984375, "loss/logits": 0.1649562418460846, "loss/reg": 0.002782786963507533, "step": 1934 }, { "epoch": 0.241875, "grad_norm": 2.2381234169006348, "grad_norm_var": 0.38594407304694867, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.5911705493927, "loss/hidden": 0.859375, "loss/logits": 0.13616088032722473, "loss/reg": 0.0027812945190817118, "step": 1935 }, { "epoch": 0.242, "grad_norm": 1.9112108945846558, "grad_norm_var": 0.40744186602944354, "learning_rate": 0.0001, "loss": 0.9393, "loss/crossentropy": 2.5094900131225586, "loss/hidden": 0.78515625, "loss/logits": 0.12632881104946136, "loss/reg": 0.002779774833470583, "step": 1936 }, { "epoch": 0.242125, "grad_norm": 5.132593631744385, "grad_norm_var": 0.6665618029327437, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.5614383220672607, "loss/hidden": 1.0703125, "loss/logits": 0.144596129655838, "loss/reg": 0.002778239781036973, "step": 1937 }, { "epoch": 0.24225, "grad_norm": 3.1231844425201416, "grad_norm_var": 0.6148830410155909, "learning_rate": 0.0001, "loss": 0.8716, "loss/crossentropy": 2.501601457595825, "loss/hidden": 0.734375, "loss/logits": 0.1094457358121872, "loss/reg": 0.002776721026748419, "step": 1938 }, { "epoch": 0.242375, "grad_norm": 2.5960471630096436, "grad_norm_var": 0.6153759718928247, "learning_rate": 0.0001, "loss": 1.1199, "loss/crossentropy": 2.5568716526031494, "loss/hidden": 0.91796875, "loss/logits": 0.17422887682914734, "loss/reg": 0.0027751729357987642, "step": 1939 }, { "epoch": 0.2425, "grad_norm": 2.3119328022003174, "grad_norm_var": 0.6102323306014952, "learning_rate": 0.0001, "loss": 0.9679, "loss/crossentropy": 2.3882250785827637, "loss/hidden": 0.8046875, "loss/logits": 0.13546867668628693, "loss/reg": 0.0027736674528568983, "step": 1940 }, { "epoch": 0.242625, "grad_norm": 4.160861015319824, "grad_norm_var": 0.7692402881847016, "learning_rate": 0.0001, "loss": 1.2181, "loss/crossentropy": 2.2808122634887695, "loss/hidden": 0.984375, "loss/logits": 0.2060256004333496, "loss/reg": 0.0027721913065761328, "step": 1941 }, { "epoch": 0.24275, "grad_norm": 2.8490424156188965, "grad_norm_var": 0.7517497358643694, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.345285415649414, "loss/hidden": 1.03125, "loss/logits": 0.19358152151107788, "loss/reg": 0.0027707451954483986, "step": 1942 }, { "epoch": 0.242875, "grad_norm": 7.469876289367676, "grad_norm_var": 2.157014120724763, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.0654871463775635, "loss/hidden": 1.25, "loss/logits": 0.20259954035282135, "loss/reg": 0.002769321436062455, "step": 1943 }, { "epoch": 0.243, "grad_norm": 2.437960624694824, "grad_norm_var": 2.131142079716708, "learning_rate": 0.0001, "loss": 1.1002, "loss/crossentropy": 2.231019973754883, "loss/hidden": 0.921875, "loss/logits": 0.1506144404411316, "loss/reg": 0.002767904195934534, "step": 1944 }, { "epoch": 0.243125, "grad_norm": 3.1553590297698975, "grad_norm_var": 2.066455279052258, "learning_rate": 0.0001, "loss": 1.1744, "loss/crossentropy": 2.465205669403076, "loss/hidden": 0.95703125, "loss/logits": 0.18965642154216766, "loss/reg": 0.0027665095403790474, "step": 1945 }, { "epoch": 0.24325, "grad_norm": 2.4227399826049805, "grad_norm_var": 2.0490651192590494, "learning_rate": 0.0001, "loss": 1.2199, "loss/crossentropy": 2.8017942905426025, "loss/hidden": 0.9921875, "loss/logits": 0.20010419189929962, "loss/reg": 0.0027651283890008926, "step": 1946 }, { "epoch": 0.243375, "grad_norm": 1.9351319074630737, "grad_norm_var": 2.1332233829394576, "learning_rate": 0.0001, "loss": 1.1051, "loss/crossentropy": 2.385049819946289, "loss/hidden": 0.921875, "loss/logits": 0.15559425950050354, "loss/reg": 0.002763670403510332, "step": 1947 }, { "epoch": 0.2435, "grad_norm": 2.439415454864502, "grad_norm_var": 2.1139850344569364, "learning_rate": 0.0001, "loss": 1.0697, "loss/crossentropy": 2.5390353202819824, "loss/hidden": 0.859375, "loss/logits": 0.18273773789405823, "loss/reg": 0.002762230345979333, "step": 1948 }, { "epoch": 0.243625, "grad_norm": 3.9348716735839844, "grad_norm_var": 2.1154351813563985, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.544994354248047, "loss/hidden": 1.2265625, "loss/logits": 0.27432870864868164, "loss/reg": 0.0027608247473835945, "step": 1949 }, { "epoch": 0.24375, "grad_norm": 2.6699066162109375, "grad_norm_var": 2.0622895626261593, "learning_rate": 0.0001, "loss": 1.1582, "loss/crossentropy": 2.3712096214294434, "loss/hidden": 0.9609375, "loss/logits": 0.16963256895542145, "loss/reg": 0.002759524155408144, "step": 1950 }, { "epoch": 0.243875, "grad_norm": 2.510438919067383, "grad_norm_var": 2.032934141151713, "learning_rate": 0.0001, "loss": 1.1192, "loss/crossentropy": 2.6832380294799805, "loss/hidden": 0.9140625, "loss/logits": 0.17755383253097534, "loss/reg": 0.002758244751021266, "step": 1951 }, { "epoch": 0.244, "grad_norm": 2.703141689300537, "grad_norm_var": 1.9369671914291076, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.1617352962493896, "loss/hidden": 1.0625, "loss/logits": 0.1934407502412796, "loss/reg": 0.002756967907771468, "step": 1952 }, { "epoch": 0.244125, "grad_norm": 2.3727617263793945, "grad_norm_var": 1.7168647286460454, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.636251211166382, "loss/hidden": 0.9765625, "loss/logits": 0.19042374193668365, "loss/reg": 0.0027555141132324934, "step": 1953 }, { "epoch": 0.24425, "grad_norm": 2.1022789478302, "grad_norm_var": 1.774533228862542, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.517247200012207, "loss/hidden": 0.85546875, "loss/logits": 0.13039694726467133, "loss/reg": 0.0027542109601199627, "step": 1954 }, { "epoch": 0.244375, "grad_norm": 2.1997644901275635, "grad_norm_var": 1.8059291585278145, "learning_rate": 0.0001, "loss": 1.0473, "loss/crossentropy": 2.655820608139038, "loss/hidden": 0.87109375, "loss/logits": 0.1486739218235016, "loss/reg": 0.002752919914200902, "step": 1955 }, { "epoch": 0.2445, "grad_norm": 2.6467058658599854, "grad_norm_var": 1.7831262007346442, "learning_rate": 0.0001, "loss": 1.3302, "loss/crossentropy": 2.114410400390625, "loss/hidden": 1.125, "loss/logits": 0.17764800786972046, "loss/reg": 0.0027516759000718594, "step": 1956 }, { "epoch": 0.244625, "grad_norm": 2.7396280765533447, "grad_norm_var": 1.68951109645124, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.5355465412139893, "loss/hidden": 0.921875, "loss/logits": 0.16403117775917053, "loss/reg": 0.002750229090452194, "step": 1957 }, { "epoch": 0.24475, "grad_norm": 2.410583019256592, "grad_norm_var": 1.7051962159964043, "learning_rate": 0.0001, "loss": 1.1862, "loss/crossentropy": 2.0772080421447754, "loss/hidden": 1.015625, "loss/logits": 0.1430494487285614, "loss/reg": 0.0027487878687679768, "step": 1958 }, { "epoch": 0.244875, "grad_norm": 2.3453474044799805, "grad_norm_var": 0.21338224169162548, "learning_rate": 0.0001, "loss": 0.9842, "loss/crossentropy": 2.786540985107422, "loss/hidden": 0.80859375, "loss/logits": 0.14812374114990234, "loss/reg": 0.002747328719124198, "step": 1959 }, { "epoch": 0.245, "grad_norm": 3.010045051574707, "grad_norm_var": 0.22421355318186417, "learning_rate": 0.0001, "loss": 1.2039, "loss/crossentropy": 2.6426658630371094, "loss/hidden": 0.953125, "loss/logits": 0.22334754467010498, "loss/reg": 0.0027458607219159603, "step": 1960 }, { "epoch": 0.245125, "grad_norm": 2.431356191635132, "grad_norm_var": 0.20335259794886623, "learning_rate": 0.0001, "loss": 1.0896, "loss/crossentropy": 2.223994255065918, "loss/hidden": 0.921875, "loss/logits": 0.14024955034255981, "loss/reg": 0.0027444439474493265, "step": 1961 }, { "epoch": 0.24525, "grad_norm": 2.264490842819214, "grad_norm_var": 0.20770068539455805, "learning_rate": 0.0001, "loss": 0.9975, "loss/crossentropy": 2.5207021236419678, "loss/hidden": 0.83203125, "loss/logits": 0.13799090683460236, "loss/reg": 0.0027429983019828796, "step": 1962 }, { "epoch": 0.245375, "grad_norm": 2.420308828353882, "grad_norm_var": 0.18297715933091632, "learning_rate": 0.0001, "loss": 1.2007, "loss/crossentropy": 2.4860877990722656, "loss/hidden": 0.9921875, "loss/logits": 0.1810786873102188, "loss/reg": 0.00274151680059731, "step": 1963 }, { "epoch": 0.2455, "grad_norm": 2.414461851119995, "grad_norm_var": 0.18346740397452094, "learning_rate": 0.0001, "loss": 1.071, "loss/crossentropy": 2.4090845584869385, "loss/hidden": 0.90234375, "loss/logits": 0.14120522141456604, "loss/reg": 0.0027401153929531574, "step": 1964 }, { "epoch": 0.245625, "grad_norm": 2.4071054458618164, "grad_norm_var": 0.05203356240904213, "learning_rate": 0.0001, "loss": 1.0379, "loss/crossentropy": 2.6016945838928223, "loss/hidden": 0.86328125, "loss/logits": 0.14726971089839935, "loss/reg": 0.0027385957073420286, "step": 1965 }, { "epoch": 0.24575, "grad_norm": 2.428882360458374, "grad_norm_var": 0.04949778844412904, "learning_rate": 0.0001, "loss": 0.9793, "loss/crossentropy": 2.6819698810577393, "loss/hidden": 0.8203125, "loss/logits": 0.13164296746253967, "loss/reg": 0.002737129107117653, "step": 1966 }, { "epoch": 0.245875, "grad_norm": 2.2721593379974365, "grad_norm_var": 0.05153780887834784, "learning_rate": 0.0001, "loss": 1.0456, "loss/crossentropy": 2.2785394191741943, "loss/hidden": 0.86328125, "loss/logits": 0.1549597829580307, "loss/reg": 0.002735583111643791, "step": 1967 }, { "epoch": 0.246, "grad_norm": 2.5156705379486084, "grad_norm_var": 0.04735843285122859, "learning_rate": 0.0001, "loss": 1.1212, "loss/crossentropy": 2.4128031730651855, "loss/hidden": 0.9375, "loss/logits": 0.15634964406490326, "loss/reg": 0.002734163776040077, "step": 1968 }, { "epoch": 0.246125, "grad_norm": 2.315765380859375, "grad_norm_var": 0.04804468545032871, "learning_rate": 0.0001, "loss": 1.1106, "loss/crossentropy": 2.5273008346557617, "loss/hidden": 0.93359375, "loss/logits": 0.1496410369873047, "loss/reg": 0.0027327670250087976, "step": 1969 }, { "epoch": 0.24625, "grad_norm": 3.655541181564331, "grad_norm_var": 0.13038539827467327, "learning_rate": 0.0001, "loss": 1.1545, "loss/crossentropy": 2.816701889038086, "loss/hidden": 0.9453125, "loss/logits": 0.18190214037895203, "loss/reg": 0.002731376327574253, "step": 1970 }, { "epoch": 0.246375, "grad_norm": 2.0330469608306885, "grad_norm_var": 0.13946034117999087, "learning_rate": 0.0001, "loss": 0.9543, "loss/crossentropy": 2.5322234630584717, "loss/hidden": 0.8046875, "loss/logits": 0.12230876088142395, "loss/reg": 0.0027299553621560335, "step": 1971 }, { "epoch": 0.2465, "grad_norm": 2.3114256858825684, "grad_norm_var": 0.1407970077955942, "learning_rate": 0.0001, "loss": 1.0766, "loss/crossentropy": 2.403012752532959, "loss/hidden": 0.90625, "loss/logits": 0.14305217564105988, "loss/reg": 0.0027285972610116005, "step": 1972 }, { "epoch": 0.246625, "grad_norm": 2.872380256652832, "grad_norm_var": 0.14616669234115964, "learning_rate": 0.0001, "loss": 1.0601, "loss/crossentropy": 2.620347261428833, "loss/hidden": 0.875, "loss/logits": 0.15787017345428467, "loss/reg": 0.0027271404396742582, "step": 1973 }, { "epoch": 0.24675, "grad_norm": 2.204484462738037, "grad_norm_var": 0.15146511044817218, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.5610415935516357, "loss/hidden": 0.828125, "loss/logits": 0.15118342638015747, "loss/reg": 0.002725655445829034, "step": 1974 }, { "epoch": 0.246875, "grad_norm": 2.2441866397857666, "grad_norm_var": 0.15410845728410152, "learning_rate": 0.0001, "loss": 0.9559, "loss/crossentropy": 2.513578414916992, "loss/hidden": 0.8046875, "loss/logits": 0.12393586337566376, "loss/reg": 0.002724139718338847, "step": 1975 }, { "epoch": 0.247, "grad_norm": 2.8840277194976807, "grad_norm_var": 0.14632239260073235, "learning_rate": 0.0001, "loss": 1.0875, "loss/crossentropy": 2.491835832595825, "loss/hidden": 0.90625, "loss/logits": 0.15406697988510132, "loss/reg": 0.0027226670645177364, "step": 1976 }, { "epoch": 0.247125, "grad_norm": 2.744213342666626, "grad_norm_var": 0.15042299567527168, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.233274459838867, "loss/hidden": 1.0859375, "loss/logits": 0.18220359086990356, "loss/reg": 0.0027212114073336124, "step": 1977 }, { "epoch": 0.24725, "grad_norm": 2.3532700538635254, "grad_norm_var": 0.1481365956517531, "learning_rate": 0.0001, "loss": 1.1782, "loss/crossentropy": 2.70269775390625, "loss/hidden": 0.96484375, "loss/logits": 0.18613766133785248, "loss/reg": 0.002719811163842678, "step": 1978 }, { "epoch": 0.247375, "grad_norm": 2.016599416732788, "grad_norm_var": 0.16287134788210172, "learning_rate": 0.0001, "loss": 1.0368, "loss/crossentropy": 2.7286431789398193, "loss/hidden": 0.859375, "loss/logits": 0.1502460092306137, "loss/reg": 0.0027184404898434877, "step": 1979 }, { "epoch": 0.2475, "grad_norm": 2.468029499053955, "grad_norm_var": 0.16258562087950257, "learning_rate": 0.0001, "loss": 1.0856, "loss/crossentropy": 2.4531679153442383, "loss/hidden": 0.9140625, "loss/logits": 0.14435534179210663, "loss/reg": 0.0027170274406671524, "step": 1980 }, { "epoch": 0.247625, "grad_norm": 2.8880743980407715, "grad_norm_var": 0.1721816167867452, "learning_rate": 0.0001, "loss": 1.0137, "loss/crossentropy": 2.8085103034973145, "loss/hidden": 0.8203125, "loss/logits": 0.16619496047496796, "loss/reg": 0.0027155885472893715, "step": 1981 }, { "epoch": 0.24775, "grad_norm": 2.2269508838653564, "grad_norm_var": 0.1769945282356974, "learning_rate": 0.0001, "loss": 1.0995, "loss/crossentropy": 2.7113091945648193, "loss/hidden": 0.92578125, "loss/logits": 0.1465437412261963, "loss/reg": 0.0027142076287418604, "step": 1982 }, { "epoch": 0.247875, "grad_norm": 2.3174233436584473, "grad_norm_var": 0.17574531851225003, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 1.9189180135726929, "loss/hidden": 1.1953125, "loss/logits": 0.22766916453838348, "loss/reg": 0.0027127759531140327, "step": 1983 }, { "epoch": 0.248, "grad_norm": 1.7954802513122559, "grad_norm_var": 0.20696429693966606, "learning_rate": 0.0001, "loss": 1.0248, "loss/crossentropy": 2.484433174133301, "loss/hidden": 0.859375, "loss/logits": 0.1383214145898819, "loss/reg": 0.0027115046977996826, "step": 1984 }, { "epoch": 0.248125, "grad_norm": 1.98469078540802, "grad_norm_var": 0.22010164823287107, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.355754852294922, "loss/hidden": 0.875, "loss/logits": 0.16146710515022278, "loss/reg": 0.0027100895531475544, "step": 1985 }, { "epoch": 0.24825, "grad_norm": 3.0588693618774414, "grad_norm_var": 0.14544907650537522, "learning_rate": 0.0001, "loss": 1.2953, "loss/crossentropy": 2.55202317237854, "loss/hidden": 1.0625, "loss/logits": 0.20571765303611755, "loss/reg": 0.002708751941099763, "step": 1986 }, { "epoch": 0.248375, "grad_norm": 1.9647020101547241, "grad_norm_var": 0.14908673013686075, "learning_rate": 0.0001, "loss": 1.0645, "loss/crossentropy": 2.5482850074768066, "loss/hidden": 0.8828125, "loss/logits": 0.15463218092918396, "loss/reg": 0.0027072790544480085, "step": 1987 }, { "epoch": 0.2485, "grad_norm": 2.367072582244873, "grad_norm_var": 0.1486533124992943, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4881367683410645, "loss/hidden": 0.90625, "loss/logits": 0.15367110073566437, "loss/reg": 0.0027056580875068903, "step": 1988 }, { "epoch": 0.248625, "grad_norm": 2.4453704357147217, "grad_norm_var": 0.1331206329775275, "learning_rate": 0.0001, "loss": 1.0873, "loss/crossentropy": 2.556847333908081, "loss/hidden": 0.88671875, "loss/logits": 0.17354083061218262, "loss/reg": 0.0027040427085012197, "step": 1989 }, { "epoch": 0.24875, "grad_norm": 9.09829044342041, "grad_norm_var": 2.9487722333659785, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.4852135181427, "loss/hidden": 1.09375, "loss/logits": 0.16272324323654175, "loss/reg": 0.0027026128955185413, "step": 1990 }, { "epoch": 0.248875, "grad_norm": 2.424456834793091, "grad_norm_var": 2.9373577672795688, "learning_rate": 0.0001, "loss": 1.0357, "loss/crossentropy": 2.4856038093566895, "loss/hidden": 0.8515625, "loss/logits": 0.15710175037384033, "loss/reg": 0.0027011926285922527, "step": 1991 }, { "epoch": 0.249, "grad_norm": 2.624159336090088, "grad_norm_var": 2.939181373576417, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.166581869125366, "loss/hidden": 1.109375, "loss/logits": 0.19837325811386108, "loss/reg": 0.0026996470987796783, "step": 1992 }, { "epoch": 0.249125, "grad_norm": 2.957742691040039, "grad_norm_var": 2.9404825335519735, "learning_rate": 0.0001, "loss": 1.3991, "loss/crossentropy": 2.6836252212524414, "loss/hidden": 1.140625, "loss/logits": 0.23152483999729156, "loss/reg": 0.002698224736377597, "step": 1993 }, { "epoch": 0.24925, "grad_norm": 2.1770012378692627, "grad_norm_var": 2.9532045555307374, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.48860239982605, "loss/hidden": 0.9296875, "loss/logits": 0.14239296317100525, "loss/reg": 0.002696766285225749, "step": 1994 }, { "epoch": 0.249375, "grad_norm": 2.590374708175659, "grad_norm_var": 2.9137765910812217, "learning_rate": 0.0001, "loss": 1.1914, "loss/crossentropy": 2.4991402626037598, "loss/hidden": 0.98828125, "loss/logits": 0.17614206671714783, "loss/reg": 0.0026953339111059904, "step": 1995 }, { "epoch": 0.2495, "grad_norm": 2.9202098846435547, "grad_norm_var": 2.904322765602717, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.286001682281494, "loss/hidden": 0.9453125, "loss/logits": 0.1659601628780365, "loss/reg": 0.0026938265655189753, "step": 1996 }, { "epoch": 0.249625, "grad_norm": 2.133800745010376, "grad_norm_var": 2.93756568739632, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.492587089538574, "loss/hidden": 0.91015625, "loss/logits": 0.17543631792068481, "loss/reg": 0.002692408859729767, "step": 1997 }, { "epoch": 0.24975, "grad_norm": 2.7813234329223633, "grad_norm_var": 2.9130920460482055, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.636305332183838, "loss/hidden": 0.921875, "loss/logits": 0.15036045014858246, "loss/reg": 0.0026909259613603354, "step": 1998 }, { "epoch": 0.249875, "grad_norm": 2.2112627029418945, "grad_norm_var": 2.9213711600102763, "learning_rate": 0.0001, "loss": 0.885, "loss/crossentropy": 2.672691822052002, "loss/hidden": 0.73828125, "loss/logits": 0.11980107426643372, "loss/reg": 0.002689523156732321, "step": 1999 }, { "epoch": 0.25, "grad_norm": 2.095968246459961, "grad_norm_var": 2.8849283178664864, "learning_rate": 0.0001, "loss": 0.9991, "loss/crossentropy": 2.556314706802368, "loss/hidden": 0.83203125, "loss/logits": 0.14016053080558777, "loss/reg": 0.002688055392354727, "step": 2000 }, { "epoch": 0.250125, "grad_norm": 2.8378489017486572, "grad_norm_var": 2.8303151286963772, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.3531494140625, "loss/hidden": 1.046875, "loss/logits": 0.16997525095939636, "loss/reg": 0.002686618361622095, "step": 2001 }, { "epoch": 0.25025, "grad_norm": 35.951927185058594, "grad_norm_var": 71.07008565728943, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.5978991985321045, "loss/hidden": 1.0859375, "loss/logits": 0.1967199146747589, "loss/reg": 0.002685235347598791, "step": 2002 }, { "epoch": 0.250375, "grad_norm": 2.5324807167053223, "grad_norm_var": 70.86243034736204, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.450153350830078, "loss/hidden": 0.921875, "loss/logits": 0.150313138961792, "loss/reg": 0.002683886792510748, "step": 2003 }, { "epoch": 0.2505, "grad_norm": 2.555966854095459, "grad_norm_var": 70.7981127580626, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.4010512828826904, "loss/hidden": 0.91015625, "loss/logits": 0.14801207184791565, "loss/reg": 0.0026825855020433664, "step": 2004 }, { "epoch": 0.250625, "grad_norm": 2.158355236053467, "grad_norm_var": 70.90183256821422, "learning_rate": 0.0001, "loss": 1.0294, "loss/crossentropy": 2.510014295578003, "loss/hidden": 0.8515625, "loss/logits": 0.15103401243686676, "loss/reg": 0.002681274898350239, "step": 2005 }, { "epoch": 0.25075, "grad_norm": 3.438277244567871, "grad_norm_var": 69.81362999028246, "learning_rate": 0.0001, "loss": 1.3826, "loss/crossentropy": 2.1209561824798584, "loss/hidden": 1.09375, "loss/logits": 0.2620807886123657, "loss/reg": 0.0026799608021974564, "step": 2006 }, { "epoch": 0.250875, "grad_norm": 5.347130298614502, "grad_norm_var": 69.48045019313803, "learning_rate": 0.0001, "loss": 1.097, "loss/crossentropy": 2.780998945236206, "loss/hidden": 0.9140625, "loss/logits": 0.15616188943386078, "loss/reg": 0.00267866812646389, "step": 2007 }, { "epoch": 0.251, "grad_norm": 2.695228338241577, "grad_norm_var": 69.45984358048393, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.1116249561309814, "loss/hidden": 1.046875, "loss/logits": 0.19320815801620483, "loss/reg": 0.0026772518176585436, "step": 2008 }, { "epoch": 0.251125, "grad_norm": 2.227726936340332, "grad_norm_var": 69.67602639162038, "learning_rate": 0.0001, "loss": 1.1164, "loss/crossentropy": 2.28704833984375, "loss/hidden": 0.9296875, "loss/logits": 0.15991178154945374, "loss/reg": 0.0026759705506265163, "step": 2009 }, { "epoch": 0.25125, "grad_norm": 2.8131332397460938, "grad_norm_var": 69.47961070787858, "learning_rate": 0.0001, "loss": 1.1738, "loss/crossentropy": 2.749027967453003, "loss/hidden": 0.953125, "loss/logits": 0.19389693439006805, "loss/reg": 0.0026746727526187897, "step": 2010 }, { "epoch": 0.251375, "grad_norm": 2.2205655574798584, "grad_norm_var": 69.59862326803298, "learning_rate": 0.0001, "loss": 1.0328, "loss/crossentropy": 2.382692337036133, "loss/hidden": 0.85546875, "loss/logits": 0.15056103467941284, "loss/reg": 0.002673567971214652, "step": 2011 }, { "epoch": 0.2515, "grad_norm": 2.027740001678467, "grad_norm_var": 69.87299358465654, "learning_rate": 0.0001, "loss": 1.0238, "loss/crossentropy": 2.3822014331817627, "loss/hidden": 0.84765625, "loss/logits": 0.14942201972007751, "loss/reg": 0.002672403585165739, "step": 2012 }, { "epoch": 0.251625, "grad_norm": 2.343120813369751, "grad_norm_var": 69.80266548987866, "learning_rate": 0.0001, "loss": 0.9842, "loss/crossentropy": 2.560854911804199, "loss/hidden": 0.82421875, "loss/logits": 0.1332956850528717, "loss/reg": 0.002671280177310109, "step": 2013 }, { "epoch": 0.25175, "grad_norm": 2.24985408782959, "grad_norm_var": 69.96087904595237, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.482980489730835, "loss/hidden": 0.96875, "loss/logits": 0.15319132804870605, "loss/reg": 0.0026701646856963634, "step": 2014 }, { "epoch": 0.251875, "grad_norm": 3.0031232833862305, "grad_norm_var": 69.73396196846653, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.248093843460083, "loss/hidden": 0.9921875, "loss/logits": 0.15582841634750366, "loss/reg": 0.0026687337085604668, "step": 2015 }, { "epoch": 0.252, "grad_norm": 2.3424453735351562, "grad_norm_var": 69.64951402336413, "learning_rate": 0.0001, "loss": 0.9724, "loss/crossentropy": 2.3156840801239014, "loss/hidden": 0.8203125, "loss/logits": 0.12541036307811737, "loss/reg": 0.0026673167012631893, "step": 2016 }, { "epoch": 0.252125, "grad_norm": 2.5692121982574463, "grad_norm_var": 69.72418185683372, "learning_rate": 0.0001, "loss": 1.1101, "loss/crossentropy": 2.2988123893737793, "loss/hidden": 0.92578125, "loss/logits": 0.1576610803604126, "loss/reg": 0.002665894338861108, "step": 2017 }, { "epoch": 0.25225, "grad_norm": 2.1914291381835938, "grad_norm_var": 0.6415342136157951, "learning_rate": 0.0001, "loss": 1.0365, "loss/crossentropy": 2.31876540184021, "loss/hidden": 0.87890625, "loss/logits": 0.13094455003738403, "loss/reg": 0.0026644866447895765, "step": 2018 }, { "epoch": 0.252375, "grad_norm": 2.36063289642334, "grad_norm_var": 0.6465248984391565, "learning_rate": 0.0001, "loss": 0.9872, "loss/crossentropy": 2.2804954051971436, "loss/hidden": 0.82421875, "loss/logits": 0.13631156086921692, "loss/reg": 0.0026632039807736874, "step": 2019 }, { "epoch": 0.2525, "grad_norm": 2.238553762435913, "grad_norm_var": 0.6571822282014294, "learning_rate": 0.0001, "loss": 1.1813, "loss/crossentropy": 2.1497645378112793, "loss/hidden": 0.98828125, "loss/logits": 0.1663762480020523, "loss/reg": 0.0026618337724357843, "step": 2020 }, { "epoch": 0.252625, "grad_norm": 3.0800483226776123, "grad_norm_var": 0.6511900980613661, "learning_rate": 0.0001, "loss": 1.0778, "loss/crossentropy": 2.7136666774749756, "loss/hidden": 0.88671875, "loss/logits": 0.16449707746505737, "loss/reg": 0.002660271944478154, "step": 2021 }, { "epoch": 0.25275, "grad_norm": 2.2444255352020264, "grad_norm_var": 0.6222359448420981, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.7323660850524902, "loss/hidden": 0.921875, "loss/logits": 0.17298623919487, "loss/reg": 0.0026587177999317646, "step": 2022 }, { "epoch": 0.252875, "grad_norm": 1.7222685813903809, "grad_norm_var": 0.12643786777779625, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.4022305011749268, "loss/hidden": 0.84765625, "loss/logits": 0.13815155625343323, "loss/reg": 0.0026573152281343937, "step": 2023 }, { "epoch": 0.253, "grad_norm": 4.3202595710754395, "grad_norm_var": 0.3564052405205416, "learning_rate": 0.0001, "loss": 1.2586, "loss/crossentropy": 2.6477274894714355, "loss/hidden": 1.078125, "loss/logits": 0.15386849641799927, "loss/reg": 0.0026557561941444874, "step": 2024 }, { "epoch": 0.253125, "grad_norm": 2.1517889499664307, "grad_norm_var": 0.3594936657737698, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.3960375785827637, "loss/hidden": 0.94921875, "loss/logits": 0.16935834288597107, "loss/reg": 0.002654144773259759, "step": 2025 }, { "epoch": 0.25325, "grad_norm": 2.0134212970733643, "grad_norm_var": 0.3652669798631668, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.356727361679077, "loss/hidden": 0.86328125, "loss/logits": 0.1451396942138672, "loss/reg": 0.0026525380089879036, "step": 2026 }, { "epoch": 0.253375, "grad_norm": 3.6036977767944336, "grad_norm_var": 0.44391707836797967, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.594559907913208, "loss/hidden": 0.88671875, "loss/logits": 0.1561770886182785, "loss/reg": 0.0026508504524827003, "step": 2027 }, { "epoch": 0.2535, "grad_norm": 2.9412989616394043, "grad_norm_var": 0.4350366049595718, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.793983221054077, "loss/hidden": 1.046875, "loss/logits": 0.2053903192281723, "loss/reg": 0.0026491335593163967, "step": 2028 }, { "epoch": 0.253625, "grad_norm": 3.288369655609131, "grad_norm_var": 0.46027253386829065, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.01153302192688, "loss/hidden": 1.2265625, "loss/logits": 0.21880899369716644, "loss/reg": 0.0026477184146642685, "step": 2029 }, { "epoch": 0.25375, "grad_norm": 2.63790225982666, "grad_norm_var": 0.44923643513780764, "learning_rate": 0.0001, "loss": 1.1276, "loss/crossentropy": 2.834561824798584, "loss/hidden": 0.91796875, "loss/logits": 0.18317309021949768, "loss/reg": 0.0026463211979717016, "step": 2030 }, { "epoch": 0.253875, "grad_norm": 2.3154261112213135, "grad_norm_var": 0.44818559844539413, "learning_rate": 0.0001, "loss": 1.0854, "loss/crossentropy": 2.5982468128204346, "loss/hidden": 0.8984375, "loss/logits": 0.16055455803871155, "loss/reg": 0.002644946100190282, "step": 2031 }, { "epoch": 0.254, "grad_norm": 2.049203872680664, "grad_norm_var": 0.4646593333388881, "learning_rate": 0.0001, "loss": 0.9727, "loss/crossentropy": 2.64897084236145, "loss/hidden": 0.8046875, "loss/logits": 0.141591876745224, "loss/reg": 0.002643442479893565, "step": 2032 }, { "epoch": 0.254125, "grad_norm": 4.817141056060791, "grad_norm_var": 0.768858858264782, "learning_rate": 0.0001, "loss": 1.3602, "loss/crossentropy": 2.678563117980957, "loss/hidden": 1.15625, "loss/logits": 0.17750373482704163, "loss/reg": 0.002642034785822034, "step": 2033 }, { "epoch": 0.25425, "grad_norm": 2.1885433197021484, "grad_norm_var": 0.7690737229662309, "learning_rate": 0.0001, "loss": 1.1656, "loss/crossentropy": 2.2676990032196045, "loss/hidden": 0.96875, "loss/logits": 0.17040611803531647, "loss/reg": 0.0026406804099678993, "step": 2034 }, { "epoch": 0.254375, "grad_norm": 2.9940383434295654, "grad_norm_var": 0.7614078557697835, "learning_rate": 0.0001, "loss": 1.2163, "loss/crossentropy": 2.45936918258667, "loss/hidden": 0.98046875, "loss/logits": 0.20945608615875244, "loss/reg": 0.002639343962073326, "step": 2035 }, { "epoch": 0.2545, "grad_norm": 3.281536102294922, "grad_norm_var": 0.753001768997, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.4623613357543945, "loss/hidden": 0.92578125, "loss/logits": 0.16123171150684357, "loss/reg": 0.0026379218325018883, "step": 2036 }, { "epoch": 0.254625, "grad_norm": 2.994741201400757, "grad_norm_var": 0.7508750624375343, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.2919602394104004, "loss/hidden": 0.93359375, "loss/logits": 0.14528794586658478, "loss/reg": 0.002636397024616599, "step": 2037 }, { "epoch": 0.25475, "grad_norm": 1.9883980751037598, "grad_norm_var": 0.775567758803898, "learning_rate": 0.0001, "loss": 1.0518, "loss/crossentropy": 2.352290153503418, "loss/hidden": 0.87109375, "loss/logits": 0.15439707040786743, "loss/reg": 0.0026349930558353662, "step": 2038 }, { "epoch": 0.254875, "grad_norm": 4.3199005126953125, "grad_norm_var": 0.8130278117022136, "learning_rate": 0.0001, "loss": 1.1397, "loss/crossentropy": 3.026923179626465, "loss/hidden": 0.96875, "loss/logits": 0.144575297832489, "loss/reg": 0.002633505268022418, "step": 2039 }, { "epoch": 0.255, "grad_norm": 1.8793284893035889, "grad_norm_var": 0.7538046611751156, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.404716968536377, "loss/hidden": 0.8359375, "loss/logits": 0.1405124068260193, "loss/reg": 0.002631993032991886, "step": 2040 }, { "epoch": 0.255125, "grad_norm": 2.325596570968628, "grad_norm_var": 0.7397080583422536, "learning_rate": 0.0001, "loss": 1.0558, "loss/crossentropy": 2.6889920234680176, "loss/hidden": 0.87890625, "loss/logits": 0.15057046711444855, "loss/reg": 0.002630598144605756, "step": 2041 }, { "epoch": 0.25525, "grad_norm": 2.7172155380249023, "grad_norm_var": 0.6919359946974526, "learning_rate": 0.0001, "loss": 1.1571, "loss/crossentropy": 2.1573784351348877, "loss/hidden": 0.97265625, "loss/logits": 0.15813103318214417, "loss/reg": 0.002629149006679654, "step": 2042 }, { "epoch": 0.255375, "grad_norm": 1.9764583110809326, "grad_norm_var": 0.7039703717809858, "learning_rate": 0.0001, "loss": 0.9286, "loss/crossentropy": 2.513575792312622, "loss/hidden": 0.78515625, "loss/logits": 0.11714302748441696, "loss/reg": 0.0026277226861566305, "step": 2043 }, { "epoch": 0.2555, "grad_norm": 2.172755002975464, "grad_norm_var": 0.7258635933086981, "learning_rate": 0.0001, "loss": 0.943, "loss/crossentropy": 2.66379451751709, "loss/hidden": 0.80078125, "loss/logits": 0.11595633625984192, "loss/reg": 0.0026261925231665373, "step": 2044 }, { "epoch": 0.255625, "grad_norm": 4.804805755615234, "grad_norm_var": 0.979116393628868, "learning_rate": 0.0001, "loss": 1.5802, "loss/crossentropy": 2.036736011505127, "loss/hidden": 1.3359375, "loss/logits": 0.21801407635211945, "loss/reg": 0.0026246763300150633, "step": 2045 }, { "epoch": 0.25575, "grad_norm": 2.6155989170074463, "grad_norm_var": 0.9797527506428044, "learning_rate": 0.0001, "loss": 1.2781, "loss/crossentropy": 2.222402572631836, "loss/hidden": 1.0859375, "loss/logits": 0.1659093201160431, "loss/reg": 0.0026231445372104645, "step": 2046 }, { "epoch": 0.255875, "grad_norm": 2.2226996421813965, "grad_norm_var": 0.9867762536136465, "learning_rate": 0.0001, "loss": 1.0649, "loss/crossentropy": 2.5088775157928467, "loss/hidden": 0.87109375, "loss/logits": 0.16760048270225525, "loss/reg": 0.0026215892285108566, "step": 2047 }, { "epoch": 0.256, "grad_norm": 2.2316296100616455, "grad_norm_var": 0.9697612443751865, "learning_rate": 0.0001, "loss": 1.1183, "loss/crossentropy": 2.5492899417877197, "loss/hidden": 0.94140625, "loss/logits": 0.15073196589946747, "loss/reg": 0.002620045794174075, "step": 2048 }, { "epoch": 0.256125, "grad_norm": 4.570781230926514, "grad_norm_var": 0.9087950470244465, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.1873745918273926, "loss/hidden": 1.0, "loss/logits": 0.17418642342090607, "loss/reg": 0.0026186287868767977, "step": 2049 }, { "epoch": 0.25625, "grad_norm": 2.7245278358459473, "grad_norm_var": 0.8808905781033597, "learning_rate": 0.0001, "loss": 1.1163, "loss/crossentropy": 2.5251500606536865, "loss/hidden": 0.94140625, "loss/logits": 0.14875037968158722, "loss/reg": 0.002617252990603447, "step": 2050 }, { "epoch": 0.256375, "grad_norm": 2.4628653526306152, "grad_norm_var": 0.8892972541710016, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.2656936645507812, "loss/hidden": 1.078125, "loss/logits": 0.17457154393196106, "loss/reg": 0.0026159349363297224, "step": 2051 }, { "epoch": 0.2565, "grad_norm": 2.511605739593506, "grad_norm_var": 0.8800499942039473, "learning_rate": 0.0001, "loss": 0.9916, "loss/crossentropy": 2.6102118492126465, "loss/hidden": 0.828125, "loss/logits": 0.1373082995414734, "loss/reg": 0.0026145295705646276, "step": 2052 }, { "epoch": 0.256625, "grad_norm": 2.0422439575195312, "grad_norm_var": 0.9097899576678846, "learning_rate": 0.0001, "loss": 1.0318, "loss/crossentropy": 2.7837326526641846, "loss/hidden": 0.859375, "loss/logits": 0.1462600827217102, "loss/reg": 0.0026131505146622658, "step": 2053 }, { "epoch": 0.25675, "grad_norm": 2.6785788536071777, "grad_norm_var": 0.8719698598483688, "learning_rate": 0.0001, "loss": 1.0822, "loss/crossentropy": 2.720308780670166, "loss/hidden": 0.90234375, "loss/logits": 0.1537412405014038, "loss/reg": 0.0026117220986634493, "step": 2054 }, { "epoch": 0.256875, "grad_norm": 3.30175518989563, "grad_norm_var": 0.7258174153627368, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.8275485038757324, "loss/hidden": 0.875, "loss/logits": 0.1510903686285019, "loss/reg": 0.002610334660857916, "step": 2055 }, { "epoch": 0.257, "grad_norm": 3.24013352394104, "grad_norm_var": 0.6922151427292303, "learning_rate": 0.0001, "loss": 1.1942, "loss/crossentropy": 2.0899558067321777, "loss/hidden": 1.046875, "loss/logits": 0.12128080427646637, "loss/reg": 0.0026089909952133894, "step": 2056 }, { "epoch": 0.257125, "grad_norm": 2.2610199451446533, "grad_norm_var": 0.6964524621220692, "learning_rate": 0.0001, "loss": 1.0661, "loss/crossentropy": 2.3946642875671387, "loss/hidden": 0.87109375, "loss/logits": 0.16891264915466309, "loss/reg": 0.002607578644528985, "step": 2057 }, { "epoch": 0.25725, "grad_norm": 2.013164520263672, "grad_norm_var": 0.733647526855039, "learning_rate": 0.0001, "loss": 0.9674, "loss/crossentropy": 2.4083251953125, "loss/hidden": 0.80078125, "loss/logits": 0.14051708579063416, "loss/reg": 0.0026061544194817543, "step": 2058 }, { "epoch": 0.257375, "grad_norm": 8.298988342285156, "grad_norm_var": 2.5888719830462414, "learning_rate": 0.0001, "loss": 1.8548, "loss/crossentropy": 1.840504765510559, "loss/hidden": 1.5859375, "loss/logits": 0.24281775951385498, "loss/reg": 0.0026047327555716038, "step": 2059 }, { "epoch": 0.2575, "grad_norm": 2.1740310192108154, "grad_norm_var": 2.5887084455809184, "learning_rate": 0.0001, "loss": 1.1736, "loss/crossentropy": 2.4619970321655273, "loss/hidden": 0.96484375, "loss/logits": 0.18274620175361633, "loss/reg": 0.002603366272523999, "step": 2060 }, { "epoch": 0.257625, "grad_norm": 1.87361478805542, "grad_norm_var": 2.4729622997441605, "learning_rate": 0.0001, "loss": 1.0009, "loss/crossentropy": 2.379354953765869, "loss/hidden": 0.82421875, "loss/logits": 0.15065261721611023, "loss/reg": 0.0026019513607025146, "step": 2061 }, { "epoch": 0.25775, "grad_norm": 2.698416233062744, "grad_norm_var": 2.4696823710637905, "learning_rate": 0.0001, "loss": 1.2247, "loss/crossentropy": 2.566319704055786, "loss/hidden": 1.03125, "loss/logits": 0.16748088598251343, "loss/reg": 0.002600590931251645, "step": 2062 }, { "epoch": 0.257875, "grad_norm": 2.143456220626831, "grad_norm_var": 2.4778293787620216, "learning_rate": 0.0001, "loss": 1.0272, "loss/crossentropy": 2.7131080627441406, "loss/hidden": 0.859375, "loss/logits": 0.14183677732944489, "loss/reg": 0.002599175553768873, "step": 2063 }, { "epoch": 0.258, "grad_norm": 1.9824784994125366, "grad_norm_var": 2.505629186159025, "learning_rate": 0.0001, "loss": 0.9592, "loss/crossentropy": 2.422607421875, "loss/hidden": 0.8125, "loss/logits": 0.12070031464099884, "loss/reg": 0.0025976994074881077, "step": 2064 }, { "epoch": 0.258125, "grad_norm": 2.024726390838623, "grad_norm_var": 2.3558484474631776, "learning_rate": 0.0001, "loss": 0.9666, "loss/crossentropy": 2.7257227897644043, "loss/hidden": 0.80078125, "loss/logits": 0.1398501992225647, "loss/reg": 0.0025962614454329014, "step": 2065 }, { "epoch": 0.25825, "grad_norm": 2.53908371925354, "grad_norm_var": 2.359294604798887, "learning_rate": 0.0001, "loss": 1.0536, "loss/crossentropy": 2.487863779067993, "loss/hidden": 0.8671875, "loss/logits": 0.16044992208480835, "loss/reg": 0.0025948965921998024, "step": 2066 }, { "epoch": 0.258375, "grad_norm": 2.9544944763183594, "grad_norm_var": 2.354570465880767, "learning_rate": 0.0001, "loss": 1.2219, "loss/crossentropy": 2.3688132762908936, "loss/hidden": 1.0390625, "loss/logits": 0.15687838196754456, "loss/reg": 0.0025935538578778505, "step": 2067 }, { "epoch": 0.2585, "grad_norm": 2.2477307319641113, "grad_norm_var": 2.368932219716166, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6279659271240234, "loss/hidden": 0.90625, "loss/logits": 0.1539991796016693, "loss/reg": 0.0025922267232090235, "step": 2068 }, { "epoch": 0.258625, "grad_norm": 2.239948034286499, "grad_norm_var": 2.3519375237071585, "learning_rate": 0.0001, "loss": 1.1165, "loss/crossentropy": 2.373878002166748, "loss/hidden": 0.921875, "loss/logits": 0.16871587932109833, "loss/reg": 0.0025907608214765787, "step": 2069 }, { "epoch": 0.25875, "grad_norm": 3.5614728927612305, "grad_norm_var": 2.3873073365114137, "learning_rate": 0.0001, "loss": 1.0876, "loss/crossentropy": 2.7132346630096436, "loss/hidden": 0.88671875, "loss/logits": 0.17500154674053192, "loss/reg": 0.0025893226265907288, "step": 2070 }, { "epoch": 0.258875, "grad_norm": 2.7357993125915527, "grad_norm_var": 2.373022141220857, "learning_rate": 0.0001, "loss": 1.271, "loss/crossentropy": 2.18509840965271, "loss/hidden": 1.0625, "loss/logits": 0.1826370358467102, "loss/reg": 0.0025878336746245623, "step": 2071 }, { "epoch": 0.259, "grad_norm": 2.218555212020874, "grad_norm_var": 2.3799029785896897, "learning_rate": 0.0001, "loss": 1.1866, "loss/crossentropy": 2.24187970161438, "loss/hidden": 0.97265625, "loss/logits": 0.18805649876594543, "loss/reg": 0.0025863947812467813, "step": 2072 }, { "epoch": 0.259125, "grad_norm": 2.167565107345581, "grad_norm_var": 2.3865161330111797, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.399721145629883, "loss/hidden": 0.96875, "loss/logits": 0.1710546612739563, "loss/reg": 0.0025849028024822474, "step": 2073 }, { "epoch": 0.25925, "grad_norm": 2.0370163917541504, "grad_norm_var": 2.3842335080866714, "learning_rate": 0.0001, "loss": 1.0692, "loss/crossentropy": 2.6004505157470703, "loss/hidden": 0.890625, "loss/logits": 0.1527896374464035, "loss/reg": 0.002583381487056613, "step": 2074 }, { "epoch": 0.259375, "grad_norm": 3.6670825481414795, "grad_norm_var": 0.2941963099964766, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.865842342376709, "loss/hidden": 1.1796875, "loss/logits": 0.2093041092157364, "loss/reg": 0.00258182967081666, "step": 2075 }, { "epoch": 0.2595, "grad_norm": 2.2913897037506104, "grad_norm_var": 0.2906747816056636, "learning_rate": 0.0001, "loss": 1.0911, "loss/crossentropy": 2.556710958480835, "loss/hidden": 0.9140625, "loss/logits": 0.15122531354427338, "loss/reg": 0.002580266213044524, "step": 2076 }, { "epoch": 0.259625, "grad_norm": 2.3180160522460938, "grad_norm_var": 0.26818813625463345, "learning_rate": 0.0001, "loss": 1.0987, "loss/crossentropy": 2.357269287109375, "loss/hidden": 0.921875, "loss/logits": 0.15105529129505157, "loss/reg": 0.0025788662023842335, "step": 2077 }, { "epoch": 0.25975, "grad_norm": 2.735438346862793, "grad_norm_var": 0.26930654162698264, "learning_rate": 0.0001, "loss": 1.141, "loss/crossentropy": 2.745026111602783, "loss/hidden": 0.9453125, "loss/logits": 0.169925719499588, "loss/reg": 0.002577539999037981, "step": 2078 }, { "epoch": 0.259875, "grad_norm": 2.4695682525634766, "grad_norm_var": 0.26081916654100223, "learning_rate": 0.0001, "loss": 1.0243, "loss/crossentropy": 2.965155601501465, "loss/hidden": 0.85546875, "loss/logits": 0.1430715173482895, "loss/reg": 0.0025760820135474205, "step": 2079 }, { "epoch": 0.26, "grad_norm": 2.4745187759399414, "grad_norm_var": 0.24121789086487505, "learning_rate": 0.0001, "loss": 1.1742, "loss/crossentropy": 2.4014153480529785, "loss/hidden": 0.96484375, "loss/logits": 0.1835838109254837, "loss/reg": 0.0025746312458068132, "step": 2080 }, { "epoch": 0.260125, "grad_norm": 2.7991394996643066, "grad_norm_var": 0.2252218371134243, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.535705327987671, "loss/hidden": 0.98046875, "loss/logits": 0.15699650347232819, "loss/reg": 0.002573275938630104, "step": 2081 }, { "epoch": 0.26025, "grad_norm": 2.1277756690979004, "grad_norm_var": 0.23864518355619543, "learning_rate": 0.0001, "loss": 1.1291, "loss/crossentropy": 2.4120826721191406, "loss/hidden": 0.953125, "loss/logits": 0.1502905786037445, "loss/reg": 0.002571835182607174, "step": 2082 }, { "epoch": 0.260375, "grad_norm": 2.471019744873047, "grad_norm_var": 0.22816852734765666, "learning_rate": 0.0001, "loss": 1.0119, "loss/crossentropy": 2.473698616027832, "loss/hidden": 0.84765625, "loss/logits": 0.13856154680252075, "loss/reg": 0.002570484532043338, "step": 2083 }, { "epoch": 0.2605, "grad_norm": 2.4578030109405518, "grad_norm_var": 0.2228768023125506, "learning_rate": 0.0001, "loss": 1.0215, "loss/crossentropy": 2.4398980140686035, "loss/hidden": 0.86328125, "loss/logits": 0.13250717520713806, "loss/reg": 0.0025690330658107996, "step": 2084 }, { "epoch": 0.260625, "grad_norm": 2.277040958404541, "grad_norm_var": 0.22143798538185303, "learning_rate": 0.0001, "loss": 1.1276, "loss/crossentropy": 2.4847068786621094, "loss/hidden": 0.94140625, "loss/logits": 0.16049456596374512, "loss/reg": 0.0025675713550299406, "step": 2085 }, { "epoch": 0.26075, "grad_norm": 2.0303781032562256, "grad_norm_var": 0.1615829748113735, "learning_rate": 0.0001, "loss": 0.8164, "loss/crossentropy": 2.7072529792785645, "loss/hidden": 0.68359375, "loss/logits": 0.10713274776935577, "loss/reg": 0.0025660626124590635, "step": 2086 }, { "epoch": 0.260875, "grad_norm": 2.453800678253174, "grad_norm_var": 0.15599072439048314, "learning_rate": 0.0001, "loss": 1.0505, "loss/crossentropy": 2.5670292377471924, "loss/hidden": 0.87890625, "loss/logits": 0.145965114235878, "loss/reg": 0.0025645196437835693, "step": 2087 }, { "epoch": 0.261, "grad_norm": 2.2974658012390137, "grad_norm_var": 0.15407885545909963, "learning_rate": 0.0001, "loss": 0.989, "loss/crossentropy": 2.4595649242401123, "loss/hidden": 0.83203125, "loss/logits": 0.13132578134536743, "loss/reg": 0.0025629804003983736, "step": 2088 }, { "epoch": 0.261125, "grad_norm": 2.549549102783203, "grad_norm_var": 0.14921143407091222, "learning_rate": 0.0001, "loss": 1.0425, "loss/crossentropy": 2.3459436893463135, "loss/hidden": 0.875, "loss/logits": 0.14191356301307678, "loss/reg": 0.0025614260230213404, "step": 2089 }, { "epoch": 0.26125, "grad_norm": 2.002662181854248, "grad_norm_var": 0.15125047006817643, "learning_rate": 0.0001, "loss": 0.9319, "loss/crossentropy": 2.5568628311157227, "loss/hidden": 0.78515625, "loss/logits": 0.12118648737668991, "loss/reg": 0.0025598553474992514, "step": 2090 }, { "epoch": 0.261375, "grad_norm": 3.5039377212524414, "grad_norm_var": 0.12674192133970882, "learning_rate": 0.0001, "loss": 1.219, "loss/crossentropy": 2.7117178440093994, "loss/hidden": 0.96875, "loss/logits": 0.22463664412498474, "loss/reg": 0.0025584872346371412, "step": 2091 }, { "epoch": 0.2615, "grad_norm": 2.5032620429992676, "grad_norm_var": 0.12496179501936146, "learning_rate": 0.0001, "loss": 1.115, "loss/crossentropy": 2.6534841060638428, "loss/hidden": 0.92578125, "loss/logits": 0.16364844143390656, "loss/reg": 0.002557123312726617, "step": 2092 }, { "epoch": 0.261625, "grad_norm": 2.7911572456359863, "grad_norm_var": 0.1295569416749989, "learning_rate": 0.0001, "loss": 1.1413, "loss/crossentropy": 2.6506550312042236, "loss/hidden": 0.9296875, "loss/logits": 0.1860462874174118, "loss/reg": 0.0025556525215506554, "step": 2093 }, { "epoch": 0.26175, "grad_norm": 2.3240432739257812, "grad_norm_var": 0.12703017587634424, "learning_rate": 0.0001, "loss": 1.0838, "loss/crossentropy": 2.2893855571746826, "loss/hidden": 0.90234375, "loss/logits": 0.15589797496795654, "loss/reg": 0.0025542983785271645, "step": 2094 }, { "epoch": 0.261875, "grad_norm": 3.151837110519409, "grad_norm_var": 0.1560094683681425, "learning_rate": 0.0001, "loss": 1.1085, "loss/crossentropy": 2.4272518157958984, "loss/hidden": 0.9296875, "loss/logits": 0.1532888561487198, "loss/reg": 0.0025528501719236374, "step": 2095 }, { "epoch": 0.262, "grad_norm": 3.053194522857666, "grad_norm_var": 0.1739338415915216, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.6052980422973633, "loss/hidden": 0.91796875, "loss/logits": 0.14096005260944366, "loss/reg": 0.0025515130255371332, "step": 2096 }, { "epoch": 0.262125, "grad_norm": 2.720215320587158, "grad_norm_var": 0.17169750262604733, "learning_rate": 0.0001, "loss": 1.2073, "loss/crossentropy": 2.118901491165161, "loss/hidden": 1.0234375, "loss/logits": 0.1583658754825592, "loss/reg": 0.00255009438842535, "step": 2097 }, { "epoch": 0.26225, "grad_norm": 2.7364187240600586, "grad_norm_var": 0.16101627922160613, "learning_rate": 0.0001, "loss": 1.1784, "loss/crossentropy": 2.4940342903137207, "loss/hidden": 0.984375, "loss/logits": 0.1685602366924286, "loss/reg": 0.0025486235972493887, "step": 2098 }, { "epoch": 0.262375, "grad_norm": 2.849760055541992, "grad_norm_var": 0.1643399864772654, "learning_rate": 0.0001, "loss": 1.0345, "loss/crossentropy": 2.4084441661834717, "loss/hidden": 0.87109375, "loss/logits": 0.13791196048259735, "loss/reg": 0.002547134878113866, "step": 2099 }, { "epoch": 0.2625, "grad_norm": 2.7641000747680664, "grad_norm_var": 0.16413464058555954, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.128411054611206, "loss/hidden": 1.078125, "loss/logits": 0.17594923079013824, "loss/reg": 0.002545734867453575, "step": 2100 }, { "epoch": 0.262625, "grad_norm": 2.872300386428833, "grad_norm_var": 0.15861994953211986, "learning_rate": 0.0001, "loss": 1.3669, "loss/crossentropy": 2.1498610973358154, "loss/hidden": 1.1640625, "loss/logits": 0.17739522457122803, "loss/reg": 0.0025442338082939386, "step": 2101 }, { "epoch": 0.26275, "grad_norm": 1.8760157823562622, "grad_norm_var": 0.17312454082727455, "learning_rate": 0.0001, "loss": 0.9549, "loss/crossentropy": 2.3600080013275146, "loss/hidden": 0.796875, "loss/logits": 0.13260792195796967, "loss/reg": 0.0025428966619074345, "step": 2102 }, { "epoch": 0.262875, "grad_norm": 2.2524094581604004, "grad_norm_var": 0.18101126154057703, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.591085195541382, "loss/hidden": 0.9609375, "loss/logits": 0.17570914328098297, "loss/reg": 0.002541464054957032, "step": 2103 }, { "epoch": 0.263, "grad_norm": 2.175205707550049, "grad_norm_var": 0.18753773629897091, "learning_rate": 0.0001, "loss": 0.9875, "loss/crossentropy": 2.594789505004883, "loss/hidden": 0.8125, "loss/logits": 0.14954954385757446, "loss/reg": 0.0025400579907000065, "step": 2104 }, { "epoch": 0.263125, "grad_norm": 2.5387213230133057, "grad_norm_var": 0.18766536797837333, "learning_rate": 0.0001, "loss": 1.0972, "loss/crossentropy": 2.580713987350464, "loss/hidden": 0.92578125, "loss/logits": 0.14602404832839966, "loss/reg": 0.002538702916353941, "step": 2105 }, { "epoch": 0.26325, "grad_norm": 2.663135051727295, "grad_norm_var": 0.15949014850775325, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.443059206008911, "loss/hidden": 0.9765625, "loss/logits": 0.19129259884357452, "loss/reg": 0.00253737298771739, "step": 2106 }, { "epoch": 0.263375, "grad_norm": 2.1810390949249268, "grad_norm_var": 0.12238780245493401, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.6288414001464844, "loss/hidden": 0.85546875, "loss/logits": 0.1442154347896576, "loss/reg": 0.0025360379368066788, "step": 2107 }, { "epoch": 0.2635, "grad_norm": 1.7628508806228638, "grad_norm_var": 0.16529281657149658, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.35375714302063, "loss/hidden": 0.80859375, "loss/logits": 0.1327545940876007, "loss/reg": 0.002534713363274932, "step": 2108 }, { "epoch": 0.263625, "grad_norm": 9.906915664672852, "grad_norm_var": 3.5639154264472226, "learning_rate": 0.0001, "loss": 1.2731, "loss/crossentropy": 2.392916440963745, "loss/hidden": 1.09375, "loss/logits": 0.15402920544147491, "loss/reg": 0.002533418359234929, "step": 2109 }, { "epoch": 0.26375, "grad_norm": 2.6886420249938965, "grad_norm_var": 3.539885392970071, "learning_rate": 0.0001, "loss": 1.0468, "loss/crossentropy": 2.595637559890747, "loss/hidden": 0.84765625, "loss/logits": 0.17386572062969208, "loss/reg": 0.002532056299969554, "step": 2110 }, { "epoch": 0.263875, "grad_norm": 2.734037399291992, "grad_norm_var": 3.5430079766526412, "learning_rate": 0.0001, "loss": 1.2884, "loss/crossentropy": 1.946371078491211, "loss/hidden": 1.078125, "loss/logits": 0.18492045998573303, "loss/reg": 0.0025307778269052505, "step": 2111 }, { "epoch": 0.264, "grad_norm": 2.3938324451446533, "grad_norm_var": 3.5642672644187727, "learning_rate": 0.0001, "loss": 1.1586, "loss/crossentropy": 2.6185896396636963, "loss/hidden": 0.97265625, "loss/logits": 0.16068941354751587, "loss/reg": 0.0025295494124293327, "step": 2112 }, { "epoch": 0.264125, "grad_norm": 2.1221187114715576, "grad_norm_var": 3.604528530575061, "learning_rate": 0.0001, "loss": 1.1524, "loss/crossentropy": 2.313891649246216, "loss/hidden": 0.9453125, "loss/logits": 0.18182779848575592, "loss/reg": 0.0025283738505095243, "step": 2113 }, { "epoch": 0.26425, "grad_norm": 3.1997838020324707, "grad_norm_var": 3.607387627810337, "learning_rate": 0.0001, "loss": 1.4178, "loss/crossentropy": 2.3778915405273438, "loss/hidden": 1.1640625, "loss/logits": 0.22844943404197693, "loss/reg": 0.0025272397324442863, "step": 2114 }, { "epoch": 0.264375, "grad_norm": 2.4375686645507812, "grad_norm_var": 3.6227628558502953, "learning_rate": 0.0001, "loss": 1.1067, "loss/crossentropy": 2.4927098751068115, "loss/hidden": 0.921875, "loss/logits": 0.1595517098903656, "loss/reg": 0.0025259493850171566, "step": 2115 }, { "epoch": 0.2645, "grad_norm": 2.517472743988037, "grad_norm_var": 3.6313799742098904, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.2254316806793213, "loss/hidden": 0.9765625, "loss/logits": 0.19116130471229553, "loss/reg": 0.0025247803423553705, "step": 2116 }, { "epoch": 0.264625, "grad_norm": 3.4359259605407715, "grad_norm_var": 3.6495190836808544, "learning_rate": 0.0001, "loss": 1.2073, "loss/crossentropy": 2.4402449131011963, "loss/hidden": 1.0078125, "loss/logits": 0.17420834302902222, "loss/reg": 0.0025235412176698446, "step": 2117 }, { "epoch": 0.26475, "grad_norm": 2.5403125286102295, "grad_norm_var": 3.5837138668162574, "learning_rate": 0.0001, "loss": 1.0243, "loss/crossentropy": 2.443629026412964, "loss/hidden": 0.8359375, "loss/logits": 0.16313748061656952, "loss/reg": 0.0025223721750080585, "step": 2118 }, { "epoch": 0.264875, "grad_norm": 2.320072650909424, "grad_norm_var": 3.5775091823852314, "learning_rate": 0.0001, "loss": 1.0679, "loss/crossentropy": 2.1367576122283936, "loss/hidden": 0.890625, "loss/logits": 0.1520138382911682, "loss/reg": 0.0025213707704097033, "step": 2119 }, { "epoch": 0.265, "grad_norm": 2.275620698928833, "grad_norm_var": 3.567416445746644, "learning_rate": 0.0001, "loss": 1.0192, "loss/crossentropy": 2.4646902084350586, "loss/hidden": 0.85546875, "loss/logits": 0.13850857317447662, "loss/reg": 0.0025200750678777695, "step": 2120 }, { "epoch": 0.265125, "grad_norm": 2.5054898262023926, "grad_norm_var": 3.569451250368935, "learning_rate": 0.0001, "loss": 1.2207, "loss/crossentropy": 2.678284168243408, "loss/hidden": 1.015625, "loss/logits": 0.17984583973884583, "loss/reg": 0.0025187365245074034, "step": 2121 }, { "epoch": 0.26525, "grad_norm": 1.7853312492370605, "grad_norm_var": 3.6547312492038118, "learning_rate": 0.0001, "loss": 0.94, "loss/crossentropy": 2.515474796295166, "loss/hidden": 0.78125, "loss/logits": 0.13361087441444397, "loss/reg": 0.002517406363040209, "step": 2122 }, { "epoch": 0.265375, "grad_norm": 2.3041670322418213, "grad_norm_var": 3.6434579330218013, "learning_rate": 0.0001, "loss": 1.07, "loss/crossentropy": 2.2838218212127686, "loss/hidden": 0.90625, "loss/logits": 0.13861019909381866, "loss/reg": 0.0025160829536616802, "step": 2123 }, { "epoch": 0.2655, "grad_norm": 2.7424027919769287, "grad_norm_var": 3.5505809932023027, "learning_rate": 0.0001, "loss": 1.1824, "loss/crossentropy": 2.600839853286743, "loss/hidden": 0.9765625, "loss/logits": 0.1807045340538025, "loss/reg": 0.0025147644337266684, "step": 2124 }, { "epoch": 0.265625, "grad_norm": 2.0424578189849854, "grad_norm_var": 0.16771573849056623, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.412602424621582, "loss/hidden": 0.9140625, "loss/logits": 0.16601966321468353, "loss/reg": 0.002513502724468708, "step": 2125 }, { "epoch": 0.26575, "grad_norm": 2.1522467136383057, "grad_norm_var": 0.17240887913384598, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.5242130756378174, "loss/hidden": 0.80859375, "loss/logits": 0.15904076397418976, "loss/reg": 0.002512230770662427, "step": 2126 }, { "epoch": 0.265875, "grad_norm": 2.182203769683838, "grad_norm_var": 0.17196279003093196, "learning_rate": 0.0001, "loss": 1.1364, "loss/crossentropy": 2.385453701019287, "loss/hidden": 0.95703125, "loss/logits": 0.15426355600357056, "loss/reg": 0.002511093160137534, "step": 2127 }, { "epoch": 0.266, "grad_norm": 2.0122711658477783, "grad_norm_var": 0.1831469803234351, "learning_rate": 0.0001, "loss": 1.0171, "loss/crossentropy": 2.621617078781128, "loss/hidden": 0.84375, "loss/logits": 0.14826016128063202, "loss/reg": 0.0025100053753703833, "step": 2128 }, { "epoch": 0.266125, "grad_norm": 2.295485258102417, "grad_norm_var": 0.17834863113055613, "learning_rate": 0.0001, "loss": 0.9122, "loss/crossentropy": 2.5776326656341553, "loss/hidden": 0.7734375, "loss/logits": 0.1137041226029396, "loss/reg": 0.0025088961701840162, "step": 2129 }, { "epoch": 0.26625, "grad_norm": 2.047257423400879, "grad_norm_var": 0.14181565484542774, "learning_rate": 0.0001, "loss": 1.0106, "loss/crossentropy": 2.6138837337493896, "loss/hidden": 0.84765625, "loss/logits": 0.13791516423225403, "loss/reg": 0.002507563680410385, "step": 2130 }, { "epoch": 0.266375, "grad_norm": 2.6175718307495117, "grad_norm_var": 0.14594798165543638, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.6606316566467285, "loss/hidden": 0.91015625, "loss/logits": 0.15581122040748596, "loss/reg": 0.002506369026377797, "step": 2131 }, { "epoch": 0.2665, "grad_norm": 2.2321534156799316, "grad_norm_var": 0.14508398841615625, "learning_rate": 0.0001, "loss": 0.8991, "loss/crossentropy": 2.455148935317993, "loss/hidden": 0.765625, "loss/logits": 0.10840518772602081, "loss/reg": 0.002505115233361721, "step": 2132 }, { "epoch": 0.266625, "grad_norm": 1.913902997970581, "grad_norm_var": 0.06811182441104482, "learning_rate": 0.0001, "loss": 1.0673, "loss/crossentropy": 2.668954610824585, "loss/hidden": 0.88671875, "loss/logits": 0.1555517315864563, "loss/reg": 0.0025039403699338436, "step": 2133 }, { "epoch": 0.26675, "grad_norm": 2.2853240966796875, "grad_norm_var": 0.06223935572954057, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.0821545124053955, "loss/hidden": 0.9140625, "loss/logits": 0.15655305981636047, "loss/reg": 0.0025026213843375444, "step": 2134 }, { "epoch": 0.266875, "grad_norm": 2.6810553073883057, "grad_norm_var": 0.07461676995996314, "learning_rate": 0.0001, "loss": 1.4301, "loss/crossentropy": 2.4769198894500732, "loss/hidden": 1.1953125, "loss/logits": 0.2098081409931183, "loss/reg": 0.0025014111306518316, "step": 2135 }, { "epoch": 0.267, "grad_norm": 2.886624813079834, "grad_norm_var": 0.09965531449209569, "learning_rate": 0.0001, "loss": 1.1364, "loss/crossentropy": 2.7249526977539062, "loss/hidden": 0.9140625, "loss/logits": 0.19732725620269775, "loss/reg": 0.00250007095746696, "step": 2136 }, { "epoch": 0.267125, "grad_norm": 2.0211737155914307, "grad_norm_var": 0.10058552075510055, "learning_rate": 0.0001, "loss": 1.0813, "loss/crossentropy": 2.6467666625976562, "loss/hidden": 0.87109375, "loss/logits": 0.18520744144916534, "loss/reg": 0.002498722868040204, "step": 2137 }, { "epoch": 0.26725, "grad_norm": 3.1686620712280273, "grad_norm_var": 0.13215603225421713, "learning_rate": 0.0001, "loss": 1.3823, "loss/crossentropy": 2.1631827354431152, "loss/hidden": 1.1171875, "loss/logits": 0.2401001751422882, "loss/reg": 0.0024973973631858826, "step": 2138 }, { "epoch": 0.267375, "grad_norm": 2.373699903488159, "grad_norm_var": 0.1320420034606587, "learning_rate": 0.0001, "loss": 0.9992, "loss/crossentropy": 2.382716178894043, "loss/hidden": 0.83984375, "loss/logits": 0.134346142411232, "loss/reg": 0.002496130531653762, "step": 2139 }, { "epoch": 0.2675, "grad_norm": 2.0626862049102783, "grad_norm_var": 0.1256635590084914, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 2.3416714668273926, "loss/hidden": 0.90625, "loss/logits": 0.1518416553735733, "loss/reg": 0.002494869055226445, "step": 2140 }, { "epoch": 0.267625, "grad_norm": 3.0325145721435547, "grad_norm_var": 0.15148732683757088, "learning_rate": 0.0001, "loss": 1.1484, "loss/crossentropy": 2.376249074935913, "loss/hidden": 0.96875, "loss/logits": 0.1546870768070221, "loss/reg": 0.002493525855243206, "step": 2141 }, { "epoch": 0.26775, "grad_norm": 4.080895900726318, "grad_norm_var": 0.3272512838553247, "learning_rate": 0.0001, "loss": 1.296, "loss/crossentropy": 2.185392379760742, "loss/hidden": 1.0703125, "loss/logits": 0.20072491466999054, "loss/reg": 0.002492206171154976, "step": 2142 }, { "epoch": 0.267875, "grad_norm": 2.4873321056365967, "grad_norm_var": 0.3204119349007584, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.4436862468719482, "loss/hidden": 1.03125, "loss/logits": 0.17598329484462738, "loss/reg": 0.0024908732157200575, "step": 2143 }, { "epoch": 0.268, "grad_norm": 2.451023578643799, "grad_norm_var": 0.30318494651352973, "learning_rate": 0.0001, "loss": 1.1345, "loss/crossentropy": 2.4196064472198486, "loss/hidden": 0.93359375, "loss/logits": 0.17596589028835297, "loss/reg": 0.0024895668029785156, "step": 2144 }, { "epoch": 0.268125, "grad_norm": 1.9502043724060059, "grad_norm_var": 0.32188537570815756, "learning_rate": 0.0001, "loss": 0.9781, "loss/crossentropy": 2.593618154525757, "loss/hidden": 0.81640625, "loss/logits": 0.13681218028068542, "loss/reg": 0.0024882988072931767, "step": 2145 }, { "epoch": 0.26825, "grad_norm": 2.3103907108306885, "grad_norm_var": 0.30968813065111717, "learning_rate": 0.0001, "loss": 0.9856, "loss/crossentropy": 2.455277681350708, "loss/hidden": 0.83203125, "loss/logits": 0.12868571281433105, "loss/reg": 0.002487036632373929, "step": 2146 }, { "epoch": 0.268375, "grad_norm": 2.315978527069092, "grad_norm_var": 0.312040598840273, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.5436975955963135, "loss/hidden": 0.8984375, "loss/logits": 0.14029252529144287, "loss/reg": 0.0024857926182448864, "step": 2147 }, { "epoch": 0.2685, "grad_norm": 3.192843437194824, "grad_norm_var": 0.33338398901164296, "learning_rate": 0.0001, "loss": 1.0402, "loss/crossentropy": 2.398376226425171, "loss/hidden": 0.88671875, "loss/logits": 0.12860015034675598, "loss/reg": 0.0024846852757036686, "step": 2148 }, { "epoch": 0.268625, "grad_norm": 2.3080451488494873, "grad_norm_var": 0.30830407290765316, "learning_rate": 0.0001, "loss": 1.0892, "loss/crossentropy": 2.4340858459472656, "loss/hidden": 0.921875, "loss/logits": 0.14250406622886658, "loss/reg": 0.0024835695512592793, "step": 2149 }, { "epoch": 0.26875, "grad_norm": 2.3264517784118652, "grad_norm_var": 0.3066813078038609, "learning_rate": 0.0001, "loss": 1.1479, "loss/crossentropy": 2.3817081451416016, "loss/hidden": 0.97265625, "loss/logits": 0.1504080593585968, "loss/reg": 0.002482454292476177, "step": 2150 }, { "epoch": 0.268875, "grad_norm": 1.9920002222061157, "grad_norm_var": 0.32919394539771024, "learning_rate": 0.0001, "loss": 1.021, "loss/crossentropy": 2.5106582641601562, "loss/hidden": 0.8671875, "loss/logits": 0.128952294588089, "loss/reg": 0.002481187228113413, "step": 2151 }, { "epoch": 0.269, "grad_norm": 3.66256046295166, "grad_norm_var": 0.40061227385328185, "learning_rate": 0.0001, "loss": 1.4802, "loss/crossentropy": 2.535940408706665, "loss/hidden": 1.1875, "loss/logits": 0.26789215207099915, "loss/reg": 0.002479898277670145, "step": 2152 }, { "epoch": 0.269125, "grad_norm": 4.031623840332031, "grad_norm_var": 0.495785184624132, "learning_rate": 0.0001, "loss": 1.6688, "loss/crossentropy": 1.9220447540283203, "loss/hidden": 1.3125, "loss/logits": 0.33148324489593506, "loss/reg": 0.002478886628523469, "step": 2153 }, { "epoch": 0.26925, "grad_norm": 2.5391814708709717, "grad_norm_var": 0.4840843163433869, "learning_rate": 0.0001, "loss": 1.1294, "loss/crossentropy": 2.2794175148010254, "loss/hidden": 0.953125, "loss/logits": 0.15151268243789673, "loss/reg": 0.002477619331330061, "step": 2154 }, { "epoch": 0.269375, "grad_norm": 3.286329746246338, "grad_norm_var": 0.4970625974826726, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.5059866905212402, "loss/hidden": 1.0, "loss/logits": 0.24023011326789856, "loss/reg": 0.002476518740877509, "step": 2155 }, { "epoch": 0.2695, "grad_norm": 5.929214954376221, "grad_norm_var": 1.0761359441998672, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.531900644302368, "loss/hidden": 1.234375, "loss/logits": 0.1875353753566742, "loss/reg": 0.002475421642884612, "step": 2156 }, { "epoch": 0.269625, "grad_norm": 2.5959959030151367, "grad_norm_var": 1.0857766324985008, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.2892539501190186, "loss/hidden": 0.890625, "loss/logits": 0.13680672645568848, "loss/reg": 0.00247434014454484, "step": 2157 }, { "epoch": 0.26975, "grad_norm": 6.8274102210998535, "grad_norm_var": 1.9654192450663406, "learning_rate": 0.0001, "loss": 1.8764, "loss/crossentropy": 2.436116933822632, "loss/hidden": 1.5234375, "loss/logits": 0.32819026708602905, "loss/reg": 0.0024732116144150496, "step": 2158 }, { "epoch": 0.269875, "grad_norm": 2.6238043308258057, "grad_norm_var": 1.9547451483297908, "learning_rate": 0.0001, "loss": 1.0002, "loss/crossentropy": 2.684384346008301, "loss/hidden": 0.828125, "loss/logits": 0.14738497138023376, "loss/reg": 0.0024720944929867983, "step": 2159 }, { "epoch": 0.27, "grad_norm": 3.0665173530578613, "grad_norm_var": 1.9213521586384046, "learning_rate": 0.0001, "loss": 1.2605, "loss/crossentropy": 2.5885279178619385, "loss/hidden": 1.015625, "loss/logits": 0.22020390629768372, "loss/reg": 0.00247096735984087, "step": 2160 }, { "epoch": 0.270125, "grad_norm": 3.285884141921997, "grad_norm_var": 1.8129652598509343, "learning_rate": 0.0001, "loss": 1.2834, "loss/crossentropy": 2.408535957336426, "loss/hidden": 1.09375, "loss/logits": 0.16491034626960754, "loss/reg": 0.002469826489686966, "step": 2161 }, { "epoch": 0.27025, "grad_norm": 2.946852445602417, "grad_norm_var": 1.7569857035996652, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.795154094696045, "loss/hidden": 1.1015625, "loss/logits": 0.24879038333892822, "loss/reg": 0.002468690974637866, "step": 2162 }, { "epoch": 0.270375, "grad_norm": 2.481799840927124, "grad_norm_var": 1.7367674179890973, "learning_rate": 0.0001, "loss": 1.2877, "loss/crossentropy": 2.532032012939453, "loss/hidden": 1.046875, "loss/logits": 0.21614192426204681, "loss/reg": 0.0024675510358065367, "step": 2163 }, { "epoch": 0.2705, "grad_norm": 2.3081796169281006, "grad_norm_var": 1.8005074385336934, "learning_rate": 0.0001, "loss": 1.2784, "loss/crossentropy": 2.2821836471557617, "loss/hidden": 1.0703125, "loss/logits": 0.18341490626335144, "loss/reg": 0.002466218313202262, "step": 2164 }, { "epoch": 0.270625, "grad_norm": 3.1536834239959717, "grad_norm_var": 1.7375014532448314, "learning_rate": 0.0001, "loss": 1.1397, "loss/crossentropy": 2.3474032878875732, "loss/hidden": 0.9453125, "loss/logits": 0.16977465152740479, "loss/reg": 0.0024648867547512054, "step": 2165 }, { "epoch": 0.27075, "grad_norm": 2.847097635269165, "grad_norm_var": 1.6857431055004723, "learning_rate": 0.0001, "loss": 1.0372, "loss/crossentropy": 3.004148006439209, "loss/hidden": 0.86328125, "loss/logits": 0.14927570521831512, "loss/reg": 0.0024635624140501022, "step": 2166 }, { "epoch": 0.270875, "grad_norm": 2.96842885017395, "grad_norm_var": 1.568710670092694, "learning_rate": 0.0001, "loss": 1.2009, "loss/crossentropy": 2.4944570064544678, "loss/hidden": 1.0078125, "loss/logits": 0.16841644048690796, "loss/reg": 0.0024622357450425625, "step": 2167 }, { "epoch": 0.271, "grad_norm": 2.5392906665802, "grad_norm_var": 1.6096924226044735, "learning_rate": 0.0001, "loss": 1.1785, "loss/crossentropy": 2.53674054145813, "loss/hidden": 0.984375, "loss/logits": 0.16951404511928558, "loss/reg": 0.002460899529978633, "step": 2168 }, { "epoch": 0.271125, "grad_norm": 2.925675392150879, "grad_norm_var": 1.5840706048460789, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.3821847438812256, "loss/hidden": 1.0859375, "loss/logits": 0.1940276026725769, "loss/reg": 0.0024595865979790688, "step": 2169 }, { "epoch": 0.27125, "grad_norm": 2.127572774887085, "grad_norm_var": 1.6347859676394705, "learning_rate": 0.0001, "loss": 1.1291, "loss/crossentropy": 2.3356311321258545, "loss/hidden": 0.93359375, "loss/logits": 0.1709190011024475, "loss/reg": 0.0024583563208580017, "step": 2170 }, { "epoch": 0.271375, "grad_norm": 2.0105955600738525, "grad_norm_var": 1.7294078925223988, "learning_rate": 0.0001, "loss": 1.1322, "loss/crossentropy": 2.4044480323791504, "loss/hidden": 0.9453125, "loss/logits": 0.16235248744487762, "loss/reg": 0.0024570643436163664, "step": 2171 }, { "epoch": 0.2715, "grad_norm": 2.4808928966522217, "grad_norm_var": 1.2016128699820419, "learning_rate": 0.0001, "loss": 1.0687, "loss/crossentropy": 2.0658111572265625, "loss/hidden": 0.8984375, "loss/logits": 0.14573438465595245, "loss/reg": 0.0024556575808674097, "step": 2172 }, { "epoch": 0.271625, "grad_norm": 3.537330389022827, "grad_norm_var": 1.2126442279382557, "learning_rate": 0.0001, "loss": 1.2888, "loss/crossentropy": 2.389512300491333, "loss/hidden": 1.0546875, "loss/logits": 0.20959752798080444, "loss/reg": 0.002454345114529133, "step": 2173 }, { "epoch": 0.27175, "grad_norm": 2.1031746864318848, "grad_norm_var": 0.2018239005386133, "learning_rate": 0.0001, "loss": 0.988, "loss/crossentropy": 2.4889259338378906, "loss/hidden": 0.8359375, "loss/logits": 0.12754172086715698, "loss/reg": 0.002453018445521593, "step": 2174 }, { "epoch": 0.271875, "grad_norm": 2.307600975036621, "grad_norm_var": 0.21183025027659425, "learning_rate": 0.0001, "loss": 1.064, "loss/crossentropy": 2.7085604667663574, "loss/hidden": 0.89453125, "loss/logits": 0.144952654838562, "loss/reg": 0.0024516324046999216, "step": 2175 }, { "epoch": 0.272, "grad_norm": 2.0253207683563232, "grad_norm_var": 0.22775425344747358, "learning_rate": 0.0001, "loss": 1.018, "loss/crossentropy": 2.4250197410583496, "loss/hidden": 0.86328125, "loss/logits": 0.1302644908428192, "loss/reg": 0.002450294094160199, "step": 2176 }, { "epoch": 0.272125, "grad_norm": 2.2296600341796875, "grad_norm_var": 0.20484224050921113, "learning_rate": 0.0001, "loss": 1.0681, "loss/crossentropy": 2.6901113986968994, "loss/hidden": 0.89453125, "loss/logits": 0.14910361170768738, "loss/reg": 0.0024489881470799446, "step": 2177 }, { "epoch": 0.27225, "grad_norm": 2.9347352981567383, "grad_norm_var": 0.20422975863511184, "learning_rate": 0.0001, "loss": 1.1369, "loss/crossentropy": 2.703372001647949, "loss/hidden": 0.9453125, "loss/logits": 0.16715854406356812, "loss/reg": 0.002447771141305566, "step": 2178 }, { "epoch": 0.272375, "grad_norm": 2.468362331390381, "grad_norm_var": 0.2043835086792484, "learning_rate": 0.0001, "loss": 1.0516, "loss/crossentropy": 2.517442226409912, "loss/hidden": 0.85546875, "loss/logits": 0.1717083901166916, "loss/reg": 0.002446432365104556, "step": 2179 }, { "epoch": 0.2725, "grad_norm": 2.2512402534484863, "grad_norm_var": 0.20650154512647206, "learning_rate": 0.0001, "loss": 1.0634, "loss/crossentropy": 2.51335072517395, "loss/hidden": 0.8984375, "loss/logits": 0.1405012011528015, "loss/reg": 0.0024451883509755135, "step": 2180 }, { "epoch": 0.272625, "grad_norm": 2.1832735538482666, "grad_norm_var": 0.18814301725907753, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.304950714111328, "loss/hidden": 0.9140625, "loss/logits": 0.1680677831172943, "loss/reg": 0.0024438677355647087, "step": 2181 }, { "epoch": 0.27275, "grad_norm": 3.028186082839966, "grad_norm_var": 0.19866346147011668, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.2430667877197266, "loss/hidden": 1.0, "loss/logits": 0.15265987813472748, "loss/reg": 0.002442531520500779, "step": 2182 }, { "epoch": 0.272875, "grad_norm": 3.452002763748169, "grad_norm_var": 0.2429923816723561, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.9306530952453613, "loss/hidden": 1.1875, "loss/logits": 0.18550564348697662, "loss/reg": 0.0024412302300333977, "step": 2183 }, { "epoch": 0.273, "grad_norm": 2.246469259262085, "grad_norm_var": 0.24829348455254088, "learning_rate": 0.0001, "loss": 0.9694, "loss/crossentropy": 2.875187873840332, "loss/hidden": 0.8125, "loss/logits": 0.13248112797737122, "loss/reg": 0.0024398835375905037, "step": 2184 }, { "epoch": 0.273125, "grad_norm": 1.9008116722106934, "grad_norm_var": 0.2584376253200539, "learning_rate": 0.0001, "loss": 1.0076, "loss/crossentropy": 2.5398221015930176, "loss/hidden": 0.8359375, "loss/logits": 0.1472659856081009, "loss/reg": 0.002438550116494298, "step": 2185 }, { "epoch": 0.27325, "grad_norm": 2.383338212966919, "grad_norm_var": 0.25134477204656847, "learning_rate": 0.0001, "loss": 1.0405, "loss/crossentropy": 2.3420464992523193, "loss/hidden": 0.86328125, "loss/logits": 0.15280337631702423, "loss/reg": 0.0024372416082769632, "step": 2186 }, { "epoch": 0.273375, "grad_norm": 2.059873580932617, "grad_norm_var": 0.24846862725252852, "learning_rate": 0.0001, "loss": 1.1192, "loss/crossentropy": 2.2273800373077393, "loss/hidden": 0.9375, "loss/logits": 0.15739032626152039, "loss/reg": 0.0024359358940273523, "step": 2187 }, { "epoch": 0.2735, "grad_norm": 2.1881048679351807, "grad_norm_var": 0.2535775261348732, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.5001754760742188, "loss/hidden": 0.93359375, "loss/logits": 0.17471668124198914, "loss/reg": 0.002434708643704653, "step": 2188 }, { "epoch": 0.273625, "grad_norm": 2.273981809616089, "grad_norm_var": 0.17122102032743194, "learning_rate": 0.0001, "loss": 1.1312, "loss/crossentropy": 2.2429873943328857, "loss/hidden": 0.94921875, "loss/logits": 0.15765118598937988, "loss/reg": 0.002433572430163622, "step": 2189 }, { "epoch": 0.27375, "grad_norm": 2.81227970123291, "grad_norm_var": 0.17673399534132841, "learning_rate": 0.0001, "loss": 1.1478, "loss/crossentropy": 2.391328811645508, "loss/hidden": 0.953125, "loss/logits": 0.1703391671180725, "loss/reg": 0.0024322587996721268, "step": 2190 }, { "epoch": 0.273875, "grad_norm": 2.333510160446167, "grad_norm_var": 0.17638221193544698, "learning_rate": 0.0001, "loss": 1.1855, "loss/crossentropy": 2.386115550994873, "loss/hidden": 0.97265625, "loss/logits": 0.18851390480995178, "loss/reg": 0.002430940279737115, "step": 2191 }, { "epoch": 0.274, "grad_norm": 2.024066209793091, "grad_norm_var": 0.17644886482582908, "learning_rate": 0.0001, "loss": 1.0643, "loss/crossentropy": 2.5470540523529053, "loss/hidden": 0.90234375, "loss/logits": 0.13766372203826904, "loss/reg": 0.0024296287447214127, "step": 2192 }, { "epoch": 0.274125, "grad_norm": 2.385667324066162, "grad_norm_var": 0.17394588312457676, "learning_rate": 0.0001, "loss": 1.1431, "loss/crossentropy": 2.254932165145874, "loss/hidden": 0.9453125, "loss/logits": 0.17348717153072357, "loss/reg": 0.002428269013762474, "step": 2193 }, { "epoch": 0.27425, "grad_norm": 2.2435672283172607, "grad_norm_var": 0.1575530977191183, "learning_rate": 0.0001, "loss": 1.0206, "loss/crossentropy": 2.4399256706237793, "loss/hidden": 0.8515625, "loss/logits": 0.14472949504852295, "loss/reg": 0.002426979597657919, "step": 2194 }, { "epoch": 0.274375, "grad_norm": 2.492258071899414, "grad_norm_var": 0.15783950416811574, "learning_rate": 0.0001, "loss": 1.5161, "loss/crossentropy": 2.1318180561065674, "loss/hidden": 1.265625, "loss/logits": 0.2262248694896698, "loss/reg": 0.002425666432827711, "step": 2195 }, { "epoch": 0.2745, "grad_norm": 2.47834849357605, "grad_norm_var": 0.15682608115677468, "learning_rate": 0.0001, "loss": 1.1259, "loss/crossentropy": 2.5137441158294678, "loss/hidden": 0.93359375, "loss/logits": 0.16804705560207367, "loss/reg": 0.0024242608342319727, "step": 2196 }, { "epoch": 0.274625, "grad_norm": 2.338765859603882, "grad_norm_var": 0.15373285997066333, "learning_rate": 0.0001, "loss": 1.0452, "loss/crossentropy": 2.3742787837982178, "loss/hidden": 0.8671875, "loss/logits": 0.15376636385917664, "loss/reg": 0.002422948833554983, "step": 2197 }, { "epoch": 0.27475, "grad_norm": 2.5195517539978027, "grad_norm_var": 0.12832238511252853, "learning_rate": 0.0001, "loss": 0.9866, "loss/crossentropy": 2.7433509826660156, "loss/hidden": 0.81640625, "loss/logits": 0.1460229456424713, "loss/reg": 0.002421543002128601, "step": 2198 }, { "epoch": 0.274875, "grad_norm": 39.86418151855469, "grad_norm_var": 88.18231273868413, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.2249467372894287, "loss/hidden": 1.1484375, "loss/logits": 0.20356109738349915, "loss/reg": 0.0024201541673392057, "step": 2199 }, { "epoch": 0.275, "grad_norm": 2.2128329277038574, "grad_norm_var": 88.19320349331552, "learning_rate": 0.0001, "loss": 1.1435, "loss/crossentropy": 2.3889286518096924, "loss/hidden": 0.95703125, "loss/logits": 0.16229528188705444, "loss/reg": 0.002418855205178261, "step": 2200 }, { "epoch": 0.275125, "grad_norm": 2.263549566268921, "grad_norm_var": 88.06812657522232, "learning_rate": 0.0001, "loss": 1.0676, "loss/crossentropy": 2.4727964401245117, "loss/hidden": 0.8984375, "loss/logits": 0.1449824571609497, "loss/reg": 0.002417441690340638, "step": 2201 }, { "epoch": 0.27525, "grad_norm": 2.204317331314087, "grad_norm_var": 88.12494052659383, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.845208168029785, "loss/hidden": 0.86328125, "loss/logits": 0.15528936684131622, "loss/reg": 0.002416152274236083, "step": 2202 }, { "epoch": 0.275375, "grad_norm": 2.7695164680480957, "grad_norm_var": 87.90959609587053, "learning_rate": 0.0001, "loss": 1.1572, "loss/crossentropy": 3.061274290084839, "loss/hidden": 0.95703125, "loss/logits": 0.1759844273328781, "loss/reg": 0.002414730377495289, "step": 2203 }, { "epoch": 0.2755, "grad_norm": 2.717144012451172, "grad_norm_var": 87.74900173223746, "learning_rate": 0.0001, "loss": 0.9618, "loss/crossentropy": 2.6843478679656982, "loss/hidden": 0.8046875, "loss/logits": 0.13296306133270264, "loss/reg": 0.002413275185972452, "step": 2204 }, { "epoch": 0.275625, "grad_norm": 4.169137954711914, "grad_norm_var": 87.34886859661202, "learning_rate": 0.0001, "loss": 1.2853, "loss/crossentropy": 2.2965047359466553, "loss/hidden": 1.109375, "loss/logits": 0.15182708203792572, "loss/reg": 0.0024117783177644014, "step": 2205 }, { "epoch": 0.27575, "grad_norm": 2.2667558193206787, "grad_norm_var": 87.51672469703968, "learning_rate": 0.0001, "loss": 1.029, "loss/crossentropy": 2.393016815185547, "loss/hidden": 0.86328125, "loss/logits": 0.14166471362113953, "loss/reg": 0.002410315675660968, "step": 2206 }, { "epoch": 0.275875, "grad_norm": 2.368360757827759, "grad_norm_var": 87.50519913148962, "learning_rate": 0.0001, "loss": 0.8954, "loss/crossentropy": 2.2895853519439697, "loss/hidden": 0.7578125, "loss/logits": 0.11348042637109756, "loss/reg": 0.0024090223014354706, "step": 2207 }, { "epoch": 0.276, "grad_norm": 2.76328444480896, "grad_norm_var": 87.26255798292681, "learning_rate": 0.0001, "loss": 1.1464, "loss/crossentropy": 2.0994620323181152, "loss/hidden": 0.97265625, "loss/logits": 0.14964362978935242, "loss/reg": 0.0024077349808067083, "step": 2208 }, { "epoch": 0.276125, "grad_norm": 2.934947967529297, "grad_norm_var": 87.09884050424269, "learning_rate": 0.0001, "loss": 1.188, "loss/crossentropy": 2.646623134613037, "loss/hidden": 0.984375, "loss/logits": 0.1795879751443863, "loss/reg": 0.0024064902681857347, "step": 2209 }, { "epoch": 0.27625, "grad_norm": 2.149209976196289, "grad_norm_var": 87.13297984180517, "learning_rate": 0.0001, "loss": 0.9906, "loss/crossentropy": 2.5921735763549805, "loss/hidden": 0.83203125, "loss/logits": 0.13455170392990112, "loss/reg": 0.0024051491636782885, "step": 2210 }, { "epoch": 0.276375, "grad_norm": 2.078240156173706, "grad_norm_var": 87.27699310375388, "learning_rate": 0.0001, "loss": 1.0822, "loss/crossentropy": 2.531278133392334, "loss/hidden": 0.9140625, "loss/logits": 0.14407508075237274, "loss/reg": 0.002403819700703025, "step": 2211 }, { "epoch": 0.2765, "grad_norm": 2.0615551471710205, "grad_norm_var": 87.42137906886117, "learning_rate": 0.0001, "loss": 1.0569, "loss/crossentropy": 2.5218617916107178, "loss/hidden": 0.875, "loss/logits": 0.1578611135482788, "loss/reg": 0.00240244809538126, "step": 2212 }, { "epoch": 0.276625, "grad_norm": 2.3070061206817627, "grad_norm_var": 87.43209779441034, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.5459678173065186, "loss/hidden": 0.90234375, "loss/logits": 0.16296803951263428, "loss/reg": 0.002401063684374094, "step": 2213 }, { "epoch": 0.27675, "grad_norm": 2.5456299781799316, "grad_norm_var": 87.4240263282035, "learning_rate": 0.0001, "loss": 1.0224, "loss/crossentropy": 2.7797255516052246, "loss/hidden": 0.86328125, "loss/logits": 0.1351677030324936, "loss/reg": 0.0023996541276574135, "step": 2214 }, { "epoch": 0.276875, "grad_norm": 2.454465389251709, "grad_norm_var": 0.26613297432868044, "learning_rate": 0.0001, "loss": 1.0993, "loss/crossentropy": 2.325683116912842, "loss/hidden": 0.92578125, "loss/logits": 0.14953814446926117, "loss/reg": 0.0023983491118997335, "step": 2215 }, { "epoch": 0.277, "grad_norm": 2.4010567665100098, "grad_norm_var": 0.2607231884573897, "learning_rate": 0.0001, "loss": 1.0524, "loss/crossentropy": 2.5979669094085693, "loss/hidden": 0.859375, "loss/logits": 0.16907179355621338, "loss/reg": 0.002396960509940982, "step": 2216 }, { "epoch": 0.277125, "grad_norm": 2.320359230041504, "grad_norm_var": 0.2589188603631458, "learning_rate": 0.0001, "loss": 1.0719, "loss/crossentropy": 2.635875701904297, "loss/hidden": 0.89453125, "loss/logits": 0.15336889028549194, "loss/reg": 0.0023956771474331617, "step": 2217 }, { "epoch": 0.27725, "grad_norm": 3.701313018798828, "grad_norm_var": 0.33358847428227895, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.4794797897338867, "loss/hidden": 1.1796875, "loss/logits": 0.23391982913017273, "loss/reg": 0.002394366078078747, "step": 2218 }, { "epoch": 0.277375, "grad_norm": 2.2712745666503906, "grad_norm_var": 0.33953637806328724, "learning_rate": 0.0001, "loss": 1.0939, "loss/crossentropy": 2.227126359939575, "loss/hidden": 0.9140625, "loss/logits": 0.1559508740901947, "loss/reg": 0.0023930894676595926, "step": 2219 }, { "epoch": 0.2775, "grad_norm": 3.200831413269043, "grad_norm_var": 0.3620770912486317, "learning_rate": 0.0001, "loss": 1.4845, "loss/crossentropy": 1.5896509885787964, "loss/hidden": 1.265625, "loss/logits": 0.1949714720249176, "loss/reg": 0.002391854068264365, "step": 2220 }, { "epoch": 0.277625, "grad_norm": 2.6523869037628174, "grad_norm_var": 0.19350101011422657, "learning_rate": 0.0001, "loss": 1.0338, "loss/crossentropy": 2.3628790378570557, "loss/hidden": 0.83984375, "loss/logits": 0.17002764344215393, "loss/reg": 0.00239055254496634, "step": 2221 }, { "epoch": 0.27775, "grad_norm": 3.398405075073242, "grad_norm_var": 0.23385170773845326, "learning_rate": 0.0001, "loss": 1.3258, "loss/crossentropy": 2.0411486625671387, "loss/hidden": 1.1328125, "loss/logits": 0.1691063493490219, "loss/reg": 0.0023892249446362257, "step": 2222 }, { "epoch": 0.277875, "grad_norm": 2.298076629638672, "grad_norm_var": 0.23633606761226414, "learning_rate": 0.0001, "loss": 1.2568, "loss/crossentropy": 2.354487180709839, "loss/hidden": 1.046875, "loss/logits": 0.1860392689704895, "loss/reg": 0.002387946704402566, "step": 2223 }, { "epoch": 0.278, "grad_norm": 2.1703898906707764, "grad_norm_var": 0.2450921360847244, "learning_rate": 0.0001, "loss": 0.9363, "loss/crossentropy": 2.7801222801208496, "loss/hidden": 0.7734375, "loss/logits": 0.13903328776359558, "loss/reg": 0.0023865378461778164, "step": 2224 }, { "epoch": 0.278125, "grad_norm": 1.9371227025985718, "grad_norm_var": 0.2573127535239573, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.5280539989471436, "loss/hidden": 0.953125, "loss/logits": 0.17873959243297577, "loss/reg": 0.002385256811976433, "step": 2225 }, { "epoch": 0.27825, "grad_norm": 2.7512600421905518, "grad_norm_var": 0.2520719686541833, "learning_rate": 0.0001, "loss": 1.404, "loss/crossentropy": 2.6072893142700195, "loss/hidden": 1.125, "loss/logits": 0.2551468014717102, "loss/reg": 0.0023838947527110577, "step": 2226 }, { "epoch": 0.278375, "grad_norm": 2.2059197425842285, "grad_norm_var": 0.24532630111889747, "learning_rate": 0.0001, "loss": 1.1603, "loss/crossentropy": 2.6327438354492188, "loss/hidden": 0.94921875, "loss/logits": 0.18722915649414062, "loss/reg": 0.0023826071992516518, "step": 2227 }, { "epoch": 0.2785, "grad_norm": 2.207167387008667, "grad_norm_var": 0.23731753271737777, "learning_rate": 0.0001, "loss": 1.1868, "loss/crossentropy": 2.3738765716552734, "loss/hidden": 1.0, "loss/logits": 0.16301679611206055, "loss/reg": 0.002381255617365241, "step": 2228 }, { "epoch": 0.278625, "grad_norm": 2.4261422157287598, "grad_norm_var": 0.2343222068472859, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.5230891704559326, "loss/hidden": 0.91796875, "loss/logits": 0.16102899610996246, "loss/reg": 0.0023799969349056482, "step": 2229 }, { "epoch": 0.27875, "grad_norm": 3.2842934131622314, "grad_norm_var": 0.267120429704245, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.159243106842041, "loss/hidden": 0.9765625, "loss/logits": 0.14861997961997986, "loss/reg": 0.0023786937817931175, "step": 2230 }, { "epoch": 0.278875, "grad_norm": 2.1109652519226074, "grad_norm_var": 0.28139076846260447, "learning_rate": 0.0001, "loss": 0.9918, "loss/crossentropy": 2.2167649269104004, "loss/hidden": 0.8359375, "loss/logits": 0.1320679634809494, "loss/reg": 0.002377388533204794, "step": 2231 }, { "epoch": 0.279, "grad_norm": 2.941351890563965, "grad_norm_var": 0.28648826135568767, "learning_rate": 0.0001, "loss": 1.1839, "loss/crossentropy": 2.5684897899627686, "loss/hidden": 0.96875, "loss/logits": 0.19135433435440063, "loss/reg": 0.0023760488256812096, "step": 2232 }, { "epoch": 0.279125, "grad_norm": 2.1496760845184326, "grad_norm_var": 0.2950674153806327, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.641505479812622, "loss/hidden": 0.90625, "loss/logits": 0.16562530398368835, "loss/reg": 0.0023748010862618685, "step": 2233 }, { "epoch": 0.27925, "grad_norm": 2.245387554168701, "grad_norm_var": 0.21505228651478248, "learning_rate": 0.0001, "loss": 1.2012, "loss/crossentropy": 2.346283197402954, "loss/hidden": 1.0, "loss/logits": 0.1774587631225586, "loss/reg": 0.0023735652212053537, "step": 2234 }, { "epoch": 0.279375, "grad_norm": 2.4604525566101074, "grad_norm_var": 0.2111246002462498, "learning_rate": 0.0001, "loss": 1.2916, "loss/crossentropy": 2.1031174659729004, "loss/hidden": 1.078125, "loss/logits": 0.18975679576396942, "loss/reg": 0.0023722327314317226, "step": 2235 }, { "epoch": 0.2795, "grad_norm": 2.3865561485290527, "grad_norm_var": 0.17946008584778728, "learning_rate": 0.0001, "loss": 1.1502, "loss/crossentropy": 2.46610951423645, "loss/hidden": 0.96484375, "loss/logits": 0.16160912811756134, "loss/reg": 0.0023708741646260023, "step": 2236 }, { "epoch": 0.279625, "grad_norm": 3.095547676086426, "grad_norm_var": 0.20212163916855932, "learning_rate": 0.0001, "loss": 1.2421, "loss/crossentropy": 2.6746020317077637, "loss/hidden": 1.015625, "loss/logits": 0.20277395844459534, "loss/reg": 0.002369475783780217, "step": 2237 }, { "epoch": 0.27975, "grad_norm": 2.4965710639953613, "grad_norm_var": 0.1454412824633526, "learning_rate": 0.0001, "loss": 1.2251, "loss/crossentropy": 2.6806206703186035, "loss/hidden": 0.9921875, "loss/logits": 0.2092117816209793, "loss/reg": 0.0023681086022406816, "step": 2238 }, { "epoch": 0.279875, "grad_norm": 2.304879665374756, "grad_norm_var": 0.14530824731896713, "learning_rate": 0.0001, "loss": 1.0608, "loss/crossentropy": 2.7175748348236084, "loss/hidden": 0.8828125, "loss/logits": 0.1543220579624176, "loss/reg": 0.0023667693603783846, "step": 2239 }, { "epoch": 0.28, "grad_norm": 2.46773624420166, "grad_norm_var": 0.13981391266067783, "learning_rate": 0.0001, "loss": 1.186, "loss/crossentropy": 2.545437812805176, "loss/hidden": 0.98828125, "loss/logits": 0.17406979203224182, "loss/reg": 0.002365546068176627, "step": 2240 }, { "epoch": 0.280125, "grad_norm": 6.842985153198242, "grad_norm_var": 1.2974707972437955, "learning_rate": 0.0001, "loss": 1.2845, "loss/crossentropy": 2.4045698642730713, "loss/hidden": 1.1015625, "loss/logits": 0.1593124270439148, "loss/reg": 0.0023641649167984724, "step": 2241 }, { "epoch": 0.28025, "grad_norm": 2.802086591720581, "grad_norm_var": 1.2974811606530077, "learning_rate": 0.0001, "loss": 1.2375, "loss/crossentropy": 2.37119460105896, "loss/hidden": 1.046875, "loss/logits": 0.16696254909038544, "loss/reg": 0.002362899947911501, "step": 2242 }, { "epoch": 0.280375, "grad_norm": 4.049259662628174, "grad_norm_var": 1.3695564680068921, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.65864896774292, "loss/hidden": 1.0, "loss/logits": 0.27689510583877563, "loss/reg": 0.002361551858484745, "step": 2243 }, { "epoch": 0.2805, "grad_norm": 4.948246479034424, "grad_norm_var": 1.588881920227938, "learning_rate": 0.0001, "loss": 1.932, "loss/crossentropy": 2.4465274810791016, "loss/hidden": 1.578125, "loss/logits": 0.33026307821273804, "loss/reg": 0.0023601939901709557, "step": 2244 }, { "epoch": 0.280625, "grad_norm": 2.4600002765655518, "grad_norm_var": 1.5860773648579805, "learning_rate": 0.0001, "loss": 1.0547, "loss/crossentropy": 2.506960391998291, "loss/hidden": 0.88671875, "loss/logits": 0.14437949657440186, "loss/reg": 0.0023589441552758217, "step": 2245 }, { "epoch": 0.28075, "grad_norm": 2.20862078666687, "grad_norm_var": 1.6269963621218333, "learning_rate": 0.0001, "loss": 1.1002, "loss/crossentropy": 2.4335319995880127, "loss/hidden": 0.91796875, "loss/logits": 0.15866833925247192, "loss/reg": 0.0023576358798891306, "step": 2246 }, { "epoch": 0.280875, "grad_norm": 2.959247589111328, "grad_norm_var": 1.5716264183384283, "learning_rate": 0.0001, "loss": 1.3174, "loss/crossentropy": 2.7157256603240967, "loss/hidden": 1.0859375, "loss/logits": 0.20794320106506348, "loss/reg": 0.0023564202710986137, "step": 2247 }, { "epoch": 0.281, "grad_norm": 2.2463550567626953, "grad_norm_var": 1.611990973966897, "learning_rate": 0.0001, "loss": 1.0737, "loss/crossentropy": 2.491847515106201, "loss/hidden": 0.90625, "loss/logits": 0.1439252644777298, "loss/reg": 0.0023552048951387405, "step": 2248 }, { "epoch": 0.281125, "grad_norm": 2.797983169555664, "grad_norm_var": 1.5640892485165332, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.7668051719665527, "loss/hidden": 0.98046875, "loss/logits": 0.24538934230804443, "loss/reg": 0.002353944582864642, "step": 2249 }, { "epoch": 0.28125, "grad_norm": 2.0815508365631104, "grad_norm_var": 1.583305234138436, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.484417200088501, "loss/hidden": 0.9609375, "loss/logits": 0.16450098156929016, "loss/reg": 0.0023527194280177355, "step": 2250 }, { "epoch": 0.281375, "grad_norm": 2.8786518573760986, "grad_norm_var": 1.5620316333247253, "learning_rate": 0.0001, "loss": 1.0573, "loss/crossentropy": 2.3937926292419434, "loss/hidden": 0.89453125, "loss/logits": 0.13926297426223755, "loss/reg": 0.0023514260537922382, "step": 2251 }, { "epoch": 0.2815, "grad_norm": 2.1255078315734863, "grad_norm_var": 1.5898751387395271, "learning_rate": 0.0001, "loss": 0.9616, "loss/crossentropy": 2.6224896907806396, "loss/hidden": 0.80859375, "loss/logits": 0.12955206632614136, "loss/reg": 0.002349988091737032, "step": 2252 }, { "epoch": 0.281625, "grad_norm": 4.148455619812012, "grad_norm_var": 1.6658630001241854, "learning_rate": 0.0001, "loss": 1.1837, "loss/crossentropy": 2.2562179565429688, "loss/hidden": 0.98828125, "loss/logits": 0.1718844473361969, "loss/reg": 0.002348555251955986, "step": 2253 }, { "epoch": 0.28175, "grad_norm": 2.70953631401062, "grad_norm_var": 1.6511759200096776, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.597907066345215, "loss/hidden": 0.89453125, "loss/logits": 0.1572403907775879, "loss/reg": 0.002347097033634782, "step": 2254 }, { "epoch": 0.281875, "grad_norm": 15.160088539123535, "grad_norm_var": 10.570659548094392, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.600619077682495, "loss/hidden": 1.0859375, "loss/logits": 0.2120717167854309, "loss/reg": 0.002345849759876728, "step": 2255 }, { "epoch": 0.282, "grad_norm": 3.32181453704834, "grad_norm_var": 10.449686867365251, "learning_rate": 0.0001, "loss": 1.2123, "loss/crossentropy": 2.6941568851470947, "loss/hidden": 0.97265625, "loss/logits": 0.21618157625198364, "loss/reg": 0.0023446185514330864, "step": 2256 }, { "epoch": 0.282125, "grad_norm": 2.9014816284179688, "grad_norm_var": 9.918040589770209, "learning_rate": 0.0001, "loss": 1.1291, "loss/crossentropy": 2.5347681045532227, "loss/hidden": 0.9296875, "loss/logits": 0.17597657442092896, "loss/reg": 0.0023433659225702286, "step": 2257 }, { "epoch": 0.28225, "grad_norm": 2.697105884552002, "grad_norm_var": 9.931821806662498, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.640836715698242, "loss/hidden": 0.859375, "loss/logits": 0.13994210958480835, "loss/reg": 0.0023421046789735556, "step": 2258 }, { "epoch": 0.282375, "grad_norm": 2.070585250854492, "grad_norm_var": 10.092520006567533, "learning_rate": 0.0001, "loss": 0.9531, "loss/crossentropy": 2.48368501663208, "loss/hidden": 0.80078125, "loss/logits": 0.12891200184822083, "loss/reg": 0.0023408529814332724, "step": 2259 }, { "epoch": 0.2825, "grad_norm": 2.919170618057251, "grad_norm_var": 9.98703100641419, "learning_rate": 0.0001, "loss": 1.181, "loss/crossentropy": 2.347562789916992, "loss/hidden": 0.98046875, "loss/logits": 0.17717091739177704, "loss/reg": 0.0023395537864416838, "step": 2260 }, { "epoch": 0.282625, "grad_norm": 2.1717369556427, "grad_norm_var": 10.03144307873515, "learning_rate": 0.0001, "loss": 1.0057, "loss/crossentropy": 2.4904470443725586, "loss/hidden": 0.81640625, "loss/logits": 0.1659022867679596, "loss/reg": 0.002338262740522623, "step": 2261 }, { "epoch": 0.28275, "grad_norm": 2.9899816513061523, "grad_norm_var": 9.93898364875889, "learning_rate": 0.0001, "loss": 1.2755, "loss/crossentropy": 2.5957393646240234, "loss/hidden": 1.03125, "loss/logits": 0.22083069384098053, "loss/reg": 0.002337030367925763, "step": 2262 }, { "epoch": 0.282875, "grad_norm": 2.7302865982055664, "grad_norm_var": 9.959110272615376, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.3313257694244385, "loss/hidden": 1.0078125, "loss/logits": 0.18980449438095093, "loss/reg": 0.0023357965983450413, "step": 2263 }, { "epoch": 0.283, "grad_norm": 3.0969674587249756, "grad_norm_var": 9.862501838451392, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.4774303436279297, "loss/hidden": 0.99609375, "loss/logits": 0.17506292462348938, "loss/reg": 0.002334574470296502, "step": 2264 }, { "epoch": 0.283125, "grad_norm": 3.6323814392089844, "grad_norm_var": 9.82234512134568, "learning_rate": 0.0001, "loss": 1.3518, "loss/crossentropy": 2.7669167518615723, "loss/hidden": 1.140625, "loss/logits": 0.18784978985786438, "loss/reg": 0.002333372598513961, "step": 2265 }, { "epoch": 0.28325, "grad_norm": 2.962461471557617, "grad_norm_var": 9.69223711740269, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.4263763427734375, "loss/hidden": 0.94921875, "loss/logits": 0.1893639862537384, "loss/reg": 0.002332240343093872, "step": 2266 }, { "epoch": 0.283375, "grad_norm": 2.1170146465301514, "grad_norm_var": 9.807562085057132, "learning_rate": 0.0001, "loss": 1.1237, "loss/crossentropy": 2.665316581726074, "loss/hidden": 0.9453125, "loss/logits": 0.15509799122810364, "loss/reg": 0.002331107622012496, "step": 2267 }, { "epoch": 0.2835, "grad_norm": 1.9553697109222412, "grad_norm_var": 9.843039409388325, "learning_rate": 0.0001, "loss": 1.0466, "loss/crossentropy": 2.5126242637634277, "loss/hidden": 0.87890625, "loss/logits": 0.1444346159696579, "loss/reg": 0.00232999911531806, "step": 2268 }, { "epoch": 0.283625, "grad_norm": 1.9847123622894287, "grad_norm_var": 9.9771414158157, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.460191249847412, "loss/hidden": 0.890625, "loss/logits": 0.153900146484375, "loss/reg": 0.0023288605734705925, "step": 2269 }, { "epoch": 0.28375, "grad_norm": 2.175307512283325, "grad_norm_var": 10.048705059135525, "learning_rate": 0.0001, "loss": 1.1278, "loss/crossentropy": 2.2887959480285645, "loss/hidden": 0.9453125, "loss/logits": 0.15923447906970978, "loss/reg": 0.002327769761905074, "step": 2270 }, { "epoch": 0.283875, "grad_norm": 1.9558424949645996, "grad_norm_var": 0.2948269846760882, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.4505839347839355, "loss/hidden": 1.0234375, "loss/logits": 0.17306137084960938, "loss/reg": 0.002326522720977664, "step": 2271 }, { "epoch": 0.284, "grad_norm": 3.0274269580841064, "grad_norm_var": 0.2721127614858536, "learning_rate": 0.0001, "loss": 1.0236, "loss/crossentropy": 2.563673734664917, "loss/hidden": 0.83984375, "loss/logits": 0.1605188250541687, "loss/reg": 0.0023253231775015593, "step": 2272 }, { "epoch": 0.284125, "grad_norm": 2.3401095867156982, "grad_norm_var": 0.268250600897346, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.9437670707702637, "loss/hidden": 0.9375, "loss/logits": 0.17039918899536133, "loss/reg": 0.0023240831214934587, "step": 2273 }, { "epoch": 0.28425, "grad_norm": 2.8064186573028564, "grad_norm_var": 0.27111740064450013, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.375760793685913, "loss/hidden": 1.1875, "loss/logits": 0.21980787813663483, "loss/reg": 0.0023229438811540604, "step": 2274 }, { "epoch": 0.284375, "grad_norm": 2.461909055709839, "grad_norm_var": 0.2552313472214562, "learning_rate": 0.0001, "loss": 1.0466, "loss/crossentropy": 2.5115292072296143, "loss/hidden": 0.875, "loss/logits": 0.14840534329414368, "loss/reg": 0.00232181278988719, "step": 2275 }, { "epoch": 0.2845, "grad_norm": 2.295276641845703, "grad_norm_var": 0.2515897410445253, "learning_rate": 0.0001, "loss": 1.0495, "loss/crossentropy": 2.4238152503967285, "loss/hidden": 0.859375, "loss/logits": 0.1669464409351349, "loss/reg": 0.0023207066114991903, "step": 2276 }, { "epoch": 0.284625, "grad_norm": 2.3408892154693604, "grad_norm_var": 0.2449832599577538, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.654197931289673, "loss/hidden": 0.9609375, "loss/logits": 0.18782514333724976, "loss/reg": 0.0023194625973701477, "step": 2277 }, { "epoch": 0.28475, "grad_norm": 2.485037326812744, "grad_norm_var": 0.23160110543602266, "learning_rate": 0.0001, "loss": 1.161, "loss/crossentropy": 2.251394033432007, "loss/hidden": 0.98828125, "loss/logits": 0.14955447614192963, "loss/reg": 0.0023183117154985666, "step": 2278 }, { "epoch": 0.284875, "grad_norm": 2.978438377380371, "grad_norm_var": 0.24230950151186997, "learning_rate": 0.0001, "loss": 1.0045, "loss/crossentropy": 2.4966647624969482, "loss/hidden": 0.8359375, "loss/logits": 0.1454242616891861, "loss/reg": 0.0023171952925622463, "step": 2279 }, { "epoch": 0.285, "grad_norm": 2.29223370552063, "grad_norm_var": 0.22285899767482534, "learning_rate": 0.0001, "loss": 1.0492, "loss/crossentropy": 2.393254041671753, "loss/hidden": 0.88671875, "loss/logits": 0.13930846750736237, "loss/reg": 0.00231595104560256, "step": 2280 }, { "epoch": 0.285125, "grad_norm": 2.196556329727173, "grad_norm_var": 0.13265824422879077, "learning_rate": 0.0001, "loss": 1.0595, "loss/crossentropy": 2.4938318729400635, "loss/hidden": 0.8984375, "loss/logits": 0.13791415095329285, "loss/reg": 0.0023147067986428738, "step": 2281 }, { "epoch": 0.28525, "grad_norm": 3.350985288619995, "grad_norm_var": 0.17131088622846846, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.5520384311676025, "loss/hidden": 1.0390625, "loss/logits": 0.18074104189872742, "loss/reg": 0.0023134720977395773, "step": 2282 }, { "epoch": 0.285375, "grad_norm": 2.4520723819732666, "grad_norm_var": 0.16467015217425854, "learning_rate": 0.0001, "loss": 1.3356, "loss/crossentropy": 2.174309730529785, "loss/hidden": 1.1171875, "loss/logits": 0.19530147314071655, "loss/reg": 0.002312231110408902, "step": 2283 }, { "epoch": 0.2855, "grad_norm": 2.6148715019226074, "grad_norm_var": 0.14891681536379314, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.525818347930908, "loss/hidden": 1.109375, "loss/logits": 0.21527819335460663, "loss/reg": 0.0023110134061425924, "step": 2284 }, { "epoch": 0.285625, "grad_norm": 3.340635299682617, "grad_norm_var": 0.17339950037755802, "learning_rate": 0.0001, "loss": 1.2745, "loss/crossentropy": 2.624620199203491, "loss/hidden": 1.0546875, "loss/logits": 0.19669537246227264, "loss/reg": 0.0023098145611584187, "step": 2285 }, { "epoch": 0.28575, "grad_norm": 2.3565597534179688, "grad_norm_var": 0.1659233010853368, "learning_rate": 0.0001, "loss": 1.3221, "loss/crossentropy": 2.3247995376586914, "loss/hidden": 1.1015625, "loss/logits": 0.1974058747291565, "loss/reg": 0.002308570547029376, "step": 2286 }, { "epoch": 0.285875, "grad_norm": 2.4912595748901367, "grad_norm_var": 0.13921422281447515, "learning_rate": 0.0001, "loss": 1.1088, "loss/crossentropy": 2.5991525650024414, "loss/hidden": 0.921875, "loss/logits": 0.16389669477939606, "loss/reg": 0.0023073714692145586, "step": 2287 }, { "epoch": 0.286, "grad_norm": 2.6419484615325928, "grad_norm_var": 0.12727382416776603, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 1.9776073694229126, "loss/hidden": 1.125, "loss/logits": 0.18549180030822754, "loss/reg": 0.002306167734786868, "step": 2288 }, { "epoch": 0.286125, "grad_norm": 2.8341853618621826, "grad_norm_var": 0.12604736563566968, "learning_rate": 0.0001, "loss": 1.1864, "loss/crossentropy": 2.543550491333008, "loss/hidden": 0.984375, "loss/logits": 0.17898789048194885, "loss/reg": 0.0023050198797136545, "step": 2289 }, { "epoch": 0.28625, "grad_norm": 3.4228172302246094, "grad_norm_var": 0.165016139303043, "learning_rate": 0.0001, "loss": 1.5123, "loss/crossentropy": 2.928520441055298, "loss/hidden": 1.2109375, "loss/logits": 0.2783648371696472, "loss/reg": 0.0023037800565361977, "step": 2290 }, { "epoch": 0.286375, "grad_norm": 2.230731248855591, "grad_norm_var": 0.17445390423555693, "learning_rate": 0.0001, "loss": 1.0664, "loss/crossentropy": 2.4718921184539795, "loss/hidden": 0.890625, "loss/logits": 0.1527043581008911, "loss/reg": 0.002302588429301977, "step": 2291 }, { "epoch": 0.2865, "grad_norm": 2.1117284297943115, "grad_norm_var": 0.18512521835621457, "learning_rate": 0.0001, "loss": 1.1429, "loss/crossentropy": 2.625943899154663, "loss/hidden": 0.953125, "loss/logits": 0.1667558252811432, "loss/reg": 0.002301277592778206, "step": 2292 }, { "epoch": 0.286625, "grad_norm": 2.4937758445739746, "grad_norm_var": 0.18061497065554305, "learning_rate": 0.0001, "loss": 1.127, "loss/crossentropy": 2.5712082386016846, "loss/hidden": 0.9375, "loss/logits": 0.16652223467826843, "loss/reg": 0.0023000373039394617, "step": 2293 }, { "epoch": 0.28675, "grad_norm": 2.458190679550171, "grad_norm_var": 0.18122675848363937, "learning_rate": 0.0001, "loss": 1.1445, "loss/crossentropy": 2.4598801136016846, "loss/hidden": 0.95703125, "loss/logits": 0.16446572542190552, "loss/reg": 0.0022987998090684414, "step": 2294 }, { "epoch": 0.286875, "grad_norm": 2.3908166885375977, "grad_norm_var": 0.17642362742102263, "learning_rate": 0.0001, "loss": 1.0981, "loss/crossentropy": 2.544672966003418, "loss/hidden": 0.90234375, "loss/logits": 0.1727607101202011, "loss/reg": 0.0022975679021328688, "step": 2295 }, { "epoch": 0.287, "grad_norm": 2.045161008834839, "grad_norm_var": 0.1905411013016644, "learning_rate": 0.0001, "loss": 1.0386, "loss/crossentropy": 2.479443311691284, "loss/hidden": 0.87109375, "loss/logits": 0.1445106863975525, "loss/reg": 0.002296352991834283, "step": 2296 }, { "epoch": 0.287125, "grad_norm": 2.1158037185668945, "grad_norm_var": 0.1951796917098932, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.6101503372192383, "loss/hidden": 0.94140625, "loss/logits": 0.1699090600013733, "loss/reg": 0.0022950852289795876, "step": 2297 }, { "epoch": 0.28725, "grad_norm": 2.193514823913574, "grad_norm_var": 0.1606176847888811, "learning_rate": 0.0001, "loss": 1.0855, "loss/crossentropy": 2.3473117351531982, "loss/hidden": 0.92578125, "loss/logits": 0.1367611587047577, "loss/reg": 0.002293873345479369, "step": 2298 }, { "epoch": 0.287375, "grad_norm": 2.4940381050109863, "grad_norm_var": 0.16039170952120968, "learning_rate": 0.0001, "loss": 1.1701, "loss/crossentropy": 2.3491015434265137, "loss/hidden": 0.98046875, "loss/logits": 0.1667015701532364, "loss/reg": 0.00229267799295485, "step": 2299 }, { "epoch": 0.2875, "grad_norm": 2.2946813106536865, "grad_norm_var": 0.16252503039558686, "learning_rate": 0.0001, "loss": 1.1544, "loss/crossentropy": 2.68192982673645, "loss/hidden": 0.9609375, "loss/logits": 0.17053982615470886, "loss/reg": 0.0022914325818419456, "step": 2300 }, { "epoch": 0.287625, "grad_norm": 2.6737537384033203, "grad_norm_var": 0.11510583042833838, "learning_rate": 0.0001, "loss": 0.978, "loss/crossentropy": 2.514535665512085, "loss/hidden": 0.80859375, "loss/logits": 0.1465175449848175, "loss/reg": 0.002290048636496067, "step": 2301 }, { "epoch": 0.28775, "grad_norm": 2.342859983444214, "grad_norm_var": 0.11529383216604762, "learning_rate": 0.0001, "loss": 1.0737, "loss/crossentropy": 2.560364007949829, "loss/hidden": 0.90625, "loss/logits": 0.1445198655128479, "loss/reg": 0.002288810908794403, "step": 2302 }, { "epoch": 0.287875, "grad_norm": 2.5364012718200684, "grad_norm_var": 0.11565626345865213, "learning_rate": 0.0001, "loss": 1.1419, "loss/crossentropy": 2.7632384300231934, "loss/hidden": 0.92578125, "loss/logits": 0.1932322382926941, "loss/reg": 0.0022875003051012754, "step": 2303 }, { "epoch": 0.288, "grad_norm": 2.964402675628662, "grad_norm_var": 0.13019135494514344, "learning_rate": 0.0001, "loss": 1.3837, "loss/crossentropy": 2.0394208431243896, "loss/hidden": 1.1796875, "loss/logits": 0.18116863071918488, "loss/reg": 0.0022861603647470474, "step": 2304 }, { "epoch": 0.288125, "grad_norm": 2.503971576690674, "grad_norm_var": 0.12119990797890808, "learning_rate": 0.0001, "loss": 1.2923, "loss/crossentropy": 2.323030948638916, "loss/hidden": 1.0703125, "loss/logits": 0.19916847348213196, "loss/reg": 0.002284962683916092, "step": 2305 }, { "epoch": 0.28825, "grad_norm": 2.6977484226226807, "grad_norm_var": 0.06044874125653313, "learning_rate": 0.0001, "loss": 1.2035, "loss/crossentropy": 2.803896427154541, "loss/hidden": 0.98828125, "loss/logits": 0.19237124919891357, "loss/reg": 0.002283781534060836, "step": 2306 }, { "epoch": 0.288375, "grad_norm": 4.449802875518799, "grad_norm_var": 0.31540449428999484, "learning_rate": 0.0001, "loss": 1.3955, "loss/crossentropy": 2.784353256225586, "loss/hidden": 1.125, "loss/logits": 0.2477027028799057, "loss/reg": 0.0022825354244560003, "step": 2307 }, { "epoch": 0.2885, "grad_norm": 2.4779105186462402, "grad_norm_var": 0.3024885483043361, "learning_rate": 0.0001, "loss": 1.1618, "loss/crossentropy": 2.4562690258026123, "loss/hidden": 0.953125, "loss/logits": 0.18587693572044373, "loss/reg": 0.0022813454270362854, "step": 2308 }, { "epoch": 0.288625, "grad_norm": 2.3362557888031006, "grad_norm_var": 0.30565709067314706, "learning_rate": 0.0001, "loss": 1.1798, "loss/crossentropy": 2.363804578781128, "loss/hidden": 0.97265625, "loss/logits": 0.18437422811985016, "loss/reg": 0.0022800499573349953, "step": 2309 }, { "epoch": 0.28875, "grad_norm": 3.4993114471435547, "grad_norm_var": 0.35913723861517965, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.4836864471435547, "loss/hidden": 0.90625, "loss/logits": 0.14366208016872406, "loss/reg": 0.0022788674104958773, "step": 2310 }, { "epoch": 0.288875, "grad_norm": 2.315100908279419, "grad_norm_var": 0.36187009577711154, "learning_rate": 0.0001, "loss": 0.9997, "loss/crossentropy": 2.3859941959381104, "loss/hidden": 0.83203125, "loss/logits": 0.14485400915145874, "loss/reg": 0.0022777330595999956, "step": 2311 }, { "epoch": 0.289, "grad_norm": 2.7492525577545166, "grad_norm_var": 0.3387673534232742, "learning_rate": 0.0001, "loss": 1.0611, "loss/crossentropy": 2.4734277725219727, "loss/hidden": 0.890625, "loss/logits": 0.14766156673431396, "loss/reg": 0.0022765756584703922, "step": 2312 }, { "epoch": 0.289125, "grad_norm": 2.782494306564331, "grad_norm_var": 0.3177012041335234, "learning_rate": 0.0001, "loss": 1.3091, "loss/crossentropy": 2.330939292907715, "loss/hidden": 1.09375, "loss/logits": 0.1925935000181198, "loss/reg": 0.00227532722055912, "step": 2313 }, { "epoch": 0.28925, "grad_norm": 2.256880521774292, "grad_norm_var": 0.313614105852335, "learning_rate": 0.0001, "loss": 0.9895, "loss/crossentropy": 2.3592119216918945, "loss/hidden": 0.83203125, "loss/logits": 0.13472414016723633, "loss/reg": 0.0022741667926311493, "step": 2314 }, { "epoch": 0.289375, "grad_norm": 1.963213562965393, "grad_norm_var": 0.346575834474711, "learning_rate": 0.0001, "loss": 0.9468, "loss/crossentropy": 2.6037299633026123, "loss/hidden": 0.79296875, "loss/logits": 0.1311364769935608, "loss/reg": 0.002272919751703739, "step": 2315 }, { "epoch": 0.2895, "grad_norm": 2.5335559844970703, "grad_norm_var": 0.3379413501959229, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.56166934967041, "loss/hidden": 0.96875, "loss/logits": 0.15770667791366577, "loss/reg": 0.0022716743405908346, "step": 2316 }, { "epoch": 0.289625, "grad_norm": 2.2219431400299072, "grad_norm_var": 0.351839932062064, "learning_rate": 0.0001, "loss": 0.9362, "loss/crossentropy": 2.4063034057617188, "loss/hidden": 0.7890625, "loss/logits": 0.12445776164531708, "loss/reg": 0.0022704829461872578, "step": 2317 }, { "epoch": 0.28975, "grad_norm": 2.675863742828369, "grad_norm_var": 0.34449215523824905, "learning_rate": 0.0001, "loss": 1.1639, "loss/crossentropy": 2.4815635681152344, "loss/hidden": 0.98828125, "loss/logits": 0.15290312469005585, "loss/reg": 0.0022692400962114334, "step": 2318 }, { "epoch": 0.289875, "grad_norm": 3.4112133979797363, "grad_norm_var": 0.3749604181068301, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.2978286743164062, "loss/hidden": 1.078125, "loss/logits": 0.231400728225708, "loss/reg": 0.0022680372931063175, "step": 2319 }, { "epoch": 0.29, "grad_norm": 2.1255404949188232, "grad_norm_var": 0.3938344325055017, "learning_rate": 0.0001, "loss": 1.1635, "loss/crossentropy": 2.603931427001953, "loss/hidden": 0.96875, "loss/logits": 0.1720743179321289, "loss/reg": 0.0022668354213237762, "step": 2320 }, { "epoch": 0.290125, "grad_norm": 3.393237829208374, "grad_norm_var": 0.4214978965971733, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.3089537620544434, "loss/hidden": 0.95703125, "loss/logits": 0.1740158349275589, "loss/reg": 0.0022656810469925404, "step": 2321 }, { "epoch": 0.29025, "grad_norm": 2.4209048748016357, "grad_norm_var": 0.427961449067007, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.509672164916992, "loss/hidden": 0.91796875, "loss/logits": 0.16602107882499695, "loss/reg": 0.00226387451402843, "step": 2322 }, { "epoch": 0.290375, "grad_norm": 2.797715663909912, "grad_norm_var": 0.2187836662541788, "learning_rate": 0.0001, "loss": 1.0883, "loss/crossentropy": 2.3329074382781982, "loss/hidden": 0.90625, "loss/logits": 0.15945813059806824, "loss/reg": 0.002262289170175791, "step": 2323 }, { "epoch": 0.2905, "grad_norm": 2.258333206176758, "grad_norm_var": 0.2260309184436659, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.3607590198516846, "loss/hidden": 0.9296875, "loss/logits": 0.1590813398361206, "loss/reg": 0.0022610584273934364, "step": 2324 }, { "epoch": 0.290625, "grad_norm": 3.05591082572937, "grad_norm_var": 0.23224806610572882, "learning_rate": 0.0001, "loss": 1.1129, "loss/crossentropy": 2.482182741165161, "loss/hidden": 0.94140625, "loss/logits": 0.14887839555740356, "loss/reg": 0.002259862842038274, "step": 2325 }, { "epoch": 0.29075, "grad_norm": 3.481534004211426, "grad_norm_var": 0.2302636323918586, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.336320400238037, "loss/hidden": 1.1953125, "loss/logits": 0.20323044061660767, "loss/reg": 0.002258649794384837, "step": 2326 }, { "epoch": 0.290875, "grad_norm": 2.968456268310547, "grad_norm_var": 0.22753633498401857, "learning_rate": 0.0001, "loss": 1.0418, "loss/crossentropy": 2.5565855503082275, "loss/hidden": 0.86328125, "loss/logits": 0.15598532557487488, "loss/reg": 0.0022571764420717955, "step": 2327 }, { "epoch": 0.291, "grad_norm": 2.4593372344970703, "grad_norm_var": 0.23063450151318624, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.6731154918670654, "loss/hidden": 0.93359375, "loss/logits": 0.159798264503479, "loss/reg": 0.002255691448226571, "step": 2328 }, { "epoch": 0.291125, "grad_norm": 2.6843700408935547, "grad_norm_var": 0.22983491806422568, "learning_rate": 0.0001, "loss": 1.2713, "loss/crossentropy": 2.674574375152588, "loss/hidden": 1.03125, "loss/logits": 0.21753570437431335, "loss/reg": 0.002254314022138715, "step": 2329 }, { "epoch": 0.29125, "grad_norm": 2.9977915287017822, "grad_norm_var": 0.22340696006030256, "learning_rate": 0.0001, "loss": 1.1374, "loss/crossentropy": 2.449286937713623, "loss/hidden": 0.96484375, "loss/logits": 0.15002599358558655, "loss/reg": 0.0022528767585754395, "step": 2330 }, { "epoch": 0.291375, "grad_norm": 2.9296767711639404, "grad_norm_var": 0.18483677669215862, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.622969388961792, "loss/hidden": 0.87890625, "loss/logits": 0.1516713798046112, "loss/reg": 0.002251418773084879, "step": 2331 }, { "epoch": 0.2915, "grad_norm": 2.7505085468292236, "grad_norm_var": 0.18076648440234447, "learning_rate": 0.0001, "loss": 1.2174, "loss/crossentropy": 1.8988810777664185, "loss/hidden": 1.046875, "loss/logits": 0.14806458353996277, "loss/reg": 0.0022499056067317724, "step": 2332 }, { "epoch": 0.291625, "grad_norm": 2.6135544776916504, "grad_norm_var": 0.16071545426870723, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.2003393173217773, "loss/hidden": 1.140625, "loss/logits": 0.2086334228515625, "loss/reg": 0.0022484040819108486, "step": 2333 }, { "epoch": 0.29175, "grad_norm": 2.1136133670806885, "grad_norm_var": 0.19082867936137216, "learning_rate": 0.0001, "loss": 1.045, "loss/crossentropy": 2.455838203430176, "loss/hidden": 0.875, "loss/logits": 0.1474977433681488, "loss/reg": 0.002247190335765481, "step": 2334 }, { "epoch": 0.291875, "grad_norm": 2.239429235458374, "grad_norm_var": 0.17784790227391506, "learning_rate": 0.0001, "loss": 1.0429, "loss/crossentropy": 2.275726556777954, "loss/hidden": 0.85546875, "loss/logits": 0.1649402529001236, "loss/reg": 0.0022459831088781357, "step": 2335 }, { "epoch": 0.292, "grad_norm": 2.1819427013397217, "grad_norm_var": 0.17368436194440331, "learning_rate": 0.0001, "loss": 1.1337, "loss/crossentropy": 2.4590518474578857, "loss/hidden": 0.94921875, "loss/logits": 0.16198822855949402, "loss/reg": 0.002244751201942563, "step": 2336 }, { "epoch": 0.292125, "grad_norm": 2.4671285152435303, "grad_norm_var": 0.1428166072582009, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.518401861190796, "loss/hidden": 0.984375, "loss/logits": 0.15526780486106873, "loss/reg": 0.00224347529001534, "step": 2337 }, { "epoch": 0.29225, "grad_norm": 2.3487448692321777, "grad_norm_var": 0.1453584009443148, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.3165202140808105, "loss/hidden": 1.0390625, "loss/logits": 0.1729811280965805, "loss/reg": 0.0022422494366765022, "step": 2338 }, { "epoch": 0.292375, "grad_norm": 2.2789766788482666, "grad_norm_var": 0.1517351686029104, "learning_rate": 0.0001, "loss": 0.9853, "loss/crossentropy": 2.460134983062744, "loss/hidden": 0.83984375, "loss/logits": 0.12309332191944122, "loss/reg": 0.0022410245146602392, "step": 2339 }, { "epoch": 0.2925, "grad_norm": 2.5760514736175537, "grad_norm_var": 0.1429632585685861, "learning_rate": 0.0001, "loss": 1.0858, "loss/crossentropy": 2.463728904724121, "loss/hidden": 0.9140625, "loss/logits": 0.14935334026813507, "loss/reg": 0.0022397052962332964, "step": 2340 }, { "epoch": 0.292625, "grad_norm": 2.548550844192505, "grad_norm_var": 0.13052301670314062, "learning_rate": 0.0001, "loss": 1.2119, "loss/crossentropy": 2.5376319885253906, "loss/hidden": 1.0078125, "loss/logits": 0.18175125122070312, "loss/reg": 0.0022384945768862963, "step": 2341 }, { "epoch": 0.29275, "grad_norm": 2.415813446044922, "grad_norm_var": 0.07659779337117621, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.8628947734832764, "loss/hidden": 0.92578125, "loss/logits": 0.1793949156999588, "loss/reg": 0.0022372091189026833, "step": 2342 }, { "epoch": 0.292875, "grad_norm": 3.2068769931793213, "grad_norm_var": 0.09390219023083522, "learning_rate": 0.0001, "loss": 1.205, "loss/crossentropy": 2.7339158058166504, "loss/hidden": 0.984375, "loss/logits": 0.19830560684204102, "loss/reg": 0.0022359357681125402, "step": 2343 }, { "epoch": 0.293, "grad_norm": 1.9274033308029175, "grad_norm_var": 0.11807182726753916, "learning_rate": 0.0001, "loss": 1.003, "loss/crossentropy": 2.260179281234741, "loss/hidden": 0.8359375, "loss/logits": 0.14475148916244507, "loss/reg": 0.0022347313351929188, "step": 2344 }, { "epoch": 0.293125, "grad_norm": 2.0362331867218018, "grad_norm_var": 0.1299086349013222, "learning_rate": 0.0001, "loss": 1.074, "loss/crossentropy": 2.486905336380005, "loss/hidden": 0.8984375, "loss/logits": 0.15327274799346924, "loss/reg": 0.0022335397079586983, "step": 2345 }, { "epoch": 0.29325, "grad_norm": 2.905491590499878, "grad_norm_var": 0.12403211400510367, "learning_rate": 0.0001, "loss": 1.1305, "loss/crossentropy": 2.297912120819092, "loss/hidden": 0.94140625, "loss/logits": 0.16676940023899078, "loss/reg": 0.0022323019802570343, "step": 2346 }, { "epoch": 0.293375, "grad_norm": 2.1365294456481934, "grad_norm_var": 0.11486975958407246, "learning_rate": 0.0001, "loss": 0.9981, "loss/crossentropy": 2.7155988216400146, "loss/hidden": 0.83984375, "loss/logits": 0.13597136735916138, "loss/reg": 0.0022311012726277113, "step": 2347 }, { "epoch": 0.2935, "grad_norm": 2.275961399078369, "grad_norm_var": 0.10813836983129281, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.314725637435913, "loss/hidden": 0.97265625, "loss/logits": 0.17141079902648926, "loss/reg": 0.0022298451513051987, "step": 2348 }, { "epoch": 0.293625, "grad_norm": 2.0574400424957275, "grad_norm_var": 0.1110407689565775, "learning_rate": 0.0001, "loss": 1.0082, "loss/crossentropy": 2.4887874126434326, "loss/hidden": 0.84375, "loss/logits": 0.14216122031211853, "loss/reg": 0.0022286190651357174, "step": 2349 }, { "epoch": 0.29375, "grad_norm": 2.498141050338745, "grad_norm_var": 0.1077901782157842, "learning_rate": 0.0001, "loss": 1.1082, "loss/crossentropy": 2.7253406047821045, "loss/hidden": 0.921875, "loss/logits": 0.1640745848417282, "loss/reg": 0.0022274490911513567, "step": 2350 }, { "epoch": 0.293875, "grad_norm": 2.686478614807129, "grad_norm_var": 0.1118248857026595, "learning_rate": 0.0001, "loss": 1.2891, "loss/crossentropy": 2.2553563117980957, "loss/hidden": 1.078125, "loss/logits": 0.1887470781803131, "loss/reg": 0.0022261643316596746, "step": 2351 }, { "epoch": 0.294, "grad_norm": 2.0109453201293945, "grad_norm_var": 0.11883458323120469, "learning_rate": 0.0001, "loss": 1.0798, "loss/crossentropy": 2.516427755355835, "loss/hidden": 0.90234375, "loss/logits": 0.15522971749305725, "loss/reg": 0.002224972005933523, "step": 2352 }, { "epoch": 0.294125, "grad_norm": 2.1653997898101807, "grad_norm_var": 0.12176556600674258, "learning_rate": 0.0001, "loss": 0.9645, "loss/crossentropy": 2.5457258224487305, "loss/hidden": 0.796875, "loss/logits": 0.14535899460315704, "loss/reg": 0.0022235908545553684, "step": 2353 }, { "epoch": 0.29425, "grad_norm": 2.564622640609741, "grad_norm_var": 0.12378755478122555, "learning_rate": 0.0001, "loss": 1.1413, "loss/crossentropy": 2.549320697784424, "loss/hidden": 0.94140625, "loss/logits": 0.1776999831199646, "loss/reg": 0.002222399925813079, "step": 2354 }, { "epoch": 0.294375, "grad_norm": 2.2842698097229004, "grad_norm_var": 0.12370870519383974, "learning_rate": 0.0001, "loss": 1.2936, "loss/crossentropy": 2.3852198123931885, "loss/hidden": 1.0703125, "loss/logits": 0.2011002004146576, "loss/reg": 0.0022210769820958376, "step": 2355 }, { "epoch": 0.2945, "grad_norm": 2.379873514175415, "grad_norm_var": 0.12133939874134866, "learning_rate": 0.0001, "loss": 1.2274, "loss/crossentropy": 2.3708417415618896, "loss/hidden": 1.015625, "loss/logits": 0.18960796296596527, "loss/reg": 0.002219798509031534, "step": 2356 }, { "epoch": 0.294625, "grad_norm": 3.7287704944610596, "grad_norm_var": 0.23472339427804717, "learning_rate": 0.0001, "loss": 1.106, "loss/crossentropy": 2.2924282550811768, "loss/hidden": 0.94921875, "loss/logits": 0.13460811972618103, "loss/reg": 0.0022184138651937246, "step": 2357 }, { "epoch": 0.29475, "grad_norm": 2.8506343364715576, "grad_norm_var": 0.2442674270962811, "learning_rate": 0.0001, "loss": 1.0902, "loss/crossentropy": 2.315319299697876, "loss/hidden": 0.8984375, "loss/logits": 0.16961857676506042, "loss/reg": 0.0022172173485159874, "step": 2358 }, { "epoch": 0.294875, "grad_norm": 2.8585727214813232, "grad_norm_var": 0.21819488358667763, "learning_rate": 0.0001, "loss": 1.0408, "loss/crossentropy": 2.503530263900757, "loss/hidden": 0.84375, "loss/logits": 0.17494037747383118, "loss/reg": 0.0022158126812428236, "step": 2359 }, { "epoch": 0.295, "grad_norm": 3.1541364192962646, "grad_norm_var": 0.22506647160361481, "learning_rate": 0.0001, "loss": 1.4225, "loss/crossentropy": 2.0468430519104004, "loss/hidden": 1.203125, "loss/logits": 0.1972062885761261, "loss/reg": 0.0022143737878650427, "step": 2360 }, { "epoch": 0.295125, "grad_norm": 2.4996113777160645, "grad_norm_var": 0.20754138116211132, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.302741050720215, "loss/hidden": 0.91796875, "loss/logits": 0.15044939517974854, "loss/reg": 0.0022129348944872618, "step": 2361 }, { "epoch": 0.29525, "grad_norm": 2.1818268299102783, "grad_norm_var": 0.20752026717631603, "learning_rate": 0.0001, "loss": 1.0089, "loss/crossentropy": 2.609379291534424, "loss/hidden": 0.83984375, "loss/logits": 0.1469854712486267, "loss/reg": 0.002211726736277342, "step": 2362 }, { "epoch": 0.295375, "grad_norm": 2.1552815437316895, "grad_norm_var": 0.20658139620233745, "learning_rate": 0.0001, "loss": 1.0721, "loss/crossentropy": 2.397830009460449, "loss/hidden": 0.89453125, "loss/logits": 0.15542778372764587, "loss/reg": 0.00221027503721416, "step": 2363 }, { "epoch": 0.2955, "grad_norm": 2.5929059982299805, "grad_norm_var": 0.20246243959012702, "learning_rate": 0.0001, "loss": 1.1183, "loss/crossentropy": 2.359164237976074, "loss/hidden": 0.9453125, "loss/logits": 0.15085437893867493, "loss/reg": 0.002208893885836005, "step": 2364 }, { "epoch": 0.295625, "grad_norm": 27.555530548095703, "grad_norm_var": 39.19027713603315, "learning_rate": 0.0001, "loss": 1.3418, "loss/crossentropy": 2.7018837928771973, "loss/hidden": 1.140625, "loss/logits": 0.17907656729221344, "loss/reg": 0.0022077099420130253, "step": 2365 }, { "epoch": 0.29575, "grad_norm": 2.279125928878784, "grad_norm_var": 39.241087471777405, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 2.438565492630005, "loss/hidden": 0.90234375, "loss/logits": 0.1413879096508026, "loss/reg": 0.0022063306532800198, "step": 2366 }, { "epoch": 0.295875, "grad_norm": 2.095780611038208, "grad_norm_var": 39.375936752817545, "learning_rate": 0.0001, "loss": 1.0013, "loss/crossentropy": 2.5098965167999268, "loss/hidden": 0.8359375, "loss/logits": 0.14328661561012268, "loss/reg": 0.002205136464908719, "step": 2367 }, { "epoch": 0.296, "grad_norm": 2.428588390350342, "grad_norm_var": 39.27135252509599, "learning_rate": 0.0001, "loss": 1.0414, "loss/crossentropy": 2.3668057918548584, "loss/hidden": 0.87109375, "loss/logits": 0.1482658088207245, "loss/reg": 0.002203758805990219, "step": 2368 }, { "epoch": 0.296125, "grad_norm": 2.845325469970703, "grad_norm_var": 39.12387045935571, "learning_rate": 0.0001, "loss": 1.1262, "loss/crossentropy": 2.3159801959991455, "loss/hidden": 0.95703125, "loss/logits": 0.14714784920215607, "loss/reg": 0.00220238184556365, "step": 2369 }, { "epoch": 0.29625, "grad_norm": 4.609076976776123, "grad_norm_var": 38.95200874053604, "learning_rate": 0.0001, "loss": 1.5419, "loss/crossentropy": 2.4830880165100098, "loss/hidden": 1.28125, "loss/logits": 0.2386501133441925, "loss/reg": 0.002200998365879059, "step": 2370 }, { "epoch": 0.296375, "grad_norm": 3.0354135036468506, "grad_norm_var": 38.787274063020526, "learning_rate": 0.0001, "loss": 1.179, "loss/crossentropy": 2.1647109985351562, "loss/hidden": 0.98828125, "loss/logits": 0.1687154620885849, "loss/reg": 0.002199799520894885, "step": 2371 }, { "epoch": 0.2965, "grad_norm": 2.553205966949463, "grad_norm_var": 38.74412513716453, "learning_rate": 0.0001, "loss": 1.1839, "loss/crossentropy": 2.720599412918091, "loss/hidden": 0.96484375, "loss/logits": 0.19702382385730743, "loss/reg": 0.0021986099891364574, "step": 2372 }, { "epoch": 0.296625, "grad_norm": 2.177777051925659, "grad_norm_var": 39.02066610504202, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.282625675201416, "loss/hidden": 0.90625, "loss/logits": 0.14115026593208313, "loss/reg": 0.0021973750554025173, "step": 2373 }, { "epoch": 0.29675, "grad_norm": 2.192328691482544, "grad_norm_var": 39.16988170359019, "learning_rate": 0.0001, "loss": 1.17, "loss/crossentropy": 2.321070671081543, "loss/hidden": 0.98828125, "loss/logits": 0.15975269675254822, "loss/reg": 0.002196060959249735, "step": 2374 }, { "epoch": 0.296875, "grad_norm": 2.222501039505005, "grad_norm_var": 39.30901105187863, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.8119492530822754, "loss/hidden": 0.90234375, "loss/logits": 0.1497960090637207, "loss/reg": 0.002194872358813882, "step": 2375 }, { "epoch": 0.297, "grad_norm": 2.065248489379883, "grad_norm_var": 39.52931933240049, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.5330662727355957, "loss/hidden": 0.88671875, "loss/logits": 0.14444825053215027, "loss/reg": 0.002193637890741229, "step": 2376 }, { "epoch": 0.297125, "grad_norm": 1.8949674367904663, "grad_norm_var": 39.68063438056073, "learning_rate": 0.0001, "loss": 1.0039, "loss/crossentropy": 2.450725793838501, "loss/hidden": 0.84375, "loss/logits": 0.1382727026939392, "loss/reg": 0.002192447893321514, "step": 2377 }, { "epoch": 0.29725, "grad_norm": 2.1522247791290283, "grad_norm_var": 39.688083655377014, "learning_rate": 0.0001, "loss": 1.0017, "loss/crossentropy": 2.5332834720611572, "loss/hidden": 0.85546875, "loss/logits": 0.12431172281503677, "loss/reg": 0.002191171981394291, "step": 2378 }, { "epoch": 0.297375, "grad_norm": 2.868100166320801, "grad_norm_var": 39.539433421790285, "learning_rate": 0.0001, "loss": 1.1824, "loss/crossentropy": 2.3386754989624023, "loss/hidden": 1.0234375, "loss/logits": 0.13706368207931519, "loss/reg": 0.0021898935083299875, "step": 2379 }, { "epoch": 0.2975, "grad_norm": 2.6803483963012695, "grad_norm_var": 39.52236336345723, "learning_rate": 0.0001, "loss": 1.0903, "loss/crossentropy": 2.528651475906372, "loss/hidden": 0.89453125, "loss/logits": 0.17392218112945557, "loss/reg": 0.002188711427152157, "step": 2380 }, { "epoch": 0.297625, "grad_norm": 2.596548318862915, "grad_norm_var": 0.4115178655779776, "learning_rate": 0.0001, "loss": 1.0765, "loss/crossentropy": 2.3907644748687744, "loss/hidden": 0.90234375, "loss/logits": 0.15226177871227264, "loss/reg": 0.002187481615692377, "step": 2381 }, { "epoch": 0.29775, "grad_norm": 2.3986682891845703, "grad_norm_var": 0.40819660159978394, "learning_rate": 0.0001, "loss": 1.054, "loss/crossentropy": 2.5493686199188232, "loss/hidden": 0.890625, "loss/logits": 0.14154410362243652, "loss/reg": 0.002186246681958437, "step": 2382 }, { "epoch": 0.297875, "grad_norm": 4.2237114906311035, "grad_norm_var": 0.5620436598519111, "learning_rate": 0.0001, "loss": 1.2216, "loss/crossentropy": 2.5266928672790527, "loss/hidden": 0.9375, "loss/logits": 0.26227086782455444, "loss/reg": 0.0021850764751434326, "step": 2383 }, { "epoch": 0.298, "grad_norm": 2.8558404445648193, "grad_norm_var": 0.5589025390479747, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.575399160385132, "loss/hidden": 0.9765625, "loss/logits": 0.17227087914943695, "loss/reg": 0.0021839046385139227, "step": 2384 }, { "epoch": 0.298125, "grad_norm": 2.0674421787261963, "grad_norm_var": 0.5827589469489003, "learning_rate": 0.0001, "loss": 1.0904, "loss/crossentropy": 2.2102935314178467, "loss/hidden": 0.921875, "loss/logits": 0.14672164618968964, "loss/reg": 0.0021827819291502237, "step": 2385 }, { "epoch": 0.29825, "grad_norm": 2.407134532928467, "grad_norm_var": 0.31417224502863184, "learning_rate": 0.0001, "loss": 0.9828, "loss/crossentropy": 2.7107746601104736, "loss/hidden": 0.84375, "loss/logits": 0.11720333993434906, "loss/reg": 0.002181618008762598, "step": 2386 }, { "epoch": 0.298375, "grad_norm": 2.540349245071411, "grad_norm_var": 0.2957633905491216, "learning_rate": 0.0001, "loss": 1.2066, "loss/crossentropy": 2.255420684814453, "loss/hidden": 1.015625, "loss/logits": 0.16912397742271423, "loss/reg": 0.002180487383157015, "step": 2387 }, { "epoch": 0.2985, "grad_norm": 6.323755264282227, "grad_norm_var": 1.214332628924398, "learning_rate": 0.0001, "loss": 1.4587, "loss/crossentropy": 2.6238784790039062, "loss/hidden": 1.2578125, "loss/logits": 0.17911097407341003, "loss/reg": 0.002179316710680723, "step": 2388 }, { "epoch": 0.298625, "grad_norm": 2.6576192378997803, "grad_norm_var": 1.1934447123380916, "learning_rate": 0.0001, "loss": 0.9418, "loss/crossentropy": 2.7748494148254395, "loss/hidden": 0.7890625, "loss/logits": 0.1310005486011505, "loss/reg": 0.0021781304385513067, "step": 2389 }, { "epoch": 0.29875, "grad_norm": 2.0993154048919678, "grad_norm_var": 1.2010153184248262, "learning_rate": 0.0001, "loss": 1.093, "loss/crossentropy": 2.4098961353302, "loss/hidden": 0.90625, "loss/logits": 0.16498351097106934, "loss/reg": 0.0021769891027361155, "step": 2390 }, { "epoch": 0.298875, "grad_norm": 2.4427108764648438, "grad_norm_var": 1.188459349339032, "learning_rate": 0.0001, "loss": 1.1341, "loss/crossentropy": 2.3666250705718994, "loss/hidden": 0.95703125, "loss/logits": 0.15535235404968262, "loss/reg": 0.0021757963113486767, "step": 2391 }, { "epoch": 0.299, "grad_norm": 2.4708964824676514, "grad_norm_var": 1.1607818218977537, "learning_rate": 0.0001, "loss": 1.1352, "loss/crossentropy": 2.6541738510131836, "loss/hidden": 0.96484375, "loss/logits": 0.14865455031394958, "loss/reg": 0.00217461003921926, "step": 2392 }, { "epoch": 0.299125, "grad_norm": 2.5000877380371094, "grad_norm_var": 1.1112539793700875, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.3874454498291016, "loss/hidden": 1.1171875, "loss/logits": 0.1856321394443512, "loss/reg": 0.0021733848843723536, "step": 2393 }, { "epoch": 0.29925, "grad_norm": 2.274259090423584, "grad_norm_var": 1.1011516749665453, "learning_rate": 0.0001, "loss": 1.0716, "loss/crossentropy": 2.858301877975464, "loss/hidden": 0.87890625, "loss/logits": 0.1709641069173813, "loss/reg": 0.0021722489036619663, "step": 2394 }, { "epoch": 0.299375, "grad_norm": 2.388404607772827, "grad_norm_var": 1.1136033771646907, "learning_rate": 0.0001, "loss": 1.1524, "loss/crossentropy": 2.5182249546051025, "loss/hidden": 0.94140625, "loss/logits": 0.18931522965431213, "loss/reg": 0.002171047730371356, "step": 2395 }, { "epoch": 0.2995, "grad_norm": 2.479414224624634, "grad_norm_var": 1.119545207491715, "learning_rate": 0.0001, "loss": 0.9211, "loss/crossentropy": 2.4917259216308594, "loss/hidden": 0.7890625, "loss/logits": 0.11034320294857025, "loss/reg": 0.0021699254866689444, "step": 2396 }, { "epoch": 0.299625, "grad_norm": 2.57258939743042, "grad_norm_var": 1.1202162721705604, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.558342218399048, "loss/hidden": 0.9140625, "loss/logits": 0.16333159804344177, "loss/reg": 0.002168835373595357, "step": 2397 }, { "epoch": 0.29975, "grad_norm": 3.3557748794555664, "grad_norm_var": 1.1270340099928178, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.7857778072357178, "loss/hidden": 0.92578125, "loss/logits": 0.15975359082221985, "loss/reg": 0.0021677466575056314, "step": 2398 }, { "epoch": 0.299875, "grad_norm": 3.1725220680236816, "grad_norm_var": 1.0040785023856482, "learning_rate": 0.0001, "loss": 1.1488, "loss/crossentropy": 2.448627233505249, "loss/hidden": 0.91796875, "loss/logits": 0.20915868878364563, "loss/reg": 0.0021666809916496277, "step": 2399 }, { "epoch": 0.3, "grad_norm": 2.668774366378784, "grad_norm_var": 1.0045737039360791, "learning_rate": 0.0001, "loss": 1.1372, "loss/crossentropy": 2.5547099113464355, "loss/hidden": 0.94921875, "loss/logits": 0.1663532257080078, "loss/reg": 0.002165647689253092, "step": 2400 }, { "epoch": 0.300125, "grad_norm": 2.2911062240600586, "grad_norm_var": 0.9865603713315009, "learning_rate": 0.0001, "loss": 1.0479, "loss/crossentropy": 2.4761369228363037, "loss/hidden": 0.875, "loss/logits": 0.1512310802936554, "loss/reg": 0.0021646604873239994, "step": 2401 }, { "epoch": 0.30025, "grad_norm": 2.4086267948150635, "grad_norm_var": 0.9864842738202394, "learning_rate": 0.0001, "loss": 0.9611, "loss/crossentropy": 2.7032105922698975, "loss/hidden": 0.8125, "loss/logits": 0.12701138854026794, "loss/reg": 0.0021635033190250397, "step": 2402 }, { "epoch": 0.300375, "grad_norm": 2.8700461387634277, "grad_norm_var": 0.9822864320840416, "learning_rate": 0.0001, "loss": 1.2253, "loss/crossentropy": 2.389875888824463, "loss/hidden": 1.015625, "loss/logits": 0.18807819485664368, "loss/reg": 0.0021624851506203413, "step": 2403 }, { "epoch": 0.3005, "grad_norm": 2.8803300857543945, "grad_norm_var": 0.11056921305827032, "learning_rate": 0.0001, "loss": 1.287, "loss/crossentropy": 2.6379334926605225, "loss/hidden": 1.0859375, "loss/logits": 0.17943868041038513, "loss/reg": 0.002161318203434348, "step": 2404 }, { "epoch": 0.300625, "grad_norm": 2.0551042556762695, "grad_norm_var": 0.12829034443920193, "learning_rate": 0.0001, "loss": 1.077, "loss/crossentropy": 2.478105068206787, "loss/hidden": 0.8984375, "loss/logits": 0.1570039689540863, "loss/reg": 0.002160001778975129, "step": 2405 }, { "epoch": 0.30075, "grad_norm": 2.4833426475524902, "grad_norm_var": 0.11401505388090882, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.620856761932373, "loss/hidden": 0.96484375, "loss/logits": 0.18592841923236847, "loss/reg": 0.002158663934096694, "step": 2406 }, { "epoch": 0.300875, "grad_norm": 2.648876428604126, "grad_norm_var": 0.11283926731809117, "learning_rate": 0.0001, "loss": 1.173, "loss/crossentropy": 2.515164375305176, "loss/hidden": 0.96875, "loss/logits": 0.18271613121032715, "loss/reg": 0.0021575286518782377, "step": 2407 }, { "epoch": 0.301, "grad_norm": 2.987276315689087, "grad_norm_var": 0.12095949957958965, "learning_rate": 0.0001, "loss": 1.2709, "loss/crossentropy": 2.496842861175537, "loss/hidden": 1.0546875, "loss/logits": 0.19469350576400757, "loss/reg": 0.00215638754889369, "step": 2408 }, { "epoch": 0.301125, "grad_norm": 1.9031262397766113, "grad_norm_var": 0.15335631499839622, "learning_rate": 0.0001, "loss": 1.1422, "loss/crossentropy": 2.5851099491119385, "loss/hidden": 0.94140625, "loss/logits": 0.17919760942459106, "loss/reg": 0.0021552639082074165, "step": 2409 }, { "epoch": 0.30125, "grad_norm": 2.1677801609039307, "grad_norm_var": 0.15854718081577795, "learning_rate": 0.0001, "loss": 1.0837, "loss/crossentropy": 2.4408304691314697, "loss/hidden": 0.88671875, "loss/logits": 0.1754608154296875, "loss/reg": 0.0021540310699492693, "step": 2410 }, { "epoch": 0.301375, "grad_norm": 2.7278027534484863, "grad_norm_var": 0.15692617279536686, "learning_rate": 0.0001, "loss": 1.3775, "loss/crossentropy": 2.471968412399292, "loss/hidden": 1.109375, "loss/logits": 0.2465793341398239, "loss/reg": 0.002152809174731374, "step": 2411 }, { "epoch": 0.3015, "grad_norm": 2.9148828983306885, "grad_norm_var": 0.16151365261491718, "learning_rate": 0.0001, "loss": 1.1261, "loss/crossentropy": 2.5862338542938232, "loss/hidden": 0.93359375, "loss/logits": 0.17094576358795166, "loss/reg": 0.0021517013665288687, "step": 2412 }, { "epoch": 0.301625, "grad_norm": 2.7477314472198486, "grad_norm_var": 0.16204934512076055, "learning_rate": 0.0001, "loss": 1.1567, "loss/crossentropy": 2.6169795989990234, "loss/hidden": 0.95703125, "loss/logits": 0.17819753289222717, "loss/reg": 0.0021505923941731453, "step": 2413 }, { "epoch": 0.30175, "grad_norm": 2.4291226863861084, "grad_norm_var": 0.12761338266658698, "learning_rate": 0.0001, "loss": 1.1491, "loss/crossentropy": 2.514669895172119, "loss/hidden": 0.9609375, "loss/logits": 0.1666562259197235, "loss/reg": 0.0021494391839951277, "step": 2414 }, { "epoch": 0.301875, "grad_norm": 2.359739065170288, "grad_norm_var": 0.10520746775547778, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 2.699859380722046, "loss/hidden": 0.89453125, "loss/logits": 0.1498180329799652, "loss/reg": 0.002148275263607502, "step": 2415 }, { "epoch": 0.302, "grad_norm": 2.4702227115631104, "grad_norm_var": 0.10410288528468996, "learning_rate": 0.0001, "loss": 1.0777, "loss/crossentropy": 2.68515682220459, "loss/hidden": 0.875, "loss/logits": 0.18118959665298462, "loss/reg": 0.0021472014486789703, "step": 2416 }, { "epoch": 0.302125, "grad_norm": 2.287879467010498, "grad_norm_var": 0.10420268936281118, "learning_rate": 0.0001, "loss": 1.1297, "loss/crossentropy": 2.37418794631958, "loss/hidden": 0.97265625, "loss/logits": 0.1355462372303009, "loss/reg": 0.0021460477728396654, "step": 2417 }, { "epoch": 0.30225, "grad_norm": 2.628570079803467, "grad_norm_var": 0.10391990325521192, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 2.740374803543091, "loss/hidden": 0.8515625, "loss/logits": 0.19282981753349304, "loss/reg": 0.0021449297200888395, "step": 2418 }, { "epoch": 0.302375, "grad_norm": 2.2769341468811035, "grad_norm_var": 0.0994193452448144, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.5616815090179443, "loss/hidden": 1.0078125, "loss/logits": 0.16570472717285156, "loss/reg": 0.002143777906894684, "step": 2419 }, { "epoch": 0.3025, "grad_norm": 2.2194042205810547, "grad_norm_var": 0.0930325102133755, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.5813071727752686, "loss/hidden": 0.8203125, "loss/logits": 0.16104257106781006, "loss/reg": 0.002142672659829259, "step": 2420 }, { "epoch": 0.302625, "grad_norm": 2.444960117340088, "grad_norm_var": 0.08165453936917307, "learning_rate": 0.0001, "loss": 1.0358, "loss/crossentropy": 2.714278221130371, "loss/hidden": 0.87890625, "loss/logits": 0.13547241687774658, "loss/reg": 0.0021415783558040857, "step": 2421 }, { "epoch": 0.30275, "grad_norm": 2.157719850540161, "grad_norm_var": 0.08818419905537264, "learning_rate": 0.0001, "loss": 1.0853, "loss/crossentropy": 2.458479881286621, "loss/hidden": 0.9140625, "loss/logits": 0.14987066388130188, "loss/reg": 0.002140489872545004, "step": 2422 }, { "epoch": 0.302875, "grad_norm": 2.846714973449707, "grad_norm_var": 0.09559289538667978, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.4821577072143555, "loss/hidden": 1.0078125, "loss/logits": 0.18039116263389587, "loss/reg": 0.0021393520291894674, "step": 2423 }, { "epoch": 0.303, "grad_norm": 2.497941493988037, "grad_norm_var": 0.07701227008512485, "learning_rate": 0.0001, "loss": 1.266, "loss/crossentropy": 2.0463898181915283, "loss/hidden": 1.0546875, "loss/logits": 0.18990883231163025, "loss/reg": 0.0021381995175033808, "step": 2424 }, { "epoch": 0.303125, "grad_norm": 2.2916762828826904, "grad_norm_var": 0.058503082796944265, "learning_rate": 0.0001, "loss": 1.1526, "loss/crossentropy": 2.53886079788208, "loss/hidden": 0.96484375, "loss/logits": 0.16638509929180145, "loss/reg": 0.0021371024195104837, "step": 2425 }, { "epoch": 0.30325, "grad_norm": 2.6170272827148438, "grad_norm_var": 0.053204788153341794, "learning_rate": 0.0001, "loss": 1.2588, "loss/crossentropy": 2.7130520343780518, "loss/hidden": 1.03125, "loss/logits": 0.20615074038505554, "loss/reg": 0.0021359475795179605, "step": 2426 }, { "epoch": 0.303375, "grad_norm": 2.4657719135284424, "grad_norm_var": 0.04935886701339693, "learning_rate": 0.0001, "loss": 1.0122, "loss/crossentropy": 2.3082022666931152, "loss/hidden": 0.859375, "loss/logits": 0.13152173161506653, "loss/reg": 0.0021348483860492706, "step": 2427 }, { "epoch": 0.3035, "grad_norm": 2.0947723388671875, "grad_norm_var": 0.04367961136763914, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 2.4205880165100098, "loss/hidden": 0.9140625, "loss/logits": 0.1476408988237381, "loss/reg": 0.002133752219378948, "step": 2428 }, { "epoch": 0.303625, "grad_norm": 2.25714111328125, "grad_norm_var": 0.03775946331734208, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.346250295639038, "loss/hidden": 1.0703125, "loss/logits": 0.16760315001010895, "loss/reg": 0.0021326192654669285, "step": 2429 }, { "epoch": 0.30375, "grad_norm": 2.1641323566436768, "grad_norm_var": 0.04099910752878486, "learning_rate": 0.0001, "loss": 1.11, "loss/crossentropy": 2.5614030361175537, "loss/hidden": 0.92578125, "loss/logits": 0.1628931164741516, "loss/reg": 0.002131457207724452, "step": 2430 }, { "epoch": 0.303875, "grad_norm": 2.091635227203369, "grad_norm_var": 0.046217215110478334, "learning_rate": 0.0001, "loss": 0.9487, "loss/crossentropy": 2.627411365509033, "loss/hidden": 0.80078125, "loss/logits": 0.1266120970249176, "loss/reg": 0.0021303691901266575, "step": 2431 }, { "epoch": 0.304, "grad_norm": 2.064512014389038, "grad_norm_var": 0.050719827657180146, "learning_rate": 0.0001, "loss": 0.9613, "loss/crossentropy": 2.335934638977051, "loss/hidden": 0.80859375, "loss/logits": 0.13146154582500458, "loss/reg": 0.002129204338416457, "step": 2432 }, { "epoch": 0.304125, "grad_norm": 4.493721961975098, "grad_norm_var": 0.34010976964375245, "learning_rate": 0.0001, "loss": 1.284, "loss/crossentropy": 2.438467502593994, "loss/hidden": 1.0859375, "loss/logits": 0.17674224078655243, "loss/reg": 0.0021280222572386265, "step": 2433 }, { "epoch": 0.30425, "grad_norm": 3.3369619846343994, "grad_norm_var": 0.38590391302901994, "learning_rate": 0.0001, "loss": 1.3779, "loss/crossentropy": 2.774651288986206, "loss/hidden": 1.15625, "loss/logits": 0.20038242638111115, "loss/reg": 0.002126899780705571, "step": 2434 }, { "epoch": 0.304375, "grad_norm": 2.4502227306365967, "grad_norm_var": 0.38216316623404867, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.5198841094970703, "loss/hidden": 0.953125, "loss/logits": 0.1712300181388855, "loss/reg": 0.0021257472690194845, "step": 2435 }, { "epoch": 0.3045, "grad_norm": 2.5983469486236572, "grad_norm_var": 0.3753997399834589, "learning_rate": 0.0001, "loss": 1.0041, "loss/crossentropy": 2.7402634620666504, "loss/hidden": 0.84765625, "loss/logits": 0.13516700267791748, "loss/reg": 0.002124567050486803, "step": 2436 }, { "epoch": 0.304625, "grad_norm": 2.941843032836914, "grad_norm_var": 0.3835681851958033, "learning_rate": 0.0001, "loss": 1.2025, "loss/crossentropy": 2.236844539642334, "loss/hidden": 1.0234375, "loss/logits": 0.1578763723373413, "loss/reg": 0.002123430836945772, "step": 2437 }, { "epoch": 0.30475, "grad_norm": 2.6095266342163086, "grad_norm_var": 0.37054834478851123, "learning_rate": 0.0001, "loss": 1.1293, "loss/crossentropy": 2.6869966983795166, "loss/hidden": 0.921875, "loss/logits": 0.18621957302093506, "loss/reg": 0.0021222494542598724, "step": 2438 }, { "epoch": 0.304875, "grad_norm": 2.791029930114746, "grad_norm_var": 0.3690133617611515, "learning_rate": 0.0001, "loss": 1.4459, "loss/crossentropy": 2.3036041259765625, "loss/hidden": 1.203125, "loss/logits": 0.221592977643013, "loss/reg": 0.002121059689670801, "step": 2439 }, { "epoch": 0.305, "grad_norm": 2.3611831665039062, "grad_norm_var": 0.3722327517106841, "learning_rate": 0.0001, "loss": 1.1669, "loss/crossentropy": 2.3729498386383057, "loss/hidden": 0.98046875, "loss/logits": 0.1652437150478363, "loss/reg": 0.0021199327893555164, "step": 2440 }, { "epoch": 0.305125, "grad_norm": 2.42172908782959, "grad_norm_var": 0.36791143475395527, "learning_rate": 0.0001, "loss": 1.02, "loss/crossentropy": 2.8232316970825195, "loss/hidden": 0.86328125, "loss/logits": 0.13549447059631348, "loss/reg": 0.0021188165992498398, "step": 2441 }, { "epoch": 0.30525, "grad_norm": 3.2640671730041504, "grad_norm_var": 0.39468636586392736, "learning_rate": 0.0001, "loss": 1.456, "loss/crossentropy": 2.8770806789398193, "loss/hidden": 1.2265625, "loss/logits": 0.20822793245315552, "loss/reg": 0.002117713214829564, "step": 2442 }, { "epoch": 0.305375, "grad_norm": 2.373006582260132, "grad_norm_var": 0.39750796796011323, "learning_rate": 0.0001, "loss": 1.0963, "loss/crossentropy": 2.48526668548584, "loss/hidden": 0.921875, "loss/logits": 0.15330582857131958, "loss/reg": 0.0021165383514016867, "step": 2443 }, { "epoch": 0.3055, "grad_norm": 3.023404121398926, "grad_norm_var": 0.3833251566632574, "learning_rate": 0.0001, "loss": 1.2441, "loss/crossentropy": 2.3130240440368652, "loss/hidden": 1.046875, "loss/logits": 0.17610669136047363, "loss/reg": 0.002115316456183791, "step": 2444 }, { "epoch": 0.305625, "grad_norm": 2.178781747817993, "grad_norm_var": 0.3883635995386648, "learning_rate": 0.0001, "loss": 1.2342, "loss/crossentropy": 2.1899588108062744, "loss/hidden": 1.046875, "loss/logits": 0.16616564989089966, "loss/reg": 0.002114085713401437, "step": 2445 }, { "epoch": 0.30575, "grad_norm": 2.7941672801971436, "grad_norm_var": 0.36834568332028894, "learning_rate": 0.0001, "loss": 1.1686, "loss/crossentropy": 2.293048143386841, "loss/hidden": 0.96484375, "loss/logits": 0.18263517320156097, "loss/reg": 0.0021128428634256124, "step": 2446 }, { "epoch": 0.305875, "grad_norm": 2.6195220947265625, "grad_norm_var": 0.3403288599403801, "learning_rate": 0.0001, "loss": 1.1447, "loss/crossentropy": 2.743091583251953, "loss/hidden": 0.9609375, "loss/logits": 0.16267046332359314, "loss/reg": 0.0021115969866514206, "step": 2447 }, { "epoch": 0.306, "grad_norm": 3.349543333053589, "grad_norm_var": 0.32263719799683843, "learning_rate": 0.0001, "loss": 1.3073, "loss/crossentropy": 2.128545045852661, "loss/hidden": 1.0859375, "loss/logits": 0.2002711296081543, "loss/reg": 0.002110475907102227, "step": 2448 }, { "epoch": 0.306125, "grad_norm": 2.690319776535034, "grad_norm_var": 0.13077057659966157, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.4963951110839844, "loss/hidden": 1.015625, "loss/logits": 0.19524666666984558, "loss/reg": 0.002109333872795105, "step": 2449 }, { "epoch": 0.30625, "grad_norm": 3.2910077571868896, "grad_norm_var": 0.1272309218149824, "learning_rate": 0.0001, "loss": 1.2677, "loss/crossentropy": 2.47861909866333, "loss/hidden": 1.0546875, "loss/logits": 0.19194313883781433, "loss/reg": 0.0021082195453345776, "step": 2450 }, { "epoch": 0.306375, "grad_norm": 4.730775356292725, "grad_norm_var": 0.36573885000797923, "learning_rate": 0.0001, "loss": 1.3281, "loss/crossentropy": 2.416245222091675, "loss/hidden": 1.1171875, "loss/logits": 0.18980750441551208, "loss/reg": 0.0021070209331810474, "step": 2451 }, { "epoch": 0.3065, "grad_norm": 3.2826430797576904, "grad_norm_var": 0.36954535067382205, "learning_rate": 0.0001, "loss": 1.1541, "loss/crossentropy": 2.133741617202759, "loss/hidden": 0.9921875, "loss/logits": 0.14082437753677368, "loss/reg": 0.002105786930769682, "step": 2452 }, { "epoch": 0.306625, "grad_norm": 2.5268890857696533, "grad_norm_var": 0.3791073289212515, "learning_rate": 0.0001, "loss": 1.035, "loss/crossentropy": 2.667654514312744, "loss/hidden": 0.86328125, "loss/logits": 0.15064285695552826, "loss/reg": 0.0021045496687293053, "step": 2453 }, { "epoch": 0.30675, "grad_norm": 2.2552711963653564, "grad_norm_var": 0.4003983341559279, "learning_rate": 0.0001, "loss": 1.0347, "loss/crossentropy": 2.683074951171875, "loss/hidden": 0.86328125, "loss/logits": 0.15039215981960297, "loss/reg": 0.0021034192759543657, "step": 2454 }, { "epoch": 0.306875, "grad_norm": 4.937719345092773, "grad_norm_var": 0.6652158853840601, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.6319425106048584, "loss/hidden": 1.03125, "loss/logits": 0.20028609037399292, "loss/reg": 0.0021023587323725224, "step": 2455 }, { "epoch": 0.307, "grad_norm": 2.904989719390869, "grad_norm_var": 0.6369263870540064, "learning_rate": 0.0001, "loss": 1.0624, "loss/crossentropy": 2.49183988571167, "loss/hidden": 0.89453125, "loss/logits": 0.14683844149112701, "loss/reg": 0.002101288875564933, "step": 2456 }, { "epoch": 0.307125, "grad_norm": 2.2416632175445557, "grad_norm_var": 0.6538025586005953, "learning_rate": 0.0001, "loss": 1.1226, "loss/crossentropy": 2.498965263366699, "loss/hidden": 0.9375, "loss/logits": 0.1640811562538147, "loss/reg": 0.002100135199725628, "step": 2457 }, { "epoch": 0.30725, "grad_norm": 2.445704698562622, "grad_norm_var": 0.6700089634231478, "learning_rate": 0.0001, "loss": 1.1776, "loss/crossentropy": 2.338676929473877, "loss/hidden": 0.9921875, "loss/logits": 0.1644667685031891, "loss/reg": 0.002099038101732731, "step": 2458 }, { "epoch": 0.307375, "grad_norm": 2.4770395755767822, "grad_norm_var": 0.6622957356859083, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.825948715209961, "loss/hidden": 0.83984375, "loss/logits": 0.15155267715454102, "loss/reg": 0.002097893040627241, "step": 2459 }, { "epoch": 0.3075, "grad_norm": 2.5629642009735107, "grad_norm_var": 0.6731478243948156, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.5486607551574707, "loss/hidden": 0.90625, "loss/logits": 0.17178046703338623, "loss/reg": 0.0020967568270862103, "step": 2460 }, { "epoch": 0.307625, "grad_norm": 2.5415985584259033, "grad_norm_var": 0.6437978570048586, "learning_rate": 0.0001, "loss": 1.1027, "loss/crossentropy": 2.5102035999298096, "loss/hidden": 0.921875, "loss/logits": 0.15987958014011383, "loss/reg": 0.002095636911690235, "step": 2461 }, { "epoch": 0.30775, "grad_norm": 4.435094833374023, "grad_norm_var": 0.7718150232200209, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.5023560523986816, "loss/hidden": 1.09375, "loss/logits": 0.26149046421051025, "loss/reg": 0.002094438299536705, "step": 2462 }, { "epoch": 0.307875, "grad_norm": 2.209512710571289, "grad_norm_var": 0.8075386717224734, "learning_rate": 0.0001, "loss": 1.0077, "loss/crossentropy": 2.4771201610565186, "loss/hidden": 0.83984375, "loss/logits": 0.14696896076202393, "loss/reg": 0.0020932427141815424, "step": 2463 }, { "epoch": 0.308, "grad_norm": 2.2504334449768066, "grad_norm_var": 0.8399016626513837, "learning_rate": 0.0001, "loss": 1.15, "loss/crossentropy": 2.496920585632324, "loss/hidden": 0.95703125, "loss/logits": 0.17200365662574768, "loss/reg": 0.002092106733471155, "step": 2464 }, { "epoch": 0.308125, "grad_norm": 2.2594423294067383, "grad_norm_var": 0.8685194331274296, "learning_rate": 0.0001, "loss": 1.169, "loss/crossentropy": 2.6099801063537598, "loss/hidden": 0.98046875, "loss/logits": 0.16757354140281677, "loss/reg": 0.002090911380946636, "step": 2465 }, { "epoch": 0.30825, "grad_norm": 9.355794906616211, "grad_norm_var": 3.435404135981017, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 2.9165380001068115, "loss/hidden": 1.2109375, "loss/logits": 0.24246656894683838, "loss/reg": 0.002089662244543433, "step": 2466 }, { "epoch": 0.308375, "grad_norm": 2.4942128658294678, "grad_norm_var": 3.332882892890948, "learning_rate": 0.0001, "loss": 1.1019, "loss/crossentropy": 2.4018900394439697, "loss/hidden": 0.91796875, "loss/logits": 0.1630443036556244, "loss/reg": 0.002088395180180669, "step": 2467 }, { "epoch": 0.3085, "grad_norm": 1.8817147016525269, "grad_norm_var": 3.4398863549356045, "learning_rate": 0.0001, "loss": 0.9327, "loss/crossentropy": 2.4369280338287354, "loss/hidden": 0.78515625, "loss/logits": 0.12664957344532013, "loss/reg": 0.002087142551317811, "step": 2468 }, { "epoch": 0.308625, "grad_norm": 62.395118713378906, "grad_norm_var": 222.7880506780838, "learning_rate": 0.0001, "loss": 1.3917, "loss/crossentropy": 2.208883047103882, "loss/hidden": 1.1640625, "loss/logits": 0.20681332051753998, "loss/reg": 0.002086017047986388, "step": 2469 }, { "epoch": 0.30875, "grad_norm": 2.0631821155548096, "grad_norm_var": 222.90811372337927, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.5229339599609375, "loss/hidden": 0.984375, "loss/logits": 0.15795353055000305, "loss/reg": 0.002084895968437195, "step": 2470 }, { "epoch": 0.308875, "grad_norm": 2.7480690479278564, "grad_norm_var": 223.76344684955927, "learning_rate": 0.0001, "loss": 1.1538, "loss/crossentropy": 2.635938882827759, "loss/hidden": 0.9453125, "loss/logits": 0.18760758638381958, "loss/reg": 0.0020836745388805866, "step": 2471 }, { "epoch": 0.309, "grad_norm": 2.3494324684143066, "grad_norm_var": 224.0641578575862, "learning_rate": 0.0001, "loss": 1.1949, "loss/crossentropy": 2.3463780879974365, "loss/hidden": 1.0078125, "loss/logits": 0.16627079248428345, "loss/reg": 0.0020824531093239784, "step": 2472 }, { "epoch": 0.309125, "grad_norm": 2.974785089492798, "grad_norm_var": 223.6649366301598, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.3789594173431396, "loss/hidden": 1.1015625, "loss/logits": 0.1955249011516571, "loss/reg": 0.002081233076751232, "step": 2473 }, { "epoch": 0.30925, "grad_norm": 2.3864636421203613, "grad_norm_var": 223.69888034013712, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.318861722946167, "loss/hidden": 1.0546875, "loss/logits": 0.1728212833404541, "loss/reg": 0.0020799937192350626, "step": 2474 }, { "epoch": 0.309375, "grad_norm": 2.233569383621216, "grad_norm_var": 223.84004892743891, "learning_rate": 0.0001, "loss": 1.0577, "loss/crossentropy": 2.7401442527770996, "loss/hidden": 0.88671875, "loss/logits": 0.15014401078224182, "loss/reg": 0.0020787438843399286, "step": 2475 }, { "epoch": 0.3095, "grad_norm": 3.0971381664276123, "grad_norm_var": 223.56349078632311, "learning_rate": 0.0001, "loss": 1.1493, "loss/crossentropy": 2.515321731567383, "loss/hidden": 0.984375, "loss/logits": 0.14414407312870026, "loss/reg": 0.002077620942145586, "step": 2476 }, { "epoch": 0.309625, "grad_norm": 1.8496131896972656, "grad_norm_var": 223.97983460323528, "learning_rate": 0.0001, "loss": 1.0677, "loss/crossentropy": 2.3725855350494385, "loss/hidden": 0.890625, "loss/logits": 0.15632164478302002, "loss/reg": 0.002076513599604368, "step": 2477 }, { "epoch": 0.30975, "grad_norm": 2.0794615745544434, "grad_norm_var": 225.03377063332363, "learning_rate": 0.0001, "loss": 1.0074, "loss/crossentropy": 2.5441107749938965, "loss/hidden": 0.84375, "loss/logits": 0.1428467333316803, "loss/reg": 0.0020754833240062, "step": 2478 }, { "epoch": 0.309875, "grad_norm": 3.8224730491638184, "grad_norm_var": 224.26521467728128, "learning_rate": 0.0001, "loss": 1.4865, "loss/crossentropy": 2.1951661109924316, "loss/hidden": 1.25, "loss/logits": 0.215762659907341, "loss/reg": 0.002074404852464795, "step": 2479 }, { "epoch": 0.31, "grad_norm": 2.290989875793457, "grad_norm_var": 224.24158048605454, "learning_rate": 0.0001, "loss": 1.1752, "loss/crossentropy": 2.3957531452178955, "loss/hidden": 1.0, "loss/logits": 0.1544678509235382, "loss/reg": 0.0020732861012220383, "step": 2480 }, { "epoch": 0.310125, "grad_norm": 1.991398811340332, "grad_norm_var": 224.4027209007804, "learning_rate": 0.0001, "loss": 1.0749, "loss/crossentropy": 2.6138253211975098, "loss/hidden": 0.90234375, "loss/logits": 0.15183526277542114, "loss/reg": 0.002072281204164028, "step": 2481 }, { "epoch": 0.31025, "grad_norm": 2.0419325828552246, "grad_norm_var": 225.083804004697, "learning_rate": 0.0001, "loss": 1.0122, "loss/crossentropy": 2.4898176193237305, "loss/hidden": 0.85546875, "loss/logits": 0.1360248327255249, "loss/reg": 0.002071160590276122, "step": 2482 }, { "epoch": 0.310375, "grad_norm": 2.274839162826538, "grad_norm_var": 225.1942905687681, "learning_rate": 0.0001, "loss": 1.1225, "loss/crossentropy": 2.6271538734436035, "loss/hidden": 0.9296875, "loss/logits": 0.1720973402261734, "loss/reg": 0.0020701212342828512, "step": 2483 }, { "epoch": 0.3105, "grad_norm": 2.5726370811462402, "grad_norm_var": 224.8304575888439, "learning_rate": 0.0001, "loss": 1.1261, "loss/crossentropy": 2.377349376678467, "loss/hidden": 0.9453125, "loss/logits": 0.16004841029644012, "loss/reg": 0.002069063950330019, "step": 2484 }, { "epoch": 0.310625, "grad_norm": 2.0471315383911133, "grad_norm_var": 0.2650909035004048, "learning_rate": 0.0001, "loss": 1.0161, "loss/crossentropy": 2.551028251647949, "loss/hidden": 0.84765625, "loss/logits": 0.14776179194450378, "loss/reg": 0.0020679491572082043, "step": 2485 }, { "epoch": 0.31075, "grad_norm": 2.7036123275756836, "grad_norm_var": 0.25970607907091764, "learning_rate": 0.0001, "loss": 1.0807, "loss/crossentropy": 2.4640750885009766, "loss/hidden": 0.90625, "loss/logits": 0.15373341739177704, "loss/reg": 0.002066906075924635, "step": 2486 }, { "epoch": 0.310875, "grad_norm": 4.019110679626465, "grad_norm_var": 0.4084006851376974, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.7196004390716553, "loss/hidden": 1.1796875, "loss/logits": 0.21795910596847534, "loss/reg": 0.0020658550783991814, "step": 2487 }, { "epoch": 0.311, "grad_norm": 2.1248199939727783, "grad_norm_var": 0.41743808538203075, "learning_rate": 0.0001, "loss": 0.9468, "loss/crossentropy": 2.5661206245422363, "loss/hidden": 0.796875, "loss/logits": 0.12931255996227264, "loss/reg": 0.002064791973680258, "step": 2488 }, { "epoch": 0.311125, "grad_norm": 2.4856653213500977, "grad_norm_var": 0.4035055616839363, "learning_rate": 0.0001, "loss": 1.1206, "loss/crossentropy": 2.3283872604370117, "loss/hidden": 0.94921875, "loss/logits": 0.1507793515920639, "loss/reg": 0.0020636676345020533, "step": 2489 }, { "epoch": 0.31125, "grad_norm": 3.1577935218811035, "grad_norm_var": 0.4288793321989781, "learning_rate": 0.0001, "loss": 1.3234, "loss/crossentropy": 2.5108306407928467, "loss/hidden": 1.109375, "loss/logits": 0.19337984919548035, "loss/reg": 0.0020625172182917595, "step": 2490 }, { "epoch": 0.311375, "grad_norm": 2.4464476108551025, "grad_norm_var": 0.42274402306383363, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.6824586391448975, "loss/hidden": 0.984375, "loss/logits": 0.17150135338306427, "loss/reg": 0.0020613418892025948, "step": 2491 }, { "epoch": 0.3115, "grad_norm": 2.192991018295288, "grad_norm_var": 0.40942260104406974, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.5312118530273438, "loss/hidden": 0.90625, "loss/logits": 0.17700329422950745, "loss/reg": 0.0020601488649845123, "step": 2492 }, { "epoch": 0.311625, "grad_norm": 2.0428884029388428, "grad_norm_var": 0.3948342810350006, "learning_rate": 0.0001, "loss": 0.9967, "loss/crossentropy": 2.6421899795532227, "loss/hidden": 0.828125, "loss/logits": 0.1479574739933014, "loss/reg": 0.0020590380299836397, "step": 2493 }, { "epoch": 0.31175, "grad_norm": 2.4641101360321045, "grad_norm_var": 0.38157049870508125, "learning_rate": 0.0001, "loss": 1.0838, "loss/crossentropy": 2.665184736251831, "loss/hidden": 0.91015625, "loss/logits": 0.15311187505722046, "loss/reg": 0.0020578729454427958, "step": 2494 }, { "epoch": 0.311875, "grad_norm": 6.863973617553711, "grad_norm_var": 1.4788420625680108, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.0577409267425537, "loss/hidden": 1.125, "loss/logits": 0.21451006829738617, "loss/reg": 0.0020567593164741993, "step": 2495 }, { "epoch": 0.312, "grad_norm": 2.0063822269439697, "grad_norm_var": 1.500659755343929, "learning_rate": 0.0001, "loss": 1.0094, "loss/crossentropy": 2.3908817768096924, "loss/hidden": 0.84375, "loss/logits": 0.14511646330356598, "loss/reg": 0.0020556054078042507, "step": 2496 }, { "epoch": 0.312125, "grad_norm": 2.1783390045166016, "grad_norm_var": 1.4848145462891105, "learning_rate": 0.0001, "loss": 0.9983, "loss/crossentropy": 2.5374248027801514, "loss/hidden": 0.83984375, "loss/logits": 0.13792040944099426, "loss/reg": 0.002054414479061961, "step": 2497 }, { "epoch": 0.31225, "grad_norm": 49.8923454284668, "grad_norm_var": 140.22164099225867, "learning_rate": 0.0001, "loss": 1.0784, "loss/crossentropy": 2.484605073928833, "loss/hidden": 0.91015625, "loss/logits": 0.14767885208129883, "loss/reg": 0.0020532880444079638, "step": 2498 }, { "epoch": 0.312375, "grad_norm": 2.2526962757110596, "grad_norm_var": 140.23183442091207, "learning_rate": 0.0001, "loss": 1.0697, "loss/crossentropy": 2.460233688354492, "loss/hidden": 0.90625, "loss/logits": 0.14288920164108276, "loss/reg": 0.0020521648693829775, "step": 2499 }, { "epoch": 0.3125, "grad_norm": 1.8620309829711914, "grad_norm_var": 140.56119026050018, "learning_rate": 0.0001, "loss": 0.9349, "loss/crossentropy": 2.170700788497925, "loss/hidden": 0.79296875, "loss/logits": 0.12138842046260834, "loss/reg": 0.0020510072354227304, "step": 2500 }, { "epoch": 0.312625, "grad_norm": 2.6856696605682373, "grad_norm_var": 140.2781199385601, "learning_rate": 0.0001, "loss": 1.0955, "loss/crossentropy": 2.5085690021514893, "loss/hidden": 0.9375, "loss/logits": 0.13753914833068848, "loss/reg": 0.002049813512712717, "step": 2501 }, { "epoch": 0.31275, "grad_norm": 1.9341952800750732, "grad_norm_var": 140.62366325480613, "learning_rate": 0.0001, "loss": 1.1243, "loss/crossentropy": 2.5098164081573486, "loss/hidden": 0.93359375, "loss/logits": 0.17017102241516113, "loss/reg": 0.0020485916174948215, "step": 2502 }, { "epoch": 0.312875, "grad_norm": 2.370537519454956, "grad_norm_var": 141.1548883007058, "learning_rate": 0.0001, "loss": 0.989, "loss/crossentropy": 2.7039451599121094, "loss/hidden": 0.83203125, "loss/logits": 0.13650229573249817, "loss/reg": 0.0020473783370107412, "step": 2503 }, { "epoch": 0.313, "grad_norm": 1.9530695676803589, "grad_norm_var": 141.23539902424713, "learning_rate": 0.0001, "loss": 0.9323, "loss/crossentropy": 2.379255771636963, "loss/hidden": 0.79296875, "loss/logits": 0.11891569197177887, "loss/reg": 0.002046172507107258, "step": 2504 }, { "epoch": 0.313125, "grad_norm": 2.1750881671905518, "grad_norm_var": 141.3682945202908, "learning_rate": 0.0001, "loss": 1.0844, "loss/crossentropy": 2.677823066711426, "loss/hidden": 0.8984375, "loss/logits": 0.1655401587486267, "loss/reg": 0.0020449531730264425, "step": 2505 }, { "epoch": 0.31325, "grad_norm": 2.4236440658569336, "grad_norm_var": 141.63417887755597, "learning_rate": 0.0001, "loss": 1.2346, "loss/crossentropy": 2.3126349449157715, "loss/hidden": 1.0390625, "loss/logits": 0.17513760924339294, "loss/reg": 0.002043848391622305, "step": 2506 }, { "epoch": 0.313375, "grad_norm": 2.162949562072754, "grad_norm_var": 141.75402173026126, "learning_rate": 0.0001, "loss": 1.1092, "loss/crossentropy": 2.7810885906219482, "loss/hidden": 0.921875, "loss/logits": 0.1669064164161682, "loss/reg": 0.0020427126437425613, "step": 2507 }, { "epoch": 0.3135, "grad_norm": 2.0320096015930176, "grad_norm_var": 141.8259004898801, "learning_rate": 0.0001, "loss": 0.9906, "loss/crossentropy": 2.5370616912841797, "loss/hidden": 0.84375, "loss/logits": 0.12646229565143585, "loss/reg": 0.002041601575911045, "step": 2508 }, { "epoch": 0.313625, "grad_norm": 2.014404296875, "grad_norm_var": 141.83891472266623, "learning_rate": 0.0001, "loss": 1.0812, "loss/crossentropy": 2.496274471282959, "loss/hidden": 0.91796875, "loss/logits": 0.1428661197423935, "loss/reg": 0.002040464896708727, "step": 2509 }, { "epoch": 0.31375, "grad_norm": 2.1024680137634277, "grad_norm_var": 141.99128057353707, "learning_rate": 0.0001, "loss": 1.0575, "loss/crossentropy": 2.7689507007598877, "loss/hidden": 0.890625, "loss/logits": 0.14650031924247742, "loss/reg": 0.0020393847953528166, "step": 2510 }, { "epoch": 0.313875, "grad_norm": 1.835846185684204, "grad_norm_var": 142.61129817646503, "learning_rate": 0.0001, "loss": 1.1305, "loss/crossentropy": 2.443084239959717, "loss/hidden": 0.9453125, "loss/logits": 0.16476798057556152, "loss/reg": 0.002038278616964817, "step": 2511 }, { "epoch": 0.314, "grad_norm": 2.5783400535583496, "grad_norm_var": 142.39447908562437, "learning_rate": 0.0001, "loss": 1.3831, "loss/crossentropy": 2.287595748901367, "loss/hidden": 1.15625, "loss/logits": 0.20648595690727234, "loss/reg": 0.002037204336374998, "step": 2512 }, { "epoch": 0.314125, "grad_norm": 2.237896680831909, "grad_norm_var": 142.37107613146216, "learning_rate": 0.0001, "loss": 1.0109, "loss/crossentropy": 2.2790188789367676, "loss/hidden": 0.859375, "loss/logits": 0.13120730221271515, "loss/reg": 0.0020360846538096666, "step": 2513 }, { "epoch": 0.31425, "grad_norm": 2.013672113418579, "grad_norm_var": 0.061979443739919246, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.722689151763916, "loss/hidden": 0.8671875, "loss/logits": 0.1342456042766571, "loss/reg": 0.0020349263213574886, "step": 2514 }, { "epoch": 0.314375, "grad_norm": 11.686729431152344, "grad_norm_var": 5.735282377986308, "learning_rate": 0.0001, "loss": 1.5012, "loss/crossentropy": 3.044372797012329, "loss/hidden": 1.2890625, "loss/logits": 0.19183599948883057, "loss/reg": 0.0020337440073490143, "step": 2515 }, { "epoch": 0.3145, "grad_norm": 2.058525323867798, "grad_norm_var": 5.714319137050961, "learning_rate": 0.0001, "loss": 1.0859, "loss/crossentropy": 2.475882053375244, "loss/hidden": 0.90625, "loss/logits": 0.15936745703220367, "loss/reg": 0.002032553544268012, "step": 2516 }, { "epoch": 0.314625, "grad_norm": 1.9261480569839478, "grad_norm_var": 5.758565973046831, "learning_rate": 0.0001, "loss": 1.0914, "loss/crossentropy": 2.4602651596069336, "loss/hidden": 0.9140625, "loss/logits": 0.15698175132274628, "loss/reg": 0.0020313519053161144, "step": 2517 }, { "epoch": 0.31475, "grad_norm": 1.9977660179138184, "grad_norm_var": 5.75216566114615, "learning_rate": 0.0001, "loss": 1.0713, "loss/crossentropy": 2.4952573776245117, "loss/hidden": 0.89453125, "loss/logits": 0.1564377248287201, "loss/reg": 0.002030135365203023, "step": 2518 }, { "epoch": 0.314875, "grad_norm": 2.5560109615325928, "grad_norm_var": 5.745597670157684, "learning_rate": 0.0001, "loss": 0.9993, "loss/crossentropy": 2.472264051437378, "loss/hidden": 0.83984375, "loss/logits": 0.1391814798116684, "loss/reg": 0.0020289106760174036, "step": 2519 }, { "epoch": 0.315, "grad_norm": 2.2961292266845703, "grad_norm_var": 5.717202314644479, "learning_rate": 0.0001, "loss": 1.0263, "loss/crossentropy": 2.2715659141540527, "loss/hidden": 0.87109375, "loss/logits": 0.13494260609149933, "loss/reg": 0.0020276703871786594, "step": 2520 }, { "epoch": 0.315125, "grad_norm": 13.915026664733887, "grad_norm_var": 13.421861919816056, "learning_rate": 0.0001, "loss": 1.1143, "loss/crossentropy": 2.7245688438415527, "loss/hidden": 0.94921875, "loss/logits": 0.14481759071350098, "loss/reg": 0.0020265690982341766, "step": 2521 }, { "epoch": 0.31525, "grad_norm": 2.3356447219848633, "grad_norm_var": 13.434855944205257, "learning_rate": 0.0001, "loss": 1.2115, "loss/crossentropy": 2.5406410694122314, "loss/hidden": 1.015625, "loss/logits": 0.17557372152805328, "loss/reg": 0.0020254673436284065, "step": 2522 }, { "epoch": 0.315375, "grad_norm": 3.4417669773101807, "grad_norm_var": 13.311756518319068, "learning_rate": 0.0001, "loss": 1.1743, "loss/crossentropy": 2.2371115684509277, "loss/hidden": 1.0078125, "loss/logits": 0.14626199007034302, "loss/reg": 0.0020243863109499216, "step": 2523 }, { "epoch": 0.3155, "grad_norm": 2.596271514892578, "grad_norm_var": 13.216376193802184, "learning_rate": 0.0001, "loss": 1.0833, "loss/crossentropy": 2.457688331604004, "loss/hidden": 0.90625, "loss/logits": 0.156768798828125, "loss/reg": 0.0020232631359249353, "step": 2524 }, { "epoch": 0.315625, "grad_norm": 2.2715678215026855, "grad_norm_var": 13.166157619903226, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.7886834144592285, "loss/hidden": 0.91796875, "loss/logits": 0.15427999198436737, "loss/reg": 0.002022197237238288, "step": 2525 }, { "epoch": 0.31575, "grad_norm": 2.3366000652313232, "grad_norm_var": 13.122346964747104, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.660529613494873, "loss/hidden": 0.890625, "loss/logits": 0.16764235496520996, "loss/reg": 0.002021095482632518, "step": 2526 }, { "epoch": 0.315875, "grad_norm": 2.2059381008148193, "grad_norm_var": 13.042361733236818, "learning_rate": 0.0001, "loss": 1.2737, "loss/crossentropy": 2.3842883110046387, "loss/hidden": 1.0625, "loss/logits": 0.19095614552497864, "loss/reg": 0.0020200416911393404, "step": 2527 }, { "epoch": 0.316, "grad_norm": 2.15350079536438, "grad_norm_var": 13.114537983924208, "learning_rate": 0.0001, "loss": 1.1471, "loss/crossentropy": 2.4741461277008057, "loss/hidden": 0.96484375, "loss/logits": 0.16207173466682434, "loss/reg": 0.0020190184004604816, "step": 2528 }, { "epoch": 0.316125, "grad_norm": 1.8526016473770142, "grad_norm_var": 13.195169190367434, "learning_rate": 0.0001, "loss": 1.0734, "loss/crossentropy": 2.4597058296203613, "loss/hidden": 0.8984375, "loss/logits": 0.15478196740150452, "loss/reg": 0.0020179112907499075, "step": 2529 }, { "epoch": 0.31625, "grad_norm": 2.459451675415039, "grad_norm_var": 13.113139068311753, "learning_rate": 0.0001, "loss": 1.2066, "loss/crossentropy": 2.008545398712158, "loss/hidden": 1.0390625, "loss/logits": 0.14741522073745728, "loss/reg": 0.002016807673498988, "step": 2530 }, { "epoch": 0.316375, "grad_norm": 2.5776121616363525, "grad_norm_var": 8.51458245781698, "learning_rate": 0.0001, "loss": 1.0334, "loss/crossentropy": 2.3403666019439697, "loss/hidden": 0.859375, "loss/logits": 0.15389090776443481, "loss/reg": 0.002015738980844617, "step": 2531 }, { "epoch": 0.3165, "grad_norm": 2.07371187210083, "grad_norm_var": 8.512566410958884, "learning_rate": 0.0001, "loss": 0.987, "loss/crossentropy": 2.480900764465332, "loss/hidden": 0.83203125, "loss/logits": 0.13479670882225037, "loss/reg": 0.0020146409515291452, "step": 2532 }, { "epoch": 0.316625, "grad_norm": 2.789600372314453, "grad_norm_var": 8.428369110566658, "learning_rate": 0.0001, "loss": 1.0663, "loss/crossentropy": 2.4941792488098145, "loss/hidden": 0.90234375, "loss/logits": 0.14378391206264496, "loss/reg": 0.0020135974045842886, "step": 2533 }, { "epoch": 0.31675, "grad_norm": 1.8812384605407715, "grad_norm_var": 8.446594895458308, "learning_rate": 0.0001, "loss": 1.0759, "loss/crossentropy": 2.2334728240966797, "loss/hidden": 0.8984375, "loss/logits": 0.15728960931301117, "loss/reg": 0.002012489829212427, "step": 2534 }, { "epoch": 0.316875, "grad_norm": 14.457756042480469, "grad_norm_var": 16.422409560799995, "learning_rate": 0.0001, "loss": 2.6008, "loss/crossentropy": 2.998446464538574, "loss/hidden": 2.21875, "loss/logits": 0.36195558309555054, "loss/reg": 0.002011369913816452, "step": 2535 }, { "epoch": 0.317, "grad_norm": 2.382647752761841, "grad_norm_var": 16.40492022897086, "learning_rate": 0.0001, "loss": 0.9434, "loss/crossentropy": 2.729083776473999, "loss/hidden": 0.77734375, "loss/logits": 0.14595064520835876, "loss/reg": 0.002010288182646036, "step": 2536 }, { "epoch": 0.317125, "grad_norm": 2.000728130340576, "grad_norm_var": 9.300796237309086, "learning_rate": 0.0001, "loss": 0.9761, "loss/crossentropy": 2.2422895431518555, "loss/hidden": 0.83203125, "loss/logits": 0.12397600710391998, "loss/reg": 0.0020091875921934843, "step": 2537 }, { "epoch": 0.31725, "grad_norm": 2.1036500930786133, "grad_norm_var": 9.32822241474606, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.5317108631134033, "loss/hidden": 0.88671875, "loss/logits": 0.1535404622554779, "loss/reg": 0.0020081019029021263, "step": 2538 }, { "epoch": 0.317375, "grad_norm": 2.237929105758667, "grad_norm_var": 9.363787355681984, "learning_rate": 0.0001, "loss": 1.1197, "loss/crossentropy": 2.534454822540283, "loss/hidden": 0.953125, "loss/logits": 0.14651340246200562, "loss/reg": 0.002007076982408762, "step": 2539 }, { "epoch": 0.3175, "grad_norm": 2.1494557857513428, "grad_norm_var": 9.401735338638206, "learning_rate": 0.0001, "loss": 0.9989, "loss/crossentropy": 2.445340871810913, "loss/hidden": 0.84375, "loss/logits": 0.13510264456272125, "loss/reg": 0.002005952876061201, "step": 2540 }, { "epoch": 0.317625, "grad_norm": 2.4947187900543213, "grad_norm_var": 9.383296983170782, "learning_rate": 0.0001, "loss": 1.438, "loss/crossentropy": 2.328836441040039, "loss/hidden": 1.2109375, "loss/logits": 0.20696458220481873, "loss/reg": 0.0020048548467457294, "step": 2541 }, { "epoch": 0.31775, "grad_norm": 2.7269442081451416, "grad_norm_var": 9.357781621923724, "learning_rate": 0.0001, "loss": 1.1688, "loss/crossentropy": 2.5824856758117676, "loss/hidden": 0.99609375, "loss/logits": 0.152710422873497, "loss/reg": 0.00200366391800344, "step": 2542 }, { "epoch": 0.317875, "grad_norm": 2.341979742050171, "grad_norm_var": 9.343914259089363, "learning_rate": 0.0001, "loss": 0.9573, "loss/crossentropy": 2.4298720359802246, "loss/hidden": 0.80859375, "loss/logits": 0.12863609194755554, "loss/reg": 0.0020024811383336782, "step": 2543 }, { "epoch": 0.318, "grad_norm": 2.148449182510376, "grad_norm_var": 9.344514786448432, "learning_rate": 0.0001, "loss": 1.1332, "loss/crossentropy": 2.5009348392486572, "loss/hidden": 0.95703125, "loss/logits": 0.15610834956169128, "loss/reg": 0.0020013500470668077, "step": 2544 }, { "epoch": 0.318125, "grad_norm": 2.463320016860962, "grad_norm_var": 9.27094123457197, "learning_rate": 0.0001, "loss": 1.0071, "loss/crossentropy": 2.3776540756225586, "loss/hidden": 0.85546875, "loss/logits": 0.1316118985414505, "loss/reg": 0.0020002038218080997, "step": 2545 }, { "epoch": 0.31825, "grad_norm": 3.8832123279571533, "grad_norm_var": 9.279723919386347, "learning_rate": 0.0001, "loss": 1.1653, "loss/crossentropy": 2.4246480464935303, "loss/hidden": 0.99609375, "loss/logits": 0.1492367684841156, "loss/reg": 0.0019991095177829266, "step": 2546 }, { "epoch": 0.318375, "grad_norm": 2.8784990310668945, "grad_norm_var": 9.261634330154886, "learning_rate": 0.0001, "loss": 1.06, "loss/crossentropy": 2.4725842475891113, "loss/hidden": 0.91015625, "loss/logits": 0.12982821464538574, "loss/reg": 0.0019980138167738914, "step": 2547 }, { "epoch": 0.3185, "grad_norm": 2.188722848892212, "grad_norm_var": 9.245368070241575, "learning_rate": 0.0001, "loss": 0.9404, "loss/crossentropy": 2.2547831535339355, "loss/hidden": 0.79296875, "loss/logits": 0.12745878100395203, "loss/reg": 0.001996915554627776, "step": 2548 }, { "epoch": 0.318625, "grad_norm": 3.8353192806243896, "grad_norm_var": 9.257111893384563, "learning_rate": 0.0001, "loss": 1.5225, "loss/crossentropy": 2.8062922954559326, "loss/hidden": 1.296875, "loss/logits": 0.20562350749969482, "loss/reg": 0.0019958338234573603, "step": 2549 }, { "epoch": 0.31875, "grad_norm": 2.5866293907165527, "grad_norm_var": 9.158449313938563, "learning_rate": 0.0001, "loss": 1.2442, "loss/crossentropy": 2.1596083641052246, "loss/hidden": 1.046875, "loss/logits": 0.17734473943710327, "loss/reg": 0.0019946913234889507, "step": 2550 }, { "epoch": 0.318875, "grad_norm": 2.223085880279541, "grad_norm_var": 0.3205203250514669, "learning_rate": 0.0001, "loss": 1.1109, "loss/crossentropy": 2.4585084915161133, "loss/hidden": 0.94140625, "loss/logits": 0.14955827593803406, "loss/reg": 0.001993620302528143, "step": 2551 }, { "epoch": 0.319, "grad_norm": 2.2888619899749756, "grad_norm_var": 0.323041849650883, "learning_rate": 0.0001, "loss": 0.9568, "loss/crossentropy": 2.677177667617798, "loss/hidden": 0.79296875, "loss/logits": 0.14391276240348816, "loss/reg": 0.001992595847696066, "step": 2552 }, { "epoch": 0.319125, "grad_norm": 2.4546878337860107, "grad_norm_var": 0.3036155598438313, "learning_rate": 0.0001, "loss": 1.0778, "loss/crossentropy": 2.357370615005493, "loss/hidden": 0.91796875, "loss/logits": 0.13996361196041107, "loss/reg": 0.001991493860259652, "step": 2553 }, { "epoch": 0.31925, "grad_norm": 2.536940336227417, "grad_norm_var": 0.2888209107285185, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.638209581375122, "loss/hidden": 0.96875, "loss/logits": 0.15246427059173584, "loss/reg": 0.001990404212847352, "step": 2554 }, { "epoch": 0.319375, "grad_norm": 2.7242743968963623, "grad_norm_var": 0.2807787845320469, "learning_rate": 0.0001, "loss": 1.1024, "loss/crossentropy": 2.529128074645996, "loss/hidden": 0.92578125, "loss/logits": 0.15671178698539734, "loss/reg": 0.001989346230402589, "step": 2555 }, { "epoch": 0.3195, "grad_norm": 1.9480868577957153, "grad_norm_var": 0.29595541597851416, "learning_rate": 0.0001, "loss": 1.0473, "loss/crossentropy": 2.3832671642303467, "loss/hidden": 0.8828125, "loss/logits": 0.14461234211921692, "loss/reg": 0.0019883187487721443, "step": 2556 }, { "epoch": 0.319625, "grad_norm": 1.942399263381958, "grad_norm_var": 0.32334414929385763, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.4262657165527344, "loss/hidden": 0.8515625, "loss/logits": 0.1420045793056488, "loss/reg": 0.001987228635698557, "step": 2557 }, { "epoch": 0.31975, "grad_norm": 2.044696569442749, "grad_norm_var": 0.3384511732389848, "learning_rate": 0.0001, "loss": 1.0484, "loss/crossentropy": 2.503047466278076, "loss/hidden": 0.8828125, "loss/logits": 0.14570119976997375, "loss/reg": 0.0019861541222780943, "step": 2558 }, { "epoch": 0.319875, "grad_norm": 2.049950122833252, "grad_norm_var": 0.3511245559432036, "learning_rate": 0.0001, "loss": 1.0753, "loss/crossentropy": 2.2448768615722656, "loss/hidden": 0.90625, "loss/logits": 0.1492462009191513, "loss/reg": 0.0019850872922688723, "step": 2559 }, { "epoch": 0.32, "grad_norm": 2.087944269180298, "grad_norm_var": 0.35428882942596773, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.5325615406036377, "loss/hidden": 0.89453125, "loss/logits": 0.14472955465316772, "loss/reg": 0.001983870519325137, "step": 2560 }, { "epoch": 0.320125, "grad_norm": 2.1157948970794678, "grad_norm_var": 0.3639325024825874, "learning_rate": 0.0001, "loss": 1.0971, "loss/crossentropy": 2.698012113571167, "loss/hidden": 0.91796875, "loss/logits": 0.1592620313167572, "loss/reg": 0.001982634887099266, "step": 2561 }, { "epoch": 0.32025, "grad_norm": 3.156216859817505, "grad_norm_var": 0.2616089448480579, "learning_rate": 0.0001, "loss": 1.3014, "loss/crossentropy": 2.4384994506835938, "loss/hidden": 1.0859375, "loss/logits": 0.19568133354187012, "loss/reg": 0.0019815664272755384, "step": 2562 }, { "epoch": 0.320375, "grad_norm": 2.4762980937957764, "grad_norm_var": 0.24827810324392907, "learning_rate": 0.0001, "loss": 1.2071, "loss/crossentropy": 2.044973850250244, "loss/hidden": 1.0234375, "loss/logits": 0.16388536989688873, "loss/reg": 0.001980493776500225, "step": 2563 }, { "epoch": 0.3205, "grad_norm": 2.6863152980804443, "grad_norm_var": 0.24865793239752534, "learning_rate": 0.0001, "loss": 1.3931, "loss/crossentropy": 2.379929542541504, "loss/hidden": 1.1640625, "loss/logits": 0.20923388004302979, "loss/reg": 0.001979437656700611, "step": 2564 }, { "epoch": 0.320625, "grad_norm": 2.096332311630249, "grad_norm_var": 0.11583983357218598, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.4329240322113037, "loss/hidden": 1.0078125, "loss/logits": 0.16917604207992554, "loss/reg": 0.0019783389288932085, "step": 2565 }, { "epoch": 0.32075, "grad_norm": 2.109616756439209, "grad_norm_var": 0.11428969704909846, "learning_rate": 0.0001, "loss": 1.0618, "loss/crossentropy": 2.3552322387695312, "loss/hidden": 0.90234375, "loss/logits": 0.13967595994472504, "loss/reg": 0.001977319363504648, "step": 2566 }, { "epoch": 0.320875, "grad_norm": 1.929050326347351, "grad_norm_var": 0.123055373020703, "learning_rate": 0.0001, "loss": 1.177, "loss/crossentropy": 2.507193088531494, "loss/hidden": 0.984375, "loss/logits": 0.17285877466201782, "loss/reg": 0.001976217143237591, "step": 2567 }, { "epoch": 0.321, "grad_norm": 2.4638171195983887, "grad_norm_var": 0.12493102195682747, "learning_rate": 0.0001, "loss": 1.2868, "loss/crossentropy": 2.008462905883789, "loss/hidden": 1.09375, "loss/logits": 0.1733073592185974, "loss/reg": 0.0019752399530261755, "step": 2568 }, { "epoch": 0.321125, "grad_norm": 2.6583945751190186, "grad_norm_var": 0.1316879484045176, "learning_rate": 0.0001, "loss": 1.0465, "loss/crossentropy": 2.6284821033477783, "loss/hidden": 0.8671875, "loss/logits": 0.15960273146629333, "loss/reg": 0.0019742597360163927, "step": 2569 }, { "epoch": 0.32125, "grad_norm": 2.647958993911743, "grad_norm_var": 0.13575637260402093, "learning_rate": 0.0001, "loss": 1.0464, "loss/crossentropy": 2.5968806743621826, "loss/hidden": 0.88671875, "loss/logits": 0.1399203985929489, "loss/reg": 0.0019733814988285303, "step": 2570 }, { "epoch": 0.321375, "grad_norm": 3.293423652648926, "grad_norm_var": 0.1865997232768431, "learning_rate": 0.0001, "loss": 1.1069, "loss/crossentropy": 2.523405075073242, "loss/hidden": 0.92578125, "loss/logits": 0.16140154004096985, "loss/reg": 0.0019725202582776546, "step": 2571 }, { "epoch": 0.3215, "grad_norm": 3.2771072387695312, "grad_norm_var": 0.22459582472020062, "learning_rate": 0.0001, "loss": 1.3109, "loss/crossentropy": 2.6093358993530273, "loss/hidden": 1.109375, "loss/logits": 0.18180875480175018, "loss/reg": 0.0019716897513717413, "step": 2572 }, { "epoch": 0.321625, "grad_norm": 2.5897514820098877, "grad_norm_var": 0.20786292164714926, "learning_rate": 0.0001, "loss": 1.062, "loss/crossentropy": 2.612414836883545, "loss/hidden": 0.890625, "loss/logits": 0.15162289142608643, "loss/reg": 0.0019706198945641518, "step": 2573 }, { "epoch": 0.32175, "grad_norm": 2.6358892917633057, "grad_norm_var": 0.1953809808336136, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.7292776107788086, "loss/hidden": 0.9140625, "loss/logits": 0.16630934178829193, "loss/reg": 0.0019697847310453653, "step": 2574 }, { "epoch": 0.321875, "grad_norm": 2.1441431045532227, "grad_norm_var": 0.19006833028778178, "learning_rate": 0.0001, "loss": 0.999, "loss/crossentropy": 2.820829153060913, "loss/hidden": 0.84765625, "loss/logits": 0.13168208301067352, "loss/reg": 0.001968713477253914, "step": 2575 }, { "epoch": 0.322, "grad_norm": 2.262800931930542, "grad_norm_var": 0.1818361937293308, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.4279236793518066, "loss/hidden": 0.97265625, "loss/logits": 0.1515251100063324, "loss/reg": 0.0019678673706948757, "step": 2576 }, { "epoch": 0.322125, "grad_norm": 2.2489492893218994, "grad_norm_var": 0.17552075343180124, "learning_rate": 0.0001, "loss": 1.1435, "loss/crossentropy": 2.3599050045013428, "loss/hidden": 0.9609375, "loss/logits": 0.16286428272724152, "loss/reg": 0.001967033138498664, "step": 2577 }, { "epoch": 0.32225, "grad_norm": 2.3652756214141846, "grad_norm_var": 0.1498722088758777, "learning_rate": 0.0001, "loss": 1.0416, "loss/crossentropy": 2.7162282466888428, "loss/hidden": 0.875, "loss/logits": 0.14697779715061188, "loss/reg": 0.001966227311640978, "step": 2578 }, { "epoch": 0.322375, "grad_norm": 2.325157403945923, "grad_norm_var": 0.1516328842554282, "learning_rate": 0.0001, "loss": 1.0408, "loss/crossentropy": 2.294282913208008, "loss/hidden": 0.8828125, "loss/logits": 0.13837048411369324, "loss/reg": 0.00196542008779943, "step": 2579 }, { "epoch": 0.3225, "grad_norm": 2.701958179473877, "grad_norm_var": 0.1520714562883739, "learning_rate": 0.0001, "loss": 1.0467, "loss/crossentropy": 2.4668502807617188, "loss/hidden": 0.87109375, "loss/logits": 0.15591362118721008, "loss/reg": 0.0019645460415631533, "step": 2580 }, { "epoch": 0.322625, "grad_norm": 1.9630318880081177, "grad_norm_var": 0.16007843779454353, "learning_rate": 0.0001, "loss": 1.044, "loss/crossentropy": 2.502182722091675, "loss/hidden": 0.86328125, "loss/logits": 0.16107147932052612, "loss/reg": 0.001963698072358966, "step": 2581 }, { "epoch": 0.32275, "grad_norm": 2.698295831680298, "grad_norm_var": 0.15297816024436747, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.5543415546417236, "loss/hidden": 0.90234375, "loss/logits": 0.1730049103498459, "loss/reg": 0.00196263799443841, "step": 2582 }, { "epoch": 0.322875, "grad_norm": 2.113349199295044, "grad_norm_var": 0.14075613757611471, "learning_rate": 0.0001, "loss": 1.0551, "loss/crossentropy": 2.528599977493286, "loss/hidden": 0.8828125, "loss/logits": 0.1526312530040741, "loss/reg": 0.001961572328582406, "step": 2583 }, { "epoch": 0.323, "grad_norm": 2.890878677368164, "grad_norm_var": 0.1487092045835197, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.4257144927978516, "loss/hidden": 1.0390625, "loss/logits": 0.18444174528121948, "loss/reg": 0.0019605166744440794, "step": 2584 }, { "epoch": 0.323125, "grad_norm": 2.090179204940796, "grad_norm_var": 0.16075379569648565, "learning_rate": 0.0001, "loss": 1.0713, "loss/crossentropy": 2.606339693069458, "loss/hidden": 0.90625, "loss/logits": 0.14540623128414154, "loss/reg": 0.001959529472514987, "step": 2585 }, { "epoch": 0.32325, "grad_norm": 2.3992185592651367, "grad_norm_var": 0.16022803998024712, "learning_rate": 0.0001, "loss": 1.0983, "loss/crossentropy": 2.474379301071167, "loss/hidden": 0.921875, "loss/logits": 0.1568717360496521, "loss/reg": 0.001958573702722788, "step": 2586 }, { "epoch": 0.323375, "grad_norm": 1.9647669792175293, "grad_norm_var": 0.1299961864540321, "learning_rate": 0.0001, "loss": 0.9863, "loss/crossentropy": 2.5449929237365723, "loss/hidden": 0.83984375, "loss/logits": 0.12689873576164246, "loss/reg": 0.0019576323684304953, "step": 2587 }, { "epoch": 0.3235, "grad_norm": 1.868558406829834, "grad_norm_var": 0.09244842162277672, "learning_rate": 0.0001, "loss": 0.9152, "loss/crossentropy": 2.3570404052734375, "loss/hidden": 0.77734375, "loss/logits": 0.11832741647958755, "loss/reg": 0.0019565485417842865, "step": 2588 }, { "epoch": 0.323625, "grad_norm": 2.1048176288604736, "grad_norm_var": 0.09027908715485934, "learning_rate": 0.0001, "loss": 1.0699, "loss/crossentropy": 2.3770053386688232, "loss/hidden": 0.90625, "loss/logits": 0.14414173364639282, "loss/reg": 0.001955605112016201, "step": 2589 }, { "epoch": 0.32375, "grad_norm": 2.4909298419952393, "grad_norm_var": 0.08507291369899021, "learning_rate": 0.0001, "loss": 1.0994, "loss/crossentropy": 2.4076027870178223, "loss/hidden": 0.921875, "loss/logits": 0.15802565217018127, "loss/reg": 0.0019545473624020815, "step": 2590 }, { "epoch": 0.323875, "grad_norm": 2.3506321907043457, "grad_norm_var": 0.08373528956607425, "learning_rate": 0.0001, "loss": 1.0926, "loss/crossentropy": 2.1984875202178955, "loss/hidden": 0.9140625, "loss/logits": 0.15897971391677856, "loss/reg": 0.0019535759929567575, "step": 2591 }, { "epoch": 0.324, "grad_norm": 2.2246623039245605, "grad_norm_var": 0.0840276935431565, "learning_rate": 0.0001, "loss": 1.0407, "loss/crossentropy": 2.5797808170318604, "loss/hidden": 0.875, "loss/logits": 0.14616936445236206, "loss/reg": 0.00195262860506773, "step": 2592 }, { "epoch": 0.324125, "grad_norm": 2.7434518337249756, "grad_norm_var": 0.09594230586653267, "learning_rate": 0.0001, "loss": 1.2661, "loss/crossentropy": 2.5231878757476807, "loss/hidden": 1.0703125, "loss/logits": 0.1762571930885315, "loss/reg": 0.0019517639884725213, "step": 2593 }, { "epoch": 0.32425, "grad_norm": 2.643927574157715, "grad_norm_var": 0.10207064215561695, "learning_rate": 0.0001, "loss": 1.0039, "loss/crossentropy": 2.4751248359680176, "loss/hidden": 0.85546875, "loss/logits": 0.1289035677909851, "loss/reg": 0.001950973179191351, "step": 2594 }, { "epoch": 0.324375, "grad_norm": 2.617863178253174, "grad_norm_var": 0.10651976033177034, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.4379770755767822, "loss/hidden": 0.96875, "loss/logits": 0.18371832370758057, "loss/reg": 0.0019500176422297955, "step": 2595 }, { "epoch": 0.3245, "grad_norm": 2.211016893386841, "grad_norm_var": 0.09963533261878149, "learning_rate": 0.0001, "loss": 1.1805, "loss/crossentropy": 2.067227602005005, "loss/hidden": 1.0078125, "loss/logits": 0.1531563699245453, "loss/reg": 0.001949155586771667, "step": 2596 }, { "epoch": 0.324625, "grad_norm": 2.112340211868286, "grad_norm_var": 0.09360420011223501, "learning_rate": 0.0001, "loss": 1.1613, "loss/crossentropy": 2.4788920879364014, "loss/hidden": 0.9765625, "loss/logits": 0.16530287265777588, "loss/reg": 0.001948309363797307, "step": 2597 }, { "epoch": 0.32475, "grad_norm": 2.7137415409088135, "grad_norm_var": 0.0943460688032161, "learning_rate": 0.0001, "loss": 1.1684, "loss/crossentropy": 2.2482478618621826, "loss/hidden": 0.9921875, "loss/logits": 0.1567087471485138, "loss/reg": 0.0019474788568913937, "step": 2598 }, { "epoch": 0.324875, "grad_norm": 2.5278706550598145, "grad_norm_var": 0.09221184941195375, "learning_rate": 0.0001, "loss": 1.1715, "loss/crossentropy": 2.727576494216919, "loss/hidden": 0.96875, "loss/logits": 0.18328100442886353, "loss/reg": 0.0019467029487714171, "step": 2599 }, { "epoch": 0.325, "grad_norm": 2.705632448196411, "grad_norm_var": 0.0815449756671569, "learning_rate": 0.0001, "loss": 1.0767, "loss/crossentropy": 2.3500823974609375, "loss/hidden": 0.89453125, "loss/logits": 0.16269776225090027, "loss/reg": 0.0019462080672383308, "step": 2600 }, { "epoch": 0.325125, "grad_norm": 3.3013041019439697, "grad_norm_var": 0.12955290236167513, "learning_rate": 0.0001, "loss": 1.312, "loss/crossentropy": 2.919482946395874, "loss/hidden": 1.1171875, "loss/logits": 0.17538517713546753, "loss/reg": 0.00194516871124506, "step": 2601 }, { "epoch": 0.32525, "grad_norm": 2.343170642852783, "grad_norm_var": 0.13002631892181854, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.449092388153076, "loss/hidden": 0.8984375, "loss/logits": 0.15812505781650543, "loss/reg": 0.0019441096810624003, "step": 2602 }, { "epoch": 0.325375, "grad_norm": 2.8342862129211426, "grad_norm_var": 0.12301929446977056, "learning_rate": 0.0001, "loss": 1.0517, "loss/crossentropy": 2.70703387260437, "loss/hidden": 0.87109375, "loss/logits": 0.16117960214614868, "loss/reg": 0.0019430873217061162, "step": 2603 }, { "epoch": 0.3255, "grad_norm": 3.188351631164551, "grad_norm_var": 0.12303225072807038, "learning_rate": 0.0001, "loss": 1.4198, "loss/crossentropy": 2.2564218044281006, "loss/hidden": 1.171875, "loss/logits": 0.2284945398569107, "loss/reg": 0.0019422497134655714, "step": 2604 }, { "epoch": 0.325625, "grad_norm": 4.321959972381592, "grad_norm_var": 0.29285890140147874, "learning_rate": 0.0001, "loss": 1.2613, "loss/crossentropy": 2.313366651535034, "loss/hidden": 1.0390625, "loss/logits": 0.2027965784072876, "loss/reg": 0.0019414284033700824, "step": 2605 }, { "epoch": 0.32575, "grad_norm": 2.484724760055542, "grad_norm_var": 0.29304106202063146, "learning_rate": 0.0001, "loss": 1.2207, "loss/crossentropy": 2.301015853881836, "loss/hidden": 1.03125, "loss/logits": 0.17009034752845764, "loss/reg": 0.0019404172198846936, "step": 2606 }, { "epoch": 0.325875, "grad_norm": 4.044075012207031, "grad_norm_var": 0.39162765914044234, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.3623392581939697, "loss/hidden": 1.15625, "loss/logits": 0.14709988236427307, "loss/reg": 0.0019393644761294127, "step": 2607 }, { "epoch": 0.326, "grad_norm": 2.28312611579895, "grad_norm_var": 0.3872500333513159, "learning_rate": 0.0001, "loss": 1.1442, "loss/crossentropy": 2.530644178390503, "loss/hidden": 0.95703125, "loss/logits": 0.16783158481121063, "loss/reg": 0.0019383745966479182, "step": 2608 }, { "epoch": 0.326125, "grad_norm": 2.490402936935425, "grad_norm_var": 0.39374385885138563, "learning_rate": 0.0001, "loss": 1.0274, "loss/crossentropy": 2.6754276752471924, "loss/hidden": 0.86328125, "loss/logits": 0.1447535753250122, "loss/reg": 0.0019373211544007063, "step": 2609 }, { "epoch": 0.32625, "grad_norm": 1.843550443649292, "grad_norm_var": 0.4505958548700162, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.4644904136657715, "loss/hidden": 0.83984375, "loss/logits": 0.13365285098552704, "loss/reg": 0.0019362904131412506, "step": 2610 }, { "epoch": 0.326375, "grad_norm": 2.4763755798339844, "grad_norm_var": 0.454367398867839, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.4957921504974365, "loss/hidden": 0.9296875, "loss/logits": 0.14192816615104675, "loss/reg": 0.0019351824885234237, "step": 2611 }, { "epoch": 0.3265, "grad_norm": 1.973061203956604, "grad_norm_var": 0.47477274674504877, "learning_rate": 0.0001, "loss": 1.15, "loss/crossentropy": 2.5857620239257812, "loss/hidden": 0.9609375, "loss/logits": 0.16968974471092224, "loss/reg": 0.0019341424340382218, "step": 2612 }, { "epoch": 0.326625, "grad_norm": 2.3026509284973145, "grad_norm_var": 0.46142054733400684, "learning_rate": 0.0001, "loss": 1.0571, "loss/crossentropy": 2.44352650642395, "loss/hidden": 0.890625, "loss/logits": 0.1471881866455078, "loss/reg": 0.0019331028452143073, "step": 2613 }, { "epoch": 0.32675, "grad_norm": 1.9800761938095093, "grad_norm_var": 0.4975958103437288, "learning_rate": 0.0001, "loss": 0.9069, "loss/crossentropy": 2.072399616241455, "loss/hidden": 0.78125, "loss/logits": 0.10636016726493835, "loss/reg": 0.001932067214511335, "step": 2614 }, { "epoch": 0.326875, "grad_norm": 1.8899251222610474, "grad_norm_var": 0.5371446049294708, "learning_rate": 0.0001, "loss": 0.9566, "loss/crossentropy": 2.7395026683807373, "loss/hidden": 0.80859375, "loss/logits": 0.12866735458374023, "loss/reg": 0.0019310922361910343, "step": 2615 }, { "epoch": 0.327, "grad_norm": 2.284410238265991, "grad_norm_var": 0.5453293761821607, "learning_rate": 0.0001, "loss": 1.3133, "loss/crossentropy": 2.354835033416748, "loss/hidden": 1.1015625, "loss/logits": 0.19239306449890137, "loss/reg": 0.0019301031716167927, "step": 2616 }, { "epoch": 0.327125, "grad_norm": 3.2513251304626465, "grad_norm_var": 0.5409959610181516, "learning_rate": 0.0001, "loss": 1.4765, "loss/crossentropy": 2.1283581256866455, "loss/hidden": 1.21875, "loss/logits": 0.23847420513629913, "loss/reg": 0.001929031335748732, "step": 2617 }, { "epoch": 0.32725, "grad_norm": 2.7074813842773438, "grad_norm_var": 0.5356272008173538, "learning_rate": 0.0001, "loss": 0.9698, "loss/crossentropy": 2.9765610694885254, "loss/hidden": 0.828125, "loss/logits": 0.1223883181810379, "loss/reg": 0.0019280307460576296, "step": 2618 }, { "epoch": 0.327375, "grad_norm": 2.5938141345977783, "grad_norm_var": 0.5332440103953814, "learning_rate": 0.0001, "loss": 1.0404, "loss/crossentropy": 3.0576584339141846, "loss/hidden": 0.86328125, "loss/logits": 0.15788918733596802, "loss/reg": 0.0019270256161689758, "step": 2619 }, { "epoch": 0.3275, "grad_norm": 70.91342163085938, "grad_norm_var": 292.22305505838057, "learning_rate": 0.0001, "loss": 2.2366, "loss/crossentropy": 2.4672908782958984, "loss/hidden": 1.9609375, "loss/logits": 0.2563789486885071, "loss/reg": 0.0019259831169620156, "step": 2620 }, { "epoch": 0.327625, "grad_norm": 2.696791648864746, "grad_norm_var": 292.9391825501781, "learning_rate": 0.0001, "loss": 1.176, "loss/crossentropy": 2.7580645084381104, "loss/hidden": 0.98828125, "loss/logits": 0.16842973232269287, "loss/reg": 0.001924974494613707, "step": 2621 }, { "epoch": 0.32775, "grad_norm": 3.048142910003662, "grad_norm_var": 292.6375942978767, "learning_rate": 0.0001, "loss": 1.0943, "loss/crossentropy": 2.489809513092041, "loss/hidden": 0.90234375, "loss/logits": 0.17267011106014252, "loss/reg": 0.0019239335088059306, "step": 2622 }, { "epoch": 0.327875, "grad_norm": 2.526303768157959, "grad_norm_var": 293.3390160223432, "learning_rate": 0.0001, "loss": 1.082, "loss/crossentropy": 2.645115613937378, "loss/hidden": 0.9140625, "loss/logits": 0.14874479174613953, "loss/reg": 0.00192292092833668, "step": 2623 }, { "epoch": 0.328, "grad_norm": 2.52156400680542, "grad_norm_var": 293.20202837258046, "learning_rate": 0.0001, "loss": 1.0225, "loss/crossentropy": 2.6276845932006836, "loss/hidden": 0.85546875, "loss/logits": 0.14785343408584595, "loss/reg": 0.0019219197565689683, "step": 2624 }, { "epoch": 0.328125, "grad_norm": 1.962568998336792, "grad_norm_var": 293.51702033438136, "learning_rate": 0.0001, "loss": 0.9859, "loss/crossentropy": 2.4731664657592773, "loss/hidden": 0.84375, "loss/logits": 0.12292357534170151, "loss/reg": 0.0019208821468055248, "step": 2625 }, { "epoch": 0.32825, "grad_norm": 2.7825257778167725, "grad_norm_var": 292.96590174162, "learning_rate": 0.0001, "loss": 1.3042, "loss/crossentropy": 2.2124195098876953, "loss/hidden": 1.09375, "loss/logits": 0.19123528897762299, "loss/reg": 0.0019198302179574966, "step": 2626 }, { "epoch": 0.328375, "grad_norm": 2.6599276065826416, "grad_norm_var": 292.86355345397703, "learning_rate": 0.0001, "loss": 0.9111, "loss/crossentropy": 2.810068368911743, "loss/hidden": 0.77734375, "loss/logits": 0.1145322173833847, "loss/reg": 0.001918860012665391, "step": 2627 }, { "epoch": 0.3285, "grad_norm": 4.585007667541504, "grad_norm_var": 291.6242846998923, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.708850860595703, "loss/hidden": 1.203125, "loss/logits": 0.265799343585968, "loss/reg": 0.001917839515954256, "step": 2628 }, { "epoch": 0.328625, "grad_norm": 2.5904152393341064, "grad_norm_var": 291.45233283865343, "learning_rate": 0.0001, "loss": 1.2829, "loss/crossentropy": 2.342621088027954, "loss/hidden": 1.09375, "loss/logits": 0.1699591875076294, "loss/reg": 0.001916878274641931, "step": 2629 }, { "epoch": 0.32875, "grad_norm": 2.4327213764190674, "grad_norm_var": 291.16596820865624, "learning_rate": 0.0001, "loss": 1.2132, "loss/crossentropy": 2.508516788482666, "loss/hidden": 1.0234375, "loss/logits": 0.17065170407295227, "loss/reg": 0.001915843109600246, "step": 2630 }, { "epoch": 0.328875, "grad_norm": 3.6639018058776855, "grad_norm_var": 290.1621528649164, "learning_rate": 0.0001, "loss": 1.1892, "loss/crossentropy": 2.4983201026916504, "loss/hidden": 0.95703125, "loss/logits": 0.21297143399715424, "loss/reg": 0.0019148311112076044, "step": 2631 }, { "epoch": 0.329, "grad_norm": 2.7388134002685547, "grad_norm_var": 289.8847331615837, "learning_rate": 0.0001, "loss": 1.1596, "loss/crossentropy": 2.1745028495788574, "loss/hidden": 0.9765625, "loss/logits": 0.16389799118041992, "loss/reg": 0.0019138100324198604, "step": 2632 }, { "epoch": 0.329125, "grad_norm": 2.2227349281311035, "grad_norm_var": 290.47932645048627, "learning_rate": 0.0001, "loss": 1.0711, "loss/crossentropy": 2.739015579223633, "loss/hidden": 0.8984375, "loss/logits": 0.15357255935668945, "loss/reg": 0.0019128394778817892, "step": 2633 }, { "epoch": 0.32925, "grad_norm": 2.0811848640441895, "grad_norm_var": 290.86566611251646, "learning_rate": 0.0001, "loss": 0.9556, "loss/crossentropy": 2.6461029052734375, "loss/hidden": 0.80859375, "loss/logits": 0.12793703377246857, "loss/reg": 0.0019118597265332937, "step": 2634 }, { "epoch": 0.329375, "grad_norm": 2.116468906402588, "grad_norm_var": 291.1604224264861, "learning_rate": 0.0001, "loss": 1.2069, "loss/crossentropy": 2.62786602973938, "loss/hidden": 1.0, "loss/logits": 0.18774330615997314, "loss/reg": 0.0019108138512820005, "step": 2635 }, { "epoch": 0.3295, "grad_norm": 2.484828233718872, "grad_norm_var": 0.4199877560997796, "learning_rate": 0.0001, "loss": 1.2557, "loss/crossentropy": 2.1726491451263428, "loss/hidden": 1.0625, "loss/logits": 0.1741446554660797, "loss/reg": 0.0019097549375146627, "step": 2636 }, { "epoch": 0.329625, "grad_norm": 2.404581069946289, "grad_norm_var": 0.42523978856895234, "learning_rate": 0.0001, "loss": 1.0538, "loss/crossentropy": 2.691166877746582, "loss/hidden": 0.875, "loss/logits": 0.15975207090377808, "loss/reg": 0.0019086537649855018, "step": 2637 }, { "epoch": 0.32975, "grad_norm": 2.258976936340332, "grad_norm_var": 0.42504347565320494, "learning_rate": 0.0001, "loss": 1.051, "loss/crossentropy": 2.5933079719543457, "loss/hidden": 0.89453125, "loss/logits": 0.1373998373746872, "loss/reg": 0.0019075415330007672, "step": 2638 }, { "epoch": 0.329875, "grad_norm": 2.1695797443389893, "grad_norm_var": 0.437787722246537, "learning_rate": 0.0001, "loss": 0.9536, "loss/crossentropy": 2.6355223655700684, "loss/hidden": 0.8125, "loss/logits": 0.12198765575885773, "loss/reg": 0.0019065127708017826, "step": 2639 }, { "epoch": 0.33, "grad_norm": 2.090467929840088, "grad_norm_var": 0.45418373237175136, "learning_rate": 0.0001, "loss": 0.991, "loss/crossentropy": 2.470792531967163, "loss/hidden": 0.84375, "loss/logits": 0.12822991609573364, "loss/reg": 0.0019054242875427008, "step": 2640 }, { "epoch": 0.330125, "grad_norm": 2.0042479038238525, "grad_norm_var": 0.45087338227463625, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.563836097717285, "loss/hidden": 0.85546875, "loss/logits": 0.13884064555168152, "loss/reg": 0.0019043214851990342, "step": 2641 }, { "epoch": 0.33025, "grad_norm": 2.534039258956909, "grad_norm_var": 0.44803570730574976, "learning_rate": 0.0001, "loss": 1.2897, "loss/crossentropy": 2.4159111976623535, "loss/hidden": 1.0625, "loss/logits": 0.20816226303577423, "loss/reg": 0.0019032945856451988, "step": 2642 }, { "epoch": 0.330375, "grad_norm": 2.2325968742370605, "grad_norm_var": 0.45403270890063824, "learning_rate": 0.0001, "loss": 0.9832, "loss/crossentropy": 2.741281270980835, "loss/hidden": 0.8359375, "loss/logits": 0.1282157003879547, "loss/reg": 0.001902307034470141, "step": 2643 }, { "epoch": 0.3305, "grad_norm": 2.2622833251953125, "grad_norm_var": 0.15732163055984075, "learning_rate": 0.0001, "loss": 1.2005, "loss/crossentropy": 2.51218581199646, "loss/hidden": 1.0, "loss/logits": 0.18151551485061646, "loss/reg": 0.00190132821444422, "step": 2644 }, { "epoch": 0.330625, "grad_norm": 1.8014144897460938, "grad_norm_var": 0.17546012389283092, "learning_rate": 0.0001, "loss": 0.9646, "loss/crossentropy": 2.523585796356201, "loss/hidden": 0.796875, "loss/logits": 0.1487436294555664, "loss/reg": 0.0019003006163984537, "step": 2645 }, { "epoch": 0.33075, "grad_norm": 2.1793863773345947, "grad_norm_var": 0.17646356591704163, "learning_rate": 0.0001, "loss": 1.0278, "loss/crossentropy": 2.7488279342651367, "loss/hidden": 0.875, "loss/logits": 0.1337858885526657, "loss/reg": 0.001899329130537808, "step": 2646 }, { "epoch": 0.330875, "grad_norm": 1.9751060009002686, "grad_norm_var": 0.053871706804030355, "learning_rate": 0.0001, "loss": 1.0443, "loss/crossentropy": 2.5321316719055176, "loss/hidden": 0.890625, "loss/logits": 0.1346917301416397, "loss/reg": 0.0018984059570357203, "step": 2647 }, { "epoch": 0.331, "grad_norm": 2.362856864929199, "grad_norm_var": 0.036813837754827244, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.2530457973480225, "loss/hidden": 1.0078125, "loss/logits": 0.17189380526542664, "loss/reg": 0.0018974954728037119, "step": 2648 }, { "epoch": 0.331125, "grad_norm": 2.2598862648010254, "grad_norm_var": 0.037018677893221746, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.5918681621551514, "loss/hidden": 0.953125, "loss/logits": 0.15878039598464966, "loss/reg": 0.0018965794006362557, "step": 2649 }, { "epoch": 0.33125, "grad_norm": 2.2799508571624756, "grad_norm_var": 0.03630941081203384, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.3382413387298584, "loss/hidden": 1.0625, "loss/logits": 0.19120196998119354, "loss/reg": 0.0018956776475533843, "step": 2650 }, { "epoch": 0.331375, "grad_norm": 2.129265546798706, "grad_norm_var": 0.03615401761054026, "learning_rate": 0.0001, "loss": 1.0881, "loss/crossentropy": 2.4861137866973877, "loss/hidden": 0.91796875, "loss/logits": 0.15119296312332153, "loss/reg": 0.0018946613417938352, "step": 2651 }, { "epoch": 0.3315, "grad_norm": 2.465085506439209, "grad_norm_var": 0.03546635972848335, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.61588978767395, "loss/hidden": 0.9375, "loss/logits": 0.17096257209777832, "loss/reg": 0.0018936903215944767, "step": 2652 }, { "epoch": 0.331625, "grad_norm": 2.7458391189575195, "grad_norm_var": 0.05145716457255366, "learning_rate": 0.0001, "loss": 1.0552, "loss/crossentropy": 2.6071133613586426, "loss/hidden": 0.88671875, "loss/logits": 0.14952749013900757, "loss/reg": 0.0018926567863672972, "step": 2653 }, { "epoch": 0.33175, "grad_norm": 2.4976582527160645, "grad_norm_var": 0.055798693889490165, "learning_rate": 0.0001, "loss": 0.9894, "loss/crossentropy": 2.690462589263916, "loss/hidden": 0.8359375, "loss/logits": 0.13450011610984802, "loss/reg": 0.0018916286062449217, "step": 2654 }, { "epoch": 0.331875, "grad_norm": 3.005953788757324, "grad_norm_var": 0.0906226391588234, "learning_rate": 0.0001, "loss": 1.2367, "loss/crossentropy": 2.5991427898406982, "loss/hidden": 1.0234375, "loss/logits": 0.19438031315803528, "loss/reg": 0.0018905544420704246, "step": 2655 }, { "epoch": 0.332, "grad_norm": 2.4303152561187744, "grad_norm_var": 0.08827288791378933, "learning_rate": 0.0001, "loss": 1.0758, "loss/crossentropy": 2.6814777851104736, "loss/hidden": 0.89453125, "loss/logits": 0.16241493821144104, "loss/reg": 0.0018895355751737952, "step": 2656 }, { "epoch": 0.332125, "grad_norm": 2.3696177005767822, "grad_norm_var": 0.08109445000954238, "learning_rate": 0.0001, "loss": 1.0103, "loss/crossentropy": 2.1729187965393066, "loss/hidden": 0.859375, "loss/logits": 0.13203448057174683, "loss/reg": 0.00188846280798316, "step": 2657 }, { "epoch": 0.33225, "grad_norm": 2.690157413482666, "grad_norm_var": 0.0865381063830597, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.382683753967285, "loss/hidden": 0.98828125, "loss/logits": 0.1722523421049118, "loss/reg": 0.001887432998046279, "step": 2658 }, { "epoch": 0.332375, "grad_norm": 2.3377158641815186, "grad_norm_var": 0.08550668653987045, "learning_rate": 0.0001, "loss": 1.0269, "loss/crossentropy": 2.354161262512207, "loss/hidden": 0.8671875, "loss/logits": 0.14088031649589539, "loss/reg": 0.0018864229787141085, "step": 2659 }, { "epoch": 0.3325, "grad_norm": 2.344102621078491, "grad_norm_var": 0.08483691739346852, "learning_rate": 0.0001, "loss": 0.9614, "loss/crossentropy": 2.60709547996521, "loss/hidden": 0.8203125, "loss/logits": 0.12225183844566345, "loss/reg": 0.0018853791989386082, "step": 2660 }, { "epoch": 0.332625, "grad_norm": 2.5030486583709717, "grad_norm_var": 0.06268034044456741, "learning_rate": 0.0001, "loss": 1.1442, "loss/crossentropy": 2.5318028926849365, "loss/hidden": 0.95703125, "loss/logits": 0.16829800605773926, "loss/reg": 0.001884306431747973, "step": 2661 }, { "epoch": 0.33275, "grad_norm": 2.1227612495422363, "grad_norm_var": 0.06462940212313602, "learning_rate": 0.0001, "loss": 1.0323, "loss/crossentropy": 2.415374755859375, "loss/hidden": 0.87109375, "loss/logits": 0.14235788583755493, "loss/reg": 0.0018832576461136341, "step": 2662 }, { "epoch": 0.332875, "grad_norm": 2.2799723148345947, "grad_norm_var": 0.052863778793785815, "learning_rate": 0.0001, "loss": 0.8902, "loss/crossentropy": 2.6499576568603516, "loss/hidden": 0.75, "loss/logits": 0.1213902086019516, "loss/reg": 0.0018821622943505645, "step": 2663 }, { "epoch": 0.333, "grad_norm": 7.191273212432861, "grad_norm_var": 1.468983779343973, "learning_rate": 0.0001, "loss": 1.5584, "loss/crossentropy": 2.713200569152832, "loss/hidden": 1.2734375, "loss/logits": 0.26614871621131897, "loss/reg": 0.00188113609328866, "step": 2664 }, { "epoch": 0.333125, "grad_norm": 2.525273084640503, "grad_norm_var": 1.4568113213542326, "learning_rate": 0.0001, "loss": 1.0781, "loss/crossentropy": 2.497141122817993, "loss/hidden": 0.890625, "loss/logits": 0.1687030792236328, "loss/reg": 0.0018800360849127173, "step": 2665 }, { "epoch": 0.33325, "grad_norm": 2.6631534099578857, "grad_norm_var": 1.4422344316903526, "learning_rate": 0.0001, "loss": 1.3699, "loss/crossentropy": 2.4155497550964355, "loss/hidden": 1.125, "loss/logits": 0.22608187794685364, "loss/reg": 0.0018789093010127544, "step": 2666 }, { "epoch": 0.333375, "grad_norm": 2.211228132247925, "grad_norm_var": 1.4356649768828937, "learning_rate": 0.0001, "loss": 1.0598, "loss/crossentropy": 2.830287218093872, "loss/hidden": 0.8828125, "loss/logits": 0.15816667675971985, "loss/reg": 0.0018778806552290916, "step": 2667 }, { "epoch": 0.3335, "grad_norm": 2.8867194652557373, "grad_norm_var": 1.4294123814231512, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.463146209716797, "loss/hidden": 1.09375, "loss/logits": 0.20895789563655853, "loss/reg": 0.001876741531305015, "step": 2668 }, { "epoch": 0.333625, "grad_norm": 2.1876380443573, "grad_norm_var": 1.4529399599044663, "learning_rate": 0.0001, "loss": 1.1608, "loss/crossentropy": 2.2767724990844727, "loss/hidden": 0.98828125, "loss/logits": 0.15372249484062195, "loss/reg": 0.0018756336066871881, "step": 2669 }, { "epoch": 0.33375, "grad_norm": 2.315303325653076, "grad_norm_var": 1.4615284490095743, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.1595675945281982, "loss/hidden": 1.0078125, "loss/logits": 0.1707768440246582, "loss/reg": 0.0018744649132713675, "step": 2670 }, { "epoch": 0.333875, "grad_norm": 2.092723846435547, "grad_norm_var": 1.482975635094027, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.5402543544769287, "loss/hidden": 0.9453125, "loss/logits": 0.151841938495636, "loss/reg": 0.0018734410405158997, "step": 2671 }, { "epoch": 0.334, "grad_norm": 2.7746829986572266, "grad_norm_var": 1.478145299448578, "learning_rate": 0.0001, "loss": 1.2631, "loss/crossentropy": 2.16640043258667, "loss/hidden": 1.0546875, "loss/logits": 0.18966086208820343, "loss/reg": 0.0018723169341683388, "step": 2672 }, { "epoch": 0.334125, "grad_norm": 2.1127467155456543, "grad_norm_var": 1.4942169046707474, "learning_rate": 0.0001, "loss": 0.9903, "loss/crossentropy": 2.8081676959991455, "loss/hidden": 0.82421875, "loss/logits": 0.14737804234027863, "loss/reg": 0.0018712611636146903, "step": 2673 }, { "epoch": 0.33425, "grad_norm": 2.4230544567108154, "grad_norm_var": 1.4991121315361928, "learning_rate": 0.0001, "loss": 1.2295, "loss/crossentropy": 2.534193277359009, "loss/hidden": 1.03125, "loss/logits": 0.179592102766037, "loss/reg": 0.0018702613888308406, "step": 2674 }, { "epoch": 0.334375, "grad_norm": 1.7492200136184692, "grad_norm_var": 1.5480635226599677, "learning_rate": 0.0001, "loss": 0.9691, "loss/crossentropy": 2.468768835067749, "loss/hidden": 0.80859375, "loss/logits": 0.1418367475271225, "loss/reg": 0.0018693569581955671, "step": 2675 }, { "epoch": 0.3345, "grad_norm": 4.746180057525635, "grad_norm_var": 1.8110573961123264, "learning_rate": 0.0001, "loss": 1.3411, "loss/crossentropy": 2.5581769943237305, "loss/hidden": 1.109375, "loss/logits": 0.21305061876773834, "loss/reg": 0.0018682489171624184, "step": 2676 }, { "epoch": 0.334625, "grad_norm": 2.7722597122192383, "grad_norm_var": 1.8049617454579623, "learning_rate": 0.0001, "loss": 0.9742, "loss/crossentropy": 2.7115790843963623, "loss/hidden": 0.82421875, "loss/logits": 0.13126155734062195, "loss/reg": 0.0018671368015930057, "step": 2677 }, { "epoch": 0.33475, "grad_norm": 2.397779941558838, "grad_norm_var": 1.784272616704823, "learning_rate": 0.0001, "loss": 1.1656, "loss/crossentropy": 2.572451591491699, "loss/hidden": 0.953125, "loss/logits": 0.1937890648841858, "loss/reg": 0.0018660185160115361, "step": 2678 }, { "epoch": 0.334875, "grad_norm": 2.3001415729522705, "grad_norm_var": 1.7828106173966882, "learning_rate": 0.0001, "loss": 1.0428, "loss/crossentropy": 2.4587132930755615, "loss/hidden": 0.87890625, "loss/logits": 0.14525815844535828, "loss/reg": 0.0018651454010978341, "step": 2679 }, { "epoch": 0.335, "grad_norm": 2.3542325496673584, "grad_norm_var": 0.4351631843757099, "learning_rate": 0.0001, "loss": 1.0346, "loss/crossentropy": 2.6474688053131104, "loss/hidden": 0.8671875, "loss/logits": 0.14879170060157776, "loss/reg": 0.0018642585491761565, "step": 2680 }, { "epoch": 0.335125, "grad_norm": 2.582580804824829, "grad_norm_var": 0.4353168836471591, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.1883020401000977, "loss/hidden": 1.015625, "loss/logits": 0.1768563985824585, "loss/reg": 0.001863238401710987, "step": 2681 }, { "epoch": 0.33525, "grad_norm": 2.1748592853546143, "grad_norm_var": 0.4419145365363585, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.6145453453063965, "loss/hidden": 0.89453125, "loss/logits": 0.15465855598449707, "loss/reg": 0.001862386125139892, "step": 2682 }, { "epoch": 0.335375, "grad_norm": 2.200927972793579, "grad_norm_var": 0.4423247362667808, "learning_rate": 0.0001, "loss": 1.1016, "loss/crossentropy": 2.590766668319702, "loss/hidden": 0.91796875, "loss/logits": 0.1650109589099884, "loss/reg": 0.001861374475993216, "step": 2683 }, { "epoch": 0.3355, "grad_norm": 2.102149248123169, "grad_norm_var": 0.44080669716295556, "learning_rate": 0.0001, "loss": 1.0288, "loss/crossentropy": 2.5048580169677734, "loss/hidden": 0.8671875, "loss/logits": 0.14305247366428375, "loss/reg": 0.0018604644574224949, "step": 2684 }, { "epoch": 0.335625, "grad_norm": 1.945141077041626, "grad_norm_var": 0.45313968692240386, "learning_rate": 0.0001, "loss": 1.0379, "loss/crossentropy": 2.517838954925537, "loss/hidden": 0.87109375, "loss/logits": 0.14823828637599945, "loss/reg": 0.0018594445427879691, "step": 2685 }, { "epoch": 0.33575, "grad_norm": 2.2177393436431885, "grad_norm_var": 0.4553599669113528, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.3068134784698486, "loss/hidden": 0.90234375, "loss/logits": 0.166079580783844, "loss/reg": 0.0018585223006084561, "step": 2686 }, { "epoch": 0.335875, "grad_norm": 2.09899640083313, "grad_norm_var": 0.455076876396211, "learning_rate": 0.0001, "loss": 1.1671, "loss/crossentropy": 2.3346714973449707, "loss/hidden": 0.97265625, "loss/logits": 0.1759161651134491, "loss/reg": 0.0018576070433482528, "step": 2687 }, { "epoch": 0.336, "grad_norm": 2.665550470352173, "grad_norm_var": 0.4508718710630892, "learning_rate": 0.0001, "loss": 1.055, "loss/crossentropy": 2.5305895805358887, "loss/hidden": 0.88671875, "loss/logits": 0.14973413944244385, "loss/reg": 0.0018566849175840616, "step": 2688 }, { "epoch": 0.336125, "grad_norm": 2.3301820755004883, "grad_norm_var": 0.44469517215555826, "learning_rate": 0.0001, "loss": 1.0943, "loss/crossentropy": 2.364471912384033, "loss/hidden": 0.92578125, "loss/logits": 0.1499364674091339, "loss/reg": 0.0018556717550382018, "step": 2689 }, { "epoch": 0.33625, "grad_norm": 3.3963143825531006, "grad_norm_var": 0.5015280834703465, "learning_rate": 0.0001, "loss": 1.1053, "loss/crossentropy": 2.3103365898132324, "loss/hidden": 0.9375, "loss/logits": 0.14927977323532104, "loss/reg": 0.0018547462532296777, "step": 2690 }, { "epoch": 0.336375, "grad_norm": 2.1051340103149414, "grad_norm_var": 0.473715244361483, "learning_rate": 0.0001, "loss": 0.971, "loss/crossentropy": 2.3400321006774902, "loss/hidden": 0.82421875, "loss/logits": 0.12821033596992493, "loss/reg": 0.001853820402175188, "step": 2691 }, { "epoch": 0.3365, "grad_norm": 37.967716217041016, "grad_norm_var": 79.29464280786549, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.231086492538452, "loss/hidden": 1.1328125, "loss/logits": 0.16302363574504852, "loss/reg": 0.001852900953963399, "step": 2692 }, { "epoch": 0.336625, "grad_norm": 2.417515516281128, "grad_norm_var": 79.38899332685124, "learning_rate": 0.0001, "loss": 1.2026, "loss/crossentropy": 2.1881463527679443, "loss/hidden": 0.99609375, "loss/logits": 0.1879623383283615, "loss/reg": 0.0018519519362598658, "step": 2693 }, { "epoch": 0.33675, "grad_norm": 2.122138738632202, "grad_norm_var": 79.4738903368775, "learning_rate": 0.0001, "loss": 1.0751, "loss/crossentropy": 2.732403039932251, "loss/hidden": 0.90234375, "loss/logits": 0.15429003536701202, "loss/reg": 0.001850950182415545, "step": 2694 }, { "epoch": 0.336875, "grad_norm": 2.427495241165161, "grad_norm_var": 79.43650789415395, "learning_rate": 0.0001, "loss": 1.1607, "loss/crossentropy": 2.749450445175171, "loss/hidden": 0.9765625, "loss/logits": 0.16566693782806396, "loss/reg": 0.0018500311998650432, "step": 2695 }, { "epoch": 0.337, "grad_norm": 2.026479721069336, "grad_norm_var": 79.54002070096992, "learning_rate": 0.0001, "loss": 1.035, "loss/crossentropy": 2.5245707035064697, "loss/hidden": 0.859375, "loss/logits": 0.15715818107128143, "loss/reg": 0.001849125954322517, "step": 2696 }, { "epoch": 0.337125, "grad_norm": 2.3066904544830322, "grad_norm_var": 79.61710631620817, "learning_rate": 0.0001, "loss": 1.0098, "loss/crossentropy": 2.422834873199463, "loss/hidden": 0.8515625, "loss/logits": 0.1397317349910736, "loss/reg": 0.001848128973506391, "step": 2697 }, { "epoch": 0.33725, "grad_norm": 24.429044723510742, "grad_norm_var": 103.57728077043416, "learning_rate": 0.0001, "loss": 0.9562, "loss/crossentropy": 2.4472463130950928, "loss/hidden": 0.80859375, "loss/logits": 0.12917684018611908, "loss/reg": 0.0018471195362508297, "step": 2698 }, { "epoch": 0.337375, "grad_norm": 2.4672513008117676, "grad_norm_var": 103.44956332547352, "learning_rate": 0.0001, "loss": 1.1181, "loss/crossentropy": 2.385657787322998, "loss/hidden": 0.9375, "loss/logits": 0.16217483580112457, "loss/reg": 0.001846141880378127, "step": 2699 }, { "epoch": 0.3375, "grad_norm": 2.169530153274536, "grad_norm_var": 103.41537549279397, "learning_rate": 0.0001, "loss": 1.0424, "loss/crossentropy": 2.466364622116089, "loss/hidden": 0.88671875, "loss/logits": 0.13720174133777618, "loss/reg": 0.0018451146315783262, "step": 2700 }, { "epoch": 0.337625, "grad_norm": 2.4193644523620605, "grad_norm_var": 103.17662778476077, "learning_rate": 0.0001, "loss": 1.1625, "loss/crossentropy": 2.966073513031006, "loss/hidden": 0.9765625, "loss/logits": 0.16748575866222382, "loss/reg": 0.0018441393040120602, "step": 2701 }, { "epoch": 0.33775, "grad_norm": 2.762655735015869, "grad_norm_var": 102.92234963115122, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.6424593925476074, "loss/hidden": 1.03125, "loss/logits": 0.19706225395202637, "loss/reg": 0.0018430985510349274, "step": 2702 }, { "epoch": 0.337875, "grad_norm": 3.081111192703247, "grad_norm_var": 102.47088573275903, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 2.973453998565674, "loss/hidden": 1.0546875, "loss/logits": 0.21340015530586243, "loss/reg": 0.0018420425476506352, "step": 2703 }, { "epoch": 0.338, "grad_norm": 2.039189338684082, "grad_norm_var": 102.77959342946441, "learning_rate": 0.0001, "loss": 0.9704, "loss/crossentropy": 2.543024778366089, "loss/hidden": 0.81640625, "loss/logits": 0.13555113971233368, "loss/reg": 0.001841049175709486, "step": 2704 }, { "epoch": 0.338125, "grad_norm": 94.04315948486328, "grad_norm_var": 583.250454169785, "learning_rate": 0.0001, "loss": 1.2311, "loss/crossentropy": 2.6103267669677734, "loss/hidden": 1.0546875, "loss/logits": 0.15800327062606812, "loss/reg": 0.0018400507979094982, "step": 2705 }, { "epoch": 0.33825, "grad_norm": 2.882803201675415, "grad_norm_var": 583.8396701404594, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.5338845252990723, "loss/hidden": 0.8671875, "loss/logits": 0.14534808695316315, "loss/reg": 0.0018390343757346272, "step": 2706 }, { "epoch": 0.338375, "grad_norm": 2.3847930431365967, "grad_norm_var": 583.4856970971107, "learning_rate": 0.0001, "loss": 1.1466, "loss/crossentropy": 2.5620946884155273, "loss/hidden": 0.96484375, "loss/logits": 0.16337844729423523, "loss/reg": 0.001838015508837998, "step": 2707 }, { "epoch": 0.3385, "grad_norm": 1.991986870765686, "grad_norm_var": 538.6004132313805, "learning_rate": 0.0001, "loss": 0.9835, "loss/crossentropy": 2.481635332107544, "loss/hidden": 0.82421875, "loss/logits": 0.14090412855148315, "loss/reg": 0.0018369447207078338, "step": 2708 }, { "epoch": 0.338625, "grad_norm": 2.2890617847442627, "grad_norm_var": 538.7227165596022, "learning_rate": 0.0001, "loss": 1.1003, "loss/crossentropy": 2.650740146636963, "loss/hidden": 0.91796875, "loss/logits": 0.16396166384220123, "loss/reg": 0.0018359466921538115, "step": 2709 }, { "epoch": 0.33875, "grad_norm": 2.0475618839263916, "grad_norm_var": 538.7963288012418, "learning_rate": 0.0001, "loss": 1.1912, "loss/crossentropy": 2.339217185974121, "loss/hidden": 1.0078125, "loss/logits": 0.16504302620887756, "loss/reg": 0.0018349699676036835, "step": 2710 }, { "epoch": 0.338875, "grad_norm": 2.890852928161621, "grad_norm_var": 538.3736961153293, "learning_rate": 0.0001, "loss": 1.3978, "loss/crossentropy": 2.5619242191314697, "loss/hidden": 1.15625, "loss/logits": 0.2232348620891571, "loss/reg": 0.001833954593166709, "step": 2711 }, { "epoch": 0.339, "grad_norm": 3.633685350418091, "grad_norm_var": 536.9305082511938, "learning_rate": 0.0001, "loss": 1.4332, "loss/crossentropy": 2.0454859733581543, "loss/hidden": 1.25, "loss/logits": 0.16486121714115143, "loss/reg": 0.0018328900914639235, "step": 2712 }, { "epoch": 0.339125, "grad_norm": 2.6921961307525635, "grad_norm_var": 536.5641480652256, "learning_rate": 0.0001, "loss": 1.1218, "loss/crossentropy": 2.6788878440856934, "loss/hidden": 0.953125, "loss/logits": 0.1503453403711319, "loss/reg": 0.001831823610700667, "step": 2713 }, { "epoch": 0.33925, "grad_norm": 2.6037473678588867, "grad_norm_var": 523.2960518523861, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.392246723175049, "loss/hidden": 0.88671875, "loss/logits": 0.14712922275066376, "loss/reg": 0.0018308209255337715, "step": 2714 }, { "epoch": 0.339375, "grad_norm": 2.685211420059204, "grad_norm_var": 523.1302419092834, "learning_rate": 0.0001, "loss": 1.2805, "loss/crossentropy": 2.1578640937805176, "loss/hidden": 1.09375, "loss/logits": 0.16845953464508057, "loss/reg": 0.0018298204522579908, "step": 2715 }, { "epoch": 0.3395, "grad_norm": 2.164790630340576, "grad_norm_var": 523.1341101489033, "learning_rate": 0.0001, "loss": 1.1006, "loss/crossentropy": 2.0535874366760254, "loss/hidden": 0.9375, "loss/logits": 0.14484111964702606, "loss/reg": 0.0018288521096110344, "step": 2716 }, { "epoch": 0.339625, "grad_norm": 2.4620444774627686, "grad_norm_var": 523.1008260461856, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 2.398467540740967, "loss/hidden": 0.890625, "loss/logits": 0.15694038569927216, "loss/reg": 0.0018278475617989898, "step": 2717 }, { "epoch": 0.33975, "grad_norm": 2.62349796295166, "grad_norm_var": 523.2046099617263, "learning_rate": 0.0001, "loss": 1.2082, "loss/crossentropy": 2.6654014587402344, "loss/hidden": 1.015625, "loss/logits": 0.17431676387786865, "loss/reg": 0.0018269274150952697, "step": 2718 }, { "epoch": 0.339875, "grad_norm": 2.4756715297698975, "grad_norm_var": 523.6473816674276, "learning_rate": 0.0001, "loss": 1.0077, "loss/crossentropy": 2.555826425552368, "loss/hidden": 0.85546875, "loss/logits": 0.13401949405670166, "loss/reg": 0.0018260165816172957, "step": 2719 }, { "epoch": 0.34, "grad_norm": 3.8888871669769287, "grad_norm_var": 522.3308516809772, "learning_rate": 0.0001, "loss": 1.5934, "loss/crossentropy": 1.4446630477905273, "loss/hidden": 1.359375, "loss/logits": 0.215741366147995, "loss/reg": 0.001825109589844942, "step": 2720 }, { "epoch": 0.340125, "grad_norm": 6.470000743865967, "grad_norm_var": 1.1742924017674103, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.287296772003174, "loss/hidden": 1.0859375, "loss/logits": 0.17229416966438293, "loss/reg": 0.0018241191282868385, "step": 2721 }, { "epoch": 0.34025, "grad_norm": 2.30989146232605, "grad_norm_var": 1.195102367072479, "learning_rate": 0.0001, "loss": 1.0072, "loss/crossentropy": 2.5110247135162354, "loss/hidden": 0.859375, "loss/logits": 0.1295817792415619, "loss/reg": 0.0018232045695185661, "step": 2722 }, { "epoch": 0.340375, "grad_norm": 2.239971399307251, "grad_norm_var": 1.2054128889630922, "learning_rate": 0.0001, "loss": 1.041, "loss/crossentropy": 2.501211166381836, "loss/hidden": 0.8671875, "loss/logits": 0.15554331243038177, "loss/reg": 0.0018223235383629799, "step": 2723 }, { "epoch": 0.3405, "grad_norm": 2.0520381927490234, "grad_norm_var": 1.1988338241303524, "learning_rate": 0.0001, "loss": 1.0928, "loss/crossentropy": 2.4717514514923096, "loss/hidden": 0.9296875, "loss/logits": 0.1448923945426941, "loss/reg": 0.0018213314469903708, "step": 2724 }, { "epoch": 0.340625, "grad_norm": 2.425610303878784, "grad_norm_var": 1.1898671289574774, "learning_rate": 0.0001, "loss": 1.0057, "loss/crossentropy": 2.582033634185791, "loss/hidden": 0.8671875, "loss/logits": 0.12033524364233017, "loss/reg": 0.001820415141992271, "step": 2725 }, { "epoch": 0.34075, "grad_norm": 2.4594147205352783, "grad_norm_var": 1.1561783476125167, "learning_rate": 0.0001, "loss": 1.1409, "loss/crossentropy": 2.4555699825286865, "loss/hidden": 0.95703125, "loss/logits": 0.1656779944896698, "loss/reg": 0.001819506287574768, "step": 2726 }, { "epoch": 0.340875, "grad_norm": 2.5513875484466553, "grad_norm_var": 1.1628823794799092, "learning_rate": 0.0001, "loss": 0.9692, "loss/crossentropy": 2.349595308303833, "loss/hidden": 0.83203125, "loss/logits": 0.11900325119495392, "loss/reg": 0.001818520831875503, "step": 2727 }, { "epoch": 0.341, "grad_norm": 2.160773277282715, "grad_norm_var": 1.1462620562110857, "learning_rate": 0.0001, "loss": 1.209, "loss/crossentropy": 2.3872406482696533, "loss/hidden": 1.015625, "loss/logits": 0.17516157031059265, "loss/reg": 0.0018175012664869428, "step": 2728 }, { "epoch": 0.341125, "grad_norm": 2.150336503982544, "grad_norm_var": 1.1699862214882226, "learning_rate": 0.0001, "loss": 1.0144, "loss/crossentropy": 2.1815345287323, "loss/hidden": 0.8671875, "loss/logits": 0.1290430724620819, "loss/reg": 0.0018164203502237797, "step": 2729 }, { "epoch": 0.34125, "grad_norm": 10.78381061553955, "grad_norm_var": 5.2114253749551445, "learning_rate": 0.0001, "loss": 1.3361, "loss/crossentropy": 2.532008171081543, "loss/hidden": 1.140625, "loss/logits": 0.17737087607383728, "loss/reg": 0.0018153302371501923, "step": 2730 }, { "epoch": 0.341375, "grad_norm": 2.75925874710083, "grad_norm_var": 5.206251564628699, "learning_rate": 0.0001, "loss": 1.1374, "loss/crossentropy": 2.4532768726348877, "loss/hidden": 0.94140625, "loss/logits": 0.17786462604999542, "loss/reg": 0.0018143365159630775, "step": 2731 }, { "epoch": 0.3415, "grad_norm": 2.558699369430542, "grad_norm_var": 5.159027094413, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.2733023166656494, "loss/hidden": 0.97265625, "loss/logits": 0.13374584913253784, "loss/reg": 0.0018132405821233988, "step": 2732 }, { "epoch": 0.341625, "grad_norm": 2.5739405155181885, "grad_norm_var": 5.147707540053412, "learning_rate": 0.0001, "loss": 1.1262, "loss/crossentropy": 2.544266939163208, "loss/hidden": 0.93359375, "loss/logits": 0.17448224127292633, "loss/reg": 0.0018122079782187939, "step": 2733 }, { "epoch": 0.34175, "grad_norm": 2.0026519298553467, "grad_norm_var": 5.226159548209328, "learning_rate": 0.0001, "loss": 1.0822, "loss/crossentropy": 2.433396816253662, "loss/hidden": 0.90625, "loss/logits": 0.15784336626529694, "loss/reg": 0.0018112336983904243, "step": 2734 }, { "epoch": 0.341875, "grad_norm": 2.01479434967041, "grad_norm_var": 5.286489056258906, "learning_rate": 0.0001, "loss": 1.0375, "loss/crossentropy": 2.6977927684783936, "loss/hidden": 0.875, "loss/logits": 0.14437653124332428, "loss/reg": 0.0018102286849170923, "step": 2735 }, { "epoch": 0.342, "grad_norm": 2.9384512901306152, "grad_norm_var": 5.2572436790201476, "learning_rate": 0.0001, "loss": 1.1931, "loss/crossentropy": 2.3533003330230713, "loss/hidden": 1.0078125, "loss/logits": 0.16717535257339478, "loss/reg": 0.0018091805977746844, "step": 2736 }, { "epoch": 0.342125, "grad_norm": 2.042508363723755, "grad_norm_var": 4.524390821705515, "learning_rate": 0.0001, "loss": 1.0206, "loss/crossentropy": 2.4242684841156006, "loss/hidden": 0.8671875, "loss/logits": 0.1352953016757965, "loss/reg": 0.0018081095768138766, "step": 2737 }, { "epoch": 0.34225, "grad_norm": 2.688789129257202, "grad_norm_var": 4.504740106660578, "learning_rate": 0.0001, "loss": 0.9199, "loss/crossentropy": 2.221768856048584, "loss/hidden": 0.78515625, "loss/logits": 0.11662349104881287, "loss/reg": 0.0018071271479129791, "step": 2738 }, { "epoch": 0.342375, "grad_norm": 4.383805274963379, "grad_norm_var": 4.603282506243825, "learning_rate": 0.0001, "loss": 1.2477, "loss/crossentropy": 2.616751194000244, "loss/hidden": 1.0234375, "loss/logits": 0.20622903108596802, "loss/reg": 0.0018060887232422829, "step": 2739 }, { "epoch": 0.3425, "grad_norm": 2.3086087703704834, "grad_norm_var": 4.573799596920866, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.658785820007324, "loss/hidden": 0.94921875, "loss/logits": 0.1744026243686676, "loss/reg": 0.0018050112994387746, "step": 2740 }, { "epoch": 0.342625, "grad_norm": 2.653839349746704, "grad_norm_var": 4.5580492099570655, "learning_rate": 0.0001, "loss": 1.116, "loss/crossentropy": 2.5258827209472656, "loss/hidden": 0.9375, "loss/logits": 0.16048413515090942, "loss/reg": 0.0018039275892078876, "step": 2741 }, { "epoch": 0.34275, "grad_norm": 2.994399309158325, "grad_norm_var": 4.532779882480922, "learning_rate": 0.0001, "loss": 1.0949, "loss/crossentropy": 2.611788511276245, "loss/hidden": 0.9296875, "loss/logits": 0.1471731811761856, "loss/reg": 0.001802846440114081, "step": 2742 }, { "epoch": 0.342875, "grad_norm": 2.4393908977508545, "grad_norm_var": 4.541724521540581, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.124354124069214, "loss/hidden": 0.953125, "loss/logits": 0.13034780323505402, "loss/reg": 0.0018018638947978616, "step": 2743 }, { "epoch": 0.343, "grad_norm": 4.066141128540039, "grad_norm_var": 4.532333814902241, "learning_rate": 0.0001, "loss": 1.3757, "loss/crossentropy": 2.419724225997925, "loss/hidden": 1.1328125, "loss/logits": 0.22492462396621704, "loss/reg": 0.0018008910119533539, "step": 2744 }, { "epoch": 0.343125, "grad_norm": 3.7051262855529785, "grad_norm_var": 4.463753098531913, "learning_rate": 0.0001, "loss": 1.4843, "loss/crossentropy": 2.3988916873931885, "loss/hidden": 1.2109375, "loss/logits": 0.25536584854125977, "loss/reg": 0.0017999615520238876, "step": 2745 }, { "epoch": 0.34325, "grad_norm": 2.850308418273926, "grad_norm_var": 0.4887056693696534, "learning_rate": 0.0001, "loss": 1.2108, "loss/crossentropy": 2.4959516525268555, "loss/hidden": 1.015625, "loss/logits": 0.1772080659866333, "loss/reg": 0.0017990324413403869, "step": 2746 }, { "epoch": 0.343375, "grad_norm": 2.268136501312256, "grad_norm_var": 0.5071881957031013, "learning_rate": 0.0001, "loss": 0.9835, "loss/crossentropy": 2.6413605213165283, "loss/hidden": 0.82421875, "loss/logits": 0.1413297951221466, "loss/reg": 0.0017980766715481877, "step": 2747 }, { "epoch": 0.3435, "grad_norm": 2.3138809204101562, "grad_norm_var": 0.5181775640931913, "learning_rate": 0.0001, "loss": 1.1187, "loss/crossentropy": 2.42869234085083, "loss/hidden": 0.92578125, "loss/logits": 0.1748996078968048, "loss/reg": 0.0017971734050661325, "step": 2748 }, { "epoch": 0.343625, "grad_norm": 2.5672531127929688, "grad_norm_var": 0.518350984033433, "learning_rate": 0.0001, "loss": 0.9826, "loss/crossentropy": 2.5995495319366455, "loss/hidden": 0.828125, "loss/logits": 0.13652917742729187, "loss/reg": 0.0017961938865482807, "step": 2749 }, { "epoch": 0.34375, "grad_norm": 1.9516596794128418, "grad_norm_var": 0.5236958625581072, "learning_rate": 0.0001, "loss": 0.9811, "loss/crossentropy": 2.435476064682007, "loss/hidden": 0.82421875, "loss/logits": 0.13894100487232208, "loss/reg": 0.0017952200723811984, "step": 2750 }, { "epoch": 0.343875, "grad_norm": 2.5574264526367188, "grad_norm_var": 0.488060116175734, "learning_rate": 0.0001, "loss": 1.0099, "loss/crossentropy": 2.4720406532287598, "loss/hidden": 0.86328125, "loss/logits": 0.12868744134902954, "loss/reg": 0.001794293406419456, "step": 2751 }, { "epoch": 0.344, "grad_norm": 2.925940752029419, "grad_norm_var": 0.4878316250921138, "learning_rate": 0.0001, "loss": 1.2293, "loss/crossentropy": 2.5146737098693848, "loss/hidden": 1.0546875, "loss/logits": 0.1567022204399109, "loss/reg": 0.001793318777345121, "step": 2752 }, { "epoch": 0.344125, "grad_norm": 2.994488000869751, "grad_norm_var": 0.448981072340289, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.4764902591705322, "loss/hidden": 1.109375, "loss/logits": 0.2715311348438263, "loss/reg": 0.0017923847772181034, "step": 2753 }, { "epoch": 0.34425, "grad_norm": 2.534334897994995, "grad_norm_var": 0.45388110019789163, "learning_rate": 0.0001, "loss": 1.4052, "loss/crossentropy": 2.1140565872192383, "loss/hidden": 1.1640625, "loss/logits": 0.2232171595096588, "loss/reg": 0.0017914106138050556, "step": 2754 }, { "epoch": 0.344375, "grad_norm": 2.1548988819122314, "grad_norm_var": 0.3069711549991249, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.432929277420044, "loss/hidden": 0.875, "loss/logits": 0.13744524121284485, "loss/reg": 0.0017905068816617131, "step": 2755 }, { "epoch": 0.3445, "grad_norm": 2.0614125728607178, "grad_norm_var": 0.32386714799604305, "learning_rate": 0.0001, "loss": 1.1116, "loss/crossentropy": 2.2116754055023193, "loss/hidden": 0.93359375, "loss/logits": 0.16013850271701813, "loss/reg": 0.0017896265489980578, "step": 2756 }, { "epoch": 0.344625, "grad_norm": 2.067782163619995, "grad_norm_var": 0.3481525590970089, "learning_rate": 0.0001, "loss": 0.9778, "loss/crossentropy": 2.635493755340576, "loss/hidden": 0.828125, "loss/logits": 0.13179558515548706, "loss/reg": 0.001788573688827455, "step": 2757 }, { "epoch": 0.34475, "grad_norm": 2.6904428005218506, "grad_norm_var": 0.34010243521213773, "learning_rate": 0.0001, "loss": 0.9779, "loss/crossentropy": 2.661101818084717, "loss/hidden": 0.81640625, "loss/logits": 0.1436275839805603, "loss/reg": 0.0017876180354505777, "step": 2758 }, { "epoch": 0.344875, "grad_norm": 2.0017945766448975, "grad_norm_var": 0.36344215103660343, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.2584800720214844, "loss/hidden": 0.98046875, "loss/logits": 0.16119754314422607, "loss/reg": 0.0017866643611341715, "step": 2759 }, { "epoch": 0.345, "grad_norm": 1.9678393602371216, "grad_norm_var": 0.2303754129150282, "learning_rate": 0.0001, "loss": 0.8987, "loss/crossentropy": 2.4431183338165283, "loss/hidden": 0.76171875, "loss/logits": 0.1191268265247345, "loss/reg": 0.0017856284976005554, "step": 2760 }, { "epoch": 0.345125, "grad_norm": 2.5889570713043213, "grad_norm_var": 0.1252878387671652, "learning_rate": 0.0001, "loss": 1.3742, "loss/crossentropy": 2.4502618312835693, "loss/hidden": 1.1484375, "loss/logits": 0.20786981284618378, "loss/reg": 0.001784571330063045, "step": 2761 }, { "epoch": 0.34525, "grad_norm": 2.4695801734924316, "grad_norm_var": 0.11179445953391909, "learning_rate": 0.0001, "loss": 1.0553, "loss/crossentropy": 2.3073744773864746, "loss/hidden": 0.8984375, "loss/logits": 0.1390579640865326, "loss/reg": 0.0017835937906056643, "step": 2762 }, { "epoch": 0.345375, "grad_norm": 1.9231826066970825, "grad_norm_var": 0.12447955864263918, "learning_rate": 0.0001, "loss": 0.941, "loss/crossentropy": 2.605283498764038, "loss/hidden": 0.796875, "loss/logits": 0.126291885972023, "loss/reg": 0.001782492734491825, "step": 2763 }, { "epoch": 0.3455, "grad_norm": 2.4828085899353027, "grad_norm_var": 0.12520901397492423, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.1376023292541504, "loss/hidden": 0.98828125, "loss/logits": 0.15587899088859558, "loss/reg": 0.0017814133316278458, "step": 2764 }, { "epoch": 0.345625, "grad_norm": 55.61252212524414, "grad_norm_var": 177.37410367481726, "learning_rate": 0.0001, "loss": 1.0375, "loss/crossentropy": 2.6359689235687256, "loss/hidden": 0.88671875, "loss/logits": 0.13298413157463074, "loss/reg": 0.0017803205410018563, "step": 2765 }, { "epoch": 0.34575, "grad_norm": 3.980548143386841, "grad_norm_var": 176.62101658809144, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.284987211227417, "loss/hidden": 1.046875, "loss/logits": 0.16191720962524414, "loss/reg": 0.0017792723374441266, "step": 2766 }, { "epoch": 0.345875, "grad_norm": 2.2724697589874268, "grad_norm_var": 176.74979875532844, "learning_rate": 0.0001, "loss": 1.0111, "loss/crossentropy": 2.55192232131958, "loss/hidden": 0.86328125, "loss/logits": 0.13006368279457092, "loss/reg": 0.0017781838541850448, "step": 2767 }, { "epoch": 0.346, "grad_norm": 2.5958292484283447, "grad_norm_var": 176.88291563243817, "learning_rate": 0.0001, "loss": 1.1517, "loss/crossentropy": 2.412102222442627, "loss/hidden": 0.97265625, "loss/logits": 0.16130727529525757, "loss/reg": 0.0017772013088688254, "step": 2768 }, { "epoch": 0.346125, "grad_norm": 2.206526279449463, "grad_norm_var": 177.2138385159957, "learning_rate": 0.0001, "loss": 0.9923, "loss/crossentropy": 2.647113561630249, "loss/hidden": 0.8359375, "loss/logits": 0.13856253027915955, "loss/reg": 0.0017761333147063851, "step": 2769 }, { "epoch": 0.34625, "grad_norm": 2.2694692611694336, "grad_norm_var": 177.33092692458712, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.5317695140838623, "loss/hidden": 1.046875, "loss/logits": 0.16870775818824768, "loss/reg": 0.001775167416781187, "step": 2770 }, { "epoch": 0.346375, "grad_norm": 2.1964190006256104, "grad_norm_var": 177.31135839554514, "learning_rate": 0.0001, "loss": 1.3711, "loss/crossentropy": 2.143099308013916, "loss/hidden": 1.140625, "loss/logits": 0.21277371048927307, "loss/reg": 0.0017742635682225227, "step": 2771 }, { "epoch": 0.3465, "grad_norm": 2.0696725845336914, "grad_norm_var": 177.3073424444675, "learning_rate": 0.0001, "loss": 0.9565, "loss/crossentropy": 2.39957594871521, "loss/hidden": 0.80859375, "loss/logits": 0.13020746409893036, "loss/reg": 0.0017732323613017797, "step": 2772 }, { "epoch": 0.346625, "grad_norm": 2.297759771347046, "grad_norm_var": 177.19889555092212, "learning_rate": 0.0001, "loss": 0.9855, "loss/crossentropy": 2.4976587295532227, "loss/hidden": 0.83984375, "loss/logits": 0.12795329093933105, "loss/reg": 0.001772188232280314, "step": 2773 }, { "epoch": 0.34675, "grad_norm": 2.0303704738616943, "grad_norm_var": 177.49333884867005, "learning_rate": 0.0001, "loss": 0.8719, "loss/crossentropy": 2.7801976203918457, "loss/hidden": 0.74609375, "loss/logits": 0.10805592685937881, "loss/reg": 0.0017711480613797903, "step": 2774 }, { "epoch": 0.346875, "grad_norm": 2.3654887676239014, "grad_norm_var": 177.32298046059097, "learning_rate": 0.0001, "loss": 1.2791, "loss/crossentropy": 2.3477678298950195, "loss/hidden": 1.0703125, "loss/logits": 0.19110971689224243, "loss/reg": 0.0017701969482004642, "step": 2775 }, { "epoch": 0.347, "grad_norm": 2.895468235015869, "grad_norm_var": 176.91415279483022, "learning_rate": 0.0001, "loss": 1.2583, "loss/crossentropy": 2.339061737060547, "loss/hidden": 1.078125, "loss/logits": 0.16245663166046143, "loss/reg": 0.0017692426918074489, "step": 2776 }, { "epoch": 0.347125, "grad_norm": 2.0600805282592773, "grad_norm_var": 177.1556745505549, "learning_rate": 0.0001, "loss": 1.064, "loss/crossentropy": 2.4432625770568848, "loss/hidden": 0.90625, "loss/logits": 0.1401069164276123, "loss/reg": 0.0017682092729955912, "step": 2777 }, { "epoch": 0.34725, "grad_norm": 2.0482537746429443, "grad_norm_var": 177.3500986394907, "learning_rate": 0.0001, "loss": 1.0569, "loss/crossentropy": 2.1330184936523438, "loss/hidden": 0.91015625, "loss/logits": 0.12904143333435059, "loss/reg": 0.00176724954508245, "step": 2778 }, { "epoch": 0.347375, "grad_norm": 2.1464202404022217, "grad_norm_var": 177.24059748238406, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.4473721981048584, "loss/hidden": 0.875, "loss/logits": 0.1706811785697937, "loss/reg": 0.001766227767802775, "step": 2779 }, { "epoch": 0.3475, "grad_norm": 2.0545661449432373, "grad_norm_var": 177.43693589339054, "learning_rate": 0.0001, "loss": 0.9905, "loss/crossentropy": 2.7199933528900146, "loss/hidden": 0.8203125, "loss/logits": 0.1525641679763794, "loss/reg": 0.0017651945818215609, "step": 2780 }, { "epoch": 0.347625, "grad_norm": 2.4466211795806885, "grad_norm_var": 0.2375432950402548, "learning_rate": 0.0001, "loss": 1.0543, "loss/crossentropy": 2.6157615184783936, "loss/hidden": 0.90234375, "loss/logits": 0.13435673713684082, "loss/reg": 0.0017641617450863123, "step": 2781 }, { "epoch": 0.34775, "grad_norm": 2.0770654678344727, "grad_norm_var": 0.05549600700931687, "learning_rate": 0.0001, "loss": 1.0857, "loss/crossentropy": 2.587388515472412, "loss/hidden": 0.9140625, "loss/logits": 0.1540127992630005, "loss/reg": 0.0017632590606808662, "step": 2782 }, { "epoch": 0.347875, "grad_norm": 2.538799285888672, "grad_norm_var": 0.060655047153679933, "learning_rate": 0.0001, "loss": 1.1021, "loss/crossentropy": 2.2909505367279053, "loss/hidden": 0.92578125, "loss/logits": 0.15872646868228912, "loss/reg": 0.0017624498577788472, "step": 2783 }, { "epoch": 0.348, "grad_norm": 1.6760848760604858, "grad_norm_var": 0.07340596205414483, "learning_rate": 0.0001, "loss": 0.905, "loss/crossentropy": 2.564978837966919, "loss/hidden": 0.76953125, "loss/logits": 0.11784428358078003, "loss/reg": 0.00176150631159544, "step": 2784 }, { "epoch": 0.348125, "grad_norm": 2.2577829360961914, "grad_norm_var": 0.07353828091030336, "learning_rate": 0.0001, "loss": 0.9301, "loss/crossentropy": 2.6113736629486084, "loss/hidden": 0.78515625, "loss/logits": 0.127354234457016, "loss/reg": 0.0017606550827622414, "step": 2785 }, { "epoch": 0.34825, "grad_norm": 5.318231582641602, "grad_norm_var": 0.6768604751325653, "learning_rate": 0.0001, "loss": 1.0903, "loss/crossentropy": 2.9984378814697266, "loss/hidden": 0.90625, "loss/logits": 0.16649848222732544, "loss/reg": 0.0017598549602553248, "step": 2786 }, { "epoch": 0.348375, "grad_norm": 2.11244535446167, "grad_norm_var": 0.6796359323279574, "learning_rate": 0.0001, "loss": 1.0467, "loss/crossentropy": 2.564290761947632, "loss/hidden": 0.875, "loss/logits": 0.15414319932460785, "loss/reg": 0.0017589129274711013, "step": 2787 }, { "epoch": 0.3485, "grad_norm": 2.477431297302246, "grad_norm_var": 0.6720850581672863, "learning_rate": 0.0001, "loss": 1.1369, "loss/crossentropy": 2.210176706314087, "loss/hidden": 0.9609375, "loss/logits": 0.1584058403968811, "loss/reg": 0.0017579634441062808, "step": 2788 }, { "epoch": 0.348625, "grad_norm": 3.0897951126098633, "grad_norm_var": 0.6978364470166634, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.6431119441986084, "loss/hidden": 1.015625, "loss/logits": 0.21483758091926575, "loss/reg": 0.0017570724012330174, "step": 2789 }, { "epoch": 0.34875, "grad_norm": 2.3309786319732666, "grad_norm_var": 0.6856758036862847, "learning_rate": 0.0001, "loss": 1.1589, "loss/crossentropy": 2.5294032096862793, "loss/hidden": 0.96875, "loss/logits": 0.17254266142845154, "loss/reg": 0.0017561818240210414, "step": 2790 }, { "epoch": 0.348875, "grad_norm": 2.4417543411254883, "grad_norm_var": 0.6847379269495424, "learning_rate": 0.0001, "loss": 1.3014, "loss/crossentropy": 2.760693073272705, "loss/hidden": 1.078125, "loss/logits": 0.20574572682380676, "loss/reg": 0.0017553071957081556, "step": 2791 }, { "epoch": 0.349, "grad_norm": 2.308687925338745, "grad_norm_var": 0.6751789801200065, "learning_rate": 0.0001, "loss": 1.0694, "loss/crossentropy": 2.5619680881500244, "loss/hidden": 0.90234375, "loss/logits": 0.1495397388935089, "loss/reg": 0.0017544146394357085, "step": 2792 }, { "epoch": 0.349125, "grad_norm": 3.248924970626831, "grad_norm_var": 0.6998734893562404, "learning_rate": 0.0001, "loss": 1.1235, "loss/crossentropy": 2.4844791889190674, "loss/hidden": 0.953125, "loss/logits": 0.15282654762268066, "loss/reg": 0.0017534621292725205, "step": 2793 }, { "epoch": 0.34925, "grad_norm": 2.2294931411743164, "grad_norm_var": 0.6901432197775282, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.2272846698760986, "loss/hidden": 1.0234375, "loss/logits": 0.1607985943555832, "loss/reg": 0.0017524892464280128, "step": 2794 }, { "epoch": 0.349375, "grad_norm": 2.264816999435425, "grad_norm_var": 0.6846926444708188, "learning_rate": 0.0001, "loss": 1.0195, "loss/crossentropy": 2.3695662021636963, "loss/hidden": 0.8671875, "loss/logits": 0.13475856184959412, "loss/reg": 0.0017515190411359072, "step": 2795 }, { "epoch": 0.3495, "grad_norm": 2.0306382179260254, "grad_norm_var": 0.6863237076437959, "learning_rate": 0.0001, "loss": 0.8819, "loss/crossentropy": 2.6183347702026367, "loss/hidden": 0.75, "loss/logits": 0.11441855877637863, "loss/reg": 0.0017505293944850564, "step": 2796 }, { "epoch": 0.349625, "grad_norm": 2.1402132511138916, "grad_norm_var": 0.6965415743140833, "learning_rate": 0.0001, "loss": 1.037, "loss/crossentropy": 2.3425629138946533, "loss/hidden": 0.8828125, "loss/logits": 0.13666534423828125, "loss/reg": 0.0017494849162176251, "step": 2797 }, { "epoch": 0.34975, "grad_norm": 2.0226893424987793, "grad_norm_var": 0.7000388277621127, "learning_rate": 0.0001, "loss": 1.0007, "loss/crossentropy": 2.299292802810669, "loss/hidden": 0.859375, "loss/logits": 0.12381201982498169, "loss/reg": 0.0017484592972323298, "step": 2798 }, { "epoch": 0.349875, "grad_norm": 2.9328784942626953, "grad_norm_var": 0.7101785362710192, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.332491397857666, "loss/hidden": 1.1484375, "loss/logits": 0.23635277152061462, "loss/reg": 0.0017474106280133128, "step": 2799 }, { "epoch": 0.35, "grad_norm": 2.3927390575408936, "grad_norm_var": 0.6582773529459321, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.6621432304382324, "loss/hidden": 1.0625, "loss/logits": 0.19269157946109772, "loss/reg": 0.0017465661512687802, "step": 2800 }, { "epoch": 0.350125, "grad_norm": 2.244951009750366, "grad_norm_var": 0.6588730979166993, "learning_rate": 0.0001, "loss": 0.9766, "loss/crossentropy": 2.4566526412963867, "loss/hidden": 0.828125, "loss/logits": 0.13099974393844604, "loss/reg": 0.0017457172507420182, "step": 2801 }, { "epoch": 0.35025, "grad_norm": 3.39272141456604, "grad_norm_var": 0.192519183199128, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.6078286170959473, "loss/hidden": 0.921875, "loss/logits": 0.15360912680625916, "loss/reg": 0.0017446336569264531, "step": 2802 }, { "epoch": 0.350375, "grad_norm": 2.9278719425201416, "grad_norm_var": 0.19424290340750355, "learning_rate": 0.0001, "loss": 1.0781, "loss/crossentropy": 2.319835901260376, "loss/hidden": 0.9140625, "loss/logits": 0.14657863974571228, "loss/reg": 0.0017435530899092555, "step": 2803 }, { "epoch": 0.3505, "grad_norm": 2.1318576335906982, "grad_norm_var": 0.20411907292901313, "learning_rate": 0.0001, "loss": 1.1889, "loss/crossentropy": 2.3817713260650635, "loss/hidden": 1.0, "loss/logits": 0.17143014073371887, "loss/reg": 0.0017426857957616448, "step": 2804 }, { "epoch": 0.350625, "grad_norm": 1.945341944694519, "grad_norm_var": 0.19723030835506036, "learning_rate": 0.0001, "loss": 1.0254, "loss/crossentropy": 2.451923370361328, "loss/hidden": 0.8671875, "loss/logits": 0.1408126801252365, "loss/reg": 0.0017416129121556878, "step": 2805 }, { "epoch": 0.35075, "grad_norm": 2.2014613151550293, "grad_norm_var": 0.2001037364628127, "learning_rate": 0.0001, "loss": 0.8466, "loss/crossentropy": 2.6317198276519775, "loss/hidden": 0.71875, "loss/logits": 0.1104741245508194, "loss/reg": 0.001740574254654348, "step": 2806 }, { "epoch": 0.350875, "grad_norm": 2.4871444702148438, "grad_norm_var": 0.20031232469223942, "learning_rate": 0.0001, "loss": 1.0783, "loss/crossentropy": 2.650496006011963, "loss/hidden": 0.921875, "loss/logits": 0.13907040655612946, "loss/reg": 0.001739674131385982, "step": 2807 }, { "epoch": 0.351, "grad_norm": 3.028041362762451, "grad_norm_var": 0.2208841932105509, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.5076615810394287, "loss/hidden": 1.0234375, "loss/logits": 0.17479819059371948, "loss/reg": 0.0017386304680258036, "step": 2808 }, { "epoch": 0.351125, "grad_norm": 2.3591175079345703, "grad_norm_var": 0.17871133108686993, "learning_rate": 0.0001, "loss": 1.1528, "loss/crossentropy": 2.187431812286377, "loss/hidden": 0.9921875, "loss/logits": 0.143191397190094, "loss/reg": 0.0017377145122736692, "step": 2809 }, { "epoch": 0.35125, "grad_norm": 2.481156826019287, "grad_norm_var": 0.17625213813124194, "learning_rate": 0.0001, "loss": 1.22, "loss/crossentropy": 2.619535207748413, "loss/hidden": 1.0, "loss/logits": 0.2025882601737976, "loss/reg": 0.0017368533881381154, "step": 2810 }, { "epoch": 0.351375, "grad_norm": 2.0328502655029297, "grad_norm_var": 0.1849244450996929, "learning_rate": 0.0001, "loss": 1.0645, "loss/crossentropy": 2.3837273120880127, "loss/hidden": 0.8984375, "loss/logits": 0.14870300889015198, "loss/reg": 0.0017360004130750895, "step": 2811 }, { "epoch": 0.3515, "grad_norm": 2.046006202697754, "grad_norm_var": 0.18413732218933293, "learning_rate": 0.0001, "loss": 0.955, "loss/crossentropy": 2.57859206199646, "loss/hidden": 0.8046875, "loss/logits": 0.1329650580883026, "loss/reg": 0.0017351453425362706, "step": 2812 }, { "epoch": 0.351625, "grad_norm": 2.258920192718506, "grad_norm_var": 0.18054314510522426, "learning_rate": 0.0001, "loss": 1.1427, "loss/crossentropy": 2.528386354446411, "loss/hidden": 0.95703125, "loss/logits": 0.1683269888162613, "loss/reg": 0.0017342991195619106, "step": 2813 }, { "epoch": 0.35175, "grad_norm": 2.157093048095703, "grad_norm_var": 0.17436652009595685, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.625523805618286, "loss/hidden": 0.90234375, "loss/logits": 0.1565004289150238, "loss/reg": 0.0017334959702566266, "step": 2814 }, { "epoch": 0.351875, "grad_norm": 3.138932943344116, "grad_norm_var": 0.19059556011613518, "learning_rate": 0.0001, "loss": 1.0524, "loss/crossentropy": 2.919931173324585, "loss/hidden": 0.89453125, "loss/logits": 0.1405646950006485, "loss/reg": 0.0017325450899079442, "step": 2815 }, { "epoch": 0.352, "grad_norm": 2.347205877304077, "grad_norm_var": 0.19108272001682133, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.767524003982544, "loss/hidden": 0.87109375, "loss/logits": 0.1411953866481781, "loss/reg": 0.0017315970035269856, "step": 2816 }, { "epoch": 0.352125, "grad_norm": 2.4137158393859863, "grad_norm_var": 0.18827598991505276, "learning_rate": 0.0001, "loss": 1.0797, "loss/crossentropy": 2.3541228771209717, "loss/hidden": 0.9140625, "loss/logits": 0.14832325279712677, "loss/reg": 0.0017306300578638911, "step": 2817 }, { "epoch": 0.35225, "grad_norm": 2.3438310623168945, "grad_norm_var": 0.1265013635706935, "learning_rate": 0.0001, "loss": 0.9357, "loss/crossentropy": 2.9085962772369385, "loss/hidden": 0.796875, "loss/logits": 0.12149880826473236, "loss/reg": 0.0017297144513577223, "step": 2818 }, { "epoch": 0.352375, "grad_norm": 2.161012887954712, "grad_norm_var": 0.10864658588507053, "learning_rate": 0.0001, "loss": 0.9622, "loss/crossentropy": 2.3477835655212402, "loss/hidden": 0.81640625, "loss/logits": 0.12846329808235168, "loss/reg": 0.0017287590308114886, "step": 2819 }, { "epoch": 0.3525, "grad_norm": 1.894262433052063, "grad_norm_var": 0.1189541215388239, "learning_rate": 0.0001, "loss": 1.0689, "loss/crossentropy": 2.359501838684082, "loss/hidden": 0.90234375, "loss/logits": 0.149271622300148, "loss/reg": 0.0017278444720432162, "step": 2820 }, { "epoch": 0.352625, "grad_norm": 2.1947989463806152, "grad_norm_var": 0.11001587853537267, "learning_rate": 0.0001, "loss": 1.2036, "loss/crossentropy": 2.4087963104248047, "loss/hidden": 1.03125, "loss/logits": 0.155103400349617, "loss/reg": 0.0017269667005166411, "step": 2821 }, { "epoch": 0.35275, "grad_norm": 2.671942949295044, "grad_norm_var": 0.11474595236059451, "learning_rate": 0.0001, "loss": 1.0621, "loss/crossentropy": 2.400142192840576, "loss/hidden": 0.890625, "loss/logits": 0.15425175428390503, "loss/reg": 0.0017260231543332338, "step": 2822 }, { "epoch": 0.352875, "grad_norm": 5.427580833435059, "grad_norm_var": 0.6987056225639108, "learning_rate": 0.0001, "loss": 1.6354, "loss/crossentropy": 2.1347997188568115, "loss/hidden": 1.328125, "loss/logits": 0.29003962874412537, "loss/reg": 0.0017250170931220055, "step": 2823 }, { "epoch": 0.353, "grad_norm": 2.474489212036133, "grad_norm_var": 0.6832958770224535, "learning_rate": 0.0001, "loss": 1.2657, "loss/crossentropy": 2.418750047683716, "loss/hidden": 1.0703125, "loss/logits": 0.1781148910522461, "loss/reg": 0.0017240834422409534, "step": 2824 }, { "epoch": 0.353125, "grad_norm": 2.3519270420074463, "grad_norm_var": 0.6834583195583049, "learning_rate": 0.0001, "loss": 1.0247, "loss/crossentropy": 2.818436861038208, "loss/hidden": 0.84765625, "loss/logits": 0.15981653332710266, "loss/reg": 0.0017231529345735908, "step": 2825 }, { "epoch": 0.35325, "grad_norm": 4.028870105743408, "grad_norm_var": 0.8241794064556344, "learning_rate": 0.0001, "loss": 1.0867, "loss/crossentropy": 2.968224048614502, "loss/hidden": 0.91796875, "loss/logits": 0.15149936079978943, "loss/reg": 0.0017222733004018664, "step": 2826 }, { "epoch": 0.353375, "grad_norm": 2.357175350189209, "grad_norm_var": 0.805299909604318, "learning_rate": 0.0001, "loss": 0.957, "loss/crossentropy": 2.680910348892212, "loss/hidden": 0.80859375, "loss/logits": 0.13120971620082855, "loss/reg": 0.0017213879618793726, "step": 2827 }, { "epoch": 0.3535, "grad_norm": 2.1809592247009277, "grad_norm_var": 0.7957187870834147, "learning_rate": 0.0001, "loss": 1.0192, "loss/crossentropy": 2.3528263568878174, "loss/hidden": 0.8671875, "loss/logits": 0.1348540186882019, "loss/reg": 0.00172046956140548, "step": 2828 }, { "epoch": 0.353625, "grad_norm": 3.107032299041748, "grad_norm_var": 0.7964315258885586, "learning_rate": 0.0001, "loss": 0.9984, "loss/crossentropy": 2.4621102809906006, "loss/hidden": 0.87109375, "loss/logits": 0.11015482246875763, "loss/reg": 0.0017195139080286026, "step": 2829 }, { "epoch": 0.35375, "grad_norm": 2.769864559173584, "grad_norm_var": 0.775282968505679, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.809204578399658, "loss/hidden": 0.8984375, "loss/logits": 0.1644095778465271, "loss/reg": 0.0017186005134135485, "step": 2830 }, { "epoch": 0.353875, "grad_norm": 2.179577350616455, "grad_norm_var": 0.7819652113251147, "learning_rate": 0.0001, "loss": 0.9468, "loss/crossentropy": 2.4857349395751953, "loss/hidden": 0.8046875, "loss/logits": 0.12490560114383698, "loss/reg": 0.0017176737310364842, "step": 2831 }, { "epoch": 0.354, "grad_norm": 1.8614087104797363, "grad_norm_var": 0.8183693559133222, "learning_rate": 0.0001, "loss": 1.048, "loss/crossentropy": 2.5181362628936768, "loss/hidden": 0.87890625, "loss/logits": 0.15196043252944946, "loss/reg": 0.0017166847828775644, "step": 2832 }, { "epoch": 0.354125, "grad_norm": 2.0288565158843994, "grad_norm_var": 0.8398106395124689, "learning_rate": 0.0001, "loss": 0.9777, "loss/crossentropy": 2.4800610542297363, "loss/hidden": 0.828125, "loss/logits": 0.1323930323123932, "loss/reg": 0.0017157007241621614, "step": 2833 }, { "epoch": 0.35425, "grad_norm": 2.539849281311035, "grad_norm_var": 0.8348086533909319, "learning_rate": 0.0001, "loss": 1.164, "loss/crossentropy": 2.2461485862731934, "loss/hidden": 0.96875, "loss/logits": 0.17812898755073547, "loss/reg": 0.001714762533083558, "step": 2834 }, { "epoch": 0.354375, "grad_norm": 5.962584018707275, "grad_norm_var": 1.4955971766787357, "learning_rate": 0.0001, "loss": 1.3254, "loss/crossentropy": 2.266087293624878, "loss/hidden": 1.1171875, "loss/logits": 0.19111643731594086, "loss/reg": 0.001713839010335505, "step": 2835 }, { "epoch": 0.3545, "grad_norm": 5.018341541290283, "grad_norm_var": 1.6962571268793716, "learning_rate": 0.0001, "loss": 1.198, "loss/crossentropy": 2.8398637771606445, "loss/hidden": 1.046875, "loss/logits": 0.13401786983013153, "loss/reg": 0.0017128626350313425, "step": 2836 }, { "epoch": 0.354625, "grad_norm": 3.3438918590545654, "grad_norm_var": 1.6443537687069958, "learning_rate": 0.0001, "loss": 1.3532, "loss/crossentropy": 2.5057852268218994, "loss/hidden": 1.15625, "loss/logits": 0.17986303567886353, "loss/reg": 0.0017119403928518295, "step": 2837 }, { "epoch": 0.35475, "grad_norm": 2.5116755962371826, "grad_norm_var": 1.6560469666889797, "learning_rate": 0.0001, "loss": 1.1445, "loss/crossentropy": 2.520705461502075, "loss/hidden": 0.9765625, "loss/logits": 0.15080755949020386, "loss/reg": 0.0017110416665673256, "step": 2838 }, { "epoch": 0.354875, "grad_norm": 2.507730484008789, "grad_norm_var": 1.295972670963881, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.6069881916046143, "loss/hidden": 0.875, "loss/logits": 0.14284110069274902, "loss/reg": 0.0017101268749684095, "step": 2839 }, { "epoch": 0.355, "grad_norm": 2.1938116550445557, "grad_norm_var": 1.3187484558354448, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.6682937145233154, "loss/hidden": 0.859375, "loss/logits": 0.13588601350784302, "loss/reg": 0.0017091986956074834, "step": 2840 }, { "epoch": 0.355125, "grad_norm": 2.3042118549346924, "grad_norm_var": 1.3225937379973647, "learning_rate": 0.0001, "loss": 1.2725, "loss/crossentropy": 2.0934457778930664, "loss/hidden": 1.078125, "loss/logits": 0.17728814482688904, "loss/reg": 0.0017083316342905164, "step": 2841 }, { "epoch": 0.35525, "grad_norm": 2.5992774963378906, "grad_norm_var": 1.241057677452094, "learning_rate": 0.0001, "loss": 1.1869, "loss/crossentropy": 2.5267627239227295, "loss/hidden": 1.0, "loss/logits": 0.1698317676782608, "loss/reg": 0.0017074811039492488, "step": 2842 }, { "epoch": 0.355375, "grad_norm": 2.0538861751556396, "grad_norm_var": 1.2663977685856622, "learning_rate": 0.0001, "loss": 1.0433, "loss/crossentropy": 2.3042595386505127, "loss/hidden": 0.89453125, "loss/logits": 0.13174740970134735, "loss/reg": 0.0017066395375877619, "step": 2843 }, { "epoch": 0.3555, "grad_norm": 2.009514093399048, "grad_norm_var": 1.2829042908231085, "learning_rate": 0.0001, "loss": 1.0095, "loss/crossentropy": 2.7416183948516846, "loss/hidden": 0.85546875, "loss/logits": 0.13696113228797913, "loss/reg": 0.001705826842226088, "step": 2844 }, { "epoch": 0.355625, "grad_norm": 2.158133029937744, "grad_norm_var": 1.3018485999621927, "learning_rate": 0.0001, "loss": 1.1738, "loss/crossentropy": 2.521244764328003, "loss/hidden": 1.0, "loss/logits": 0.15675783157348633, "loss/reg": 0.0017048991285264492, "step": 2845 }, { "epoch": 0.35575, "grad_norm": 3.2688465118408203, "grad_norm_var": 1.3185544465877777, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.5707931518554688, "loss/hidden": 0.96484375, "loss/logits": 0.15672056376934052, "loss/reg": 0.001704056398011744, "step": 2846 }, { "epoch": 0.355875, "grad_norm": 2.957813024520874, "grad_norm_var": 1.2937054398222245, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.3235251903533936, "loss/hidden": 0.9140625, "loss/logits": 0.15691466629505157, "loss/reg": 0.001703157089650631, "step": 2847 }, { "epoch": 0.356, "grad_norm": 3.2563531398773193, "grad_norm_var": 1.2347084824451768, "learning_rate": 0.0001, "loss": 1.0975, "loss/crossentropy": 2.3761394023895264, "loss/hidden": 0.921875, "loss/logits": 0.15862791240215302, "loss/reg": 0.0017022335669025779, "step": 2848 }, { "epoch": 0.356125, "grad_norm": 3.353381395339966, "grad_norm_var": 1.1870351296832722, "learning_rate": 0.0001, "loss": 1.2435, "loss/crossentropy": 2.5423715114593506, "loss/hidden": 1.0625, "loss/logits": 0.16403596103191376, "loss/reg": 0.0017013002652674913, "step": 2849 }, { "epoch": 0.35625, "grad_norm": 2.8158438205718994, "grad_norm_var": 1.1747723390043034, "learning_rate": 0.0001, "loss": 1.3025, "loss/crossentropy": 2.4496610164642334, "loss/hidden": 1.078125, "loss/logits": 0.20741979777812958, "loss/reg": 0.0017003763932734728, "step": 2850 }, { "epoch": 0.356375, "grad_norm": 3.559321880340576, "grad_norm_var": 0.5927506635024571, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.6565349102020264, "loss/hidden": 0.98828125, "loss/logits": 0.15669971704483032, "loss/reg": 0.001699468120932579, "step": 2851 }, { "epoch": 0.3565, "grad_norm": 2.4786415100097656, "grad_norm_var": 0.2682260819984322, "learning_rate": 0.0001, "loss": 1.0444, "loss/crossentropy": 2.4158992767333984, "loss/hidden": 0.90234375, "loss/logits": 0.1250237375497818, "loss/reg": 0.0016986011760309339, "step": 2852 }, { "epoch": 0.356625, "grad_norm": 2.0186212062835693, "grad_norm_var": 0.2661232494044005, "learning_rate": 0.0001, "loss": 0.9457, "loss/crossentropy": 2.595698595046997, "loss/hidden": 0.8046875, "loss/logits": 0.1240091621875763, "loss/reg": 0.0016977444756776094, "step": 2853 }, { "epoch": 0.35675, "grad_norm": 2.433223009109497, "grad_norm_var": 0.26772410565416277, "learning_rate": 0.0001, "loss": 1.0565, "loss/crossentropy": 2.7013909816741943, "loss/hidden": 0.8828125, "loss/logits": 0.15675534307956696, "loss/reg": 0.0016968214185908437, "step": 2854 }, { "epoch": 0.356875, "grad_norm": 1.9425327777862549, "grad_norm_var": 0.2963791835092799, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.5410125255584717, "loss/hidden": 0.90234375, "loss/logits": 0.13975846767425537, "loss/reg": 0.0016959481872618198, "step": 2855 }, { "epoch": 0.357, "grad_norm": 2.183138370513916, "grad_norm_var": 0.29694686667191567, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.1535792350769043, "loss/hidden": 1.03125, "loss/logits": 0.14633342623710632, "loss/reg": 0.0016950230346992612, "step": 2856 }, { "epoch": 0.357125, "grad_norm": 3.1404290199279785, "grad_norm_var": 0.3091157714667965, "learning_rate": 0.0001, "loss": 1.2555, "loss/crossentropy": 2.534409284591675, "loss/hidden": 1.0390625, "loss/logits": 0.1994456946849823, "loss/reg": 0.0016941879875957966, "step": 2857 }, { "epoch": 0.35725, "grad_norm": 2.270643472671509, "grad_norm_var": 0.3176199209032717, "learning_rate": 0.0001, "loss": 1.2357, "loss/crossentropy": 2.3804426193237305, "loss/hidden": 1.03125, "loss/logits": 0.18747979402542114, "loss/reg": 0.0016932752914726734, "step": 2858 }, { "epoch": 0.357375, "grad_norm": 2.5950400829315186, "grad_norm_var": 0.2951643366720106, "learning_rate": 0.0001, "loss": 1.0399, "loss/crossentropy": 2.5609066486358643, "loss/hidden": 0.86328125, "loss/logits": 0.15968412160873413, "loss/reg": 0.001692446181550622, "step": 2859 }, { "epoch": 0.3575, "grad_norm": 2.2543294429779053, "grad_norm_var": 0.27791885851790615, "learning_rate": 0.0001, "loss": 1.1213, "loss/crossentropy": 2.394418239593506, "loss/hidden": 0.953125, "loss/logits": 0.15130820870399475, "loss/reg": 0.0016915244050323963, "step": 2860 }, { "epoch": 0.357625, "grad_norm": 2.7402095794677734, "grad_norm_var": 0.2595320833461064, "learning_rate": 0.0001, "loss": 1.2913, "loss/crossentropy": 2.3753504753112793, "loss/hidden": 1.0625, "loss/logits": 0.21191397309303284, "loss/reg": 0.0016905944794416428, "step": 2861 }, { "epoch": 0.35775, "grad_norm": 2.7218852043151855, "grad_norm_var": 0.23705668595723636, "learning_rate": 0.0001, "loss": 1.1269, "loss/crossentropy": 2.7040185928344727, "loss/hidden": 0.9609375, "loss/logits": 0.14908066391944885, "loss/reg": 0.0016896674642339349, "step": 2862 }, { "epoch": 0.357875, "grad_norm": 2.844148635864258, "grad_norm_var": 0.23350361432628286, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.476809501647949, "loss/hidden": 1.0234375, "loss/logits": 0.19235333800315857, "loss/reg": 0.0016887575620785356, "step": 2863 }, { "epoch": 0.358, "grad_norm": 2.66839599609375, "grad_norm_var": 0.2085927074359527, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 1.7474039793014526, "loss/hidden": 0.953125, "loss/logits": 0.11502812802791595, "loss/reg": 0.001687839045189321, "step": 2864 }, { "epoch": 0.358125, "grad_norm": 2.7442846298217773, "grad_norm_var": 0.17272659674843732, "learning_rate": 0.0001, "loss": 0.9835, "loss/crossentropy": 2.371694564819336, "loss/hidden": 0.83203125, "loss/logits": 0.1346435248851776, "loss/reg": 0.0016869240207597613, "step": 2865 }, { "epoch": 0.35825, "grad_norm": 2.16766095161438, "grad_norm_var": 0.17930867246453938, "learning_rate": 0.0001, "loss": 1.1022, "loss/crossentropy": 2.725374460220337, "loss/hidden": 0.921875, "loss/logits": 0.16345086693763733, "loss/reg": 0.0016859890893101692, "step": 2866 }, { "epoch": 0.358375, "grad_norm": 2.8489997386932373, "grad_norm_var": 0.11502908688481266, "learning_rate": 0.0001, "loss": 1.1609, "loss/crossentropy": 2.853227138519287, "loss/hidden": 0.9765625, "loss/logits": 0.16744618117809296, "loss/reg": 0.0016850034007802606, "step": 2867 }, { "epoch": 0.3585, "grad_norm": 2.557431936264038, "grad_norm_var": 0.1151584402325812, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.177863597869873, "loss/hidden": 1.125, "loss/logits": 0.1535506397485733, "loss/reg": 0.0016840819735080004, "step": 2868 }, { "epoch": 0.358625, "grad_norm": 2.8179991245269775, "grad_norm_var": 0.1029166311183585, "learning_rate": 0.0001, "loss": 1.1301, "loss/crossentropy": 2.4123053550720215, "loss/hidden": 0.95703125, "loss/logits": 0.15621566772460938, "loss/reg": 0.0016831011744216084, "step": 2869 }, { "epoch": 0.35875, "grad_norm": 2.4346518516540527, "grad_norm_var": 0.10289295915638756, "learning_rate": 0.0001, "loss": 1.0453, "loss/crossentropy": 2.511106252670288, "loss/hidden": 0.8828125, "loss/logits": 0.14571568369865417, "loss/reg": 0.0016821377212181687, "step": 2870 }, { "epoch": 0.358875, "grad_norm": 2.1894872188568115, "grad_norm_var": 0.08643118589724566, "learning_rate": 0.0001, "loss": 1.1089, "loss/crossentropy": 2.333839178085327, "loss/hidden": 0.94140625, "loss/logits": 0.15067331492900848, "loss/reg": 0.0016812244430184364, "step": 2871 }, { "epoch": 0.359, "grad_norm": 2.148148536682129, "grad_norm_var": 0.08832965995138693, "learning_rate": 0.0001, "loss": 0.9834, "loss/crossentropy": 2.674180746078491, "loss/hidden": 0.83984375, "loss/logits": 0.12674987316131592, "loss/reg": 0.0016801903257146478, "step": 2872 }, { "epoch": 0.359125, "grad_norm": 2.201282501220703, "grad_norm_var": 0.07221140125128897, "learning_rate": 0.0001, "loss": 0.9838, "loss/crossentropy": 2.381173849105835, "loss/hidden": 0.83984375, "loss/logits": 0.1272009313106537, "loss/reg": 0.001679271343164146, "step": 2873 }, { "epoch": 0.35925, "grad_norm": 2.1592023372650146, "grad_norm_var": 0.07658556969441861, "learning_rate": 0.0001, "loss": 1.022, "loss/crossentropy": 2.620328426361084, "loss/hidden": 0.859375, "loss/logits": 0.14584526419639587, "loss/reg": 0.001678361790254712, "step": 2874 }, { "epoch": 0.359375, "grad_norm": 1.9773558378219604, "grad_norm_var": 0.09308364965767067, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.534482002258301, "loss/hidden": 0.9375, "loss/logits": 0.15045805275440216, "loss/reg": 0.0016774891410022974, "step": 2875 }, { "epoch": 0.3595, "grad_norm": 3.01912784576416, "grad_norm_var": 0.10793211877892896, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.471505641937256, "loss/hidden": 1.125, "loss/logits": 0.17159071564674377, "loss/reg": 0.001676530810073018, "step": 2876 }, { "epoch": 0.359625, "grad_norm": 2.4002370834350586, "grad_norm_var": 0.10494804525377924, "learning_rate": 0.0001, "loss": 1.1505, "loss/crossentropy": 2.5794620513916016, "loss/hidden": 0.96484375, "loss/logits": 0.16891884803771973, "loss/reg": 0.0016755268443375826, "step": 2877 }, { "epoch": 0.35975, "grad_norm": 2.5150439739227295, "grad_norm_var": 0.10133081510521572, "learning_rate": 0.0001, "loss": 1.1073, "loss/crossentropy": 2.50715708732605, "loss/hidden": 0.9453125, "loss/logits": 0.14528337121009827, "loss/reg": 0.0016746176406741142, "step": 2878 }, { "epoch": 0.359875, "grad_norm": 2.185851812362671, "grad_norm_var": 0.09652692805832945, "learning_rate": 0.0001, "loss": 1.1435, "loss/crossentropy": 2.303999423980713, "loss/hidden": 0.9453125, "loss/logits": 0.18143998086452484, "loss/reg": 0.0016736144898459315, "step": 2879 }, { "epoch": 0.36, "grad_norm": 2.304851531982422, "grad_norm_var": 0.09370160868619966, "learning_rate": 0.0001, "loss": 1.0842, "loss/crossentropy": 2.518378973007202, "loss/hidden": 0.91015625, "loss/logits": 0.15736135840415955, "loss/reg": 0.0016727159963920712, "step": 2880 }, { "epoch": 0.360125, "grad_norm": 22.98287582397461, "grad_norm_var": 26.576972707248476, "learning_rate": 0.0001, "loss": 2.4831, "loss/crossentropy": 2.1521108150482178, "loss/hidden": 1.953125, "loss/logits": 0.5132277011871338, "loss/reg": 0.0016718083061277866, "step": 2881 }, { "epoch": 0.36025, "grad_norm": 2.2551026344299316, "grad_norm_var": 26.55979637699447, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.119589328765869, "loss/hidden": 0.98828125, "loss/logits": 0.149569571018219, "loss/reg": 0.0016709292540326715, "step": 2882 }, { "epoch": 0.360375, "grad_norm": 2.1404869556427, "grad_norm_var": 26.67036865227022, "learning_rate": 0.0001, "loss": 1.0158, "loss/crossentropy": 2.41686749458313, "loss/hidden": 0.8671875, "loss/logits": 0.131933331489563, "loss/reg": 0.0016700363485142589, "step": 2883 }, { "epoch": 0.3605, "grad_norm": 3.26462721824646, "grad_norm_var": 26.599258626096695, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.345972776412964, "loss/hidden": 1.0078125, "loss/logits": 0.1742195188999176, "loss/reg": 0.0016691767377778888, "step": 2884 }, { "epoch": 0.360625, "grad_norm": 2.343209743499756, "grad_norm_var": 26.668377145273414, "learning_rate": 0.0001, "loss": 1.0626, "loss/crossentropy": 2.78903865814209, "loss/hidden": 0.90234375, "loss/logits": 0.14361333847045898, "loss/reg": 0.0016682589193806052, "step": 2885 }, { "epoch": 0.36075, "grad_norm": 54.53543472290039, "grad_norm_var": 187.82858462895567, "learning_rate": 0.0001, "loss": 1.6192, "loss/crossentropy": 2.9942140579223633, "loss/hidden": 1.4140625, "loss/logits": 0.18844394385814667, "loss/reg": 0.0016673958161845803, "step": 2886 }, { "epoch": 0.360875, "grad_norm": 7.349742412567139, "grad_norm_var": 186.24229567918917, "learning_rate": 0.0001, "loss": 1.3683, "loss/crossentropy": 2.735208034515381, "loss/hidden": 1.1796875, "loss/logits": 0.17190513014793396, "loss/reg": 0.0016665258444845676, "step": 2887 }, { "epoch": 0.361, "grad_norm": 2.874211311340332, "grad_norm_var": 185.7826572011801, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.2830288410186768, "loss/hidden": 1.1875, "loss/logits": 0.24822375178337097, "loss/reg": 0.001665602088905871, "step": 2888 }, { "epoch": 0.361125, "grad_norm": 2.592247724533081, "grad_norm_var": 185.52737031866582, "learning_rate": 0.0001, "loss": 1.324, "loss/crossentropy": 1.9951626062393188, "loss/hidden": 1.1171875, "loss/logits": 0.19014853239059448, "loss/reg": 0.001664699288085103, "step": 2889 }, { "epoch": 0.36125, "grad_norm": 4.098513603210449, "grad_norm_var": 184.43153764722382, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 2.4855711460113525, "loss/hidden": 1.0859375, "loss/logits": 0.1911044716835022, "loss/reg": 0.0016639186069369316, "step": 2890 }, { "epoch": 0.361375, "grad_norm": 2.1250834465026855, "grad_norm_var": 184.32555137334094, "learning_rate": 0.0001, "loss": 1.0661, "loss/crossentropy": 2.45037579536438, "loss/hidden": 0.90234375, "loss/logits": 0.14707760512828827, "loss/reg": 0.0016630340833216906, "step": 2891 }, { "epoch": 0.3615, "grad_norm": 2.0534567832946777, "grad_norm_var": 184.95261901365245, "learning_rate": 0.0001, "loss": 1.0969, "loss/crossentropy": 2.414289951324463, "loss/hidden": 0.9296875, "loss/logits": 0.15059277415275574, "loss/reg": 0.0016622061375528574, "step": 2892 }, { "epoch": 0.361625, "grad_norm": 2.3413257598876953, "grad_norm_var": 184.99192220310482, "learning_rate": 0.0001, "loss": 1.0781, "loss/crossentropy": 2.721737861633301, "loss/hidden": 0.89453125, "loss/logits": 0.16690678894519806, "loss/reg": 0.0016613163752481341, "step": 2893 }, { "epoch": 0.36175, "grad_norm": 2.0452423095703125, "grad_norm_var": 185.30999701305126, "learning_rate": 0.0001, "loss": 1.0467, "loss/crossentropy": 2.600454330444336, "loss/hidden": 0.890625, "loss/logits": 0.13950681686401367, "loss/reg": 0.0016604192787781358, "step": 2894 }, { "epoch": 0.361875, "grad_norm": 6.24308443069458, "grad_norm_var": 183.54884057287873, "learning_rate": 0.0001, "loss": 1.0078, "loss/crossentropy": 2.5805392265319824, "loss/hidden": 0.87890625, "loss/logits": 0.11227421462535858, "loss/reg": 0.0016594943590462208, "step": 2895 }, { "epoch": 0.362, "grad_norm": 2.3423571586608887, "grad_norm_var": 183.52246455973352, "learning_rate": 0.0001, "loss": 1.0958, "loss/crossentropy": 2.5158119201660156, "loss/hidden": 0.92578125, "loss/logits": 0.15347003936767578, "loss/reg": 0.001658568624407053, "step": 2896 }, { "epoch": 0.362125, "grad_norm": 2.2457339763641357, "grad_norm_var": 167.86411707408723, "learning_rate": 0.0001, "loss": 1.0263, "loss/crossentropy": 2.450537919998169, "loss/hidden": 0.875, "loss/logits": 0.1346817910671234, "loss/reg": 0.0016576156485825777, "step": 2897 }, { "epoch": 0.36225, "grad_norm": 1.931369423866272, "grad_norm_var": 168.04539746663633, "learning_rate": 0.0001, "loss": 1.0146, "loss/crossentropy": 2.366205930709839, "loss/hidden": 0.859375, "loss/logits": 0.138667032122612, "loss/reg": 0.0016567346174269915, "step": 2898 }, { "epoch": 0.362375, "grad_norm": 2.0043246746063232, "grad_norm_var": 168.12176130515084, "learning_rate": 0.0001, "loss": 1.0449, "loss/crossentropy": 2.4714229106903076, "loss/hidden": 0.88671875, "loss/logits": 0.1416543573141098, "loss/reg": 0.0016558223869651556, "step": 2899 }, { "epoch": 0.3625, "grad_norm": 2.7600367069244385, "grad_norm_var": 168.34016640367602, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.517082929611206, "loss/hidden": 0.94140625, "loss/logits": 0.14113199710845947, "loss/reg": 0.001654876279644668, "step": 2900 }, { "epoch": 0.362625, "grad_norm": 1.987077236175537, "grad_norm_var": 168.5332644528203, "learning_rate": 0.0001, "loss": 1.1151, "loss/crossentropy": 2.6820037364959717, "loss/hidden": 0.94921875, "loss/logits": 0.1493188440799713, "loss/reg": 0.001653992454521358, "step": 2901 }, { "epoch": 0.36275, "grad_norm": 3.923135757446289, "grad_norm_var": 2.5900991405657625, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 2.914050579071045, "loss/hidden": 1.0859375, "loss/logits": 0.19705688953399658, "loss/reg": 0.0016531149158254266, "step": 2902 }, { "epoch": 0.362875, "grad_norm": 2.84275221824646, "grad_norm_var": 1.2801984238637776, "learning_rate": 0.0001, "loss": 1.1637, "loss/crossentropy": 2.3229780197143555, "loss/hidden": 0.953125, "loss/logits": 0.19404518604278564, "loss/reg": 0.001652223290875554, "step": 2903 }, { "epoch": 0.363, "grad_norm": 2.1783151626586914, "grad_norm_var": 1.3013176695546562, "learning_rate": 0.0001, "loss": 0.9596, "loss/crossentropy": 2.2848258018493652, "loss/hidden": 0.80859375, "loss/logits": 0.13450674712657928, "loss/reg": 0.0016513013979420066, "step": 2904 }, { "epoch": 0.363125, "grad_norm": 1.9953367710113525, "grad_norm_var": 1.3347194382521905, "learning_rate": 0.0001, "loss": 0.9909, "loss/crossentropy": 2.797396183013916, "loss/hidden": 0.83984375, "loss/logits": 0.13458240032196045, "loss/reg": 0.0016504452796652913, "step": 2905 }, { "epoch": 0.36325, "grad_norm": 2.226459264755249, "grad_norm_var": 1.2033844568847858, "learning_rate": 0.0001, "loss": 1.1152, "loss/crossentropy": 2.29197359085083, "loss/hidden": 0.94140625, "loss/logits": 0.15732143819332123, "loss/reg": 0.001649680663831532, "step": 2906 }, { "epoch": 0.363375, "grad_norm": 2.776362895965576, "grad_norm_var": 1.190580519645186, "learning_rate": 0.0001, "loss": 1.2359, "loss/crossentropy": 2.5579962730407715, "loss/hidden": 1.0234375, "loss/logits": 0.19601425528526306, "loss/reg": 0.001648910460062325, "step": 2907 }, { "epoch": 0.3635, "grad_norm": 2.4440717697143555, "grad_norm_var": 1.170686987960004, "learning_rate": 0.0001, "loss": 1.0697, "loss/crossentropy": 2.6649057865142822, "loss/hidden": 0.9140625, "loss/logits": 0.13919401168823242, "loss/reg": 0.0016480166232213378, "step": 2908 }, { "epoch": 0.363625, "grad_norm": 2.674268960952759, "grad_norm_var": 1.1642259494931204, "learning_rate": 0.0001, "loss": 0.9908, "loss/crossentropy": 2.4012725353240967, "loss/hidden": 0.84375, "loss/logits": 0.13061130046844482, "loss/reg": 0.0016471183625981212, "step": 2909 }, { "epoch": 0.36375, "grad_norm": 2.835261106491089, "grad_norm_var": 1.138083498134074, "learning_rate": 0.0001, "loss": 1.3996, "loss/crossentropy": 1.9172604084014893, "loss/hidden": 1.21875, "loss/logits": 0.16441689431667328, "loss/reg": 0.0016463176580145955, "step": 2910 }, { "epoch": 0.363875, "grad_norm": 2.747319221496582, "grad_norm_var": 0.2565341199981375, "learning_rate": 0.0001, "loss": 1.0874, "loss/crossentropy": 2.341123104095459, "loss/hidden": 0.91015625, "loss/logits": 0.16080209612846375, "loss/reg": 0.0016454029828310013, "step": 2911 }, { "epoch": 0.364, "grad_norm": 2.1515984535217285, "grad_norm_var": 0.2626815705247921, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.391768455505371, "loss/hidden": 0.90625, "loss/logits": 0.14932766556739807, "loss/reg": 0.0016445607179775834, "step": 2912 }, { "epoch": 0.364125, "grad_norm": 2.311025857925415, "grad_norm_var": 0.26088496055327154, "learning_rate": 0.0001, "loss": 1.1837, "loss/crossentropy": 2.2469513416290283, "loss/hidden": 0.98046875, "loss/logits": 0.1868339329957962, "loss/reg": 0.0016437117010354996, "step": 2913 }, { "epoch": 0.36425, "grad_norm": 2.18137526512146, "grad_norm_var": 0.2462767840523912, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.6416540145874023, "loss/hidden": 0.8984375, "loss/logits": 0.14087817072868347, "loss/reg": 0.0016428823582828045, "step": 2914 }, { "epoch": 0.364375, "grad_norm": 4.4402079582214355, "grad_norm_var": 0.45534860767229096, "learning_rate": 0.0001, "loss": 1.4029, "loss/crossentropy": 2.318537712097168, "loss/hidden": 1.171875, "loss/logits": 0.21463587880134583, "loss/reg": 0.0016419882886111736, "step": 2915 }, { "epoch": 0.3645, "grad_norm": 2.422072172164917, "grad_norm_var": 0.45773900634261794, "learning_rate": 0.0001, "loss": 1.036, "loss/crossentropy": 2.8344333171844482, "loss/hidden": 0.875, "loss/logits": 0.14455802738666534, "loss/reg": 0.0016410666285082698, "step": 2916 }, { "epoch": 0.364625, "grad_norm": 2.3255631923675537, "grad_norm_var": 0.43572399364784337, "learning_rate": 0.0001, "loss": 0.9992, "loss/crossentropy": 2.5267374515533447, "loss/hidden": 0.84765625, "loss/logits": 0.13512061536312103, "loss/reg": 0.0016402023611590266, "step": 2917 }, { "epoch": 0.36475, "grad_norm": 2.9109323024749756, "grad_norm_var": 0.3285694351872271, "learning_rate": 0.0001, "loss": 1.1444, "loss/crossentropy": 2.5319302082061768, "loss/hidden": 0.9765625, "loss/logits": 0.15141156315803528, "loss/reg": 0.0016392895486205816, "step": 2918 }, { "epoch": 0.364875, "grad_norm": 2.4816620349884033, "grad_norm_var": 0.32461869770864193, "learning_rate": 0.0001, "loss": 1.2578, "loss/crossentropy": 2.4686548709869385, "loss/hidden": 1.0625, "loss/logits": 0.1789342761039734, "loss/reg": 0.0016383545007556677, "step": 2919 }, { "epoch": 0.365, "grad_norm": 2.5076324939727783, "grad_norm_var": 0.31424819361351575, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.6436219215393066, "loss/hidden": 1.03125, "loss/logits": 0.18888777494430542, "loss/reg": 0.0016374120023101568, "step": 2920 }, { "epoch": 0.365125, "grad_norm": 2.527376890182495, "grad_norm_var": 0.28979447480508774, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.8049848079681396, "loss/hidden": 0.9453125, "loss/logits": 0.17654883861541748, "loss/reg": 0.0016365253832191229, "step": 2921 }, { "epoch": 0.36525, "grad_norm": 2.5993618965148926, "grad_norm_var": 0.27878430127710296, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.6346142292022705, "loss/hidden": 0.94140625, "loss/logits": 0.14986354112625122, "loss/reg": 0.0016356523847207427, "step": 2922 }, { "epoch": 0.365375, "grad_norm": 2.011859655380249, "grad_norm_var": 0.3020255848521759, "learning_rate": 0.0001, "loss": 1.1158, "loss/crossentropy": 2.452118396759033, "loss/hidden": 0.94140625, "loss/logits": 0.1580541729927063, "loss/reg": 0.0016347493510693312, "step": 2923 }, { "epoch": 0.3655, "grad_norm": 5.400126934051514, "grad_norm_var": 0.7874091732028183, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.657851219177246, "loss/hidden": 1.109375, "loss/logits": 0.22979383170604706, "loss/reg": 0.001633853418752551, "step": 2924 }, { "epoch": 0.365625, "grad_norm": 2.327488422393799, "grad_norm_var": 0.7999516330384864, "learning_rate": 0.0001, "loss": 1.1209, "loss/crossentropy": 2.6113743782043457, "loss/hidden": 0.94140625, "loss/logits": 0.16317567229270935, "loss/reg": 0.0016329664504155517, "step": 2925 }, { "epoch": 0.36575, "grad_norm": 2.0372982025146484, "grad_norm_var": 0.8318795115183185, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.472346782684326, "loss/hidden": 0.87890625, "loss/logits": 0.14744578301906586, "loss/reg": 0.0016319922870025039, "step": 2926 }, { "epoch": 0.365875, "grad_norm": 3.4449403285980225, "grad_norm_var": 0.8656348673977078, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 1.9134647846221924, "loss/hidden": 1.21875, "loss/logits": 0.18927118182182312, "loss/reg": 0.0016310216160491109, "step": 2927 }, { "epoch": 0.366, "grad_norm": 2.5038177967071533, "grad_norm_var": 0.8450496963230684, "learning_rate": 0.0001, "loss": 0.9629, "loss/crossentropy": 2.5827245712280273, "loss/hidden": 0.828125, "loss/logits": 0.11846432089805603, "loss/reg": 0.0016301407013088465, "step": 2928 }, { "epoch": 0.366125, "grad_norm": 2.245410680770874, "grad_norm_var": 0.8493958496354137, "learning_rate": 0.0001, "loss": 1.0012, "loss/crossentropy": 2.5887792110443115, "loss/hidden": 0.84765625, "loss/logits": 0.13726884126663208, "loss/reg": 0.0016292682848870754, "step": 2929 }, { "epoch": 0.36625, "grad_norm": 2.6145827770233154, "grad_norm_var": 0.8269554635355516, "learning_rate": 0.0001, "loss": 1.0314, "loss/crossentropy": 2.5537302494049072, "loss/hidden": 0.87109375, "loss/logits": 0.1439923644065857, "loss/reg": 0.0016284105367958546, "step": 2930 }, { "epoch": 0.366375, "grad_norm": 2.5989482402801514, "grad_norm_var": 0.6361772396644635, "learning_rate": 0.0001, "loss": 1.1513, "loss/crossentropy": 2.47092342376709, "loss/hidden": 0.94921875, "loss/logits": 0.18577691912651062, "loss/reg": 0.0016275214729830623, "step": 2931 }, { "epoch": 0.3665, "grad_norm": 2.3476715087890625, "grad_norm_var": 0.6391308990295269, "learning_rate": 0.0001, "loss": 1.1009, "loss/crossentropy": 2.4904348850250244, "loss/hidden": 0.9140625, "loss/logits": 0.17054884135723114, "loss/reg": 0.0016266867751255631, "step": 2932 }, { "epoch": 0.366625, "grad_norm": 3.217212677001953, "grad_norm_var": 0.6466483130272072, "learning_rate": 0.0001, "loss": 1.325, "loss/crossentropy": 2.2854831218719482, "loss/hidden": 1.109375, "loss/logits": 0.1993982046842575, "loss/reg": 0.0016257890965789557, "step": 2933 }, { "epoch": 0.36675, "grad_norm": 2.1475768089294434, "grad_norm_var": 0.6652651044347349, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.5113632678985596, "loss/hidden": 0.94140625, "loss/logits": 0.1527121663093567, "loss/reg": 0.0016249323962256312, "step": 2934 }, { "epoch": 0.366875, "grad_norm": 2.2910406589508057, "grad_norm_var": 0.6727883505127122, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.6591413021087646, "loss/hidden": 1.0, "loss/logits": 0.1948215812444687, "loss/reg": 0.0016241029370576143, "step": 2935 }, { "epoch": 0.367, "grad_norm": 2.2853500843048096, "grad_norm_var": 0.6808782153419806, "learning_rate": 0.0001, "loss": 0.9621, "loss/crossentropy": 2.5430846214294434, "loss/hidden": 0.8125, "loss/logits": 0.1333225667476654, "loss/reg": 0.0016232890775427222, "step": 2936 }, { "epoch": 0.367125, "grad_norm": 2.781404495239258, "grad_norm_var": 0.6803345406318376, "learning_rate": 0.0001, "loss": 1.1575, "loss/crossentropy": 2.4163620471954346, "loss/hidden": 0.98828125, "loss/logits": 0.1529882550239563, "loss/reg": 0.001622412703000009, "step": 2937 }, { "epoch": 0.36725, "grad_norm": 1.9200105667114258, "grad_norm_var": 0.7163369670094502, "learning_rate": 0.0001, "loss": 1.0498, "loss/crossentropy": 2.7060489654541016, "loss/hidden": 0.87890625, "loss/logits": 0.15471969544887543, "loss/reg": 0.0016215224750339985, "step": 2938 }, { "epoch": 0.367375, "grad_norm": 2.5477356910705566, "grad_norm_var": 0.6896953759726384, "learning_rate": 0.0001, "loss": 1.1545, "loss/crossentropy": 2.347503185272217, "loss/hidden": 0.98046875, "loss/logits": 0.15787500143051147, "loss/reg": 0.0016205697320401669, "step": 2939 }, { "epoch": 0.3675, "grad_norm": 2.3234238624572754, "grad_norm_var": 0.16111414662210918, "learning_rate": 0.0001, "loss": 0.9853, "loss/crossentropy": 2.5368473529815674, "loss/hidden": 0.82421875, "loss/logits": 0.14490297436714172, "loss/reg": 0.0016196636715903878, "step": 2940 }, { "epoch": 0.367625, "grad_norm": 2.39022159576416, "grad_norm_var": 0.1601085342486383, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.298597812652588, "loss/hidden": 0.95703125, "loss/logits": 0.17629364132881165, "loss/reg": 0.001618698937818408, "step": 2941 }, { "epoch": 0.36775, "grad_norm": 3.084016799926758, "grad_norm_var": 0.16665501543171382, "learning_rate": 0.0001, "loss": 1.0506, "loss/crossentropy": 2.7395713329315186, "loss/hidden": 0.8984375, "loss/logits": 0.13597065210342407, "loss/reg": 0.0016177864745259285, "step": 2942 }, { "epoch": 0.367875, "grad_norm": 2.2566306591033936, "grad_norm_var": 0.11255367098174791, "learning_rate": 0.0001, "loss": 0.9687, "loss/crossentropy": 2.7152233123779297, "loss/hidden": 0.82421875, "loss/logits": 0.12827168405056, "loss/reg": 0.0016168227884918451, "step": 2943 }, { "epoch": 0.368, "grad_norm": 2.4095306396484375, "grad_norm_var": 0.11271169926918295, "learning_rate": 0.0001, "loss": 1.1494, "loss/crossentropy": 2.550794839859009, "loss/hidden": 0.95703125, "loss/logits": 0.17617511749267578, "loss/reg": 0.0016159294173121452, "step": 2944 }, { "epoch": 0.368125, "grad_norm": 25.042810440063477, "grad_norm_var": 31.923880614994562, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 1.9788932800292969, "loss/hidden": 1.203125, "loss/logits": 0.18637996912002563, "loss/reg": 0.0016149738803505898, "step": 2945 }, { "epoch": 0.36825, "grad_norm": 1.85200834274292, "grad_norm_var": 32.09002112674632, "learning_rate": 0.0001, "loss": 1.1356, "loss/crossentropy": 2.250098943710327, "loss/hidden": 0.96484375, "loss/logits": 0.154619961977005, "loss/reg": 0.0016140936641022563, "step": 2946 }, { "epoch": 0.368375, "grad_norm": 4.755542278289795, "grad_norm_var": 32.02284383454991, "learning_rate": 0.0001, "loss": 1.2052, "loss/crossentropy": 2.456897735595703, "loss/hidden": 1.0078125, "loss/logits": 0.1812351942062378, "loss/reg": 0.0016132023883983493, "step": 2947 }, { "epoch": 0.3685, "grad_norm": 2.826408863067627, "grad_norm_var": 31.933084917938114, "learning_rate": 0.0001, "loss": 1.1185, "loss/crossentropy": 2.1193761825561523, "loss/hidden": 0.94921875, "loss/logits": 0.15315718948841095, "loss/reg": 0.0016123040113598108, "step": 2948 }, { "epoch": 0.368625, "grad_norm": 2.8231096267700195, "grad_norm_var": 31.98435540780198, "learning_rate": 0.0001, "loss": 1.1101, "loss/crossentropy": 2.352766513824463, "loss/hidden": 0.9453125, "loss/logits": 0.14870017766952515, "loss/reg": 0.0016114102909341455, "step": 2949 }, { "epoch": 0.36875, "grad_norm": 2.237957715988159, "grad_norm_var": 31.96274101296752, "learning_rate": 0.0001, "loss": 1.2492, "loss/crossentropy": 2.47367787361145, "loss/hidden": 1.03125, "loss/logits": 0.20180144906044006, "loss/reg": 0.0016105561517179012, "step": 2950 }, { "epoch": 0.368875, "grad_norm": 3.4921984672546387, "grad_norm_var": 31.78094709780618, "learning_rate": 0.0001, "loss": 1.1502, "loss/crossentropy": 2.4294402599334717, "loss/hidden": 0.99609375, "loss/logits": 0.13800190389156342, "loss/reg": 0.00160969328135252, "step": 2951 }, { "epoch": 0.369, "grad_norm": 2.243403434753418, "grad_norm_var": 31.791006379292472, "learning_rate": 0.0001, "loss": 1.2169, "loss/crossentropy": 2.216179370880127, "loss/hidden": 1.0390625, "loss/logits": 0.16170275211334229, "loss/reg": 0.0016088113188743591, "step": 2952 }, { "epoch": 0.369125, "grad_norm": 2.3184988498687744, "grad_norm_var": 31.88341674978521, "learning_rate": 0.0001, "loss": 1.1403, "loss/crossentropy": 2.370349645614624, "loss/hidden": 0.98046875, "loss/logits": 0.14376848936080933, "loss/reg": 0.0016078915214166045, "step": 2953 }, { "epoch": 0.36925, "grad_norm": 2.059495210647583, "grad_norm_var": 31.84534069205656, "learning_rate": 0.0001, "loss": 1.0407, "loss/crossentropy": 2.4124796390533447, "loss/hidden": 0.875, "loss/logits": 0.14962998032569885, "loss/reg": 0.0016069960547611117, "step": 2954 }, { "epoch": 0.369375, "grad_norm": 1.9678794145584106, "grad_norm_var": 31.981839552930616, "learning_rate": 0.0001, "loss": 0.9176, "loss/crossentropy": 2.5794992446899414, "loss/hidden": 0.78515625, "loss/logits": 0.11633378267288208, "loss/reg": 0.0016061411006376147, "step": 2955 }, { "epoch": 0.3695, "grad_norm": 1.9977718591690063, "grad_norm_var": 32.061490625600705, "learning_rate": 0.0001, "loss": 1.0702, "loss/crossentropy": 2.639310359954834, "loss/hidden": 0.92578125, "loss/logits": 0.12838619947433472, "loss/reg": 0.0016052514547482133, "step": 2956 }, { "epoch": 0.369625, "grad_norm": 2.021615982055664, "grad_norm_var": 32.14835401525327, "learning_rate": 0.0001, "loss": 0.9156, "loss/crossentropy": 2.420154094696045, "loss/hidden": 0.7734375, "loss/logits": 0.12614388763904572, "loss/reg": 0.0016044409712776542, "step": 2957 }, { "epoch": 0.36975, "grad_norm": 2.7315452098846436, "grad_norm_var": 32.19737149530218, "learning_rate": 0.0001, "loss": 1.2741, "loss/crossentropy": 2.5111894607543945, "loss/hidden": 1.09375, "loss/logits": 0.16434572637081146, "loss/reg": 0.0016035627340897918, "step": 2958 }, { "epoch": 0.369875, "grad_norm": 2.4453182220458984, "grad_norm_var": 32.15725155107872, "learning_rate": 0.0001, "loss": 1.1256, "loss/crossentropy": 2.4796969890594482, "loss/hidden": 0.94140625, "loss/logits": 0.16821438074111938, "loss/reg": 0.001602799049578607, "step": 2959 }, { "epoch": 0.37, "grad_norm": 4.0225749015808105, "grad_norm_var": 31.988221014436373, "learning_rate": 0.0001, "loss": 1.5174, "loss/crossentropy": 2.4007651805877686, "loss/hidden": 1.2578125, "loss/logits": 0.24357788264751434, "loss/reg": 0.001602031639777124, "step": 2960 }, { "epoch": 0.370125, "grad_norm": 2.115582227706909, "grad_norm_var": 0.6748591495437363, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.480705499649048, "loss/hidden": 0.93359375, "loss/logits": 0.15461376309394836, "loss/reg": 0.0016012740088626742, "step": 2961 }, { "epoch": 0.37025, "grad_norm": 2.1233468055725098, "grad_norm_var": 0.6516964803831929, "learning_rate": 0.0001, "loss": 1.2398, "loss/crossentropy": 2.3494679927825928, "loss/hidden": 1.0546875, "loss/logits": 0.16912665963172913, "loss/reg": 0.001600456889718771, "step": 2962 }, { "epoch": 0.370375, "grad_norm": 2.440214157104492, "grad_norm_var": 0.33253879293210675, "learning_rate": 0.0001, "loss": 1.2838, "loss/crossentropy": 2.5465586185455322, "loss/hidden": 1.09375, "loss/logits": 0.1740531474351883, "loss/reg": 0.001599667128175497, "step": 2963 }, { "epoch": 0.3705, "grad_norm": 2.3659937381744385, "grad_norm_var": 0.32523926632048655, "learning_rate": 0.0001, "loss": 1.233, "loss/crossentropy": 2.6333167552948, "loss/hidden": 1.03125, "loss/logits": 0.18575221300125122, "loss/reg": 0.0015987844672054052, "step": 2964 }, { "epoch": 0.370625, "grad_norm": 2.3069112300872803, "grad_norm_var": 0.31710156967325814, "learning_rate": 0.0001, "loss": 1.1422, "loss/crossentropy": 2.306267261505127, "loss/hidden": 0.97265625, "loss/logits": 0.15355217456817627, "loss/reg": 0.0015979325398802757, "step": 2965 }, { "epoch": 0.37075, "grad_norm": 2.2262582778930664, "grad_norm_var": 0.31741070097743107, "learning_rate": 0.0001, "loss": 1.1262, "loss/crossentropy": 2.929598808288574, "loss/hidden": 0.9453125, "loss/logits": 0.1649283617734909, "loss/reg": 0.0015971966786310077, "step": 2966 }, { "epoch": 0.370875, "grad_norm": 2.433411121368408, "grad_norm_var": 0.23751052805301712, "learning_rate": 0.0001, "loss": 1.0617, "loss/crossentropy": 2.3520545959472656, "loss/hidden": 0.8671875, "loss/logits": 0.17850735783576965, "loss/reg": 0.0015963308978825808, "step": 2967 }, { "epoch": 0.371, "grad_norm": 2.2656893730163574, "grad_norm_var": 0.23718399798644113, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.7874927520751953, "loss/hidden": 0.8984375, "loss/logits": 0.144947811961174, "loss/reg": 0.0015954799018800259, "step": 2968 }, { "epoch": 0.371125, "grad_norm": 2.143205165863037, "grad_norm_var": 0.2401944151894573, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.4601874351501465, "loss/hidden": 1.0, "loss/logits": 0.1718457043170929, "loss/reg": 0.001594609348103404, "step": 2969 }, { "epoch": 0.37125, "grad_norm": 101.12199401855469, "grad_norm_var": 609.6841218103087, "learning_rate": 0.0001, "loss": 1.0003, "loss/crossentropy": 2.685331106185913, "loss/hidden": 0.85546875, "loss/logits": 0.12886559963226318, "loss/reg": 0.0015937838470563293, "step": 2970 }, { "epoch": 0.371375, "grad_norm": 2.1937313079833984, "grad_norm_var": 609.4892316671171, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.5401968955993652, "loss/hidden": 0.91796875, "loss/logits": 0.15409636497497559, "loss/reg": 0.0015928788343444467, "step": 2971 }, { "epoch": 0.3715, "grad_norm": 2.069490432739258, "grad_norm_var": 609.4268048775009, "learning_rate": 0.0001, "loss": 1.0601, "loss/crossentropy": 2.6639468669891357, "loss/hidden": 0.890625, "loss/logits": 0.15356793999671936, "loss/reg": 0.0015921550802886486, "step": 2972 }, { "epoch": 0.371625, "grad_norm": 1.8838802576065063, "grad_norm_var": 609.5481432149076, "learning_rate": 0.0001, "loss": 1.0138, "loss/crossentropy": 2.777111530303955, "loss/hidden": 0.859375, "loss/logits": 0.13847094774246216, "loss/reg": 0.0015913340030238032, "step": 2973 }, { "epoch": 0.37175, "grad_norm": 2.222355842590332, "grad_norm_var": 609.9597521400962, "learning_rate": 0.0001, "loss": 1.1779, "loss/crossentropy": 2.4574196338653564, "loss/hidden": 1.0078125, "loss/logits": 0.1541793793439865, "loss/reg": 0.0015905852196738124, "step": 2974 }, { "epoch": 0.371875, "grad_norm": 3.110410213470459, "grad_norm_var": 609.4483702483357, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.4717628955841064, "loss/hidden": 1.1484375, "loss/logits": 0.2092895209789276, "loss/reg": 0.0015897807897999883, "step": 2975 }, { "epoch": 0.372, "grad_norm": 2.075378894805908, "grad_norm_var": 610.8647577141769, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.3535499572753906, "loss/hidden": 0.92578125, "loss/logits": 0.17022764682769775, "loss/reg": 0.0015889210626482964, "step": 2976 }, { "epoch": 0.372125, "grad_norm": 2.694697618484497, "grad_norm_var": 610.3970970762971, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.3849284648895264, "loss/hidden": 1.125, "loss/logits": 0.16903114318847656, "loss/reg": 0.0015881662257015705, "step": 2977 }, { "epoch": 0.37225, "grad_norm": 2.756809949874878, "grad_norm_var": 609.8852987322907, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.4221389293670654, "loss/hidden": 1.0625, "loss/logits": 0.17101766169071198, "loss/reg": 0.0015873081283643842, "step": 2978 }, { "epoch": 0.372375, "grad_norm": 2.5563724040985107, "grad_norm_var": 609.7919889884278, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.5010735988616943, "loss/hidden": 0.94921875, "loss/logits": 0.1731671690940857, "loss/reg": 0.0015863956650719047, "step": 2979 }, { "epoch": 0.3725, "grad_norm": 2.227283000946045, "grad_norm_var": 609.9071316392499, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.1768126487731934, "loss/hidden": 1.1015625, "loss/logits": 0.16901464760303497, "loss/reg": 0.0015855665551498532, "step": 2980 }, { "epoch": 0.372625, "grad_norm": 2.498760461807251, "grad_norm_var": 609.750553201018, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.5166473388671875, "loss/hidden": 1.0078125, "loss/logits": 0.1678485870361328, "loss/reg": 0.0015847445465624332, "step": 2981 }, { "epoch": 0.37275, "grad_norm": 2.0772311687469482, "grad_norm_var": 609.8771980518538, "learning_rate": 0.0001, "loss": 1.0817, "loss/crossentropy": 2.231273889541626, "loss/hidden": 0.9296875, "loss/logits": 0.136214941740036, "loss/reg": 0.0015838835388422012, "step": 2982 }, { "epoch": 0.372875, "grad_norm": 2.6740105152130127, "grad_norm_var": 609.6855373209037, "learning_rate": 0.0001, "loss": 1.063, "loss/crossentropy": 2.4945671558380127, "loss/hidden": 0.90625, "loss/logits": 0.14091381430625916, "loss/reg": 0.0015829374315217137, "step": 2983 }, { "epoch": 0.373, "grad_norm": 2.5390610694885254, "grad_norm_var": 609.4616687213457, "learning_rate": 0.0001, "loss": 1.1538, "loss/crossentropy": 2.4923410415649414, "loss/hidden": 0.95703125, "loss/logits": 0.1809024214744568, "loss/reg": 0.0015819830587133765, "step": 2984 }, { "epoch": 0.373125, "grad_norm": 3.0879065990448, "grad_norm_var": 608.7100947362329, "learning_rate": 0.0001, "loss": 1.4248, "loss/crossentropy": 2.189671277999878, "loss/hidden": 1.1875, "loss/logits": 0.22149384021759033, "loss/reg": 0.001581133808940649, "step": 2985 }, { "epoch": 0.37325, "grad_norm": 2.886491537094116, "grad_norm_var": 0.1442169564098255, "learning_rate": 0.0001, "loss": 1.2171, "loss/crossentropy": 2.674792766571045, "loss/hidden": 1.015625, "loss/logits": 0.1856343150138855, "loss/reg": 0.0015803020214661956, "step": 2986 }, { "epoch": 0.373375, "grad_norm": 2.543637752532959, "grad_norm_var": 0.1388812563285914, "learning_rate": 0.0001, "loss": 1.4142, "loss/crossentropy": 2.339143753051758, "loss/hidden": 1.15625, "loss/logits": 0.24211972951889038, "loss/reg": 0.0015794888604432344, "step": 2987 }, { "epoch": 0.3735, "grad_norm": 3.9583852291107178, "grad_norm_var": 0.2549661221847576, "learning_rate": 0.0001, "loss": 1.5804, "loss/crossentropy": 2.2760813236236572, "loss/hidden": 1.2578125, "loss/logits": 0.30683714151382446, "loss/reg": 0.0015787298325449228, "step": 2988 }, { "epoch": 0.373625, "grad_norm": 2.7684473991394043, "grad_norm_var": 0.2179887474925766, "learning_rate": 0.0001, "loss": 1.0643, "loss/crossentropy": 2.4442691802978516, "loss/hidden": 0.890625, "loss/logits": 0.15788023173809052, "loss/reg": 0.0015778717352077365, "step": 2989 }, { "epoch": 0.37375, "grad_norm": 2.5182454586029053, "grad_norm_var": 0.20590566584736886, "learning_rate": 0.0001, "loss": 1.0297, "loss/crossentropy": 2.571276903152466, "loss/hidden": 0.87109375, "loss/logits": 0.14284473657608032, "loss/reg": 0.0015770101454108953, "step": 2990 }, { "epoch": 0.373875, "grad_norm": 5.801669120788574, "grad_norm_var": 0.8109427403927829, "learning_rate": 0.0001, "loss": 1.3349, "loss/crossentropy": 2.3687305450439453, "loss/hidden": 1.125, "loss/logits": 0.1941242516040802, "loss/reg": 0.0015760763781145215, "step": 2991 }, { "epoch": 0.374, "grad_norm": 2.3394758701324463, "grad_norm_var": 0.7878835563494871, "learning_rate": 0.0001, "loss": 1.0343, "loss/crossentropy": 2.267819881439209, "loss/hidden": 0.875, "loss/logits": 0.1435031294822693, "loss/reg": 0.0015751310857012868, "step": 2992 }, { "epoch": 0.374125, "grad_norm": 2.188391923904419, "grad_norm_var": 0.8157751605964617, "learning_rate": 0.0001, "loss": 1.1372, "loss/crossentropy": 2.53462290763855, "loss/hidden": 0.96484375, "loss/logits": 0.15659068524837494, "loss/reg": 0.0015742757823318243, "step": 2993 }, { "epoch": 0.37425, "grad_norm": 2.3553049564361572, "grad_norm_var": 0.8302444226053066, "learning_rate": 0.0001, "loss": 1.0073, "loss/crossentropy": 2.5353333950042725, "loss/hidden": 0.84375, "loss/logits": 0.14783015847206116, "loss/reg": 0.001573301968164742, "step": 2994 }, { "epoch": 0.374375, "grad_norm": 2.7004446983337402, "grad_norm_var": 0.8265967836978528, "learning_rate": 0.0001, "loss": 1.1471, "loss/crossentropy": 2.113797187805176, "loss/hidden": 0.9765625, "loss/logits": 0.1547660231590271, "loss/reg": 0.0015724552795290947, "step": 2995 }, { "epoch": 0.3745, "grad_norm": 1.9959298372268677, "grad_norm_var": 0.8483119145250185, "learning_rate": 0.0001, "loss": 1.1828, "loss/crossentropy": 2.3147120475769043, "loss/hidden": 0.99609375, "loss/logits": 0.1709543764591217, "loss/reg": 0.0015716016059741378, "step": 2996 }, { "epoch": 0.374625, "grad_norm": 3.4956319332122803, "grad_norm_var": 0.8692737110242154, "learning_rate": 0.0001, "loss": 1.5228, "loss/crossentropy": 2.463284492492676, "loss/hidden": 1.2421875, "loss/logits": 0.2648746371269226, "loss/reg": 0.0015707622515037656, "step": 2997 }, { "epoch": 0.37475, "grad_norm": 3.698127508163452, "grad_norm_var": 0.862008801986047, "learning_rate": 0.0001, "loss": 1.4478, "loss/crossentropy": 2.6185503005981445, "loss/hidden": 1.2265625, "loss/logits": 0.20552438497543335, "loss/reg": 0.0015699454816058278, "step": 2998 }, { "epoch": 0.374875, "grad_norm": 15.842541694641113, "grad_norm_var": 11.177027989912206, "learning_rate": 0.0001, "loss": 1.4999, "loss/crossentropy": 2.2738571166992188, "loss/hidden": 1.296875, "loss/logits": 0.18733161687850952, "loss/reg": 0.0015690682921558619, "step": 2999 }, { "epoch": 0.375, "grad_norm": 2.518263816833496, "grad_norm_var": 11.180537646726131, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.7226152420043945, "loss/hidden": 0.9765625, "loss/logits": 0.1812414675951004, "loss/reg": 0.0015682190423831344, "step": 3000 }, { "epoch": 0.375125, "grad_norm": 2.7900922298431396, "grad_norm_var": 11.21410628355189, "learning_rate": 0.0001, "loss": 1.0817, "loss/crossentropy": 2.534475564956665, "loss/hidden": 0.90625, "loss/logits": 0.1598208248615265, "loss/reg": 0.0015673957532271743, "step": 3001 }, { "epoch": 0.37525, "grad_norm": 7.228313446044922, "grad_norm_var": 11.87791469719746, "learning_rate": 0.0001, "loss": 1.796, "loss/crossentropy": 2.214907646179199, "loss/hidden": 1.4765625, "loss/logits": 0.30380678176879883, "loss/reg": 0.0015665374230593443, "step": 3002 }, { "epoch": 0.375375, "grad_norm": 2.5155136585235596, "grad_norm_var": 11.883599427242675, "learning_rate": 0.0001, "loss": 1.2784, "loss/crossentropy": 2.4789834022521973, "loss/hidden": 1.046875, "loss/logits": 0.21583440899848938, "loss/reg": 0.001565686077810824, "step": 3003 }, { "epoch": 0.3755, "grad_norm": 2.8447794914245605, "grad_norm_var": 11.973918960212185, "learning_rate": 0.0001, "loss": 1.1762, "loss/crossentropy": 2.5373079776763916, "loss/hidden": 0.96875, "loss/logits": 0.19176262617111206, "loss/reg": 0.0015647506806999445, "step": 3004 }, { "epoch": 0.375625, "grad_norm": 3.3740930557250977, "grad_norm_var": 11.89940601052097, "learning_rate": 0.0001, "loss": 1.36, "loss/crossentropy": 2.5511627197265625, "loss/hidden": 1.125, "loss/logits": 0.21937960386276245, "loss/reg": 0.001563914818689227, "step": 3005 }, { "epoch": 0.37575, "grad_norm": 2.379018783569336, "grad_norm_var": 11.928364104778915, "learning_rate": 0.0001, "loss": 1.1503, "loss/crossentropy": 2.470290422439575, "loss/hidden": 0.953125, "loss/logits": 0.18156743049621582, "loss/reg": 0.0015630749985575676, "step": 3006 }, { "epoch": 0.375875, "grad_norm": 2.1470794677734375, "grad_norm_var": 11.887259366212739, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.4587337970733643, "loss/hidden": 0.92578125, "loss/logits": 0.1754848062992096, "loss/reg": 0.0015622559003531933, "step": 3007 }, { "epoch": 0.376, "grad_norm": 2.302612543106079, "grad_norm_var": 11.894404051417206, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.4225594997406006, "loss/hidden": 0.953125, "loss/logits": 0.1703597754240036, "loss/reg": 0.001561451586894691, "step": 3008 }, { "epoch": 0.376125, "grad_norm": 2.641914129257202, "grad_norm_var": 11.811407780620785, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.2518274784088135, "loss/hidden": 0.95703125, "loss/logits": 0.15651926398277283, "loss/reg": 0.0015605826629325747, "step": 3009 }, { "epoch": 0.37625, "grad_norm": 2.193613052368164, "grad_norm_var": 11.844227829796447, "learning_rate": 0.0001, "loss": 1.0712, "loss/crossentropy": 2.6040220260620117, "loss/hidden": 0.890625, "loss/logits": 0.16501112282276154, "loss/reg": 0.0015596725279465318, "step": 3010 }, { "epoch": 0.376375, "grad_norm": 2.4500036239624023, "grad_norm_var": 11.88458883451683, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.2987077236175537, "loss/hidden": 0.95703125, "loss/logits": 0.16559123992919922, "loss/reg": 0.0015588371315971017, "step": 3011 }, { "epoch": 0.3765, "grad_norm": 2.7540335655212402, "grad_norm_var": 11.740568881604887, "learning_rate": 0.0001, "loss": 1.2316, "loss/crossentropy": 2.379760503768921, "loss/hidden": 1.03125, "loss/logits": 0.18474510312080383, "loss/reg": 0.0015579448081552982, "step": 3012 }, { "epoch": 0.376625, "grad_norm": 2.359339952468872, "grad_norm_var": 11.870936710907415, "learning_rate": 0.0001, "loss": 1.1, "loss/crossentropy": 2.7584152221679688, "loss/hidden": 0.9140625, "loss/logits": 0.1703372746706009, "loss/reg": 0.0015571293188259006, "step": 3013 }, { "epoch": 0.37675, "grad_norm": 3.7690534591674805, "grad_norm_var": 11.870737317221051, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.5877668857574463, "loss/hidden": 1.2265625, "loss/logits": 0.2977341413497925, "loss/reg": 0.0015562785556539893, "step": 3014 }, { "epoch": 0.376875, "grad_norm": 4.473683834075928, "grad_norm_var": 1.628915341021788, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.8921680450439453, "loss/hidden": 1.0859375, "loss/logits": 0.19899588823318481, "loss/reg": 0.0015554350102320313, "step": 3015 }, { "epoch": 0.377, "grad_norm": 3.871870756149292, "grad_norm_var": 1.6481237567456437, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 2.523580551147461, "loss/hidden": 1.109375, "loss/logits": 0.2673671841621399, "loss/reg": 0.0015546170761808753, "step": 3016 }, { "epoch": 0.377125, "grad_norm": 2.5204830169677734, "grad_norm_var": 1.6649195310359515, "learning_rate": 0.0001, "loss": 0.9553, "loss/crossentropy": 2.714613676071167, "loss/hidden": 0.80859375, "loss/logits": 0.1311224400997162, "loss/reg": 0.0015538409352302551, "step": 3017 }, { "epoch": 0.37725, "grad_norm": 2.0336124897003174, "grad_norm_var": 0.5018540327532947, "learning_rate": 0.0001, "loss": 1.0784, "loss/crossentropy": 2.409799337387085, "loss/hidden": 0.88671875, "loss/logits": 0.17613857984542847, "loss/reg": 0.0015530973905697465, "step": 3018 }, { "epoch": 0.377375, "grad_norm": 2.578618049621582, "grad_norm_var": 0.4997983002459968, "learning_rate": 0.0001, "loss": 1.3189, "loss/crossentropy": 2.3623170852661133, "loss/hidden": 1.109375, "loss/logits": 0.19397228956222534, "loss/reg": 0.0015522442990913987, "step": 3019 }, { "epoch": 0.3775, "grad_norm": 2.1041927337646484, "grad_norm_var": 0.5290004883339177, "learning_rate": 0.0001, "loss": 0.992, "loss/crossentropy": 2.716660976409912, "loss/hidden": 0.84765625, "loss/logits": 0.12882237136363983, "loss/reg": 0.0015514519764110446, "step": 3020 }, { "epoch": 0.377625, "grad_norm": 2.764676094055176, "grad_norm_var": 0.501263692115526, "learning_rate": 0.0001, "loss": 1.2114, "loss/crossentropy": 2.7042715549468994, "loss/hidden": 1.0234375, "loss/logits": 0.1724938005208969, "loss/reg": 0.0015505721094086766, "step": 3021 }, { "epoch": 0.37775, "grad_norm": 3.030519485473633, "grad_norm_var": 0.49912867138020806, "learning_rate": 0.0001, "loss": 1.0809, "loss/crossentropy": 2.6933398246765137, "loss/hidden": 0.89453125, "loss/logits": 0.170840322971344, "loss/reg": 0.0015496540581807494, "step": 3022 }, { "epoch": 0.377875, "grad_norm": 4.19857931137085, "grad_norm_var": 0.5973307900586896, "learning_rate": 0.0001, "loss": 1.0373, "loss/crossentropy": 2.5517749786376953, "loss/hidden": 0.875, "loss/logits": 0.14676275849342346, "loss/reg": 0.0015487612690776587, "step": 3023 }, { "epoch": 0.378, "grad_norm": 3.8462929725646973, "grad_norm_var": 0.6278518968240253, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.3984336853027344, "loss/hidden": 1.078125, "loss/logits": 0.23710979521274567, "loss/reg": 0.0015478322748094797, "step": 3024 }, { "epoch": 0.378125, "grad_norm": 2.561694622039795, "grad_norm_var": 0.6318103997779642, "learning_rate": 0.0001, "loss": 1.1234, "loss/crossentropy": 2.3877100944519043, "loss/hidden": 0.9296875, "loss/logits": 0.17824707925319672, "loss/reg": 0.0015469109639525414, "step": 3025 }, { "epoch": 0.37825, "grad_norm": 2.4824273586273193, "grad_norm_var": 0.6071496270914916, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.0305521488189697, "loss/hidden": 1.234375, "loss/logits": 0.22519615292549133, "loss/reg": 0.0015460785944014788, "step": 3026 }, { "epoch": 0.378375, "grad_norm": 2.718799352645874, "grad_norm_var": 0.5924038173662739, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.6215500831604004, "loss/hidden": 1.0078125, "loss/logits": 0.20166423916816711, "loss/reg": 0.001545316306874156, "step": 3027 }, { "epoch": 0.3785, "grad_norm": 4.733144760131836, "grad_norm_var": 0.7711834288936773, "learning_rate": 0.0001, "loss": 1.8709, "loss/crossentropy": 2.4145333766937256, "loss/hidden": 1.53125, "loss/logits": 0.3242036998271942, "loss/reg": 0.0015445781173184514, "step": 3028 }, { "epoch": 0.378625, "grad_norm": 3.2688891887664795, "grad_norm_var": 0.7296781979737562, "learning_rate": 0.0001, "loss": 1.0755, "loss/crossentropy": 2.330563545227051, "loss/hidden": 0.921875, "loss/logits": 0.13817086815834045, "loss/reg": 0.0015437324764207006, "step": 3029 }, { "epoch": 0.37875, "grad_norm": 2.2654261589050293, "grad_norm_var": 0.753847589536597, "learning_rate": 0.0001, "loss": 1.161, "loss/crossentropy": 2.5420870780944824, "loss/hidden": 0.9609375, "loss/logits": 0.18459224700927734, "loss/reg": 0.0015429611084982753, "step": 3030 }, { "epoch": 0.378875, "grad_norm": 2.7828686237335205, "grad_norm_var": 0.6207675106699441, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.2896618843078613, "loss/hidden": 1.1171875, "loss/logits": 0.21609418094158173, "loss/reg": 0.0015421201242133975, "step": 3031 }, { "epoch": 0.379, "grad_norm": 2.534998893737793, "grad_norm_var": 0.5744081572068331, "learning_rate": 0.0001, "loss": 1.1259, "loss/crossentropy": 2.483560800552368, "loss/hidden": 0.9453125, "loss/logits": 0.1652083694934845, "loss/reg": 0.0015413160435855389, "step": 3032 }, { "epoch": 0.379125, "grad_norm": 2.6651241779327393, "grad_norm_var": 0.568366151024452, "learning_rate": 0.0001, "loss": 1.29, "loss/crossentropy": 2.5334088802337646, "loss/hidden": 1.0546875, "loss/logits": 0.21992579102516174, "loss/reg": 0.0015404759906232357, "step": 3033 }, { "epoch": 0.37925, "grad_norm": 2.3611483573913574, "grad_norm_var": 0.5367710983871838, "learning_rate": 0.0001, "loss": 1.0778, "loss/crossentropy": 2.5277013778686523, "loss/hidden": 0.90625, "loss/logits": 0.1561201959848404, "loss/reg": 0.0015396022936329246, "step": 3034 }, { "epoch": 0.379375, "grad_norm": 2.42582368850708, "grad_norm_var": 0.5454109434634156, "learning_rate": 0.0001, "loss": 1.4527, "loss/crossentropy": 2.2852399349212646, "loss/hidden": 1.203125, "loss/logits": 0.23422226309776306, "loss/reg": 0.0015387338353320956, "step": 3035 }, { "epoch": 0.3795, "grad_norm": 2.672703504562378, "grad_norm_var": 0.5036552900392195, "learning_rate": 0.0001, "loss": 1.1174, "loss/crossentropy": 2.3716447353363037, "loss/hidden": 0.9375, "loss/logits": 0.16453100740909576, "loss/reg": 0.0015379353426396847, "step": 3036 }, { "epoch": 0.379625, "grad_norm": 2.1379103660583496, "grad_norm_var": 0.5442855977535487, "learning_rate": 0.0001, "loss": 1.1733, "loss/crossentropy": 2.4636714458465576, "loss/hidden": 0.98046875, "loss/logits": 0.17747575044631958, "loss/reg": 0.0015370320761576295, "step": 3037 }, { "epoch": 0.37975, "grad_norm": 2.621626615524292, "grad_norm_var": 0.5485951090937569, "learning_rate": 0.0001, "loss": 1.2976, "loss/crossentropy": 2.4245622158050537, "loss/hidden": 1.0859375, "loss/logits": 0.19625593721866608, "loss/reg": 0.0015361856203526258, "step": 3038 }, { "epoch": 0.379875, "grad_norm": 3.692251443862915, "grad_norm_var": 0.4764334638501007, "learning_rate": 0.0001, "loss": 1.2877, "loss/crossentropy": 2.474142074584961, "loss/hidden": 1.09375, "loss/logits": 0.17863908410072327, "loss/reg": 0.0015353577909991145, "step": 3039 }, { "epoch": 0.38, "grad_norm": 2.7761356830596924, "grad_norm_var": 0.40737819549050847, "learning_rate": 0.0001, "loss": 1.0774, "loss/crossentropy": 2.4172182083129883, "loss/hidden": 0.92578125, "loss/logits": 0.13622896373271942, "loss/reg": 0.0015346118016168475, "step": 3040 }, { "epoch": 0.380125, "grad_norm": 2.1613762378692627, "grad_norm_var": 0.42978350412629046, "learning_rate": 0.0001, "loss": 1.1238, "loss/crossentropy": 2.5704824924468994, "loss/hidden": 0.953125, "loss/logits": 0.1553528606891632, "loss/reg": 0.0015337750082835555, "step": 3041 }, { "epoch": 0.38025, "grad_norm": 2.1340346336364746, "grad_norm_var": 0.45067186060471703, "learning_rate": 0.0001, "loss": 1.1085, "loss/crossentropy": 2.1345014572143555, "loss/hidden": 0.91015625, "loss/logits": 0.18302980065345764, "loss/reg": 0.0015329893212765455, "step": 3042 }, { "epoch": 0.380375, "grad_norm": 2.391342878341675, "grad_norm_var": 0.4586055732130525, "learning_rate": 0.0001, "loss": 1.1196, "loss/crossentropy": 2.64565372467041, "loss/hidden": 0.9453125, "loss/logits": 0.15901410579681396, "loss/reg": 0.0015322294784709811, "step": 3043 }, { "epoch": 0.3805, "grad_norm": 2.7479233741760254, "grad_norm_var": 0.17378667895695396, "learning_rate": 0.0001, "loss": 1.222, "loss/crossentropy": 2.491063356399536, "loss/hidden": 1.0625, "loss/logits": 0.14413951337337494, "loss/reg": 0.0015313579933717847, "step": 3044 }, { "epoch": 0.380625, "grad_norm": 2.6931698322296143, "grad_norm_var": 0.14334672689393693, "learning_rate": 0.0001, "loss": 1.2471, "loss/crossentropy": 2.6116511821746826, "loss/hidden": 1.0234375, "loss/logits": 0.20840109884738922, "loss/reg": 0.0015305442502722144, "step": 3045 }, { "epoch": 0.38075, "grad_norm": 3.008284568786621, "grad_norm_var": 0.1480167814838976, "learning_rate": 0.0001, "loss": 1.0525, "loss/crossentropy": 2.4817965030670166, "loss/hidden": 0.88671875, "loss/logits": 0.15045757591724396, "loss/reg": 0.0015296809142455459, "step": 3046 }, { "epoch": 0.380875, "grad_norm": 2.353818416595459, "grad_norm_var": 0.14979984783222544, "learning_rate": 0.0001, "loss": 1.2432, "loss/crossentropy": 2.719508647918701, "loss/hidden": 1.0234375, "loss/logits": 0.2044530212879181, "loss/reg": 0.0015289130387827754, "step": 3047 }, { "epoch": 0.381, "grad_norm": 2.1603620052337646, "grad_norm_var": 0.16112470586231378, "learning_rate": 0.0001, "loss": 1.1597, "loss/crossentropy": 2.54118275642395, "loss/hidden": 0.984375, "loss/logits": 0.16003243625164032, "loss/reg": 0.001528166583739221, "step": 3048 }, { "epoch": 0.381125, "grad_norm": 3.715346574783325, "grad_norm_var": 0.24440400782246433, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.47391676902771, "loss/hidden": 1.2578125, "loss/logits": 0.2690886855125427, "loss/reg": 0.001527426764369011, "step": 3049 }, { "epoch": 0.38125, "grad_norm": 3.198798656463623, "grad_norm_var": 0.2584171488901857, "learning_rate": 0.0001, "loss": 1.2011, "loss/crossentropy": 2.7698891162872314, "loss/hidden": 0.98828125, "loss/logits": 0.19752052426338196, "loss/reg": 0.0015266072005033493, "step": 3050 }, { "epoch": 0.381375, "grad_norm": 2.497174024581909, "grad_norm_var": 0.25631076689673954, "learning_rate": 0.0001, "loss": 1.2638, "loss/crossentropy": 2.385063648223877, "loss/hidden": 1.0703125, "loss/logits": 0.1782207489013672, "loss/reg": 0.001525795552879572, "step": 3051 }, { "epoch": 0.3815, "grad_norm": 3.043733596801758, "grad_norm_var": 0.26429942493089675, "learning_rate": 0.0001, "loss": 1.1767, "loss/crossentropy": 2.756899356842041, "loss/hidden": 0.98828125, "loss/logits": 0.17318682372570038, "loss/reg": 0.0015250993892550468, "step": 3052 }, { "epoch": 0.381625, "grad_norm": 2.4790260791778564, "grad_norm_var": 0.24562801518937458, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.632850170135498, "loss/hidden": 0.91796875, "loss/logits": 0.16589318215847015, "loss/reg": 0.001524452120065689, "step": 3053 }, { "epoch": 0.38175, "grad_norm": 2.8286497592926025, "grad_norm_var": 0.24532488984157225, "learning_rate": 0.0001, "loss": 1.3044, "loss/crossentropy": 2.5128822326660156, "loss/hidden": 1.0859375, "loss/logits": 0.20319077372550964, "loss/reg": 0.0015236367471516132, "step": 3054 }, { "epoch": 0.381875, "grad_norm": 5.666845798492432, "grad_norm_var": 0.7390401703679153, "learning_rate": 0.0001, "loss": 1.9423, "loss/crossentropy": 2.08412766456604, "loss/hidden": 1.6171875, "loss/logits": 0.3098413348197937, "loss/reg": 0.001522907754406333, "step": 3055 }, { "epoch": 0.382, "grad_norm": 2.4863898754119873, "grad_norm_var": 0.7477589712209045, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.336117744445801, "loss/hidden": 0.93359375, "loss/logits": 0.15220022201538086, "loss/reg": 0.0015220923814922571, "step": 3056 }, { "epoch": 0.382125, "grad_norm": 2.644345998764038, "grad_norm_var": 0.7181288436062988, "learning_rate": 0.0001, "loss": 1.2392, "loss/crossentropy": 2.5535202026367188, "loss/hidden": 1.046875, "loss/logits": 0.1770707219839096, "loss/reg": 0.0015212802682071924, "step": 3057 }, { "epoch": 0.38225, "grad_norm": 4.635331630706787, "grad_norm_var": 0.8610161754364751, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.743553876876831, "loss/hidden": 0.98046875, "loss/logits": 0.14201176166534424, "loss/reg": 0.0015204973751679063, "step": 3058 }, { "epoch": 0.382375, "grad_norm": 3.356191635131836, "grad_norm_var": 0.8364712967306929, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.347088098526001, "loss/hidden": 0.9453125, "loss/logits": 0.16744756698608398, "loss/reg": 0.001519753597676754, "step": 3059 }, { "epoch": 0.3825, "grad_norm": 2.8272480964660645, "grad_norm_var": 0.8331967177099528, "learning_rate": 0.0001, "loss": 1.2608, "loss/crossentropy": 2.868544340133667, "loss/hidden": 1.0390625, "loss/logits": 0.2065209299325943, "loss/reg": 0.0015189426485449076, "step": 3060 }, { "epoch": 0.382625, "grad_norm": 2.450787305831909, "grad_norm_var": 0.8500056796609101, "learning_rate": 0.0001, "loss": 1.1708, "loss/crossentropy": 2.6079471111297607, "loss/hidden": 0.984375, "loss/logits": 0.17119550704956055, "loss/reg": 0.0015181255294010043, "step": 3061 }, { "epoch": 0.38275, "grad_norm": 3.592229127883911, "grad_norm_var": 0.8653819290616579, "learning_rate": 0.0001, "loss": 1.2865, "loss/crossentropy": 3.110443353652954, "loss/hidden": 1.078125, "loss/logits": 0.19319577515125275, "loss/reg": 0.0015173177234828472, "step": 3062 }, { "epoch": 0.382875, "grad_norm": 2.410964012145996, "grad_norm_var": 0.8597404244600225, "learning_rate": 0.0001, "loss": 1.0393, "loss/crossentropy": 2.562082529067993, "loss/hidden": 0.8828125, "loss/logits": 0.14137160778045654, "loss/reg": 0.0015165103832259774, "step": 3063 }, { "epoch": 0.383, "grad_norm": 2.407580614089966, "grad_norm_var": 0.8317769249612977, "learning_rate": 0.0001, "loss": 1.3447, "loss/crossentropy": 2.3585827350616455, "loss/hidden": 1.109375, "loss/logits": 0.2201308310031891, "loss/reg": 0.0015156479785218835, "step": 3064 }, { "epoch": 0.383125, "grad_norm": 2.2852606773376465, "grad_norm_var": 0.8499001868856103, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.770510196685791, "loss/hidden": 0.88671875, "loss/logits": 0.12941858172416687, "loss/reg": 0.0015148305101320148, "step": 3065 }, { "epoch": 0.38325, "grad_norm": 2.1744890213012695, "grad_norm_var": 0.8952438191989435, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.7242465019226074, "loss/hidden": 0.93359375, "loss/logits": 0.17017105221748352, "loss/reg": 0.0015140252653509378, "step": 3066 }, { "epoch": 0.383375, "grad_norm": 2.1513309478759766, "grad_norm_var": 0.9252897605609308, "learning_rate": 0.0001, "loss": 1.2262, "loss/crossentropy": 2.3357694149017334, "loss/hidden": 1.0546875, "loss/logits": 0.15634681284427643, "loss/reg": 0.0015132210683077574, "step": 3067 }, { "epoch": 0.3835, "grad_norm": 1.9096907377243042, "grad_norm_var": 0.993766935801248, "learning_rate": 0.0001, "loss": 1.0732, "loss/crossentropy": 2.3828632831573486, "loss/hidden": 0.90625, "loss/logits": 0.1517866849899292, "loss/reg": 0.001512432238087058, "step": 3068 }, { "epoch": 0.383625, "grad_norm": 2.3809525966644287, "grad_norm_var": 0.9997964078305192, "learning_rate": 0.0001, "loss": 1.0295, "loss/crossentropy": 2.3541440963745117, "loss/hidden": 0.87109375, "loss/logits": 0.14329306781291962, "loss/reg": 0.001511548412963748, "step": 3069 }, { "epoch": 0.38375, "grad_norm": 2.148785352706909, "grad_norm_var": 1.0340665297704172, "learning_rate": 0.0001, "loss": 1.202, "loss/crossentropy": 2.5628278255462646, "loss/hidden": 0.98828125, "loss/logits": 0.1986294984817505, "loss/reg": 0.0015106116188690066, "step": 3070 }, { "epoch": 0.383875, "grad_norm": 4.334015369415283, "grad_norm_var": 0.643715138142222, "learning_rate": 0.0001, "loss": 1.3672, "loss/crossentropy": 2.490190267562866, "loss/hidden": 1.1171875, "loss/logits": 0.23495829105377197, "loss/reg": 0.00150971207767725, "step": 3071 }, { "epoch": 0.384, "grad_norm": 3.414167881011963, "grad_norm_var": 0.6633916090945272, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.4356698989868164, "loss/hidden": 1.09375, "loss/logits": 0.2021765112876892, "loss/reg": 0.0015088983345776796, "step": 3072 }, { "epoch": 0.384125, "grad_norm": 3.4008517265319824, "grad_norm_var": 0.6814213970762945, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.515183925628662, "loss/hidden": 1.0703125, "loss/logits": 0.20480088889598846, "loss/reg": 0.0015079693403095007, "step": 3073 }, { "epoch": 0.38425, "grad_norm": 3.8095407485961914, "grad_norm_var": 0.5293933112138726, "learning_rate": 0.0001, "loss": 1.2624, "loss/crossentropy": 2.220061779022217, "loss/hidden": 1.0703125, "loss/logits": 0.17705152928829193, "loss/reg": 0.0015070406952872872, "step": 3074 }, { "epoch": 0.384375, "grad_norm": 2.9314463138580322, "grad_norm_var": 0.510069556795633, "learning_rate": 0.0001, "loss": 1.1556, "loss/crossentropy": 2.3562121391296387, "loss/hidden": 0.99609375, "loss/logits": 0.1444169580936432, "loss/reg": 0.0015061016893014312, "step": 3075 }, { "epoch": 0.3845, "grad_norm": 2.5193705558776855, "grad_norm_var": 0.5144374476381837, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.4433231353759766, "loss/hidden": 0.94140625, "loss/logits": 0.1511809378862381, "loss/reg": 0.0015051388181746006, "step": 3076 }, { "epoch": 0.384625, "grad_norm": 2.7669591903686523, "grad_norm_var": 0.5072245737466228, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.429995059967041, "loss/hidden": 1.125, "loss/logits": 0.22534441947937012, "loss/reg": 0.0015043155290186405, "step": 3077 }, { "epoch": 0.38475, "grad_norm": 3.1742730140686035, "grad_norm_var": 0.4734280839971527, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.3575611114501953, "loss/hidden": 1.1953125, "loss/logits": 0.2173648476600647, "loss/reg": 0.0015033509116619825, "step": 3078 }, { "epoch": 0.384875, "grad_norm": 2.3889479637145996, "grad_norm_var": 0.47449391299533134, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.50315260887146, "loss/hidden": 0.91796875, "loss/logits": 0.17120787501335144, "loss/reg": 0.0015025264583528042, "step": 3079 }, { "epoch": 0.385, "grad_norm": 2.424584150314331, "grad_norm_var": 0.47370766291071403, "learning_rate": 0.0001, "loss": 1.2204, "loss/crossentropy": 2.5353128910064697, "loss/hidden": 1.015625, "loss/logits": 0.18976464867591858, "loss/reg": 0.001501591526903212, "step": 3080 }, { "epoch": 0.385125, "grad_norm": 3.177478790283203, "grad_norm_var": 0.46657839732932155, "learning_rate": 0.0001, "loss": 1.1312, "loss/crossentropy": 3.065169334411621, "loss/hidden": 0.94140625, "loss/logits": 0.17475619912147522, "loss/reg": 0.0015007583424448967, "step": 3081 }, { "epoch": 0.38525, "grad_norm": 14.621908187866211, "grad_norm_var": 9.080253009998954, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.552370071411133, "loss/hidden": 0.96875, "loss/logits": 0.20840778946876526, "loss/reg": 0.0014999582199379802, "step": 3082 }, { "epoch": 0.385375, "grad_norm": 3.3133881092071533, "grad_norm_var": 8.940635912645899, "learning_rate": 0.0001, "loss": 1.3834, "loss/crossentropy": 2.3733625411987305, "loss/hidden": 1.1875, "loss/logits": 0.1808752417564392, "loss/reg": 0.001499146455898881, "step": 3083 }, { "epoch": 0.3855, "grad_norm": 3.1891136169433594, "grad_norm_var": 8.742691736673235, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.8844857215881348, "loss/hidden": 0.93359375, "loss/logits": 0.16465333104133606, "loss/reg": 0.0014985809102654457, "step": 3084 }, { "epoch": 0.385625, "grad_norm": 9.44871711730957, "grad_norm_var": 10.574873745149773, "learning_rate": 0.0001, "loss": 1.868, "loss/crossentropy": 3.0805418491363525, "loss/hidden": 1.4765625, "loss/logits": 0.3764764964580536, "loss/reg": 0.0014977955725044012, "step": 3085 }, { "epoch": 0.38575, "grad_norm": 3.5079286098480225, "grad_norm_var": 10.320154351297106, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.7522993087768555, "loss/hidden": 0.984375, "loss/logits": 0.1713697910308838, "loss/reg": 0.0014972144272178411, "step": 3086 }, { "epoch": 0.385875, "grad_norm": 2.882699728012085, "grad_norm_var": 10.440653614856666, "learning_rate": 0.0001, "loss": 1.2373, "loss/crossentropy": 2.194499969482422, "loss/hidden": 1.03125, "loss/logits": 0.1910915970802307, "loss/reg": 0.0014964212896302342, "step": 3087 }, { "epoch": 0.386, "grad_norm": 2.7966065406799316, "grad_norm_var": 10.52802001592425, "learning_rate": 0.0001, "loss": 1.3247, "loss/crossentropy": 2.546497106552124, "loss/hidden": 1.0703125, "loss/logits": 0.23945873975753784, "loss/reg": 0.0014958065003156662, "step": 3088 }, { "epoch": 0.386125, "grad_norm": 3.1555607318878174, "grad_norm_var": 10.55618733110625, "learning_rate": 0.0001, "loss": 1.227, "loss/crossentropy": 2.564608573913574, "loss/hidden": 1.0390625, "loss/logits": 0.17295242846012115, "loss/reg": 0.001495162257924676, "step": 3089 }, { "epoch": 0.38625, "grad_norm": 2.817620277404785, "grad_norm_var": 10.66029992309476, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.1066861152648926, "loss/hidden": 1.140625, "loss/logits": 0.20065559446811676, "loss/reg": 0.0014944219728931785, "step": 3090 }, { "epoch": 0.386375, "grad_norm": 2.651130437850952, "grad_norm_var": 10.707757005998351, "learning_rate": 0.0001, "loss": 1.109, "loss/crossentropy": 2.4121179580688477, "loss/hidden": 0.9375, "loss/logits": 0.15655362606048584, "loss/reg": 0.0014937702799215913, "step": 3091 }, { "epoch": 0.3865, "grad_norm": 11.054521560668945, "grad_norm_var": 13.516339088021812, "learning_rate": 0.0001, "loss": 1.488, "loss/crossentropy": 2.598538637161255, "loss/hidden": 1.25, "loss/logits": 0.223049595952034, "loss/reg": 0.0014930102042853832, "step": 3092 }, { "epoch": 0.386625, "grad_norm": 2.9682095050811768, "grad_norm_var": 13.4700670896951, "learning_rate": 0.0001, "loss": 1.4144, "loss/crossentropy": 2.2097835540771484, "loss/hidden": 1.203125, "loss/logits": 0.19634860754013062, "loss/reg": 0.001492318813689053, "step": 3093 }, { "epoch": 0.38675, "grad_norm": 5.174125671386719, "grad_norm_var": 13.3403195626377, "learning_rate": 0.0001, "loss": 1.2711, "loss/crossentropy": 2.8175737857818604, "loss/hidden": 1.09375, "loss/logits": 0.16242022812366486, "loss/reg": 0.001491627423092723, "step": 3094 }, { "epoch": 0.386875, "grad_norm": 2.8117153644561768, "grad_norm_var": 13.219906183272222, "learning_rate": 0.0001, "loss": 1.4947, "loss/crossentropy": 2.2445030212402344, "loss/hidden": 1.25, "loss/logits": 0.22975078225135803, "loss/reg": 0.0014908172888681293, "step": 3095 }, { "epoch": 0.387, "grad_norm": 7.696689128875732, "grad_norm_var": 13.322660622818619, "learning_rate": 0.0001, "loss": 2.3456, "loss/crossentropy": 2.569875717163086, "loss/hidden": 1.8203125, "loss/logits": 0.5104068517684937, "loss/reg": 0.001490112510509789, "step": 3096 }, { "epoch": 0.387125, "grad_norm": 2.285222291946411, "grad_norm_var": 13.598662894045217, "learning_rate": 0.0001, "loss": 1.0935, "loss/crossentropy": 2.6675126552581787, "loss/hidden": 0.921875, "loss/logits": 0.1567239761352539, "loss/reg": 0.0014894246123731136, "step": 3097 }, { "epoch": 0.38725, "grad_norm": 2.4583311080932617, "grad_norm_var": 7.278818348836731, "learning_rate": 0.0001, "loss": 1.1561, "loss/crossentropy": 2.4110171794891357, "loss/hidden": 0.98046875, "loss/logits": 0.16075652837753296, "loss/reg": 0.0014888017904013395, "step": 3098 }, { "epoch": 0.387375, "grad_norm": 2.9501612186431885, "grad_norm_var": 7.3330649886207, "learning_rate": 0.0001, "loss": 1.0332, "loss/crossentropy": 2.566474676132202, "loss/hidden": 0.890625, "loss/logits": 0.12770453095436096, "loss/reg": 0.0014881796669214964, "step": 3099 }, { "epoch": 0.3875, "grad_norm": 2.3767528533935547, "grad_norm_var": 7.4881936759913055, "learning_rate": 0.0001, "loss": 1.1161, "loss/crossentropy": 2.421116828918457, "loss/hidden": 0.93359375, "loss/logits": 0.16760317981243134, "loss/reg": 0.0014875682536512613, "step": 3100 }, { "epoch": 0.387625, "grad_norm": 2.4083352088928223, "grad_norm_var": 5.649444377071855, "learning_rate": 0.0001, "loss": 1.1276, "loss/crossentropy": 2.4182422161102295, "loss/hidden": 0.95703125, "loss/logits": 0.155739888548851, "loss/reg": 0.0014867965364828706, "step": 3101 }, { "epoch": 0.38775, "grad_norm": 6.1076765060424805, "grad_norm_var": 5.988047604643861, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.437669038772583, "loss/hidden": 1.0546875, "loss/logits": 0.2159654200077057, "loss/reg": 0.0014861947856843472, "step": 3102 }, { "epoch": 0.387875, "grad_norm": 2.1656200885772705, "grad_norm_var": 6.118617408020804, "learning_rate": 0.0001, "loss": 1.0569, "loss/crossentropy": 2.6133086681365967, "loss/hidden": 0.89453125, "loss/logits": 0.14749613404273987, "loss/reg": 0.001485634595155716, "step": 3103 }, { "epoch": 0.388, "grad_norm": 3.968498468399048, "grad_norm_var": 6.03713786793108, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.7312111854553223, "loss/hidden": 1.2109375, "loss/logits": 0.206155925989151, "loss/reg": 0.0014850635780021548, "step": 3104 }, { "epoch": 0.388125, "grad_norm": 2.6761622428894043, "grad_norm_var": 6.101683630569884, "learning_rate": 0.0001, "loss": 1.1513, "loss/crossentropy": 2.423086404800415, "loss/hidden": 0.94140625, "loss/logits": 0.19508376717567444, "loss/reg": 0.0014844181714579463, "step": 3105 }, { "epoch": 0.38825, "grad_norm": 2.9287075996398926, "grad_norm_var": 6.086264994833298, "learning_rate": 0.0001, "loss": 1.5205, "loss/crossentropy": 2.1396305561065674, "loss/hidden": 1.265625, "loss/logits": 0.2400842308998108, "loss/reg": 0.0014838469214737415, "step": 3106 }, { "epoch": 0.388375, "grad_norm": 2.5311455726623535, "grad_norm_var": 6.107425984338238, "learning_rate": 0.0001, "loss": 1.1222, "loss/crossentropy": 2.6279356479644775, "loss/hidden": 0.93359375, "loss/logits": 0.17378053069114685, "loss/reg": 0.0014832711312919855, "step": 3107 }, { "epoch": 0.3885, "grad_norm": 2.268536329269409, "grad_norm_var": 2.5626044620740966, "learning_rate": 0.0001, "loss": 1.0872, "loss/crossentropy": 2.6690196990966797, "loss/hidden": 0.921875, "loss/logits": 0.1505495309829712, "loss/reg": 0.0014825089601799846, "step": 3108 }, { "epoch": 0.388625, "grad_norm": 2.389401912689209, "grad_norm_var": 2.613855916177321, "learning_rate": 0.0001, "loss": 1.2056, "loss/crossentropy": 2.7793209552764893, "loss/hidden": 1.03125, "loss/logits": 0.1595500409603119, "loss/reg": 0.0014817919582128525, "step": 3109 }, { "epoch": 0.38875, "grad_norm": 2.7481424808502197, "grad_norm_var": 2.3835076953272334, "learning_rate": 0.0001, "loss": 1.1872, "loss/crossentropy": 2.7962143421173096, "loss/hidden": 0.9921875, "loss/logits": 0.18024000525474548, "loss/reg": 0.001481186947785318, "step": 3110 }, { "epoch": 0.388875, "grad_norm": 5.344142436981201, "grad_norm_var": 2.6622762228428494, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.7534737586975098, "loss/hidden": 1.1015625, "loss/logits": 0.3165036737918854, "loss/reg": 0.0014806733233854175, "step": 3111 }, { "epoch": 0.389, "grad_norm": 3.1115097999572754, "grad_norm_var": 1.3075599379443466, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.13659405708313, "loss/hidden": 1.0625, "loss/logits": 0.15669596195220947, "loss/reg": 0.0014798784395679832, "step": 3112 }, { "epoch": 0.389125, "grad_norm": 9.919644355773926, "grad_norm_var": 4.177045013974175, "learning_rate": 0.0001, "loss": 1.4659, "loss/crossentropy": 2.933748483657837, "loss/hidden": 1.2578125, "loss/logits": 0.193283811211586, "loss/reg": 0.0014791582943871617, "step": 3113 }, { "epoch": 0.38925, "grad_norm": 2.749711513519287, "grad_norm_var": 4.14102525442575, "learning_rate": 0.0001, "loss": 1.2804, "loss/crossentropy": 2.389108419418335, "loss/hidden": 1.0859375, "loss/logits": 0.17970852553844452, "loss/reg": 0.001478366320952773, "step": 3114 }, { "epoch": 0.389375, "grad_norm": 3.4161529541015625, "grad_norm_var": 4.1179329133067375, "learning_rate": 0.0001, "loss": 1.1962, "loss/crossentropy": 2.7660036087036133, "loss/hidden": 1.015625, "loss/logits": 0.16577987372875214, "loss/reg": 0.0014776282478123903, "step": 3115 }, { "epoch": 0.3895, "grad_norm": 2.560009002685547, "grad_norm_var": 4.090890912743164, "learning_rate": 0.0001, "loss": 1.191, "loss/crossentropy": 2.275609254837036, "loss/hidden": 1.015625, "loss/logits": 0.1606125682592392, "loss/reg": 0.0014768131077289581, "step": 3116 }, { "epoch": 0.389625, "grad_norm": 2.8846094608306885, "grad_norm_var": 4.030610562554117, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.4561069011688232, "loss/hidden": 1.15625, "loss/logits": 0.21473157405853271, "loss/reg": 0.0014760474441573024, "step": 3117 }, { "epoch": 0.38975, "grad_norm": 3.6733250617980957, "grad_norm_var": 3.59048960874516, "learning_rate": 0.0001, "loss": 1.4934, "loss/crossentropy": 2.8224945068359375, "loss/hidden": 1.21875, "loss/logits": 0.2599002718925476, "loss/reg": 0.0014752430142834783, "step": 3118 }, { "epoch": 0.389875, "grad_norm": 2.62074613571167, "grad_norm_var": 3.5249819798108697, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.4233219623565674, "loss/hidden": 0.96875, "loss/logits": 0.1542065590620041, "loss/reg": 0.0014744291547685862, "step": 3119 }, { "epoch": 0.39, "grad_norm": 2.906020402908325, "grad_norm_var": 3.5273110674137835, "learning_rate": 0.0001, "loss": 1.3397, "loss/crossentropy": 2.2614736557006836, "loss/hidden": 1.140625, "loss/logits": 0.1843045949935913, "loss/reg": 0.001473503652960062, "step": 3120 }, { "epoch": 0.390125, "grad_norm": 2.4611387252807617, "grad_norm_var": 3.5515407196989437, "learning_rate": 0.0001, "loss": 1.1345, "loss/crossentropy": 2.298067092895508, "loss/hidden": 0.96484375, "loss/logits": 0.15495604276657104, "loss/reg": 0.0014726603403687477, "step": 3121 }, { "epoch": 0.39025, "grad_norm": 2.513363838195801, "grad_norm_var": 3.5888133239814994, "learning_rate": 0.0001, "loss": 1.2774, "loss/crossentropy": 2.527592420578003, "loss/hidden": 1.0859375, "loss/logits": 0.17678681015968323, "loss/reg": 0.0014717563753947616, "step": 3122 }, { "epoch": 0.390375, "grad_norm": 2.5627973079681396, "grad_norm_var": 3.585288934166859, "learning_rate": 0.0001, "loss": 1.1294, "loss/crossentropy": 2.3707189559936523, "loss/hidden": 0.953125, "loss/logits": 0.16152185201644897, "loss/reg": 0.001470955554395914, "step": 3123 }, { "epoch": 0.3905, "grad_norm": 2.708827495574951, "grad_norm_var": 3.531975226355064, "learning_rate": 0.0001, "loss": 1.2338, "loss/crossentropy": 2.5239439010620117, "loss/hidden": 1.046875, "loss/logits": 0.17219334840774536, "loss/reg": 0.0014701104955747724, "step": 3124 }, { "epoch": 0.390625, "grad_norm": 2.8166301250457764, "grad_norm_var": 3.4852118918475976, "learning_rate": 0.0001, "loss": 1.1936, "loss/crossentropy": 2.1727406978607178, "loss/hidden": 1.03125, "loss/logits": 0.14763377606868744, "loss/reg": 0.0014692150289192796, "step": 3125 }, { "epoch": 0.39075, "grad_norm": 2.2763895988464355, "grad_norm_var": 3.5424694748527665, "learning_rate": 0.0001, "loss": 1.2945, "loss/crossentropy": 2.0448904037475586, "loss/hidden": 1.109375, "loss/logits": 0.17043252289295197, "loss/reg": 0.0014683061745017767, "step": 3126 }, { "epoch": 0.390875, "grad_norm": 3.1606974601745605, "grad_norm_var": 3.2767183209605784, "learning_rate": 0.0001, "loss": 1.1201, "loss/crossentropy": 2.52594256401062, "loss/hidden": 0.94921875, "loss/logits": 0.15616539120674133, "loss/reg": 0.0014673679834231734, "step": 3127 }, { "epoch": 0.391, "grad_norm": 2.4146230220794678, "grad_norm_var": 3.3219234339669335, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.668795347213745, "loss/hidden": 1.0234375, "loss/logits": 0.17497006058692932, "loss/reg": 0.001466380781494081, "step": 3128 }, { "epoch": 0.391125, "grad_norm": 2.407149314880371, "grad_norm_var": 0.14627217968341952, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.4945483207702637, "loss/hidden": 0.9375, "loss/logits": 0.1458122581243515, "loss/reg": 0.0014655799604952335, "step": 3129 }, { "epoch": 0.39125, "grad_norm": 2.4407310485839844, "grad_norm_var": 0.15259124394272944, "learning_rate": 0.0001, "loss": 1.4106, "loss/crossentropy": 2.413118362426758, "loss/hidden": 1.15625, "loss/logits": 0.23969539999961853, "loss/reg": 0.0014646538766101003, "step": 3130 }, { "epoch": 0.391375, "grad_norm": 7.806640148162842, "grad_norm_var": 1.7537979087511737, "learning_rate": 0.0001, "loss": 1.2182, "loss/crossentropy": 2.9865965843200684, "loss/hidden": 1.015625, "loss/logits": 0.18794524669647217, "loss/reg": 0.0014638546854257584, "step": 3131 }, { "epoch": 0.3915, "grad_norm": 2.66379451751709, "grad_norm_var": 1.7481976745923549, "learning_rate": 0.0001, "loss": 1.1974, "loss/crossentropy": 2.652742624282837, "loss/hidden": 1.0, "loss/logits": 0.1827564835548401, "loss/reg": 0.001463059219531715, "step": 3132 }, { "epoch": 0.391625, "grad_norm": 3.1615896224975586, "grad_norm_var": 1.747998292551649, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.4887144565582275, "loss/hidden": 1.125, "loss/logits": 0.20517219603061676, "loss/reg": 0.0014622474554926157, "step": 3133 }, { "epoch": 0.39175, "grad_norm": 3.9157252311706543, "grad_norm_var": 1.772231721120654, "learning_rate": 0.0001, "loss": 1.3063, "loss/crossentropy": 3.1372129917144775, "loss/hidden": 1.125, "loss/logits": 0.16667796671390533, "loss/reg": 0.0014614835381507874, "step": 3134 }, { "epoch": 0.391875, "grad_norm": 3.8285632133483887, "grad_norm_var": 1.7939090408593368, "learning_rate": 0.0001, "loss": 1.4048, "loss/crossentropy": 2.5189216136932373, "loss/hidden": 1.1953125, "loss/logits": 0.19492308795452118, "loss/reg": 0.0014607240445911884, "step": 3135 }, { "epoch": 0.392, "grad_norm": 2.7343673706054688, "grad_norm_var": 1.8008263038379062, "learning_rate": 0.0001, "loss": 1.1929, "loss/crossentropy": 2.7494773864746094, "loss/hidden": 1.0, "loss/logits": 0.17826081812381744, "loss/reg": 0.0014599841088056564, "step": 3136 }, { "epoch": 0.392125, "grad_norm": 3.400426149368286, "grad_norm_var": 1.7738205947233296, "learning_rate": 0.0001, "loss": 1.4573, "loss/crossentropy": 2.624523639678955, "loss/hidden": 1.1953125, "loss/logits": 0.24741919338703156, "loss/reg": 0.0014591823564842343, "step": 3137 }, { "epoch": 0.39225, "grad_norm": 3.1725072860717773, "grad_norm_var": 1.7427589090922528, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.6066739559173584, "loss/hidden": 0.953125, "loss/logits": 0.17858457565307617, "loss/reg": 0.0014583441661670804, "step": 3138 }, { "epoch": 0.392375, "grad_norm": 2.4614920616149902, "grad_norm_var": 1.7522364296762132, "learning_rate": 0.0001, "loss": 1.1504, "loss/crossentropy": 2.642681360244751, "loss/hidden": 0.97265625, "loss/logits": 0.16314703226089478, "loss/reg": 0.0014575106324627995, "step": 3139 }, { "epoch": 0.3925, "grad_norm": 2.0810978412628174, "grad_norm_var": 1.8188641058094466, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.34047269821167, "loss/hidden": 0.94921875, "loss/logits": 0.16931387782096863, "loss/reg": 0.0014566424069926143, "step": 3140 }, { "epoch": 0.392625, "grad_norm": 3.454364061355591, "grad_norm_var": 1.814116508869558, "learning_rate": 0.0001, "loss": 1.3945, "loss/crossentropy": 2.15335750579834, "loss/hidden": 1.171875, "loss/logits": 0.208018496632576, "loss/reg": 0.0014556582318618894, "step": 3141 }, { "epoch": 0.39275, "grad_norm": 2.7836453914642334, "grad_norm_var": 1.7669691714811522, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.437479257583618, "loss/hidden": 1.140625, "loss/logits": 0.21140936017036438, "loss/reg": 0.0014548501931130886, "step": 3142 }, { "epoch": 0.392875, "grad_norm": 3.633197069168091, "grad_norm_var": 1.7757399166903336, "learning_rate": 0.0001, "loss": 1.4194, "loss/crossentropy": 1.9959356784820557, "loss/hidden": 1.21875, "loss/logits": 0.1861596554517746, "loss/reg": 0.0014538905816152692, "step": 3143 }, { "epoch": 0.393, "grad_norm": 2.4751851558685303, "grad_norm_var": 1.769041881466553, "learning_rate": 0.0001, "loss": 1.1164, "loss/crossentropy": 2.4542863368988037, "loss/hidden": 0.9375, "loss/logits": 0.16434696316719055, "loss/reg": 0.001452919328585267, "step": 3144 }, { "epoch": 0.393125, "grad_norm": 5.5718889236450195, "grad_norm_var": 2.0282725761306506, "learning_rate": 0.0001, "loss": 1.4256, "loss/crossentropy": 3.0863025188446045, "loss/hidden": 1.15625, "loss/logits": 0.2548605501651764, "loss/reg": 0.0014519832329824567, "step": 3145 }, { "epoch": 0.39325, "grad_norm": 2.7773547172546387, "grad_norm_var": 1.9889750145998657, "learning_rate": 0.0001, "loss": 1.2371, "loss/crossentropy": 2.5975542068481445, "loss/hidden": 1.0234375, "loss/logits": 0.19917550683021545, "loss/reg": 0.0014511628542095423, "step": 3146 }, { "epoch": 0.393375, "grad_norm": 2.4508161544799805, "grad_norm_var": 0.7028754799269327, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.5284509658813477, "loss/hidden": 1.015625, "loss/logits": 0.17634466290473938, "loss/reg": 0.0014502240810543299, "step": 3147 }, { "epoch": 0.3935, "grad_norm": 2.2145609855651855, "grad_norm_var": 0.7452327886363113, "learning_rate": 0.0001, "loss": 1.2459, "loss/crossentropy": 2.3353018760681152, "loss/hidden": 1.0625, "loss/logits": 0.1688942015171051, "loss/reg": 0.001449417439289391, "step": 3148 }, { "epoch": 0.393625, "grad_norm": 2.7859535217285156, "grad_norm_var": 0.7525846696295692, "learning_rate": 0.0001, "loss": 1.2268, "loss/crossentropy": 2.333801507949829, "loss/hidden": 1.0390625, "loss/logits": 0.17329159379005432, "loss/reg": 0.0014485394349321723, "step": 3149 }, { "epoch": 0.39375, "grad_norm": 3.0437726974487305, "grad_norm_var": 0.7062926038039667, "learning_rate": 0.0001, "loss": 1.2126, "loss/crossentropy": 2.686184883117676, "loss/hidden": 1.0078125, "loss/logits": 0.1902828961610794, "loss/reg": 0.0014476029900833964, "step": 3150 }, { "epoch": 0.393875, "grad_norm": 2.394775390625, "grad_norm_var": 0.6867642924687706, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.4768013954162598, "loss/hidden": 0.90234375, "loss/logits": 0.14247344434261322, "loss/reg": 0.0014468033332377672, "step": 3151 }, { "epoch": 0.394, "grad_norm": 4.5347089767456055, "grad_norm_var": 0.834047766771126, "learning_rate": 0.0001, "loss": 1.6866, "loss/crossentropy": 2.4635729789733887, "loss/hidden": 1.4140625, "loss/logits": 0.2580716609954834, "loss/reg": 0.0014460093807429075, "step": 3152 }, { "epoch": 0.394125, "grad_norm": 3.0870273113250732, "grad_norm_var": 0.8266813774333125, "learning_rate": 0.0001, "loss": 1.1713, "loss/crossentropy": 2.800184965133667, "loss/hidden": 0.98046875, "loss/logits": 0.17638829350471497, "loss/reg": 0.0014451082097366452, "step": 3153 }, { "epoch": 0.39425, "grad_norm": 2.362227201461792, "grad_norm_var": 0.8553067605167236, "learning_rate": 0.0001, "loss": 1.1421, "loss/crossentropy": 2.5312883853912354, "loss/hidden": 0.96875, "loss/logits": 0.15886293351650238, "loss/reg": 0.00144416862167418, "step": 3154 }, { "epoch": 0.394375, "grad_norm": 2.428152561187744, "grad_norm_var": 0.8578011776883876, "learning_rate": 0.0001, "loss": 1.0405, "loss/crossentropy": 2.6899750232696533, "loss/hidden": 0.8828125, "loss/logits": 0.1432238668203354, "loss/reg": 0.001443324494175613, "step": 3155 }, { "epoch": 0.3945, "grad_norm": 3.214587450027466, "grad_norm_var": 0.7984819785149433, "learning_rate": 0.0001, "loss": 1.3027, "loss/crossentropy": 2.361611843109131, "loss/hidden": 1.1171875, "loss/logits": 0.17112301290035248, "loss/reg": 0.0014425143599510193, "step": 3156 }, { "epoch": 0.394625, "grad_norm": 4.067144393920898, "grad_norm_var": 0.852883901052644, "learning_rate": 0.0001, "loss": 1.3531, "loss/crossentropy": 2.7262966632843018, "loss/hidden": 1.1484375, "loss/logits": 0.1902923882007599, "loss/reg": 0.0014416533522307873, "step": 3157 }, { "epoch": 0.39475, "grad_norm": 2.263038158416748, "grad_norm_var": 0.8927590566943476, "learning_rate": 0.0001, "loss": 1.2932, "loss/crossentropy": 2.373117446899414, "loss/hidden": 1.0859375, "loss/logits": 0.19289630651474, "loss/reg": 0.0014407344860956073, "step": 3158 }, { "epoch": 0.394875, "grad_norm": 3.4677956104278564, "grad_norm_var": 0.8823025811608599, "learning_rate": 0.0001, "loss": 1.1267, "loss/crossentropy": 2.0556046962738037, "loss/hidden": 0.98828125, "loss/logits": 0.12401382625102997, "loss/reg": 0.00143993750680238, "step": 3159 }, { "epoch": 0.395, "grad_norm": 2.820385456085205, "grad_norm_var": 0.8623182900709015, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.042773962020874, "loss/hidden": 1.1484375, "loss/logits": 0.2052696943283081, "loss/reg": 0.00143915053922683, "step": 3160 }, { "epoch": 0.395125, "grad_norm": 5.711848258972168, "grad_norm_var": 0.9098061756362303, "learning_rate": 0.0001, "loss": 1.5792, "loss/crossentropy": 2.2495386600494385, "loss/hidden": 1.3828125, "loss/logits": 0.182022362947464, "loss/reg": 0.001438311068341136, "step": 3161 }, { "epoch": 0.39525, "grad_norm": 2.1175920963287354, "grad_norm_var": 0.9655269392484909, "learning_rate": 0.0001, "loss": 1.1051, "loss/crossentropy": 2.3756344318389893, "loss/hidden": 0.94140625, "loss/logits": 0.1493438482284546, "loss/reg": 0.0014374576276168227, "step": 3162 }, { "epoch": 0.395375, "grad_norm": 4.076714038848877, "grad_norm_var": 0.9986262418690328, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.7908565998077393, "loss/hidden": 1.0390625, "loss/logits": 0.19854992628097534, "loss/reg": 0.0014365814859047532, "step": 3163 }, { "epoch": 0.3955, "grad_norm": 2.3469817638397217, "grad_norm_var": 0.9829960077186305, "learning_rate": 0.0001, "loss": 1.2075, "loss/crossentropy": 2.239279270172119, "loss/hidden": 1.015625, "loss/logits": 0.1775517463684082, "loss/reg": 0.0014357875334098935, "step": 3164 }, { "epoch": 0.395625, "grad_norm": 2.405308246612549, "grad_norm_var": 1.011551661315956, "learning_rate": 0.0001, "loss": 1.4074, "loss/crossentropy": 2.2752726078033447, "loss/hidden": 1.171875, "loss/logits": 0.2212238609790802, "loss/reg": 0.0014349615667015314, "step": 3165 }, { "epoch": 0.39575, "grad_norm": 3.0401687622070312, "grad_norm_var": 1.011601777818645, "learning_rate": 0.0001, "loss": 1.2845, "loss/crossentropy": 2.6117618083953857, "loss/hidden": 1.109375, "loss/logits": 0.16074272990226746, "loss/reg": 0.0014341834466904402, "step": 3166 }, { "epoch": 0.395875, "grad_norm": 3.0013606548309326, "grad_norm_var": 0.9738283994240208, "learning_rate": 0.0001, "loss": 1.1672, "loss/crossentropy": 2.5564990043640137, "loss/hidden": 0.96484375, "loss/logits": 0.18803799152374268, "loss/reg": 0.0014334300067275763, "step": 3167 }, { "epoch": 0.396, "grad_norm": 2.6962780952453613, "grad_norm_var": 0.8539922575708732, "learning_rate": 0.0001, "loss": 1.3026, "loss/crossentropy": 2.3556156158447266, "loss/hidden": 1.0859375, "loss/logits": 0.20230239629745483, "loss/reg": 0.0014327148674055934, "step": 3168 }, { "epoch": 0.396125, "grad_norm": 3.0202796459198, "grad_norm_var": 0.8541117250838755, "learning_rate": 0.0001, "loss": 1.2347, "loss/crossentropy": 2.730318069458008, "loss/hidden": 1.0390625, "loss/logits": 0.1812940239906311, "loss/reg": 0.0014319290639832616, "step": 3169 }, { "epoch": 0.39625, "grad_norm": 2.9215188026428223, "grad_norm_var": 0.821255486710181, "learning_rate": 0.0001, "loss": 1.4779, "loss/crossentropy": 2.267758846282959, "loss/hidden": 1.2265625, "loss/logits": 0.2370465248823166, "loss/reg": 0.0014311603736132383, "step": 3170 }, { "epoch": 0.396375, "grad_norm": 2.4590396881103516, "grad_norm_var": 0.8185484720966466, "learning_rate": 0.0001, "loss": 1.0491, "loss/crossentropy": 2.444979667663574, "loss/hidden": 0.8828125, "loss/logits": 0.1519661545753479, "loss/reg": 0.0014304454671218991, "step": 3171 }, { "epoch": 0.3965, "grad_norm": 2.5662736892700195, "grad_norm_var": 0.8350750440580098, "learning_rate": 0.0001, "loss": 1.2082, "loss/crossentropy": 2.2979326248168945, "loss/hidden": 1.015625, "loss/logits": 0.17823505401611328, "loss/reg": 0.0014297092566266656, "step": 3172 }, { "epoch": 0.396625, "grad_norm": 2.6666481494903564, "grad_norm_var": 0.7698485524567218, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.86714243888855, "loss/hidden": 1.0859375, "loss/logits": 0.1917625516653061, "loss/reg": 0.001429002615623176, "step": 3173 }, { "epoch": 0.39675, "grad_norm": 2.703878164291382, "grad_norm_var": 0.7402155791768201, "learning_rate": 0.0001, "loss": 1.2042, "loss/crossentropy": 2.486057758331299, "loss/hidden": 1.015625, "loss/logits": 0.17433515191078186, "loss/reg": 0.001428212970495224, "step": 3174 }, { "epoch": 0.396875, "grad_norm": 2.5013811588287354, "grad_norm_var": 0.7384877086043424, "learning_rate": 0.0001, "loss": 1.0721, "loss/crossentropy": 2.5718910694122314, "loss/hidden": 0.9140625, "loss/logits": 0.14380885660648346, "loss/reg": 0.0014274645363911986, "step": 3175 }, { "epoch": 0.397, "grad_norm": 2.624527931213379, "grad_norm_var": 0.7440344276748133, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.5388550758361816, "loss/hidden": 0.97265625, "loss/logits": 0.18247374892234802, "loss/reg": 0.001426673261448741, "step": 3176 }, { "epoch": 0.397125, "grad_norm": 3.423884630203247, "grad_norm_var": 0.22218718379999322, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.3247339725494385, "loss/hidden": 1.15625, "loss/logits": 0.19609692692756653, "loss/reg": 0.0014259201707318425, "step": 3177 }, { "epoch": 0.39725, "grad_norm": 2.284940481185913, "grad_norm_var": 0.20902906966691623, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.5605931282043457, "loss/hidden": 0.87109375, "loss/logits": 0.12322554737329483, "loss/reg": 0.0014251739485189319, "step": 3178 }, { "epoch": 0.397375, "grad_norm": 2.3071844577789307, "grad_norm_var": 0.10261001984224905, "learning_rate": 0.0001, "loss": 0.9849, "loss/crossentropy": 2.571467399597168, "loss/hidden": 0.83203125, "loss/logits": 0.13859574496746063, "loss/reg": 0.0014243986224755645, "step": 3179 }, { "epoch": 0.3975, "grad_norm": 2.4365358352661133, "grad_norm_var": 0.09906793947369105, "learning_rate": 0.0001, "loss": 1.2161, "loss/crossentropy": 2.202028512954712, "loss/hidden": 1.03125, "loss/logits": 0.1705704927444458, "loss/reg": 0.001423671841621399, "step": 3180 }, { "epoch": 0.397625, "grad_norm": 2.1442601680755615, "grad_norm_var": 0.113277954657629, "learning_rate": 0.0001, "loss": 1.167, "loss/crossentropy": 2.603797197341919, "loss/hidden": 0.98828125, "loss/logits": 0.16453075408935547, "loss/reg": 0.0014228628715500236, "step": 3181 }, { "epoch": 0.39775, "grad_norm": 2.8968381881713867, "grad_norm_var": 0.10758108919777835, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.477893829345703, "loss/hidden": 1.0859375, "loss/logits": 0.17551052570343018, "loss/reg": 0.0014220853336155415, "step": 3182 }, { "epoch": 0.397875, "grad_norm": 5.751259803771973, "grad_norm_var": 0.7031905536524656, "learning_rate": 0.0001, "loss": 1.3406, "loss/crossentropy": 2.397312879562378, "loss/hidden": 1.125, "loss/logits": 0.2013690322637558, "loss/reg": 0.0014213289832696319, "step": 3183 }, { "epoch": 0.398, "grad_norm": 3.8826839923858643, "grad_norm_var": 0.7687767016001618, "learning_rate": 0.0001, "loss": 1.0771, "loss/crossentropy": 2.7769365310668945, "loss/hidden": 0.90625, "loss/logits": 0.1566927134990692, "loss/reg": 0.0014205975458025932, "step": 3184 }, { "epoch": 0.398125, "grad_norm": 2.416727066040039, "grad_norm_var": 0.7828259209253815, "learning_rate": 0.0001, "loss": 1.1545, "loss/crossentropy": 2.3300576210021973, "loss/hidden": 0.984375, "loss/logits": 0.1559705287218094, "loss/reg": 0.001419825479388237, "step": 3185 }, { "epoch": 0.39825, "grad_norm": 2.5174431800842285, "grad_norm_var": 0.7904826439406368, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.386939525604248, "loss/hidden": 1.1796875, "loss/logits": 0.23074419796466827, "loss/reg": 0.0014190953224897385, "step": 3186 }, { "epoch": 0.398375, "grad_norm": 2.987406015396118, "grad_norm_var": 0.7804607494272752, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.3375484943389893, "loss/hidden": 0.984375, "loss/logits": 0.16335271298885345, "loss/reg": 0.0014183121966198087, "step": 3187 }, { "epoch": 0.3985, "grad_norm": 2.861743211746216, "grad_norm_var": 0.7734791186347887, "learning_rate": 0.0001, "loss": 1.1959, "loss/crossentropy": 2.5404655933380127, "loss/hidden": 1.015625, "loss/logits": 0.166068434715271, "loss/reg": 0.0014175003161653876, "step": 3188 }, { "epoch": 0.398625, "grad_norm": 2.9863171577453613, "grad_norm_var": 0.769900278957188, "learning_rate": 0.0001, "loss": 1.3029, "loss/crossentropy": 2.5004663467407227, "loss/hidden": 1.0546875, "loss/logits": 0.23406264185905457, "loss/reg": 0.0014166788896545768, "step": 3189 }, { "epoch": 0.39875, "grad_norm": 2.615281581878662, "grad_norm_var": 0.7729490609937396, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.6250874996185303, "loss/hidden": 0.91796875, "loss/logits": 0.147823303937912, "loss/reg": 0.0014159217244014144, "step": 3190 }, { "epoch": 0.398875, "grad_norm": 2.487459182739258, "grad_norm_var": 0.7737287764789355, "learning_rate": 0.0001, "loss": 1.1606, "loss/crossentropy": 2.6391091346740723, "loss/hidden": 0.984375, "loss/logits": 0.16211755573749542, "loss/reg": 0.0014151090290397406, "step": 3191 }, { "epoch": 0.399, "grad_norm": 3.7378759384155273, "grad_norm_var": 0.8082246033348002, "learning_rate": 0.0001, "loss": 1.0812, "loss/crossentropy": 2.8277475833892822, "loss/hidden": 0.91015625, "loss/logits": 0.15691182017326355, "loss/reg": 0.0014143340522423387, "step": 3192 }, { "epoch": 0.399125, "grad_norm": 6.60544490814209, "grad_norm_var": 1.6276358579546695, "learning_rate": 0.0001, "loss": 1.4461, "loss/crossentropy": 2.543706178665161, "loss/hidden": 1.1796875, "loss/logits": 0.2522701919078827, "loss/reg": 0.0014135376550257206, "step": 3193 }, { "epoch": 0.39925, "grad_norm": 2.0717575550079346, "grad_norm_var": 1.6559878110693025, "learning_rate": 0.0001, "loss": 0.9997, "loss/crossentropy": 2.3166720867156982, "loss/hidden": 0.84765625, "loss/logits": 0.13788923621177673, "loss/reg": 0.0014127305475994945, "step": 3194 }, { "epoch": 0.399375, "grad_norm": 2.4677398204803467, "grad_norm_var": 1.6391467554153905, "learning_rate": 0.0001, "loss": 1.1378, "loss/crossentropy": 2.7695729732513428, "loss/hidden": 0.9609375, "loss/logits": 0.16273443400859833, "loss/reg": 0.0014120152918621898, "step": 3195 }, { "epoch": 0.3995, "grad_norm": 2.941117763519287, "grad_norm_var": 1.60509657548088, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.4052300453186035, "loss/hidden": 0.89453125, "loss/logits": 0.14548833668231964, "loss/reg": 0.0014113292563706636, "step": 3196 }, { "epoch": 0.399625, "grad_norm": 2.4621706008911133, "grad_norm_var": 1.566208540500228, "learning_rate": 0.0001, "loss": 0.9971, "loss/crossentropy": 2.4397029876708984, "loss/hidden": 0.8515625, "loss/logits": 0.1314249485731125, "loss/reg": 0.0014106096932664514, "step": 3197 }, { "epoch": 0.39975, "grad_norm": 2.0559136867523193, "grad_norm_var": 1.6478257904535734, "learning_rate": 0.0001, "loss": 1.1571, "loss/crossentropy": 2.4714083671569824, "loss/hidden": 0.97265625, "loss/logits": 0.1703430712223053, "loss/reg": 0.001409909687936306, "step": 3198 }, { "epoch": 0.399875, "grad_norm": 2.6498966217041016, "grad_norm_var": 1.1849062029293758, "learning_rate": 0.0001, "loss": 1.1006, "loss/crossentropy": 2.4940390586853027, "loss/hidden": 0.91796875, "loss/logits": 0.16856229305267334, "loss/reg": 0.0014092193450778723, "step": 3199 }, { "epoch": 0.4, "grad_norm": 2.321859121322632, "grad_norm_var": 1.1501807232574037, "learning_rate": 0.0001, "loss": 1.2111, "loss/crossentropy": 2.5656702518463135, "loss/hidden": 1.015625, "loss/logits": 0.18143799901008606, "loss/reg": 0.001408591284416616, "step": 3200 }, { "epoch": 0.400125, "grad_norm": 2.438978672027588, "grad_norm_var": 1.1488175095258526, "learning_rate": 0.0001, "loss": 1.1829, "loss/crossentropy": 2.6898128986358643, "loss/hidden": 0.984375, "loss/logits": 0.18446823954582214, "loss/reg": 0.001407959614880383, "step": 3201 }, { "epoch": 0.40025, "grad_norm": 2.7628965377807617, "grad_norm_var": 1.1404548850874447, "learning_rate": 0.0001, "loss": 1.2485, "loss/crossentropy": 2.524129629135132, "loss/hidden": 1.0625, "loss/logits": 0.17190766334533691, "loss/reg": 0.0014071909245103598, "step": 3202 }, { "epoch": 0.400375, "grad_norm": 13.3856840133667, "grad_norm_var": 8.014732454338711, "learning_rate": 0.0001, "loss": 1.3688, "loss/crossentropy": 2.2739052772521973, "loss/hidden": 1.203125, "loss/logits": 0.15163373947143555, "loss/reg": 0.0014065294526517391, "step": 3203 }, { "epoch": 0.4005, "grad_norm": 2.3786189556121826, "grad_norm_var": 8.07386556250397, "learning_rate": 0.0001, "loss": 1.1228, "loss/crossentropy": 2.581334114074707, "loss/hidden": 0.94921875, "loss/logits": 0.15947729349136353, "loss/reg": 0.0014059359673410654, "step": 3204 }, { "epoch": 0.400625, "grad_norm": 2.10800838470459, "grad_norm_var": 8.184936880726184, "learning_rate": 0.0001, "loss": 1.129, "loss/crossentropy": 2.4888153076171875, "loss/hidden": 0.95703125, "loss/logits": 0.15787412226200104, "loss/reg": 0.0014051540056243539, "step": 3205 }, { "epoch": 0.40075, "grad_norm": 3.1674044132232666, "grad_norm_var": 8.141202877900437, "learning_rate": 0.0001, "loss": 1.1436, "loss/crossentropy": 2.95302677154541, "loss/hidden": 0.96875, "loss/logits": 0.16084638237953186, "loss/reg": 0.0014045133721083403, "step": 3206 }, { "epoch": 0.400875, "grad_norm": 3.2646758556365967, "grad_norm_var": 8.07375113018638, "learning_rate": 0.0001, "loss": 1.3706, "loss/crossentropy": 2.407227039337158, "loss/hidden": 1.125, "loss/logits": 0.23157890141010284, "loss/reg": 0.0014038854278624058, "step": 3207 }, { "epoch": 0.401, "grad_norm": 3.8222179412841797, "grad_norm_var": 8.076294419135172, "learning_rate": 0.0001, "loss": 1.2553, "loss/crossentropy": 2.2204642295837402, "loss/hidden": 1.0703125, "loss/logits": 0.17094703018665314, "loss/reg": 0.0014031098689883947, "step": 3208 }, { "epoch": 0.401125, "grad_norm": 2.6836647987365723, "grad_norm_var": 7.443273915593101, "learning_rate": 0.0001, "loss": 1.0676, "loss/crossentropy": 2.28102707862854, "loss/hidden": 0.9140625, "loss/logits": 0.13949863612651825, "loss/reg": 0.0014023492112755775, "step": 3209 }, { "epoch": 0.40125, "grad_norm": 3.3956005573272705, "grad_norm_var": 7.333994411807278, "learning_rate": 0.0001, "loss": 1.3605, "loss/crossentropy": 2.397531032562256, "loss/hidden": 1.1484375, "loss/logits": 0.19800767302513123, "loss/reg": 0.0014015804044902325, "step": 3210 }, { "epoch": 0.401375, "grad_norm": 2.7750182151794434, "grad_norm_var": 7.301940095918295, "learning_rate": 0.0001, "loss": 1.2998, "loss/crossentropy": 2.638395309448242, "loss/hidden": 1.1015625, "loss/logits": 0.18423208594322205, "loss/reg": 0.0014008664293214679, "step": 3211 }, { "epoch": 0.4015, "grad_norm": 3.0714621543884277, "grad_norm_var": 7.294794769575021, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.462658166885376, "loss/hidden": 1.15625, "loss/logits": 0.1872762143611908, "loss/reg": 0.0014000808587297797, "step": 3212 }, { "epoch": 0.401625, "grad_norm": 10.589031219482422, "grad_norm_var": 10.383144954125903, "learning_rate": 0.0001, "loss": 1.2212, "loss/crossentropy": 2.293719530105591, "loss/hidden": 1.0703125, "loss/logits": 0.13690148293972015, "loss/reg": 0.0013993012253195047, "step": 3213 }, { "epoch": 0.40175, "grad_norm": 2.9793620109558105, "grad_norm_var": 10.205762461006318, "learning_rate": 0.0001, "loss": 1.3911, "loss/crossentropy": 2.2331912517547607, "loss/hidden": 1.1875, "loss/logits": 0.18957272171974182, "loss/reg": 0.0013985136756673455, "step": 3214 }, { "epoch": 0.401875, "grad_norm": 2.6328084468841553, "grad_norm_var": 10.208827537708206, "learning_rate": 0.0001, "loss": 1.2285, "loss/crossentropy": 2.5185325145721436, "loss/hidden": 1.03125, "loss/logits": 0.183321014046669, "loss/reg": 0.0013977786293253303, "step": 3215 }, { "epoch": 0.402, "grad_norm": 2.4675610065460205, "grad_norm_var": 10.17782365635681, "learning_rate": 0.0001, "loss": 1.1234, "loss/crossentropy": 2.8550357818603516, "loss/hidden": 0.94140625, "loss/logits": 0.16804927587509155, "loss/reg": 0.0013970347354188561, "step": 3216 }, { "epoch": 0.402125, "grad_norm": 2.1422646045684814, "grad_norm_var": 10.244892632020703, "learning_rate": 0.0001, "loss": 1.042, "loss/crossentropy": 2.425354242324829, "loss/hidden": 0.88671875, "loss/logits": 0.14126884937286377, "loss/reg": 0.0013962731463834643, "step": 3217 }, { "epoch": 0.40225, "grad_norm": 3.2200849056243896, "grad_norm_var": 10.1839683892281, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.221250057220459, "loss/hidden": 1.15625, "loss/logits": 0.19119367003440857, "loss/reg": 0.0013955157482996583, "step": 3218 }, { "epoch": 0.402375, "grad_norm": 2.171010971069336, "grad_norm_var": 4.018007610627952, "learning_rate": 0.0001, "loss": 0.9664, "loss/crossentropy": 2.376636028289795, "loss/hidden": 0.8203125, "loss/logits": 0.13211877644062042, "loss/reg": 0.0013947088737040758, "step": 3219 }, { "epoch": 0.4025, "grad_norm": 2.8398897647857666, "grad_norm_var": 3.9743738518735037, "learning_rate": 0.0001, "loss": 1.2872, "loss/crossentropy": 2.305521249771118, "loss/hidden": 1.109375, "loss/logits": 0.16386672854423523, "loss/reg": 0.0013939387863501906, "step": 3220 }, { "epoch": 0.402625, "grad_norm": 2.270127058029175, "grad_norm_var": 3.949534513690692, "learning_rate": 0.0001, "loss": 1.169, "loss/crossentropy": 2.6820387840270996, "loss/hidden": 0.98046875, "loss/logits": 0.17459151148796082, "loss/reg": 0.0013931809226050973, "step": 3221 }, { "epoch": 0.40275, "grad_norm": 2.979616641998291, "grad_norm_var": 3.956141703727499, "learning_rate": 0.0001, "loss": 1.3056, "loss/crossentropy": 2.7731282711029053, "loss/hidden": 1.078125, "loss/logits": 0.21350443363189697, "loss/reg": 0.0013924350496381521, "step": 3222 }, { "epoch": 0.402875, "grad_norm": 5.7830963134765625, "grad_norm_var": 4.330097150928092, "learning_rate": 0.0001, "loss": 1.4978, "loss/crossentropy": 3.1430695056915283, "loss/hidden": 1.2734375, "loss/logits": 0.210455060005188, "loss/reg": 0.001391703262925148, "step": 3223 }, { "epoch": 0.403, "grad_norm": 4.043674945831299, "grad_norm_var": 4.3430036614332, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.6030006408691406, "loss/hidden": 1.0859375, "loss/logits": 0.21246591210365295, "loss/reg": 0.0013909507542848587, "step": 3224 }, { "epoch": 0.403125, "grad_norm": 2.4082164764404297, "grad_norm_var": 4.377828361564604, "learning_rate": 0.0001, "loss": 1.1097, "loss/crossentropy": 2.314228057861328, "loss/hidden": 0.94921875, "loss/logits": 0.1465829312801361, "loss/reg": 0.0013902006903663278, "step": 3225 }, { "epoch": 0.40325, "grad_norm": 2.7299933433532715, "grad_norm_var": 4.4135008617863525, "learning_rate": 0.0001, "loss": 1.3991, "loss/crossentropy": 2.3191654682159424, "loss/hidden": 1.171875, "loss/logits": 0.21328136324882507, "loss/reg": 0.001389437704347074, "step": 3226 }, { "epoch": 0.403375, "grad_norm": 40.410335540771484, "grad_norm_var": 89.5828365804228, "learning_rate": 0.0001, "loss": 1.2587, "loss/crossentropy": 2.544874906539917, "loss/hidden": 1.0703125, "loss/logits": 0.1744803637266159, "loss/reg": 0.0013887349050492048, "step": 3227 }, { "epoch": 0.4035, "grad_norm": 3.98893404006958, "grad_norm_var": 89.30213527874257, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.4807851314544678, "loss/hidden": 1.0546875, "loss/logits": 0.18736805021762848, "loss/reg": 0.0013880165060982108, "step": 3228 }, { "epoch": 0.403625, "grad_norm": 2.497218608856201, "grad_norm_var": 88.28527106284602, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.413098096847534, "loss/hidden": 1.03125, "loss/logits": 0.1938888132572174, "loss/reg": 0.0013872504932805896, "step": 3229 }, { "epoch": 0.40375, "grad_norm": 3.349843978881836, "grad_norm_var": 88.17685634493883, "learning_rate": 0.0001, "loss": 1.2128, "loss/crossentropy": 2.519489049911499, "loss/hidden": 1.015625, "loss/logits": 0.1833306849002838, "loss/reg": 0.001386479940265417, "step": 3230 }, { "epoch": 0.403875, "grad_norm": 5.28583288192749, "grad_norm_var": 87.64819572170924, "learning_rate": 0.0001, "loss": 1.238, "loss/crossentropy": 2.8373186588287354, "loss/hidden": 1.046875, "loss/logits": 0.17727670073509216, "loss/reg": 0.0013857838930562139, "step": 3231 }, { "epoch": 0.404, "grad_norm": 2.4495253562927246, "grad_norm_var": 87.65559664964157, "learning_rate": 0.0001, "loss": 1.1464, "loss/crossentropy": 2.691544771194458, "loss/hidden": 0.953125, "loss/logits": 0.1794293224811554, "loss/reg": 0.0013850934337824583, "step": 3232 }, { "epoch": 0.404125, "grad_norm": 2.6242549419403076, "grad_norm_var": 87.45204207171957, "learning_rate": 0.0001, "loss": 1.1498, "loss/crossentropy": 2.396329879760742, "loss/hidden": 0.98828125, "loss/logits": 0.14770615100860596, "loss/reg": 0.001384427072480321, "step": 3233 }, { "epoch": 0.40425, "grad_norm": 4.392383575439453, "grad_norm_var": 87.17129551926975, "learning_rate": 0.0001, "loss": 1.1637, "loss/crossentropy": 2.6313347816467285, "loss/hidden": 1.0, "loss/logits": 0.1498868763446808, "loss/reg": 0.0013838107697665691, "step": 3234 }, { "epoch": 0.404375, "grad_norm": 2.036073923110962, "grad_norm_var": 87.23482816205849, "learning_rate": 0.0001, "loss": 1.0718, "loss/crossentropy": 2.7083024978637695, "loss/hidden": 0.91015625, "loss/logits": 0.14784078299999237, "loss/reg": 0.0013832391705363989, "step": 3235 }, { "epoch": 0.4045, "grad_norm": 3.0264666080474854, "grad_norm_var": 87.16758049615274, "learning_rate": 0.0001, "loss": 1.4289, "loss/crossentropy": 2.4732184410095215, "loss/hidden": 1.2109375, "loss/logits": 0.20415030419826508, "loss/reg": 0.001382635091431439, "step": 3236 }, { "epoch": 0.404625, "grad_norm": 3.108293294906616, "grad_norm_var": 86.8346377158096, "learning_rate": 0.0001, "loss": 1.22, "loss/crossentropy": 2.749412775039673, "loss/hidden": 1.0234375, "loss/logits": 0.1827469766139984, "loss/reg": 0.0013819299638271332, "step": 3237 }, { "epoch": 0.40475, "grad_norm": 2.819408893585205, "grad_norm_var": 86.8942369371502, "learning_rate": 0.0001, "loss": 1.1665, "loss/crossentropy": 2.53940486907959, "loss/hidden": 0.984375, "loss/logits": 0.168300598859787, "loss/reg": 0.0013813339173793793, "step": 3238 }, { "epoch": 0.404875, "grad_norm": 2.528777599334717, "grad_norm_var": 87.51340909552454, "learning_rate": 0.0001, "loss": 1.0908, "loss/crossentropy": 2.5331435203552246, "loss/hidden": 0.94140625, "loss/logits": 0.1356305629014969, "loss/reg": 0.0013805809430778027, "step": 3239 }, { "epoch": 0.405, "grad_norm": 3.0166585445404053, "grad_norm_var": 87.77618029567331, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.4555776119232178, "loss/hidden": 1.046875, "loss/logits": 0.1782711148262024, "loss/reg": 0.0013798880390822887, "step": 3240 }, { "epoch": 0.405125, "grad_norm": 2.7607667446136475, "grad_norm_var": 87.64251489533696, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.1808271408081055, "loss/hidden": 1.078125, "loss/logits": 0.2039627730846405, "loss/reg": 0.00137915404047817, "step": 3241 }, { "epoch": 0.40525, "grad_norm": 2.8349597454071045, "grad_norm_var": 87.60528888767317, "learning_rate": 0.0001, "loss": 1.0765, "loss/crossentropy": 2.526918649673462, "loss/hidden": 0.91796875, "loss/logits": 0.14476650953292847, "loss/reg": 0.0013784021139144897, "step": 3242 }, { "epoch": 0.405375, "grad_norm": 2.6388018131256104, "grad_norm_var": 0.6838202944423747, "learning_rate": 0.0001, "loss": 1.3661, "loss/crossentropy": 2.5510435104370117, "loss/hidden": 1.140625, "loss/logits": 0.21170908212661743, "loss/reg": 0.0013776673004031181, "step": 3243 }, { "epoch": 0.4055, "grad_norm": 2.87103271484375, "grad_norm_var": 0.6271754503477344, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.5166921615600586, "loss/hidden": 1.0859375, "loss/logits": 0.19591458141803741, "loss/reg": 0.0013769444776698947, "step": 3244 }, { "epoch": 0.405625, "grad_norm": 2.681727409362793, "grad_norm_var": 0.6165646790718247, "learning_rate": 0.0001, "loss": 1.1311, "loss/crossentropy": 2.555880069732666, "loss/hidden": 0.9453125, "loss/logits": 0.1720399558544159, "loss/reg": 0.0013761859154328704, "step": 3245 }, { "epoch": 0.40575, "grad_norm": 2.9257984161376953, "grad_norm_var": 0.6095242720920993, "learning_rate": 0.0001, "loss": 1.4892, "loss/crossentropy": 2.0228655338287354, "loss/hidden": 1.203125, "loss/logits": 0.2723585367202759, "loss/reg": 0.0013753996463492513, "step": 3246 }, { "epoch": 0.405875, "grad_norm": 2.5413167476654053, "grad_norm_var": 0.2438473977613434, "learning_rate": 0.0001, "loss": 1.3781, "loss/crossentropy": 2.308065891265869, "loss/hidden": 1.15625, "loss/logits": 0.2081434726715088, "loss/reg": 0.0013746068580076098, "step": 3247 }, { "epoch": 0.406, "grad_norm": 2.1319262981414795, "grad_norm_var": 0.2662006376377311, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.526817560195923, "loss/hidden": 0.91015625, "loss/logits": 0.16241538524627686, "loss/reg": 0.0013738599373027682, "step": 3248 }, { "epoch": 0.406125, "grad_norm": 2.9318926334381104, "grad_norm_var": 0.2645514803863146, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.1076154708862305, "loss/hidden": 1.15625, "loss/logits": 0.17979696393013, "loss/reg": 0.0013730617938563228, "step": 3249 }, { "epoch": 0.40625, "grad_norm": 2.8652472496032715, "grad_norm_var": 0.09175180801843605, "learning_rate": 0.0001, "loss": 1.2396, "loss/crossentropy": 2.4131739139556885, "loss/hidden": 1.0546875, "loss/logits": 0.17114615440368652, "loss/reg": 0.0013723403681069613, "step": 3250 }, { "epoch": 0.406375, "grad_norm": 3.5416419506073, "grad_norm_var": 0.09363118776181238, "learning_rate": 0.0001, "loss": 1.2488, "loss/crossentropy": 2.4226012229919434, "loss/hidden": 1.046875, "loss/logits": 0.1882304847240448, "loss/reg": 0.0013715783134102821, "step": 3251 }, { "epoch": 0.4065, "grad_norm": 2.3864035606384277, "grad_norm_var": 0.10217455618050408, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.2283196449279785, "loss/hidden": 1.1171875, "loss/logits": 0.20590125024318695, "loss/reg": 0.0013708476908504963, "step": 3252 }, { "epoch": 0.406625, "grad_norm": 2.5386037826538086, "grad_norm_var": 0.0980188242420771, "learning_rate": 0.0001, "loss": 1.0317, "loss/crossentropy": 2.8362772464752197, "loss/hidden": 0.8828125, "loss/logits": 0.13518750667572021, "loss/reg": 0.0013700791168957949, "step": 3253 }, { "epoch": 0.40675, "grad_norm": 2.8382070064544678, "grad_norm_var": 0.09821253316205585, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.582094669342041, "loss/hidden": 1.046875, "loss/logits": 0.18312525749206543, "loss/reg": 0.0013693557120859623, "step": 3254 }, { "epoch": 0.406875, "grad_norm": 2.3734328746795654, "grad_norm_var": 0.10434658637059992, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.344827175140381, "loss/hidden": 0.94140625, "loss/logits": 0.14878472685813904, "loss/reg": 0.0013686425518244505, "step": 3255 }, { "epoch": 0.407, "grad_norm": 3.0915582180023193, "grad_norm_var": 0.10743611474460124, "learning_rate": 0.0001, "loss": 1.3047, "loss/crossentropy": 2.748452663421631, "loss/hidden": 1.0546875, "loss/logits": 0.23634859919548035, "loss/reg": 0.0013678583782166243, "step": 3256 }, { "epoch": 0.407125, "grad_norm": 3.4319355487823486, "grad_norm_var": 0.13681494507521846, "learning_rate": 0.0001, "loss": 1.0962, "loss/crossentropy": 2.4852516651153564, "loss/hidden": 0.94140625, "loss/logits": 0.1411632001399994, "loss/reg": 0.001367009012028575, "step": 3257 }, { "epoch": 0.40725, "grad_norm": 3.4762368202209473, "grad_norm_var": 0.16644434012928452, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.503399610519409, "loss/hidden": 1.015625, "loss/logits": 0.16805356740951538, "loss/reg": 0.001366178854368627, "step": 3258 }, { "epoch": 0.407375, "grad_norm": 2.9194114208221436, "grad_norm_var": 0.16424538508467562, "learning_rate": 0.0001, "loss": 1.1955, "loss/crossentropy": 2.481788396835327, "loss/hidden": 1.0, "loss/logits": 0.1818678081035614, "loss/reg": 0.0013654421782121062, "step": 3259 }, { "epoch": 0.4075, "grad_norm": 4.125058174133301, "grad_norm_var": 0.26660878435141794, "learning_rate": 0.0001, "loss": 1.608, "loss/crossentropy": 3.004652738571167, "loss/hidden": 1.328125, "loss/logits": 0.26619577407836914, "loss/reg": 0.0013647113228216767, "step": 3260 }, { "epoch": 0.407625, "grad_norm": 2.4017460346221924, "grad_norm_var": 0.28059063393547207, "learning_rate": 0.0001, "loss": 1.2666, "loss/crossentropy": 2.389310598373413, "loss/hidden": 1.046875, "loss/logits": 0.20607197284698486, "loss/reg": 0.0013639895478263497, "step": 3261 }, { "epoch": 0.40775, "grad_norm": 2.9990475177764893, "grad_norm_var": 0.2811044313254378, "learning_rate": 0.0001, "loss": 1.3653, "loss/crossentropy": 2.2684502601623535, "loss/hidden": 1.1171875, "loss/logits": 0.23448723554611206, "loss/reg": 0.0013632772024720907, "step": 3262 }, { "epoch": 0.407875, "grad_norm": 3.5420069694519043, "grad_norm_var": 0.2942182997916376, "learning_rate": 0.0001, "loss": 1.5292, "loss/crossentropy": 1.9702965021133423, "loss/hidden": 1.265625, "loss/logits": 0.24991975724697113, "loss/reg": 0.001362509443424642, "step": 3263 }, { "epoch": 0.408, "grad_norm": 15.288668632507324, "grad_norm_var": 9.634631773399546, "learning_rate": 0.0001, "loss": 2.8279, "loss/crossentropy": 2.5885770320892334, "loss/hidden": 2.203125, "loss/logits": 0.6111425161361694, "loss/reg": 0.0013617242220789194, "step": 3264 }, { "epoch": 0.408125, "grad_norm": 2.669337034225464, "grad_norm_var": 9.669223436955038, "learning_rate": 0.0001, "loss": 1.2984, "loss/crossentropy": 2.583742618560791, "loss/hidden": 1.078125, "loss/logits": 0.2066747546195984, "loss/reg": 0.0013609494781121612, "step": 3265 }, { "epoch": 0.40825, "grad_norm": 3.7663309574127197, "grad_norm_var": 9.610003772697857, "learning_rate": 0.0001, "loss": 1.5119, "loss/crossentropy": 2.4241232872009277, "loss/hidden": 1.2734375, "loss/logits": 0.22488948702812195, "loss/reg": 0.001360214431770146, "step": 3266 }, { "epoch": 0.408375, "grad_norm": 2.257723093032837, "grad_norm_var": 9.763568457517218, "learning_rate": 0.0001, "loss": 1.186, "loss/crossentropy": 2.5134975910186768, "loss/hidden": 1.0, "loss/logits": 0.1723683774471283, "loss/reg": 0.001359510701149702, "step": 3267 }, { "epoch": 0.4085, "grad_norm": 2.2471210956573486, "grad_norm_var": 9.790226969453252, "learning_rate": 0.0001, "loss": 1.0886, "loss/crossentropy": 2.528656005859375, "loss/hidden": 0.93359375, "loss/logits": 0.1413833647966385, "loss/reg": 0.0013587415451183915, "step": 3268 }, { "epoch": 0.408625, "grad_norm": 2.4004416465759277, "grad_norm_var": 9.813697240073719, "learning_rate": 0.0001, "loss": 1.2297, "loss/crossentropy": 2.511753797531128, "loss/hidden": 1.046875, "loss/logits": 0.16923333704471588, "loss/reg": 0.001357951550744474, "step": 3269 }, { "epoch": 0.40875, "grad_norm": 2.2637429237365723, "grad_norm_var": 9.903339638952524, "learning_rate": 0.0001, "loss": 1.1108, "loss/crossentropy": 2.4631755352020264, "loss/hidden": 0.9453125, "loss/logits": 0.1518806517124176, "loss/reg": 0.001357236411422491, "step": 3270 }, { "epoch": 0.408875, "grad_norm": 2.364027261734009, "grad_norm_var": 9.905013008408396, "learning_rate": 0.0001, "loss": 1.0716, "loss/crossentropy": 2.8079750537872314, "loss/hidden": 0.8984375, "loss/logits": 0.15958982706069946, "loss/reg": 0.00135652138851583, "step": 3271 }, { "epoch": 0.409, "grad_norm": 2.248713493347168, "grad_norm_var": 10.018100275602675, "learning_rate": 0.0001, "loss": 1.004, "loss/crossentropy": 2.696610450744629, "loss/hidden": 0.859375, "loss/logits": 0.13102860748767853, "loss/reg": 0.0013557313941419125, "step": 3272 }, { "epoch": 0.409125, "grad_norm": 3.6910922527313232, "grad_norm_var": 10.014759519868468, "learning_rate": 0.0001, "loss": 1.2796, "loss/crossentropy": 2.5381524562835693, "loss/hidden": 1.0546875, "loss/logits": 0.21139733493328094, "loss/reg": 0.0013550100848078728, "step": 3273 }, { "epoch": 0.40925, "grad_norm": 2.2932257652282715, "grad_norm_var": 10.132207862859595, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.6925697326660156, "loss/hidden": 0.87109375, "loss/logits": 0.14035049080848694, "loss/reg": 0.0013542849337682128, "step": 3274 }, { "epoch": 0.409375, "grad_norm": 2.170440196990967, "grad_norm_var": 10.234469870468553, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.5037624835968018, "loss/hidden": 0.89453125, "loss/logits": 0.15221749246120453, "loss/reg": 0.0013534951722249389, "step": 3275 }, { "epoch": 0.4095, "grad_norm": 2.053010940551758, "grad_norm_var": 10.342702334184112, "learning_rate": 0.0001, "loss": 1.0673, "loss/crossentropy": 2.2183890342712402, "loss/hidden": 0.921875, "loss/logits": 0.13187147676944733, "loss/reg": 0.0013527621049433947, "step": 3276 }, { "epoch": 0.409625, "grad_norm": 3.313840389251709, "grad_norm_var": 10.271345912307597, "learning_rate": 0.0001, "loss": 1.2612, "loss/crossentropy": 2.6085968017578125, "loss/hidden": 1.03125, "loss/logits": 0.21644455194473267, "loss/reg": 0.001352027989923954, "step": 3277 }, { "epoch": 0.40975, "grad_norm": 3.0933520793914795, "grad_norm_var": 10.26594169064091, "learning_rate": 0.0001, "loss": 1.1362, "loss/crossentropy": 2.572032928466797, "loss/hidden": 0.9609375, "loss/logits": 0.16176952421665192, "loss/reg": 0.0013512877048924565, "step": 3278 }, { "epoch": 0.409875, "grad_norm": 2.5233092308044434, "grad_norm_var": 10.322234895932993, "learning_rate": 0.0001, "loss": 1.2299, "loss/crossentropy": 2.6352758407592773, "loss/hidden": 1.046875, "loss/logits": 0.1695098727941513, "loss/reg": 0.0013506055111065507, "step": 3279 }, { "epoch": 0.41, "grad_norm": 2.683999538421631, "grad_norm_var": 0.29739463015689743, "learning_rate": 0.0001, "loss": 1.5067, "loss/crossentropy": 2.006277322769165, "loss/hidden": 1.234375, "loss/logits": 0.2588390111923218, "loss/reg": 0.001349855214357376, "step": 3280 }, { "epoch": 0.410125, "grad_norm": 2.2778337001800537, "grad_norm_var": 0.304789444170338, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.5638885498046875, "loss/hidden": 0.90234375, "loss/logits": 0.14745941758155823, "loss/reg": 0.0013492146972566843, "step": 3281 }, { "epoch": 0.41025, "grad_norm": 3.1845810413360596, "grad_norm_var": 0.23570680460872093, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.424877405166626, "loss/hidden": 1.234375, "loss/logits": 0.2393244504928589, "loss/reg": 0.0013484961818903685, "step": 3282 }, { "epoch": 0.410375, "grad_norm": 7.197822093963623, "grad_norm_var": 1.5575067546057577, "learning_rate": 0.0001, "loss": 1.1792, "loss/crossentropy": 2.8233795166015625, "loss/hidden": 0.99609375, "loss/logits": 0.16961494088172913, "loss/reg": 0.0013478442560881376, "step": 3283 }, { "epoch": 0.4105, "grad_norm": 3.5271525382995605, "grad_norm_var": 1.552681246210098, "learning_rate": 0.0001, "loss": 1.4698, "loss/crossentropy": 2.6934311389923096, "loss/hidden": 1.2421875, "loss/logits": 0.21409210562705994, "loss/reg": 0.0013471812708303332, "step": 3284 }, { "epoch": 0.410625, "grad_norm": 2.649142026901245, "grad_norm_var": 1.5381441579120814, "learning_rate": 0.0001, "loss": 1.1877, "loss/crossentropy": 2.5946388244628906, "loss/hidden": 0.98828125, "loss/logits": 0.18594694137573242, "loss/reg": 0.0013464605435729027, "step": 3285 }, { "epoch": 0.41075, "grad_norm": 2.0759615898132324, "grad_norm_var": 1.5580548610848903, "learning_rate": 0.0001, "loss": 1.0881, "loss/crossentropy": 2.6526455879211426, "loss/hidden": 0.9140625, "loss/logits": 0.1606123447418213, "loss/reg": 0.001345811178907752, "step": 3286 }, { "epoch": 0.410875, "grad_norm": 2.132097005844116, "grad_norm_var": 1.5798225686995389, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.7268004417419434, "loss/hidden": 0.90625, "loss/logits": 0.13341575860977173, "loss/reg": 0.0013451270060613751, "step": 3287 }, { "epoch": 0.411, "grad_norm": 3.211747646331787, "grad_norm_var": 1.5484164051302731, "learning_rate": 0.0001, "loss": 1.4638, "loss/crossentropy": 2.50492262840271, "loss/hidden": 1.21875, "loss/logits": 0.23160862922668457, "loss/reg": 0.001344402669928968, "step": 3288 }, { "epoch": 0.411125, "grad_norm": 2.163221597671509, "grad_norm_var": 1.5545299491700082, "learning_rate": 0.0001, "loss": 1.1241, "loss/crossentropy": 2.276374578475952, "loss/hidden": 0.9453125, "loss/logits": 0.16536301374435425, "loss/reg": 0.00134366936981678, "step": 3289 }, { "epoch": 0.41125, "grad_norm": 2.8019871711730957, "grad_norm_var": 1.5289078149624136, "learning_rate": 0.0001, "loss": 1.2889, "loss/crossentropy": 2.5616776943206787, "loss/hidden": 1.09375, "loss/logits": 0.18174652755260468, "loss/reg": 0.001342884497717023, "step": 3290 }, { "epoch": 0.411375, "grad_norm": 2.6276111602783203, "grad_norm_var": 1.4949869737137826, "learning_rate": 0.0001, "loss": 1.2739, "loss/crossentropy": 2.56685209274292, "loss/hidden": 1.078125, "loss/logits": 0.18231460452079773, "loss/reg": 0.0013421528274193406, "step": 3291 }, { "epoch": 0.4115, "grad_norm": 3.3840267658233643, "grad_norm_var": 1.4430121809885252, "learning_rate": 0.0001, "loss": 1.0615, "loss/crossentropy": 2.460604190826416, "loss/hidden": 0.8984375, "loss/logits": 0.14963871240615845, "loss/reg": 0.001341362134553492, "step": 3292 }, { "epoch": 0.411625, "grad_norm": 2.226689338684082, "grad_norm_var": 1.4790681999251771, "learning_rate": 0.0001, "loss": 1.005, "loss/crossentropy": 2.4341094493865967, "loss/hidden": 0.87109375, "loss/logits": 0.12047846615314484, "loss/reg": 0.0013405472273007035, "step": 3293 }, { "epoch": 0.41175, "grad_norm": 3.387566566467285, "grad_norm_var": 1.4887275248539547, "learning_rate": 0.0001, "loss": 1.1736, "loss/crossentropy": 2.6723873615264893, "loss/hidden": 1.015625, "loss/logits": 0.14457851648330688, "loss/reg": 0.0013398099690675735, "step": 3294 }, { "epoch": 0.411875, "grad_norm": 2.61442494392395, "grad_norm_var": 1.4834136310505828, "learning_rate": 0.0001, "loss": 1.3149, "loss/crossentropy": 2.2810609340667725, "loss/hidden": 1.109375, "loss/logits": 0.19213774800300598, "loss/reg": 0.0013390433741733432, "step": 3295 }, { "epoch": 0.412, "grad_norm": 2.9621057510375977, "grad_norm_var": 1.4761919647180015, "learning_rate": 0.0001, "loss": 1.2271, "loss/crossentropy": 2.423949956893921, "loss/hidden": 1.046875, "loss/logits": 0.16689057648181915, "loss/reg": 0.0013383596669882536, "step": 3296 }, { "epoch": 0.412125, "grad_norm": 4.147792339324951, "grad_norm_var": 1.508075664244089, "learning_rate": 0.0001, "loss": 1.5802, "loss/crossentropy": 2.2585575580596924, "loss/hidden": 1.3046875, "loss/logits": 0.26209986209869385, "loss/reg": 0.0013376338174566627, "step": 3297 }, { "epoch": 0.41225, "grad_norm": 2.1051316261291504, "grad_norm_var": 1.574970075795384, "learning_rate": 0.0001, "loss": 1.0705, "loss/crossentropy": 2.7249348163604736, "loss/hidden": 0.90234375, "loss/logits": 0.15477254986763, "loss/reg": 0.0013368859654292464, "step": 3298 }, { "epoch": 0.412375, "grad_norm": 3.4187605381011963, "grad_norm_var": 0.39062126424061316, "learning_rate": 0.0001, "loss": 1.145, "loss/crossentropy": 2.5211830139160156, "loss/hidden": 0.97265625, "loss/logits": 0.15899470448493958, "loss/reg": 0.001336073619313538, "step": 3299 }, { "epoch": 0.4125, "grad_norm": 4.430337429046631, "grad_norm_var": 0.5243897858344468, "learning_rate": 0.0001, "loss": 1.2875, "loss/crossentropy": 2.4548768997192383, "loss/hidden": 1.09375, "loss/logits": 0.18039393424987793, "loss/reg": 0.001335203880444169, "step": 3300 }, { "epoch": 0.412625, "grad_norm": 3.063654899597168, "grad_norm_var": 0.521476159235798, "learning_rate": 0.0001, "loss": 1.2062, "loss/crossentropy": 2.7421510219573975, "loss/hidden": 1.03125, "loss/logits": 0.16158878803253174, "loss/reg": 0.0013342640595510602, "step": 3301 }, { "epoch": 0.41275, "grad_norm": 3.1730549335479736, "grad_norm_var": 0.4729340679147593, "learning_rate": 0.0001, "loss": 1.1802, "loss/crossentropy": 2.25921893119812, "loss/hidden": 1.03125, "loss/logits": 0.13562339544296265, "loss/reg": 0.0013333380920812488, "step": 3302 }, { "epoch": 0.412875, "grad_norm": 3.016066074371338, "grad_norm_var": 0.42058181304209913, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.482773542404175, "loss/hidden": 1.078125, "loss/logits": 0.19097864627838135, "loss/reg": 0.001332607353106141, "step": 3303 }, { "epoch": 0.413, "grad_norm": 3.225841760635376, "grad_norm_var": 0.4209059177026309, "learning_rate": 0.0001, "loss": 1.5412, "loss/crossentropy": 2.4628305435180664, "loss/hidden": 1.21875, "loss/logits": 0.3090950548648834, "loss/reg": 0.0013318744022399187, "step": 3304 }, { "epoch": 0.413125, "grad_norm": 3.0334460735321045, "grad_norm_var": 0.365718864820109, "learning_rate": 0.0001, "loss": 1.1054, "loss/crossentropy": 2.175504684448242, "loss/hidden": 0.95703125, "loss/logits": 0.13503523170948029, "loss/reg": 0.00133111875038594, "step": 3305 }, { "epoch": 0.41325, "grad_norm": 2.896515369415283, "grad_norm_var": 0.3625066854581282, "learning_rate": 0.0001, "loss": 1.1516, "loss/crossentropy": 2.6385393142700195, "loss/hidden": 0.98046875, "loss/logits": 0.15785254538059235, "loss/reg": 0.0013303103623911738, "step": 3306 }, { "epoch": 0.413375, "grad_norm": 2.7492733001708984, "grad_norm_var": 0.3556542875064215, "learning_rate": 0.0001, "loss": 1.0986, "loss/crossentropy": 2.733436346054077, "loss/hidden": 0.9296875, "loss/logits": 0.15559624135494232, "loss/reg": 0.0013295969692990184, "step": 3307 }, { "epoch": 0.4135, "grad_norm": 2.491199254989624, "grad_norm_var": 0.37341014008216766, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.436964988708496, "loss/hidden": 1.0625, "loss/logits": 0.20498235523700714, "loss/reg": 0.001328872749581933, "step": 3308 }, { "epoch": 0.413625, "grad_norm": 2.6170647144317627, "grad_norm_var": 0.3396198478834622, "learning_rate": 0.0001, "loss": 1.3079, "loss/crossentropy": 2.3987443447113037, "loss/hidden": 1.1015625, "loss/logits": 0.19303929805755615, "loss/reg": 0.0013281499268487096, "step": 3309 }, { "epoch": 0.41375, "grad_norm": 3.773622512817383, "grad_norm_var": 0.36459846895629816, "learning_rate": 0.0001, "loss": 1.185, "loss/crossentropy": 2.64062762260437, "loss/hidden": 1.0078125, "loss/logits": 0.16387121379375458, "loss/reg": 0.001327413017861545, "step": 3310 }, { "epoch": 0.413875, "grad_norm": 2.6906070709228516, "grad_norm_var": 0.3599538183266532, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.4670679569244385, "loss/hidden": 0.9375, "loss/logits": 0.15685531497001648, "loss/reg": 0.001326718251220882, "step": 3311 }, { "epoch": 0.414, "grad_norm": 2.850633144378662, "grad_norm_var": 0.36296063042451865, "learning_rate": 0.0001, "loss": 1.3592, "loss/crossentropy": 2.5203495025634766, "loss/hidden": 1.1484375, "loss/logits": 0.19752125442028046, "loss/reg": 0.0013259871629998088, "step": 3312 }, { "epoch": 0.414125, "grad_norm": 2.3831822872161865, "grad_norm_var": 0.3122707215087736, "learning_rate": 0.0001, "loss": 1.096, "loss/crossentropy": 2.62555193901062, "loss/hidden": 0.92578125, "loss/logits": 0.15697607398033142, "loss/reg": 0.0013252823846414685, "step": 3313 }, { "epoch": 0.41425, "grad_norm": 2.0828702449798584, "grad_norm_var": 0.3149426892927295, "learning_rate": 0.0001, "loss": 0.963, "loss/crossentropy": 2.7026524543762207, "loss/hidden": 0.82421875, "loss/logits": 0.12552350759506226, "loss/reg": 0.0013245412847027183, "step": 3314 }, { "epoch": 0.414375, "grad_norm": 2.5038039684295654, "grad_norm_var": 0.31538594969437383, "learning_rate": 0.0001, "loss": 1.0943, "loss/crossentropy": 2.2586328983306885, "loss/hidden": 0.9375, "loss/logits": 0.14360737800598145, "loss/reg": 0.0013237958773970604, "step": 3315 }, { "epoch": 0.4145, "grad_norm": 2.558516502380371, "grad_norm_var": 0.1614977973938494, "learning_rate": 0.0001, "loss": 1.1646, "loss/crossentropy": 2.4407529830932617, "loss/hidden": 0.9921875, "loss/logits": 0.15916083753108978, "loss/reg": 0.0013230852782726288, "step": 3316 }, { "epoch": 0.414625, "grad_norm": 4.205377101898193, "grad_norm_var": 0.2801611988243688, "learning_rate": 0.0001, "loss": 1.5526, "loss/crossentropy": 2.019152879714966, "loss/hidden": 1.3671875, "loss/logits": 0.17216026782989502, "loss/reg": 0.001322389580309391, "step": 3317 }, { "epoch": 0.41475, "grad_norm": 3.7728421688079834, "grad_norm_var": 0.3252262571653735, "learning_rate": 0.0001, "loss": 1.4756, "loss/crossentropy": 2.3831534385681152, "loss/hidden": 1.2578125, "loss/logits": 0.20456063747406006, "loss/reg": 0.0013216695515438914, "step": 3318 }, { "epoch": 0.414875, "grad_norm": 2.9152305126190186, "grad_norm_var": 0.3246801242612432, "learning_rate": 0.0001, "loss": 1.2157, "loss/crossentropy": 2.2754969596862793, "loss/hidden": 1.0234375, "loss/logits": 0.17904165387153625, "loss/reg": 0.0013209345052018762, "step": 3319 }, { "epoch": 0.415, "grad_norm": 3.015517473220825, "grad_norm_var": 0.31892072661664744, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.425769805908203, "loss/hidden": 1.0625, "loss/logits": 0.1908043473958969, "loss/reg": 0.001320175244472921, "step": 3320 }, { "epoch": 0.415125, "grad_norm": 2.3281772136688232, "grad_norm_var": 0.3382808327157588, "learning_rate": 0.0001, "loss": 1.0502, "loss/crossentropy": 2.4965240955352783, "loss/hidden": 0.90625, "loss/logits": 0.13080419600009918, "loss/reg": 0.0013194014318287373, "step": 3321 }, { "epoch": 0.41525, "grad_norm": 3.1911144256591797, "grad_norm_var": 0.3449567075400824, "learning_rate": 0.0001, "loss": 1.1026, "loss/crossentropy": 2.6164281368255615, "loss/hidden": 0.94140625, "loss/logits": 0.14798693358898163, "loss/reg": 0.0013187059666961432, "step": 3322 }, { "epoch": 0.415375, "grad_norm": 2.914590358734131, "grad_norm_var": 0.343715753449307, "learning_rate": 0.0001, "loss": 1.1588, "loss/crossentropy": 2.648087501525879, "loss/hidden": 1.0078125, "loss/logits": 0.13784724473953247, "loss/reg": 0.0013179569505155087, "step": 3323 }, { "epoch": 0.4155, "grad_norm": 2.5131149291992188, "grad_norm_var": 0.34257051458756205, "learning_rate": 0.0001, "loss": 1.1002, "loss/crossentropy": 2.772576332092285, "loss/hidden": 0.9296875, "loss/logits": 0.15729355812072754, "loss/reg": 0.0013172643957659602, "step": 3324 }, { "epoch": 0.415625, "grad_norm": 2.508145809173584, "grad_norm_var": 0.34734490308361854, "learning_rate": 0.0001, "loss": 1.1186, "loss/crossentropy": 2.389176607131958, "loss/hidden": 0.94140625, "loss/logits": 0.16404464840888977, "loss/reg": 0.0013165458803996444, "step": 3325 }, { "epoch": 0.41575, "grad_norm": 2.14680552482605, "grad_norm_var": 0.32064494454367887, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.3090384006500244, "loss/hidden": 0.91796875, "loss/logits": 0.15022702515125275, "loss/reg": 0.001315823057666421, "step": 3326 }, { "epoch": 0.415875, "grad_norm": 2.1274735927581787, "grad_norm_var": 0.3476486809665135, "learning_rate": 0.0001, "loss": 1.1381, "loss/crossentropy": 2.3576443195343018, "loss/hidden": 0.96875, "loss/logits": 0.15618029236793518, "loss/reg": 0.001315122121013701, "step": 3327 }, { "epoch": 0.416, "grad_norm": 3.5183522701263428, "grad_norm_var": 0.3843767300349394, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.9169929027557373, "loss/hidden": 1.125, "loss/logits": 0.22122074663639069, "loss/reg": 0.0013144509866833687, "step": 3328 }, { "epoch": 0.416125, "grad_norm": 4.922730922698975, "grad_norm_var": 0.6487525674521329, "learning_rate": 0.0001, "loss": 1.977, "loss/crossentropy": 2.566072702407837, "loss/hidden": 1.6875, "loss/logits": 0.2763688564300537, "loss/reg": 0.0013137306086719036, "step": 3329 }, { "epoch": 0.41625, "grad_norm": 2.415743827819824, "grad_norm_var": 0.6171235098282665, "learning_rate": 0.0001, "loss": 1.0544, "loss/crossentropy": 2.5320489406585693, "loss/hidden": 0.890625, "loss/logits": 0.15062732994556427, "loss/reg": 0.0013130444567650557, "step": 3330 }, { "epoch": 0.416375, "grad_norm": 3.249044895172119, "grad_norm_var": 0.6052781154768864, "learning_rate": 0.0001, "loss": 1.1238, "loss/crossentropy": 2.387714147567749, "loss/hidden": 0.9609375, "loss/logits": 0.14974479377269745, "loss/reg": 0.0013123798416927457, "step": 3331 }, { "epoch": 0.4165, "grad_norm": 4.368046760559082, "grad_norm_var": 0.6988453580253934, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.652925729751587, "loss/hidden": 1.078125, "loss/logits": 0.17708569765090942, "loss/reg": 0.0013117208145558834, "step": 3332 }, { "epoch": 0.416625, "grad_norm": 3.333439588546753, "grad_norm_var": 0.6215757488339089, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.202847719192505, "loss/hidden": 0.99609375, "loss/logits": 0.18163394927978516, "loss/reg": 0.0013110390864312649, "step": 3333 }, { "epoch": 0.41675, "grad_norm": 3.2864432334899902, "grad_norm_var": 0.5912685861949927, "learning_rate": 0.0001, "loss": 1.0954, "loss/crossentropy": 2.514042377471924, "loss/hidden": 0.92578125, "loss/logits": 0.15651807188987732, "loss/reg": 0.0013103288365527987, "step": 3334 }, { "epoch": 0.416875, "grad_norm": 2.6446025371551514, "grad_norm_var": 0.6006052354543951, "learning_rate": 0.0001, "loss": 1.1823, "loss/crossentropy": 2.64298152923584, "loss/hidden": 1.0, "loss/logits": 0.1692470908164978, "loss/reg": 0.0013096093898639083, "step": 3335 }, { "epoch": 0.417, "grad_norm": 2.3157060146331787, "grad_norm_var": 0.6325845764245001, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.198298692703247, "loss/hidden": 0.98046875, "loss/logits": 0.15509849786758423, "loss/reg": 0.001308897277340293, "step": 3336 }, { "epoch": 0.417125, "grad_norm": 2.4228689670562744, "grad_norm_var": 0.6248336552835437, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.567547082901001, "loss/hidden": 0.97265625, "loss/logits": 0.16291046142578125, "loss/reg": 0.0013081900542601943, "step": 3337 }, { "epoch": 0.41725, "grad_norm": 2.4255974292755127, "grad_norm_var": 0.6411759779564127, "learning_rate": 0.0001, "loss": 1.0783, "loss/crossentropy": 2.458423376083374, "loss/hidden": 0.9140625, "loss/logits": 0.15119026601314545, "loss/reg": 0.001307505532167852, "step": 3338 }, { "epoch": 0.417375, "grad_norm": 2.900170087814331, "grad_norm_var": 0.6412465667391584, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.5732319355010986, "loss/hidden": 1.03125, "loss/logits": 0.18576835095882416, "loss/reg": 0.0013068497646600008, "step": 3339 }, { "epoch": 0.4175, "grad_norm": 3.213627815246582, "grad_norm_var": 0.6317044085856875, "learning_rate": 0.0001, "loss": 1.4903, "loss/crossentropy": 2.1533420085906982, "loss/hidden": 1.2421875, "loss/logits": 0.2350264936685562, "loss/reg": 0.0013061390491202474, "step": 3340 }, { "epoch": 0.417625, "grad_norm": 2.5928237438201904, "grad_norm_var": 0.62674130653506, "learning_rate": 0.0001, "loss": 1.1127, "loss/crossentropy": 2.3836777210235596, "loss/hidden": 0.953125, "loss/logits": 0.14655232429504395, "loss/reg": 0.0013055132003501058, "step": 3341 }, { "epoch": 0.41775, "grad_norm": 3.1287028789520264, "grad_norm_var": 0.5762524828672402, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.5002822875976562, "loss/hidden": 1.140625, "loss/logits": 0.21076270937919617, "loss/reg": 0.0013047473039478064, "step": 3342 }, { "epoch": 0.417875, "grad_norm": 3.578648567199707, "grad_norm_var": 0.5285815691026905, "learning_rate": 0.0001, "loss": 1.3991, "loss/crossentropy": 2.6659348011016846, "loss/hidden": 1.140625, "loss/logits": 0.24543854594230652, "loss/reg": 0.0013039587065577507, "step": 3343 }, { "epoch": 0.418, "grad_norm": 2.529108762741089, "grad_norm_var": 0.5404709531373306, "learning_rate": 0.0001, "loss": 1.2027, "loss/crossentropy": 2.394843578338623, "loss/hidden": 1.0078125, "loss/logits": 0.1818375587463379, "loss/reg": 0.0013031769776716828, "step": 3344 }, { "epoch": 0.418125, "grad_norm": 4.469112873077393, "grad_norm_var": 0.4420575595374011, "learning_rate": 0.0001, "loss": 1.3752, "loss/crossentropy": 2.743117570877075, "loss/hidden": 1.1875, "loss/logits": 0.17462807893753052, "loss/reg": 0.0013024811632931232, "step": 3345 }, { "epoch": 0.41825, "grad_norm": 2.832873821258545, "grad_norm_var": 0.4174006170455279, "learning_rate": 0.0001, "loss": 1.37, "loss/crossentropy": 2.479796886444092, "loss/hidden": 1.1328125, "loss/logits": 0.22413240373134613, "loss/reg": 0.001301713869906962, "step": 3346 }, { "epoch": 0.418375, "grad_norm": 2.4833927154541016, "grad_norm_var": 0.436851315211072, "learning_rate": 0.0001, "loss": 1.0889, "loss/crossentropy": 2.380126476287842, "loss/hidden": 0.9296875, "loss/logits": 0.1462429165840149, "loss/reg": 0.0013010833645239472, "step": 3347 }, { "epoch": 0.4185, "grad_norm": 2.559283971786499, "grad_norm_var": 0.3193139682511363, "learning_rate": 0.0001, "loss": 1.0911, "loss/crossentropy": 2.555972099304199, "loss/hidden": 0.92578125, "loss/logits": 0.15235137939453125, "loss/reg": 0.0013003787025809288, "step": 3348 }, { "epoch": 0.418625, "grad_norm": 2.9586470127105713, "grad_norm_var": 0.3074215309359479, "learning_rate": 0.0001, "loss": 1.2003, "loss/crossentropy": 2.48483943939209, "loss/hidden": 1.03125, "loss/logits": 0.15601880848407745, "loss/reg": 0.0012997391168028116, "step": 3349 }, { "epoch": 0.41875, "grad_norm": 8.608823776245117, "grad_norm_var": 2.3547344348152404, "learning_rate": 0.0001, "loss": 1.6639, "loss/crossentropy": 2.553575277328491, "loss/hidden": 1.3984375, "loss/logits": 0.2524634599685669, "loss/reg": 0.0012990542454645038, "step": 3350 }, { "epoch": 0.418875, "grad_norm": 2.06416916847229, "grad_norm_var": 2.4210179938610725, "learning_rate": 0.0001, "loss": 1.1367, "loss/crossentropy": 2.6069188117980957, "loss/hidden": 0.9453125, "loss/logits": 0.17835494875907898, "loss/reg": 0.0012983543565496802, "step": 3351 }, { "epoch": 0.419, "grad_norm": 3.5955183506011963, "grad_norm_var": 2.3737324597760905, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.2926290035247803, "loss/hidden": 1.28125, "loss/logits": 0.2715609073638916, "loss/reg": 0.0012976247817277908, "step": 3352 }, { "epoch": 0.419125, "grad_norm": 2.256730556488037, "grad_norm_var": 2.3942830952167506, "learning_rate": 0.0001, "loss": 1.1297, "loss/crossentropy": 2.4836690425872803, "loss/hidden": 0.96484375, "loss/logits": 0.15187230706214905, "loss/reg": 0.0012969628442078829, "step": 3353 }, { "epoch": 0.41925, "grad_norm": 2.416325569152832, "grad_norm_var": 2.395322873414443, "learning_rate": 0.0001, "loss": 1.1213, "loss/crossentropy": 2.333630084991455, "loss/hidden": 0.9609375, "loss/logits": 0.14739683270454407, "loss/reg": 0.0012963005574420094, "step": 3354 }, { "epoch": 0.419375, "grad_norm": 4.072288990020752, "grad_norm_var": 2.4246810674268238, "learning_rate": 0.0001, "loss": 1.2638, "loss/crossentropy": 2.770054340362549, "loss/hidden": 1.0859375, "loss/logits": 0.16493605077266693, "loss/reg": 0.001295646419748664, "step": 3355 }, { "epoch": 0.4195, "grad_norm": 3.548766613006592, "grad_norm_var": 2.4262771867686332, "learning_rate": 0.0001, "loss": 1.4329, "loss/crossentropy": 2.548022747039795, "loss/hidden": 1.1953125, "loss/logits": 0.224660724401474, "loss/reg": 0.0012949311640113592, "step": 3356 }, { "epoch": 0.419625, "grad_norm": 3.327481746673584, "grad_norm_var": 2.3852581544044558, "learning_rate": 0.0001, "loss": 1.0604, "loss/crossentropy": 2.7102015018463135, "loss/hidden": 0.90625, "loss/logits": 0.14120855927467346, "loss/reg": 0.0012942433822900057, "step": 3357 }, { "epoch": 0.41975, "grad_norm": 2.365936756134033, "grad_norm_var": 2.4494028140662727, "learning_rate": 0.0001, "loss": 1.1313, "loss/crossentropy": 2.2563679218292236, "loss/hidden": 0.9609375, "loss/logits": 0.1574612557888031, "loss/reg": 0.001293584587983787, "step": 3358 }, { "epoch": 0.419875, "grad_norm": 2.6721670627593994, "grad_norm_var": 2.473631126767009, "learning_rate": 0.0001, "loss": 1.2144, "loss/crossentropy": 2.5903282165527344, "loss/hidden": 1.0234375, "loss/logits": 0.17807823419570923, "loss/reg": 0.0012929600197821856, "step": 3359 }, { "epoch": 0.42, "grad_norm": 2.8678555488586426, "grad_norm_var": 2.4460958496629472, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.4297194480895996, "loss/hidden": 1.140625, "loss/logits": 0.20069774985313416, "loss/reg": 0.0012922539608553052, "step": 3360 }, { "epoch": 0.420125, "grad_norm": 3.0216147899627686, "grad_norm_var": 2.355021733273641, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.711182117462158, "loss/hidden": 0.9375, "loss/logits": 0.1609582006931305, "loss/reg": 0.001291529624722898, "step": 3361 }, { "epoch": 0.42025, "grad_norm": 9.73618221282959, "grad_norm_var": 4.9695875203119755, "learning_rate": 0.0001, "loss": 1.4549, "loss/crossentropy": 2.606106996536255, "loss/hidden": 1.203125, "loss/logits": 0.23888623714447021, "loss/reg": 0.001290790969505906, "step": 3362 }, { "epoch": 0.420375, "grad_norm": 4.575292587280273, "grad_norm_var": 4.91499499397502, "learning_rate": 0.0001, "loss": 1.5947, "loss/crossentropy": 2.2388734817504883, "loss/hidden": 1.4140625, "loss/logits": 0.1677321195602417, "loss/reg": 0.0012900837464258075, "step": 3363 }, { "epoch": 0.4205, "grad_norm": 2.817335605621338, "grad_norm_var": 4.876796569902524, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.275587558746338, "loss/hidden": 1.1484375, "loss/logits": 0.16910308599472046, "loss/reg": 0.0012893910752609372, "step": 3364 }, { "epoch": 0.420625, "grad_norm": 2.8092308044433594, "grad_norm_var": 4.895084373817448, "learning_rate": 0.0001, "loss": 1.0873, "loss/crossentropy": 2.8529770374298096, "loss/hidden": 0.9296875, "loss/logits": 0.14470066130161285, "loss/reg": 0.00128862913697958, "step": 3365 }, { "epoch": 0.42075, "grad_norm": 2.389458179473877, "grad_norm_var": 3.3226101244149344, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.535306453704834, "loss/hidden": 1.0, "loss/logits": 0.16063019633293152, "loss/reg": 0.0012879238929599524, "step": 3366 }, { "epoch": 0.420875, "grad_norm": 3.2601842880249023, "grad_norm_var": 3.197631184466223, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.803818702697754, "loss/hidden": 1.15625, "loss/logits": 0.17513489723205566, "loss/reg": 0.0012871760409325361, "step": 3367 }, { "epoch": 0.421, "grad_norm": 3.1525826454162598, "grad_norm_var": 3.2032641965617574, "learning_rate": 0.0001, "loss": 1.3725, "loss/crossentropy": 2.1860790252685547, "loss/hidden": 1.171875, "loss/logits": 0.18780097365379333, "loss/reg": 0.00128646451048553, "step": 3368 }, { "epoch": 0.421125, "grad_norm": 2.8939075469970703, "grad_norm_var": 3.1267874656792247, "learning_rate": 0.0001, "loss": 1.1077, "loss/crossentropy": 2.3947038650512695, "loss/hidden": 0.94921875, "loss/logits": 0.1455739438533783, "loss/reg": 0.0012857348192483187, "step": 3369 }, { "epoch": 0.42125, "grad_norm": 3.7349045276641846, "grad_norm_var": 3.0457381569039965, "learning_rate": 0.0001, "loss": 1.6501, "loss/crossentropy": 2.155015230178833, "loss/hidden": 1.3984375, "loss/logits": 0.23885434865951538, "loss/reg": 0.0012850266648456454, "step": 3370 }, { "epoch": 0.421375, "grad_norm": 2.445650815963745, "grad_norm_var": 3.1038681374812986, "learning_rate": 0.0001, "loss": 1.161, "loss/crossentropy": 2.552976608276367, "loss/hidden": 0.99609375, "loss/logits": 0.1520773470401764, "loss/reg": 0.0012842589057981968, "step": 3371 }, { "epoch": 0.4215, "grad_norm": 2.5028140544891357, "grad_norm_var": 3.162118369555628, "learning_rate": 0.0001, "loss": 1.2094, "loss/crossentropy": 2.3864190578460693, "loss/hidden": 1.03125, "loss/logits": 0.16534456610679626, "loss/reg": 0.0012834742665290833, "step": 3372 }, { "epoch": 0.421625, "grad_norm": 2.235213041305542, "grad_norm_var": 3.2488163471311697, "learning_rate": 0.0001, "loss": 1.1376, "loss/crossentropy": 2.630387544631958, "loss/hidden": 0.95703125, "loss/logits": 0.1677786409854889, "loss/reg": 0.0012826202437281609, "step": 3373 }, { "epoch": 0.42175, "grad_norm": 2.7280311584472656, "grad_norm_var": 3.2098621276755686, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.5886683464050293, "loss/hidden": 1.015625, "loss/logits": 0.18112343549728394, "loss/reg": 0.0012817607494071126, "step": 3374 }, { "epoch": 0.421875, "grad_norm": 2.2316954135894775, "grad_norm_var": 3.2626867518643383, "learning_rate": 0.0001, "loss": 0.9525, "loss/crossentropy": 2.5210201740264893, "loss/hidden": 0.81640625, "loss/logits": 0.12325558811426163, "loss/reg": 0.0012810579501092434, "step": 3375 }, { "epoch": 0.422, "grad_norm": 2.0915136337280273, "grad_norm_var": 3.348982517679576, "learning_rate": 0.0001, "loss": 1.1477, "loss/crossentropy": 2.346536636352539, "loss/hidden": 0.96875, "loss/logits": 0.16613951325416565, "loss/reg": 0.0012801970588043332, "step": 3376 }, { "epoch": 0.422125, "grad_norm": 2.0532820224761963, "grad_norm_var": 3.44212217318621, "learning_rate": 0.0001, "loss": 1.1131, "loss/crossentropy": 2.3691983222961426, "loss/hidden": 0.94921875, "loss/logits": 0.15110766887664795, "loss/reg": 0.0012793500209227204, "step": 3377 }, { "epoch": 0.42225, "grad_norm": 11.64814281463623, "grad_norm_var": 5.329567319501028, "learning_rate": 0.0001, "loss": 2.7542, "loss/crossentropy": 2.5983874797821045, "loss/hidden": 2.046875, "loss/logits": 0.6945192813873291, "loss/reg": 0.001278555253520608, "step": 3378 }, { "epoch": 0.422375, "grad_norm": 3.0341219902038574, "grad_norm_var": 5.22583802619043, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 2.528168201446533, "loss/hidden": 1.171875, "loss/logits": 0.2211117148399353, "loss/reg": 0.0012778483796864748, "step": 3379 }, { "epoch": 0.4225, "grad_norm": 2.6210012435913086, "grad_norm_var": 5.239619400254839, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.4222869873046875, "loss/hidden": 0.95703125, "loss/logits": 0.13729092478752136, "loss/reg": 0.0012771119363605976, "step": 3380 }, { "epoch": 0.422625, "grad_norm": 2.578958511352539, "grad_norm_var": 5.2561435164654835, "learning_rate": 0.0001, "loss": 1.1733, "loss/crossentropy": 2.368990898132324, "loss/hidden": 0.99609375, "loss/logits": 0.16442936658859253, "loss/reg": 0.0012762281112372875, "step": 3381 }, { "epoch": 0.42275, "grad_norm": 2.828655481338501, "grad_norm_var": 5.219265029440756, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.584900379180908, "loss/hidden": 1.140625, "loss/logits": 0.20799317955970764, "loss/reg": 0.0012755231000483036, "step": 3382 }, { "epoch": 0.422875, "grad_norm": 2.9273927211761475, "grad_norm_var": 5.225847777897642, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.924804449081421, "loss/hidden": 1.1015625, "loss/logits": 0.20437873899936676, "loss/reg": 0.0012748196022585034, "step": 3383 }, { "epoch": 0.423, "grad_norm": 7.046627521514893, "grad_norm_var": 6.1324720029717925, "learning_rate": 0.0001, "loss": 1.4946, "loss/crossentropy": 2.4752395153045654, "loss/hidden": 1.3125, "loss/logits": 0.16932812333106995, "loss/reg": 0.0012739531230181456, "step": 3384 }, { "epoch": 0.423125, "grad_norm": 2.5792341232299805, "grad_norm_var": 6.163046308530284, "learning_rate": 0.0001, "loss": 1.1491, "loss/crossentropy": 2.658679246902466, "loss/hidden": 0.9609375, "loss/logits": 0.1754234880208969, "loss/reg": 0.0012731251772493124, "step": 3385 }, { "epoch": 0.42325, "grad_norm": 2.4185712337493896, "grad_norm_var": 6.2222951977632714, "learning_rate": 0.0001, "loss": 1.3085, "loss/crossentropy": 2.395947217941284, "loss/hidden": 1.1328125, "loss/logits": 0.162948876619339, "loss/reg": 0.0012722749961540103, "step": 3386 }, { "epoch": 0.423375, "grad_norm": 7.0334343910217285, "grad_norm_var": 6.970405341226255, "learning_rate": 0.0001, "loss": 2.1016, "loss/crossentropy": 2.6856324672698975, "loss/hidden": 1.6015625, "loss/logits": 0.4872820973396301, "loss/reg": 0.0012714164331555367, "step": 3387 }, { "epoch": 0.4235, "grad_norm": 18.825759887695312, "grad_norm_var": 21.104502528780372, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.670596122741699, "loss/hidden": 1.0859375, "loss/logits": 0.16635875403881073, "loss/reg": 0.0012705568224191666, "step": 3388 }, { "epoch": 0.423625, "grad_norm": 3.53476881980896, "grad_norm_var": 20.786419377374756, "learning_rate": 0.0001, "loss": 1.268, "loss/crossentropy": 2.1000654697418213, "loss/hidden": 1.1015625, "loss/logits": 0.1537715494632721, "loss/reg": 0.0012698600767180324, "step": 3389 }, { "epoch": 0.42375, "grad_norm": 2.630060911178589, "grad_norm_var": 20.813579562308444, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.4256396293640137, "loss/hidden": 1.109375, "loss/logits": 0.20609043538570404, "loss/reg": 0.0012691568117588758, "step": 3390 }, { "epoch": 0.423875, "grad_norm": 3.409085512161255, "grad_norm_var": 20.504066582440963, "learning_rate": 0.0001, "loss": 1.4216, "loss/crossentropy": 2.4985742568969727, "loss/hidden": 1.1796875, "loss/logits": 0.22920210659503937, "loss/reg": 0.0012684372486546636, "step": 3391 }, { "epoch": 0.424, "grad_norm": 3.330270767211914, "grad_norm_var": 20.14786491924047, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.5425846576690674, "loss/hidden": 1.125, "loss/logits": 0.18579837679862976, "loss/reg": 0.0012677302584052086, "step": 3392 }, { "epoch": 0.424125, "grad_norm": 2.4024415016174316, "grad_norm_var": 20.022667504581005, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.7119483947753906, "loss/hidden": 0.921875, "loss/logits": 0.16348311305046082, "loss/reg": 0.001267016283236444, "step": 3393 }, { "epoch": 0.42425, "grad_norm": 2.3866076469421387, "grad_norm_var": 17.08519844740309, "learning_rate": 0.0001, "loss": 1.0332, "loss/crossentropy": 2.541130781173706, "loss/hidden": 0.87890625, "loss/logits": 0.14167964458465576, "loss/reg": 0.001266295788809657, "step": 3394 }, { "epoch": 0.424375, "grad_norm": 3.2358763217926025, "grad_norm_var": 17.0523664893313, "learning_rate": 0.0001, "loss": 1.4788, "loss/crossentropy": 2.356454610824585, "loss/hidden": 1.2421875, "loss/logits": 0.2239098846912384, "loss/reg": 0.0012656066101044416, "step": 3395 }, { "epoch": 0.4245, "grad_norm": 2.5249011516571045, "grad_norm_var": 17.07524910436306, "learning_rate": 0.0001, "loss": 1.1846, "loss/crossentropy": 2.3862226009368896, "loss/hidden": 1.015625, "loss/logits": 0.15631671249866486, "loss/reg": 0.0012648756382986903, "step": 3396 }, { "epoch": 0.424625, "grad_norm": 2.286381483078003, "grad_norm_var": 17.149913879453926, "learning_rate": 0.0001, "loss": 1.1578, "loss/crossentropy": 2.3956234455108643, "loss/hidden": 0.98828125, "loss/logits": 0.15687915682792664, "loss/reg": 0.0012641353532671928, "step": 3397 }, { "epoch": 0.42475, "grad_norm": 3.073234796524048, "grad_norm_var": 17.104448140061454, "learning_rate": 0.0001, "loss": 1.5482, "loss/crossentropy": 2.3788349628448486, "loss/hidden": 1.2890625, "loss/logits": 0.24646201729774475, "loss/reg": 0.0012633375590667129, "step": 3398 }, { "epoch": 0.424875, "grad_norm": 2.478750228881836, "grad_norm_var": 17.20229401665159, "learning_rate": 0.0001, "loss": 1.1745, "loss/crossentropy": 2.510806083679199, "loss/hidden": 1.0078125, "loss/logits": 0.15401197969913483, "loss/reg": 0.0012626349925994873, "step": 3399 }, { "epoch": 0.425, "grad_norm": 4.93013858795166, "grad_norm_var": 16.71415464522817, "learning_rate": 0.0001, "loss": 1.4849, "loss/crossentropy": 2.4152138233184814, "loss/hidden": 1.265625, "loss/logits": 0.2066764533519745, "loss/reg": 0.0012618736363947392, "step": 3400 }, { "epoch": 0.425125, "grad_norm": 3.364943504333496, "grad_norm_var": 16.583733756148302, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.366891384124756, "loss/hidden": 1.171875, "loss/logits": 0.20504985749721527, "loss/reg": 0.001261151279322803, "step": 3401 }, { "epoch": 0.42525, "grad_norm": 4.257473945617676, "grad_norm_var": 16.34810416934195, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.649857759475708, "loss/hidden": 1.0703125, "loss/logits": 0.14887118339538574, "loss/reg": 0.0012604593066498637, "step": 3402 }, { "epoch": 0.425375, "grad_norm": 2.534241199493408, "grad_norm_var": 16.007407569211516, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.3638546466827393, "loss/hidden": 0.9765625, "loss/logits": 0.1598854809999466, "loss/reg": 0.0012597551103681326, "step": 3403 }, { "epoch": 0.4255, "grad_norm": 3.278895616531372, "grad_norm_var": 0.5375339735711719, "learning_rate": 0.0001, "loss": 1.3241, "loss/crossentropy": 2.2669084072113037, "loss/hidden": 1.1328125, "loss/logits": 0.17869816720485687, "loss/reg": 0.0012590433470904827, "step": 3404 }, { "epoch": 0.425625, "grad_norm": 2.602390766143799, "grad_norm_var": 0.5382690470671356, "learning_rate": 0.0001, "loss": 1.0556, "loss/crossentropy": 2.500565767288208, "loss/hidden": 0.91015625, "loss/logits": 0.13290029764175415, "loss/reg": 0.0012583717470988631, "step": 3405 }, { "epoch": 0.42575, "grad_norm": 2.692824125289917, "grad_norm_var": 0.5350398821821252, "learning_rate": 0.0001, "loss": 1.0906, "loss/crossentropy": 2.580354690551758, "loss/hidden": 0.9375, "loss/logits": 0.14051620662212372, "loss/reg": 0.0012577392626553774, "step": 3406 }, { "epoch": 0.425875, "grad_norm": 2.926161527633667, "grad_norm_var": 0.5264479354616676, "learning_rate": 0.0001, "loss": 1.2475, "loss/crossentropy": 2.4844837188720703, "loss/hidden": 1.0546875, "loss/logits": 0.18022066354751587, "loss/reg": 0.0012571116676554084, "step": 3407 }, { "epoch": 0.426, "grad_norm": 3.9087629318237305, "grad_norm_var": 0.5713653950876617, "learning_rate": 0.0001, "loss": 1.3401, "loss/crossentropy": 2.592961311340332, "loss/hidden": 1.140625, "loss/logits": 0.18693436682224274, "loss/reg": 0.001256460091099143, "step": 3408 }, { "epoch": 0.426125, "grad_norm": 4.682487964630127, "grad_norm_var": 0.6978203280988962, "learning_rate": 0.0001, "loss": 1.4022, "loss/crossentropy": 2.6020073890686035, "loss/hidden": 1.1875, "loss/logits": 0.20211443305015564, "loss/reg": 0.0012558320304378867, "step": 3409 }, { "epoch": 0.42625, "grad_norm": 2.5506813526153564, "grad_norm_var": 0.6817577903436274, "learning_rate": 0.0001, "loss": 1.1943, "loss/crossentropy": 2.609788417816162, "loss/hidden": 1.0, "loss/logits": 0.18171468377113342, "loss/reg": 0.0012551514664664865, "step": 3410 }, { "epoch": 0.426375, "grad_norm": 3.0468664169311523, "grad_norm_var": 0.6832882959089297, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.9923455715179443, "loss/hidden": 1.109375, "loss/logits": 0.20635701715946198, "loss/reg": 0.001254511997103691, "step": 3411 }, { "epoch": 0.4265, "grad_norm": 3.119813919067383, "grad_norm_var": 0.652160122817864, "learning_rate": 0.0001, "loss": 1.2567, "loss/crossentropy": 2.7921488285064697, "loss/hidden": 1.0546875, "loss/logits": 0.18942540884017944, "loss/reg": 0.0012538874289020896, "step": 3412 }, { "epoch": 0.426625, "grad_norm": 4.253607749938965, "grad_norm_var": 0.645639626177789, "learning_rate": 0.0001, "loss": 1.5235, "loss/crossentropy": 2.385000228881836, "loss/hidden": 1.3203125, "loss/logits": 0.19061830639839172, "loss/reg": 0.001253205118700862, "step": 3413 }, { "epoch": 0.42675, "grad_norm": 2.5698156356811523, "grad_norm_var": 0.680481105186064, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.6111855506896973, "loss/hidden": 1.109375, "loss/logits": 0.19028973579406738, "loss/reg": 0.0012525215279310942, "step": 3414 }, { "epoch": 0.426875, "grad_norm": 4.289391040802002, "grad_norm_var": 0.681114139600182, "learning_rate": 0.0001, "loss": 1.6592, "loss/crossentropy": 2.3776509761810303, "loss/hidden": 1.3671875, "loss/logits": 0.27947333455085754, "loss/reg": 0.001251855632290244, "step": 3415 }, { "epoch": 0.427, "grad_norm": 2.4858953952789307, "grad_norm_var": 0.5682329358412136, "learning_rate": 0.0001, "loss": 1.3403, "loss/crossentropy": 2.4464216232299805, "loss/hidden": 1.109375, "loss/logits": 0.21839639544487, "loss/reg": 0.0012511778622865677, "step": 3416 }, { "epoch": 0.427125, "grad_norm": 2.9489448070526123, "grad_norm_var": 0.5746294262841957, "learning_rate": 0.0001, "loss": 1.2328, "loss/crossentropy": 2.475005865097046, "loss/hidden": 1.046875, "loss/logits": 0.17341601848602295, "loss/reg": 0.0012504736660048366, "step": 3417 }, { "epoch": 0.42725, "grad_norm": 3.367196559906006, "grad_norm_var": 0.5056756003131255, "learning_rate": 0.0001, "loss": 1.2034, "loss/crossentropy": 1.5884217023849487, "loss/hidden": 1.1015625, "loss/logits": 0.08933718502521515, "loss/reg": 0.0012498110299929976, "step": 3418 }, { "epoch": 0.427375, "grad_norm": 3.065373420715332, "grad_norm_var": 0.4759028678756181, "learning_rate": 0.0001, "loss": 1.1635, "loss/crossentropy": 2.765894651412964, "loss/hidden": 0.9609375, "loss/logits": 0.19002410769462585, "loss/reg": 0.0012491667876020074, "step": 3419 }, { "epoch": 0.4275, "grad_norm": 2.313544988632202, "grad_norm_var": 0.5287309495314417, "learning_rate": 0.0001, "loss": 1.1375, "loss/crossentropy": 2.4853460788726807, "loss/hidden": 0.94921875, "loss/logits": 0.17583268880844116, "loss/reg": 0.0012485285988077521, "step": 3420 }, { "epoch": 0.427625, "grad_norm": 3.061662435531616, "grad_norm_var": 0.5067587467360963, "learning_rate": 0.0001, "loss": 1.3654, "loss/crossentropy": 2.49043607711792, "loss/hidden": 1.125, "loss/logits": 0.2279426008462906, "loss/reg": 0.0012478969292715192, "step": 3421 }, { "epoch": 0.42775, "grad_norm": 3.8236587047576904, "grad_norm_var": 0.5094295431678287, "learning_rate": 0.0001, "loss": 1.2539, "loss/crossentropy": 2.7700281143188477, "loss/hidden": 1.078125, "loss/logits": 0.16328606009483337, "loss/reg": 0.0012472590897232294, "step": 3422 }, { "epoch": 0.427875, "grad_norm": 2.240814208984375, "grad_norm_var": 0.5707417725774196, "learning_rate": 0.0001, "loss": 1.047, "loss/crossentropy": 2.451385021209717, "loss/hidden": 0.89453125, "loss/logits": 0.14000791311264038, "loss/reg": 0.0012465600157156587, "step": 3423 }, { "epoch": 0.428, "grad_norm": 3.1770434379577637, "grad_norm_var": 0.5382790788617958, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.704780101776123, "loss/hidden": 1.0390625, "loss/logits": 0.16781997680664062, "loss/reg": 0.0012459279969334602, "step": 3424 }, { "epoch": 0.428125, "grad_norm": 3.030484914779663, "grad_norm_var": 0.37950800463040246, "learning_rate": 0.0001, "loss": 1.4456, "loss/crossentropy": 2.412060260772705, "loss/hidden": 1.21875, "loss/logits": 0.2143547683954239, "loss/reg": 0.0012453191448003054, "step": 3425 }, { "epoch": 0.42825, "grad_norm": 2.2262816429138184, "grad_norm_var": 0.40915514056148367, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.519752264022827, "loss/hidden": 0.94921875, "loss/logits": 0.17265602946281433, "loss/reg": 0.00124474021140486, "step": 3426 }, { "epoch": 0.428375, "grad_norm": 2.729315996170044, "grad_norm_var": 0.4161734302819163, "learning_rate": 0.0001, "loss": 1.1404, "loss/crossentropy": 2.7469873428344727, "loss/hidden": 0.9765625, "loss/logits": 0.15135417878627777, "loss/reg": 0.0012441625585779548, "step": 3427 }, { "epoch": 0.4285, "grad_norm": 3.4567322731018066, "grad_norm_var": 0.42667704387629113, "learning_rate": 0.0001, "loss": 1.2375, "loss/crossentropy": 2.775790214538574, "loss/hidden": 1.0546875, "loss/logits": 0.1703491508960724, "loss/reg": 0.0012434873497113585, "step": 3428 }, { "epoch": 0.428625, "grad_norm": 2.336427688598633, "grad_norm_var": 0.3525602953693273, "learning_rate": 0.0001, "loss": 1.047, "loss/crossentropy": 2.5805509090423584, "loss/hidden": 0.87890625, "loss/logits": 0.15566515922546387, "loss/reg": 0.0012428333284333348, "step": 3429 }, { "epoch": 0.42875, "grad_norm": 2.4215245246887207, "grad_norm_var": 0.3613560792001273, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.536388874053955, "loss/hidden": 0.953125, "loss/logits": 0.14159247279167175, "loss/reg": 0.0012422014260664582, "step": 3430 }, { "epoch": 0.428875, "grad_norm": 3.2112715244293213, "grad_norm_var": 0.2394381174648629, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.2904157638549805, "loss/hidden": 1.0, "loss/logits": 0.1331569105386734, "loss/reg": 0.0012416383251547813, "step": 3431 }, { "epoch": 0.429, "grad_norm": 45.88874816894531, "grad_norm_var": 115.76320167925172, "learning_rate": 0.0001, "loss": 1.0117, "loss/crossentropy": 2.6698877811431885, "loss/hidden": 0.87109375, "loss/logits": 0.12823878228664398, "loss/reg": 0.0012410827912390232, "step": 3432 }, { "epoch": 0.429125, "grad_norm": 2.4286248683929443, "grad_norm_var": 115.9627370388882, "learning_rate": 0.0001, "loss": 1.2616, "loss/crossentropy": 2.103612184524536, "loss/hidden": 1.0703125, "loss/logits": 0.1788921058177948, "loss/reg": 0.001240414916537702, "step": 3433 }, { "epoch": 0.42925, "grad_norm": 2.689265251159668, "grad_norm_var": 116.18864660411508, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.3609302043914795, "loss/hidden": 1.0703125, "loss/logits": 0.19104725122451782, "loss/reg": 0.0012397363316267729, "step": 3434 }, { "epoch": 0.429375, "grad_norm": 3.3400299549102783, "grad_norm_var": 116.10397256881402, "learning_rate": 0.0001, "loss": 1.3492, "loss/crossentropy": 2.4718568325042725, "loss/hidden": 1.1640625, "loss/logits": 0.17276926338672638, "loss/reg": 0.0012391533236950636, "step": 3435 }, { "epoch": 0.4295, "grad_norm": 3.018950939178467, "grad_norm_var": 115.83316624778392, "learning_rate": 0.0001, "loss": 1.3537, "loss/crossentropy": 2.4487743377685547, "loss/hidden": 1.171875, "loss/logits": 0.16945204138755798, "loss/reg": 0.001238464843481779, "step": 3436 }, { "epoch": 0.429625, "grad_norm": 2.6757137775421143, "grad_norm_var": 115.97142866387756, "learning_rate": 0.0001, "loss": 1.1447, "loss/crossentropy": 2.454679250717163, "loss/hidden": 0.95703125, "loss/logits": 0.1752498894929886, "loss/reg": 0.001237865537405014, "step": 3437 }, { "epoch": 0.42975, "grad_norm": 2.6714251041412354, "grad_norm_var": 116.31861681764059, "learning_rate": 0.0001, "loss": 1.3167, "loss/crossentropy": 2.067734479904175, "loss/hidden": 1.125, "loss/logits": 0.17934894561767578, "loss/reg": 0.001237238640896976, "step": 3438 }, { "epoch": 0.429875, "grad_norm": 3.305312156677246, "grad_norm_var": 115.93090987862843, "learning_rate": 0.0001, "loss": 1.394, "loss/crossentropy": 2.5658860206604004, "loss/hidden": 1.1875, "loss/logits": 0.19414980709552765, "loss/reg": 0.0012365588918328285, "step": 3439 }, { "epoch": 0.43, "grad_norm": 3.131331205368042, "grad_norm_var": 115.94543010225995, "learning_rate": 0.0001, "loss": 1.6409, "loss/crossentropy": 2.42006516456604, "loss/hidden": 1.3828125, "loss/logits": 0.2457335889339447, "loss/reg": 0.001235869829542935, "step": 3440 }, { "epoch": 0.430125, "grad_norm": 2.490414619445801, "grad_norm_var": 116.14401488853926, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.4109976291656494, "loss/hidden": 0.9765625, "loss/logits": 0.14418454468250275, "loss/reg": 0.0012351801851764321, "step": 3441 }, { "epoch": 0.43025, "grad_norm": 2.6899540424346924, "grad_norm_var": 115.95497827713713, "learning_rate": 0.0001, "loss": 1.2509, "loss/crossentropy": 2.6147475242614746, "loss/hidden": 1.0703125, "loss/logits": 0.16823075711727142, "loss/reg": 0.0012345298891887069, "step": 3442 }, { "epoch": 0.430375, "grad_norm": 2.528609275817871, "grad_norm_var": 116.03245322303522, "learning_rate": 0.0001, "loss": 1.1774, "loss/crossentropy": 2.4867196083068848, "loss/hidden": 1.0, "loss/logits": 0.16507115960121155, "loss/reg": 0.001233840943314135, "step": 3443 }, { "epoch": 0.4305, "grad_norm": 6.451398849487305, "grad_norm_var": 115.77000514420874, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 3.054737091064453, "loss/hidden": 1.0859375, "loss/logits": 0.22011621296405792, "loss/reg": 0.0012332327896729112, "step": 3444 }, { "epoch": 0.430625, "grad_norm": 9.352788925170898, "grad_norm_var": 115.6955469539873, "learning_rate": 0.0001, "loss": 1.8394, "loss/crossentropy": 2.72918701171875, "loss/hidden": 1.5625, "loss/logits": 0.26458457112312317, "loss/reg": 0.00123261334374547, "step": 3445 }, { "epoch": 0.43075, "grad_norm": 3.307373046875, "grad_norm_var": 115.30498293365741, "learning_rate": 0.0001, "loss": 1.2012, "loss/crossentropy": 2.7193400859832764, "loss/hidden": 1.0390625, "loss/logits": 0.14984726905822754, "loss/reg": 0.0012320136884227395, "step": 3446 }, { "epoch": 0.430875, "grad_norm": 2.40112042427063, "grad_norm_var": 115.6687205277014, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.5848987102508545, "loss/hidden": 0.91015625, "loss/logits": 0.13236987590789795, "loss/reg": 0.001231399248354137, "step": 3447 }, { "epoch": 0.431, "grad_norm": 3.086855888366699, "grad_norm_var": 3.372706481450143, "learning_rate": 0.0001, "loss": 1.323, "loss/crossentropy": 2.5999648571014404, "loss/hidden": 1.1484375, "loss/logits": 0.16229502856731415, "loss/reg": 0.0012307389406487346, "step": 3448 }, { "epoch": 0.431125, "grad_norm": 3.6063601970672607, "grad_norm_var": 3.2953866312694475, "learning_rate": 0.0001, "loss": 1.4105, "loss/crossentropy": 2.662879228591919, "loss/hidden": 1.1875, "loss/logits": 0.21066385507583618, "loss/reg": 0.001230068039149046, "step": 3449 }, { "epoch": 0.43125, "grad_norm": 3.1095049381256104, "grad_norm_var": 3.258381508155472, "learning_rate": 0.0001, "loss": 1.5884, "loss/crossentropy": 2.3134074211120605, "loss/hidden": 1.296875, "loss/logits": 0.27925246953964233, "loss/reg": 0.0012294263578951359, "step": 3450 }, { "epoch": 0.431375, "grad_norm": 2.473538398742676, "grad_norm_var": 3.332216342096789, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.654710054397583, "loss/hidden": 0.9609375, "loss/logits": 0.15276968479156494, "loss/reg": 0.0012287187855690718, "step": 3451 }, { "epoch": 0.4315, "grad_norm": 3.2627336978912354, "grad_norm_var": 3.319683742550171, "learning_rate": 0.0001, "loss": 1.2048, "loss/crossentropy": 2.2859139442443848, "loss/hidden": 1.0546875, "loss/logits": 0.13786113262176514, "loss/reg": 0.001228029141202569, "step": 3452 }, { "epoch": 0.431625, "grad_norm": 2.493816614151001, "grad_norm_var": 3.342568289601847, "learning_rate": 0.0001, "loss": 1.2819, "loss/crossentropy": 2.452895402908325, "loss/hidden": 1.0625, "loss/logits": 0.2070835530757904, "loss/reg": 0.001227400847710669, "step": 3453 }, { "epoch": 0.43175, "grad_norm": 2.3647286891937256, "grad_norm_var": 3.383256575428814, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.202064037322998, "loss/hidden": 1.0859375, "loss/logits": 0.16913244128227234, "loss/reg": 0.001226780004799366, "step": 3454 }, { "epoch": 0.431875, "grad_norm": 2.539780855178833, "grad_norm_var": 3.4401121464363276, "learning_rate": 0.0001, "loss": 1.1589, "loss/crossentropy": 2.3375728130340576, "loss/hidden": 0.9921875, "loss/logits": 0.15446734428405762, "loss/reg": 0.0012261293595656753, "step": 3455 }, { "epoch": 0.432, "grad_norm": 3.039510488510132, "grad_norm_var": 3.444609575464352, "learning_rate": 0.0001, "loss": 1.4382, "loss/crossentropy": 2.1632235050201416, "loss/hidden": 1.21875, "loss/logits": 0.20723550021648407, "loss/reg": 0.0012254834873601794, "step": 3456 }, { "epoch": 0.432125, "grad_norm": 2.476862668991089, "grad_norm_var": 3.4463547837484603, "learning_rate": 0.0001, "loss": 1.0985, "loss/crossentropy": 2.4073829650878906, "loss/hidden": 0.921875, "loss/logits": 0.16438889503479004, "loss/reg": 0.0012248161947354674, "step": 3457 }, { "epoch": 0.43225, "grad_norm": 2.796520948410034, "grad_norm_var": 3.4362785093367543, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.7997186183929443, "loss/hidden": 0.97265625, "loss/logits": 0.15992063283920288, "loss/reg": 0.0012241443619132042, "step": 3458 }, { "epoch": 0.432375, "grad_norm": 3.2943859100341797, "grad_norm_var": 3.3782681805617094, "learning_rate": 0.0001, "loss": 1.6189, "loss/crossentropy": 2.7017159461975098, "loss/hidden": 1.3203125, "loss/logits": 0.2863697409629822, "loss/reg": 0.0012234512250870466, "step": 3459 }, { "epoch": 0.4325, "grad_norm": 9.150236129760742, "grad_norm_var": 4.894258622525441, "learning_rate": 0.0001, "loss": 1.3915, "loss/crossentropy": 2.500932455062866, "loss/hidden": 1.203125, "loss/logits": 0.17619535326957703, "loss/reg": 0.0012228019768372178, "step": 3460 }, { "epoch": 0.432625, "grad_norm": 2.3378589153289795, "grad_norm_var": 2.65669880314623, "learning_rate": 0.0001, "loss": 1.0517, "loss/crossentropy": 2.4369237422943115, "loss/hidden": 0.9140625, "loss/logits": 0.12538093328475952, "loss/reg": 0.0012221434153616428, "step": 3461 }, { "epoch": 0.43275, "grad_norm": 2.456996202468872, "grad_norm_var": 2.6935558745605213, "learning_rate": 0.0001, "loss": 1.234, "loss/crossentropy": 2.6243457794189453, "loss/hidden": 1.0234375, "loss/logits": 0.1983601301908493, "loss/reg": 0.0012214162852615118, "step": 3462 }, { "epoch": 0.432875, "grad_norm": 2.84320068359375, "grad_norm_var": 2.6598204270241466, "learning_rate": 0.0001, "loss": 1.0562, "loss/crossentropy": 2.6735165119171143, "loss/hidden": 0.91015625, "loss/logits": 0.13384856283664703, "loss/reg": 0.0012207570252940059, "step": 3463 }, { "epoch": 0.433, "grad_norm": 2.743060827255249, "grad_norm_var": 2.67277479673922, "learning_rate": 0.0001, "loss": 1.1707, "loss/crossentropy": 2.230998992919922, "loss/hidden": 1.0, "loss/logits": 0.1585088074207306, "loss/reg": 0.0012200911296531558, "step": 3464 }, { "epoch": 0.433125, "grad_norm": 3.488234281539917, "grad_norm_var": 2.667039072203904, "learning_rate": 0.0001, "loss": 1.2195, "loss/crossentropy": 2.816776990890503, "loss/hidden": 1.0234375, "loss/logits": 0.1838226318359375, "loss/reg": 0.0012193657457828522, "step": 3465 }, { "epoch": 0.43325, "grad_norm": 2.8019826412200928, "grad_norm_var": 2.6758170615960806, "learning_rate": 0.0001, "loss": 1.0521, "loss/crossentropy": 2.746957302093506, "loss/hidden": 0.8984375, "loss/logits": 0.14152494072914124, "loss/reg": 0.001218555262312293, "step": 3466 }, { "epoch": 0.433375, "grad_norm": 2.2270567417144775, "grad_norm_var": 2.7021812449194176, "learning_rate": 0.0001, "loss": 1.1127, "loss/crossentropy": 2.328540086746216, "loss/hidden": 0.94921875, "loss/logits": 0.15134882926940918, "loss/reg": 0.0012177237076684833, "step": 3467 }, { "epoch": 0.4335, "grad_norm": 4.9913105964660645, "grad_norm_var": 2.9161084757931492, "learning_rate": 0.0001, "loss": 1.4319, "loss/crossentropy": 2.968677520751953, "loss/hidden": 1.1484375, "loss/logits": 0.2713378071784973, "loss/reg": 0.0012168894754722714, "step": 3468 }, { "epoch": 0.433625, "grad_norm": 2.5807225704193115, "grad_norm_var": 2.907785287628724, "learning_rate": 0.0001, "loss": 1.263, "loss/crossentropy": 2.6000256538391113, "loss/hidden": 1.046875, "loss/logits": 0.2039426565170288, "loss/reg": 0.0012162317289039493, "step": 3469 }, { "epoch": 0.43375, "grad_norm": 3.048914670944214, "grad_norm_var": 2.855528329678752, "learning_rate": 0.0001, "loss": 1.624, "loss/crossentropy": 2.372028112411499, "loss/hidden": 1.3515625, "loss/logits": 0.2602695822715759, "loss/reg": 0.0012155846925452352, "step": 3470 }, { "epoch": 0.433875, "grad_norm": 3.3348121643066406, "grad_norm_var": 2.814336388571759, "learning_rate": 0.0001, "loss": 1.4619, "loss/crossentropy": 2.1086273193359375, "loss/hidden": 1.234375, "loss/logits": 0.21537508070468903, "loss/reg": 0.001214950461871922, "step": 3471 }, { "epoch": 0.434, "grad_norm": 3.122891664505005, "grad_norm_var": 2.811310944426979, "learning_rate": 0.0001, "loss": 1.4317, "loss/crossentropy": 2.6366288661956787, "loss/hidden": 1.1875, "loss/logits": 0.232046440243721, "loss/reg": 0.0012141740880906582, "step": 3472 }, { "epoch": 0.434125, "grad_norm": 2.7893314361572266, "grad_norm_var": 2.78078865893132, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.6001977920532227, "loss/hidden": 1.125, "loss/logits": 0.19202546775341034, "loss/reg": 0.0012135268189013004, "step": 3473 }, { "epoch": 0.43425, "grad_norm": 10.19119644165039, "grad_norm_var": 5.627546769745172, "learning_rate": 0.0001, "loss": 2.0162, "loss/crossentropy": 4.430319309234619, "loss/hidden": 1.7265625, "loss/logits": 0.2775191068649292, "loss/reg": 0.0012127597583457828, "step": 3474 }, { "epoch": 0.434375, "grad_norm": 2.7287003993988037, "grad_norm_var": 5.688521344590063, "learning_rate": 0.0001, "loss": 1.1781, "loss/crossentropy": 2.491974353790283, "loss/hidden": 0.9921875, "loss/logits": 0.17377471923828125, "loss/reg": 0.0012119835009798408, "step": 3475 }, { "epoch": 0.4345, "grad_norm": 2.63222599029541, "grad_norm_var": 3.6960636506547937, "learning_rate": 0.0001, "loss": 1.2061, "loss/crossentropy": 2.50740385055542, "loss/hidden": 1.015625, "loss/logits": 0.17836406826972961, "loss/reg": 0.0012113320408388972, "step": 3476 }, { "epoch": 0.434625, "grad_norm": 4.090530872344971, "grad_norm_var": 3.641033929802565, "learning_rate": 0.0001, "loss": 1.4947, "loss/crossentropy": 2.102734088897705, "loss/hidden": 1.3125, "loss/logits": 0.17004507780075073, "loss/reg": 0.0012105869827792048, "step": 3477 }, { "epoch": 0.43475, "grad_norm": 2.4457340240478516, "grad_norm_var": 3.6426147356487792, "learning_rate": 0.0001, "loss": 1.3426, "loss/crossentropy": 2.1665356159210205, "loss/hidden": 1.125, "loss/logits": 0.20550841093063354, "loss/reg": 0.0012098514707759023, "step": 3478 }, { "epoch": 0.434875, "grad_norm": 3.5908169746398926, "grad_norm_var": 3.611703462995258, "learning_rate": 0.0001, "loss": 1.2882, "loss/crossentropy": 2.581627130508423, "loss/hidden": 1.09375, "loss/logits": 0.18235424160957336, "loss/reg": 0.0012091120006516576, "step": 3479 }, { "epoch": 0.435, "grad_norm": 2.3515243530273438, "grad_norm_var": 3.6634354565893865, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.5373475551605225, "loss/hidden": 1.109375, "loss/logits": 0.19196194410324097, "loss/reg": 0.0012084417976439, "step": 3480 }, { "epoch": 0.435125, "grad_norm": 3.44587779045105, "grad_norm_var": 3.6637608642152806, "learning_rate": 0.0001, "loss": 1.2926, "loss/crossentropy": 2.3316664695739746, "loss/hidden": 1.1328125, "loss/logits": 0.1477033942937851, "loss/reg": 0.0012077923165634274, "step": 3481 }, { "epoch": 0.43525, "grad_norm": 2.6313581466674805, "grad_norm_var": 3.6819915096019864, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.9107630252838135, "loss/hidden": 0.9765625, "loss/logits": 0.16727963089942932, "loss/reg": 0.0012070914963260293, "step": 3482 }, { "epoch": 0.435375, "grad_norm": 3.463376760482788, "grad_norm_var": 3.5655951033063795, "learning_rate": 0.0001, "loss": 1.3245, "loss/crossentropy": 2.6030032634735107, "loss/hidden": 1.125, "loss/logits": 0.187398761510849, "loss/reg": 0.0012064424809068441, "step": 3483 }, { "epoch": 0.4355, "grad_norm": 2.6820483207702637, "grad_norm_var": 3.4674095507929366, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.709855318069458, "loss/hidden": 0.92578125, "loss/logits": 0.14911293983459473, "loss/reg": 0.0012057506246492267, "step": 3484 }, { "epoch": 0.435625, "grad_norm": 2.3327221870422363, "grad_norm_var": 3.499853176559363, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.389228582382202, "loss/hidden": 0.953125, "loss/logits": 0.14736363291740417, "loss/reg": 0.0012051108060404658, "step": 3485 }, { "epoch": 0.43575, "grad_norm": 2.5504212379455566, "grad_norm_var": 3.540721862436381, "learning_rate": 0.0001, "loss": 1.1202, "loss/crossentropy": 2.641368865966797, "loss/hidden": 0.9453125, "loss/logits": 0.1628507375717163, "loss/reg": 0.0012045117327943444, "step": 3486 }, { "epoch": 0.435875, "grad_norm": 2.909332275390625, "grad_norm_var": 3.555676322168798, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.595078945159912, "loss/hidden": 1.140625, "loss/logits": 0.17350763082504272, "loss/reg": 0.001203811028972268, "step": 3487 }, { "epoch": 0.436, "grad_norm": 3.222846746444702, "grad_norm_var": 3.552975736491134, "learning_rate": 0.0001, "loss": 1.3106, "loss/crossentropy": 2.8183071613311768, "loss/hidden": 1.09375, "loss/logits": 0.20485123991966248, "loss/reg": 0.001203155959956348, "step": 3488 }, { "epoch": 0.436125, "grad_norm": 3.432222843170166, "grad_norm_var": 3.52829376695131, "learning_rate": 0.0001, "loss": 1.6966, "loss/crossentropy": 2.4181973934173584, "loss/hidden": 1.4140625, "loss/logits": 0.270487904548645, "loss/reg": 0.00120252906344831, "step": 3489 }, { "epoch": 0.43625, "grad_norm": 3.3231089115142822, "grad_norm_var": 0.2746774527988575, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.305241584777832, "loss/hidden": 0.99609375, "loss/logits": 0.14581048488616943, "loss/reg": 0.001201905426569283, "step": 3490 }, { "epoch": 0.436375, "grad_norm": 2.8960988521575928, "grad_norm_var": 0.27060666692323115, "learning_rate": 0.0001, "loss": 0.995, "loss/crossentropy": 2.654099941253662, "loss/hidden": 0.84765625, "loss/logits": 0.13534346222877502, "loss/reg": 0.0012013025116175413, "step": 3491 }, { "epoch": 0.4365, "grad_norm": 4.5448150634765625, "grad_norm_var": 0.4054408114144865, "learning_rate": 0.0001, "loss": 1.402, "loss/crossentropy": 2.838365077972412, "loss/hidden": 1.203125, "loss/logits": 0.18685215711593628, "loss/reg": 0.0012006438337266445, "step": 3492 }, { "epoch": 0.436625, "grad_norm": 3.1039233207702637, "grad_norm_var": 0.33854798112083295, "learning_rate": 0.0001, "loss": 1.4342, "loss/crossentropy": 2.517207145690918, "loss/hidden": 1.203125, "loss/logits": 0.21905256807804108, "loss/reg": 0.0011999867856502533, "step": 3493 }, { "epoch": 0.43675, "grad_norm": 12.40979290008545, "grad_norm_var": 5.730428899030075, "learning_rate": 0.0001, "loss": 3.0383, "loss/crossentropy": 2.436239242553711, "loss/hidden": 2.390625, "loss/logits": 0.6357196569442749, "loss/reg": 0.0011994364904239774, "step": 3494 }, { "epoch": 0.436875, "grad_norm": 2.217996835708618, "grad_norm_var": 5.864660576723119, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.5777812004089355, "loss/hidden": 0.94140625, "loss/logits": 0.15795490145683289, "loss/reg": 0.0011988927144557238, "step": 3495 }, { "epoch": 0.437, "grad_norm": 2.5710649490356445, "grad_norm_var": 5.831278473317988, "learning_rate": 0.0001, "loss": 1.3773, "loss/crossentropy": 2.556877851486206, "loss/hidden": 1.1484375, "loss/logits": 0.21689218282699585, "loss/reg": 0.001198328915052116, "step": 3496 }, { "epoch": 0.437125, "grad_norm": 2.3586888313293457, "grad_norm_var": 5.928734813573347, "learning_rate": 0.0001, "loss": 1.2786, "loss/crossentropy": 2.2116305828094482, "loss/hidden": 1.1015625, "loss/logits": 0.1650557965040207, "loss/reg": 0.0011977785034105182, "step": 3497 }, { "epoch": 0.43725, "grad_norm": 3.1644842624664307, "grad_norm_var": 5.881865733199678, "learning_rate": 0.0001, "loss": 1.2017, "loss/crossentropy": 2.9448065757751465, "loss/hidden": 1.03125, "loss/logits": 0.1584676206111908, "loss/reg": 0.0011972419451922178, "step": 3498 }, { "epoch": 0.437375, "grad_norm": 4.873732089996338, "grad_norm_var": 5.985394615488459, "learning_rate": 0.0001, "loss": 1.2737, "loss/crossentropy": 2.4594779014587402, "loss/hidden": 1.078125, "loss/logits": 0.18357062339782715, "loss/reg": 0.0011967475293204188, "step": 3499 }, { "epoch": 0.4375, "grad_norm": 3.5670506954193115, "grad_norm_var": 5.918702247485991, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.89233660697937, "loss/hidden": 1.1015625, "loss/logits": 0.19002707302570343, "loss/reg": 0.0011961216805502772, "step": 3500 }, { "epoch": 0.437625, "grad_norm": 5.038003444671631, "grad_norm_var": 5.876654566401722, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.988802194595337, "loss/hidden": 1.1484375, "loss/logits": 0.1657765805721283, "loss/reg": 0.0011955717345699668, "step": 3501 }, { "epoch": 0.43775, "grad_norm": 6.115634918212891, "grad_norm_var": 6.035967897044383, "learning_rate": 0.0001, "loss": 1.4524, "loss/crossentropy": 2.2842555046081543, "loss/hidden": 1.234375, "loss/logits": 0.2060738056898117, "loss/reg": 0.0011950369225814939, "step": 3502 }, { "epoch": 0.437875, "grad_norm": 2.5483436584472656, "grad_norm_var": 6.101869061924177, "learning_rate": 0.0001, "loss": 1.2272, "loss/crossentropy": 2.5188350677490234, "loss/hidden": 1.0390625, "loss/logits": 0.17614933848381042, "loss/reg": 0.0011943891877308488, "step": 3503 }, { "epoch": 0.438, "grad_norm": 2.2892470359802246, "grad_norm_var": 6.263881740539867, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.4421935081481934, "loss/hidden": 0.94921875, "loss/logits": 0.14402246475219727, "loss/reg": 0.0011938222451135516, "step": 3504 }, { "epoch": 0.438125, "grad_norm": 2.9653117656707764, "grad_norm_var": 6.314621263831796, "learning_rate": 0.0001, "loss": 1.3109, "loss/crossentropy": 2.3867926597595215, "loss/hidden": 1.1171875, "loss/logits": 0.1817684769630432, "loss/reg": 0.0011931926710531116, "step": 3505 }, { "epoch": 0.43825, "grad_norm": 4.216728687286377, "grad_norm_var": 6.283974524231153, "learning_rate": 0.0001, "loss": 1.2662, "loss/crossentropy": 2.3675005435943604, "loss/hidden": 1.09375, "loss/logits": 0.1605110466480255, "loss/reg": 0.0011925544822588563, "step": 3506 }, { "epoch": 0.438375, "grad_norm": 2.450908660888672, "grad_norm_var": 6.365155928018351, "learning_rate": 0.0001, "loss": 1.2896, "loss/crossentropy": 2.471416473388672, "loss/hidden": 1.1015625, "loss/logits": 0.17609301209449768, "loss/reg": 0.0011919805547222495, "step": 3507 }, { "epoch": 0.4385, "grad_norm": 2.1974241733551025, "grad_norm_var": 6.547550504139699, "learning_rate": 0.0001, "loss": 1.0792, "loss/crossentropy": 2.625760555267334, "loss/hidden": 0.921875, "loss/logits": 0.14538338780403137, "loss/reg": 0.0011914258357137442, "step": 3508 }, { "epoch": 0.438625, "grad_norm": 2.9582881927490234, "grad_norm_var": 6.563956090816539, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.4431028366088867, "loss/hidden": 1.1171875, "loss/logits": 0.19678360223770142, "loss/reg": 0.001190775539726019, "step": 3509 }, { "epoch": 0.43875, "grad_norm": 1.934443712234497, "grad_norm_var": 1.4966048790954043, "learning_rate": 0.0001, "loss": 1.0106, "loss/crossentropy": 2.591839551925659, "loss/hidden": 0.87109375, "loss/logits": 0.12761525809764862, "loss/reg": 0.0011902200058102608, "step": 3510 }, { "epoch": 0.438875, "grad_norm": 2.5278308391571045, "grad_norm_var": 1.4613466795983934, "learning_rate": 0.0001, "loss": 1.238, "loss/crossentropy": 2.7111833095550537, "loss/hidden": 1.0625, "loss/logits": 0.163572758436203, "loss/reg": 0.0011896856594830751, "step": 3511 }, { "epoch": 0.439, "grad_norm": 4.258382320404053, "grad_norm_var": 1.4896758039529119, "learning_rate": 0.0001, "loss": 1.5884, "loss/crossentropy": 2.7958221435546875, "loss/hidden": 1.3203125, "loss/logits": 0.2561938166618347, "loss/reg": 0.0011890269815921783, "step": 3512 }, { "epoch": 0.439125, "grad_norm": 2.506981372833252, "grad_norm_var": 1.4716171239009668, "learning_rate": 0.0001, "loss": 1.2418, "loss/crossentropy": 2.356534719467163, "loss/hidden": 1.078125, "loss/logits": 0.15180833637714386, "loss/reg": 0.001188386115245521, "step": 3513 }, { "epoch": 0.43925, "grad_norm": 2.5087265968322754, "grad_norm_var": 1.5147836297799755, "learning_rate": 0.0001, "loss": 1.233, "loss/crossentropy": 2.4810922145843506, "loss/hidden": 1.046875, "loss/logits": 0.17421108484268188, "loss/reg": 0.0011878141667693853, "step": 3514 }, { "epoch": 0.439375, "grad_norm": 2.838984489440918, "grad_norm_var": 1.3492557548452129, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.5128064155578613, "loss/hidden": 1.09375, "loss/logits": 0.16628193855285645, "loss/reg": 0.0011871742317453027, "step": 3515 }, { "epoch": 0.4395, "grad_norm": 3.3905653953552246, "grad_norm_var": 1.3421568089402693, "learning_rate": 0.0001, "loss": 1.6992, "loss/crossentropy": 2.595998764038086, "loss/hidden": 1.4140625, "loss/logits": 0.2732674181461334, "loss/reg": 0.0011865240521728992, "step": 3516 }, { "epoch": 0.439625, "grad_norm": 3.312877655029297, "grad_norm_var": 1.0988593511950513, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.8347625732421875, "loss/hidden": 1.1015625, "loss/logits": 0.2041352540254593, "loss/reg": 0.0011858633952215314, "step": 3517 }, { "epoch": 0.43975, "grad_norm": 2.7405972480773926, "grad_norm_var": 0.43744487443665603, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.581897258758545, "loss/hidden": 1.0078125, "loss/logits": 0.15973857045173645, "loss/reg": 0.0011851764284074306, "step": 3518 }, { "epoch": 0.439875, "grad_norm": 2.9293551445007324, "grad_norm_var": 0.43104846274295594, "learning_rate": 0.0001, "loss": 1.3913, "loss/crossentropy": 2.339521884918213, "loss/hidden": 1.203125, "loss/logits": 0.1763632893562317, "loss/reg": 0.0011845098342746496, "step": 3519 }, { "epoch": 0.44, "grad_norm": 2.693204641342163, "grad_norm_var": 0.4096083499761825, "learning_rate": 0.0001, "loss": 1.1883, "loss/crossentropy": 2.706615447998047, "loss/hidden": 1.0078125, "loss/logits": 0.1686449646949768, "loss/reg": 0.001183857093565166, "step": 3520 }, { "epoch": 0.440125, "grad_norm": 3.3165698051452637, "grad_norm_var": 0.4202889731297053, "learning_rate": 0.0001, "loss": 1.3745, "loss/crossentropy": 2.5228943824768066, "loss/hidden": 1.1640625, "loss/logits": 0.1986275315284729, "loss/reg": 0.001183178392238915, "step": 3521 }, { "epoch": 0.44025, "grad_norm": 2.7257802486419678, "grad_norm_var": 0.30220987275662287, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.5024516582489014, "loss/hidden": 1.0625, "loss/logits": 0.16182298958301544, "loss/reg": 0.0011824622051790357, "step": 3522 }, { "epoch": 0.440375, "grad_norm": 2.2382771968841553, "grad_norm_var": 0.3158025480067181, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.3867766857147217, "loss/hidden": 0.9921875, "loss/logits": 0.1785307377576828, "loss/reg": 0.0011817410122603178, "step": 3523 }, { "epoch": 0.4405, "grad_norm": 3.301401376724243, "grad_norm_var": 0.30071786575470016, "learning_rate": 0.0001, "loss": 1.2382, "loss/crossentropy": 2.332056760787964, "loss/hidden": 1.0390625, "loss/logits": 0.18734192848205566, "loss/reg": 0.0011809492716565728, "step": 3524 }, { "epoch": 0.440625, "grad_norm": 5.856470108032227, "grad_norm_var": 0.8534665886064203, "learning_rate": 0.0001, "loss": 1.1433, "loss/crossentropy": 2.3241350650787354, "loss/hidden": 1.0, "loss/logits": 0.1315079778432846, "loss/reg": 0.0011803017696365714, "step": 3525 }, { "epoch": 0.44075, "grad_norm": 2.825281858444214, "grad_norm_var": 0.7684801643045744, "learning_rate": 0.0001, "loss": 1.3223, "loss/crossentropy": 2.2829885482788086, "loss/hidden": 1.1328125, "loss/logits": 0.1776442527770996, "loss/reg": 0.0011795731261372566, "step": 3526 }, { "epoch": 0.440875, "grad_norm": 2.5610859394073486, "grad_norm_var": 0.7659093844279823, "learning_rate": 0.0001, "loss": 1.1626, "loss/crossentropy": 2.4389169216156006, "loss/hidden": 0.9765625, "loss/logits": 0.174253910779953, "loss/reg": 0.0011789381969720125, "step": 3527 }, { "epoch": 0.441, "grad_norm": 2.731886625289917, "grad_norm_var": 0.6809235427520299, "learning_rate": 0.0001, "loss": 1.1758, "loss/crossentropy": 2.1395652294158936, "loss/hidden": 1.0, "loss/logits": 0.16399237513542175, "loss/reg": 0.001178206643089652, "step": 3528 }, { "epoch": 0.441125, "grad_norm": 2.883371114730835, "grad_norm_var": 0.6635361537843579, "learning_rate": 0.0001, "loss": 1.2626, "loss/crossentropy": 2.510798454284668, "loss/hidden": 1.0703125, "loss/logits": 0.1804863065481186, "loss/reg": 0.0011775859165936708, "step": 3529 }, { "epoch": 0.44125, "grad_norm": 2.5331616401672363, "grad_norm_var": 0.6617989144313687, "learning_rate": 0.0001, "loss": 1.2424, "loss/crossentropy": 2.586804151535034, "loss/hidden": 1.0546875, "loss/logits": 0.17594727873802185, "loss/reg": 0.001176953432150185, "step": 3530 }, { "epoch": 0.441375, "grad_norm": 4.199496269226074, "grad_norm_var": 0.7383131864292627, "learning_rate": 0.0001, "loss": 1.2872, "loss/crossentropy": 2.4636120796203613, "loss/hidden": 1.09375, "loss/logits": 0.18168184161186218, "loss/reg": 0.001176302321255207, "step": 3531 }, { "epoch": 0.4415, "grad_norm": 2.6419715881347656, "grad_norm_var": 0.7483243154142618, "learning_rate": 0.0001, "loss": 1.3141, "loss/crossentropy": 2.353142023086548, "loss/hidden": 1.1171875, "loss/logits": 0.18516454100608826, "loss/reg": 0.0011756623862311244, "step": 3532 }, { "epoch": 0.441625, "grad_norm": 3.9147651195526123, "grad_norm_var": 0.7885976589917313, "learning_rate": 0.0001, "loss": 1.3893, "loss/crossentropy": 2.902245044708252, "loss/hidden": 1.1875, "loss/logits": 0.19005146622657776, "loss/reg": 0.0011750501580536366, "step": 3533 }, { "epoch": 0.44175, "grad_norm": 2.6390585899353027, "grad_norm_var": 0.7945246903757889, "learning_rate": 0.0001, "loss": 1.033, "loss/crossentropy": 2.5772149562835693, "loss/hidden": 0.89453125, "loss/logits": 0.1267174631357193, "loss/reg": 0.0011744365328922868, "step": 3534 }, { "epoch": 0.441875, "grad_norm": 3.1822335720062256, "grad_norm_var": 0.791943503899942, "learning_rate": 0.0001, "loss": 1.2173, "loss/crossentropy": 2.845505475997925, "loss/hidden": 1.03125, "loss/logits": 0.1742953062057495, "loss/reg": 0.0011738425819203258, "step": 3535 }, { "epoch": 0.442, "grad_norm": 2.9762825965881348, "grad_norm_var": 0.7800786292880197, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.809368371963501, "loss/hidden": 0.953125, "loss/logits": 0.16236835718154907, "loss/reg": 0.0011732704006135464, "step": 3536 }, { "epoch": 0.442125, "grad_norm": 2.2480151653289795, "grad_norm_var": 0.8288415554131954, "learning_rate": 0.0001, "loss": 1.1549, "loss/crossentropy": 2.800075054168701, "loss/hidden": 0.98828125, "loss/logits": 0.15485844016075134, "loss/reg": 0.0011726267402991652, "step": 3537 }, { "epoch": 0.44225, "grad_norm": 2.9925923347473145, "grad_norm_var": 0.820292530675101, "learning_rate": 0.0001, "loss": 1.2735, "loss/crossentropy": 2.4843082427978516, "loss/hidden": 1.0859375, "loss/logits": 0.17585468292236328, "loss/reg": 0.0011720317415893078, "step": 3538 }, { "epoch": 0.442375, "grad_norm": 2.9145236015319824, "grad_norm_var": 0.7704696941124657, "learning_rate": 0.0001, "loss": 1.2922, "loss/crossentropy": 2.8048834800720215, "loss/hidden": 1.0859375, "loss/logits": 0.1945202648639679, "loss/reg": 0.0011714363936334848, "step": 3539 }, { "epoch": 0.4425, "grad_norm": 2.7871246337890625, "grad_norm_var": 0.7766249483548504, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.4248902797698975, "loss/hidden": 1.109375, "loss/logits": 0.18787690997123718, "loss/reg": 0.0011708623496815562, "step": 3540 }, { "epoch": 0.442625, "grad_norm": 2.1860570907592773, "grad_norm_var": 0.27842439391009727, "learning_rate": 0.0001, "loss": 1.1244, "loss/crossentropy": 2.4124038219451904, "loss/hidden": 0.9609375, "loss/logits": 0.1517828404903412, "loss/reg": 0.0011702269548550248, "step": 3541 }, { "epoch": 0.44275, "grad_norm": 2.3994803428649902, "grad_norm_var": 0.29334841544943707, "learning_rate": 0.0001, "loss": 1.1829, "loss/crossentropy": 2.6770095825195312, "loss/hidden": 0.984375, "loss/logits": 0.18678104877471924, "loss/reg": 0.001169562223367393, "step": 3542 }, { "epoch": 0.442875, "grad_norm": 2.5452682971954346, "grad_norm_var": 0.29399856845995676, "learning_rate": 0.0001, "loss": 1.3185, "loss/crossentropy": 2.551114320755005, "loss/hidden": 1.109375, "loss/logits": 0.19748541712760925, "loss/reg": 0.001168894348666072, "step": 3543 }, { "epoch": 0.443, "grad_norm": 2.2736899852752686, "grad_norm_var": 0.31500527070219275, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.41780424118042, "loss/hidden": 1.09375, "loss/logits": 0.19942106306552887, "loss/reg": 0.0011682125041261315, "step": 3544 }, { "epoch": 0.443125, "grad_norm": 3.531858205795288, "grad_norm_var": 0.34570302338666276, "learning_rate": 0.0001, "loss": 1.4774, "loss/crossentropy": 2.4555931091308594, "loss/hidden": 1.265625, "loss/logits": 0.2000923603773117, "loss/reg": 0.0011675978312268853, "step": 3545 }, { "epoch": 0.44325, "grad_norm": 3.276501178741455, "grad_norm_var": 0.34657058579682976, "learning_rate": 0.0001, "loss": 1.3669, "loss/crossentropy": 2.3737740516662598, "loss/hidden": 1.125, "loss/logits": 0.23027172684669495, "loss/reg": 0.0011669376399368048, "step": 3546 }, { "epoch": 0.443375, "grad_norm": 2.2330145835876465, "grad_norm_var": 0.2525988319549308, "learning_rate": 0.0001, "loss": 1.1364, "loss/crossentropy": 2.708735942840576, "loss/hidden": 0.95703125, "loss/logits": 0.16765856742858887, "loss/reg": 0.0011662845499813557, "step": 3547 }, { "epoch": 0.4435, "grad_norm": 2.5228734016418457, "grad_norm_var": 0.25593767802966644, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.242794990539551, "loss/hidden": 0.92578125, "loss/logits": 0.12195594608783722, "loss/reg": 0.0011656444985419512, "step": 3548 }, { "epoch": 0.443625, "grad_norm": 2.7598013877868652, "grad_norm_var": 0.16594025509055518, "learning_rate": 0.0001, "loss": 1.4923, "loss/crossentropy": 2.178016424179077, "loss/hidden": 1.2578125, "loss/logits": 0.222868412733078, "loss/reg": 0.001164961257018149, "step": 3549 }, { "epoch": 0.44375, "grad_norm": 3.261380910873413, "grad_norm_var": 0.1836970809637819, "learning_rate": 0.0001, "loss": 1.2722, "loss/crossentropy": 2.3236746788024902, "loss/hidden": 1.0546875, "loss/logits": 0.2058398574590683, "loss/reg": 0.0011642600875347853, "step": 3550 }, { "epoch": 0.443875, "grad_norm": 2.814697742462158, "grad_norm_var": 0.17123602018924255, "learning_rate": 0.0001, "loss": 1.3606, "loss/crossentropy": 2.4919533729553223, "loss/hidden": 1.140625, "loss/logits": 0.20830154418945312, "loss/reg": 0.00116364483255893, "step": 3551 }, { "epoch": 0.444, "grad_norm": 2.7638015747070312, "grad_norm_var": 0.16715682294875644, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.5647330284118652, "loss/hidden": 1.015625, "loss/logits": 0.16942167282104492, "loss/reg": 0.0011629794025793672, "step": 3552 }, { "epoch": 0.444125, "grad_norm": 2.776494264602661, "grad_norm_var": 0.15139561842556734, "learning_rate": 0.0001, "loss": 1.31, "loss/crossentropy": 2.501893997192383, "loss/hidden": 1.109375, "loss/logits": 0.18898087739944458, "loss/reg": 0.0011622948804870248, "step": 3553 }, { "epoch": 0.44425, "grad_norm": 2.5202553272247314, "grad_norm_var": 0.15021560146062615, "learning_rate": 0.0001, "loss": 1.1038, "loss/crossentropy": 2.5649075508117676, "loss/hidden": 0.9375, "loss/logits": 0.15469080209732056, "loss/reg": 0.001161619438789785, "step": 3554 }, { "epoch": 0.444375, "grad_norm": 2.7661795616149902, "grad_norm_var": 0.1478013333678054, "learning_rate": 0.0001, "loss": 1.1625, "loss/crossentropy": 2.449272871017456, "loss/hidden": 0.98046875, "loss/logits": 0.1704619824886322, "loss/reg": 0.0011609526118263602, "step": 3555 }, { "epoch": 0.4445, "grad_norm": 2.7933788299560547, "grad_norm_var": 0.14786504393243735, "learning_rate": 0.0001, "loss": 1.198, "loss/crossentropy": 2.584373712539673, "loss/hidden": 1.03125, "loss/logits": 0.15514236688613892, "loss/reg": 0.0011602436425164342, "step": 3556 }, { "epoch": 0.444625, "grad_norm": 2.733640432357788, "grad_norm_var": 0.12805647130891748, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.3982536792755127, "loss/hidden": 0.97265625, "loss/logits": 0.1598653793334961, "loss/reg": 0.0011596106924116611, "step": 3557 }, { "epoch": 0.44475, "grad_norm": 2.3131258487701416, "grad_norm_var": 0.1325384777002758, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.2804203033447266, "loss/hidden": 1.03125, "loss/logits": 0.15429136157035828, "loss/reg": 0.0011589446803554893, "step": 3558 }, { "epoch": 0.444875, "grad_norm": 2.300266981124878, "grad_norm_var": 0.14274518893386415, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.427419424057007, "loss/hidden": 0.93359375, "loss/logits": 0.13798177242279053, "loss/reg": 0.0011582397855818272, "step": 3559 }, { "epoch": 0.445, "grad_norm": 3.0616273880004883, "grad_norm_var": 0.13386520051520823, "learning_rate": 0.0001, "loss": 1.6426, "loss/crossentropy": 2.341487407684326, "loss/hidden": 1.3671875, "loss/logits": 0.2638081908226013, "loss/reg": 0.001157608232460916, "step": 3560 }, { "epoch": 0.445125, "grad_norm": 2.3378190994262695, "grad_norm_var": 0.10276505520577076, "learning_rate": 0.0001, "loss": 1.1082, "loss/crossentropy": 2.4932336807250977, "loss/hidden": 0.94140625, "loss/logits": 0.1551976501941681, "loss/reg": 0.001156937680207193, "step": 3561 }, { "epoch": 0.44525, "grad_norm": 3.0600171089172363, "grad_norm_var": 0.08911658281806088, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.4198286533355713, "loss/hidden": 1.25, "loss/logits": 0.24445520341396332, "loss/reg": 0.0011562502477318048, "step": 3562 }, { "epoch": 0.445375, "grad_norm": 3.337578535079956, "grad_norm_var": 0.09826686911328555, "learning_rate": 0.0001, "loss": 1.6268, "loss/crossentropy": 2.2925186157226562, "loss/hidden": 1.3515625, "loss/logits": 0.26368141174316406, "loss/reg": 0.0011556192766875029, "step": 3563 }, { "epoch": 0.4455, "grad_norm": 2.659259557723999, "grad_norm_var": 0.09515946080169423, "learning_rate": 0.0001, "loss": 1.1746, "loss/crossentropy": 2.0706629753112793, "loss/hidden": 1.0078125, "loss/logits": 0.15528205037117004, "loss/reg": 0.0011550229974091053, "step": 3564 }, { "epoch": 0.445625, "grad_norm": 2.8057568073272705, "grad_norm_var": 0.09525220010229773, "learning_rate": 0.0001, "loss": 1.326, "loss/crossentropy": 2.6041977405548096, "loss/hidden": 1.0859375, "loss/logits": 0.2285037636756897, "loss/reg": 0.0011544260196387768, "step": 3565 }, { "epoch": 0.44575, "grad_norm": 3.1558749675750732, "grad_norm_var": 0.08902249614874691, "learning_rate": 0.0001, "loss": 1.4572, "loss/crossentropy": 2.144787311553955, "loss/hidden": 1.2109375, "loss/logits": 0.23475821316242218, "loss/reg": 0.0011537930695340037, "step": 3566 }, { "epoch": 0.445875, "grad_norm": 2.304584264755249, "grad_norm_var": 0.10173478429978691, "learning_rate": 0.0001, "loss": 1.1759, "loss/crossentropy": 2.5764341354370117, "loss/hidden": 0.99609375, "loss/logits": 0.16824042797088623, "loss/reg": 0.001153232529759407, "step": 3567 }, { "epoch": 0.446, "grad_norm": 3.410735607147217, "grad_norm_var": 0.13075608266555794, "learning_rate": 0.0001, "loss": 1.401, "loss/crossentropy": 2.5188114643096924, "loss/hidden": 1.1875, "loss/logits": 0.20201647281646729, "loss/reg": 0.0011526905000209808, "step": 3568 }, { "epoch": 0.446125, "grad_norm": 3.209620475769043, "grad_norm_var": 0.14279612593460508, "learning_rate": 0.0001, "loss": 1.1826, "loss/crossentropy": 2.742910385131836, "loss/hidden": 1.0, "loss/logits": 0.171085387468338, "loss/reg": 0.0011521711712703109, "step": 3569 }, { "epoch": 0.44625, "grad_norm": 2.6267716884613037, "grad_norm_var": 0.1395591266941795, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.3360354900360107, "loss/hidden": 1.109375, "loss/logits": 0.20371320843696594, "loss/reg": 0.0011515433434396982, "step": 3570 }, { "epoch": 0.446375, "grad_norm": 3.6272213459014893, "grad_norm_var": 0.181466383462858, "learning_rate": 0.0001, "loss": 1.12, "loss/crossentropy": 2.884351968765259, "loss/hidden": 0.9609375, "loss/logits": 0.1475556194782257, "loss/reg": 0.0011509184259921312, "step": 3571 }, { "epoch": 0.4465, "grad_norm": 2.4846599102020264, "grad_norm_var": 0.19010693608252016, "learning_rate": 0.0001, "loss": 1.1033, "loss/crossentropy": 2.7638909816741943, "loss/hidden": 0.94140625, "loss/logits": 0.15043224394321442, "loss/reg": 0.0011503053829073906, "step": 3572 }, { "epoch": 0.446625, "grad_norm": 2.3881146907806396, "grad_norm_var": 0.2024357441937693, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.6274099349975586, "loss/hidden": 0.98046875, "loss/logits": 0.1619289517402649, "loss/reg": 0.0011497308732941747, "step": 3573 }, { "epoch": 0.44675, "grad_norm": 2.777580499649048, "grad_norm_var": 0.1846718601206587, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.3124446868896484, "loss/hidden": 1.171875, "loss/logits": 0.21644467115402222, "loss/reg": 0.0011492264457046986, "step": 3574 }, { "epoch": 0.446875, "grad_norm": 3.8019254207611084, "grad_norm_var": 0.2161969399307715, "learning_rate": 0.0001, "loss": 1.1198, "loss/crossentropy": 2.467770576477051, "loss/hidden": 0.96875, "loss/logits": 0.13957038521766663, "loss/reg": 0.001148765441030264, "step": 3575 }, { "epoch": 0.447, "grad_norm": 3.065824031829834, "grad_norm_var": 0.21626577767776947, "learning_rate": 0.0001, "loss": 1.1505, "loss/crossentropy": 2.5153257846832275, "loss/hidden": 0.99609375, "loss/logits": 0.14290481805801392, "loss/reg": 0.001148151233792305, "step": 3576 }, { "epoch": 0.447125, "grad_norm": 2.791710138320923, "grad_norm_var": 0.19264810354225784, "learning_rate": 0.0001, "loss": 1.1728, "loss/crossentropy": 2.6984310150146484, "loss/hidden": 0.99609375, "loss/logits": 0.1652686595916748, "loss/reg": 0.0011475669452920556, "step": 3577 }, { "epoch": 0.44725, "grad_norm": 2.660292625427246, "grad_norm_var": 0.19779420628742247, "learning_rate": 0.0001, "loss": 1.1709, "loss/crossentropy": 2.4613428115844727, "loss/hidden": 1.0, "loss/logits": 0.15944907069206238, "loss/reg": 0.001146994298323989, "step": 3578 }, { "epoch": 0.447375, "grad_norm": 2.4733824729919434, "grad_norm_var": 0.19914612919305127, "learning_rate": 0.0001, "loss": 1.1131, "loss/crossentropy": 2.384662389755249, "loss/hidden": 0.92578125, "loss/logits": 0.17585483193397522, "loss/reg": 0.0011464846320450306, "step": 3579 }, { "epoch": 0.4475, "grad_norm": 3.34675669670105, "grad_norm_var": 0.20751679049088215, "learning_rate": 0.0001, "loss": 1.5887, "loss/crossentropy": 2.4943597316741943, "loss/hidden": 1.3515625, "loss/logits": 0.2256905883550644, "loss/reg": 0.0011459969682618976, "step": 3580 }, { "epoch": 0.447625, "grad_norm": 2.388507843017578, "grad_norm_var": 0.22548655580539975, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.425760507583618, "loss/hidden": 0.9140625, "loss/logits": 0.1422823965549469, "loss/reg": 0.0011455710045993328, "step": 3581 }, { "epoch": 0.44775, "grad_norm": 2.2057530879974365, "grad_norm_var": 0.2503914458022327, "learning_rate": 0.0001, "loss": 1.2248, "loss/crossentropy": 2.3435981273651123, "loss/hidden": 1.046875, "loss/logits": 0.16652315855026245, "loss/reg": 0.0011449484154582024, "step": 3582 }, { "epoch": 0.447875, "grad_norm": 2.7188642024993896, "grad_norm_var": 0.23111709575955405, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.2792136669158936, "loss/hidden": 1.21875, "loss/logits": 0.20024079084396362, "loss/reg": 0.0011444262927398086, "step": 3583 }, { "epoch": 0.448, "grad_norm": 2.5918216705322266, "grad_norm_var": 0.2143826273239976, "learning_rate": 0.0001, "loss": 1.1332, "loss/crossentropy": 2.6339030265808105, "loss/hidden": 0.9609375, "loss/logits": 0.16080284118652344, "loss/reg": 0.0011439183726906776, "step": 3584 }, { "epoch": 0.448125, "grad_norm": 2.9104745388031006, "grad_norm_var": 0.20453193538451514, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.2891159057617188, "loss/hidden": 1.3125, "loss/logits": 0.25816503167152405, "loss/reg": 0.0011433002073317766, "step": 3585 }, { "epoch": 0.44825, "grad_norm": 2.522008180618286, "grad_norm_var": 0.2076897171752627, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.617436408996582, "loss/hidden": 0.93359375, "loss/logits": 0.14566650986671448, "loss/reg": 0.0011428332654759288, "step": 3586 }, { "epoch": 0.448375, "grad_norm": 2.856386423110962, "grad_norm_var": 0.15951650264687084, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.1711437702178955, "loss/hidden": 0.9765625, "loss/logits": 0.16665247082710266, "loss/reg": 0.0011423578253015876, "step": 3587 }, { "epoch": 0.4485, "grad_norm": 3.5055770874023438, "grad_norm_var": 0.1886753857163806, "learning_rate": 0.0001, "loss": 1.5057, "loss/crossentropy": 2.000603199005127, "loss/hidden": 1.296875, "loss/logits": 0.1973869800567627, "loss/reg": 0.0011418744688853621, "step": 3588 }, { "epoch": 0.448625, "grad_norm": 2.957284927368164, "grad_norm_var": 0.17669260780747867, "learning_rate": 0.0001, "loss": 1.2729, "loss/crossentropy": 2.0113439559936523, "loss/hidden": 1.1015625, "loss/logits": 0.15991082787513733, "loss/reg": 0.0011412588646635413, "step": 3589 }, { "epoch": 0.44875, "grad_norm": 2.873589038848877, "grad_norm_var": 0.17636234018586947, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.5013785362243652, "loss/hidden": 1.0703125, "loss/logits": 0.14876660704612732, "loss/reg": 0.00114063685759902, "step": 3590 }, { "epoch": 0.448875, "grad_norm": 3.182739734649658, "grad_norm_var": 0.12209713523720302, "learning_rate": 0.0001, "loss": 1.31, "loss/crossentropy": 2.872318744659424, "loss/hidden": 1.1171875, "loss/logits": 0.1813884675502777, "loss/reg": 0.001139985746704042, "step": 3591 }, { "epoch": 0.449, "grad_norm": 13.226044654846191, "grad_norm_var": 6.912838620852551, "learning_rate": 0.0001, "loss": 1.965, "loss/crossentropy": 2.593048572540283, "loss/hidden": 1.6953125, "loss/logits": 0.258280873298645, "loss/reg": 0.001139368861913681, "step": 3592 }, { "epoch": 0.449125, "grad_norm": 2.5739026069641113, "grad_norm_var": 6.934941343606555, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.536034107208252, "loss/hidden": 1.046875, "loss/logits": 0.18367289006710052, "loss/reg": 0.0011387302074581385, "step": 3593 }, { "epoch": 0.44925, "grad_norm": 2.6681830883026123, "grad_norm_var": 6.9341279996042955, "learning_rate": 0.0001, "loss": 1.2422, "loss/crossentropy": 2.8786990642547607, "loss/hidden": 1.0546875, "loss/logits": 0.17608636617660522, "loss/reg": 0.0011380617506802082, "step": 3594 }, { "epoch": 0.449375, "grad_norm": 3.519986629486084, "grad_norm_var": 6.868038213284778, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.307696580886841, "loss/hidden": 1.0390625, "loss/logits": 0.154233917593956, "loss/reg": 0.001137429615482688, "step": 3595 }, { "epoch": 0.4495, "grad_norm": 2.445401668548584, "grad_norm_var": 6.93759229538064, "learning_rate": 0.0001, "loss": 1.2668, "loss/crossentropy": 2.429853677749634, "loss/hidden": 1.0546875, "loss/logits": 0.20070236921310425, "loss/reg": 0.0011367432307451963, "step": 3596 }, { "epoch": 0.449625, "grad_norm": 2.532623052597046, "grad_norm_var": 6.918557635573557, "learning_rate": 0.0001, "loss": 0.8913, "loss/crossentropy": 2.603067636489868, "loss/hidden": 0.77734375, "loss/logits": 0.10264308750629425, "loss/reg": 0.0011360220378264785, "step": 3597 }, { "epoch": 0.44975, "grad_norm": 2.6163806915283203, "grad_norm_var": 6.860662961771296, "learning_rate": 0.0001, "loss": 1.2793, "loss/crossentropy": 2.305840253829956, "loss/hidden": 1.0625, "loss/logits": 0.20541991293430328, "loss/reg": 0.0011353256413713098, "step": 3598 }, { "epoch": 0.449875, "grad_norm": 2.729661464691162, "grad_norm_var": 6.859572576720755, "learning_rate": 0.0001, "loss": 1.4499, "loss/crossentropy": 2.5780832767486572, "loss/hidden": 1.203125, "loss/logits": 0.23546919226646423, "loss/reg": 0.0011346812825649977, "step": 3599 }, { "epoch": 0.45, "grad_norm": 2.3666024208068848, "grad_norm_var": 6.8894743042088535, "learning_rate": 0.0001, "loss": 1.0093, "loss/crossentropy": 2.6124517917633057, "loss/hidden": 0.8671875, "loss/logits": 0.1308036744594574, "loss/reg": 0.0011340826749801636, "step": 3600 }, { "epoch": 0.450125, "grad_norm": 2.138046979904175, "grad_norm_var": 6.984176885701056, "learning_rate": 0.0001, "loss": 1.0434, "loss/crossentropy": 2.537350654602051, "loss/hidden": 0.890625, "loss/logits": 0.14144738018512726, "loss/reg": 0.0011334845330566168, "step": 3601 }, { "epoch": 0.45025, "grad_norm": 2.0474178791046143, "grad_norm_var": 7.055055820163738, "learning_rate": 0.0001, "loss": 1.0695, "loss/crossentropy": 2.702651262283325, "loss/hidden": 0.921875, "loss/logits": 0.13630911707878113, "loss/reg": 0.0011329322587698698, "step": 3602 }, { "epoch": 0.450375, "grad_norm": 3.4538140296936035, "grad_norm_var": 7.034858094535054, "learning_rate": 0.0001, "loss": 1.3766, "loss/crossentropy": 2.5592472553253174, "loss/hidden": 1.140625, "loss/logits": 0.22460104525089264, "loss/reg": 0.0011324151419103146, "step": 3603 }, { "epoch": 0.4505, "grad_norm": 2.2115414142608643, "grad_norm_var": 7.1260152524609754, "learning_rate": 0.0001, "loss": 1.1367, "loss/crossentropy": 2.415616512298584, "loss/hidden": 0.96484375, "loss/logits": 0.16048820316791534, "loss/reg": 0.001131935161538422, "step": 3604 }, { "epoch": 0.450625, "grad_norm": 11.633960723876953, "grad_norm_var": 11.381086493637765, "learning_rate": 0.0001, "loss": 1.3079, "loss/crossentropy": 2.8634023666381836, "loss/hidden": 1.1171875, "loss/logits": 0.17942410707473755, "loss/reg": 0.0011314753210172057, "step": 3605 }, { "epoch": 0.45075, "grad_norm": 2.610074043273926, "grad_norm_var": 11.421094293143513, "learning_rate": 0.0001, "loss": 1.0561, "loss/crossentropy": 2.4831647872924805, "loss/hidden": 0.89453125, "loss/logits": 0.15028788149356842, "loss/reg": 0.0011308566899970174, "step": 3606 }, { "epoch": 0.450875, "grad_norm": 3.169635534286499, "grad_norm_var": 11.422309798018507, "learning_rate": 0.0001, "loss": 1.1629, "loss/crossentropy": 2.6023714542388916, "loss/hidden": 1.0, "loss/logits": 0.15155716240406036, "loss/reg": 0.001130243414081633, "step": 3607 }, { "epoch": 0.451, "grad_norm": 3.2706120014190674, "grad_norm_var": 5.199526188762797, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 2.5506865978240967, "loss/hidden": 1.203125, "loss/logits": 0.21303662657737732, "loss/reg": 0.0011297364253550768, "step": 3608 }, { "epoch": 0.451125, "grad_norm": 2.7580971717834473, "grad_norm_var": 5.185060862666481, "learning_rate": 0.0001, "loss": 1.1468, "loss/crossentropy": 2.631394147872925, "loss/hidden": 0.97265625, "loss/logits": 0.16284237802028656, "loss/reg": 0.0011292281560599804, "step": 3609 }, { "epoch": 0.45125, "grad_norm": 2.4963018894195557, "grad_norm_var": 5.2004875130504065, "learning_rate": 0.0001, "loss": 1.3285, "loss/crossentropy": 2.6183509826660156, "loss/hidden": 1.1171875, "loss/logits": 0.19999514520168304, "loss/reg": 0.0011287262896075845, "step": 3610 }, { "epoch": 0.451375, "grad_norm": 2.8753323554992676, "grad_norm_var": 5.203255650862703, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.478327512741089, "loss/hidden": 1.1796875, "loss/logits": 0.23371222615242004, "loss/reg": 0.0011281180195510387, "step": 3611 }, { "epoch": 0.4515, "grad_norm": 3.026810646057129, "grad_norm_var": 5.1651321346766315, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.6245927810668945, "loss/hidden": 1.1328125, "loss/logits": 0.1611611545085907, "loss/reg": 0.0011275901924818754, "step": 3612 }, { "epoch": 0.451625, "grad_norm": 3.0498433113098145, "grad_norm_var": 5.132651601029396, "learning_rate": 0.0001, "loss": 0.9547, "loss/crossentropy": 2.6303586959838867, "loss/hidden": 0.828125, "loss/logits": 0.11533454060554504, "loss/reg": 0.0011269885580986738, "step": 3613 }, { "epoch": 0.45175, "grad_norm": 3.2288856506347656, "grad_norm_var": 5.102035263494234, "learning_rate": 0.0001, "loss": 1.257, "loss/crossentropy": 2.257422924041748, "loss/hidden": 1.0859375, "loss/logits": 0.15975698828697205, "loss/reg": 0.0011263993801549077, "step": 3614 }, { "epoch": 0.451875, "grad_norm": 3.464022159576416, "grad_norm_var": 5.0782643207037905, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.461991310119629, "loss/hidden": 1.09375, "loss/logits": 0.22621627151966095, "loss/reg": 0.0011257915757596493, "step": 3615 }, { "epoch": 0.452, "grad_norm": 3.0488712787628174, "grad_norm_var": 5.016755832760903, "learning_rate": 0.0001, "loss": 1.3651, "loss/crossentropy": 2.354006290435791, "loss/hidden": 1.1875, "loss/logits": 0.16638943552970886, "loss/reg": 0.001125356531701982, "step": 3616 }, { "epoch": 0.452125, "grad_norm": 3.293280839920044, "grad_norm_var": 4.904984439932197, "learning_rate": 0.0001, "loss": 1.2398, "loss/crossentropy": 2.5223793983459473, "loss/hidden": 1.0390625, "loss/logits": 0.1895003318786621, "loss/reg": 0.001124921371228993, "step": 3617 }, { "epoch": 0.45225, "grad_norm": 2.3650379180908203, "grad_norm_var": 4.850730531431311, "learning_rate": 0.0001, "loss": 1.2088, "loss/crossentropy": 2.7652547359466553, "loss/hidden": 1.015625, "loss/logits": 0.18195992708206177, "loss/reg": 0.0011243068147450686, "step": 3618 }, { "epoch": 0.452375, "grad_norm": 2.5314254760742188, "grad_norm_var": 4.909248480410061, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.4079129695892334, "loss/hidden": 0.94921875, "loss/logits": 0.1466173231601715, "loss/reg": 0.0011237490689381957, "step": 3619 }, { "epoch": 0.4525, "grad_norm": 3.2177786827087402, "grad_norm_var": 4.807767017860247, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.2269489765167236, "loss/hidden": 1.03125, "loss/logits": 0.1706451177597046, "loss/reg": 0.0011233208933845162, "step": 3620 }, { "epoch": 0.452625, "grad_norm": 2.45078444480896, "grad_norm_var": 0.12208347740382805, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.2499990463256836, "loss/hidden": 0.921875, "loss/logits": 0.16647830605506897, "loss/reg": 0.0011227369541302323, "step": 3621 }, { "epoch": 0.45275, "grad_norm": 3.081821918487549, "grad_norm_var": 0.11596058449395723, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.3651978969573975, "loss/hidden": 0.90234375, "loss/logits": 0.1394936889410019, "loss/reg": 0.001122286543250084, "step": 3622 }, { "epoch": 0.452875, "grad_norm": 3.0705947875976562, "grad_norm_var": 0.11377935923128628, "learning_rate": 0.0001, "loss": 1.4857, "loss/crossentropy": 2.7812201976776123, "loss/hidden": 1.25, "loss/logits": 0.22444243729114532, "loss/reg": 0.0011217257706448436, "step": 3623 }, { "epoch": 0.453, "grad_norm": 4.485832691192627, "grad_norm_var": 0.2577267732279627, "learning_rate": 0.0001, "loss": 1.6391, "loss/crossentropy": 2.8572638034820557, "loss/hidden": 1.390625, "loss/logits": 0.23728522658348083, "loss/reg": 0.0011212818790227175, "step": 3624 }, { "epoch": 0.453125, "grad_norm": 3.350933790206909, "grad_norm_var": 0.2583744875701311, "learning_rate": 0.0001, "loss": 1.1278, "loss/crossentropy": 2.8117835521698, "loss/hidden": 0.97265625, "loss/logits": 0.14398273825645447, "loss/reg": 0.0011207489296793938, "step": 3625 }, { "epoch": 0.45325, "grad_norm": 2.69838809967041, "grad_norm_var": 0.24560755477418372, "learning_rate": 0.0001, "loss": 1.3424, "loss/crossentropy": 2.6321933269500732, "loss/hidden": 1.1484375, "loss/logits": 0.18280965089797974, "loss/reg": 0.0011201531160622835, "step": 3626 }, { "epoch": 0.453375, "grad_norm": 3.2805733680725098, "grad_norm_var": 0.2449489747480091, "learning_rate": 0.0001, "loss": 1.2311, "loss/crossentropy": 2.4263882637023926, "loss/hidden": 1.046875, "loss/logits": 0.17304658889770508, "loss/reg": 0.0011196060804650187, "step": 3627 }, { "epoch": 0.4535, "grad_norm": 2.980419397354126, "grad_norm_var": 0.24555354901583518, "learning_rate": 0.0001, "loss": 1.2535, "loss/crossentropy": 2.6335508823394775, "loss/hidden": 1.0625, "loss/logits": 0.17980441451072693, "loss/reg": 0.001119154621846974, "step": 3628 }, { "epoch": 0.453625, "grad_norm": 3.0101053714752197, "grad_norm_var": 0.2459174940316416, "learning_rate": 0.0001, "loss": 1.1762, "loss/crossentropy": 2.4689512252807617, "loss/hidden": 0.984375, "loss/logits": 0.18061095476150513, "loss/reg": 0.0011187418131157756, "step": 3629 }, { "epoch": 0.45375, "grad_norm": 2.650803804397583, "grad_norm_var": 0.25667077652347886, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.517517328262329, "loss/hidden": 0.94921875, "loss/logits": 0.14346536993980408, "loss/reg": 0.0011181577574461699, "step": 3630 }, { "epoch": 0.453875, "grad_norm": 2.732057571411133, "grad_norm_var": 0.25085198673878584, "learning_rate": 0.0001, "loss": 1.3755, "loss/crossentropy": 2.1822640895843506, "loss/hidden": 1.1328125, "loss/logits": 0.23148787021636963, "loss/reg": 0.0011175540275871754, "step": 3631 }, { "epoch": 0.454, "grad_norm": 3.2787492275238037, "grad_norm_var": 0.2551762124724232, "learning_rate": 0.0001, "loss": 1.2386, "loss/crossentropy": 2.7067642211914062, "loss/hidden": 1.0625, "loss/logits": 0.16496938467025757, "loss/reg": 0.0011169499484822154, "step": 3632 }, { "epoch": 0.454125, "grad_norm": 2.722620964050293, "grad_norm_var": 0.25549027998236323, "learning_rate": 0.0001, "loss": 1.351, "loss/crossentropy": 2.3964638710021973, "loss/hidden": 1.1328125, "loss/logits": 0.2070024311542511, "loss/reg": 0.0011163420276716352, "step": 3633 }, { "epoch": 0.45425, "grad_norm": 2.8469223976135254, "grad_norm_var": 0.2295761902468172, "learning_rate": 0.0001, "loss": 1.2938, "loss/crossentropy": 2.6107420921325684, "loss/hidden": 1.0859375, "loss/logits": 0.1966606229543686, "loss/reg": 0.001115738763473928, "step": 3634 }, { "epoch": 0.454375, "grad_norm": 2.3232243061065674, "grad_norm_var": 0.24596945268289663, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.622743844985962, "loss/hidden": 1.0, "loss/logits": 0.1875203251838684, "loss/reg": 0.0011151116341352463, "step": 3635 }, { "epoch": 0.4545, "grad_norm": 2.3462815284729004, "grad_norm_var": 0.2694518022929664, "learning_rate": 0.0001, "loss": 1.1892, "loss/crossentropy": 2.544023036956787, "loss/hidden": 1.015625, "loss/logits": 0.16246385872364044, "loss/reg": 0.0011145230382680893, "step": 3636 }, { "epoch": 0.454625, "grad_norm": 2.437540292739868, "grad_norm_var": 0.2703564765142268, "learning_rate": 0.0001, "loss": 1.1284, "loss/crossentropy": 2.7328503131866455, "loss/hidden": 0.96484375, "loss/logits": 0.15244735777378082, "loss/reg": 0.0011139034759253263, "step": 3637 }, { "epoch": 0.45475, "grad_norm": 8.919418334960938, "grad_norm_var": 2.4980929332987136, "learning_rate": 0.0001, "loss": 1.4368, "loss/crossentropy": 2.767719268798828, "loss/hidden": 1.2265625, "loss/logits": 0.19912247359752655, "loss/reg": 0.001113260630518198, "step": 3638 }, { "epoch": 0.454875, "grad_norm": 2.7015810012817383, "grad_norm_var": 2.518919311010584, "learning_rate": 0.0001, "loss": 1.1719, "loss/crossentropy": 2.4625489711761475, "loss/hidden": 1.0, "loss/logits": 0.16078132390975952, "loss/reg": 0.0011125723831355572, "step": 3639 }, { "epoch": 0.455, "grad_norm": 6.38781213760376, "grad_norm_var": 3.0462861727188146, "learning_rate": 0.0001, "loss": 1.5665, "loss/crossentropy": 2.579721689224243, "loss/hidden": 1.375, "loss/logits": 0.18035313487052917, "loss/reg": 0.0011118864640593529, "step": 3640 }, { "epoch": 0.455125, "grad_norm": 2.3712635040283203, "grad_norm_var": 3.1148632404853513, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.45876145362854, "loss/hidden": 0.8984375, "loss/logits": 0.12965673208236694, "loss/reg": 0.0011111798230558634, "step": 3641 }, { "epoch": 0.45525, "grad_norm": 2.917454719543457, "grad_norm_var": 3.098669558344962, "learning_rate": 0.0001, "loss": 1.2989, "loss/crossentropy": 3.0488340854644775, "loss/hidden": 1.0859375, "loss/logits": 0.20187309384346008, "loss/reg": 0.0011105705052614212, "step": 3642 }, { "epoch": 0.455375, "grad_norm": 2.584381341934204, "grad_norm_var": 3.1371869288962326, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.5257370471954346, "loss/hidden": 1.0390625, "loss/logits": 0.17389634251594543, "loss/reg": 0.0011099318508058786, "step": 3643 }, { "epoch": 0.4555, "grad_norm": 2.999863386154175, "grad_norm_var": 3.1363154986367854, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.502323865890503, "loss/hidden": 1.0703125, "loss/logits": 0.1711767166852951, "loss/reg": 0.001109326141886413, "step": 3644 }, { "epoch": 0.455625, "grad_norm": 2.749826431274414, "grad_norm_var": 3.151542870762167, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.3708789348602295, "loss/hidden": 1.0546875, "loss/logits": 0.177077978849411, "loss/reg": 0.0011087232269346714, "step": 3645 }, { "epoch": 0.45575, "grad_norm": 2.236565351486206, "grad_norm_var": 3.198709885301513, "learning_rate": 0.0001, "loss": 1.2166, "loss/crossentropy": 2.3554396629333496, "loss/hidden": 1.03125, "loss/logits": 0.17426779866218567, "loss/reg": 0.001108046737499535, "step": 3646 }, { "epoch": 0.455875, "grad_norm": 3.0919601917266846, "grad_norm_var": 3.1802847555277727, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.752289056777954, "loss/hidden": 1.0390625, "loss/logits": 0.18256229162216187, "loss/reg": 0.0011073796777054667, "step": 3647 }, { "epoch": 0.456, "grad_norm": 3.2478809356689453, "grad_norm_var": 3.180461473840118, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.572361469268799, "loss/hidden": 1.140625, "loss/logits": 0.21212169528007507, "loss/reg": 0.0011066870065405965, "step": 3648 }, { "epoch": 0.456125, "grad_norm": 3.6876742839813232, "grad_norm_var": 3.1636955904469057, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.8398966789245605, "loss/hidden": 1.2734375, "loss/logits": 0.2705059051513672, "loss/reg": 0.0011060771066695452, "step": 3649 }, { "epoch": 0.45625, "grad_norm": 3.2252860069274902, "grad_norm_var": 3.1464763738482966, "learning_rate": 0.0001, "loss": 1.2316, "loss/crossentropy": 2.5448129177093506, "loss/hidden": 1.046875, "loss/logits": 0.17372067272663116, "loss/reg": 0.0011053438065573573, "step": 3650 }, { "epoch": 0.456375, "grad_norm": 3.08785080909729, "grad_norm_var": 3.074335608654623, "learning_rate": 0.0001, "loss": 1.1808, "loss/crossentropy": 2.2132527828216553, "loss/hidden": 1.015625, "loss/logits": 0.15412601828575134, "loss/reg": 0.001104602008126676, "step": 3651 }, { "epoch": 0.4565, "grad_norm": 33.436424255371094, "grad_norm_var": 58.965068257735055, "learning_rate": 0.0001, "loss": 1.1558, "loss/crossentropy": 2.4538676738739014, "loss/hidden": 0.98828125, "loss/logits": 0.15650439262390137, "loss/reg": 0.0011040032841265202, "step": 3652 }, { "epoch": 0.456625, "grad_norm": 2.9397642612457275, "grad_norm_var": 58.78378441203776, "learning_rate": 0.0001, "loss": 1.2445, "loss/crossentropy": 2.649888515472412, "loss/hidden": 1.0390625, "loss/logits": 0.19440674781799316, "loss/reg": 0.0011034229537472129, "step": 3653 }, { "epoch": 0.45675, "grad_norm": 2.056731700897217, "grad_norm_var": 58.51753866909271, "learning_rate": 0.0001, "loss": 1.1151, "loss/crossentropy": 2.437039852142334, "loss/hidden": 0.94140625, "loss/logits": 0.16268345713615417, "loss/reg": 0.0011027493746951222, "step": 3654 }, { "epoch": 0.456875, "grad_norm": 2.7032809257507324, "grad_norm_var": 58.517021831496244, "learning_rate": 0.0001, "loss": 1.1606, "loss/crossentropy": 2.2214465141296387, "loss/hidden": 1.0, "loss/logits": 0.14953437447547913, "loss/reg": 0.001102171023376286, "step": 3655 }, { "epoch": 0.457, "grad_norm": 2.744002342224121, "grad_norm_var": 58.66421958620806, "learning_rate": 0.0001, "loss": 1.1801, "loss/crossentropy": 2.5072290897369385, "loss/hidden": 0.99609375, "loss/logits": 0.17298246920108795, "loss/reg": 0.0011015147902071476, "step": 3656 }, { "epoch": 0.457125, "grad_norm": 2.658114194869995, "grad_norm_var": 58.5781915958235, "learning_rate": 0.0001, "loss": 1.2667, "loss/crossentropy": 2.6835861206054688, "loss/hidden": 1.0625, "loss/logits": 0.19321846961975098, "loss/reg": 0.0011009488953277469, "step": 3657 }, { "epoch": 0.45725, "grad_norm": 2.552165985107422, "grad_norm_var": 58.67690311737038, "learning_rate": 0.0001, "loss": 1.1982, "loss/crossentropy": 2.4873526096343994, "loss/hidden": 1.015625, "loss/logits": 0.1716001033782959, "loss/reg": 0.0011003789259120822, "step": 3658 }, { "epoch": 0.457375, "grad_norm": 2.6761128902435303, "grad_norm_var": 58.650940272795346, "learning_rate": 0.0001, "loss": 1.4296, "loss/crossentropy": 2.3641092777252197, "loss/hidden": 1.1875, "loss/logits": 0.2311011552810669, "loss/reg": 0.0010997228091582656, "step": 3659 }, { "epoch": 0.4575, "grad_norm": 2.313652515411377, "grad_norm_var": 58.84103367227098, "learning_rate": 0.0001, "loss": 1.1284, "loss/crossentropy": 2.6461470127105713, "loss/hidden": 0.953125, "loss/logits": 0.16426518559455872, "loss/reg": 0.0010991219896823168, "step": 3660 }, { "epoch": 0.457625, "grad_norm": 3.226473569869995, "grad_norm_var": 58.73047053590162, "learning_rate": 0.0001, "loss": 0.9704, "loss/crossentropy": 2.71610689163208, "loss/hidden": 0.83984375, "loss/logits": 0.11957935243844986, "loss/reg": 0.0010984810069203377, "step": 3661 }, { "epoch": 0.45775, "grad_norm": 2.674936056137085, "grad_norm_var": 58.59599625157168, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.3932313919067383, "loss/hidden": 0.9453125, "loss/logits": 0.1535184681415558, "loss/reg": 0.0010977821657434106, "step": 3662 }, { "epoch": 0.457875, "grad_norm": 2.4683806896209717, "grad_norm_var": 58.75983030291769, "learning_rate": 0.0001, "loss": 1.166, "loss/crossentropy": 2.3272764682769775, "loss/hidden": 0.9765625, "loss/logits": 0.178432434797287, "loss/reg": 0.0010970846051350236, "step": 3663 }, { "epoch": 0.458, "grad_norm": 2.7754459381103516, "grad_norm_var": 58.86721437996781, "learning_rate": 0.0001, "loss": 1.1847, "loss/crossentropy": 2.4661128520965576, "loss/hidden": 1.0078125, "loss/logits": 0.16594992578029633, "loss/reg": 0.0010963540989905596, "step": 3664 }, { "epoch": 0.458125, "grad_norm": 2.5619869232177734, "grad_norm_var": 59.098600946846524, "learning_rate": 0.0001, "loss": 1.5193, "loss/crossentropy": 2.084529161453247, "loss/hidden": 1.2578125, "loss/logits": 0.25057342648506165, "loss/reg": 0.0010956121841445565, "step": 3665 }, { "epoch": 0.45825, "grad_norm": 2.458009958267212, "grad_norm_var": 59.2792343742321, "learning_rate": 0.0001, "loss": 1.1, "loss/crossentropy": 2.4117701053619385, "loss/hidden": 0.9453125, "loss/logits": 0.14374569058418274, "loss/reg": 0.0010950363939628005, "step": 3666 }, { "epoch": 0.458375, "grad_norm": 2.7333226203918457, "grad_norm_var": 59.35778210782066, "learning_rate": 0.0001, "loss": 1.4746, "loss/crossentropy": 2.422085762023926, "loss/hidden": 1.2421875, "loss/logits": 0.22142797708511353, "loss/reg": 0.00109446095302701, "step": 3667 }, { "epoch": 0.4585, "grad_norm": 3.261953353881836, "grad_norm_var": 0.09123591748126667, "learning_rate": 0.0001, "loss": 1.1139, "loss/crossentropy": 2.3480167388916016, "loss/hidden": 0.9453125, "loss/logits": 0.1576104760169983, "loss/reg": 0.0010937739862129092, "step": 3668 }, { "epoch": 0.458625, "grad_norm": 2.803299903869629, "grad_norm_var": 0.08758730228382015, "learning_rate": 0.0001, "loss": 1.2582, "loss/crossentropy": 2.5081117153167725, "loss/hidden": 1.0546875, "loss/logits": 0.19254551827907562, "loss/reg": 0.0010932012228295207, "step": 3669 }, { "epoch": 0.45875, "grad_norm": 4.625733375549316, "grad_norm_var": 0.2911239572978341, "learning_rate": 0.0001, "loss": 1.5243, "loss/crossentropy": 2.6232597827911377, "loss/hidden": 1.25, "loss/logits": 0.263406902551651, "loss/reg": 0.0010925846872851253, "step": 3670 }, { "epoch": 0.458875, "grad_norm": 2.8602371215820312, "grad_norm_var": 0.29006815879736225, "learning_rate": 0.0001, "loss": 1.4661, "loss/crossentropy": 2.5596141815185547, "loss/hidden": 1.234375, "loss/logits": 0.22079582512378693, "loss/reg": 0.0010919829364866018, "step": 3671 }, { "epoch": 0.459, "grad_norm": 2.3989248275756836, "grad_norm_var": 0.301794672972945, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.475048303604126, "loss/hidden": 1.0703125, "loss/logits": 0.1875576376914978, "loss/reg": 0.0010913077276200056, "step": 3672 }, { "epoch": 0.459125, "grad_norm": 2.9176266193389893, "grad_norm_var": 0.3005564102564743, "learning_rate": 0.0001, "loss": 1.1655, "loss/crossentropy": 2.511626720428467, "loss/hidden": 0.9765625, "loss/logits": 0.1780145764350891, "loss/reg": 0.0010906413663178682, "step": 3673 }, { "epoch": 0.45925, "grad_norm": 3.753809690475464, "grad_norm_var": 0.34600579163123724, "learning_rate": 0.0001, "loss": 1.4817, "loss/crossentropy": 2.221444606781006, "loss/hidden": 1.25, "loss/logits": 0.22078345715999603, "loss/reg": 0.0010900585912168026, "step": 3674 }, { "epoch": 0.459375, "grad_norm": 2.2964346408843994, "grad_norm_var": 0.3666972648808949, "learning_rate": 0.0001, "loss": 1.0599, "loss/crossentropy": 2.5047900676727295, "loss/hidden": 0.90234375, "loss/logits": 0.14670003950595856, "loss/reg": 0.0010894747683778405, "step": 3675 }, { "epoch": 0.4595, "grad_norm": 2.69233775138855, "grad_norm_var": 0.34690575978238486, "learning_rate": 0.0001, "loss": 1.1892, "loss/crossentropy": 2.946650266647339, "loss/hidden": 1.0234375, "loss/logits": 0.15489830076694489, "loss/reg": 0.0010889058467000723, "step": 3676 }, { "epoch": 0.459625, "grad_norm": 2.629589796066284, "grad_norm_var": 0.3437321497761786, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.2960586547851562, "loss/hidden": 1.1171875, "loss/logits": 0.17683415114879608, "loss/reg": 0.0010883377399295568, "step": 3677 }, { "epoch": 0.45975, "grad_norm": 3.9362752437591553, "grad_norm_var": 0.4104463927065808, "learning_rate": 0.0001, "loss": 1.5652, "loss/crossentropy": 2.611161231994629, "loss/hidden": 1.328125, "loss/logits": 0.22619250416755676, "loss/reg": 0.0010877492604777217, "step": 3678 }, { "epoch": 0.459875, "grad_norm": 2.1377973556518555, "grad_norm_var": 0.43843206926681333, "learning_rate": 0.0001, "loss": 1.1726, "loss/crossentropy": 2.459545373916626, "loss/hidden": 0.98828125, "loss/logits": 0.1734371781349182, "loss/reg": 0.0010871333070099354, "step": 3679 }, { "epoch": 0.46, "grad_norm": 2.3406503200531006, "grad_norm_var": 0.4590726044015507, "learning_rate": 0.0001, "loss": 1.2897, "loss/crossentropy": 2.5806517601013184, "loss/hidden": 1.0703125, "loss/logits": 0.2085016667842865, "loss/reg": 0.0010865204967558384, "step": 3680 }, { "epoch": 0.460125, "grad_norm": 2.460411310195923, "grad_norm_var": 0.4643020689432279, "learning_rate": 0.0001, "loss": 1.0726, "loss/crossentropy": 2.4174599647521973, "loss/hidden": 0.91796875, "loss/logits": 0.1437738835811615, "loss/reg": 0.0010859040776267648, "step": 3681 }, { "epoch": 0.46025, "grad_norm": 2.243666887283325, "grad_norm_var": 0.47963800771299414, "learning_rate": 0.0001, "loss": 1.1133, "loss/crossentropy": 2.6184909343719482, "loss/hidden": 0.953125, "loss/logits": 0.1493586301803589, "loss/reg": 0.001085252151824534, "step": 3682 }, { "epoch": 0.460375, "grad_norm": 2.2660858631134033, "grad_norm_var": 0.5024671355996891, "learning_rate": 0.0001, "loss": 1.0891, "loss/crossentropy": 2.59515643119812, "loss/hidden": 0.921875, "loss/logits": 0.15641248226165771, "loss/reg": 0.001084585441276431, "step": 3683 }, { "epoch": 0.4605, "grad_norm": 2.380805015563965, "grad_norm_var": 0.5027769542279846, "learning_rate": 0.0001, "loss": 1.1207, "loss/crossentropy": 2.8866631984710693, "loss/hidden": 0.94921875, "loss/logits": 0.16059257090091705, "loss/reg": 0.00108393095433712, "step": 3684 }, { "epoch": 0.460625, "grad_norm": 2.254760265350342, "grad_norm_var": 0.5210841654778496, "learning_rate": 0.0001, "loss": 1.1427, "loss/crossentropy": 2.474443197250366, "loss/hidden": 0.9609375, "loss/logits": 0.17095285654067993, "loss/reg": 0.001083341776393354, "step": 3685 }, { "epoch": 0.46075, "grad_norm": 2.505544900894165, "grad_norm_var": 0.2752275628813301, "learning_rate": 0.0001, "loss": 1.0225, "loss/crossentropy": 2.7944929599761963, "loss/hidden": 0.875, "loss/logits": 0.13668867945671082, "loss/reg": 0.001082766568288207, "step": 3686 }, { "epoch": 0.460875, "grad_norm": 2.446007013320923, "grad_norm_var": 0.2732181653359135, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.7775626182556152, "loss/hidden": 0.9296875, "loss/logits": 0.13796471059322357, "loss/reg": 0.0010822261683642864, "step": 3687 }, { "epoch": 0.461, "grad_norm": 2.2367258071899414, "grad_norm_var": 0.2792930902827682, "learning_rate": 0.0001, "loss": 1.0042, "loss/crossentropy": 2.4872052669525146, "loss/hidden": 0.86328125, "loss/logits": 0.13013723492622375, "loss/reg": 0.001081735477782786, "step": 3688 }, { "epoch": 0.461125, "grad_norm": 2.698093891143799, "grad_norm_var": 0.27282235951992084, "learning_rate": 0.0001, "loss": 1.175, "loss/crossentropy": 2.4909794330596924, "loss/hidden": 0.99609375, "loss/logits": 0.16807909309864044, "loss/reg": 0.001081244321539998, "step": 3689 }, { "epoch": 0.46125, "grad_norm": 3.579244375228882, "grad_norm_var": 0.24740460305596887, "learning_rate": 0.0001, "loss": 1.6392, "loss/crossentropy": 2.7734837532043457, "loss/hidden": 1.3828125, "loss/logits": 0.24555744230747223, "loss/reg": 0.0010807537473738194, "step": 3690 }, { "epoch": 0.461375, "grad_norm": 4.243533134460449, "grad_norm_var": 0.4135856061566377, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.377235174179077, "loss/hidden": 1.1796875, "loss/logits": 0.20302686095237732, "loss/reg": 0.0010803104378283024, "step": 3691 }, { "epoch": 0.4615, "grad_norm": 2.6744658946990967, "grad_norm_var": 0.4136017152277494, "learning_rate": 0.0001, "loss": 1.1873, "loss/crossentropy": 2.396928310394287, "loss/hidden": 1.015625, "loss/logits": 0.16087684035301208, "loss/reg": 0.0010798652656376362, "step": 3692 }, { "epoch": 0.461625, "grad_norm": 2.168912887573242, "grad_norm_var": 0.4305519272395995, "learning_rate": 0.0001, "loss": 1.0219, "loss/crossentropy": 2.376469612121582, "loss/hidden": 0.87109375, "loss/logits": 0.14000368118286133, "loss/reg": 0.0010792772518470883, "step": 3693 }, { "epoch": 0.46175, "grad_norm": 2.9985311031341553, "grad_norm_var": 0.3260376648824355, "learning_rate": 0.0001, "loss": 1.2904, "loss/crossentropy": 2.275723457336426, "loss/hidden": 1.0859375, "loss/logits": 0.19368857145309448, "loss/reg": 0.001078745466656983, "step": 3694 }, { "epoch": 0.461875, "grad_norm": 2.672093152999878, "grad_norm_var": 0.3107957202708386, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.2827932834625244, "loss/hidden": 0.9921875, "loss/logits": 0.15088967978954315, "loss/reg": 0.001078160246834159, "step": 3695 }, { "epoch": 0.462, "grad_norm": 12.926865577697754, "grad_norm_var": 6.898728720439764, "learning_rate": 0.0001, "loss": 2.2934, "loss/crossentropy": 3.416651725769043, "loss/hidden": 1.796875, "loss/logits": 0.4857579171657562, "loss/reg": 0.0010776674607768655, "step": 3696 }, { "epoch": 0.462125, "grad_norm": 9.152348518371582, "grad_norm_var": 8.950943037634408, "learning_rate": 0.0001, "loss": 1.3778, "loss/crossentropy": 2.0033910274505615, "loss/hidden": 1.1953125, "loss/logits": 0.17175361514091492, "loss/reg": 0.001077084569260478, "step": 3697 }, { "epoch": 0.46225, "grad_norm": 3.9435274600982666, "grad_norm_var": 8.797954739049327, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.4210004806518555, "loss/hidden": 1.140625, "loss/logits": 0.18007992208003998, "loss/reg": 0.0010765788611024618, "step": 3698 }, { "epoch": 0.462375, "grad_norm": 3.363208293914795, "grad_norm_var": 8.645621549421772, "learning_rate": 0.0001, "loss": 1.3953, "loss/crossentropy": 2.6891672611236572, "loss/hidden": 1.140625, "loss/logits": 0.24389374256134033, "loss/reg": 0.0010760303121060133, "step": 3699 }, { "epoch": 0.4625, "grad_norm": 3.17488956451416, "grad_norm_var": 8.52521085035121, "learning_rate": 0.0001, "loss": 1.3374, "loss/crossentropy": 2.550089120864868, "loss/hidden": 1.125, "loss/logits": 0.20162931084632874, "loss/reg": 0.001075449399650097, "step": 3700 }, { "epoch": 0.462625, "grad_norm": 2.9204883575439453, "grad_norm_var": 8.403329201978751, "learning_rate": 0.0001, "loss": 1.3874, "loss/crossentropy": 2.557389259338379, "loss/hidden": 1.171875, "loss/logits": 0.20474404096603394, "loss/reg": 0.001074858708307147, "step": 3701 }, { "epoch": 0.46275, "grad_norm": 2.8108930587768555, "grad_norm_var": 8.34906463493503, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.494159698486328, "loss/hidden": 1.03125, "loss/logits": 0.17934384942054749, "loss/reg": 0.0010742675513029099, "step": 3702 }, { "epoch": 0.462875, "grad_norm": 2.300452470779419, "grad_norm_var": 8.380559453870614, "learning_rate": 0.0001, "loss": 1.2418, "loss/crossentropy": 2.407719612121582, "loss/hidden": 1.03125, "loss/logits": 0.19977647066116333, "loss/reg": 0.001073711202479899, "step": 3703 }, { "epoch": 0.463, "grad_norm": 2.4153335094451904, "grad_norm_var": 8.340764016304869, "learning_rate": 0.0001, "loss": 1.3021, "loss/crossentropy": 2.1892950534820557, "loss/hidden": 1.09375, "loss/logits": 0.19758394360542297, "loss/reg": 0.0010731123620644212, "step": 3704 }, { "epoch": 0.463125, "grad_norm": 2.779555320739746, "grad_norm_var": 8.327008969976916, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.529736042022705, "loss/hidden": 1.109375, "loss/logits": 0.2326381802558899, "loss/reg": 0.0010724973399192095, "step": 3705 }, { "epoch": 0.46325, "grad_norm": 2.4120383262634277, "grad_norm_var": 8.478847673307593, "learning_rate": 0.0001, "loss": 1.1266, "loss/crossentropy": 2.3775672912597656, "loss/hidden": 0.95703125, "loss/logits": 0.15887629985809326, "loss/reg": 0.001071861945092678, "step": 3706 }, { "epoch": 0.463375, "grad_norm": 3.5965144634246826, "grad_norm_var": 8.478379913922208, "learning_rate": 0.0001, "loss": 1.5997, "loss/crossentropy": 2.4436252117156982, "loss/hidden": 1.3203125, "loss/logits": 0.2686833441257477, "loss/reg": 0.0010712338844314218, "step": 3707 }, { "epoch": 0.4635, "grad_norm": 2.1841886043548584, "grad_norm_var": 8.573149465452317, "learning_rate": 0.0001, "loss": 1.1129, "loss/crossentropy": 2.644336223602295, "loss/hidden": 0.94921875, "loss/logits": 0.1529277265071869, "loss/reg": 0.0010705916211009026, "step": 3708 }, { "epoch": 0.463625, "grad_norm": 2.0873870849609375, "grad_norm_var": 8.591987821521936, "learning_rate": 0.0001, "loss": 1.0323, "loss/crossentropy": 2.6220035552978516, "loss/hidden": 0.8671875, "loss/logits": 0.1544329673051834, "loss/reg": 0.0010700024431571364, "step": 3709 }, { "epoch": 0.46375, "grad_norm": 3.7249808311462402, "grad_norm_var": 8.541660327195409, "learning_rate": 0.0001, "loss": 1.2367, "loss/crossentropy": 2.535261631011963, "loss/hidden": 1.0390625, "loss/logits": 0.18697765469551086, "loss/reg": 0.0010694039519876242, "step": 3710 }, { "epoch": 0.463875, "grad_norm": 2.5248610973358154, "grad_norm_var": 8.567199585520731, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.4419515132904053, "loss/hidden": 0.953125, "loss/logits": 0.17489930987358093, "loss/reg": 0.0010688200127333403, "step": 3711 }, { "epoch": 0.464, "grad_norm": 2.2635059356689453, "grad_norm_var": 2.832345299679211, "learning_rate": 0.0001, "loss": 1.2211, "loss/crossentropy": 2.411438465118408, "loss/hidden": 1.03125, "loss/logits": 0.1791185438632965, "loss/reg": 0.001068207318894565, "step": 3712 }, { "epoch": 0.464125, "grad_norm": 2.6226885318756104, "grad_norm_var": 0.3395957163593927, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.588963508605957, "loss/hidden": 1.09375, "loss/logits": 0.20012930035591125, "loss/reg": 0.001067598583176732, "step": 3713 }, { "epoch": 0.46425, "grad_norm": 3.1768360137939453, "grad_norm_var": 0.2615098498626177, "learning_rate": 0.0001, "loss": 1.2569, "loss/crossentropy": 2.28198504447937, "loss/hidden": 1.0703125, "loss/logits": 0.17596131563186646, "loss/reg": 0.001066942815668881, "step": 3714 }, { "epoch": 0.464375, "grad_norm": 2.1862196922302246, "grad_norm_var": 0.2553689439507593, "learning_rate": 0.0001, "loss": 1.0799, "loss/crossentropy": 2.560032844543457, "loss/hidden": 0.90625, "loss/logits": 0.16297385096549988, "loss/reg": 0.0010662595741450787, "step": 3715 }, { "epoch": 0.4645, "grad_norm": 2.509772539138794, "grad_norm_var": 0.24079721385979624, "learning_rate": 0.0001, "loss": 1.0761, "loss/crossentropy": 2.6149239540100098, "loss/hidden": 0.90625, "loss/logits": 0.15916873514652252, "loss/reg": 0.001065569813363254, "step": 3716 }, { "epoch": 0.464625, "grad_norm": 2.31333589553833, "grad_norm_var": 0.24252529920052718, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.5011069774627686, "loss/hidden": 0.9609375, "loss/logits": 0.1479259878396988, "loss/reg": 0.0010649975156411529, "step": 3717 }, { "epoch": 0.46475, "grad_norm": 7.927090167999268, "grad_norm_var": 2.0091994885856836, "learning_rate": 0.0001, "loss": 1.262, "loss/crossentropy": 2.4633114337921143, "loss/hidden": 1.1015625, "loss/logits": 0.14978863298892975, "loss/reg": 0.0010642620036378503, "step": 3718 }, { "epoch": 0.464875, "grad_norm": 2.5760974884033203, "grad_norm_var": 1.990478176711584, "learning_rate": 0.0001, "loss": 1.1781, "loss/crossentropy": 2.5917017459869385, "loss/hidden": 1.0078125, "loss/logits": 0.15963390469551086, "loss/reg": 0.0010636926162987947, "step": 3719 }, { "epoch": 0.465, "grad_norm": 2.5979511737823486, "grad_norm_var": 1.9793910978396516, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.7411742210388184, "loss/hidden": 0.92578125, "loss/logits": 0.1581270694732666, "loss/reg": 0.0010630934266373515, "step": 3720 }, { "epoch": 0.465125, "grad_norm": 2.0000927448272705, "grad_norm_var": 2.03691613326327, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.5383896827697754, "loss/hidden": 0.875, "loss/logits": 0.1329978108406067, "loss/reg": 0.0010624158894643188, "step": 3721 }, { "epoch": 0.46525, "grad_norm": 2.463991165161133, "grad_norm_var": 2.0335732706713747, "learning_rate": 0.0001, "loss": 1.0082, "loss/crossentropy": 2.351062059402466, "loss/hidden": 0.8671875, "loss/logits": 0.13040807843208313, "loss/reg": 0.0010616984218358994, "step": 3722 }, { "epoch": 0.465375, "grad_norm": 2.289612054824829, "grad_norm_var": 2.0228245437674537, "learning_rate": 0.0001, "loss": 1.216, "loss/crossentropy": 2.568997383117676, "loss/hidden": 1.0234375, "loss/logits": 0.18190687894821167, "loss/reg": 0.0010611186735332012, "step": 3723 }, { "epoch": 0.4655, "grad_norm": 2.659769296646118, "grad_norm_var": 1.9953409806457763, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.715094566345215, "loss/hidden": 0.9765625, "loss/logits": 0.1747661530971527, "loss/reg": 0.0010604273993521929, "step": 3724 }, { "epoch": 0.465625, "grad_norm": 3.296005964279175, "grad_norm_var": 1.9604788045868644, "learning_rate": 0.0001, "loss": 1.2162, "loss/crossentropy": 3.1480836868286133, "loss/hidden": 1.0390625, "loss/logits": 0.16650646924972534, "loss/reg": 0.0010598504450172186, "step": 3725 }, { "epoch": 0.46575, "grad_norm": 2.0886306762695312, "grad_norm_var": 1.9578298735364863, "learning_rate": 0.0001, "loss": 1.1084, "loss/crossentropy": 2.640695810317993, "loss/hidden": 0.9453125, "loss/logits": 0.15246766805648804, "loss/reg": 0.0010592687176540494, "step": 3726 }, { "epoch": 0.465875, "grad_norm": 2.641106367111206, "grad_norm_var": 1.9537352856502963, "learning_rate": 0.0001, "loss": 1.1463, "loss/crossentropy": 2.612945079803467, "loss/hidden": 0.98828125, "loss/logits": 0.14745888113975525, "loss/reg": 0.0010587131837382913, "step": 3727 }, { "epoch": 0.466, "grad_norm": 3.590258836746216, "grad_norm_var": 1.9598608598042269, "learning_rate": 0.0001, "loss": 1.4399, "loss/crossentropy": 2.3706881999969482, "loss/hidden": 1.234375, "loss/logits": 0.1949087232351303, "loss/reg": 0.0010581689421087503, "step": 3728 }, { "epoch": 0.466125, "grad_norm": 2.529618740081787, "grad_norm_var": 1.9642618708548638, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.6209750175476074, "loss/hidden": 1.125, "loss/logits": 0.17901381850242615, "loss/reg": 0.0010575860505923629, "step": 3729 }, { "epoch": 0.46625, "grad_norm": 2.8191704750061035, "grad_norm_var": 1.9603856829088868, "learning_rate": 0.0001, "loss": 1.2574, "loss/crossentropy": 2.4059226512908936, "loss/hidden": 1.03125, "loss/logits": 0.2155960202217102, "loss/reg": 0.0010569995502009988, "step": 3730 }, { "epoch": 0.466375, "grad_norm": 4.61501407623291, "grad_norm_var": 2.0961299825089514, "learning_rate": 0.0001, "loss": 1.7343, "loss/crossentropy": 2.489657402038574, "loss/hidden": 1.4921875, "loss/logits": 0.23158934712409973, "loss/reg": 0.0010563876712694764, "step": 3731 }, { "epoch": 0.4665, "grad_norm": 2.9083781242370605, "grad_norm_var": 2.0769583322350567, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.0922696590423584, "loss/hidden": 1.1796875, "loss/logits": 0.20314350724220276, "loss/reg": 0.0010557902278378606, "step": 3732 }, { "epoch": 0.466625, "grad_norm": 2.4133176803588867, "grad_norm_var": 2.0673326812737662, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.552004098892212, "loss/hidden": 0.890625, "loss/logits": 0.15522116422653198, "loss/reg": 0.001055160304531455, "step": 3733 }, { "epoch": 0.46675, "grad_norm": 14.528952598571777, "grad_norm_var": 9.050524511171833, "learning_rate": 0.0001, "loss": 2.2903, "loss/crossentropy": 3.16092586517334, "loss/hidden": 1.84375, "loss/logits": 0.4360020160675049, "loss/reg": 0.0010545790428295732, "step": 3734 }, { "epoch": 0.466875, "grad_norm": 2.535404920578003, "grad_norm_var": 9.05564689225628, "learning_rate": 0.0001, "loss": 1.1762, "loss/crossentropy": 2.5721189975738525, "loss/hidden": 0.984375, "loss/logits": 0.18130630254745483, "loss/reg": 0.0010539917275309563, "step": 3735 }, { "epoch": 0.467, "grad_norm": 3.3108134269714355, "grad_norm_var": 9.00180447033975, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.5983662605285645, "loss/hidden": 1.171875, "loss/logits": 0.1748800426721573, "loss/reg": 0.0010533732129260898, "step": 3736 }, { "epoch": 0.467125, "grad_norm": 2.887172222137451, "grad_norm_var": 8.868479698586702, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.6756997108459473, "loss/hidden": 1.1328125, "loss/logits": 0.22507551312446594, "loss/reg": 0.0010527775157243013, "step": 3737 }, { "epoch": 0.46725, "grad_norm": 3.2592861652374268, "grad_norm_var": 8.787699958506806, "learning_rate": 0.0001, "loss": 1.3629, "loss/crossentropy": 2.833913564682007, "loss/hidden": 1.140625, "loss/logits": 0.2117471694946289, "loss/reg": 0.0010521719232201576, "step": 3738 }, { "epoch": 0.467375, "grad_norm": 2.896531105041504, "grad_norm_var": 8.70077485822222, "learning_rate": 0.0001, "loss": 1.3264, "loss/crossentropy": 2.7216458320617676, "loss/hidden": 1.109375, "loss/logits": 0.20652078092098236, "loss/reg": 0.0010515962494537234, "step": 3739 }, { "epoch": 0.4675, "grad_norm": 2.625850200653076, "grad_norm_var": 8.705488910199454, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.850449562072754, "loss/hidden": 1.0, "loss/logits": 0.17214107513427734, "loss/reg": 0.001051024068146944, "step": 3740 }, { "epoch": 0.467625, "grad_norm": 2.634413242340088, "grad_norm_var": 8.76707967274899, "learning_rate": 0.0001, "loss": 1.2691, "loss/crossentropy": 2.690199851989746, "loss/hidden": 1.0703125, "loss/logits": 0.1882416307926178, "loss/reg": 0.0010504164965823293, "step": 3741 }, { "epoch": 0.46775, "grad_norm": 2.4903013706207275, "grad_norm_var": 8.693931095139906, "learning_rate": 0.0001, "loss": 1.0586, "loss/crossentropy": 2.819619655609131, "loss/hidden": 0.90234375, "loss/logits": 0.1457907110452652, "loss/reg": 0.0010497955372557044, "step": 3742 }, { "epoch": 0.467875, "grad_norm": 2.7081687450408936, "grad_norm_var": 8.685031403531102, "learning_rate": 0.0001, "loss": 1.2342, "loss/crossentropy": 2.5104386806488037, "loss/hidden": 1.0390625, "loss/logits": 0.18462492525577545, "loss/reg": 0.0010491719003766775, "step": 3743 }, { "epoch": 0.468, "grad_norm": 4.391199111938477, "grad_norm_var": 8.716391829694883, "learning_rate": 0.0001, "loss": 1.2382, "loss/crossentropy": 2.5002100467681885, "loss/hidden": 1.046875, "loss/logits": 0.18083949387073517, "loss/reg": 0.0010486061219125986, "step": 3744 }, { "epoch": 0.468125, "grad_norm": 3.1020188331604004, "grad_norm_var": 8.645859298980364, "learning_rate": 0.0001, "loss": 1.3629, "loss/crossentropy": 1.947885274887085, "loss/hidden": 1.1796875, "loss/logits": 0.17275087535381317, "loss/reg": 0.0010480149649083614, "step": 3745 }, { "epoch": 0.46825, "grad_norm": 2.4530813694000244, "grad_norm_var": 8.700055535892766, "learning_rate": 0.0001, "loss": 1.1514, "loss/crossentropy": 2.6354005336761475, "loss/hidden": 0.98046875, "loss/logits": 0.16043657064437866, "loss/reg": 0.0010474539594724774, "step": 3746 }, { "epoch": 0.468375, "grad_norm": 4.501230716705322, "grad_norm_var": 8.687513815503635, "learning_rate": 0.0001, "loss": 1.7084, "loss/crossentropy": 2.430232286453247, "loss/hidden": 1.421875, "loss/logits": 0.2760719656944275, "loss/reg": 0.001046862336806953, "step": 3747 }, { "epoch": 0.4685, "grad_norm": 2.7485971450805664, "grad_norm_var": 8.706568266073518, "learning_rate": 0.0001, "loss": 1.1655, "loss/crossentropy": 2.6515913009643555, "loss/hidden": 0.9921875, "loss/logits": 0.1628623902797699, "loss/reg": 0.0010462593054398894, "step": 3748 }, { "epoch": 0.468625, "grad_norm": 4.862452507019043, "grad_norm_var": 8.655447785710576, "learning_rate": 0.0001, "loss": 1.5374, "loss/crossentropy": 2.146183729171753, "loss/hidden": 1.3203125, "loss/logits": 0.2066783308982849, "loss/reg": 0.0010457164607942104, "step": 3749 }, { "epoch": 0.46875, "grad_norm": 2.5753014087677, "grad_norm_var": 0.5991467035758499, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.578402042388916, "loss/hidden": 0.984375, "loss/logits": 0.14904895424842834, "loss/reg": 0.0010451480047777295, "step": 3750 }, { "epoch": 0.468875, "grad_norm": 2.594205379486084, "grad_norm_var": 0.5947492424992521, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.6443724632263184, "loss/hidden": 0.98046875, "loss/logits": 0.18712539970874786, "loss/reg": 0.001044557779096067, "step": 3751 }, { "epoch": 0.469, "grad_norm": 2.7609641551971436, "grad_norm_var": 0.6002086851799798, "learning_rate": 0.0001, "loss": 1.2574, "loss/crossentropy": 2.7476205825805664, "loss/hidden": 1.078125, "loss/logits": 0.16887043416500092, "loss/reg": 0.001043999451212585, "step": 3752 }, { "epoch": 0.469125, "grad_norm": 2.5273211002349854, "grad_norm_var": 0.6181859522141998, "learning_rate": 0.0001, "loss": 1.1099, "loss/crossentropy": 3.021183490753174, "loss/hidden": 0.9453125, "loss/logits": 0.15414130687713623, "loss/reg": 0.0010434577707201242, "step": 3753 }, { "epoch": 0.46925, "grad_norm": 3.0462095737457275, "grad_norm_var": 0.6156652887168186, "learning_rate": 0.0001, "loss": 1.263, "loss/crossentropy": 2.9509634971618652, "loss/hidden": 1.078125, "loss/logits": 0.1744719296693802, "loss/reg": 0.0010429274989292026, "step": 3754 }, { "epoch": 0.469375, "grad_norm": 2.086707592010498, "grad_norm_var": 0.6740199875863783, "learning_rate": 0.0001, "loss": 1.1625, "loss/crossentropy": 2.3012125492095947, "loss/hidden": 0.98046875, "loss/logits": 0.17162567377090454, "loss/reg": 0.001042398507706821, "step": 3755 }, { "epoch": 0.4695, "grad_norm": 2.6115479469299316, "grad_norm_var": 0.6747591383258867, "learning_rate": 0.0001, "loss": 1.3042, "loss/crossentropy": 2.3198139667510986, "loss/hidden": 1.1171875, "loss/logits": 0.17658983170986176, "loss/reg": 0.0010418756864964962, "step": 3756 }, { "epoch": 0.469625, "grad_norm": 3.4453163146972656, "grad_norm_var": 0.6756961687380224, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.3196887969970703, "loss/hidden": 1.15625, "loss/logits": 0.17645755410194397, "loss/reg": 0.0010413621785119176, "step": 3757 }, { "epoch": 0.46975, "grad_norm": 2.5657591819763184, "grad_norm_var": 0.6703550964819281, "learning_rate": 0.0001, "loss": 1.1525, "loss/crossentropy": 2.4865410327911377, "loss/hidden": 0.98828125, "loss/logits": 0.15384459495544434, "loss/reg": 0.0010407918598502874, "step": 3758 }, { "epoch": 0.469875, "grad_norm": 2.879530906677246, "grad_norm_var": 0.6641229903210423, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.6609575748443604, "loss/hidden": 1.046875, "loss/logits": 0.1745145618915558, "loss/reg": 0.001040301052853465, "step": 3759 }, { "epoch": 0.47, "grad_norm": 2.9898691177368164, "grad_norm_var": 0.5403649103417572, "learning_rate": 0.0001, "loss": 1.2754, "loss/crossentropy": 2.3772876262664795, "loss/hidden": 1.09375, "loss/logits": 0.1712690144777298, "loss/reg": 0.001039811410009861, "step": 3760 }, { "epoch": 0.470125, "grad_norm": 3.9162628650665283, "grad_norm_var": 0.5945733310808682, "learning_rate": 0.0001, "loss": 1.5104, "loss/crossentropy": 2.438507556915283, "loss/hidden": 1.265625, "loss/logits": 0.2343619167804718, "loss/reg": 0.001039364142343402, "step": 3761 }, { "epoch": 0.47025, "grad_norm": 2.7068610191345215, "grad_norm_var": 0.5788988255850784, "learning_rate": 0.0001, "loss": 1.1434, "loss/crossentropy": 2.649901866912842, "loss/hidden": 0.984375, "loss/logits": 0.14859837293624878, "loss/reg": 0.0010388274677097797, "step": 3762 }, { "epoch": 0.470375, "grad_norm": 2.143653392791748, "grad_norm_var": 0.4704556791782994, "learning_rate": 0.0001, "loss": 1.0468, "loss/crossentropy": 2.6706063747406006, "loss/hidden": 0.90234375, "loss/logits": 0.134120911359787, "loss/reg": 0.0010382536565884948, "step": 3763 }, { "epoch": 0.4705, "grad_norm": 2.214045524597168, "grad_norm_var": 0.49937555635427366, "learning_rate": 0.0001, "loss": 0.9968, "loss/crossentropy": 2.3302195072174072, "loss/hidden": 0.86328125, "loss/logits": 0.1231309324502945, "loss/reg": 0.0010377810103818774, "step": 3764 }, { "epoch": 0.470625, "grad_norm": 2.4828028678894043, "grad_norm_var": 0.22123695394161563, "learning_rate": 0.0001, "loss": 1.3054, "loss/crossentropy": 2.3276054859161377, "loss/hidden": 1.1015625, "loss/logits": 0.19348205626010895, "loss/reg": 0.0010373073164373636, "step": 3765 }, { "epoch": 0.47075, "grad_norm": 2.0254900455474854, "grad_norm_var": 0.2508585956693819, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.52640962600708, "loss/hidden": 0.96484375, "loss/logits": 0.15082600712776184, "loss/reg": 0.0010368585353717208, "step": 3766 }, { "epoch": 0.470875, "grad_norm": 2.2360544204711914, "grad_norm_var": 0.26332043770821445, "learning_rate": 0.0001, "loss": 1.0951, "loss/crossentropy": 2.765958070755005, "loss/hidden": 0.9296875, "loss/logits": 0.15505728125572205, "loss/reg": 0.0010362821631133556, "step": 3767 }, { "epoch": 0.471, "grad_norm": 2.673532724380493, "grad_norm_var": 0.2626783305500375, "learning_rate": 0.0001, "loss": 1.0301, "loss/crossentropy": 2.4626576900482178, "loss/hidden": 0.87890625, "loss/logits": 0.14078682661056519, "loss/reg": 0.0010357408318668604, "step": 3768 }, { "epoch": 0.471125, "grad_norm": 2.7197072505950928, "grad_norm_var": 0.2616026821685369, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.2927536964416504, "loss/hidden": 1.1171875, "loss/logits": 0.17825204133987427, "loss/reg": 0.0010352992685511708, "step": 3769 }, { "epoch": 0.47125, "grad_norm": 2.530165910720825, "grad_norm_var": 0.2524615063826576, "learning_rate": 0.0001, "loss": 1.2327, "loss/crossentropy": 2.3669793605804443, "loss/hidden": 1.0625, "loss/logits": 0.15984904766082764, "loss/reg": 0.0010347155621275306, "step": 3770 }, { "epoch": 0.471375, "grad_norm": 2.8251028060913086, "grad_norm_var": 0.2321431990915052, "learning_rate": 0.0001, "loss": 1.3336, "loss/crossentropy": 2.6330323219299316, "loss/hidden": 1.1328125, "loss/logits": 0.19047117233276367, "loss/reg": 0.001034220797009766, "step": 3771 }, { "epoch": 0.4715, "grad_norm": 2.5778088569641113, "grad_norm_var": 0.23254637512083393, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.52360200881958, "loss/hidden": 1.046875, "loss/logits": 0.18464666604995728, "loss/reg": 0.00103374058380723, "step": 3772 }, { "epoch": 0.471625, "grad_norm": 2.73024845123291, "grad_norm_var": 0.1918465664209651, "learning_rate": 0.0001, "loss": 1.4287, "loss/crossentropy": 2.4510698318481445, "loss/hidden": 1.2109375, "loss/logits": 0.207466721534729, "loss/reg": 0.001033173524774611, "step": 3773 }, { "epoch": 0.47175, "grad_norm": 5.5982561111450195, "grad_norm_var": 0.7371647843596195, "learning_rate": 0.0001, "loss": 1.2914, "loss/crossentropy": 2.5356533527374268, "loss/hidden": 1.1171875, "loss/logits": 0.16388046741485596, "loss/reg": 0.0010326774790883064, "step": 3774 }, { "epoch": 0.471875, "grad_norm": 3.330371141433716, "grad_norm_var": 0.7529607383994403, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.3663136959075928, "loss/hidden": 1.0234375, "loss/logits": 0.1765405237674713, "loss/reg": 0.0010321062291041017, "step": 3775 }, { "epoch": 0.472, "grad_norm": 2.7923836708068848, "grad_norm_var": 0.7518802749050707, "learning_rate": 0.0001, "loss": 1.3959, "loss/crossentropy": 2.1732921600341797, "loss/hidden": 1.171875, "loss/logits": 0.21373462677001953, "loss/reg": 0.001031561754643917, "step": 3776 }, { "epoch": 0.472125, "grad_norm": 3.1368236541748047, "grad_norm_var": 0.6784073165047723, "learning_rate": 0.0001, "loss": 1.27, "loss/crossentropy": 2.3684985637664795, "loss/hidden": 1.0703125, "loss/logits": 0.1894214153289795, "loss/reg": 0.001031053252518177, "step": 3777 }, { "epoch": 0.47225, "grad_norm": 2.267341375350952, "grad_norm_var": 0.6956582019987451, "learning_rate": 0.0001, "loss": 1.1846, "loss/crossentropy": 2.434203863143921, "loss/hidden": 1.015625, "loss/logits": 0.15870434045791626, "loss/reg": 0.0010304804891347885, "step": 3778 }, { "epoch": 0.472375, "grad_norm": 2.6836698055267334, "grad_norm_var": 0.6689489415153349, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.5070836544036865, "loss/hidden": 1.0390625, "loss/logits": 0.17809173464775085, "loss/reg": 0.0010298980632796884, "step": 3779 }, { "epoch": 0.4725, "grad_norm": 3.875011682510376, "grad_norm_var": 0.7112782482680624, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 2.796905279159546, "loss/hidden": 1.1015625, "loss/logits": 0.18843744695186615, "loss/reg": 0.0010292951483279467, "step": 3780 }, { "epoch": 0.472625, "grad_norm": 2.358286142349243, "grad_norm_var": 0.719261638468659, "learning_rate": 0.0001, "loss": 1.2928, "loss/crossentropy": 2.372039794921875, "loss/hidden": 1.078125, "loss/logits": 0.20442074537277222, "loss/reg": 0.00102866324596107, "step": 3781 }, { "epoch": 0.47275, "grad_norm": 2.6700429916381836, "grad_norm_var": 0.670284927148046, "learning_rate": 0.0001, "loss": 1.1394, "loss/crossentropy": 2.4375572204589844, "loss/hidden": 0.9765625, "loss/logits": 0.15258187055587769, "loss/reg": 0.0010280911810696125, "step": 3782 }, { "epoch": 0.472875, "grad_norm": 3.147334575653076, "grad_norm_var": 0.6369219346869727, "learning_rate": 0.0001, "loss": 1.2127, "loss/crossentropy": 2.380455255508423, "loss/hidden": 1.03125, "loss/logits": 0.17114591598510742, "loss/reg": 0.0010274475207552314, "step": 3783 }, { "epoch": 0.473, "grad_norm": 3.2817318439483643, "grad_norm_var": 0.6339920866484136, "learning_rate": 0.0001, "loss": 1.4318, "loss/crossentropy": 2.395378351211548, "loss/hidden": 1.21875, "loss/logits": 0.20282995700836182, "loss/reg": 0.0010267584584653378, "step": 3784 }, { "epoch": 0.473125, "grad_norm": 4.071346759796143, "grad_norm_var": 0.691755820953265, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.656303644180298, "loss/hidden": 1.0625, "loss/logits": 0.205478698015213, "loss/reg": 0.0010261554270982742, "step": 3785 }, { "epoch": 0.47325, "grad_norm": 2.107231616973877, "grad_norm_var": 0.7360415472070372, "learning_rate": 0.0001, "loss": 1.0846, "loss/crossentropy": 2.515157699584961, "loss/hidden": 0.92578125, "loss/logits": 0.14859052002429962, "loss/reg": 0.0010254993103444576, "step": 3786 }, { "epoch": 0.473375, "grad_norm": 2.3594179153442383, "grad_norm_var": 0.7660936805607813, "learning_rate": 0.0001, "loss": 1.1806, "loss/crossentropy": 2.3779873847961426, "loss/hidden": 1.0078125, "loss/logits": 0.1625376045703888, "loss/reg": 0.0010249214246869087, "step": 3787 }, { "epoch": 0.4735, "grad_norm": 2.129061698913574, "grad_norm_var": 0.8076325916188192, "learning_rate": 0.0001, "loss": 1.213, "loss/crossentropy": 2.2039616107940674, "loss/hidden": 1.0234375, "loss/logits": 0.17928262054920197, "loss/reg": 0.0010243578581139445, "step": 3788 }, { "epoch": 0.473625, "grad_norm": 3.221423387527466, "grad_norm_var": 0.8028404753002611, "learning_rate": 0.0001, "loss": 1.5318, "loss/crossentropy": 2.3981847763061523, "loss/hidden": 1.28125, "loss/logits": 0.24029937386512756, "loss/reg": 0.0010237547103315592, "step": 3789 }, { "epoch": 0.47375, "grad_norm": 2.3523929119110107, "grad_norm_var": 0.3646925080007795, "learning_rate": 0.0001, "loss": 1.0549, "loss/crossentropy": 2.3994648456573486, "loss/hidden": 0.9140625, "loss/logits": 0.13063624501228333, "loss/reg": 0.0010232037166133523, "step": 3790 }, { "epoch": 0.473875, "grad_norm": 2.6020662784576416, "grad_norm_var": 0.3523126568125226, "learning_rate": 0.0001, "loss": 1.0249, "loss/crossentropy": 2.5697858333587646, "loss/hidden": 0.88671875, "loss/logits": 0.12795522809028625, "loss/reg": 0.001022615353576839, "step": 3791 }, { "epoch": 0.474, "grad_norm": 1.8998984098434448, "grad_norm_var": 0.40490284938084636, "learning_rate": 0.0001, "loss": 0.9904, "loss/crossentropy": 2.4907114505767822, "loss/hidden": 0.859375, "loss/logits": 0.12084287405014038, "loss/reg": 0.0010220760013908148, "step": 3792 }, { "epoch": 0.474125, "grad_norm": 2.4810969829559326, "grad_norm_var": 0.3988475010933337, "learning_rate": 0.0001, "loss": 1.1602, "loss/crossentropy": 2.324519634246826, "loss/hidden": 0.98046875, "loss/logits": 0.16946963965892792, "loss/reg": 0.00102148053701967, "step": 3793 }, { "epoch": 0.47425, "grad_norm": 2.6767971515655518, "grad_norm_var": 0.3846565348558063, "learning_rate": 0.0001, "loss": 1.0562, "loss/crossentropy": 2.077571392059326, "loss/hidden": 0.921875, "loss/logits": 0.12415832281112671, "loss/reg": 0.0010208690073341131, "step": 3794 }, { "epoch": 0.474375, "grad_norm": 3.015474557876587, "grad_norm_var": 0.3888329678437462, "learning_rate": 0.0001, "loss": 1.0951, "loss/crossentropy": 2.6470508575439453, "loss/hidden": 0.92578125, "loss/logits": 0.1591503620147705, "loss/reg": 0.0010203415295109153, "step": 3795 }, { "epoch": 0.4745, "grad_norm": 2.7243542671203613, "grad_norm_var": 0.3013672652226252, "learning_rate": 0.0001, "loss": 1.3046, "loss/crossentropy": 2.488734245300293, "loss/hidden": 1.109375, "loss/logits": 0.18504154682159424, "loss/reg": 0.0010197428055107594, "step": 3796 }, { "epoch": 0.474625, "grad_norm": 2.0837106704711914, "grad_norm_var": 0.31835592524550355, "learning_rate": 0.0001, "loss": 1.0348, "loss/crossentropy": 2.2965312004089355, "loss/hidden": 0.8984375, "loss/logits": 0.12620750069618225, "loss/reg": 0.0010191682958975434, "step": 3797 }, { "epoch": 0.47475, "grad_norm": 2.381087064743042, "grad_norm_var": 0.32382167976260845, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.5921807289123535, "loss/hidden": 0.9765625, "loss/logits": 0.15944349765777588, "loss/reg": 0.0010186234721913934, "step": 3798 }, { "epoch": 0.474875, "grad_norm": 2.2241954803466797, "grad_norm_var": 0.31690287607816753, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.8388302326202393, "loss/hidden": 0.9375, "loss/logits": 0.15752184391021729, "loss/reg": 0.0010180945973843336, "step": 3799 }, { "epoch": 0.475, "grad_norm": 2.018507480621338, "grad_norm_var": 0.30193114323544223, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.530259132385254, "loss/hidden": 0.921875, "loss/logits": 0.14986175298690796, "loss/reg": 0.001017540111206472, "step": 3800 }, { "epoch": 0.475125, "grad_norm": 2.4066989421844482, "grad_norm_var": 0.13118506914761804, "learning_rate": 0.0001, "loss": 1.1191, "loss/crossentropy": 2.491457462310791, "loss/hidden": 0.96484375, "loss/logits": 0.14407965540885925, "loss/reg": 0.0010169785236939788, "step": 3801 }, { "epoch": 0.47525, "grad_norm": 3.4207069873809814, "grad_norm_var": 0.18463647138117376, "learning_rate": 0.0001, "loss": 1.2738, "loss/crossentropy": 2.5598089694976807, "loss/hidden": 1.078125, "loss/logits": 0.18551099300384521, "loss/reg": 0.0010164324194192886, "step": 3802 }, { "epoch": 0.475375, "grad_norm": 2.2817890644073486, "grad_norm_var": 0.1864661962865923, "learning_rate": 0.0001, "loss": 1.1689, "loss/crossentropy": 2.6134941577911377, "loss/hidden": 0.9765625, "loss/logits": 0.18219929933547974, "loss/reg": 0.0010158595396205783, "step": 3803 }, { "epoch": 0.4755, "grad_norm": 3.1798338890075684, "grad_norm_var": 0.20421126288790953, "learning_rate": 0.0001, "loss": 1.0803, "loss/crossentropy": 2.555511474609375, "loss/hidden": 0.9296875, "loss/logits": 0.14045485854148865, "loss/reg": 0.0010152931790798903, "step": 3804 }, { "epoch": 0.475625, "grad_norm": 3.561237335205078, "grad_norm_var": 0.24136806404609146, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.514158248901367, "loss/hidden": 1.046875, "loss/logits": 0.1534043401479721, "loss/reg": 0.001014724955894053, "step": 3805 }, { "epoch": 0.47575, "grad_norm": 2.369704484939575, "grad_norm_var": 0.2408571239197707, "learning_rate": 0.0001, "loss": 1.1925, "loss/crossentropy": 2.439347982406616, "loss/hidden": 1.0, "loss/logits": 0.1823522448539734, "loss/reg": 0.0010141364764422178, "step": 3806 }, { "epoch": 0.475875, "grad_norm": 2.513235330581665, "grad_norm_var": 0.24112386156733306, "learning_rate": 0.0001, "loss": 1.1692, "loss/crossentropy": 2.369856119155884, "loss/hidden": 0.99609375, "loss/logits": 0.1630163937807083, "loss/reg": 0.0010135597549378872, "step": 3807 }, { "epoch": 0.476, "grad_norm": 2.7011382579803467, "grad_norm_var": 0.20886958637742434, "learning_rate": 0.0001, "loss": 1.1514, "loss/crossentropy": 2.400017738342285, "loss/hidden": 0.98828125, "loss/logits": 0.1530260443687439, "loss/reg": 0.0010130032896995544, "step": 3808 }, { "epoch": 0.476125, "grad_norm": 3.122610569000244, "grad_norm_var": 0.22207052291217502, "learning_rate": 0.0001, "loss": 1.1687, "loss/crossentropy": 2.6465508937835693, "loss/hidden": 0.99609375, "loss/logits": 0.16243284940719604, "loss/reg": 0.0010123944375663996, "step": 3809 }, { "epoch": 0.47625, "grad_norm": 3.0298376083374023, "grad_norm_var": 0.23029482487877337, "learning_rate": 0.0001, "loss": 1.3002, "loss/crossentropy": 2.301685333251953, "loss/hidden": 1.125, "loss/logits": 0.16509054601192474, "loss/reg": 0.0010117434430867434, "step": 3810 }, { "epoch": 0.476375, "grad_norm": 2.3605551719665527, "grad_norm_var": 0.2286489276417548, "learning_rate": 0.0001, "loss": 1.0267, "loss/crossentropy": 2.5654475688934326, "loss/hidden": 0.88671875, "loss/logits": 0.12986904382705688, "loss/reg": 0.0010112058371305466, "step": 3811 }, { "epoch": 0.4765, "grad_norm": 2.941214084625244, "grad_norm_var": 0.23377570028820643, "learning_rate": 0.0001, "loss": 1.3064, "loss/crossentropy": 2.3023383617401123, "loss/hidden": 1.125, "loss/logits": 0.17129108309745789, "loss/reg": 0.0010105702094733715, "step": 3812 }, { "epoch": 0.476625, "grad_norm": 2.736543655395508, "grad_norm_var": 0.21005368468196722, "learning_rate": 0.0001, "loss": 1.1362, "loss/crossentropy": 2.4729695320129395, "loss/hidden": 0.9765625, "loss/logits": 0.14952141046524048, "loss/reg": 0.001010029693134129, "step": 3813 }, { "epoch": 0.47675, "grad_norm": 2.5862503051757812, "grad_norm_var": 0.20387694088771013, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.3758089542388916, "loss/hidden": 0.9140625, "loss/logits": 0.1236475333571434, "loss/reg": 0.0010094038443639874, "step": 3814 }, { "epoch": 0.476875, "grad_norm": 2.851497173309326, "grad_norm_var": 0.18734666167367683, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.483431816101074, "loss/hidden": 1.0390625, "loss/logits": 0.1639733612537384, "loss/reg": 0.0010088563431054354, "step": 3815 }, { "epoch": 0.477, "grad_norm": 2.964251756668091, "grad_norm_var": 0.15036681068055652, "learning_rate": 0.0001, "loss": 1.2071, "loss/crossentropy": 2.5335826873779297, "loss/hidden": 1.03125, "loss/logits": 0.1657508760690689, "loss/reg": 0.0010083268862217665, "step": 3816 }, { "epoch": 0.477125, "grad_norm": 2.521502733230591, "grad_norm_var": 0.14495295749751047, "learning_rate": 0.0001, "loss": 1.1355, "loss/crossentropy": 2.4318947792053223, "loss/hidden": 0.984375, "loss/logits": 0.14100542664527893, "loss/reg": 0.0010078143095597625, "step": 3817 }, { "epoch": 0.47725, "grad_norm": 2.859482526779175, "grad_norm_var": 0.11979036556660366, "learning_rate": 0.0001, "loss": 1.5544, "loss/crossentropy": 2.394251585006714, "loss/hidden": 1.296875, "loss/logits": 0.24743588268756866, "loss/reg": 0.0010072708828374743, "step": 3818 }, { "epoch": 0.477375, "grad_norm": 2.821807384490967, "grad_norm_var": 0.10169110473085823, "learning_rate": 0.0001, "loss": 1.1867, "loss/crossentropy": 2.8032450675964355, "loss/hidden": 1.015625, "loss/logits": 0.1610012948513031, "loss/reg": 0.0010067183757200837, "step": 3819 }, { "epoch": 0.4775, "grad_norm": 5.056894302368164, "grad_norm_var": 0.4119471834275146, "learning_rate": 0.0001, "loss": 1.5494, "loss/crossentropy": 2.3905587196350098, "loss/hidden": 1.359375, "loss/logits": 0.17995157837867737, "loss/reg": 0.0010061421198770404, "step": 3820 }, { "epoch": 0.477625, "grad_norm": 2.6750197410583496, "grad_norm_var": 0.38731474525797294, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.6245410442352295, "loss/hidden": 0.8984375, "loss/logits": 0.15511852502822876, "loss/reg": 0.0010056288447231054, "step": 3821 }, { "epoch": 0.47775, "grad_norm": 2.2640578746795654, "grad_norm_var": 0.39522822476376024, "learning_rate": 0.0001, "loss": 1.1588, "loss/crossentropy": 2.5605897903442383, "loss/hidden": 0.98046875, "loss/logits": 0.16832897067070007, "loss/reg": 0.0010051216231659055, "step": 3822 }, { "epoch": 0.477875, "grad_norm": 2.810070276260376, "grad_norm_var": 0.38640265124457757, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.2229278087615967, "loss/hidden": 1.2890625, "loss/logits": 0.22936587035655975, "loss/reg": 0.0010045509552583098, "step": 3823 }, { "epoch": 0.478, "grad_norm": 7.900766849517822, "grad_norm_var": 1.942508201785307, "learning_rate": 0.0001, "loss": 1.5038, "loss/crossentropy": 2.3500988483428955, "loss/hidden": 1.28125, "loss/logits": 0.2125115692615509, "loss/reg": 0.0010039786575362086, "step": 3824 }, { "epoch": 0.478125, "grad_norm": 2.2908005714416504, "grad_norm_var": 1.9964314486523504, "learning_rate": 0.0001, "loss": 1.1035, "loss/crossentropy": 2.4654436111450195, "loss/hidden": 0.9375, "loss/logits": 0.1559765338897705, "loss/reg": 0.0010034649167209864, "step": 3825 }, { "epoch": 0.47825, "grad_norm": 2.7496652603149414, "grad_norm_var": 2.006457983223358, "learning_rate": 0.0001, "loss": 1.3527, "loss/crossentropy": 2.3856773376464844, "loss/hidden": 1.140625, "loss/logits": 0.20207852125167847, "loss/reg": 0.001002990873530507, "step": 3826 }, { "epoch": 0.478375, "grad_norm": 2.96850323677063, "grad_norm_var": 1.9656145811095322, "learning_rate": 0.0001, "loss": 1.1246, "loss/crossentropy": 2.509129047393799, "loss/hidden": 0.95703125, "loss/logits": 0.15750794112682343, "loss/reg": 0.0010024425573647022, "step": 3827 }, { "epoch": 0.4785, "grad_norm": 2.322000026702881, "grad_norm_var": 2.0099039069158446, "learning_rate": 0.0001, "loss": 1.0903, "loss/crossentropy": 2.430481433868408, "loss/hidden": 0.9375, "loss/logits": 0.14277693629264832, "loss/reg": 0.0010017442982643843, "step": 3828 }, { "epoch": 0.478625, "grad_norm": 2.84533953666687, "grad_norm_var": 2.0046649808036134, "learning_rate": 0.0001, "loss": 1.2539, "loss/crossentropy": 2.3790130615234375, "loss/hidden": 1.0625, "loss/logits": 0.1813977062702179, "loss/reg": 0.001001054304651916, "step": 3829 }, { "epoch": 0.47875, "grad_norm": 2.8437020778656006, "grad_norm_var": 1.9892671842286291, "learning_rate": 0.0001, "loss": 1.2267, "loss/crossentropy": 2.5717296600341797, "loss/hidden": 1.03125, "loss/logits": 0.18547095358371735, "loss/reg": 0.0010003690840676427, "step": 3830 }, { "epoch": 0.478875, "grad_norm": 4.956015110015869, "grad_norm_var": 2.176261985377486, "learning_rate": 0.0001, "loss": 1.1464, "loss/crossentropy": 2.7394986152648926, "loss/hidden": 0.96875, "loss/logits": 0.16768449544906616, "loss/reg": 0.0009996765293180943, "step": 3831 }, { "epoch": 0.479, "grad_norm": 6.31146240234375, "grad_norm_var": 2.7252666969035806, "learning_rate": 0.0001, "loss": 2.2214, "loss/crossentropy": 3.002293825149536, "loss/hidden": 1.7890625, "loss/logits": 0.422338604927063, "loss/reg": 0.0009989682585000992, "step": 3832 }, { "epoch": 0.479125, "grad_norm": 2.5021042823791504, "grad_norm_var": 2.727852920123257, "learning_rate": 0.0001, "loss": 1.0959, "loss/crossentropy": 2.5677802562713623, "loss/hidden": 0.94140625, "loss/logits": 0.1445181667804718, "loss/reg": 0.0009983173804357648, "step": 3833 }, { "epoch": 0.47925, "grad_norm": 19.8763370513916, "grad_norm_var": 19.347709603649108, "learning_rate": 0.0001, "loss": 1.4294, "loss/crossentropy": 2.2940475940704346, "loss/hidden": 1.234375, "loss/logits": 0.1850033551454544, "loss/reg": 0.0009976484579965472, "step": 3834 }, { "epoch": 0.479375, "grad_norm": 2.5055060386657715, "grad_norm_var": 19.427886427717326, "learning_rate": 0.0001, "loss": 1.061, "loss/crossentropy": 2.765748977661133, "loss/hidden": 0.9140625, "loss/logits": 0.13693846762180328, "loss/reg": 0.0009970958344638348, "step": 3835 }, { "epoch": 0.4795, "grad_norm": 2.8397607803344727, "grad_norm_var": 19.586715170394154, "learning_rate": 0.0001, "loss": 1.2979, "loss/crossentropy": 2.7400200366973877, "loss/hidden": 1.0859375, "loss/logits": 0.2020002007484436, "loss/reg": 0.0009964705677703023, "step": 3836 }, { "epoch": 0.479625, "grad_norm": 3.139051914215088, "grad_norm_var": 19.492437158417527, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.4851839542388916, "loss/hidden": 0.98828125, "loss/logits": 0.1564827859401703, "loss/reg": 0.0009958111913874745, "step": 3837 }, { "epoch": 0.47975, "grad_norm": 2.4869613647460938, "grad_norm_var": 19.43071436262637, "learning_rate": 0.0001, "loss": 1.3644, "loss/crossentropy": 2.621083974838257, "loss/hidden": 1.125, "loss/logits": 0.22949010133743286, "loss/reg": 0.000995261943899095, "step": 3838 }, { "epoch": 0.479875, "grad_norm": 3.4114043712615967, "grad_norm_var": 19.321086563682076, "learning_rate": 0.0001, "loss": 1.2731, "loss/crossentropy": 2.0640392303466797, "loss/hidden": 1.109375, "loss/logits": 0.15372848510742188, "loss/reg": 0.0009946615900844336, "step": 3839 }, { "epoch": 0.48, "grad_norm": 2.4083080291748047, "grad_norm_var": 18.71373744434961, "learning_rate": 0.0001, "loss": 1.3296, "loss/crossentropy": 2.670114040374756, "loss/hidden": 1.125, "loss/logits": 0.19469651579856873, "loss/reg": 0.000994127127341926, "step": 3840 }, { "epoch": 0.480125, "grad_norm": 3.122145414352417, "grad_norm_var": 18.550454205120285, "learning_rate": 0.0001, "loss": 1.2776, "loss/crossentropy": 2.5285885334014893, "loss/hidden": 1.0703125, "loss/logits": 0.19733071327209473, "loss/reg": 0.0009936005808413029, "step": 3841 }, { "epoch": 0.48025, "grad_norm": 3.6272876262664795, "grad_norm_var": 18.428234649581576, "learning_rate": 0.0001, "loss": 1.6053, "loss/crossentropy": 2.617379903793335, "loss/hidden": 1.34375, "loss/logits": 0.2516637146472931, "loss/reg": 0.0009931274689733982, "step": 3842 }, { "epoch": 0.480375, "grad_norm": 2.3280117511749268, "grad_norm_var": 18.564197774453874, "learning_rate": 0.0001, "loss": 1.1608, "loss/crossentropy": 2.7009565830230713, "loss/hidden": 0.984375, "loss/logits": 0.1665092259645462, "loss/reg": 0.000992586719803512, "step": 3843 }, { "epoch": 0.4805, "grad_norm": 3.039219617843628, "grad_norm_var": 18.41481139058075, "learning_rate": 0.0001, "loss": 1.2331, "loss/crossentropy": 2.2380242347717285, "loss/hidden": 1.0390625, "loss/logits": 0.18415310978889465, "loss/reg": 0.0009920995216816664, "step": 3844 }, { "epoch": 0.480625, "grad_norm": 14.223673820495605, "grad_norm_var": 24.352436062428406, "learning_rate": 0.0001, "loss": 2.1204, "loss/crossentropy": 2.3986945152282715, "loss/hidden": 1.7578125, "loss/logits": 0.35267388820648193, "loss/reg": 0.0009916136041283607, "step": 3845 }, { "epoch": 0.48075, "grad_norm": 3.344921112060547, "grad_norm_var": 24.22561688873614, "learning_rate": 0.0001, "loss": 1.4799, "loss/crossentropy": 2.4743399620056152, "loss/hidden": 1.234375, "loss/logits": 0.23563525080680847, "loss/reg": 0.0009911012602970004, "step": 3846 }, { "epoch": 0.480875, "grad_norm": 2.6594321727752686, "grad_norm_var": 24.571066998628513, "learning_rate": 0.0001, "loss": 1.1252, "loss/crossentropy": 2.463151693344116, "loss/hidden": 0.94921875, "loss/logits": 0.16602733731269836, "loss/reg": 0.000990543281659484, "step": 3847 }, { "epoch": 0.481, "grad_norm": 2.824007749557495, "grad_norm_var": 24.658198123895424, "learning_rate": 0.0001, "loss": 1.1541, "loss/crossentropy": 2.6075382232666016, "loss/hidden": 0.9921875, "loss/logits": 0.15203343331813812, "loss/reg": 0.0009900431614369154, "step": 3848 }, { "epoch": 0.481125, "grad_norm": 2.329282283782959, "grad_norm_var": 24.709469556839103, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.492720365524292, "loss/hidden": 1.078125, "loss/logits": 0.1548917591571808, "loss/reg": 0.000989554449915886, "step": 3849 }, { "epoch": 0.48125, "grad_norm": 2.59311580657959, "grad_norm_var": 8.257006324728058, "learning_rate": 0.0001, "loss": 1.2821, "loss/crossentropy": 2.73769474029541, "loss/hidden": 1.0859375, "loss/logits": 0.18624958395957947, "loss/reg": 0.0009890699293464422, "step": 3850 }, { "epoch": 0.481375, "grad_norm": 6.283566951751709, "grad_norm_var": 8.62037592937811, "learning_rate": 0.0001, "loss": 1.7599, "loss/crossentropy": 2.67447829246521, "loss/hidden": 1.484375, "loss/logits": 0.26564088463783264, "loss/reg": 0.0009886096231639385, "step": 3851 }, { "epoch": 0.4815, "grad_norm": 5.981793403625488, "grad_norm_var": 8.838780355448488, "learning_rate": 0.0001, "loss": 1.5497, "loss/crossentropy": 2.285877227783203, "loss/hidden": 1.3515625, "loss/logits": 0.18828608095645905, "loss/reg": 0.0009881755104288459, "step": 3852 }, { "epoch": 0.481625, "grad_norm": 2.33320689201355, "grad_norm_var": 8.970544010754018, "learning_rate": 0.0001, "loss": 1.1909, "loss/crossentropy": 2.391533851623535, "loss/hidden": 1.015625, "loss/logits": 0.16543591022491455, "loss/reg": 0.0009877528063952923, "step": 3853 }, { "epoch": 0.48175, "grad_norm": 2.369744062423706, "grad_norm_var": 8.994069607627866, "learning_rate": 0.0001, "loss": 1.3245, "loss/crossentropy": 2.3503201007843018, "loss/hidden": 1.1328125, "loss/logits": 0.18184694647789001, "loss/reg": 0.0009872325463220477, "step": 3854 }, { "epoch": 0.481875, "grad_norm": 2.110908031463623, "grad_norm_var": 9.189689981746975, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.629596710205078, "loss/hidden": 0.94921875, "loss/logits": 0.15330851078033447, "loss/reg": 0.0009868023917078972, "step": 3855 }, { "epoch": 0.482, "grad_norm": 3.180330991744995, "grad_norm_var": 9.0786761418345, "learning_rate": 0.0001, "loss": 1.4608, "loss/crossentropy": 2.6938958168029785, "loss/hidden": 1.234375, "loss/logits": 0.21655558049678802, "loss/reg": 0.0009862742153927684, "step": 3856 }, { "epoch": 0.482125, "grad_norm": 2.470818519592285, "grad_norm_var": 9.172474122918823, "learning_rate": 0.0001, "loss": 1.1179, "loss/crossentropy": 2.3639256954193115, "loss/hidden": 0.95703125, "loss/logits": 0.15101152658462524, "loss/reg": 0.0009858174016699195, "step": 3857 }, { "epoch": 0.48225, "grad_norm": 2.793362855911255, "grad_norm_var": 9.241392129845044, "learning_rate": 0.0001, "loss": 1.3409, "loss/crossentropy": 2.6154537200927734, "loss/hidden": 1.140625, "loss/logits": 0.19037774205207825, "loss/reg": 0.0009853205410763621, "step": 3858 }, { "epoch": 0.482375, "grad_norm": 2.125232696533203, "grad_norm_var": 9.283871048177028, "learning_rate": 0.0001, "loss": 1.1482, "loss/crossentropy": 2.272627115249634, "loss/hidden": 0.98046875, "loss/logits": 0.15784162282943726, "loss/reg": 0.0009848305489867926, "step": 3859 }, { "epoch": 0.4825, "grad_norm": 2.583829641342163, "grad_norm_var": 9.342504511012352, "learning_rate": 0.0001, "loss": 1.1036, "loss/crossentropy": 2.4618449211120605, "loss/hidden": 0.94140625, "loss/logits": 0.15236282348632812, "loss/reg": 0.0009843386942520738, "step": 3860 }, { "epoch": 0.482625, "grad_norm": 2.6063671112060547, "grad_norm_var": 1.5742289695223421, "learning_rate": 0.0001, "loss": 1.3485, "loss/crossentropy": 2.5883679389953613, "loss/hidden": 1.125, "loss/logits": 0.21369034051895142, "loss/reg": 0.000983806443400681, "step": 3861 }, { "epoch": 0.48275, "grad_norm": 2.942054510116577, "grad_norm_var": 1.5678256801899655, "learning_rate": 0.0001, "loss": 1.3694, "loss/crossentropy": 2.853081703186035, "loss/hidden": 1.171875, "loss/logits": 0.18769419193267822, "loss/reg": 0.0009832879295572639, "step": 3862 }, { "epoch": 0.482875, "grad_norm": 2.521088123321533, "grad_norm_var": 1.5755195914488807, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.386749744415283, "loss/hidden": 1.1015625, "loss/logits": 0.16611292958259583, "loss/reg": 0.0009827249450609088, "step": 3863 }, { "epoch": 0.483, "grad_norm": 2.823361873626709, "grad_norm_var": 1.5755350355787405, "learning_rate": 0.0001, "loss": 1.1976, "loss/crossentropy": 2.3291540145874023, "loss/hidden": 1.03125, "loss/logits": 0.15652960538864136, "loss/reg": 0.0009821507846936584, "step": 3864 }, { "epoch": 0.483125, "grad_norm": 2.208906650543213, "grad_norm_var": 1.5872539690793086, "learning_rate": 0.0001, "loss": 1.0896, "loss/crossentropy": 2.4856553077697754, "loss/hidden": 0.92578125, "loss/logits": 0.15402814745903015, "loss/reg": 0.0009815255180001259, "step": 3865 }, { "epoch": 0.48325, "grad_norm": 8.507161140441895, "grad_norm_var": 3.455969321245068, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.647115707397461, "loss/hidden": 1.1796875, "loss/logits": 0.21183907985687256, "loss/reg": 0.0009809039765968919, "step": 3866 }, { "epoch": 0.483375, "grad_norm": 2.6530346870422363, "grad_norm_var": 2.8670260174461486, "learning_rate": 0.0001, "loss": 1.2399, "loss/crossentropy": 2.5514309406280518, "loss/hidden": 1.046875, "loss/logits": 0.18321657180786133, "loss/reg": 0.000980377197265625, "step": 3867 }, { "epoch": 0.4835, "grad_norm": 2.7146053314208984, "grad_norm_var": 2.2954428251655212, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.4043617248535156, "loss/hidden": 1.0078125, "loss/logits": 0.17560836672782898, "loss/reg": 0.0009798492537811399, "step": 3868 }, { "epoch": 0.483625, "grad_norm": 2.8216774463653564, "grad_norm_var": 2.271226190417633, "learning_rate": 0.0001, "loss": 1.2375, "loss/crossentropy": 2.608152389526367, "loss/hidden": 1.03125, "loss/logits": 0.19642019271850586, "loss/reg": 0.0009793040808290243, "step": 3869 }, { "epoch": 0.48375, "grad_norm": 2.702674627304077, "grad_norm_var": 2.2517508637007, "learning_rate": 0.0001, "loss": 1.5555, "loss/crossentropy": 2.379974365234375, "loss/hidden": 1.2578125, "loss/logits": 0.2879074811935425, "loss/reg": 0.0009787173476070166, "step": 3870 }, { "epoch": 0.483875, "grad_norm": 3.3122060298919678, "grad_norm_var": 2.201885476855244, "learning_rate": 0.0001, "loss": 1.3003, "loss/crossentropy": 3.0506629943847656, "loss/hidden": 1.09375, "loss/logits": 0.19672811031341553, "loss/reg": 0.0009781832341104746, "step": 3871 }, { "epoch": 0.484, "grad_norm": 2.564932346343994, "grad_norm_var": 2.2157160804201377, "learning_rate": 0.0001, "loss": 1.1628, "loss/crossentropy": 2.654998302459717, "loss/hidden": 0.99609375, "loss/logits": 0.15690653026103973, "loss/reg": 0.000977598363533616, "step": 3872 }, { "epoch": 0.484125, "grad_norm": 1.9866045713424683, "grad_norm_var": 2.2659525588540843, "learning_rate": 0.0001, "loss": 1.1536, "loss/crossentropy": 2.248006582260132, "loss/hidden": 0.98828125, "loss/logits": 0.15551240742206573, "loss/reg": 0.0009770109318196774, "step": 3873 }, { "epoch": 0.48425, "grad_norm": 2.4865634441375732, "grad_norm_var": 2.279948465601332, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.6419310569763184, "loss/hidden": 1.03125, "loss/logits": 0.1782725751399994, "loss/reg": 0.00097639363957569, "step": 3874 }, { "epoch": 0.484375, "grad_norm": 2.750188112258911, "grad_norm_var": 2.233756909505145, "learning_rate": 0.0001, "loss": 1.2361, "loss/crossentropy": 2.5739917755126953, "loss/hidden": 1.0546875, "loss/logits": 0.17166922986507416, "loss/reg": 0.0009757412481121719, "step": 3875 }, { "epoch": 0.4845, "grad_norm": 2.746734142303467, "grad_norm_var": 2.2261245653476562, "learning_rate": 0.0001, "loss": 1.2022, "loss/crossentropy": 2.517704486846924, "loss/hidden": 1.03125, "loss/logits": 0.16122382879257202, "loss/reg": 0.0009750965400598943, "step": 3876 }, { "epoch": 0.484625, "grad_norm": 3.0659143924713135, "grad_norm_var": 2.2138711899042596, "learning_rate": 0.0001, "loss": 1.2555, "loss/crossentropy": 2.7549352645874023, "loss/hidden": 1.078125, "loss/logits": 0.1676369607448578, "loss/reg": 0.0009745695278979838, "step": 3877 }, { "epoch": 0.48475, "grad_norm": 2.1293835639953613, "grad_norm_var": 2.2668970708865377, "learning_rate": 0.0001, "loss": 1.1207, "loss/crossentropy": 2.406440019607544, "loss/hidden": 0.9609375, "loss/logits": 0.15001198649406433, "loss/reg": 0.0009739145752973855, "step": 3878 }, { "epoch": 0.484875, "grad_norm": 2.7037017345428467, "grad_norm_var": 2.2573280804882914, "learning_rate": 0.0001, "loss": 1.2281, "loss/crossentropy": 2.3235747814178467, "loss/hidden": 1.0390625, "loss/logits": 0.17930641770362854, "loss/reg": 0.0009733041515573859, "step": 3879 }, { "epoch": 0.485, "grad_norm": 2.542484760284424, "grad_norm_var": 2.269289790523822, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.4276955127716064, "loss/hidden": 0.98828125, "loss/logits": 0.17397069931030273, "loss/reg": 0.0009726633434183896, "step": 3880 }, { "epoch": 0.485125, "grad_norm": 2.7562434673309326, "grad_norm_var": 2.230751620773511, "learning_rate": 0.0001, "loss": 1.2368, "loss/crossentropy": 2.9651577472686768, "loss/hidden": 1.046875, "loss/logits": 0.18017736077308655, "loss/reg": 0.0009719987865537405, "step": 3881 }, { "epoch": 0.48525, "grad_norm": 2.6996052265167236, "grad_norm_var": 0.09580699726711632, "learning_rate": 0.0001, "loss": 1.2163, "loss/crossentropy": 2.498152732849121, "loss/hidden": 1.0234375, "loss/logits": 0.1831151843070984, "loss/reg": 0.0009713195031508803, "step": 3882 }, { "epoch": 0.485375, "grad_norm": 2.6036300659179688, "grad_norm_var": 0.09603694842035626, "learning_rate": 0.0001, "loss": 1.1679, "loss/crossentropy": 2.5527193546295166, "loss/hidden": 0.9921875, "loss/logits": 0.1659805029630661, "loss/reg": 0.000970788998529315, "step": 3883 }, { "epoch": 0.4855, "grad_norm": 2.5404834747314453, "grad_norm_var": 0.09670351283897904, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.7073113918304443, "loss/hidden": 0.9609375, "loss/logits": 0.16367802023887634, "loss/reg": 0.000970257620792836, "step": 3884 }, { "epoch": 0.485625, "grad_norm": 2.691671371459961, "grad_norm_var": 0.09479809378065192, "learning_rate": 0.0001, "loss": 1.0045, "loss/crossentropy": 2.043846368789673, "loss/hidden": 0.8671875, "loss/logits": 0.12763670086860657, "loss/reg": 0.0009696534252725542, "step": 3885 }, { "epoch": 0.48575, "grad_norm": 2.20247745513916, "grad_norm_var": 0.10643478952511168, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.2900848388671875, "loss/hidden": 1.125, "loss/logits": 0.20509189367294312, "loss/reg": 0.0009690164006315172, "step": 3886 }, { "epoch": 0.485875, "grad_norm": 4.536986351013184, "grad_norm_var": 0.3146303454476907, "learning_rate": 0.0001, "loss": 1.6112, "loss/crossentropy": 2.3823635578155518, "loss/hidden": 1.3828125, "loss/logits": 0.2186613529920578, "loss/reg": 0.0009683648240752518, "step": 3887 }, { "epoch": 0.486, "grad_norm": 2.9407994747161865, "grad_norm_var": 0.3172937290333019, "learning_rate": 0.0001, "loss": 1.3771, "loss/crossentropy": 2.6425392627716064, "loss/hidden": 1.1640625, "loss/logits": 0.20335251092910767, "loss/reg": 0.0009678347851149738, "step": 3888 }, { "epoch": 0.486125, "grad_norm": 2.9972920417785645, "grad_norm_var": 0.28345555509000986, "learning_rate": 0.0001, "loss": 1.2534, "loss/crossentropy": 2.247020959854126, "loss/hidden": 1.0859375, "loss/logits": 0.15776273608207703, "loss/reg": 0.0009671897860243917, "step": 3889 }, { "epoch": 0.48625, "grad_norm": 7.43991231918335, "grad_norm_var": 1.6266788048715388, "learning_rate": 0.0001, "loss": 1.3465, "loss/crossentropy": 2.4727699756622314, "loss/hidden": 1.15625, "loss/logits": 0.18053875863552094, "loss/reg": 0.0009665501420386136, "step": 3890 }, { "epoch": 0.486375, "grad_norm": 2.3125510215759277, "grad_norm_var": 1.6581404490246254, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.6426429748535156, "loss/hidden": 0.90234375, "loss/logits": 0.14280441403388977, "loss/reg": 0.0009660268551670015, "step": 3891 }, { "epoch": 0.4865, "grad_norm": 2.7213850021362305, "grad_norm_var": 1.659228823470164, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.515568494796753, "loss/hidden": 1.09375, "loss/logits": 0.17534980177879333, "loss/reg": 0.0009653669549152255, "step": 3892 }, { "epoch": 0.486625, "grad_norm": 2.8212759494781494, "grad_norm_var": 1.6626225290198589, "learning_rate": 0.0001, "loss": 1.2863, "loss/crossentropy": 2.2115042209625244, "loss/hidden": 1.1015625, "loss/logits": 0.1750687062740326, "loss/reg": 0.0009648026316426694, "step": 3893 }, { "epoch": 0.48675, "grad_norm": 3.191157579421997, "grad_norm_var": 1.6041679662108321, "learning_rate": 0.0001, "loss": 1.2139, "loss/crossentropy": 2.6739723682403564, "loss/hidden": 1.0390625, "loss/logits": 0.1651734709739685, "loss/reg": 0.0009642810910008848, "step": 3894 }, { "epoch": 0.486875, "grad_norm": 2.7128381729125977, "grad_norm_var": 1.603682676196905, "learning_rate": 0.0001, "loss": 1.2623, "loss/crossentropy": 2.6381113529205322, "loss/hidden": 1.046875, "loss/logits": 0.20577171444892883, "loss/reg": 0.0009637601906433702, "step": 3895 }, { "epoch": 0.487, "grad_norm": 2.382399797439575, "grad_norm_var": 1.6173321535190628, "learning_rate": 0.0001, "loss": 1.054, "loss/crossentropy": 2.8548593521118164, "loss/hidden": 0.89453125, "loss/logits": 0.14981484413146973, "loss/reg": 0.0009632391156628728, "step": 3896 }, { "epoch": 0.487125, "grad_norm": 3.4758574962615967, "grad_norm_var": 1.6170100778082315, "learning_rate": 0.0001, "loss": 1.181, "loss/crossentropy": 2.9648144245147705, "loss/hidden": 0.9921875, "loss/logits": 0.17914965748786926, "loss/reg": 0.0009626782848499715, "step": 3897 }, { "epoch": 0.48725, "grad_norm": 2.64072847366333, "grad_norm_var": 1.6206988117745877, "learning_rate": 0.0001, "loss": 1.3307, "loss/crossentropy": 2.5237345695495605, "loss/hidden": 1.125, "loss/logits": 0.1960829198360443, "loss/reg": 0.0009621023200452328, "step": 3898 }, { "epoch": 0.487375, "grad_norm": 2.314140558242798, "grad_norm_var": 1.6465708178866398, "learning_rate": 0.0001, "loss": 1.2053, "loss/crossentropy": 2.739262819290161, "loss/hidden": 1.0234375, "loss/logits": 0.1722038984298706, "loss/reg": 0.0009614967275410891, "step": 3899 }, { "epoch": 0.4875, "grad_norm": 2.944770097732544, "grad_norm_var": 1.6255409352644599, "learning_rate": 0.0001, "loss": 1.3291, "loss/crossentropy": 2.3794772624969482, "loss/hidden": 1.1015625, "loss/logits": 0.21794244647026062, "loss/reg": 0.0009607769316062331, "step": 3900 }, { "epoch": 0.487625, "grad_norm": 3.114232063293457, "grad_norm_var": 1.6111376159289645, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.614741563796997, "loss/hidden": 1.125, "loss/logits": 0.19943472743034363, "loss/reg": 0.0009602527716197073, "step": 3901 }, { "epoch": 0.48775, "grad_norm": 2.704216718673706, "grad_norm_var": 1.5620252022124599, "learning_rate": 0.0001, "loss": 1.1592, "loss/crossentropy": 2.404256820678711, "loss/hidden": 0.9765625, "loss/logits": 0.1730658859014511, "loss/reg": 0.0009597327443771064, "step": 3902 }, { "epoch": 0.487875, "grad_norm": 2.2545430660247803, "grad_norm_var": 1.481703857331202, "learning_rate": 0.0001, "loss": 1.1446, "loss/crossentropy": 2.913935661315918, "loss/hidden": 0.9765625, "loss/logits": 0.15848129987716675, "loss/reg": 0.0009590654517523944, "step": 3903 }, { "epoch": 0.488, "grad_norm": 2.967205762863159, "grad_norm_var": 1.4813259699079042, "learning_rate": 0.0001, "loss": 1.1742, "loss/crossentropy": 2.765528678894043, "loss/hidden": 1.0, "loss/logits": 0.16461580991744995, "loss/reg": 0.000958542397711426, "step": 3904 }, { "epoch": 0.488125, "grad_norm": 2.6479382514953613, "grad_norm_var": 1.4919753997504601, "learning_rate": 0.0001, "loss": 1.1728, "loss/crossentropy": 2.6421971321105957, "loss/hidden": 1.0, "loss/logits": 0.16317713260650635, "loss/reg": 0.0009580080513842404, "step": 3905 }, { "epoch": 0.48825, "grad_norm": 2.1999807357788086, "grad_norm_var": 0.134223632596242, "learning_rate": 0.0001, "loss": 1.2619, "loss/crossentropy": 2.246816635131836, "loss/hidden": 1.0703125, "loss/logits": 0.18201866745948792, "loss/reg": 0.0009574835421517491, "step": 3906 }, { "epoch": 0.488375, "grad_norm": 3.3809518814086914, "grad_norm_var": 0.14854556434714042, "learning_rate": 0.0001, "loss": 1.3093, "loss/crossentropy": 2.67181134223938, "loss/hidden": 1.078125, "loss/logits": 0.221635639667511, "loss/reg": 0.0009568546083755791, "step": 3907 }, { "epoch": 0.4885, "grad_norm": 2.0859053134918213, "grad_norm_var": 0.17871792409936518, "learning_rate": 0.0001, "loss": 1.0731, "loss/crossentropy": 2.5637154579162598, "loss/hidden": 0.91796875, "loss/logits": 0.14560778439044952, "loss/reg": 0.0009563195635564625, "step": 3908 }, { "epoch": 0.488625, "grad_norm": 3.300407648086548, "grad_norm_var": 0.1982655431839845, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.135890245437622, "loss/hidden": 1.2578125, "loss/logits": 0.22417889535427094, "loss/reg": 0.0009557988960295916, "step": 3909 }, { "epoch": 0.48875, "grad_norm": 2.711329698562622, "grad_norm_var": 0.18569989641920587, "learning_rate": 0.0001, "loss": 1.1081, "loss/crossentropy": 2.6608126163482666, "loss/hidden": 0.96484375, "loss/logits": 0.13368837535381317, "loss/reg": 0.0009551756666041911, "step": 3910 }, { "epoch": 0.488875, "grad_norm": 2.3675761222839355, "grad_norm_var": 0.19439330813992609, "learning_rate": 0.0001, "loss": 1.1084, "loss/crossentropy": 2.3836143016815186, "loss/hidden": 0.94921875, "loss/logits": 0.14966964721679688, "loss/reg": 0.0009546536603011191, "step": 3911 }, { "epoch": 0.489, "grad_norm": 2.262638807296753, "grad_norm_var": 0.20065281025498033, "learning_rate": 0.0001, "loss": 1.1201, "loss/crossentropy": 2.3407623767852783, "loss/hidden": 0.9609375, "loss/logits": 0.14966320991516113, "loss/reg": 0.0009541367762722075, "step": 3912 }, { "epoch": 0.489125, "grad_norm": 2.339641571044922, "grad_norm_var": 0.16543316725715324, "learning_rate": 0.0001, "loss": 1.2529, "loss/crossentropy": 2.575348377227783, "loss/hidden": 1.078125, "loss/logits": 0.16526679694652557, "loss/reg": 0.0009535907302051783, "step": 3913 }, { "epoch": 0.48925, "grad_norm": 2.8182976245880127, "grad_norm_var": 0.16742670273247975, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.4738118648529053, "loss/hidden": 1.046875, "loss/logits": 0.1772567629814148, "loss/reg": 0.0009530348470434546, "step": 3914 }, { "epoch": 0.489375, "grad_norm": 2.339776039123535, "grad_norm_var": 0.1663168443073078, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.608335018157959, "loss/hidden": 0.921875, "loss/logits": 0.15480951964855194, "loss/reg": 0.0009524454362690449, "step": 3915 }, { "epoch": 0.4895, "grad_norm": 2.5275776386260986, "grad_norm_var": 0.160935177290821, "learning_rate": 0.0001, "loss": 1.1894, "loss/crossentropy": 2.5267724990844727, "loss/hidden": 1.015625, "loss/logits": 0.1642695814371109, "loss/reg": 0.0009518436272628605, "step": 3916 }, { "epoch": 0.489625, "grad_norm": 3.2591397762298584, "grad_norm_var": 0.1716732034036833, "learning_rate": 0.0001, "loss": 1.4189, "loss/crossentropy": 2.6079602241516113, "loss/hidden": 1.1796875, "loss/logits": 0.22966748476028442, "loss/reg": 0.000951232563238591, "step": 3917 }, { "epoch": 0.48975, "grad_norm": 2.387899160385132, "grad_norm_var": 0.17502627718539923, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.453495502471924, "loss/hidden": 0.921875, "loss/logits": 0.1595821976661682, "loss/reg": 0.0009506031055934727, "step": 3918 }, { "epoch": 0.489875, "grad_norm": 2.4388084411621094, "grad_norm_var": 0.16827582307402408, "learning_rate": 0.0001, "loss": 1.2675, "loss/crossentropy": 2.7695274353027344, "loss/hidden": 1.0546875, "loss/logits": 0.203264981508255, "loss/reg": 0.00095008296193555, "step": 3919 }, { "epoch": 0.49, "grad_norm": 2.580596685409546, "grad_norm_var": 0.1600905045883695, "learning_rate": 0.0001, "loss": 1.2695, "loss/crossentropy": 2.429319143295288, "loss/hidden": 1.0859375, "loss/logits": 0.17407003045082092, "loss/reg": 0.0009495636913925409, "step": 3920 }, { "epoch": 0.490125, "grad_norm": 3.3120410442352295, "grad_norm_var": 0.19163161057118241, "learning_rate": 0.0001, "loss": 1.3917, "loss/crossentropy": 2.457343816757202, "loss/hidden": 1.1875, "loss/logits": 0.19475361704826355, "loss/reg": 0.0009490433731116354, "step": 3921 }, { "epoch": 0.49025, "grad_norm": 2.112302303314209, "grad_norm_var": 0.19730912857784613, "learning_rate": 0.0001, "loss": 1.2138, "loss/crossentropy": 2.2379236221313477, "loss/hidden": 1.046875, "loss/logits": 0.15741994976997375, "loss/reg": 0.0009485049522481859, "step": 3922 }, { "epoch": 0.490375, "grad_norm": 2.802255630493164, "grad_norm_var": 0.16099536753709268, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.4165165424346924, "loss/hidden": 0.9296875, "loss/logits": 0.15012049674987793, "loss/reg": 0.0009479847503826022, "step": 3923 }, { "epoch": 0.4905, "grad_norm": 3.961231231689453, "grad_norm_var": 0.25153041278530824, "learning_rate": 0.0001, "loss": 1.1748, "loss/crossentropy": 2.6573047637939453, "loss/hidden": 1.015625, "loss/logits": 0.14971232414245605, "loss/reg": 0.0009474417893216014, "step": 3924 }, { "epoch": 0.490625, "grad_norm": 3.130035877227783, "grad_norm_var": 0.24016205160036938, "learning_rate": 0.0001, "loss": 1.2893, "loss/crossentropy": 2.245806932449341, "loss/hidden": 1.1015625, "loss/logits": 0.1782923936843872, "loss/reg": 0.000946921412833035, "step": 3925 }, { "epoch": 0.49075, "grad_norm": 4.162782192230225, "grad_norm_var": 0.3721961035343021, "learning_rate": 0.0001, "loss": 1.0629, "loss/crossentropy": 2.709290027618408, "loss/hidden": 0.90625, "loss/logits": 0.14717675745487213, "loss/reg": 0.000946375890634954, "step": 3926 }, { "epoch": 0.490875, "grad_norm": 2.816418409347534, "grad_norm_var": 0.35889890752315806, "learning_rate": 0.0001, "loss": 1.1183, "loss/crossentropy": 2.581409454345703, "loss/hidden": 0.94921875, "loss/logits": 0.15965479612350464, "loss/reg": 0.0009458541753701866, "step": 3927 }, { "epoch": 0.491, "grad_norm": 2.2804386615753174, "grad_norm_var": 0.3575764194963199, "learning_rate": 0.0001, "loss": 1.1146, "loss/crossentropy": 2.5990467071533203, "loss/hidden": 0.953125, "loss/logits": 0.1519862711429596, "loss/reg": 0.0009453533566556871, "step": 3928 }, { "epoch": 0.491125, "grad_norm": 2.730888605117798, "grad_norm_var": 0.34159846316781356, "learning_rate": 0.0001, "loss": 1.2424, "loss/crossentropy": 2.507784366607666, "loss/hidden": 1.046875, "loss/logits": 0.18604770302772522, "loss/reg": 0.0009447988704778254, "step": 3929 }, { "epoch": 0.49125, "grad_norm": 2.6504275798797607, "grad_norm_var": 0.3441539385940755, "learning_rate": 0.0001, "loss": 1.2027, "loss/crossentropy": 2.782844066619873, "loss/hidden": 1.0234375, "loss/logits": 0.16985847055912018, "loss/reg": 0.0009442323935218155, "step": 3930 }, { "epoch": 0.491375, "grad_norm": 3.606224775314331, "grad_norm_var": 0.35937414980921006, "learning_rate": 0.0001, "loss": 1.3568, "loss/crossentropy": 2.550386905670166, "loss/hidden": 1.140625, "loss/logits": 0.20672717690467834, "loss/reg": 0.000943707418628037, "step": 3931 }, { "epoch": 0.4915, "grad_norm": 4.1703619956970215, "grad_norm_var": 0.44155521600365333, "learning_rate": 0.0001, "loss": 1.3387, "loss/crossentropy": 2.7779364585876465, "loss/hidden": 1.125, "loss/logits": 0.20424896478652954, "loss/reg": 0.0009431518265046179, "step": 3932 }, { "epoch": 0.491625, "grad_norm": 3.0568296909332275, "grad_norm_var": 0.4378005795285027, "learning_rate": 0.0001, "loss": 1.3634, "loss/crossentropy": 2.7644073963165283, "loss/hidden": 1.1328125, "loss/logits": 0.2211238145828247, "loss/reg": 0.0009426433243788779, "step": 3933 }, { "epoch": 0.49175, "grad_norm": 2.3914036750793457, "grad_norm_var": 0.43750950412116557, "learning_rate": 0.0001, "loss": 1.457, "loss/crossentropy": 2.259760618209839, "loss/hidden": 1.2109375, "loss/logits": 0.23668238520622253, "loss/reg": 0.0009421485592611134, "step": 3934 }, { "epoch": 0.491875, "grad_norm": 3.7608489990234375, "grad_norm_var": 0.4455870886835328, "learning_rate": 0.0001, "loss": 1.7868, "loss/crossentropy": 2.4067938327789307, "loss/hidden": 1.359375, "loss/logits": 0.41802871227264404, "loss/reg": 0.0009416679968126118, "step": 3935 }, { "epoch": 0.492, "grad_norm": 2.3460745811462402, "grad_norm_var": 0.46511976278705236, "learning_rate": 0.0001, "loss": 1.2107, "loss/crossentropy": 2.4864561557769775, "loss/hidden": 1.0234375, "loss/logits": 0.17789408564567566, "loss/reg": 0.0009411853388883173, "step": 3936 }, { "epoch": 0.492125, "grad_norm": 2.99808406829834, "grad_norm_var": 0.46159451222495135, "learning_rate": 0.0001, "loss": 1.2502, "loss/crossentropy": 2.465390205383301, "loss/hidden": 1.0390625, "loss/logits": 0.20176857709884644, "loss/reg": 0.0009406855679117143, "step": 3937 }, { "epoch": 0.49225, "grad_norm": 2.6489341259002686, "grad_norm_var": 0.4117099659103149, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.5847280025482178, "loss/hidden": 1.125, "loss/logits": 0.2423875331878662, "loss/reg": 0.0009402011637575924, "step": 3938 }, { "epoch": 0.492375, "grad_norm": 3.1172893047332764, "grad_norm_var": 0.40563402312974783, "learning_rate": 0.0001, "loss": 1.4902, "loss/crossentropy": 2.5343353748321533, "loss/hidden": 1.2265625, "loss/logits": 0.25425875186920166, "loss/reg": 0.0009397356770932674, "step": 3939 }, { "epoch": 0.4925, "grad_norm": 2.5692596435546875, "grad_norm_var": 0.3695397471938577, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.312546968460083, "loss/hidden": 1.0625, "loss/logits": 0.21671564877033234, "loss/reg": 0.0009392163483425975, "step": 3940 }, { "epoch": 0.492625, "grad_norm": 3.4674227237701416, "grad_norm_var": 0.381277079078658, "learning_rate": 0.0001, "loss": 1.2129, "loss/crossentropy": 2.3780457973480225, "loss/hidden": 1.03125, "loss/logits": 0.17226016521453857, "loss/reg": 0.0009386827587150037, "step": 3941 }, { "epoch": 0.49275, "grad_norm": 3.6569018363952637, "grad_norm_var": 0.3221028906131245, "learning_rate": 0.0001, "loss": 1.5038, "loss/crossentropy": 2.843830108642578, "loss/hidden": 1.234375, "loss/logits": 0.26007604598999023, "loss/reg": 0.0009381178533658385, "step": 3942 }, { "epoch": 0.492875, "grad_norm": 3.5410196781158447, "grad_norm_var": 0.33556474667861735, "learning_rate": 0.0001, "loss": 1.3489, "loss/crossentropy": 2.6468727588653564, "loss/hidden": 1.125, "loss/logits": 0.21456359326839447, "loss/reg": 0.0009375586523674428, "step": 3943 }, { "epoch": 0.493, "grad_norm": 2.8974926471710205, "grad_norm_var": 0.2950577931689374, "learning_rate": 0.0001, "loss": 1.231, "loss/crossentropy": 2.5728209018707275, "loss/hidden": 1.0390625, "loss/logits": 0.18251976370811462, "loss/reg": 0.0009370028274133801, "step": 3944 }, { "epoch": 0.493125, "grad_norm": 3.554603338241577, "grad_norm_var": 0.2968604533939713, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.8205726146698, "loss/hidden": 1.1328125, "loss/logits": 0.22114431858062744, "loss/reg": 0.0009364415309391916, "step": 3945 }, { "epoch": 0.49325, "grad_norm": 2.4053094387054443, "grad_norm_var": 0.31701064234220716, "learning_rate": 0.0001, "loss": 1.3124, "loss/crossentropy": 2.3353934288024902, "loss/hidden": 1.1015625, "loss/logits": 0.20149603486061096, "loss/reg": 0.0009358717361465096, "step": 3946 }, { "epoch": 0.493375, "grad_norm": 2.5613019466400146, "grad_norm_var": 0.3198439970224984, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.6121621131896973, "loss/hidden": 1.1875, "loss/logits": 0.22521382570266724, "loss/reg": 0.0009352933848276734, "step": 3947 }, { "epoch": 0.4935, "grad_norm": 4.071358680725098, "grad_norm_var": 0.3059504250831118, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.517249822616577, "loss/hidden": 1.15625, "loss/logits": 0.20596405863761902, "loss/reg": 0.0009346426231786609, "step": 3948 }, { "epoch": 0.493625, "grad_norm": 3.3324694633483887, "grad_norm_var": 0.3103892337313624, "learning_rate": 0.0001, "loss": 1.2245, "loss/crossentropy": 2.511589288711548, "loss/hidden": 1.046875, "loss/logits": 0.16827695071697235, "loss/reg": 0.0009339956450276077, "step": 3949 }, { "epoch": 0.49375, "grad_norm": 2.819887638092041, "grad_norm_var": 0.28238178788073187, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.520967960357666, "loss/hidden": 1.09375, "loss/logits": 0.2109559327363968, "loss/reg": 0.0009334888309240341, "step": 3950 }, { "epoch": 0.493875, "grad_norm": 2.9559245109558105, "grad_norm_var": 0.2529457516050684, "learning_rate": 0.0001, "loss": 1.4024, "loss/crossentropy": 2.5591156482696533, "loss/hidden": 1.1875, "loss/logits": 0.2055281400680542, "loss/reg": 0.0009328529122285545, "step": 3951 }, { "epoch": 0.494, "grad_norm": 2.619999408721924, "grad_norm_var": 0.23159855211520153, "learning_rate": 0.0001, "loss": 1.2908, "loss/crossentropy": 2.4011497497558594, "loss/hidden": 1.0859375, "loss/logits": 0.19557762145996094, "loss/reg": 0.0009323481936007738, "step": 3952 }, { "epoch": 0.494125, "grad_norm": 3.1778745651245117, "grad_norm_var": 0.23174914967786814, "learning_rate": 0.0001, "loss": 1.1535, "loss/crossentropy": 2.5008246898651123, "loss/hidden": 0.984375, "loss/logits": 0.15984448790550232, "loss/reg": 0.0009317274088971317, "step": 3953 }, { "epoch": 0.49425, "grad_norm": 2.529265880584717, "grad_norm_var": 0.239638891826331, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.5141210556030273, "loss/hidden": 0.96875, "loss/logits": 0.165601909160614, "loss/reg": 0.0009312194888480008, "step": 3954 }, { "epoch": 0.494375, "grad_norm": 2.1223270893096924, "grad_norm_var": 0.2965421775617015, "learning_rate": 0.0001, "loss": 1.0822, "loss/crossentropy": 2.413752555847168, "loss/hidden": 0.92578125, "loss/logits": 0.147147536277771, "loss/reg": 0.0009306025458499789, "step": 3955 }, { "epoch": 0.4945, "grad_norm": 2.5788283348083496, "grad_norm_var": 0.2959758307272123, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.535163402557373, "loss/hidden": 0.9765625, "loss/logits": 0.14331845939159393, "loss/reg": 0.0009299699449911714, "step": 3956 }, { "epoch": 0.494625, "grad_norm": 2.4149725437164307, "grad_norm_var": 0.3021730077725065, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.273625135421753, "loss/hidden": 1.1484375, "loss/logits": 0.19726082682609558, "loss/reg": 0.0009293005568906665, "step": 3957 }, { "epoch": 0.49475, "grad_norm": 2.7857167720794678, "grad_norm_var": 0.26778294542027276, "learning_rate": 0.0001, "loss": 1.1531, "loss/crossentropy": 2.4891393184661865, "loss/hidden": 0.98046875, "loss/logits": 0.16329695284366608, "loss/reg": 0.0009286124259233475, "step": 3958 }, { "epoch": 0.494875, "grad_norm": 2.6382551193237305, "grad_norm_var": 0.24132270012107307, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.4273056983947754, "loss/hidden": 1.1875, "loss/logits": 0.19711405038833618, "loss/reg": 0.0009279102669097483, "step": 3959 }, { "epoch": 0.495, "grad_norm": 2.085751533508301, "grad_norm_var": 0.27645596067400885, "learning_rate": 0.0001, "loss": 1.0738, "loss/crossentropy": 2.4686269760131836, "loss/hidden": 0.91796875, "loss/logits": 0.14653117954730988, "loss/reg": 0.0009272327879443765, "step": 3960 }, { "epoch": 0.495125, "grad_norm": 3.4845197200775146, "grad_norm_var": 0.26962620695163175, "learning_rate": 0.0001, "loss": 1.3851, "loss/crossentropy": 2.4413247108459473, "loss/hidden": 1.15625, "loss/logits": 0.21958467364311218, "loss/reg": 0.0009265494882129133, "step": 3961 }, { "epoch": 0.49525, "grad_norm": 3.3621768951416016, "grad_norm_var": 0.2782196286235487, "learning_rate": 0.0001, "loss": 1.3792, "loss/crossentropy": 2.4991581439971924, "loss/hidden": 1.171875, "loss/logits": 0.19809837639331818, "loss/reg": 0.0009259862708859146, "step": 3962 }, { "epoch": 0.495375, "grad_norm": 2.616913080215454, "grad_norm_var": 0.2762997861028564, "learning_rate": 0.0001, "loss": 1.1833, "loss/crossentropy": 2.6797566413879395, "loss/hidden": 0.9921875, "loss/logits": 0.18182505667209625, "loss/reg": 0.0009253900498151779, "step": 3963 }, { "epoch": 0.4955, "grad_norm": 2.9457693099975586, "grad_norm_var": 0.1721492138460687, "learning_rate": 0.0001, "loss": 1.3131, "loss/crossentropy": 2.3894546031951904, "loss/hidden": 1.125, "loss/logits": 0.17889292538166046, "loss/reg": 0.0009247453999705613, "step": 3964 }, { "epoch": 0.495625, "grad_norm": 3.6163699626922607, "grad_norm_var": 0.19812164589540374, "learning_rate": 0.0001, "loss": 1.4383, "loss/crossentropy": 2.490079402923584, "loss/hidden": 1.171875, "loss/logits": 0.25718820095062256, "loss/reg": 0.000924249179661274, "step": 3965 }, { "epoch": 0.49575, "grad_norm": 2.4922850131988525, "grad_norm_var": 0.20383659135350715, "learning_rate": 0.0001, "loss": 1.2859, "loss/crossentropy": 2.5071754455566406, "loss/hidden": 1.0703125, "loss/logits": 0.20636126399040222, "loss/reg": 0.0009236352634616196, "step": 3966 }, { "epoch": 0.495875, "grad_norm": 2.3302204608917236, "grad_norm_var": 0.21335218351976362, "learning_rate": 0.0001, "loss": 1.1815, "loss/crossentropy": 2.4088146686553955, "loss/hidden": 1.0078125, "loss/logits": 0.1644718050956726, "loss/reg": 0.0009230885189026594, "step": 3967 }, { "epoch": 0.496, "grad_norm": 2.4410042762756348, "grad_norm_var": 0.21816076639898938, "learning_rate": 0.0001, "loss": 1.1691, "loss/crossentropy": 2.6988162994384766, "loss/hidden": 0.984375, "loss/logits": 0.17546086013317108, "loss/reg": 0.0009224659879691899, "step": 3968 }, { "epoch": 0.496125, "grad_norm": 2.248182535171509, "grad_norm_var": 0.21621575568570178, "learning_rate": 0.0001, "loss": 1.1367, "loss/crossentropy": 2.6623764038085938, "loss/hidden": 0.97265625, "loss/logits": 0.1548055112361908, "loss/reg": 0.0009219665080308914, "step": 3969 }, { "epoch": 0.49625, "grad_norm": 2.7014756202697754, "grad_norm_var": 0.2148772104368028, "learning_rate": 0.0001, "loss": 1.3078, "loss/crossentropy": 2.4977524280548096, "loss/hidden": 1.09375, "loss/logits": 0.20484310388565063, "loss/reg": 0.0009213967132382095, "step": 3970 }, { "epoch": 0.496375, "grad_norm": 2.7408030033111572, "grad_norm_var": 0.19287510769712882, "learning_rate": 0.0001, "loss": 1.3037, "loss/crossentropy": 2.4404594898223877, "loss/hidden": 1.09375, "loss/logits": 0.20076236128807068, "loss/reg": 0.0009208987466990948, "step": 3971 }, { "epoch": 0.4965, "grad_norm": 2.5678012371063232, "grad_norm_var": 0.19308689175158236, "learning_rate": 0.0001, "loss": 1.3663, "loss/crossentropy": 2.817671775817871, "loss/hidden": 1.125, "loss/logits": 0.23211173713207245, "loss/reg": 0.0009204022353515029, "step": 3972 }, { "epoch": 0.496625, "grad_norm": 2.295489549636841, "grad_norm_var": 0.1987909888192225, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.333028793334961, "loss/hidden": 1.140625, "loss/logits": 0.2263929843902588, "loss/reg": 0.000919912476092577, "step": 3973 }, { "epoch": 0.49675, "grad_norm": 2.372567653656006, "grad_norm_var": 0.20526325938994863, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.3132567405700684, "loss/hidden": 1.0078125, "loss/logits": 0.1656796932220459, "loss/reg": 0.0009194191661663353, "step": 3974 }, { "epoch": 0.496875, "grad_norm": 2.997095823287964, "grad_norm_var": 0.21113569414420302, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.745558261871338, "loss/hidden": 1.0703125, "loss/logits": 0.19558003544807434, "loss/reg": 0.0009188775438815355, "step": 3975 }, { "epoch": 0.497, "grad_norm": 3.956340789794922, "grad_norm_var": 0.2750945434784209, "learning_rate": 0.0001, "loss": 1.3822, "loss/crossentropy": 2.643021821975708, "loss/hidden": 1.171875, "loss/logits": 0.20116034150123596, "loss/reg": 0.0009183184010908008, "step": 3976 }, { "epoch": 0.497125, "grad_norm": 3.0044095516204834, "grad_norm_var": 0.24715823576654733, "learning_rate": 0.0001, "loss": 1.274, "loss/crossentropy": 2.4596290588378906, "loss/hidden": 1.078125, "loss/logits": 0.18671706318855286, "loss/reg": 0.0009177580359391868, "step": 3977 }, { "epoch": 0.49725, "grad_norm": 3.2114431858062744, "grad_norm_var": 0.23714019365725653, "learning_rate": 0.0001, "loss": 1.2392, "loss/crossentropy": 2.749554395675659, "loss/hidden": 1.046875, "loss/logits": 0.18318936228752136, "loss/reg": 0.0009172065183520317, "step": 3978 }, { "epoch": 0.497375, "grad_norm": 2.7675909996032715, "grad_norm_var": 0.23520966122532533, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 2.49666428565979, "loss/hidden": 1.140625, "loss/logits": 0.1938786804676056, "loss/reg": 0.0009167236858047545, "step": 3979 }, { "epoch": 0.4975, "grad_norm": 2.306425094604492, "grad_norm_var": 0.24773879192975887, "learning_rate": 0.0001, "loss": 1.1854, "loss/crossentropy": 2.6701619625091553, "loss/hidden": 0.98046875, "loss/logits": 0.19574448466300964, "loss/reg": 0.0009162958594970405, "step": 3980 }, { "epoch": 0.497625, "grad_norm": 2.644320249557495, "grad_norm_var": 0.1949075514427013, "learning_rate": 0.0001, "loss": 1.2231, "loss/crossentropy": 2.5645034313201904, "loss/hidden": 1.03125, "loss/logits": 0.18272888660430908, "loss/reg": 0.0009157858439721167, "step": 3981 }, { "epoch": 0.49775, "grad_norm": 2.77358078956604, "grad_norm_var": 0.1923496902469187, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.5585198402404785, "loss/hidden": 1.03125, "loss/logits": 0.17292793095111847, "loss/reg": 0.0009153065038844943, "step": 3982 }, { "epoch": 0.497875, "grad_norm": 3.532674789428711, "grad_norm_var": 0.22184172659931367, "learning_rate": 0.0001, "loss": 1.4724, "loss/crossentropy": 2.4798924922943115, "loss/hidden": 1.234375, "loss/logits": 0.22889506816864014, "loss/reg": 0.0009147940436378121, "step": 3983 }, { "epoch": 0.498, "grad_norm": 3.191368341445923, "grad_norm_var": 0.22260830953669786, "learning_rate": 0.0001, "loss": 1.069, "loss/crossentropy": 2.729538917541504, "loss/hidden": 0.89453125, "loss/logits": 0.16535162925720215, "loss/reg": 0.0009142736089415848, "step": 3984 }, { "epoch": 0.498125, "grad_norm": 2.4506547451019287, "grad_norm_var": 0.20941031531902563, "learning_rate": 0.0001, "loss": 1.0822, "loss/crossentropy": 2.5737271308898926, "loss/hidden": 0.921875, "loss/logits": 0.15115487575531006, "loss/reg": 0.0009138032328337431, "step": 3985 }, { "epoch": 0.49825, "grad_norm": 2.331437587738037, "grad_norm_var": 0.2250312141283293, "learning_rate": 0.0001, "loss": 1.0958, "loss/crossentropy": 2.365680456161499, "loss/hidden": 0.93359375, "loss/logits": 0.15310853719711304, "loss/reg": 0.0009133449639193714, "step": 3986 }, { "epoch": 0.498375, "grad_norm": 9.505460739135742, "grad_norm_var": 3.0122831572457356, "learning_rate": 0.0001, "loss": 1.2027, "loss/crossentropy": 2.486189365386963, "loss/hidden": 1.0390625, "loss/logits": 0.15448379516601562, "loss/reg": 0.0009128485689871013, "step": 3987 }, { "epoch": 0.4985, "grad_norm": 2.5104315280914307, "grad_norm_var": 3.0176635343629297, "learning_rate": 0.0001, "loss": 1.287, "loss/crossentropy": 2.428913116455078, "loss/hidden": 1.09375, "loss/logits": 0.1841382384300232, "loss/reg": 0.0009123769123107195, "step": 3988 }, { "epoch": 0.498625, "grad_norm": 3.0809807777404785, "grad_norm_var": 2.9572312796384077, "learning_rate": 0.0001, "loss": 1.3544, "loss/crossentropy": 2.701385736465454, "loss/hidden": 1.140625, "loss/logits": 0.20466375350952148, "loss/reg": 0.0009118671878241003, "step": 3989 }, { "epoch": 0.49875, "grad_norm": 2.850457191467285, "grad_norm_var": 2.9130602337873435, "learning_rate": 0.0001, "loss": 1.2238, "loss/crossentropy": 2.445533037185669, "loss/hidden": 1.0625, "loss/logits": 0.15222039818763733, "loss/reg": 0.0009113879059441388, "step": 3990 }, { "epoch": 0.498875, "grad_norm": 2.8227479457855225, "grad_norm_var": 2.922458671287775, "learning_rate": 0.0001, "loss": 1.2017, "loss/crossentropy": 2.486616849899292, "loss/hidden": 1.015625, "loss/logits": 0.17691916227340698, "loss/reg": 0.0009108763770200312, "step": 3991 }, { "epoch": 0.499, "grad_norm": 5.427181243896484, "grad_norm_var": 3.1846657880286395, "learning_rate": 0.0001, "loss": 1.2809, "loss/crossentropy": 2.4797611236572266, "loss/hidden": 1.0703125, "loss/logits": 0.20146211981773376, "loss/reg": 0.0009104108903557062, "step": 3992 }, { "epoch": 0.499125, "grad_norm": 3.3936173915863037, "grad_norm_var": 3.1735683271538915, "learning_rate": 0.0001, "loss": 1.3763, "loss/crossentropy": 2.405306100845337, "loss/hidden": 1.140625, "loss/logits": 0.22662195563316345, "loss/reg": 0.0009099606540985405, "step": 3993 }, { "epoch": 0.49925, "grad_norm": 2.770366668701172, "grad_norm_var": 3.198288297011017, "learning_rate": 0.0001, "loss": 1.3112, "loss/crossentropy": 2.582857131958008, "loss/hidden": 1.109375, "loss/logits": 0.1927052140235901, "loss/reg": 0.0009094626293517649, "step": 3994 }, { "epoch": 0.499375, "grad_norm": 2.7712109088897705, "grad_norm_var": 3.197985108770562, "learning_rate": 0.0001, "loss": 1.2464, "loss/crossentropy": 2.641078233718872, "loss/hidden": 1.0390625, "loss/logits": 0.198219895362854, "loss/reg": 0.0009089934173971415, "step": 3995 }, { "epoch": 0.4995, "grad_norm": 3.0732524394989014, "grad_norm_var": 3.1231625095959767, "learning_rate": 0.0001, "loss": 1.0941, "loss/crossentropy": 2.573498487472534, "loss/hidden": 0.9375, "loss/logits": 0.14748027920722961, "loss/reg": 0.0009085566271096468, "step": 3996 }, { "epoch": 0.499625, "grad_norm": 4.121737003326416, "grad_norm_var": 3.101740044336042, "learning_rate": 0.0001, "loss": 1.4662, "loss/crossentropy": 2.3963074684143066, "loss/hidden": 1.25, "loss/logits": 0.20708328485488892, "loss/reg": 0.0009081854368560016, "step": 3997 }, { "epoch": 0.49975, "grad_norm": 2.955850839614868, "grad_norm_var": 3.0852402879182352, "learning_rate": 0.0001, "loss": 1.4347, "loss/crossentropy": 2.679802179336548, "loss/hidden": 1.203125, "loss/logits": 0.22249548137187958, "loss/reg": 0.0009077699505724013, "step": 3998 }, { "epoch": 0.499875, "grad_norm": 3.4294350147247314, "grad_norm_var": 3.0861358343071417, "learning_rate": 0.0001, "loss": 1.4617, "loss/crossentropy": 2.4026973247528076, "loss/hidden": 1.21875, "loss/logits": 0.23383930325508118, "loss/reg": 0.000907301262486726, "step": 3999 }, { "epoch": 0.5, "grad_norm": 2.788649082183838, "grad_norm_var": 3.1151473651091957, "learning_rate": 0.0001, "loss": 1.2366, "loss/crossentropy": 2.63923716545105, "loss/hidden": 1.046875, "loss/logits": 0.18060708045959473, "loss/reg": 0.0009069106308743358, "step": 4000 }, { "epoch": 0.500125, "grad_norm": 3.012928009033203, "grad_norm_var": 3.0549094104988415, "learning_rate": 0.0001, "loss": 1.1944, "loss/crossentropy": 2.3369951248168945, "loss/hidden": 1.015625, "loss/logits": 0.1696736365556717, "loss/reg": 0.0009064265177585185, "step": 4001 }, { "epoch": 0.50025, "grad_norm": 2.6133642196655273, "grad_norm_var": 3.013963577081701, "learning_rate": 0.0001, "loss": 1.2054, "loss/crossentropy": 2.505615234375, "loss/hidden": 1.0234375, "loss/logits": 0.17292991280555725, "loss/reg": 0.0009059037547558546, "step": 4002 }, { "epoch": 0.500375, "grad_norm": 3.5540761947631836, "grad_norm_var": 0.5181355699753716, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.6081736087799072, "loss/hidden": 1.015625, "loss/logits": 0.16607603430747986, "loss/reg": 0.0009054000256583095, "step": 4003 }, { "epoch": 0.5005, "grad_norm": 3.123664379119873, "grad_norm_var": 0.4853780220928949, "learning_rate": 0.0001, "loss": 1.2633, "loss/crossentropy": 2.603935956954956, "loss/hidden": 1.0625, "loss/logits": 0.19175145030021667, "loss/reg": 0.000904905900824815, "step": 4004 }, { "epoch": 0.500625, "grad_norm": 2.9189834594726562, "grad_norm_var": 0.4903848283261055, "learning_rate": 0.0001, "loss": 1.0932, "loss/crossentropy": 2.947478771209717, "loss/hidden": 0.93359375, "loss/logits": 0.1505829095840454, "loss/reg": 0.0009043673635460436, "step": 4005 }, { "epoch": 0.50075, "grad_norm": 2.4921226501464844, "grad_norm_var": 0.5163871234475124, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.322331666946411, "loss/hidden": 0.984375, "loss/logits": 0.15529316663742065, "loss/reg": 0.0009038766729645431, "step": 4006 }, { "epoch": 0.500875, "grad_norm": 2.331488609313965, "grad_norm_var": 0.5564643276132091, "learning_rate": 0.0001, "loss": 1.2897, "loss/crossentropy": 2.502081871032715, "loss/hidden": 1.09375, "loss/logits": 0.18688435852527618, "loss/reg": 0.0009033869719132781, "step": 4007 }, { "epoch": 0.501, "grad_norm": 2.570848226547241, "grad_norm_var": 0.20812322986437928, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.5757386684417725, "loss/hidden": 0.9453125, "loss/logits": 0.14473742246627808, "loss/reg": 0.0009029440116137266, "step": 4008 }, { "epoch": 0.501125, "grad_norm": 2.770320415496826, "grad_norm_var": 0.19928510807453273, "learning_rate": 0.0001, "loss": 1.3468, "loss/crossentropy": 2.553703546524048, "loss/hidden": 1.1484375, "loss/logits": 0.18936949968338013, "loss/reg": 0.000902454077731818, "step": 4009 }, { "epoch": 0.50125, "grad_norm": 4.876682281494141, "grad_norm_var": 0.4243964601130109, "learning_rate": 0.0001, "loss": 1.5094, "loss/crossentropy": 2.296947717666626, "loss/hidden": 1.265625, "loss/logits": 0.23477548360824585, "loss/reg": 0.0009019606513902545, "step": 4010 }, { "epoch": 0.501375, "grad_norm": 7.858867168426514, "grad_norm_var": 1.8274102162596961, "learning_rate": 0.0001, "loss": 1.5997, "loss/crossentropy": 2.4177086353302, "loss/hidden": 1.375, "loss/logits": 0.21572838723659515, "loss/reg": 0.0009014529059641063, "step": 4011 }, { "epoch": 0.5015, "grad_norm": 3.1527204513549805, "grad_norm_var": 1.8242816792575218, "learning_rate": 0.0001, "loss": 1.3346, "loss/crossentropy": 2.8711116313934326, "loss/hidden": 1.125, "loss/logits": 0.20057313144207, "loss/reg": 0.0009009642526507378, "step": 4012 }, { "epoch": 0.501625, "grad_norm": 2.4070143699645996, "grad_norm_var": 1.8454923081312622, "learning_rate": 0.0001, "loss": 1.2447, "loss/crossentropy": 2.477879524230957, "loss/hidden": 1.03125, "loss/logits": 0.20441077649593353, "loss/reg": 0.000900513376109302, "step": 4013 }, { "epoch": 0.50175, "grad_norm": 2.2950596809387207, "grad_norm_var": 1.9034180079465615, "learning_rate": 0.0001, "loss": 1.0492, "loss/crossentropy": 2.639232635498047, "loss/hidden": 0.90625, "loss/logits": 0.13399088382720947, "loss/reg": 0.0009000430000014603, "step": 4014 }, { "epoch": 0.501875, "grad_norm": 2.471503734588623, "grad_norm_var": 1.9394182515172624, "learning_rate": 0.0001, "loss": 1.2833, "loss/crossentropy": 2.659464120864868, "loss/hidden": 1.0703125, "loss/logits": 0.204026460647583, "loss/reg": 0.0008995505049824715, "step": 4015 }, { "epoch": 0.502, "grad_norm": 3.5511374473571777, "grad_norm_var": 1.9336916787319844, "learning_rate": 0.0001, "loss": 1.2781, "loss/crossentropy": 2.4040091037750244, "loss/hidden": 1.0703125, "loss/logits": 0.19876572489738464, "loss/reg": 0.0008990928181447089, "step": 4016 }, { "epoch": 0.502125, "grad_norm": 2.464930772781372, "grad_norm_var": 1.96978603010281, "learning_rate": 0.0001, "loss": 1.1654, "loss/crossentropy": 2.7262563705444336, "loss/hidden": 0.98828125, "loss/logits": 0.16813211143016815, "loss/reg": 0.0008985809981822968, "step": 4017 }, { "epoch": 0.50225, "grad_norm": 2.2542638778686523, "grad_norm_var": 2.0066902009911844, "learning_rate": 0.0001, "loss": 1.0817, "loss/crossentropy": 2.646116018295288, "loss/hidden": 0.92578125, "loss/logits": 0.14690834283828735, "loss/reg": 0.0008980724960565567, "step": 4018 }, { "epoch": 0.502375, "grad_norm": 26.804153442382812, "grad_norm_var": 36.91030965065669, "learning_rate": 0.0001, "loss": 1.5054, "loss/crossentropy": 2.6154403686523438, "loss/hidden": 1.3046875, "loss/logits": 0.19177736341953278, "loss/reg": 0.0008975932141765952, "step": 4019 }, { "epoch": 0.5025, "grad_norm": 2.4565160274505615, "grad_norm_var": 37.07358722434458, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.3721554279327393, "loss/hidden": 0.953125, "loss/logits": 0.16439077258110046, "loss/reg": 0.0008971046190708876, "step": 4020 }, { "epoch": 0.502625, "grad_norm": 2.347928047180176, "grad_norm_var": 37.22232713831154, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.384906530380249, "loss/hidden": 1.03125, "loss/logits": 0.2017393559217453, "loss/reg": 0.0008966255700215697, "step": 4021 }, { "epoch": 0.50275, "grad_norm": 3.13645601272583, "grad_norm_var": 37.06983977076176, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.621926784515381, "loss/hidden": 1.140625, "loss/logits": 0.17791080474853516, "loss/reg": 0.0008961274870671332, "step": 4022 }, { "epoch": 0.502875, "grad_norm": 4.693602561950684, "grad_norm_var": 36.70114885676022, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.390718698501587, "loss/hidden": 1.3828125, "loss/logits": 0.260110467672348, "loss/reg": 0.000895633187610656, "step": 4023 }, { "epoch": 0.503, "grad_norm": 3.607083320617676, "grad_norm_var": 36.46621130739588, "learning_rate": 0.0001, "loss": 1.5384, "loss/crossentropy": 2.2287180423736572, "loss/hidden": 1.265625, "loss/logits": 0.2638140618801117, "loss/reg": 0.0008951633935794234, "step": 4024 }, { "epoch": 0.503125, "grad_norm": 2.530722141265869, "grad_norm_var": 36.535335609219395, "learning_rate": 0.0001, "loss": 1.2545, "loss/crossentropy": 2.8253777027130127, "loss/hidden": 1.0546875, "loss/logits": 0.19086262583732605, "loss/reg": 0.0008946073357947171, "step": 4025 }, { "epoch": 0.50325, "grad_norm": 2.7998905181884766, "grad_norm_var": 36.78554857220329, "learning_rate": 0.0001, "loss": 1.1999, "loss/crossentropy": 2.7338967323303223, "loss/hidden": 1.0234375, "loss/logits": 0.1675322949886322, "loss/reg": 0.0008941168780438602, "step": 4026 }, { "epoch": 0.503375, "grad_norm": 2.228652238845825, "grad_norm_var": 36.378136219326855, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.623159646987915, "loss/hidden": 0.96875, "loss/logits": 0.154735267162323, "loss/reg": 0.0008935604128055274, "step": 4027 }, { "epoch": 0.5035, "grad_norm": 2.854313373565674, "grad_norm_var": 36.43034791017477, "learning_rate": 0.0001, "loss": 1.4018, "loss/crossentropy": 2.6131975650787354, "loss/hidden": 1.171875, "loss/logits": 0.2209455370903015, "loss/reg": 0.0008930471376515925, "step": 4028 }, { "epoch": 0.503625, "grad_norm": 2.564293622970581, "grad_norm_var": 36.39206167948705, "learning_rate": 0.0001, "loss": 1.4116, "loss/crossentropy": 2.5936598777770996, "loss/hidden": 1.203125, "loss/logits": 0.19959582388401031, "loss/reg": 0.0008924933499656618, "step": 4029 }, { "epoch": 0.50375, "grad_norm": 2.259002923965454, "grad_norm_var": 36.40186009646483, "learning_rate": 0.0001, "loss": 1.158, "loss/crossentropy": 2.63423228263855, "loss/hidden": 0.9765625, "loss/logits": 0.1724802702665329, "loss/reg": 0.0008919332176446915, "step": 4030 }, { "epoch": 0.503875, "grad_norm": 2.350407838821411, "grad_norm_var": 36.432526228897984, "learning_rate": 0.0001, "loss": 1.2853, "loss/crossentropy": 2.402923345565796, "loss/hidden": 1.0625, "loss/logits": 0.21393223106861115, "loss/reg": 0.0008913541096262634, "step": 4031 }, { "epoch": 0.504, "grad_norm": 2.3894293308258057, "grad_norm_var": 36.633869277023216, "learning_rate": 0.0001, "loss": 1.1509, "loss/crossentropy": 2.2506000995635986, "loss/hidden": 0.98046875, "loss/logits": 0.1615297794342041, "loss/reg": 0.0008907398441806436, "step": 4032 }, { "epoch": 0.504125, "grad_norm": 3.571176528930664, "grad_norm_var": 36.4494404969733, "learning_rate": 0.0001, "loss": 1.4747, "loss/crossentropy": 2.4401330947875977, "loss/hidden": 1.203125, "loss/logits": 0.2626614272594452, "loss/reg": 0.0008902524714358151, "step": 4033 }, { "epoch": 0.50425, "grad_norm": 17.259300231933594, "grad_norm_var": 46.42255077574965, "learning_rate": 0.0001, "loss": 1.2411, "loss/crossentropy": 2.396841526031494, "loss/hidden": 1.046875, "loss/logits": 0.18534690141677856, "loss/reg": 0.0008897543884813786, "step": 4034 }, { "epoch": 0.504375, "grad_norm": 2.4564504623413086, "grad_norm_var": 13.47082515552317, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.5840563774108887, "loss/hidden": 1.0, "loss/logits": 0.1622200608253479, "loss/reg": 0.0008892692276276648, "step": 4035 }, { "epoch": 0.5045, "grad_norm": 2.894209384918213, "grad_norm_var": 13.409116707446818, "learning_rate": 0.0001, "loss": 1.3954, "loss/crossentropy": 2.644078493118286, "loss/hidden": 1.171875, "loss/logits": 0.214639350771904, "loss/reg": 0.0008887395379133523, "step": 4036 }, { "epoch": 0.504625, "grad_norm": 2.393033981323242, "grad_norm_var": 13.40083308711054, "learning_rate": 0.0001, "loss": 1.2836, "loss/crossentropy": 2.687074899673462, "loss/hidden": 1.0625, "loss/logits": 0.21224312484264374, "loss/reg": 0.0008882602560333908, "step": 4037 }, { "epoch": 0.50475, "grad_norm": 2.4648678302764893, "grad_norm_var": 13.483895335352349, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.6949753761291504, "loss/hidden": 0.96875, "loss/logits": 0.1641184389591217, "loss/reg": 0.0008877577492967248, "step": 4038 }, { "epoch": 0.504875, "grad_norm": 3.998504161834717, "grad_norm_var": 13.422680529140925, "learning_rate": 0.0001, "loss": 1.4874, "loss/crossentropy": 2.658353805541992, "loss/hidden": 1.25, "loss/logits": 0.22853124141693115, "loss/reg": 0.0008872859179973602, "step": 4039 }, { "epoch": 0.505, "grad_norm": 2.313119649887085, "grad_norm_var": 13.53711794607451, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.845168352127075, "loss/hidden": 0.9140625, "loss/logits": 0.14065852761268616, "loss/reg": 0.0008868041913956404, "step": 4040 }, { "epoch": 0.505125, "grad_norm": 2.3383028507232666, "grad_norm_var": 13.566428157804792, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.797106981277466, "loss/hidden": 0.9375, "loss/logits": 0.1461259424686432, "loss/reg": 0.000886338297277689, "step": 4041 }, { "epoch": 0.50525, "grad_norm": 2.462202310562134, "grad_norm_var": 13.608271576925508, "learning_rate": 0.0001, "loss": 1.4053, "loss/crossentropy": 2.552946090698242, "loss/hidden": 1.15625, "loss/logits": 0.24020430445671082, "loss/reg": 0.0008859168156050146, "step": 4042 }, { "epoch": 0.505375, "grad_norm": 3.39054536819458, "grad_norm_var": 13.487970784798927, "learning_rate": 0.0001, "loss": 1.1561, "loss/crossentropy": 2.195681095123291, "loss/hidden": 0.9921875, "loss/logits": 0.15503782033920288, "loss/reg": 0.0008854405023157597, "step": 4043 }, { "epoch": 0.5055, "grad_norm": 4.395425319671631, "grad_norm_var": 13.478572489818813, "learning_rate": 0.0001, "loss": 1.5692, "loss/crossentropy": 2.87327241897583, "loss/hidden": 1.28125, "loss/logits": 0.27913302183151245, "loss/reg": 0.0008850258309394121, "step": 4044 }, { "epoch": 0.505625, "grad_norm": 2.4465978145599365, "grad_norm_var": 13.497555148896966, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.4886960983276367, "loss/hidden": 1.046875, "loss/logits": 0.19368073344230652, "loss/reg": 0.0008845477132126689, "step": 4045 }, { "epoch": 0.50575, "grad_norm": 2.888556957244873, "grad_norm_var": 13.400410376542347, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.463549852371216, "loss/hidden": 1.078125, "loss/logits": 0.1919676810503006, "loss/reg": 0.0008841063245199621, "step": 4046 }, { "epoch": 0.505875, "grad_norm": 2.4396920204162598, "grad_norm_var": 13.384238055027291, "learning_rate": 0.0001, "loss": 1.1964, "loss/crossentropy": 2.580559253692627, "loss/hidden": 1.015625, "loss/logits": 0.17189925909042358, "loss/reg": 0.000883627450093627, "step": 4047 }, { "epoch": 0.506, "grad_norm": 5.477128982543945, "grad_norm_var": 13.417358843705031, "learning_rate": 0.0001, "loss": 1.4667, "loss/crossentropy": 2.7947592735290527, "loss/hidden": 1.1796875, "loss/logits": 0.2782207429409027, "loss/reg": 0.0008831738377921283, "step": 4048 }, { "epoch": 0.506125, "grad_norm": 2.7555930614471436, "grad_norm_var": 13.500053334915314, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.4812841415405273, "loss/hidden": 1.125, "loss/logits": 0.19742220640182495, "loss/reg": 0.000882769119925797, "step": 4049 }, { "epoch": 0.50625, "grad_norm": 12.633561134338379, "grad_norm_var": 6.596822723107645, "learning_rate": 0.0001, "loss": 1.9108, "loss/crossentropy": 2.617006778717041, "loss/hidden": 1.5078125, "loss/logits": 0.39420652389526367, "loss/reg": 0.0008822933305054903, "step": 4050 }, { "epoch": 0.506375, "grad_norm": 3.3102378845214844, "grad_norm_var": 6.5111510021243895, "learning_rate": 0.0001, "loss": 1.2933, "loss/crossentropy": 2.6001665592193604, "loss/hidden": 1.09375, "loss/logits": 0.19071626663208008, "loss/reg": 0.0008819265640340745, "step": 4051 }, { "epoch": 0.5065, "grad_norm": 2.880075693130493, "grad_norm_var": 6.512611510845001, "learning_rate": 0.0001, "loss": 1.2456, "loss/crossentropy": 2.6513631343841553, "loss/hidden": 1.0625, "loss/logits": 0.1743147075176239, "loss/reg": 0.0008814571774564683, "step": 4052 }, { "epoch": 0.506625, "grad_norm": 8.182936668395996, "grad_norm_var": 7.628391803752272, "learning_rate": 0.0001, "loss": 1.3917, "loss/crossentropy": 2.884211778640747, "loss/hidden": 1.15625, "loss/logits": 0.22666551172733307, "loss/reg": 0.0008810752187855542, "step": 4053 }, { "epoch": 0.50675, "grad_norm": 3.0996837615966797, "grad_norm_var": 7.521645690710995, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.3387274742126465, "loss/hidden": 1.15625, "loss/logits": 0.21354138851165771, "loss/reg": 0.0008807178237475455, "step": 4054 }, { "epoch": 0.506875, "grad_norm": 2.4221150875091553, "grad_norm_var": 7.6905691150568805, "learning_rate": 0.0001, "loss": 1.1721, "loss/crossentropy": 2.492856740951538, "loss/hidden": 0.98046875, "loss/logits": 0.18286532163619995, "loss/reg": 0.0008803503587841988, "step": 4055 }, { "epoch": 0.507, "grad_norm": 3.5094215869903564, "grad_norm_var": 7.516571029461166, "learning_rate": 0.0001, "loss": 1.5715, "loss/crossentropy": 2.3606693744659424, "loss/hidden": 1.34375, "loss/logits": 0.21897512674331665, "loss/reg": 0.0008799819042906165, "step": 4056 }, { "epoch": 0.507125, "grad_norm": 3.5844151973724365, "grad_norm_var": 7.330968947585093, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.3789451122283936, "loss/hidden": 1.0703125, "loss/logits": 0.15983514487743378, "loss/reg": 0.0008795357425697148, "step": 4057 }, { "epoch": 0.50725, "grad_norm": 2.54848051071167, "grad_norm_var": 7.312393347883824, "learning_rate": 0.0001, "loss": 1.3997, "loss/crossentropy": 2.513110637664795, "loss/hidden": 1.1796875, "loss/logits": 0.2112513780593872, "loss/reg": 0.0008790674619376659, "step": 4058 }, { "epoch": 0.507375, "grad_norm": 3.176051378250122, "grad_norm_var": 7.336210127916763, "learning_rate": 0.0001, "loss": 1.7632, "loss/crossentropy": 2.5535194873809814, "loss/hidden": 1.4375, "loss/logits": 0.316952645778656, "loss/reg": 0.0008786006947048008, "step": 4059 }, { "epoch": 0.5075, "grad_norm": 2.3008880615234375, "grad_norm_var": 7.530516812730161, "learning_rate": 0.0001, "loss": 1.1946, "loss/crossentropy": 2.5973331928253174, "loss/hidden": 1.0078125, "loss/logits": 0.17802280187606812, "loss/reg": 0.0008781946380622685, "step": 4060 }, { "epoch": 0.507625, "grad_norm": 2.194951057434082, "grad_norm_var": 7.585873272513731, "learning_rate": 0.0001, "loss": 1.2242, "loss/crossentropy": 2.2463650703430176, "loss/hidden": 1.03125, "loss/logits": 0.1841346174478531, "loss/reg": 0.0008777684415690601, "step": 4061 }, { "epoch": 0.50775, "grad_norm": 2.3383469581604004, "grad_norm_var": 7.683597229196326, "learning_rate": 0.0001, "loss": 1.1742, "loss/crossentropy": 2.353847026824951, "loss/hidden": 1.0, "loss/logits": 0.1654077023267746, "loss/reg": 0.0008773073786869645, "step": 4062 }, { "epoch": 0.507875, "grad_norm": 3.4078288078308105, "grad_norm_var": 7.5500146768540075, "learning_rate": 0.0001, "loss": 1.2691, "loss/crossentropy": 2.773912191390991, "loss/hidden": 1.0859375, "loss/logits": 0.1743870973587036, "loss/reg": 0.000876842939760536, "step": 4063 }, { "epoch": 0.508, "grad_norm": 2.6578433513641357, "grad_norm_var": 7.487339475007051, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.5249104499816895, "loss/hidden": 0.99609375, "loss/logits": 0.17623111605644226, "loss/reg": 0.0008763576042838395, "step": 4064 }, { "epoch": 0.508125, "grad_norm": 3.191814661026001, "grad_norm_var": 7.437750969657854, "learning_rate": 0.0001, "loss": 1.2235, "loss/crossentropy": 2.4428138732910156, "loss/hidden": 1.0546875, "loss/logits": 0.1600135862827301, "loss/reg": 0.0008758967160247266, "step": 4065 }, { "epoch": 0.50825, "grad_norm": 2.622131109237671, "grad_norm_var": 1.9637845922559858, "learning_rate": 0.0001, "loss": 1.2719, "loss/crossentropy": 2.5119926929473877, "loss/hidden": 1.0859375, "loss/logits": 0.17722192406654358, "loss/reg": 0.0008753961883485317, "step": 4066 }, { "epoch": 0.508375, "grad_norm": 3.305476665496826, "grad_norm_var": 1.963725042291647, "learning_rate": 0.0001, "loss": 1.6824, "loss/crossentropy": 2.4159562587738037, "loss/hidden": 1.3828125, "loss/logits": 0.2908284664154053, "loss/reg": 0.0008749006665311754, "step": 4067 }, { "epoch": 0.5085, "grad_norm": 3.1701061725616455, "grad_norm_var": 1.9560730210343498, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.6434249877929688, "loss/hidden": 1.140625, "loss/logits": 0.2216227650642395, "loss/reg": 0.0008743985090404749, "step": 4068 }, { "epoch": 0.508625, "grad_norm": 2.0637171268463135, "grad_norm_var": 0.2569519266413406, "learning_rate": 0.0001, "loss": 1.228, "loss/crossentropy": 2.555605173110962, "loss/hidden": 1.0546875, "loss/logits": 0.1645621955394745, "loss/reg": 0.0008738991455174983, "step": 4069 }, { "epoch": 0.50875, "grad_norm": 2.697035074234009, "grad_norm_var": 0.25365757743146083, "learning_rate": 0.0001, "loss": 1.3612, "loss/crossentropy": 2.549859046936035, "loss/hidden": 1.1484375, "loss/logits": 0.2040315717458725, "loss/reg": 0.0008733998402021825, "step": 4070 }, { "epoch": 0.508875, "grad_norm": 3.325472831726074, "grad_norm_var": 0.25620505888120965, "learning_rate": 0.0001, "loss": 1.3785, "loss/crossentropy": 2.493114948272705, "loss/hidden": 1.15625, "loss/logits": 0.21353749930858612, "loss/reg": 0.0008728534448891878, "step": 4071 }, { "epoch": 0.509, "grad_norm": 2.707178831100464, "grad_norm_var": 0.22919659266361972, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.27180814743042, "loss/hidden": 1.1484375, "loss/logits": 0.19845429062843323, "loss/reg": 0.000872294360306114, "step": 4072 }, { "epoch": 0.509125, "grad_norm": 2.4804210662841797, "grad_norm_var": 0.1944304431876835, "learning_rate": 0.0001, "loss": 1.3067, "loss/crossentropy": 2.7785990238189697, "loss/hidden": 1.1015625, "loss/logits": 0.1964125633239746, "loss/reg": 0.0008717176970094442, "step": 4073 }, { "epoch": 0.50925, "grad_norm": 2.7220990657806396, "grad_norm_var": 0.19137777131748307, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.2797794342041016, "loss/hidden": 1.0, "loss/logits": 0.15879331529140472, "loss/reg": 0.0008712350390851498, "step": 4074 }, { "epoch": 0.509375, "grad_norm": 2.352142095565796, "grad_norm_var": 0.18948181727728652, "learning_rate": 0.0001, "loss": 1.2585, "loss/crossentropy": 2.5584774017333984, "loss/hidden": 1.0625, "loss/logits": 0.18733683228492737, "loss/reg": 0.0008706508670002222, "step": 4075 }, { "epoch": 0.5095, "grad_norm": 2.662393808364868, "grad_norm_var": 0.1773956232064838, "learning_rate": 0.0001, "loss": 1.2553, "loss/crossentropy": 2.1634933948516846, "loss/hidden": 1.0703125, "loss/logits": 0.17623765766620636, "loss/reg": 0.0008700615726411343, "step": 4076 }, { "epoch": 0.509625, "grad_norm": 2.3494513034820557, "grad_norm_var": 0.16758358306007015, "learning_rate": 0.0001, "loss": 1.2266, "loss/crossentropy": 2.24137806892395, "loss/hidden": 1.046875, "loss/logits": 0.17100876569747925, "loss/reg": 0.000869490671902895, "step": 4077 }, { "epoch": 0.50975, "grad_norm": 2.9826624393463135, "grad_norm_var": 0.15787835835477135, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.6606571674346924, "loss/hidden": 1.0625, "loss/logits": 0.17674139142036438, "loss/reg": 0.0008689088863320649, "step": 4078 }, { "epoch": 0.509875, "grad_norm": 14.890130996704102, "grad_norm_var": 9.338433746666093, "learning_rate": 0.0001, "loss": 2.2856, "loss/crossentropy": 2.5678398609161377, "loss/hidden": 1.9765625, "loss/logits": 0.300337016582489, "loss/reg": 0.0008683408959768713, "step": 4079 }, { "epoch": 0.51, "grad_norm": 2.851184368133545, "grad_norm_var": 9.318770118827329, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.755657196044922, "loss/hidden": 1.0078125, "loss/logits": 0.1839064657688141, "loss/reg": 0.0008678102167323232, "step": 4080 }, { "epoch": 0.510125, "grad_norm": 2.8241658210754395, "grad_norm_var": 9.343469225431443, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.4032278060913086, "loss/hidden": 1.1171875, "loss/logits": 0.2052312195301056, "loss/reg": 0.0008673445554450154, "step": 4081 }, { "epoch": 0.51025, "grad_norm": 2.273699998855591, "grad_norm_var": 9.391857318174553, "learning_rate": 0.0001, "loss": 1.1705, "loss/crossentropy": 2.463425874710083, "loss/hidden": 0.9921875, "loss/logits": 0.16967608034610748, "loss/reg": 0.0008668684749864042, "step": 4082 }, { "epoch": 0.510375, "grad_norm": 2.6161773204803467, "grad_norm_var": 9.437462833950583, "learning_rate": 0.0001, "loss": 1.2608, "loss/crossentropy": 2.676311492919922, "loss/hidden": 1.0703125, "loss/logits": 0.18181532621383667, "loss/reg": 0.0008663350017741323, "step": 4083 }, { "epoch": 0.5105, "grad_norm": 5.414762496948242, "grad_norm_var": 9.672938185298229, "learning_rate": 0.0001, "loss": 1.3591, "loss/crossentropy": 2.717008113861084, "loss/hidden": 1.140625, "loss/logits": 0.20983606576919556, "loss/reg": 0.0008658412261866033, "step": 4084 }, { "epoch": 0.510625, "grad_norm": 3.014862537384033, "grad_norm_var": 9.537719945176981, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.5876879692077637, "loss/hidden": 1.0703125, "loss/logits": 0.2072124481201172, "loss/reg": 0.0008653398836031556, "step": 4085 }, { "epoch": 0.51075, "grad_norm": 2.7925806045532227, "grad_norm_var": 9.526338332781513, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.5644969940185547, "loss/hidden": 0.95703125, "loss/logits": 0.1467747986316681, "loss/reg": 0.0008648650837130845, "step": 4086 }, { "epoch": 0.510875, "grad_norm": 2.4847006797790527, "grad_norm_var": 9.60591469438732, "learning_rate": 0.0001, "loss": 1.1374, "loss/crossentropy": 2.6818811893463135, "loss/hidden": 0.96875, "loss/logits": 0.16000229120254517, "loss/reg": 0.0008643901092000306, "step": 4087 }, { "epoch": 0.511, "grad_norm": 2.896996259689331, "grad_norm_var": 9.585857127366626, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.540834426879883, "loss/hidden": 1.1328125, "loss/logits": 0.19754791259765625, "loss/reg": 0.0008638479048386216, "step": 4088 }, { "epoch": 0.511125, "grad_norm": 2.3030641078948975, "grad_norm_var": 9.614310904717073, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.249614953994751, "loss/hidden": 1.09375, "loss/logits": 0.18419770896434784, "loss/reg": 0.0008633530233055353, "step": 4089 }, { "epoch": 0.51125, "grad_norm": 2.442394733428955, "grad_norm_var": 9.651547176429258, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.2043707370758057, "loss/hidden": 1.09375, "loss/logits": 0.20043601095676422, "loss/reg": 0.0008628551731817424, "step": 4090 }, { "epoch": 0.511375, "grad_norm": 2.701174020767212, "grad_norm_var": 9.602393718345695, "learning_rate": 0.0001, "loss": 1.0042, "loss/crossentropy": 2.626922607421875, "loss/hidden": 0.8515625, "loss/logits": 0.14402994513511658, "loss/reg": 0.0008623241446912289, "step": 4091 }, { "epoch": 0.5115, "grad_norm": 2.5504417419433594, "grad_norm_var": 9.617079722019652, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.6993486881256104, "loss/hidden": 0.9609375, "loss/logits": 0.16754762828350067, "loss/reg": 0.0008617914863862097, "step": 4092 }, { "epoch": 0.511625, "grad_norm": 2.895575761795044, "grad_norm_var": 9.545622544505253, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.811892509460449, "loss/hidden": 1.09375, "loss/logits": 0.23447111248970032, "loss/reg": 0.0008612489909864962, "step": 4093 }, { "epoch": 0.51175, "grad_norm": 2.271010637283325, "grad_norm_var": 9.637836981726458, "learning_rate": 0.0001, "loss": 1.0882, "loss/crossentropy": 2.6015803813934326, "loss/hidden": 0.92578125, "loss/logits": 0.15380442142486572, "loss/reg": 0.0008607796626165509, "step": 4094 }, { "epoch": 0.511875, "grad_norm": 5.304785251617432, "grad_norm_var": 0.9208372313352676, "learning_rate": 0.0001, "loss": 1.7472, "loss/crossentropy": 2.7530605792999268, "loss/hidden": 1.4140625, "loss/logits": 0.32451796531677246, "loss/reg": 0.0008603142341598868, "step": 4095 }, { "epoch": 0.512, "grad_norm": 2.478935956954956, "grad_norm_var": 0.9357597070562548, "learning_rate": 0.0001, "loss": 1.2945, "loss/crossentropy": 2.4969334602355957, "loss/hidden": 1.09375, "loss/logits": 0.1921313852071762, "loss/reg": 0.0008598544518463314, "step": 4096 }, { "epoch": 0.512125, "grad_norm": 2.379392385482788, "grad_norm_var": 0.9558281645298997, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.495603322982788, "loss/hidden": 0.9453125, "loss/logits": 0.13854797184467316, "loss/reg": 0.0008594950777478516, "step": 4097 }, { "epoch": 0.51225, "grad_norm": 2.9487383365631104, "grad_norm_var": 0.9255720069159658, "learning_rate": 0.0001, "loss": 1.2128, "loss/crossentropy": 2.4034905433654785, "loss/hidden": 1.03125, "loss/logits": 0.1729859709739685, "loss/reg": 0.0008590333163738251, "step": 4098 }, { "epoch": 0.512375, "grad_norm": 2.272583484649658, "grad_norm_var": 0.9490901731196033, "learning_rate": 0.0001, "loss": 1.1776, "loss/crossentropy": 2.3888251781463623, "loss/hidden": 0.99609375, "loss/logits": 0.17296119034290314, "loss/reg": 0.0008585570612922311, "step": 4099 }, { "epoch": 0.5125, "grad_norm": 2.542635440826416, "grad_norm_var": 0.5196294385213719, "learning_rate": 0.0001, "loss": 1.0671, "loss/crossentropy": 2.5513696670532227, "loss/hidden": 0.91015625, "loss/logits": 0.14834527671337128, "loss/reg": 0.0008581826696172357, "step": 4100 }, { "epoch": 0.512625, "grad_norm": 3.4122154712677, "grad_norm_var": 0.542603311185265, "learning_rate": 0.0001, "loss": 1.4134, "loss/crossentropy": 2.857693672180176, "loss/hidden": 1.2109375, "loss/logits": 0.19390010833740234, "loss/reg": 0.0008577204425819218, "step": 4101 }, { "epoch": 0.51275, "grad_norm": 2.6020314693450928, "grad_norm_var": 0.5448661674615389, "learning_rate": 0.0001, "loss": 1.1718, "loss/crossentropy": 2.5483522415161133, "loss/hidden": 1.0078125, "loss/logits": 0.15541675686836243, "loss/reg": 0.0008572403457947075, "step": 4102 }, { "epoch": 0.512875, "grad_norm": 186.63307189941406, "grad_norm_var": 2112.698017195477, "learning_rate": 0.0001, "loss": 3.6132, "loss/crossentropy": 2.8560104370117188, "loss/hidden": 3.171875, "loss/logits": 0.4327121675014496, "loss/reg": 0.0008567576296627522, "step": 4103 }, { "epoch": 0.513, "grad_norm": 6.748610019683838, "grad_norm_var": 2107.7744992104012, "learning_rate": 0.0001, "loss": 1.3744, "loss/crossentropy": 2.2177462577819824, "loss/hidden": 1.15625, "loss/logits": 0.2096305787563324, "loss/reg": 0.0008562696748413146, "step": 4104 }, { "epoch": 0.513125, "grad_norm": 2.6403520107269287, "grad_norm_var": 2107.231724342475, "learning_rate": 0.0001, "loss": 1.4023, "loss/crossentropy": 2.6115429401397705, "loss/hidden": 1.171875, "loss/logits": 0.2218974232673645, "loss/reg": 0.0008557829423807561, "step": 4105 }, { "epoch": 0.51325, "grad_norm": 4.28101921081543, "grad_norm_var": 2104.474462565423, "learning_rate": 0.0001, "loss": 1.6994, "loss/crossentropy": 2.403536081314087, "loss/hidden": 1.359375, "loss/logits": 0.3314824402332306, "loss/reg": 0.0008552958024665713, "step": 4106 }, { "epoch": 0.513375, "grad_norm": 2.975334644317627, "grad_norm_var": 2104.0417740808657, "learning_rate": 0.0001, "loss": 1.3599, "loss/crossentropy": 2.5667903423309326, "loss/hidden": 1.1484375, "loss/logits": 0.2029472142457962, "loss/reg": 0.0008547983597964048, "step": 4107 }, { "epoch": 0.5135, "grad_norm": 4.053854465484619, "grad_norm_var": 2101.7508979472964, "learning_rate": 0.0001, "loss": 1.696, "loss/crossentropy": 2.670978546142578, "loss/hidden": 1.3984375, "loss/logits": 0.2890186309814453, "loss/reg": 0.0008543440490029752, "step": 4108 }, { "epoch": 0.513625, "grad_norm": 3.462285041809082, "grad_norm_var": 2100.8731568213343, "learning_rate": 0.0001, "loss": 1.5154, "loss/crossentropy": 2.4646379947662354, "loss/hidden": 1.2578125, "loss/logits": 0.24905765056610107, "loss/reg": 0.0008539036498405039, "step": 4109 }, { "epoch": 0.51375, "grad_norm": 3.1310813426971436, "grad_norm_var": 2099.4811312719835, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.816258668899536, "loss/hidden": 1.109375, "loss/logits": 0.2000163197517395, "loss/reg": 0.0008534331573173404, "step": 4110 }, { "epoch": 0.513875, "grad_norm": 2.870558738708496, "grad_norm_var": 2102.9549157449733, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.57021427154541, "loss/hidden": 1.0078125, "loss/logits": 0.1783149689435959, "loss/reg": 0.0008530030027031898, "step": 4111 }, { "epoch": 0.514, "grad_norm": 2.1539409160614014, "grad_norm_var": 2103.491718686466, "learning_rate": 0.0001, "loss": 1.2609, "loss/crossentropy": 2.4085190296173096, "loss/hidden": 1.0703125, "loss/logits": 0.182031512260437, "loss/reg": 0.0008525521261617541, "step": 4112 }, { "epoch": 0.514125, "grad_norm": 3.0799827575683594, "grad_norm_var": 2102.37204099836, "learning_rate": 0.0001, "loss": 1.2661, "loss/crossentropy": 2.5446224212646484, "loss/hidden": 1.0859375, "loss/logits": 0.17164835333824158, "loss/reg": 0.0008520822157151997, "step": 4113 }, { "epoch": 0.51425, "grad_norm": 2.3098292350769043, "grad_norm_var": 2103.4018575830496, "learning_rate": 0.0001, "loss": 1.2115, "loss/crossentropy": 2.5183582305908203, "loss/hidden": 1.0234375, "loss/logits": 0.1795724332332611, "loss/reg": 0.0008516732486896217, "step": 4114 }, { "epoch": 0.514375, "grad_norm": 2.0623586177825928, "grad_norm_var": 2103.7529063716834, "learning_rate": 0.0001, "loss": 1.212, "loss/crossentropy": 2.478053331375122, "loss/hidden": 1.0234375, "loss/logits": 0.18009813129901886, "loss/reg": 0.0008512000786140561, "step": 4115 }, { "epoch": 0.5145, "grad_norm": 2.490999460220337, "grad_norm_var": 2103.836670373057, "learning_rate": 0.0001, "loss": 1.2705, "loss/crossentropy": 2.7578365802764893, "loss/hidden": 1.0703125, "loss/logits": 0.19167481362819672, "loss/reg": 0.000850707059726119, "step": 4116 }, { "epoch": 0.514625, "grad_norm": 2.3632266521453857, "grad_norm_var": 2105.4816552488364, "learning_rate": 0.0001, "loss": 1.123, "loss/crossentropy": 2.402446985244751, "loss/hidden": 0.94921875, "loss/logits": 0.16524842381477356, "loss/reg": 0.0008502579294145107, "step": 4117 }, { "epoch": 0.51475, "grad_norm": 2.4099833965301514, "grad_norm_var": 2105.7915990625384, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.4723873138427734, "loss/hidden": 0.92578125, "loss/logits": 0.15944242477416992, "loss/reg": 0.0008498082170262933, "step": 4118 }, { "epoch": 0.514875, "grad_norm": 24.488849639892578, "grad_norm_var": 29.830752943475034, "learning_rate": 0.0001, "loss": 2.3724, "loss/crossentropy": 2.2746217250823975, "loss/hidden": 2.09375, "loss/logits": 0.2701916992664337, "loss/reg": 0.0008493955247104168, "step": 4119 }, { "epoch": 0.515, "grad_norm": 3.622316598892212, "grad_norm_var": 29.4918550900215, "learning_rate": 0.0001, "loss": 1.3973, "loss/crossentropy": 2.5115089416503906, "loss/hidden": 1.15625, "loss/logits": 0.2325541377067566, "loss/reg": 0.0008489845786243677, "step": 4120 }, { "epoch": 0.515125, "grad_norm": 2.3608458042144775, "grad_norm_var": 29.55764767655663, "learning_rate": 0.0001, "loss": 1.0792, "loss/crossentropy": 2.5514280796051025, "loss/hidden": 0.91796875, "loss/logits": 0.15278944373130798, "loss/reg": 0.0008485670550726354, "step": 4121 }, { "epoch": 0.51525, "grad_norm": 3.1271841526031494, "grad_norm_var": 29.637203854652206, "learning_rate": 0.0001, "loss": 1.3284, "loss/crossentropy": 2.390993118286133, "loss/hidden": 1.140625, "loss/logits": 0.1792878657579422, "loss/reg": 0.0008481761324219406, "step": 4122 }, { "epoch": 0.515375, "grad_norm": 2.4089553356170654, "grad_norm_var": 29.748615960179382, "learning_rate": 0.0001, "loss": 1.2177, "loss/crossentropy": 2.487955093383789, "loss/hidden": 1.03125, "loss/logits": 0.17792850732803345, "loss/reg": 0.0008477973169647157, "step": 4123 }, { "epoch": 0.5155, "grad_norm": 2.9971773624420166, "grad_norm_var": 29.83191433557228, "learning_rate": 0.0001, "loss": 1.5263, "loss/crossentropy": 2.4921183586120605, "loss/hidden": 1.2265625, "loss/logits": 0.2912521958351135, "loss/reg": 0.0008474510977976024, "step": 4124 }, { "epoch": 0.515625, "grad_norm": 2.4439635276794434, "grad_norm_var": 29.981102050221246, "learning_rate": 0.0001, "loss": 1.297, "loss/crossentropy": 2.546945810317993, "loss/hidden": 1.1015625, "loss/logits": 0.18695366382598877, "loss/reg": 0.0008469963795505464, "step": 4125 }, { "epoch": 0.51575, "grad_norm": 2.205188751220703, "grad_norm_var": 30.144430633649176, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.642054796218872, "loss/hidden": 0.984375, "loss/logits": 0.1608656495809555, "loss/reg": 0.0008465936407446861, "step": 4126 }, { "epoch": 0.515875, "grad_norm": 3.0205178260803223, "grad_norm_var": 30.124009045209473, "learning_rate": 0.0001, "loss": 1.5181, "loss/crossentropy": 2.6145126819610596, "loss/hidden": 1.234375, "loss/logits": 0.2752184569835663, "loss/reg": 0.0008461964898742735, "step": 4127 }, { "epoch": 0.516, "grad_norm": 2.3117899894714355, "grad_norm_var": 30.08731124381274, "learning_rate": 0.0001, "loss": 1.1335, "loss/crossentropy": 2.6854753494262695, "loss/hidden": 0.9609375, "loss/logits": 0.16410666704177856, "loss/reg": 0.0008458442753180861, "step": 4128 }, { "epoch": 0.516125, "grad_norm": 2.800576686859131, "grad_norm_var": 30.125773795748938, "learning_rate": 0.0001, "loss": 1.4123, "loss/crossentropy": 2.3605527877807617, "loss/hidden": 1.1875, "loss/logits": 0.21632656455039978, "loss/reg": 0.0008453738410025835, "step": 4129 }, { "epoch": 0.51625, "grad_norm": 3.188699722290039, "grad_norm_var": 29.98021111576649, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.6491475105285645, "loss/hidden": 1.0546875, "loss/logits": 0.17875155806541443, "loss/reg": 0.0008448899607174098, "step": 4130 }, { "epoch": 0.516375, "grad_norm": 3.4677579402923584, "grad_norm_var": 29.73702549322443, "learning_rate": 0.0001, "loss": 1.4484, "loss/crossentropy": 2.581343412399292, "loss/hidden": 1.1875, "loss/logits": 0.2524116039276123, "loss/reg": 0.0008444382110610604, "step": 4131 }, { "epoch": 0.5165, "grad_norm": 2.3882832527160645, "grad_norm_var": 29.759813437407757, "learning_rate": 0.0001, "loss": 1.1772, "loss/crossentropy": 2.564290761947632, "loss/hidden": 1.0078125, "loss/logits": 0.16099011898040771, "loss/reg": 0.0008439547382295132, "step": 4132 }, { "epoch": 0.516625, "grad_norm": 22.98171615600586, "grad_norm_var": 51.55441269489542, "learning_rate": 0.0001, "loss": 2.2764, "loss/crossentropy": 2.705935478210449, "loss/hidden": 2.0, "loss/logits": 0.2679293751716614, "loss/reg": 0.0008434724295511842, "step": 4133 }, { "epoch": 0.51675, "grad_norm": 2.6084742546081543, "grad_norm_var": 51.478034421102194, "learning_rate": 0.0001, "loss": 1.1604, "loss/crossentropy": 2.525273561477661, "loss/hidden": 0.984375, "loss/logits": 0.1676352620124817, "loss/reg": 0.0008429911104030907, "step": 4134 }, { "epoch": 0.516875, "grad_norm": 3.666752576828003, "grad_norm_var": 25.5834022186759, "learning_rate": 0.0001, "loss": 1.5946, "loss/crossentropy": 3.080980062484741, "loss/hidden": 1.3203125, "loss/logits": 0.26581406593322754, "loss/reg": 0.0008425564155913889, "step": 4135 }, { "epoch": 0.517, "grad_norm": 2.967332601547241, "grad_norm_var": 25.651932726744853, "learning_rate": 0.0001, "loss": 1.3801, "loss/crossentropy": 2.485933780670166, "loss/hidden": 1.171875, "loss/logits": 0.19983679056167603, "loss/reg": 0.0008420854574069381, "step": 4136 }, { "epoch": 0.517125, "grad_norm": 2.606360912322998, "grad_norm_var": 25.600107925121126, "learning_rate": 0.0001, "loss": 1.3584, "loss/crossentropy": 2.6270759105682373, "loss/hidden": 1.15625, "loss/logits": 0.19370710849761963, "loss/reg": 0.0008416468626819551, "step": 4137 }, { "epoch": 0.51725, "grad_norm": 2.3534679412841797, "grad_norm_var": 25.73524169231343, "learning_rate": 0.0001, "loss": 1.0917, "loss/crossentropy": 2.4923019409179688, "loss/hidden": 0.92578125, "loss/logits": 0.15751691162586212, "loss/reg": 0.0008411825983785093, "step": 4138 }, { "epoch": 0.517375, "grad_norm": 2.6614179611206055, "grad_norm_var": 25.684790697785637, "learning_rate": 0.0001, "loss": 1.1501, "loss/crossentropy": 2.571427822113037, "loss/hidden": 0.96875, "loss/logits": 0.17292991280555725, "loss/reg": 0.0008407285786233842, "step": 4139 }, { "epoch": 0.5175, "grad_norm": 2.955474853515625, "grad_norm_var": 25.69070807823695, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.3990638256073, "loss/hidden": 1.1875, "loss/logits": 0.20535801351070404, "loss/reg": 0.0008402540115639567, "step": 4140 }, { "epoch": 0.517625, "grad_norm": 2.7117807865142822, "grad_norm_var": 25.638225427412348, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.444505214691162, "loss/hidden": 1.0390625, "loss/logits": 0.19386950135231018, "loss/reg": 0.0008397936471737921, "step": 4141 }, { "epoch": 0.51775, "grad_norm": 2.131366491317749, "grad_norm_var": 25.6567832602979, "learning_rate": 0.0001, "loss": 1.1605, "loss/crossentropy": 2.620258331298828, "loss/hidden": 0.984375, "loss/logits": 0.1677563190460205, "loss/reg": 0.0008393447496928275, "step": 4142 }, { "epoch": 0.517875, "grad_norm": 2.338768243789673, "grad_norm_var": 25.77953571884863, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.3121402263641357, "loss/hidden": 1.046875, "loss/logits": 0.17121942341327667, "loss/reg": 0.000838912499602884, "step": 4143 }, { "epoch": 0.518, "grad_norm": 3.046052932739258, "grad_norm_var": 25.647096659978757, "learning_rate": 0.0001, "loss": 1.2493, "loss/crossentropy": 2.110945701599121, "loss/hidden": 1.078125, "loss/logits": 0.1627526730298996, "loss/reg": 0.000838450447190553, "step": 4144 }, { "epoch": 0.518125, "grad_norm": 2.841336965560913, "grad_norm_var": 25.640385020099302, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.847136974334717, "loss/hidden": 0.984375, "loss/logits": 0.1637628972530365, "loss/reg": 0.000838011153973639, "step": 4145 }, { "epoch": 0.51825, "grad_norm": 2.444807529449463, "grad_norm_var": 25.761112768649376, "learning_rate": 0.0001, "loss": 1.0714, "loss/crossentropy": 2.3439459800720215, "loss/hidden": 0.9375, "loss/logits": 0.12550897896289825, "loss/reg": 0.000837548344861716, "step": 4146 }, { "epoch": 0.518375, "grad_norm": 2.6523170471191406, "grad_norm_var": 25.861703050115615, "learning_rate": 0.0001, "loss": 1.0735, "loss/crossentropy": 2.6181600093841553, "loss/hidden": 0.92578125, "loss/logits": 0.13931933045387268, "loss/reg": 0.0008370683644898236, "step": 4147 }, { "epoch": 0.5185, "grad_norm": 2.6663057804107666, "grad_norm_var": 25.80828099939675, "learning_rate": 0.0001, "loss": 1.2823, "loss/crossentropy": 2.645538568496704, "loss/hidden": 1.0703125, "loss/logits": 0.20357871055603027, "loss/reg": 0.0008366240072064102, "step": 4148 }, { "epoch": 0.518625, "grad_norm": 2.492847442626953, "grad_norm_var": 0.12766782004226204, "learning_rate": 0.0001, "loss": 1.1127, "loss/crossentropy": 2.599900007247925, "loss/hidden": 0.94921875, "loss/logits": 0.1550830602645874, "loss/reg": 0.0008361434447579086, "step": 4149 }, { "epoch": 0.51875, "grad_norm": 3.8604390621185303, "grad_norm_var": 0.21092827695058436, "learning_rate": 0.0001, "loss": 1.3821, "loss/crossentropy": 2.4752845764160156, "loss/hidden": 1.1484375, "loss/logits": 0.22526469826698303, "loss/reg": 0.0008356940816156566, "step": 4150 }, { "epoch": 0.518875, "grad_norm": 4.79877233505249, "grad_norm_var": 0.42564752336478834, "learning_rate": 0.0001, "loss": 2.2354, "loss/crossentropy": 3.03398060798645, "loss/hidden": 1.7421875, "loss/logits": 0.48482418060302734, "loss/reg": 0.0008352574659511447, "step": 4151 }, { "epoch": 0.519, "grad_norm": 2.322356939315796, "grad_norm_var": 0.44117447788002717, "learning_rate": 0.0001, "loss": 1.166, "loss/crossentropy": 2.614131212234497, "loss/hidden": 0.9921875, "loss/logits": 0.16545771062374115, "loss/reg": 0.0008347538532689214, "step": 4152 }, { "epoch": 0.519125, "grad_norm": 2.6133527755737305, "grad_norm_var": 0.4409921266107515, "learning_rate": 0.0001, "loss": 1.3768, "loss/crossentropy": 2.283562660217285, "loss/hidden": 1.171875, "loss/logits": 0.1965809166431427, "loss/reg": 0.0008342848159372807, "step": 4153 }, { "epoch": 0.51925, "grad_norm": 2.268911600112915, "grad_norm_var": 0.4465372966456925, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.6006205081939697, "loss/hidden": 1.046875, "loss/logits": 0.19552066922187805, "loss/reg": 0.000833874917589128, "step": 4154 }, { "epoch": 0.519375, "grad_norm": 2.134254217147827, "grad_norm_var": 0.47367458550355435, "learning_rate": 0.0001, "loss": 1.0602, "loss/crossentropy": 2.5676655769348145, "loss/hidden": 0.91015625, "loss/logits": 0.14167261123657227, "loss/reg": 0.0008334715967066586, "step": 4155 }, { "epoch": 0.5195, "grad_norm": 2.8334250450134277, "grad_norm_var": 0.47154575298581347, "learning_rate": 0.0001, "loss": 1.1713, "loss/crossentropy": 2.2586004734039307, "loss/hidden": 1.015625, "loss/logits": 0.14737316966056824, "loss/reg": 0.0008330708951689303, "step": 4156 }, { "epoch": 0.519625, "grad_norm": 2.3699567317962646, "grad_norm_var": 0.4810378737639058, "learning_rate": 0.0001, "loss": 1.2116, "loss/crossentropy": 2.5121302604675293, "loss/hidden": 1.0078125, "loss/logits": 0.19550742208957672, "loss/reg": 0.0008326801471412182, "step": 4157 }, { "epoch": 0.51975, "grad_norm": 3.2594265937805176, "grad_norm_var": 0.4692594550256587, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.3055989742279053, "loss/hidden": 1.1953125, "loss/logits": 0.2493869960308075, "loss/reg": 0.0008323033689521253, "step": 4158 }, { "epoch": 0.519875, "grad_norm": 4.060752868652344, "grad_norm_var": 0.5466317448971871, "learning_rate": 0.0001, "loss": 1.7161, "loss/crossentropy": 2.376319169998169, "loss/hidden": 1.40625, "loss/logits": 0.30153656005859375, "loss/reg": 0.0008319362532347441, "step": 4159 }, { "epoch": 0.52, "grad_norm": 3.323867082595825, "grad_norm_var": 0.5562513774268145, "learning_rate": 0.0001, "loss": 1.3563, "loss/crossentropy": 2.790952682495117, "loss/hidden": 1.1328125, "loss/logits": 0.21516716480255127, "loss/reg": 0.0008315037121064961, "step": 4160 }, { "epoch": 0.520125, "grad_norm": 3.1720893383026123, "grad_norm_var": 0.5590046269570005, "learning_rate": 0.0001, "loss": 1.4221, "loss/crossentropy": 2.471376657485962, "loss/hidden": 1.1953125, "loss/logits": 0.2184683382511139, "loss/reg": 0.0008311242563650012, "step": 4161 }, { "epoch": 0.52025, "grad_norm": 3.417598009109497, "grad_norm_var": 0.5520245851504889, "learning_rate": 0.0001, "loss": 1.3311, "loss/crossentropy": 2.506002426147461, "loss/hidden": 1.15625, "loss/logits": 0.16656014323234558, "loss/reg": 0.000830760458484292, "step": 4162 }, { "epoch": 0.520375, "grad_norm": 3.014374256134033, "grad_norm_var": 0.5426890273707573, "learning_rate": 0.0001, "loss": 1.2833, "loss/crossentropy": 2.4197399616241455, "loss/hidden": 1.09375, "loss/logits": 0.18119922280311584, "loss/reg": 0.0008304059156216681, "step": 4163 }, { "epoch": 0.5205, "grad_norm": 2.2631726264953613, "grad_norm_var": 0.572827719666581, "learning_rate": 0.0001, "loss": 1.2259, "loss/crossentropy": 2.463536262512207, "loss/hidden": 1.0234375, "loss/logits": 0.19414667785167694, "loss/reg": 0.0008299616165459156, "step": 4164 }, { "epoch": 0.520625, "grad_norm": 2.5814943313598633, "grad_norm_var": 0.5671726493565795, "learning_rate": 0.0001, "loss": 1.383, "loss/crossentropy": 2.6093332767486572, "loss/hidden": 1.1796875, "loss/logits": 0.19502338767051697, "loss/reg": 0.0008295021252706647, "step": 4165 }, { "epoch": 0.52075, "grad_norm": 2.6463334560394287, "grad_norm_var": 0.5229894327616534, "learning_rate": 0.0001, "loss": 1.3841, "loss/crossentropy": 2.6809873580932617, "loss/hidden": 1.15625, "loss/logits": 0.21959224343299866, "loss/reg": 0.0008290879195556045, "step": 4166 }, { "epoch": 0.520875, "grad_norm": 2.579080104827881, "grad_norm_var": 0.2815511517949858, "learning_rate": 0.0001, "loss": 1.3348, "loss/crossentropy": 2.659588575363159, "loss/hidden": 1.125, "loss/logits": 0.20147258043289185, "loss/reg": 0.0008287233067676425, "step": 4167 }, { "epoch": 0.521, "grad_norm": 3.7683207988739014, "grad_norm_var": 0.3194112332133045, "learning_rate": 0.0001, "loss": 1.5248, "loss/crossentropy": 2.5204432010650635, "loss/hidden": 1.2890625, "loss/logits": 0.22747847437858582, "loss/reg": 0.0008283288916572928, "step": 4168 }, { "epoch": 0.521125, "grad_norm": 4.018487930297852, "grad_norm_var": 0.39020367804594497, "learning_rate": 0.0001, "loss": 1.73, "loss/crossentropy": 2.3914296627044678, "loss/hidden": 1.4296875, "loss/logits": 0.2920374870300293, "loss/reg": 0.0008279702160507441, "step": 4169 }, { "epoch": 0.52125, "grad_norm": 2.815986394882202, "grad_norm_var": 0.35689640402025113, "learning_rate": 0.0001, "loss": 1.6623, "loss/crossentropy": 2.521759033203125, "loss/hidden": 1.375, "loss/logits": 0.27900925278663635, "loss/reg": 0.0008275474538095295, "step": 4170 }, { "epoch": 0.521375, "grad_norm": 2.2227559089660645, "grad_norm_var": 0.3469792070944339, "learning_rate": 0.0001, "loss": 1.3256, "loss/crossentropy": 2.3301842212677, "loss/hidden": 1.0859375, "loss/logits": 0.23139387369155884, "loss/reg": 0.000827155658043921, "step": 4171 }, { "epoch": 0.5215, "grad_norm": 2.3071677684783936, "grad_norm_var": 0.37749884147675156, "learning_rate": 0.0001, "loss": 1.1528, "loss/crossentropy": 2.5100531578063965, "loss/hidden": 0.98046875, "loss/logits": 0.16402018070220947, "loss/reg": 0.0008267007651738822, "step": 4172 }, { "epoch": 0.521625, "grad_norm": 2.7560253143310547, "grad_norm_var": 0.35495873521388316, "learning_rate": 0.0001, "loss": 1.1909, "loss/crossentropy": 2.7208662033081055, "loss/hidden": 1.015625, "loss/logits": 0.16705168783664703, "loss/reg": 0.0008262392948381603, "step": 4173 }, { "epoch": 0.52175, "grad_norm": 3.0587356090545654, "grad_norm_var": 0.3508801753974353, "learning_rate": 0.0001, "loss": 1.3772, "loss/crossentropy": 2.9214959144592285, "loss/hidden": 1.171875, "loss/logits": 0.19703775644302368, "loss/reg": 0.000825813040137291, "step": 4174 }, { "epoch": 0.521875, "grad_norm": 3.0927529335021973, "grad_norm_var": 0.2725866903122238, "learning_rate": 0.0001, "loss": 1.3025, "loss/crossentropy": 2.4960737228393555, "loss/hidden": 1.09375, "loss/logits": 0.20044875144958496, "loss/reg": 0.0008253234555013478, "step": 4175 }, { "epoch": 0.522, "grad_norm": 3.5031867027282715, "grad_norm_var": 0.2837770245252763, "learning_rate": 0.0001, "loss": 1.3803, "loss/crossentropy": 2.5420405864715576, "loss/hidden": 1.140625, "loss/logits": 0.23144955933094025, "loss/reg": 0.0008248024387285113, "step": 4176 }, { "epoch": 0.522125, "grad_norm": 2.4144139289855957, "grad_norm_var": 0.2973311913034792, "learning_rate": 0.0001, "loss": 1.2088, "loss/crossentropy": 2.815413475036621, "loss/hidden": 1.03125, "loss/logits": 0.1693335473537445, "loss/reg": 0.0008242765907198191, "step": 4177 }, { "epoch": 0.52225, "grad_norm": 3.67046856880188, "grad_norm_var": 0.3186528391932588, "learning_rate": 0.0001, "loss": 1.2335, "loss/crossentropy": 2.388106107711792, "loss/hidden": 1.0625, "loss/logits": 0.16277176141738892, "loss/reg": 0.0008237826405093074, "step": 4178 }, { "epoch": 0.522375, "grad_norm": 3.4349443912506104, "grad_norm_var": 0.3350253102859234, "learning_rate": 0.0001, "loss": 1.0652, "loss/crossentropy": 3.250138759613037, "loss/hidden": 0.921875, "loss/logits": 0.13508209586143494, "loss/reg": 0.0008233314729295671, "step": 4179 }, { "epoch": 0.5225, "grad_norm": 3.201692819595337, "grad_norm_var": 0.3046511733929757, "learning_rate": 0.0001, "loss": 1.366, "loss/crossentropy": 2.2918670177459717, "loss/hidden": 1.15625, "loss/logits": 0.2014876902103424, "loss/reg": 0.0008228256483562291, "step": 4180 }, { "epoch": 0.522625, "grad_norm": 2.9136290550231934, "grad_norm_var": 0.29281353968818374, "learning_rate": 0.0001, "loss": 1.2448, "loss/crossentropy": 2.6273136138916016, "loss/hidden": 1.0546875, "loss/logits": 0.18190036714076996, "loss/reg": 0.0008223800105042756, "step": 4181 }, { "epoch": 0.52275, "grad_norm": 2.2323482036590576, "grad_norm_var": 0.32444041147451325, "learning_rate": 0.0001, "loss": 1.1069, "loss/crossentropy": 2.4598190784454346, "loss/hidden": 0.9453125, "loss/logits": 0.15341416001319885, "loss/reg": 0.0008219356532208622, "step": 4182 }, { "epoch": 0.522875, "grad_norm": 2.546335220336914, "grad_norm_var": 0.3263424257567863, "learning_rate": 0.0001, "loss": 1.3939, "loss/crossentropy": 2.5676722526550293, "loss/hidden": 1.15625, "loss/logits": 0.22948038578033447, "loss/reg": 0.0008214863482862711, "step": 4183 }, { "epoch": 0.523, "grad_norm": 2.2260639667510986, "grad_norm_var": 0.3164597083362793, "learning_rate": 0.0001, "loss": 1.0885, "loss/crossentropy": 2.6663970947265625, "loss/hidden": 0.921875, "loss/logits": 0.15840600430965424, "loss/reg": 0.0008210089872591197, "step": 4184 }, { "epoch": 0.523125, "grad_norm": 2.8603479862213135, "grad_norm_var": 0.22771952642845364, "learning_rate": 0.0001, "loss": 1.54, "loss/crossentropy": 2.4136691093444824, "loss/hidden": 1.2578125, "loss/logits": 0.27400460839271545, "loss/reg": 0.0008205072954297066, "step": 4185 }, { "epoch": 0.52325, "grad_norm": 3.27999210357666, "grad_norm_var": 0.24039836781204257, "learning_rate": 0.0001, "loss": 1.2451, "loss/crossentropy": 2.5940845012664795, "loss/hidden": 1.0546875, "loss/logits": 0.18218424916267395, "loss/reg": 0.0008200692827813327, "step": 4186 }, { "epoch": 0.523375, "grad_norm": 2.3074846267700195, "grad_norm_var": 0.2336756379137938, "learning_rate": 0.0001, "loss": 1.1891, "loss/crossentropy": 2.510866641998291, "loss/hidden": 1.015625, "loss/logits": 0.1652759611606598, "loss/reg": 0.0008196415728889406, "step": 4187 }, { "epoch": 0.5235, "grad_norm": 3.1304843425750732, "grad_norm_var": 0.21504102952355025, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.405059576034546, "loss/hidden": 1.21875, "loss/logits": 0.21263211965560913, "loss/reg": 0.0008192193927243352, "step": 4188 }, { "epoch": 0.523625, "grad_norm": 2.7377302646636963, "grad_norm_var": 0.21544805071695972, "learning_rate": 0.0001, "loss": 1.2568, "loss/crossentropy": 2.6660549640655518, "loss/hidden": 1.078125, "loss/logits": 0.17052534222602844, "loss/reg": 0.0008187603089027107, "step": 4189 }, { "epoch": 0.52375, "grad_norm": 3.5296483039855957, "grad_norm_var": 0.23844822820474196, "learning_rate": 0.0001, "loss": 1.5759, "loss/crossentropy": 2.4353723526000977, "loss/hidden": 1.3046875, "loss/logits": 0.26305416226387024, "loss/reg": 0.0008183260215446353, "step": 4190 }, { "epoch": 0.523875, "grad_norm": 2.6632556915283203, "grad_norm_var": 0.24137849388710417, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.582871913909912, "loss/hidden": 1.046875, "loss/logits": 0.2014957070350647, "loss/reg": 0.0008179208380170166, "step": 4191 }, { "epoch": 0.524, "grad_norm": 3.00535249710083, "grad_norm_var": 0.2178757222669598, "learning_rate": 0.0001, "loss": 1.1713, "loss/crossentropy": 2.631917715072632, "loss/hidden": 1.0, "loss/logits": 0.1630905568599701, "loss/reg": 0.000817537831608206, "step": 4192 }, { "epoch": 0.524125, "grad_norm": 2.4693267345428467, "grad_norm_var": 0.21462135005140962, "learning_rate": 0.0001, "loss": 0.9775, "loss/crossentropy": 2.4573001861572266, "loss/hidden": 0.84375, "loss/logits": 0.12558460235595703, "loss/reg": 0.0008171593653969467, "step": 4193 }, { "epoch": 0.52425, "grad_norm": 32.578773498535156, "grad_norm_var": 55.46096474066369, "learning_rate": 0.0001, "loss": 1.2449, "loss/crossentropy": 2.8529715538024902, "loss/hidden": 1.0859375, "loss/logits": 0.15080119669437408, "loss/reg": 0.0008167069754563272, "step": 4194 }, { "epoch": 0.524375, "grad_norm": 3.6846463680267334, "grad_norm_var": 55.42291528948792, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.528735399246216, "loss/hidden": 1.1171875, "loss/logits": 0.1676066815853119, "loss/reg": 0.000816255109384656, "step": 4195 }, { "epoch": 0.5245, "grad_norm": 2.5146830081939697, "grad_norm_var": 55.590617820386726, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.3739027976989746, "loss/hidden": 1.0234375, "loss/logits": 0.14951826632022858, "loss/reg": 0.0008158254204317927, "step": 4196 }, { "epoch": 0.524625, "grad_norm": 2.8472511768341064, "grad_norm_var": 55.6064156840824, "learning_rate": 0.0001, "loss": 1.6401, "loss/crossentropy": 2.21915602684021, "loss/hidden": 1.3515625, "loss/logits": 0.2803974747657776, "loss/reg": 0.0008153741946443915, "step": 4197 }, { "epoch": 0.52475, "grad_norm": 3.9265055656433105, "grad_norm_var": 55.236666227793144, "learning_rate": 0.0001, "loss": 1.1883, "loss/crossentropy": 2.30238938331604, "loss/hidden": 1.015625, "loss/logits": 0.1644926220178604, "loss/reg": 0.0008149627246893942, "step": 4198 }, { "epoch": 0.524875, "grad_norm": 2.803231716156006, "grad_norm_var": 55.16465001756601, "learning_rate": 0.0001, "loss": 1.3348, "loss/crossentropy": 2.229112148284912, "loss/hidden": 1.125, "loss/logits": 0.20166757702827454, "loss/reg": 0.000814585539046675, "step": 4199 }, { "epoch": 0.525, "grad_norm": 3.628840923309326, "grad_norm_var": 54.80896508133667, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.435701370239258, "loss/hidden": 1.1484375, "loss/logits": 0.16607129573822021, "loss/reg": 0.0008141432772390544, "step": 4200 }, { "epoch": 0.525125, "grad_norm": 2.364655017852783, "grad_norm_var": 54.95734120574509, "learning_rate": 0.0001, "loss": 1.1828, "loss/crossentropy": 2.4542765617370605, "loss/hidden": 1.0078125, "loss/logits": 0.16688568890094757, "loss/reg": 0.0008136690012179315, "step": 4201 }, { "epoch": 0.52525, "grad_norm": 3.1115171909332275, "grad_norm_var": 54.99420288488831, "learning_rate": 0.0001, "loss": 1.2889, "loss/crossentropy": 2.890014886856079, "loss/hidden": 1.09375, "loss/logits": 0.18702596426010132, "loss/reg": 0.0008132050279527903, "step": 4202 }, { "epoch": 0.525375, "grad_norm": 4.278806209564209, "grad_norm_var": 54.57367529111172, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.7399425506591797, "loss/hidden": 1.15625, "loss/logits": 0.19574813544750214, "loss/reg": 0.0008127555483952165, "step": 4203 }, { "epoch": 0.5255, "grad_norm": 3.510767698287964, "grad_norm_var": 54.490219466173606, "learning_rate": 0.0001, "loss": 1.368, "loss/crossentropy": 2.609346628189087, "loss/hidden": 1.15625, "loss/logits": 0.20363910496234894, "loss/reg": 0.000812291691545397, "step": 4204 }, { "epoch": 0.525625, "grad_norm": 2.503868341445923, "grad_norm_var": 54.56350647655555, "learning_rate": 0.0001, "loss": 1.1801, "loss/crossentropy": 2.6714260578155518, "loss/hidden": 0.99609375, "loss/logits": 0.17584726214408875, "loss/reg": 0.0008118291734717786, "step": 4205 }, { "epoch": 0.52575, "grad_norm": 2.8441147804260254, "grad_norm_var": 54.723968500084666, "learning_rate": 0.0001, "loss": 1.3747, "loss/crossentropy": 2.600517988204956, "loss/hidden": 1.15625, "loss/logits": 0.21032652258872986, "loss/reg": 0.000811367470305413, "step": 4206 }, { "epoch": 0.525875, "grad_norm": 2.557873487472534, "grad_norm_var": 54.75638570991305, "learning_rate": 0.0001, "loss": 1.3847, "loss/crossentropy": 2.483959674835205, "loss/hidden": 1.15625, "loss/logits": 0.22037005424499512, "loss/reg": 0.0008109263726510108, "step": 4207 }, { "epoch": 0.526, "grad_norm": 5.100096702575684, "grad_norm_var": 54.497440540800184, "learning_rate": 0.0001, "loss": 1.9749, "loss/crossentropy": 1.7383568286895752, "loss/hidden": 1.671875, "loss/logits": 0.2949520945549011, "loss/reg": 0.0008104626322165132, "step": 4208 }, { "epoch": 0.526125, "grad_norm": 3.29878830909729, "grad_norm_var": 54.255550406816106, "learning_rate": 0.0001, "loss": 1.2541, "loss/crossentropy": 2.2682766914367676, "loss/hidden": 1.0859375, "loss/logits": 0.16002465784549713, "loss/reg": 0.0008100300910882652, "step": 4209 }, { "epoch": 0.52625, "grad_norm": 4.4921722412109375, "grad_norm_var": 0.6437414090378591, "learning_rate": 0.0001, "loss": 1.6974, "loss/crossentropy": 2.5695571899414062, "loss/hidden": 1.46875, "loss/logits": 0.22051367163658142, "loss/reg": 0.0008095898083411157, "step": 4210 }, { "epoch": 0.526375, "grad_norm": 3.5393497943878174, "grad_norm_var": 0.6384177439732374, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.3129115104675293, "loss/hidden": 1.3125, "loss/logits": 0.25613105297088623, "loss/reg": 0.0008091230411082506, "step": 4211 }, { "epoch": 0.5265, "grad_norm": 2.838677167892456, "grad_norm_var": 0.6096426405184416, "learning_rate": 0.0001, "loss": 1.0883, "loss/crossentropy": 2.5223453044891357, "loss/hidden": 0.9375, "loss/logits": 0.14273786544799805, "loss/reg": 0.0008086836314760149, "step": 4212 }, { "epoch": 0.526625, "grad_norm": 2.1936838626861572, "grad_norm_var": 0.6804035694603262, "learning_rate": 0.0001, "loss": 1.1239, "loss/crossentropy": 2.4174840450286865, "loss/hidden": 0.96484375, "loss/logits": 0.15101727843284607, "loss/reg": 0.0008082418935373425, "step": 4213 }, { "epoch": 0.52675, "grad_norm": 4.994417667388916, "grad_norm_var": 0.8391707807272032, "learning_rate": 0.0001, "loss": 1.4871, "loss/crossentropy": 2.7208335399627686, "loss/hidden": 1.25, "loss/logits": 0.22907094657421112, "loss/reg": 0.0008078106911852956, "step": 4214 }, { "epoch": 0.526875, "grad_norm": 2.3367254734039307, "grad_norm_var": 0.8885735992952047, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.466120719909668, "loss/hidden": 1.046875, "loss/logits": 0.1725480556488037, "loss/reg": 0.0008074032957665622, "step": 4215 }, { "epoch": 0.527, "grad_norm": 2.582688093185425, "grad_norm_var": 0.9180319232264149, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.6876585483551025, "loss/hidden": 1.234375, "loss/logits": 0.24565249681472778, "loss/reg": 0.0008069656905718148, "step": 4216 }, { "epoch": 0.527125, "grad_norm": 2.6797804832458496, "grad_norm_var": 0.8855995234533177, "learning_rate": 0.0001, "loss": 1.1943, "loss/crossentropy": 2.816967248916626, "loss/hidden": 1.0234375, "loss/logits": 0.1627635508775711, "loss/reg": 0.0008065280853770673, "step": 4217 }, { "epoch": 0.52725, "grad_norm": 2.055922746658325, "grad_norm_var": 0.9823272558608844, "learning_rate": 0.0001, "loss": 1.197, "loss/crossentropy": 2.806370973587036, "loss/hidden": 1.0234375, "loss/logits": 0.1654524803161621, "loss/reg": 0.0008061137632466853, "step": 4218 }, { "epoch": 0.527375, "grad_norm": 2.058664321899414, "grad_norm_var": 0.9822883638480031, "learning_rate": 0.0001, "loss": 1.1538, "loss/crossentropy": 2.6140778064727783, "loss/hidden": 0.97265625, "loss/logits": 0.17311787605285645, "loss/reg": 0.000805702933575958, "step": 4219 }, { "epoch": 0.5275, "grad_norm": 2.449545383453369, "grad_norm_var": 0.9944435632074523, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.5582563877105713, "loss/hidden": 1.03125, "loss/logits": 0.163771390914917, "loss/reg": 0.0008053261553868651, "step": 4220 }, { "epoch": 0.527625, "grad_norm": 2.137479066848755, "grad_norm_var": 1.0286777403178176, "learning_rate": 0.0001, "loss": 1.1796, "loss/crossentropy": 2.3425965309143066, "loss/hidden": 0.98828125, "loss/logits": 0.18323196470737457, "loss/reg": 0.0008049707976169884, "step": 4221 }, { "epoch": 0.52775, "grad_norm": 2.2574312686920166, "grad_norm_var": 1.0631662700151756, "learning_rate": 0.0001, "loss": 1.1533, "loss/crossentropy": 2.261107921600342, "loss/hidden": 0.984375, "loss/logits": 0.16085729002952576, "loss/reg": 0.0008046377915889025, "step": 4222 }, { "epoch": 0.527875, "grad_norm": 2.845203161239624, "grad_norm_var": 1.0524097299813766, "learning_rate": 0.0001, "loss": 1.2524, "loss/crossentropy": 2.6129589080810547, "loss/hidden": 1.0703125, "loss/logits": 0.1740429848432541, "loss/reg": 0.0008042225381359458, "step": 4223 }, { "epoch": 0.528, "grad_norm": 3.0447940826416016, "grad_norm_var": 0.7385281640965928, "learning_rate": 0.0001, "loss": 1.3499, "loss/crossentropy": 2.6787073612213135, "loss/hidden": 1.1484375, "loss/logits": 0.19340568780899048, "loss/reg": 0.0008038287633098662, "step": 4224 }, { "epoch": 0.528125, "grad_norm": 2.8211236000061035, "grad_norm_var": 0.7250229726287649, "learning_rate": 0.0001, "loss": 1.551, "loss/crossentropy": 2.283034324645996, "loss/hidden": 1.2890625, "loss/logits": 0.2538742423057556, "loss/reg": 0.0008034768397919834, "step": 4225 }, { "epoch": 0.52825, "grad_norm": 5.817380905151367, "grad_norm_var": 1.1279544606149194, "learning_rate": 0.0001, "loss": 1.4744, "loss/crossentropy": 2.369570732116699, "loss/hidden": 1.28125, "loss/logits": 0.18512769043445587, "loss/reg": 0.0008031214820221066, "step": 4226 }, { "epoch": 0.528375, "grad_norm": 2.2192578315734863, "grad_norm_var": 1.127117963327737, "learning_rate": 0.0001, "loss": 1.1598, "loss/crossentropy": 2.6720175743103027, "loss/hidden": 0.984375, "loss/logits": 0.1673586666584015, "loss/reg": 0.000802784925326705, "step": 4227 }, { "epoch": 0.5285, "grad_norm": 2.3636412620544434, "grad_norm_var": 1.140880979601284, "learning_rate": 0.0001, "loss": 1.2163, "loss/crossentropy": 2.5436294078826904, "loss/hidden": 1.046875, "loss/logits": 0.16137847304344177, "loss/reg": 0.0008024058188311756, "step": 4228 }, { "epoch": 0.528625, "grad_norm": 3.800652503967285, "grad_norm_var": 1.171593731230874, "learning_rate": 0.0001, "loss": 1.2112, "loss/crossentropy": 2.653392791748047, "loss/hidden": 1.03125, "loss/logits": 0.17188654839992523, "loss/reg": 0.0008021226385608315, "step": 4229 }, { "epoch": 0.52875, "grad_norm": 2.7381319999694824, "grad_norm_var": 0.8609063475884483, "learning_rate": 0.0001, "loss": 1.4313, "loss/crossentropy": 2.642808437347412, "loss/hidden": 1.21875, "loss/logits": 0.20448926091194153, "loss/reg": 0.0008018370135687292, "step": 4230 }, { "epoch": 0.528875, "grad_norm": 3.654867172241211, "grad_norm_var": 0.8945766090863266, "learning_rate": 0.0001, "loss": 1.3571, "loss/crossentropy": 2.5529444217681885, "loss/hidden": 1.15625, "loss/logits": 0.1927923560142517, "loss/reg": 0.0008015378261916339, "step": 4231 }, { "epoch": 0.529, "grad_norm": 2.371819257736206, "grad_norm_var": 0.9047423683578849, "learning_rate": 0.0001, "loss": 1.441, "loss/crossentropy": 1.9777675867080688, "loss/hidden": 1.1953125, "loss/logits": 0.23763874173164368, "loss/reg": 0.0008012460311874747, "step": 4232 }, { "epoch": 0.529125, "grad_norm": 4.284022808074951, "grad_norm_var": 1.032982961368956, "learning_rate": 0.0001, "loss": 1.6988, "loss/crossentropy": 2.9203107357025146, "loss/hidden": 1.3984375, "loss/logits": 0.2923945188522339, "loss/reg": 0.0008009518496692181, "step": 4233 }, { "epoch": 0.52925, "grad_norm": 2.6752400398254395, "grad_norm_var": 0.9845714770350459, "learning_rate": 0.0001, "loss": 1.3217, "loss/crossentropy": 2.699678421020508, "loss/hidden": 1.1015625, "loss/logits": 0.21208709478378296, "loss/reg": 0.0008006618591025472, "step": 4234 }, { "epoch": 0.529375, "grad_norm": 2.3961873054504395, "grad_norm_var": 0.9506245315988812, "learning_rate": 0.0001, "loss": 1.134, "loss/crossentropy": 2.40701961517334, "loss/hidden": 0.9609375, "loss/logits": 0.16501834988594055, "loss/reg": 0.0008002365939319134, "step": 4235 }, { "epoch": 0.5295, "grad_norm": 2.803257942199707, "grad_norm_var": 0.9328469168107622, "learning_rate": 0.0001, "loss": 1.2292, "loss/crossentropy": 2.649207830429077, "loss/hidden": 1.046875, "loss/logits": 0.17432409524917603, "loss/reg": 0.0007999027147889137, "step": 4236 }, { "epoch": 0.529625, "grad_norm": 2.375216484069824, "grad_norm_var": 0.9085822582006924, "learning_rate": 0.0001, "loss": 1.196, "loss/crossentropy": 2.7824199199676514, "loss/hidden": 1.0234375, "loss/logits": 0.16451817750930786, "loss/reg": 0.0007994776824489236, "step": 4237 }, { "epoch": 0.52975, "grad_norm": 2.3999438285827637, "grad_norm_var": 0.8951855038075099, "learning_rate": 0.0001, "loss": 1.2261, "loss/crossentropy": 2.547384262084961, "loss/hidden": 1.0390625, "loss/logits": 0.1790103018283844, "loss/reg": 0.0007990513113327324, "step": 4238 }, { "epoch": 0.529875, "grad_norm": 3.8512089252471924, "grad_norm_var": 0.932554875809429, "learning_rate": 0.0001, "loss": 1.3543, "loss/crossentropy": 2.5753393173217773, "loss/hidden": 1.0859375, "loss/logits": 0.26040977239608765, "loss/reg": 0.0007987336139194667, "step": 4239 }, { "epoch": 0.53, "grad_norm": 2.795478343963623, "grad_norm_var": 0.9383097243534839, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.9352991580963135, "loss/hidden": 1.0234375, "loss/logits": 0.1703283190727234, "loss/reg": 0.0007984273252077401, "step": 4240 }, { "epoch": 0.530125, "grad_norm": 4.425906181335449, "grad_norm_var": 1.0427064045445187, "learning_rate": 0.0001, "loss": 1.2964, "loss/crossentropy": 2.5377988815307617, "loss/hidden": 1.1171875, "loss/logits": 0.171259343624115, "loss/reg": 0.0007979967631399632, "step": 4241 }, { "epoch": 0.53025, "grad_norm": 2.8056933879852295, "grad_norm_var": 0.5528497751271172, "learning_rate": 0.0001, "loss": 1.2203, "loss/crossentropy": 2.3515279293060303, "loss/hidden": 1.046875, "loss/logits": 0.1654684841632843, "loss/reg": 0.0007975581684149802, "step": 4242 }, { "epoch": 0.530375, "grad_norm": 2.4627246856689453, "grad_norm_var": 0.5312899765555185, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.4652249813079834, "loss/hidden": 1.0546875, "loss/logits": 0.18525998294353485, "loss/reg": 0.0007971353479661047, "step": 4243 }, { "epoch": 0.5305, "grad_norm": 2.8251187801361084, "grad_norm_var": 0.5046602219776398, "learning_rate": 0.0001, "loss": 1.2227, "loss/crossentropy": 2.8940768241882324, "loss/hidden": 1.03125, "loss/logits": 0.18345582485198975, "loss/reg": 0.000796705367974937, "step": 4244 }, { "epoch": 0.530625, "grad_norm": 2.2845075130462646, "grad_norm_var": 0.49488256521562995, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.5932953357696533, "loss/hidden": 1.125, "loss/logits": 0.20746567845344543, "loss/reg": 0.0007962383097037673, "step": 4245 }, { "epoch": 0.53075, "grad_norm": 2.9845826625823975, "grad_norm_var": 0.4918207593538445, "learning_rate": 0.0001, "loss": 1.139, "loss/crossentropy": 2.699679374694824, "loss/hidden": 0.97265625, "loss/logits": 0.15838849544525146, "loss/reg": 0.0007957399939186871, "step": 4246 }, { "epoch": 0.530875, "grad_norm": 3.9989142417907715, "grad_norm_var": 0.530991815449632, "learning_rate": 0.0001, "loss": 1.3907, "loss/crossentropy": 2.847986936569214, "loss/hidden": 1.1796875, "loss/logits": 0.20309031009674072, "loss/reg": 0.0007953072781674564, "step": 4247 }, { "epoch": 0.531, "grad_norm": 3.0507006645202637, "grad_norm_var": 0.5044073642885505, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.6706018447875977, "loss/hidden": 0.93359375, "loss/logits": 0.1509624421596527, "loss/reg": 0.0007948344573378563, "step": 4248 }, { "epoch": 0.531125, "grad_norm": 3.1060192584991455, "grad_norm_var": 0.39357063484554355, "learning_rate": 0.0001, "loss": 1.3402, "loss/crossentropy": 2.5952630043029785, "loss/hidden": 1.1328125, "loss/logits": 0.1994590312242508, "loss/reg": 0.0007944065728224814, "step": 4249 }, { "epoch": 0.53125, "grad_norm": 3.2291526794433594, "grad_norm_var": 0.3922665638084633, "learning_rate": 0.0001, "loss": 1.1483, "loss/crossentropy": 2.261535882949829, "loss/hidden": 0.98828125, "loss/logits": 0.152106374502182, "loss/reg": 0.0007939246715977788, "step": 4250 }, { "epoch": 0.531375, "grad_norm": 2.5600414276123047, "grad_norm_var": 0.38103339415330595, "learning_rate": 0.0001, "loss": 1.1535, "loss/crossentropy": 2.5010032653808594, "loss/hidden": 0.9921875, "loss/logits": 0.15339386463165283, "loss/reg": 0.000793498067650944, "step": 4251 }, { "epoch": 0.5315, "grad_norm": 2.772977590560913, "grad_norm_var": 0.38187454259622555, "learning_rate": 0.0001, "loss": 1.1809, "loss/crossentropy": 2.6629507541656494, "loss/hidden": 0.9453125, "loss/logits": 0.2276071012020111, "loss/reg": 0.0007930543506518006, "step": 4252 }, { "epoch": 0.531625, "grad_norm": 3.1936185359954834, "grad_norm_var": 0.35604913255616566, "learning_rate": 0.0001, "loss": 1.4592, "loss/crossentropy": 2.5788443088531494, "loss/hidden": 1.21875, "loss/logits": 0.23255601525306702, "loss/reg": 0.0007926214020699263, "step": 4253 }, { "epoch": 0.53175, "grad_norm": 2.8036019802093506, "grad_norm_var": 0.33142581270121596, "learning_rate": 0.0001, "loss": 1.265, "loss/crossentropy": 2.33449649810791, "loss/hidden": 1.0625, "loss/logits": 0.19459016621112823, "loss/reg": 0.0007922223303467035, "step": 4254 }, { "epoch": 0.531875, "grad_norm": 2.784675121307373, "grad_norm_var": 0.2916966071329161, "learning_rate": 0.0001, "loss": 1.1128, "loss/crossentropy": 2.759453773498535, "loss/hidden": 0.94921875, "loss/logits": 0.155696839094162, "loss/reg": 0.0007917885086499155, "step": 4255 }, { "epoch": 0.532, "grad_norm": 3.4999773502349854, "grad_norm_var": 0.30301369675745465, "learning_rate": 0.0001, "loss": 1.5598, "loss/crossentropy": 2.6297483444213867, "loss/hidden": 1.28125, "loss/logits": 0.27060064673423767, "loss/reg": 0.0007914050947874784, "step": 4256 }, { "epoch": 0.532125, "grad_norm": 2.5080208778381348, "grad_norm_var": 0.18087401724402502, "learning_rate": 0.0001, "loss": 1.2338, "loss/crossentropy": 2.427809238433838, "loss/hidden": 1.0546875, "loss/logits": 0.17124870419502258, "loss/reg": 0.0007910503190942109, "step": 4257 }, { "epoch": 0.53225, "grad_norm": 2.777385950088501, "grad_norm_var": 0.1813909908713998, "learning_rate": 0.0001, "loss": 1.2718, "loss/crossentropy": 2.576920986175537, "loss/hidden": 1.09375, "loss/logits": 0.1701691746711731, "loss/reg": 0.000790624413639307, "step": 4258 }, { "epoch": 0.532375, "grad_norm": 3.101757764816284, "grad_norm_var": 0.16730203550636513, "learning_rate": 0.0001, "loss": 1.0979, "loss/crossentropy": 2.659444570541382, "loss/hidden": 0.94921875, "loss/logits": 0.14074057340621948, "loss/reg": 0.0007901783683337271, "step": 4259 }, { "epoch": 0.5325, "grad_norm": 3.0588555335998535, "grad_norm_var": 0.16627724346289838, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.436210870742798, "loss/hidden": 1.0078125, "loss/logits": 0.17874974012374878, "loss/reg": 0.000789721729233861, "step": 4260 }, { "epoch": 0.532625, "grad_norm": 2.363659381866455, "grad_norm_var": 0.15930592287657627, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.311309576034546, "loss/hidden": 1.0546875, "loss/logits": 0.17245906591415405, "loss/reg": 0.0007892895373515785, "step": 4261 }, { "epoch": 0.53275, "grad_norm": 2.319941282272339, "grad_norm_var": 0.1871401555070245, "learning_rate": 0.0001, "loss": 1.0491, "loss/crossentropy": 2.476577043533325, "loss/hidden": 0.90234375, "loss/logits": 0.1389044225215912, "loss/reg": 0.0007888483814895153, "step": 4262 }, { "epoch": 0.532875, "grad_norm": 3.0937843322753906, "grad_norm_var": 0.11122348629206727, "learning_rate": 0.0001, "loss": 1.32, "loss/crossentropy": 2.6069986820220947, "loss/hidden": 1.1171875, "loss/logits": 0.19493424892425537, "loss/reg": 0.0007884378428570926, "step": 4263 }, { "epoch": 0.533, "grad_norm": 2.859908103942871, "grad_norm_var": 0.10938536421650251, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.3962929248809814, "loss/hidden": 1.1484375, "loss/logits": 0.206387460231781, "loss/reg": 0.0007880289340391755, "step": 4264 }, { "epoch": 0.533125, "grad_norm": 3.2257370948791504, "grad_norm_var": 0.11393545482147047, "learning_rate": 0.0001, "loss": 1.1062, "loss/crossentropy": 2.5028467178344727, "loss/hidden": 0.9453125, "loss/logits": 0.1529780477285385, "loss/reg": 0.0007876156596466899, "step": 4265 }, { "epoch": 0.53325, "grad_norm": 2.2641875743865967, "grad_norm_var": 0.12779790568717148, "learning_rate": 0.0001, "loss": 1.0802, "loss/crossentropy": 2.58443284034729, "loss/hidden": 0.92578125, "loss/logits": 0.14656266570091248, "loss/reg": 0.0007871895213611424, "step": 4266 }, { "epoch": 0.533375, "grad_norm": 2.7856333255767822, "grad_norm_var": 0.12303128456664064, "learning_rate": 0.0001, "loss": 1.2301, "loss/crossentropy": 2.704252243041992, "loss/hidden": 1.046875, "loss/logits": 0.17537742853164673, "loss/reg": 0.0007867568638175726, "step": 4267 }, { "epoch": 0.5335, "grad_norm": 2.6083579063415527, "grad_norm_var": 0.12616005723558127, "learning_rate": 0.0001, "loss": 1.1197, "loss/crossentropy": 2.489248514175415, "loss/hidden": 0.96875, "loss/logits": 0.1431349664926529, "loss/reg": 0.0007863271166570485, "step": 4268 }, { "epoch": 0.533625, "grad_norm": 2.1537485122680664, "grad_norm_var": 0.14305994442429792, "learning_rate": 0.0001, "loss": 1.0755, "loss/crossentropy": 2.1706762313842773, "loss/hidden": 0.91015625, "loss/logits": 0.1574527770280838, "loss/reg": 0.0007859190227463841, "step": 4269 }, { "epoch": 0.53375, "grad_norm": 3.229259490966797, "grad_norm_var": 0.1566839321705705, "learning_rate": 0.0001, "loss": 1.5039, "loss/crossentropy": 2.2021477222442627, "loss/hidden": 1.265625, "loss/logits": 0.23045289516448975, "loss/reg": 0.0007854876457713544, "step": 4270 }, { "epoch": 0.533875, "grad_norm": 2.4376285076141357, "grad_norm_var": 0.16444313460703838, "learning_rate": 0.0001, "loss": 1.3389, "loss/crossentropy": 2.429713487625122, "loss/hidden": 1.125, "loss/logits": 0.20603591203689575, "loss/reg": 0.0007850525435060263, "step": 4271 }, { "epoch": 0.534, "grad_norm": 2.689030170440674, "grad_norm_var": 0.1263982858564295, "learning_rate": 0.0001, "loss": 1.1438, "loss/crossentropy": 2.434504747390747, "loss/hidden": 0.96875, "loss/logits": 0.16720245778560638, "loss/reg": 0.0007846249500289559, "step": 4272 }, { "epoch": 0.534125, "grad_norm": 3.0625832080841064, "grad_norm_var": 0.1301446118670429, "learning_rate": 0.0001, "loss": 1.339, "loss/crossentropy": 2.756289005279541, "loss/hidden": 1.1328125, "loss/logits": 0.19838444888591766, "loss/reg": 0.0007842496270313859, "step": 4273 }, { "epoch": 0.53425, "grad_norm": 2.5634925365448, "grad_norm_var": 0.13227906054990488, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.040342330932617, "loss/hidden": 1.171875, "loss/logits": 0.19456124305725098, "loss/reg": 0.0007839030586183071, "step": 4274 }, { "epoch": 0.534375, "grad_norm": 3.2187962532043457, "grad_norm_var": 0.1388023452174811, "learning_rate": 0.0001, "loss": 1.2799, "loss/crossentropy": 2.798325538635254, "loss/hidden": 1.109375, "loss/logits": 0.16272765398025513, "loss/reg": 0.0007835247670300305, "step": 4275 }, { "epoch": 0.5345, "grad_norm": 3.312588691711426, "grad_norm_var": 0.1534133238713404, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.4354045391082764, "loss/hidden": 1.0, "loss/logits": 0.15784583985805511, "loss/reg": 0.0007830490940250456, "step": 4276 }, { "epoch": 0.534625, "grad_norm": 3.1986637115478516, "grad_norm_var": 0.15266701238015135, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.330810785293579, "loss/hidden": 1.2421875, "loss/logits": 0.21945421397686005, "loss/reg": 0.0007826081709936261, "step": 4277 }, { "epoch": 0.53475, "grad_norm": 3.9020164012908936, "grad_norm_var": 0.2048924090558624, "learning_rate": 0.0001, "loss": 1.3482, "loss/crossentropy": 2.6978073120117188, "loss/hidden": 1.1484375, "loss/logits": 0.19191455841064453, "loss/reg": 0.0007822333136573434, "step": 4278 }, { "epoch": 0.534875, "grad_norm": 4.596144676208496, "grad_norm_var": 0.3822064363595994, "learning_rate": 0.0001, "loss": 1.5676, "loss/crossentropy": 2.1120431423187256, "loss/hidden": 1.390625, "loss/logits": 0.1691661775112152, "loss/reg": 0.0007817589212208986, "step": 4279 }, { "epoch": 0.535, "grad_norm": 2.8333933353424072, "grad_norm_var": 0.38276945698075315, "learning_rate": 0.0001, "loss": 1.3029, "loss/crossentropy": 2.336569309234619, "loss/hidden": 1.1171875, "loss/logits": 0.17794615030288696, "loss/reg": 0.0007813722477294505, "step": 4280 }, { "epoch": 0.535125, "grad_norm": 2.8183515071868896, "grad_norm_var": 0.3811564130198955, "learning_rate": 0.0001, "loss": 1.2972, "loss/crossentropy": 2.552903413772583, "loss/hidden": 1.0859375, "loss/logits": 0.20341616868972778, "loss/reg": 0.0007809665403328836, "step": 4281 }, { "epoch": 0.53525, "grad_norm": 3.2036538124084473, "grad_norm_var": 0.34670244516900817, "learning_rate": 0.0001, "loss": 1.2338, "loss/crossentropy": 2.610888957977295, "loss/hidden": 1.0546875, "loss/logits": 0.17132464051246643, "loss/reg": 0.0007805935456417501, "step": 4282 }, { "epoch": 0.535375, "grad_norm": 2.560075044631958, "grad_norm_var": 0.3574820557578446, "learning_rate": 0.0001, "loss": 1.4891, "loss/crossentropy": 2.4837868213653564, "loss/hidden": 1.2421875, "loss/logits": 0.23908521234989166, "loss/reg": 0.0007801851606927812, "step": 4283 }, { "epoch": 0.5355, "grad_norm": 2.583531618118286, "grad_norm_var": 0.35889720682368137, "learning_rate": 0.0001, "loss": 1.177, "loss/crossentropy": 2.443410634994507, "loss/hidden": 1.0, "loss/logits": 0.16919344663619995, "loss/reg": 0.0007797402795404196, "step": 4284 }, { "epoch": 0.535625, "grad_norm": 2.3401753902435303, "grad_norm_var": 0.3394703148354909, "learning_rate": 0.0001, "loss": 1.3383, "loss/crossentropy": 2.2877025604248047, "loss/hidden": 1.140625, "loss/logits": 0.18984925746917725, "loss/reg": 0.0007793569238856435, "step": 4285 }, { "epoch": 0.53575, "grad_norm": 3.0080556869506836, "grad_norm_var": 0.3367794830605663, "learning_rate": 0.0001, "loss": 1.1288, "loss/crossentropy": 2.5951952934265137, "loss/hidden": 0.98046875, "loss/logits": 0.14053480327129364, "loss/reg": 0.0007789571536704898, "step": 4286 }, { "epoch": 0.535875, "grad_norm": 5.504602432250977, "grad_norm_var": 0.6863168785955689, "learning_rate": 0.0001, "loss": 1.4433, "loss/crossentropy": 2.588463068008423, "loss/hidden": 1.21875, "loss/logits": 0.2168074995279312, "loss/reg": 0.000778532586991787, "step": 4287 }, { "epoch": 0.536, "grad_norm": 2.6307475566864014, "grad_norm_var": 0.6905947211073413, "learning_rate": 0.0001, "loss": 1.3248, "loss/crossentropy": 2.402416706085205, "loss/hidden": 1.1328125, "loss/logits": 0.18419495224952698, "loss/reg": 0.0007780946907587349, "step": 4288 }, { "epoch": 0.536125, "grad_norm": 2.4014370441436768, "grad_norm_var": 0.7307821422728615, "learning_rate": 0.0001, "loss": 1.3704, "loss/crossentropy": 2.7740468978881836, "loss/hidden": 1.140625, "loss/logits": 0.2220003604888916, "loss/reg": 0.0007776354905217886, "step": 4289 }, { "epoch": 0.53625, "grad_norm": 3.0038692951202393, "grad_norm_var": 0.7074531122034006, "learning_rate": 0.0001, "loss": 1.2342, "loss/crossentropy": 2.5571351051330566, "loss/hidden": 1.0625, "loss/logits": 0.16391701996326447, "loss/reg": 0.0007771648815833032, "step": 4290 }, { "epoch": 0.536375, "grad_norm": 3.2682666778564453, "grad_norm_var": 0.7077646380949657, "learning_rate": 0.0001, "loss": 1.2734, "loss/crossentropy": 2.3858649730682373, "loss/hidden": 1.1015625, "loss/logits": 0.1640278398990631, "loss/reg": 0.0007767415372654796, "step": 4291 }, { "epoch": 0.5365, "grad_norm": 4.047142505645752, "grad_norm_var": 0.7527254515052332, "learning_rate": 0.0001, "loss": 1.4374, "loss/crossentropy": 2.907449960708618, "loss/hidden": 1.203125, "loss/logits": 0.22652965784072876, "loss/reg": 0.0007762570749036968, "step": 4292 }, { "epoch": 0.536625, "grad_norm": 2.674349069595337, "grad_norm_var": 0.7730595404686246, "learning_rate": 0.0001, "loss": 1.2319, "loss/crossentropy": 2.46040678024292, "loss/hidden": 1.046875, "loss/logits": 0.17724312841892242, "loss/reg": 0.00077582779340446, "step": 4293 }, { "epoch": 0.53675, "grad_norm": 3.4287502765655518, "grad_norm_var": 0.7434529801994434, "learning_rate": 0.0001, "loss": 1.4497, "loss/crossentropy": 2.5811245441436768, "loss/hidden": 1.2109375, "loss/logits": 0.2310168743133545, "loss/reg": 0.0007753508980385959, "step": 4294 }, { "epoch": 0.536875, "grad_norm": 2.398510694503784, "grad_norm_var": 0.6307598840832722, "learning_rate": 0.0001, "loss": 1.2625, "loss/crossentropy": 2.686615228652954, "loss/hidden": 1.0703125, "loss/logits": 0.18439540266990662, "loss/reg": 0.0007749320357106626, "step": 4295 }, { "epoch": 0.537, "grad_norm": 2.790966272354126, "grad_norm_var": 0.6320640996202047, "learning_rate": 0.0001, "loss": 1.2531, "loss/crossentropy": 2.581717014312744, "loss/hidden": 1.0703125, "loss/logits": 0.17499800026416779, "loss/reg": 0.0007744836038909853, "step": 4296 }, { "epoch": 0.537125, "grad_norm": 2.163574457168579, "grad_norm_var": 0.6783333122507048, "learning_rate": 0.0001, "loss": 1.1042, "loss/crossentropy": 2.2641072273254395, "loss/hidden": 0.94140625, "loss/logits": 0.15501300990581512, "loss/reg": 0.0007740315049886703, "step": 4297 }, { "epoch": 0.53725, "grad_norm": 2.4908692836761475, "grad_norm_var": 0.6907781190904255, "learning_rate": 0.0001, "loss": 1.1856, "loss/crossentropy": 2.946918487548828, "loss/hidden": 1.015625, "loss/logits": 0.16225066781044006, "loss/reg": 0.0007736249826848507, "step": 4298 }, { "epoch": 0.537375, "grad_norm": 3.7467291355133057, "grad_norm_var": 0.7161545496066604, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.405876398086548, "loss/hidden": 1.421875, "loss/logits": 0.24681775271892548, "loss/reg": 0.0007731511141173542, "step": 4299 }, { "epoch": 0.5375, "grad_norm": 3.5943777561187744, "grad_norm_var": 0.7198296031253743, "learning_rate": 0.0001, "loss": 1.2787, "loss/crossentropy": 2.9486658573150635, "loss/hidden": 1.109375, "loss/logits": 0.16161799430847168, "loss/reg": 0.0007726555340923369, "step": 4300 }, { "epoch": 0.537625, "grad_norm": 2.762221336364746, "grad_norm_var": 0.688583175833823, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.207447052001953, "loss/hidden": 1.140625, "loss/logits": 0.2033766657114029, "loss/reg": 0.0007722351583652198, "step": 4301 }, { "epoch": 0.53775, "grad_norm": 3.0024285316467285, "grad_norm_var": 0.6886688859508465, "learning_rate": 0.0001, "loss": 1.4311, "loss/crossentropy": 2.4488706588745117, "loss/hidden": 1.203125, "loss/logits": 0.22026120126247406, "loss/reg": 0.0007717700791545212, "step": 4302 }, { "epoch": 0.537875, "grad_norm": 2.7693490982055664, "grad_norm_var": 0.28634966838738457, "learning_rate": 0.0001, "loss": 1.3289, "loss/crossentropy": 2.4397623538970947, "loss/hidden": 1.109375, "loss/logits": 0.2117895632982254, "loss/reg": 0.0007713590748608112, "step": 4303 }, { "epoch": 0.538, "grad_norm": 3.2464942932128906, "grad_norm_var": 0.2839712051998494, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.6888301372528076, "loss/hidden": 1.140625, "loss/logits": 0.18257087469100952, "loss/reg": 0.0007708348566666245, "step": 4304 }, { "epoch": 0.538125, "grad_norm": 3.3217384815216064, "grad_norm_var": 0.2650737182976542, "learning_rate": 0.0001, "loss": 1.5245, "loss/crossentropy": 2.5861594676971436, "loss/hidden": 1.265625, "loss/logits": 0.25114282965660095, "loss/reg": 0.0007703467272222042, "step": 4305 }, { "epoch": 0.53825, "grad_norm": 3.046551465988159, "grad_norm_var": 0.2649571916454761, "learning_rate": 0.0001, "loss": 1.3655, "loss/crossentropy": 2.5561413764953613, "loss/hidden": 1.15625, "loss/logits": 0.20158691704273224, "loss/reg": 0.0007698650006204844, "step": 4306 }, { "epoch": 0.538375, "grad_norm": 2.7440884113311768, "grad_norm_var": 0.26666684051190426, "learning_rate": 0.0001, "loss": 1.3073, "loss/crossentropy": 2.2194578647613525, "loss/hidden": 1.140625, "loss/logits": 0.15900185704231262, "loss/reg": 0.0007693655788898468, "step": 4307 }, { "epoch": 0.5385, "grad_norm": 2.2024686336517334, "grad_norm_var": 0.22529872258106234, "learning_rate": 0.0001, "loss": 1.273, "loss/crossentropy": 2.3604884147644043, "loss/hidden": 1.078125, "loss/logits": 0.18718411028385162, "loss/reg": 0.0007689274498261511, "step": 4308 }, { "epoch": 0.538625, "grad_norm": 3.0037615299224854, "grad_norm_var": 0.22221517864989646, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.527989149093628, "loss/hidden": 1.0625, "loss/logits": 0.17648792266845703, "loss/reg": 0.000768458703532815, "step": 4309 }, { "epoch": 0.53875, "grad_norm": 2.279090642929077, "grad_norm_var": 0.22676900426597893, "learning_rate": 0.0001, "loss": 1.1412, "loss/crossentropy": 2.3947389125823975, "loss/hidden": 0.97265625, "loss/logits": 0.16089776158332825, "loss/reg": 0.0007679633563384414, "step": 4310 }, { "epoch": 0.538875, "grad_norm": 2.90759015083313, "grad_norm_var": 0.21247679796319877, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.611877918243408, "loss/hidden": 1.1640625, "loss/logits": 0.19843456149101257, "loss/reg": 0.0007675315137021244, "step": 4311 }, { "epoch": 0.539, "grad_norm": 11.580940246582031, "grad_norm_var": 4.937671293731479, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.134258270263672, "loss/hidden": 1.328125, "loss/logits": 0.1996900737285614, "loss/reg": 0.0007670237100683153, "step": 4312 }, { "epoch": 0.539125, "grad_norm": 4.055321216583252, "grad_norm_var": 4.842185668571474, "learning_rate": 0.0001, "loss": 1.6801, "loss/crossentropy": 2.476127862930298, "loss/hidden": 1.3828125, "loss/logits": 0.28962916135787964, "loss/reg": 0.0007665000157430768, "step": 4313 }, { "epoch": 0.53925, "grad_norm": 2.5053234100341797, "grad_norm_var": 4.840163089935197, "learning_rate": 0.0001, "loss": 1.2651, "loss/crossentropy": 2.3013198375701904, "loss/hidden": 1.0859375, "loss/logits": 0.17149397730827332, "loss/reg": 0.0007659852271899581, "step": 4314 }, { "epoch": 0.539375, "grad_norm": 2.2333312034606934, "grad_norm_var": 4.9432165684169185, "learning_rate": 0.0001, "loss": 1.2137, "loss/crossentropy": 2.2686450481414795, "loss/hidden": 1.03125, "loss/logits": 0.1747530847787857, "loss/reg": 0.0007654638029634953, "step": 4315 }, { "epoch": 0.5395, "grad_norm": 2.7128145694732666, "grad_norm_var": 4.975222867485366, "learning_rate": 0.0001, "loss": 1.1881, "loss/crossentropy": 2.0162811279296875, "loss/hidden": 1.03125, "loss/logits": 0.14917482435703278, "loss/reg": 0.0007649431936442852, "step": 4316 }, { "epoch": 0.539625, "grad_norm": 2.2773075103759766, "grad_norm_var": 5.0310478666975085, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.5373289585113525, "loss/hidden": 1.0078125, "loss/logits": 0.15805523097515106, "loss/reg": 0.000764415948651731, "step": 4317 }, { "epoch": 0.53975, "grad_norm": 2.229097366333008, "grad_norm_var": 5.10612367227909, "learning_rate": 0.0001, "loss": 1.1635, "loss/crossentropy": 2.3915274143218994, "loss/hidden": 1.0078125, "loss/logits": 0.14803923666477203, "loss/reg": 0.0007638537208549678, "step": 4318 }, { "epoch": 0.539875, "grad_norm": 3.097587823867798, "grad_norm_var": 5.08877107980457, "learning_rate": 0.0001, "loss": 1.2672, "loss/crossentropy": 2.540708065032959, "loss/hidden": 1.0546875, "loss/logits": 0.204829141497612, "loss/reg": 0.0007634417852386832, "step": 4319 }, { "epoch": 0.54, "grad_norm": 3.1548852920532227, "grad_norm_var": 5.090440398699323, "learning_rate": 0.0001, "loss": 1.5799, "loss/crossentropy": 2.359738826751709, "loss/hidden": 1.3046875, "loss/logits": 0.2676249146461487, "loss/reg": 0.0007629120955243707, "step": 4320 }, { "epoch": 0.540125, "grad_norm": 2.855215311050415, "grad_norm_var": 5.104836549640851, "learning_rate": 0.0001, "loss": 1.2797, "loss/crossentropy": 2.461088180541992, "loss/hidden": 1.078125, "loss/logits": 0.19396589696407318, "loss/reg": 0.00076239526970312, "step": 4321 }, { "epoch": 0.54025, "grad_norm": 7.446508407592773, "grad_norm_var": 6.162994137967568, "learning_rate": 0.0001, "loss": 1.539, "loss/crossentropy": 2.440758466720581, "loss/hidden": 1.28125, "loss/logits": 0.25010091066360474, "loss/reg": 0.0007618907256983221, "step": 4322 }, { "epoch": 0.540375, "grad_norm": 2.38568115234375, "grad_norm_var": 6.210984785084647, "learning_rate": 0.0001, "loss": 1.2979, "loss/crossentropy": 2.6126620769500732, "loss/hidden": 1.109375, "loss/logits": 0.18094149231910706, "loss/reg": 0.0007613736670464277, "step": 4323 }, { "epoch": 0.5405, "grad_norm": 3.0249383449554443, "grad_norm_var": 6.104619551114958, "learning_rate": 0.0001, "loss": 1.1903, "loss/crossentropy": 2.743725061416626, "loss/hidden": 0.99609375, "loss/logits": 0.1866292953491211, "loss/reg": 0.0007609627209603786, "step": 4324 }, { "epoch": 0.540625, "grad_norm": 3.2475051879882812, "grad_norm_var": 6.088652041659959, "learning_rate": 0.0001, "loss": 1.5275, "loss/crossentropy": 2.603980541229248, "loss/hidden": 1.234375, "loss/logits": 0.28555768728256226, "loss/reg": 0.0007605199934914708, "step": 4325 }, { "epoch": 0.54075, "grad_norm": 2.5935559272766113, "grad_norm_var": 6.038418314379303, "learning_rate": 0.0001, "loss": 1.4282, "loss/crossentropy": 2.723193407058716, "loss/hidden": 1.1875, "loss/logits": 0.23311856389045715, "loss/reg": 0.0007600217359140515, "step": 4326 }, { "epoch": 0.540875, "grad_norm": 2.897343397140503, "grad_norm_var": 6.039431292341474, "learning_rate": 0.0001, "loss": 1.1571, "loss/crossentropy": 2.921309232711792, "loss/hidden": 0.984375, "loss/logits": 0.16515156626701355, "loss/reg": 0.0007596341311000288, "step": 4327 }, { "epoch": 0.541, "grad_norm": 2.892134428024292, "grad_norm_var": 1.5624050621053727, "learning_rate": 0.0001, "loss": 1.2862, "loss/crossentropy": 2.451531410217285, "loss/hidden": 1.09375, "loss/logits": 0.18485133349895477, "loss/reg": 0.0007591514149680734, "step": 4328 }, { "epoch": 0.541125, "grad_norm": 2.2519259452819824, "grad_norm_var": 1.53608865018876, "learning_rate": 0.0001, "loss": 1.155, "loss/crossentropy": 2.290280342102051, "loss/hidden": 0.97265625, "loss/logits": 0.1747356653213501, "loss/reg": 0.0007587597356177866, "step": 4329 }, { "epoch": 0.54125, "grad_norm": 4.300634860992432, "grad_norm_var": 1.6220370190444744, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.4645400047302246, "loss/hidden": 1.03125, "loss/logits": 0.1525074541568756, "loss/reg": 0.0007583763217553496, "step": 4330 }, { "epoch": 0.541375, "grad_norm": 6.763911724090576, "grad_norm_var": 2.3813693530075892, "learning_rate": 0.0001, "loss": 1.3994, "loss/crossentropy": 2.630692958831787, "loss/hidden": 1.1953125, "loss/logits": 0.196555495262146, "loss/reg": 0.0007579340017400682, "step": 4331 }, { "epoch": 0.5415, "grad_norm": 3.643700122833252, "grad_norm_var": 2.3523228342191693, "learning_rate": 0.0001, "loss": 1.4784, "loss/crossentropy": 2.4097483158111572, "loss/hidden": 1.25, "loss/logits": 0.22083237767219543, "loss/reg": 0.0007575504714623094, "step": 4332 }, { "epoch": 0.541625, "grad_norm": 46.508567810058594, "grad_norm_var": 117.76253221539528, "learning_rate": 0.0001, "loss": 1.2694, "loss/crossentropy": 2.5145394802093506, "loss/hidden": 1.078125, "loss/logits": 0.18369576334953308, "loss/reg": 0.0007571564055979252, "step": 4333 }, { "epoch": 0.54175, "grad_norm": 2.810734987258911, "grad_norm_var": 117.47527422397124, "learning_rate": 0.0001, "loss": 1.2933, "loss/crossentropy": 2.6343157291412354, "loss/hidden": 1.109375, "loss/logits": 0.17638254165649414, "loss/reg": 0.0007567562279291451, "step": 4334 }, { "epoch": 0.541875, "grad_norm": 2.6816418170928955, "grad_norm_var": 117.66048465351336, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.554258108139038, "loss/hidden": 1.1328125, "loss/logits": 0.1923825591802597, "loss/reg": 0.0007563745020888746, "step": 4335 }, { "epoch": 0.542, "grad_norm": 2.8546464443206787, "grad_norm_var": 117.78866790509281, "learning_rate": 0.0001, "loss": 1.4719, "loss/crossentropy": 2.459660530090332, "loss/hidden": 1.203125, "loss/logits": 0.2612346410751343, "loss/reg": 0.0007560297381132841, "step": 4336 }, { "epoch": 0.542125, "grad_norm": 2.348065137863159, "grad_norm_var": 118.03074263117259, "learning_rate": 0.0001, "loss": 1.1329, "loss/crossentropy": 2.474038600921631, "loss/hidden": 0.9609375, "loss/logits": 0.16436153650283813, "loss/reg": 0.0007556190830655396, "step": 4337 }, { "epoch": 0.54225, "grad_norm": 2.2598190307617188, "grad_norm_var": 118.82636053187277, "learning_rate": 0.0001, "loss": 1.1165, "loss/crossentropy": 2.681725025177002, "loss/hidden": 0.9453125, "loss/logits": 0.1636652648448944, "loss/reg": 0.0007552086026407778, "step": 4338 }, { "epoch": 0.542375, "grad_norm": 2.5828845500946045, "grad_norm_var": 118.73792321805293, "learning_rate": 0.0001, "loss": 1.3746, "loss/crossentropy": 2.913975715637207, "loss/hidden": 1.140625, "loss/logits": 0.22644928097724915, "loss/reg": 0.0007548373541794717, "step": 4339 }, { "epoch": 0.5425, "grad_norm": 1.9523999691009521, "grad_norm_var": 119.21437188209269, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.52162766456604, "loss/hidden": 0.9453125, "loss/logits": 0.16600669920444489, "loss/reg": 0.0007545000407844782, "step": 4340 }, { "epoch": 0.542625, "grad_norm": 2.6731326580047607, "grad_norm_var": 119.42946090786054, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.3980965614318848, "loss/hidden": 0.93359375, "loss/logits": 0.13740120828151703, "loss/reg": 0.000754157139454037, "step": 4341 }, { "epoch": 0.54275, "grad_norm": 2.7612907886505127, "grad_norm_var": 119.36060547817306, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.646284580230713, "loss/hidden": 1.1171875, "loss/logits": 0.22005993127822876, "loss/reg": 0.0007537505589425564, "step": 4342 }, { "epoch": 0.542875, "grad_norm": 3.4855546951293945, "grad_norm_var": 119.15760515869381, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.270911455154419, "loss/hidden": 1.203125, "loss/logits": 0.19420970976352692, "loss/reg": 0.0007533322204835713, "step": 4343 }, { "epoch": 0.543, "grad_norm": 2.753962516784668, "grad_norm_var": 119.21233641540502, "learning_rate": 0.0001, "loss": 1.222, "loss/crossentropy": 2.4909310340881348, "loss/hidden": 1.03125, "loss/logits": 0.1831943690776825, "loss/reg": 0.0007528857095167041, "step": 4344 }, { "epoch": 0.543125, "grad_norm": 2.5998406410217285, "grad_norm_var": 119.05579597375547, "learning_rate": 0.0001, "loss": 1.4223, "loss/crossentropy": 2.6963372230529785, "loss/hidden": 1.203125, "loss/logits": 0.21167340874671936, "loss/reg": 0.000752482155803591, "step": 4345 }, { "epoch": 0.54325, "grad_norm": 2.421782970428467, "grad_norm_var": 119.65486819901125, "learning_rate": 0.0001, "loss": 1.314, "loss/crossentropy": 2.3374550342559814, "loss/hidden": 1.1328125, "loss/logits": 0.17368707060813904, "loss/reg": 0.000752086634747684, "step": 4346 }, { "epoch": 0.543375, "grad_norm": 2.323046922683716, "grad_norm_var": 120.25386071379154, "learning_rate": 0.0001, "loss": 1.3267, "loss/crossentropy": 2.3909287452697754, "loss/hidden": 1.125, "loss/logits": 0.19416093826293945, "loss/reg": 0.0007516910554841161, "step": 4347 }, { "epoch": 0.5435, "grad_norm": 3.2946829795837402, "grad_norm_var": 120.34396384526222, "learning_rate": 0.0001, "loss": 1.4768, "loss/crossentropy": 2.676945924758911, "loss/hidden": 1.25, "loss/logits": 0.21932925283908844, "loss/reg": 0.0007512561860494316, "step": 4348 }, { "epoch": 0.543625, "grad_norm": 2.1675515174865723, "grad_norm_var": 0.15490155223408783, "learning_rate": 0.0001, "loss": 1.1643, "loss/crossentropy": 2.6508641242980957, "loss/hidden": 0.984375, "loss/logits": 0.17243745923042297, "loss/reg": 0.0007508498383685946, "step": 4349 }, { "epoch": 0.54375, "grad_norm": 5.606265068054199, "grad_norm_var": 0.7132434050235276, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.5408432483673096, "loss/hidden": 1.15625, "loss/logits": 0.21489331126213074, "loss/reg": 0.000750451406929642, "step": 4350 }, { "epoch": 0.543875, "grad_norm": 3.6370832920074463, "grad_norm_var": 0.7554859601873346, "learning_rate": 0.0001, "loss": 1.1813, "loss/crossentropy": 2.436256170272827, "loss/hidden": 1.03125, "loss/logits": 0.14250212907791138, "loss/reg": 0.0007500721840187907, "step": 4351 }, { "epoch": 0.544, "grad_norm": 2.3157546520233154, "grad_norm_var": 0.7738502900537384, "learning_rate": 0.0001, "loss": 1.2124, "loss/crossentropy": 2.606243133544922, "loss/hidden": 1.0234375, "loss/logits": 0.18150916695594788, "loss/reg": 0.0007496618200093508, "step": 4352 }, { "epoch": 0.544125, "grad_norm": 2.1209120750427246, "grad_norm_var": 0.7914882013302055, "learning_rate": 0.0001, "loss": 1.1529, "loss/crossentropy": 2.618820905685425, "loss/hidden": 0.984375, "loss/logits": 0.16099025309085846, "loss/reg": 0.000749309197999537, "step": 4353 }, { "epoch": 0.54425, "grad_norm": 2.439145088195801, "grad_norm_var": 0.7803491923093283, "learning_rate": 0.0001, "loss": 1.3012, "loss/crossentropy": 2.201183795928955, "loss/hidden": 1.1171875, "loss/logits": 0.17653435468673706, "loss/reg": 0.0007489135605283082, "step": 4354 }, { "epoch": 0.544375, "grad_norm": 2.1764965057373047, "grad_norm_var": 0.8035710447462245, "learning_rate": 0.0001, "loss": 1.2579, "loss/crossentropy": 2.5112714767456055, "loss/hidden": 1.0390625, "loss/logits": 0.21139644086360931, "loss/reg": 0.0007485465030185878, "step": 4355 }, { "epoch": 0.5445, "grad_norm": 2.9138312339782715, "grad_norm_var": 0.7532579978653546, "learning_rate": 0.0001, "loss": 1.5236, "loss/crossentropy": 2.5480830669403076, "loss/hidden": 1.296875, "loss/logits": 0.21919548511505127, "loss/reg": 0.0007481864304281771, "step": 4356 }, { "epoch": 0.544625, "grad_norm": 2.2861311435699463, "grad_norm_var": 0.7720363512026682, "learning_rate": 0.0001, "loss": 1.2511, "loss/crossentropy": 2.5102345943450928, "loss/hidden": 1.0546875, "loss/logits": 0.18896307051181793, "loss/reg": 0.0007478368934243917, "step": 4357 }, { "epoch": 0.54475, "grad_norm": 2.2380564212799072, "grad_norm_var": 0.7940424429298254, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.3923065662384033, "loss/hidden": 0.8828125, "loss/logits": 0.12830883264541626, "loss/reg": 0.0007474982994608581, "step": 4358 }, { "epoch": 0.544875, "grad_norm": 2.7129838466644287, "grad_norm_var": 0.7605998078502192, "learning_rate": 0.0001, "loss": 1.3142, "loss/crossentropy": 2.93705415725708, "loss/hidden": 1.09375, "loss/logits": 0.21295228600502014, "loss/reg": 0.0007470863638445735, "step": 4359 }, { "epoch": 0.545, "grad_norm": 2.5997416973114014, "grad_norm_var": 0.7620145046675624, "learning_rate": 0.0001, "loss": 1.2201, "loss/crossentropy": 2.5305230617523193, "loss/hidden": 1.0390625, "loss/logits": 0.17359837889671326, "loss/reg": 0.0007466737297363579, "step": 4360 }, { "epoch": 0.545125, "grad_norm": 6.204303741455078, "grad_norm_var": 1.5062645699300823, "learning_rate": 0.0001, "loss": 1.4508, "loss/crossentropy": 2.626833200454712, "loss/hidden": 1.2109375, "loss/logits": 0.23235756158828735, "loss/reg": 0.0007462548674084246, "step": 4361 }, { "epoch": 0.54525, "grad_norm": 3.6881422996520996, "grad_norm_var": 1.5145851136254007, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.4094834327697754, "loss/hidden": 1.1171875, "loss/logits": 0.17690260708332062, "loss/reg": 0.0007458595209755003, "step": 4362 }, { "epoch": 0.545375, "grad_norm": 4.741880416870117, "grad_norm_var": 1.6473362084008376, "learning_rate": 0.0001, "loss": 1.29, "loss/crossentropy": 2.5024352073669434, "loss/hidden": 1.125, "loss/logits": 0.15753507614135742, "loss/reg": 0.0007454573060385883, "step": 4363 }, { "epoch": 0.5455, "grad_norm": 2.6446900367736816, "grad_norm_var": 1.6652271632762894, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.7741615772247314, "loss/hidden": 1.15625, "loss/logits": 0.21432414650917053, "loss/reg": 0.0007450245320796967, "step": 4364 }, { "epoch": 0.545625, "grad_norm": 2.6466031074523926, "grad_norm_var": 1.6164467058394762, "learning_rate": 0.0001, "loss": 1.2802, "loss/crossentropy": 2.5898568630218506, "loss/hidden": 1.0859375, "loss/logits": 0.18684102594852448, "loss/reg": 0.0007445854716934264, "step": 4365 }, { "epoch": 0.54575, "grad_norm": 2.8751447200775146, "grad_norm_var": 1.2012067618112645, "learning_rate": 0.0001, "loss": 1.5081, "loss/crossentropy": 2.5583884716033936, "loss/hidden": 1.2578125, "loss/logits": 0.24287742376327515, "loss/reg": 0.0007441873895004392, "step": 4366 }, { "epoch": 0.545875, "grad_norm": 5.199769020080566, "grad_norm_var": 1.483435296787809, "learning_rate": 0.0001, "loss": 2.1743, "loss/crossentropy": 2.5770256519317627, "loss/hidden": 1.765625, "loss/logits": 0.40124043822288513, "loss/reg": 0.0007437560125254095, "step": 4367 }, { "epoch": 0.546, "grad_norm": 2.7233102321624756, "grad_norm_var": 1.4505087295766543, "learning_rate": 0.0001, "loss": 1.3998, "loss/crossentropy": 2.475400924682617, "loss/hidden": 1.1953125, "loss/logits": 0.1970546841621399, "loss/reg": 0.0007433612481690943, "step": 4368 }, { "epoch": 0.546125, "grad_norm": 2.9399712085723877, "grad_norm_var": 1.3813418826104715, "learning_rate": 0.0001, "loss": 1.3193, "loss/crossentropy": 2.8000640869140625, "loss/hidden": 1.125, "loss/logits": 0.18686392903327942, "loss/reg": 0.0007429345278069377, "step": 4369 }, { "epoch": 0.54625, "grad_norm": 2.2257113456726074, "grad_norm_var": 1.4055392793105432, "learning_rate": 0.0001, "loss": 1.2002, "loss/crossentropy": 2.391981363296509, "loss/hidden": 1.015625, "loss/logits": 0.17718388140201569, "loss/reg": 0.0007424785871990025, "step": 4370 }, { "epoch": 0.546375, "grad_norm": 1.9668281078338623, "grad_norm_var": 1.4362300778976647, "learning_rate": 0.0001, "loss": 1.212, "loss/crossentropy": 2.637223482131958, "loss/hidden": 1.0234375, "loss/logits": 0.18116462230682373, "loss/reg": 0.0007420408655889332, "step": 4371 }, { "epoch": 0.5465, "grad_norm": 2.9547719955444336, "grad_norm_var": 1.4349749900218047, "learning_rate": 0.0001, "loss": 1.2737, "loss/crossentropy": 2.529215097427368, "loss/hidden": 1.078125, "loss/logits": 0.1881544440984726, "loss/reg": 0.0007416480220854282, "step": 4372 }, { "epoch": 0.546625, "grad_norm": 4.388747692108154, "grad_norm_var": 1.464756504610443, "learning_rate": 0.0001, "loss": 1.722, "loss/crossentropy": 2.706779718399048, "loss/hidden": 1.4609375, "loss/logits": 0.25365954637527466, "loss/reg": 0.0007412211853079498, "step": 4373 }, { "epoch": 0.54675, "grad_norm": 4.015463352203369, "grad_norm_var": 1.4112683880823245, "learning_rate": 0.0001, "loss": 1.4952, "loss/crossentropy": 2.5657193660736084, "loss/hidden": 1.2578125, "loss/logits": 0.2300190031528473, "loss/reg": 0.0007407825323753059, "step": 4374 }, { "epoch": 0.546875, "grad_norm": 2.5494272708892822, "grad_norm_var": 1.428096990750304, "learning_rate": 0.0001, "loss": 1.2616, "loss/crossentropy": 2.4646568298339844, "loss/hidden": 1.078125, "loss/logits": 0.17603391408920288, "loss/reg": 0.0007403844501823187, "step": 4375 }, { "epoch": 0.547, "grad_norm": 2.7332448959350586, "grad_norm_var": 1.415005483417343, "learning_rate": 0.0001, "loss": 1.3892, "loss/crossentropy": 2.112539291381836, "loss/hidden": 1.1875, "loss/logits": 0.19429053366184235, "loss/reg": 0.0007399663445539773, "step": 4376 }, { "epoch": 0.547125, "grad_norm": 2.80497670173645, "grad_norm_var": 0.8689631500699988, "learning_rate": 0.0001, "loss": 1.3969, "loss/crossentropy": 2.2105605602264404, "loss/hidden": 1.1328125, "loss/logits": 0.2566582262516022, "loss/reg": 0.0007395715219900012, "step": 4377 }, { "epoch": 0.54725, "grad_norm": 2.3917407989501953, "grad_norm_var": 0.8885325129149871, "learning_rate": 0.0001, "loss": 1.0649, "loss/crossentropy": 2.528042793273926, "loss/hidden": 0.91015625, "loss/logits": 0.14737442135810852, "loss/reg": 0.0007391749531961977, "step": 4378 }, { "epoch": 0.547375, "grad_norm": 3.7689545154571533, "grad_norm_var": 0.736343701652716, "learning_rate": 0.0001, "loss": 1.3021, "loss/crossentropy": 2.904555320739746, "loss/hidden": 1.1171875, "loss/logits": 0.17752718925476074, "loss/reg": 0.0007387436926364899, "step": 4379 }, { "epoch": 0.5475, "grad_norm": 3.8795244693756104, "grad_norm_var": 0.7646105418818014, "learning_rate": 0.0001, "loss": 1.5341, "loss/crossentropy": 2.646824836730957, "loss/hidden": 1.2578125, "loss/logits": 0.2688699960708618, "loss/reg": 0.0007383546326309443, "step": 4380 }, { "epoch": 0.547625, "grad_norm": 2.625248908996582, "grad_norm_var": 0.7660125689009419, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.618689775466919, "loss/hidden": 0.98046875, "loss/logits": 0.15322041511535645, "loss/reg": 0.0007379245362244546, "step": 4381 }, { "epoch": 0.54775, "grad_norm": 4.357396125793457, "grad_norm_var": 0.8534205122065692, "learning_rate": 0.0001, "loss": 1.6496, "loss/crossentropy": 2.68809175491333, "loss/hidden": 1.3515625, "loss/logits": 0.2906687259674072, "loss/reg": 0.0007374860579147935, "step": 4382 }, { "epoch": 0.547875, "grad_norm": 2.564690351486206, "grad_norm_var": 0.5919300689206579, "learning_rate": 0.0001, "loss": 1.2377, "loss/crossentropy": 2.4104769229888916, "loss/hidden": 1.0625, "loss/logits": 0.16779373586177826, "loss/reg": 0.0007370863459073007, "step": 4383 }, { "epoch": 0.548, "grad_norm": 2.892347574234009, "grad_norm_var": 0.58622609511683, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.3523359298706055, "loss/hidden": 1.203125, "loss/logits": 0.2113942950963974, "loss/reg": 0.0007366937352344394, "step": 4384 }, { "epoch": 0.548125, "grad_norm": 2.8357341289520264, "grad_norm_var": 0.588659409043017, "learning_rate": 0.0001, "loss": 1.3924, "loss/crossentropy": 2.6699700355529785, "loss/hidden": 1.15625, "loss/logits": 0.22879590094089508, "loss/reg": 0.0007362795295193791, "step": 4385 }, { "epoch": 0.54825, "grad_norm": 5.1949143409729, "grad_norm_var": 0.8095086229301615, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.850984811782837, "loss/hidden": 1.140625, "loss/logits": 0.175614595413208, "loss/reg": 0.0007358610164374113, "step": 4386 }, { "epoch": 0.548375, "grad_norm": 2.2886297702789307, "grad_norm_var": 0.76112775682496, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.3537306785583496, "loss/hidden": 1.140625, "loss/logits": 0.20592449605464935, "loss/reg": 0.0007354711997322738, "step": 4387 }, { "epoch": 0.5485, "grad_norm": 2.9125101566314697, "grad_norm_var": 0.7629895404205075, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.7100188732147217, "loss/hidden": 1.171875, "loss/logits": 0.2514075040817261, "loss/reg": 0.0007350845844484866, "step": 4388 }, { "epoch": 0.548625, "grad_norm": 2.1317789554595947, "grad_norm_var": 0.7425047938493274, "learning_rate": 0.0001, "loss": 1.1482, "loss/crossentropy": 2.527949333190918, "loss/hidden": 0.98046875, "loss/logits": 0.16035214066505432, "loss/reg": 0.0007347051869146526, "step": 4389 }, { "epoch": 0.54875, "grad_norm": 2.597123861312866, "grad_norm_var": 0.6992066107235186, "learning_rate": 0.0001, "loss": 1.3401, "loss/crossentropy": 2.4790165424346924, "loss/hidden": 1.15625, "loss/logits": 0.17654214799404144, "loss/reg": 0.000734317465685308, "step": 4390 }, { "epoch": 0.548875, "grad_norm": 2.94423246383667, "grad_norm_var": 0.6834921548048366, "learning_rate": 0.0001, "loss": 1.4682, "loss/crossentropy": 2.702467441558838, "loss/hidden": 1.2109375, "loss/logits": 0.24990421533584595, "loss/reg": 0.0007338941795751452, "step": 4391 }, { "epoch": 0.549, "grad_norm": 2.745889663696289, "grad_norm_var": 0.6829551426953344, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.597860097885132, "loss/hidden": 0.99609375, "loss/logits": 0.16388574242591858, "loss/reg": 0.000733515596948564, "step": 4392 }, { "epoch": 0.549125, "grad_norm": 2.580049753189087, "grad_norm_var": 0.693719804701603, "learning_rate": 0.0001, "loss": 1.1899, "loss/crossentropy": 2.61126708984375, "loss/hidden": 1.015625, "loss/logits": 0.16690108180046082, "loss/reg": 0.0007331432425417006, "step": 4393 }, { "epoch": 0.54925, "grad_norm": 2.634711980819702, "grad_norm_var": 0.6762651008424956, "learning_rate": 0.0001, "loss": 1.2488, "loss/crossentropy": 2.5000343322753906, "loss/hidden": 1.0703125, "loss/logits": 0.17114189267158508, "loss/reg": 0.0007328322390094399, "step": 4394 }, { "epoch": 0.549375, "grad_norm": 2.4375531673431396, "grad_norm_var": 0.6611312268569992, "learning_rate": 0.0001, "loss": 1.3168, "loss/crossentropy": 2.4489033222198486, "loss/hidden": 1.1171875, "loss/logits": 0.192266047000885, "loss/reg": 0.0007325310143642128, "step": 4395 }, { "epoch": 0.5495, "grad_norm": 3.0454671382904053, "grad_norm_var": 0.6041746621116694, "learning_rate": 0.0001, "loss": 1.4668, "loss/crossentropy": 2.3400356769561768, "loss/hidden": 1.234375, "loss/logits": 0.22506557404994965, "loss/reg": 0.0007321545854210854, "step": 4396 }, { "epoch": 0.549625, "grad_norm": 2.3147783279418945, "grad_norm_var": 0.6225773534357811, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.480179786682129, "loss/hidden": 1.0234375, "loss/logits": 0.1825496107339859, "loss/reg": 0.0007317819981835783, "step": 4397 }, { "epoch": 0.54975, "grad_norm": 2.7174181938171387, "grad_norm_var": 0.4730565233425558, "learning_rate": 0.0001, "loss": 1.3743, "loss/crossentropy": 2.62204647064209, "loss/hidden": 1.125, "loss/logits": 0.24195396900177002, "loss/reg": 0.0007314448594115674, "step": 4398 }, { "epoch": 0.549875, "grad_norm": 2.262845516204834, "grad_norm_var": 0.4883163404776231, "learning_rate": 0.0001, "loss": 1.2198, "loss/crossentropy": 2.2600414752960205, "loss/hidden": 1.046875, "loss/logits": 0.1656586080789566, "loss/reg": 0.0007310616783797741, "step": 4399 }, { "epoch": 0.55, "grad_norm": 2.4234089851379395, "grad_norm_var": 0.49525453396929014, "learning_rate": 0.0001, "loss": 1.139, "loss/crossentropy": 2.7630972862243652, "loss/hidden": 0.98828125, "loss/logits": 0.14338788390159607, "loss/reg": 0.0007307111518457532, "step": 4400 }, { "epoch": 0.550125, "grad_norm": 2.6190402507781982, "grad_norm_var": 0.49583329519636526, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.4180099964141846, "loss/hidden": 1.1953125, "loss/logits": 0.21214328706264496, "loss/reg": 0.0007303257589228451, "step": 4401 }, { "epoch": 0.55025, "grad_norm": 3.2069180011749268, "grad_norm_var": 0.09229812891428538, "learning_rate": 0.0001, "loss": 1.2608, "loss/crossentropy": 2.357511520385742, "loss/hidden": 1.078125, "loss/logits": 0.17534323036670685, "loss/reg": 0.000729947118088603, "step": 4402 }, { "epoch": 0.550375, "grad_norm": 2.3635752201080322, "grad_norm_var": 0.08937388924869841, "learning_rate": 0.0001, "loss": 1.3068, "loss/crossentropy": 2.6714859008789062, "loss/hidden": 1.1015625, "loss/logits": 0.19798794388771057, "loss/reg": 0.0007296053227037191, "step": 4403 }, { "epoch": 0.5505, "grad_norm": 2.8807461261749268, "grad_norm_var": 0.08820268839478634, "learning_rate": 0.0001, "loss": 1.2273, "loss/crossentropy": 2.569603204727173, "loss/hidden": 1.03125, "loss/logits": 0.18877823650836945, "loss/reg": 0.0007292728987522423, "step": 4404 }, { "epoch": 0.550625, "grad_norm": 2.686673402786255, "grad_norm_var": 0.07139231850429345, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.605346441268921, "loss/hidden": 1.125, "loss/logits": 0.21081271767616272, "loss/reg": 0.0007288880296982825, "step": 4405 }, { "epoch": 0.55075, "grad_norm": 2.7356534004211426, "grad_norm_var": 0.0715453027600553, "learning_rate": 0.0001, "loss": 1.253, "loss/crossentropy": 2.2919907569885254, "loss/hidden": 1.078125, "loss/logits": 0.16754229366779327, "loss/reg": 0.0007285007741302252, "step": 4406 }, { "epoch": 0.550875, "grad_norm": 2.4898264408111572, "grad_norm_var": 0.06737721558547397, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.3306634426116943, "loss/hidden": 0.91015625, "loss/logits": 0.1303650438785553, "loss/reg": 0.0007280962890945375, "step": 4407 }, { "epoch": 0.551, "grad_norm": 2.3881101608276367, "grad_norm_var": 0.07004168070284228, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.2988457679748535, "loss/hidden": 1.109375, "loss/logits": 0.1984202265739441, "loss/reg": 0.0007277068216353655, "step": 4408 }, { "epoch": 0.551125, "grad_norm": 3.138016939163208, "grad_norm_var": 0.08714697328335413, "learning_rate": 0.0001, "loss": 1.3614, "loss/crossentropy": 2.6031341552734375, "loss/hidden": 1.1328125, "loss/logits": 0.22126799821853638, "loss/reg": 0.0007273079245351255, "step": 4409 }, { "epoch": 0.55125, "grad_norm": 2.23193621635437, "grad_norm_var": 0.09792179488424514, "learning_rate": 0.0001, "loss": 1.2806, "loss/crossentropy": 2.657900094985962, "loss/hidden": 1.0703125, "loss/logits": 0.20302695035934448, "loss/reg": 0.0007269393536262214, "step": 4410 }, { "epoch": 0.551375, "grad_norm": 2.4548845291137695, "grad_norm_var": 0.09751578872853746, "learning_rate": 0.0001, "loss": 1.1556, "loss/crossentropy": 2.4905598163604736, "loss/hidden": 0.99609375, "loss/logits": 0.15220150351524353, "loss/reg": 0.0007265664171427488, "step": 4411 }, { "epoch": 0.5515, "grad_norm": 3.536989688873291, "grad_norm_var": 0.14033802998242115, "learning_rate": 0.0001, "loss": 1.5634, "loss/crossentropy": 2.7158498764038086, "loss/hidden": 1.2734375, "loss/logits": 0.2826702296733856, "loss/reg": 0.0007261833525262773, "step": 4412 }, { "epoch": 0.551625, "grad_norm": 2.3942437171936035, "grad_norm_var": 0.13714724446464668, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.6964988708496094, "loss/hidden": 0.96484375, "loss/logits": 0.15927045047283173, "loss/reg": 0.0007257845136336982, "step": 4413 }, { "epoch": 0.55175, "grad_norm": 3.048090696334839, "grad_norm_var": 0.14659469035382788, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.8009352684020996, "loss/hidden": 1.09375, "loss/logits": 0.1909656971693039, "loss/reg": 0.000725442252587527, "step": 4414 }, { "epoch": 0.551875, "grad_norm": 2.7653911113739014, "grad_norm_var": 0.13450704881220096, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 2.8117945194244385, "loss/hidden": 1.2265625, "loss/logits": 0.21626122295856476, "loss/reg": 0.0007250680355355144, "step": 4415 }, { "epoch": 0.552, "grad_norm": 3.005983352661133, "grad_norm_var": 0.1334406786294115, "learning_rate": 0.0001, "loss": 1.2203, "loss/crossentropy": 2.4372506141662598, "loss/hidden": 1.03125, "loss/logits": 0.18183960020542145, "loss/reg": 0.0007247229805216193, "step": 4416 }, { "epoch": 0.552125, "grad_norm": 2.082475423812866, "grad_norm_var": 0.16056256153562648, "learning_rate": 0.0001, "loss": 1.0896, "loss/crossentropy": 2.4334847927093506, "loss/hidden": 0.9375, "loss/logits": 0.14489752054214478, "loss/reg": 0.00072433368768543, "step": 4417 }, { "epoch": 0.55225, "grad_norm": 2.9970510005950928, "grad_norm_var": 0.14949702471696186, "learning_rate": 0.0001, "loss": 1.3892, "loss/crossentropy": 2.5793426036834717, "loss/hidden": 1.171875, "loss/logits": 0.21007487177848816, "loss/reg": 0.0007239328697323799, "step": 4418 }, { "epoch": 0.552375, "grad_norm": 2.436577796936035, "grad_norm_var": 0.1465556751085309, "learning_rate": 0.0001, "loss": 1.1843, "loss/crossentropy": 2.43178391456604, "loss/hidden": 1.0078125, "loss/logits": 0.16929614543914795, "loss/reg": 0.0007235208177007735, "step": 4419 }, { "epoch": 0.5525, "grad_norm": 2.6770565509796143, "grad_norm_var": 0.14436326881370837, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.4889719486236572, "loss/hidden": 1.0703125, "loss/logits": 0.1747802495956421, "loss/reg": 0.0007231408380903304, "step": 4420 }, { "epoch": 0.552625, "grad_norm": 2.423301935195923, "grad_norm_var": 0.1488789306277622, "learning_rate": 0.0001, "loss": 1.2399, "loss/crossentropy": 2.5944621562957764, "loss/hidden": 1.0625, "loss/logits": 0.17019732296466827, "loss/reg": 0.0007227633032016456, "step": 4421 }, { "epoch": 0.55275, "grad_norm": 2.596342086791992, "grad_norm_var": 0.1489717693195774, "learning_rate": 0.0001, "loss": 1.2844, "loss/crossentropy": 2.5977489948272705, "loss/hidden": 1.078125, "loss/logits": 0.19903530180454254, "loss/reg": 0.0007224099826999009, "step": 4422 }, { "epoch": 0.552875, "grad_norm": 2.3730928897857666, "grad_norm_var": 0.1525754860139062, "learning_rate": 0.0001, "loss": 1.2593, "loss/crossentropy": 2.630218505859375, "loss/hidden": 1.0625, "loss/logits": 0.18953412771224976, "loss/reg": 0.0007220465340651572, "step": 4423 }, { "epoch": 0.553, "grad_norm": 2.756249189376831, "grad_norm_var": 0.14773219125271736, "learning_rate": 0.0001, "loss": 1.4465, "loss/crossentropy": 2.2383439540863037, "loss/hidden": 1.234375, "loss/logits": 0.2048693299293518, "loss/reg": 0.0007216517115011811, "step": 4424 }, { "epoch": 0.553125, "grad_norm": 2.292926549911499, "grad_norm_var": 0.14102492022778962, "learning_rate": 0.0001, "loss": 1.2199, "loss/crossentropy": 2.4103198051452637, "loss/hidden": 1.0390625, "loss/logits": 0.17361019551753998, "loss/reg": 0.0007212521159090102, "step": 4425 }, { "epoch": 0.55325, "grad_norm": 2.4419875144958496, "grad_norm_var": 0.1326469742484882, "learning_rate": 0.0001, "loss": 1.1136, "loss/crossentropy": 2.431978464126587, "loss/hidden": 0.94921875, "loss/logits": 0.15719175338745117, "loss/reg": 0.0007208869210444391, "step": 4426 }, { "epoch": 0.553375, "grad_norm": 9.341507911682129, "grad_norm_var": 2.924322476063366, "learning_rate": 0.0001, "loss": 1.7334, "loss/crossentropy": 2.3005919456481934, "loss/hidden": 1.4375, "loss/logits": 0.2886706590652466, "loss/reg": 0.000720529118552804, "step": 4427 }, { "epoch": 0.5535, "grad_norm": 3.2879059314727783, "grad_norm_var": 2.9127931373771228, "learning_rate": 0.0001, "loss": 1.397, "loss/crossentropy": 2.9036872386932373, "loss/hidden": 1.1640625, "loss/logits": 0.22578305006027222, "loss/reg": 0.0007201313273981214, "step": 4428 }, { "epoch": 0.553625, "grad_norm": 3.1648917198181152, "grad_norm_var": 2.8817589871297806, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.3142452239990234, "loss/hidden": 1.0078125, "loss/logits": 0.17651015520095825, "loss/reg": 0.0007197767263278365, "step": 4429 }, { "epoch": 0.55375, "grad_norm": 2.2138359546661377, "grad_norm_var": 2.9316633485677435, "learning_rate": 0.0001, "loss": 1.0778, "loss/crossentropy": 2.4297120571136475, "loss/hidden": 0.91796875, "loss/logits": 0.15264172852039337, "loss/reg": 0.0007194069330580533, "step": 4430 }, { "epoch": 0.553875, "grad_norm": 3.5252978801727295, "grad_norm_var": 2.938559386155513, "learning_rate": 0.0001, "loss": 1.3935, "loss/crossentropy": 2.7502458095550537, "loss/hidden": 1.1640625, "loss/logits": 0.22222906351089478, "loss/reg": 0.0007190591422840953, "step": 4431 }, { "epoch": 0.554, "grad_norm": 2.4556353092193604, "grad_norm_var": 2.9644640868151666, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.449117660522461, "loss/hidden": 1.0078125, "loss/logits": 0.20471951365470886, "loss/reg": 0.0007186695584096014, "step": 4432 }, { "epoch": 0.554125, "grad_norm": 3.0038387775421143, "grad_norm_var": 2.8966186999678367, "learning_rate": 0.0001, "loss": 1.4895, "loss/crossentropy": 2.571241855621338, "loss/hidden": 1.2421875, "loss/logits": 0.24015557765960693, "loss/reg": 0.0007182814297266304, "step": 4433 }, { "epoch": 0.55425, "grad_norm": 2.8465569019317627, "grad_norm_var": 2.9005859610970615, "learning_rate": 0.0001, "loss": 1.3191, "loss/crossentropy": 2.534473419189453, "loss/hidden": 1.125, "loss/logits": 0.18690824508666992, "loss/reg": 0.0007179208332672715, "step": 4434 }, { "epoch": 0.554375, "grad_norm": 5.156100273132324, "grad_norm_var": 3.116893570326425, "learning_rate": 0.0001, "loss": 1.5672, "loss/crossentropy": 2.5761921405792236, "loss/hidden": 1.265625, "loss/logits": 0.2943968176841736, "loss/reg": 0.0007175655919127166, "step": 4435 }, { "epoch": 0.5545, "grad_norm": 3.763174295425415, "grad_norm_var": 3.1026134799743126, "learning_rate": 0.0001, "loss": 1.2892, "loss/crossentropy": 2.3816068172454834, "loss/hidden": 1.0390625, "loss/logits": 0.2429327368736267, "loss/reg": 0.0007171974284574389, "step": 4436 }, { "epoch": 0.554625, "grad_norm": 2.550952911376953, "grad_norm_var": 3.0878140174331974, "learning_rate": 0.0001, "loss": 1.1369, "loss/crossentropy": 2.2032039165496826, "loss/hidden": 0.96484375, "loss/logits": 0.16484080255031586, "loss/reg": 0.0007168537704274058, "step": 4437 }, { "epoch": 0.55475, "grad_norm": 3.176501750946045, "grad_norm_var": 3.049728347383089, "learning_rate": 0.0001, "loss": 1.3064, "loss/crossentropy": 2.7543258666992188, "loss/hidden": 1.1171875, "loss/logits": 0.18205410242080688, "loss/reg": 0.0007164585986174643, "step": 4438 }, { "epoch": 0.554875, "grad_norm": 2.4984006881713867, "grad_norm_var": 3.033604198266418, "learning_rate": 0.0001, "loss": 1.2112, "loss/crossentropy": 2.621812105178833, "loss/hidden": 1.0234375, "loss/logits": 0.1806178241968155, "loss/reg": 0.0007160644163377583, "step": 4439 }, { "epoch": 0.555, "grad_norm": 2.825178861618042, "grad_norm_var": 3.0279411638701768, "learning_rate": 0.0001, "loss": 1.1588, "loss/crossentropy": 2.670276165008545, "loss/hidden": 0.984375, "loss/logits": 0.16722796857357025, "loss/reg": 0.000715632748324424, "step": 4440 }, { "epoch": 0.555125, "grad_norm": 2.5361783504486084, "grad_norm_var": 2.995439723356863, "learning_rate": 0.0001, "loss": 1.3052, "loss/crossentropy": 2.6851730346679688, "loss/hidden": 1.1171875, "loss/logits": 0.1808435320854187, "loss/reg": 0.0007152439211495221, "step": 4441 }, { "epoch": 0.55525, "grad_norm": 2.3619027137756348, "grad_norm_var": 3.0063291078502874, "learning_rate": 0.0001, "loss": 1.1454, "loss/crossentropy": 2.522737741470337, "loss/hidden": 0.95703125, "loss/logits": 0.18118974566459656, "loss/reg": 0.0007148317527025938, "step": 4442 }, { "epoch": 0.555375, "grad_norm": 2.5747017860412598, "grad_norm_var": 0.5248724824146827, "learning_rate": 0.0001, "loss": 1.0713, "loss/crossentropy": 2.7740402221679688, "loss/hidden": 0.90625, "loss/logits": 0.15791814029216766, "loss/reg": 0.0007144423434510827, "step": 4443 }, { "epoch": 0.5555, "grad_norm": 4.2368621826171875, "grad_norm_var": 0.6180490154048272, "learning_rate": 0.0001, "loss": 2.0781, "loss/crossentropy": 2.110393762588501, "loss/hidden": 1.6640625, "loss/logits": 0.40686553716659546, "loss/reg": 0.0007140348898246884, "step": 4444 }, { "epoch": 0.555625, "grad_norm": 2.3225057125091553, "grad_norm_var": 0.6501273405790393, "learning_rate": 0.0001, "loss": 1.2226, "loss/crossentropy": 2.8040566444396973, "loss/hidden": 1.0390625, "loss/logits": 0.17641709744930267, "loss/reg": 0.0007136180647648871, "step": 4445 }, { "epoch": 0.55575, "grad_norm": 2.9209086894989014, "grad_norm_var": 0.6069770274830771, "learning_rate": 0.0001, "loss": 1.1908, "loss/crossentropy": 2.457697629928589, "loss/hidden": 1.03125, "loss/logits": 0.15240362286567688, "loss/reg": 0.000713236746378243, "step": 4446 }, { "epoch": 0.555875, "grad_norm": 4.244624614715576, "grad_norm_var": 0.6851739548523407, "learning_rate": 0.0001, "loss": 1.4625, "loss/crossentropy": 2.8965249061584473, "loss/hidden": 1.1875, "loss/logits": 0.26790234446525574, "loss/reg": 0.0007128501310944557, "step": 4447 }, { "epoch": 0.556, "grad_norm": 2.3939390182495117, "grad_norm_var": 0.6906477428733844, "learning_rate": 0.0001, "loss": 1.2772, "loss/crossentropy": 2.6697821617126465, "loss/hidden": 1.0625, "loss/logits": 0.20759887993335724, "loss/reg": 0.0007124625844880939, "step": 4448 }, { "epoch": 0.556125, "grad_norm": 2.305347442626953, "grad_norm_var": 0.7290041843892965, "learning_rate": 0.0001, "loss": 1.0642, "loss/crossentropy": 2.915950059890747, "loss/hidden": 0.91796875, "loss/logits": 0.1391080915927887, "loss/reg": 0.0007120901136659086, "step": 4449 }, { "epoch": 0.55625, "grad_norm": 4.246524810791016, "grad_norm_var": 0.8145286153989936, "learning_rate": 0.0001, "loss": 1.6257, "loss/crossentropy": 1.8663389682769775, "loss/hidden": 1.40625, "loss/logits": 0.21235673129558563, "loss/reg": 0.0007117219502106309, "step": 4450 }, { "epoch": 0.556375, "grad_norm": 2.5905065536499023, "grad_norm_var": 0.5335566063978184, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.5194008350372314, "loss/hidden": 0.9609375, "loss/logits": 0.15946415066719055, "loss/reg": 0.0007113807951100171, "step": 4451 }, { "epoch": 0.5565, "grad_norm": 2.542907238006592, "grad_norm_var": 0.4978579436484239, "learning_rate": 0.0001, "loss": 1.3796, "loss/crossentropy": 2.6060867309570312, "loss/hidden": 1.15625, "loss/logits": 0.2161937654018402, "loss/reg": 0.0007110003498382866, "step": 4452 }, { "epoch": 0.556625, "grad_norm": 2.3545022010803223, "grad_norm_var": 0.5092947753370481, "learning_rate": 0.0001, "loss": 1.3225, "loss/crossentropy": 2.142002820968628, "loss/hidden": 1.1171875, "loss/logits": 0.19818739593029022, "loss/reg": 0.0007106105331331491, "step": 4453 }, { "epoch": 0.55675, "grad_norm": 2.281297206878662, "grad_norm_var": 0.524375232039256, "learning_rate": 0.0001, "loss": 1.1868, "loss/crossentropy": 2.4126434326171875, "loss/hidden": 1.0078125, "loss/logits": 0.17187075316905975, "loss/reg": 0.0007102314848452806, "step": 4454 }, { "epoch": 0.556875, "grad_norm": 2.39570689201355, "grad_norm_var": 0.5295373757765103, "learning_rate": 0.0001, "loss": 1.4735, "loss/crossentropy": 2.071341037750244, "loss/hidden": 1.2578125, "loss/logits": 0.20862364768981934, "loss/reg": 0.0007098378846421838, "step": 4455 }, { "epoch": 0.557, "grad_norm": 2.581576108932495, "grad_norm_var": 0.5331056549592516, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.4882733821868896, "loss/hidden": 1.15625, "loss/logits": 0.2307380735874176, "loss/reg": 0.0007094437605701387, "step": 4456 }, { "epoch": 0.557125, "grad_norm": 2.3608651161193848, "grad_norm_var": 0.5413249050212537, "learning_rate": 0.0001, "loss": 1.2377, "loss/crossentropy": 2.4816031455993652, "loss/hidden": 1.046875, "loss/logits": 0.1836848258972168, "loss/reg": 0.000709027866832912, "step": 4457 }, { "epoch": 0.55725, "grad_norm": 2.264343500137329, "grad_norm_var": 0.5475491251039517, "learning_rate": 0.0001, "loss": 1.0656, "loss/crossentropy": 2.563610315322876, "loss/hidden": 0.90625, "loss/logits": 0.1522538661956787, "loss/reg": 0.0007086544646881521, "step": 4458 }, { "epoch": 0.557375, "grad_norm": 2.7943694591522217, "grad_norm_var": 0.5443010036573914, "learning_rate": 0.0001, "loss": 1.3396, "loss/crossentropy": 2.6027591228485107, "loss/hidden": 1.1328125, "loss/logits": 0.19968228042125702, "loss/reg": 0.0007082566153258085, "step": 4459 }, { "epoch": 0.5575, "grad_norm": 3.234349012374878, "grad_norm_var": 0.41535976975578115, "learning_rate": 0.0001, "loss": 1.3747, "loss/crossentropy": 2.4440813064575195, "loss/hidden": 1.1171875, "loss/logits": 0.2504076361656189, "loss/reg": 0.0007078528869897127, "step": 4460 }, { "epoch": 0.557625, "grad_norm": 2.2319788932800293, "grad_norm_var": 0.42090690117995183, "learning_rate": 0.0001, "loss": 1.2656, "loss/crossentropy": 2.6738274097442627, "loss/hidden": 1.0859375, "loss/logits": 0.1726263463497162, "loss/reg": 0.0007074872264638543, "step": 4461 }, { "epoch": 0.55775, "grad_norm": 2.5839266777038574, "grad_norm_var": 0.4196055126344274, "learning_rate": 0.0001, "loss": 1.211, "loss/crossentropy": 2.709434986114502, "loss/hidden": 1.03125, "loss/logits": 0.1726827174425125, "loss/reg": 0.0007071259897202253, "step": 4462 }, { "epoch": 0.557875, "grad_norm": 2.6009812355041504, "grad_norm_var": 0.2527770280429931, "learning_rate": 0.0001, "loss": 1.1408, "loss/crossentropy": 2.5537116527557373, "loss/hidden": 0.9765625, "loss/logits": 0.15719664096832275, "loss/reg": 0.0007067382684908807, "step": 4463 }, { "epoch": 0.558, "grad_norm": 2.1290831565856934, "grad_norm_var": 0.26479820886001043, "learning_rate": 0.0001, "loss": 1.1716, "loss/crossentropy": 2.482107400894165, "loss/hidden": 1.0, "loss/logits": 0.1645759493112564, "loss/reg": 0.0007063481607474387, "step": 4464 }, { "epoch": 0.558125, "grad_norm": 2.399453639984131, "grad_norm_var": 0.261734338542364, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.4497551918029785, "loss/hidden": 1.1015625, "loss/logits": 0.19288308918476105, "loss/reg": 0.0007059741183184087, "step": 4465 }, { "epoch": 0.55825, "grad_norm": 3.977930784225464, "grad_norm_var": 0.20725995569027778, "learning_rate": 0.0001, "loss": 1.5732, "loss/crossentropy": 2.412830114364624, "loss/hidden": 1.3125, "loss/logits": 0.25360560417175293, "loss/reg": 0.0007056187023408711, "step": 4466 }, { "epoch": 0.558375, "grad_norm": 2.544750213623047, "grad_norm_var": 0.20734340204899032, "learning_rate": 0.0001, "loss": 1.2497, "loss/crossentropy": 2.7347474098205566, "loss/hidden": 1.0625, "loss/logits": 0.18010343611240387, "loss/reg": 0.0007052186992950737, "step": 4467 }, { "epoch": 0.5585, "grad_norm": 5.156840801239014, "grad_norm_var": 0.621499309518048, "learning_rate": 0.0001, "loss": 2.2266, "loss/crossentropy": 2.5846238136291504, "loss/hidden": 1.765625, "loss/logits": 0.45397210121154785, "loss/reg": 0.0007048616535030305, "step": 4468 }, { "epoch": 0.558625, "grad_norm": 2.6567881107330322, "grad_norm_var": 0.6115420737255929, "learning_rate": 0.0001, "loss": 1.2935, "loss/crossentropy": 2.4462661743164062, "loss/hidden": 1.078125, "loss/logits": 0.2083076685667038, "loss/reg": 0.000704514910466969, "step": 4469 }, { "epoch": 0.55875, "grad_norm": 2.991745948791504, "grad_norm_var": 0.5975396030452466, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.590317487716675, "loss/hidden": 1.109375, "loss/logits": 0.17610308527946472, "loss/reg": 0.0007041269564069808, "step": 4470 }, { "epoch": 0.558875, "grad_norm": 6.560878753662109, "grad_norm_var": 1.4536701894834587, "learning_rate": 0.0001, "loss": 1.5945, "loss/crossentropy": 2.709332227706909, "loss/hidden": 1.234375, "loss/logits": 0.353103369474411, "loss/reg": 0.0007037417381070554, "step": 4471 }, { "epoch": 0.559, "grad_norm": 3.0423128604888916, "grad_norm_var": 1.437125447447237, "learning_rate": 0.0001, "loss": 1.3324, "loss/crossentropy": 2.6047189235687256, "loss/hidden": 1.078125, "loss/logits": 0.24722535908222198, "loss/reg": 0.0007033674628473818, "step": 4472 }, { "epoch": 0.559125, "grad_norm": 3.23799204826355, "grad_norm_var": 1.3992751959706895, "learning_rate": 0.0001, "loss": 1.3873, "loss/crossentropy": 2.8116791248321533, "loss/hidden": 1.140625, "loss/logits": 0.23962843418121338, "loss/reg": 0.0007029934204183519, "step": 4473 }, { "epoch": 0.55925, "grad_norm": 3.1605682373046875, "grad_norm_var": 1.3435857059995509, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.3760900497436523, "loss/hidden": 1.296875, "loss/logits": 0.252657949924469, "loss/reg": 0.0007026180392131209, "step": 4474 }, { "epoch": 0.559375, "grad_norm": 3.3776769638061523, "grad_norm_var": 1.3327982493516417, "learning_rate": 0.0001, "loss": 1.966, "loss/crossentropy": 2.483804941177368, "loss/hidden": 1.59375, "loss/logits": 0.3651900887489319, "loss/reg": 0.0007022356730885804, "step": 4475 }, { "epoch": 0.5595, "grad_norm": 2.8776419162750244, "grad_norm_var": 1.3411599879257057, "learning_rate": 0.0001, "loss": 1.3315, "loss/crossentropy": 2.3953258991241455, "loss/hidden": 1.1328125, "loss/logits": 0.1916530579328537, "loss/reg": 0.0007018448086455464, "step": 4476 }, { "epoch": 0.559625, "grad_norm": 2.4221973419189453, "grad_norm_var": 1.3183460596896204, "learning_rate": 0.0001, "loss": 1.2593, "loss/crossentropy": 2.6725263595581055, "loss/hidden": 1.0703125, "loss/logits": 0.18192315101623535, "loss/reg": 0.0007014517323113978, "step": 4477 }, { "epoch": 0.55975, "grad_norm": 3.066025495529175, "grad_norm_var": 1.29117898494203, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.610402822494507, "loss/hidden": 1.171875, "loss/logits": 0.1950564831495285, "loss/reg": 0.0007010297267697752, "step": 4478 }, { "epoch": 0.559875, "grad_norm": 3.157550096511841, "grad_norm_var": 1.2614354752549444, "learning_rate": 0.0001, "loss": 1.2764, "loss/crossentropy": 2.57654070854187, "loss/hidden": 1.078125, "loss/logits": 0.19122381508350372, "loss/reg": 0.0007006072555668652, "step": 4479 }, { "epoch": 0.56, "grad_norm": 4.057895660400391, "grad_norm_var": 1.193476787123675, "learning_rate": 0.0001, "loss": 1.5699, "loss/crossentropy": 2.5506434440612793, "loss/hidden": 1.3046875, "loss/logits": 0.2582187056541443, "loss/reg": 0.0007001668564043939, "step": 4480 }, { "epoch": 0.560125, "grad_norm": 2.530944347381592, "grad_norm_var": 1.1766998764503813, "learning_rate": 0.0001, "loss": 1.3222, "loss/crossentropy": 2.4340507984161377, "loss/hidden": 1.125, "loss/logits": 0.1902313530445099, "loss/reg": 0.0006997138843871653, "step": 4481 }, { "epoch": 0.56025, "grad_norm": 2.668170213699341, "grad_norm_var": 1.1875714492338334, "learning_rate": 0.0001, "loss": 1.2488, "loss/crossentropy": 2.563685655593872, "loss/hidden": 1.0625, "loss/logits": 0.1793413907289505, "loss/reg": 0.000699335359968245, "step": 4482 }, { "epoch": 0.560375, "grad_norm": 3.5016939640045166, "grad_norm_var": 1.1427793248628708, "learning_rate": 0.0001, "loss": 1.4205, "loss/crossentropy": 2.644573211669922, "loss/hidden": 1.15625, "loss/logits": 0.2572922706604004, "loss/reg": 0.0006989582325331867, "step": 4483 }, { "epoch": 0.5605, "grad_norm": 2.4795472621917725, "grad_norm_var": 0.9651224136878831, "learning_rate": 0.0001, "loss": 1.2999, "loss/crossentropy": 2.674819231033325, "loss/hidden": 1.1015625, "loss/logits": 0.19134750962257385, "loss/reg": 0.0006985680083744228, "step": 4484 }, { "epoch": 0.560625, "grad_norm": 2.1111507415771484, "grad_norm_var": 1.0259305143129647, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 2.635369300842285, "loss/hidden": 0.94140625, "loss/logits": 0.16310420632362366, "loss/reg": 0.0006981698097661138, "step": 4485 }, { "epoch": 0.56075, "grad_norm": 2.56740140914917, "grad_norm_var": 1.0491232091159182, "learning_rate": 0.0001, "loss": 1.2492, "loss/crossentropy": 2.4354569911956787, "loss/hidden": 1.046875, "loss/logits": 0.1953645795583725, "loss/reg": 0.0006978074088692665, "step": 4486 }, { "epoch": 0.560875, "grad_norm": 2.205566167831421, "grad_norm_var": 0.2691749798610845, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.458364486694336, "loss/hidden": 1.0, "loss/logits": 0.16979235410690308, "loss/reg": 0.0006974244024604559, "step": 4487 }, { "epoch": 0.561, "grad_norm": 3.3277246952056885, "grad_norm_var": 0.2795289120864934, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.5594680309295654, "loss/hidden": 1.0859375, "loss/logits": 0.2056213617324829, "loss/reg": 0.0006970446556806564, "step": 4488 }, { "epoch": 0.561125, "grad_norm": 2.719207525253296, "grad_norm_var": 0.27448268513713264, "learning_rate": 0.0001, "loss": 1.2187, "loss/crossentropy": 2.782625675201416, "loss/hidden": 1.015625, "loss/logits": 0.19606365263462067, "loss/reg": 0.0006966587970964611, "step": 4489 }, { "epoch": 0.56125, "grad_norm": 2.6043944358825684, "grad_norm_var": 0.2737094818986234, "learning_rate": 0.0001, "loss": 1.2179, "loss/crossentropy": 2.359708786010742, "loss/hidden": 1.046875, "loss/logits": 0.16409826278686523, "loss/reg": 0.00069629424251616, "step": 4490 }, { "epoch": 0.561375, "grad_norm": 2.8414390087127686, "grad_norm_var": 0.25428757538486385, "learning_rate": 0.0001, "loss": 1.53, "loss/crossentropy": 2.586477279663086, "loss/hidden": 1.2578125, "loss/logits": 0.2652093470096588, "loss/reg": 0.0006959176389500499, "step": 4491 }, { "epoch": 0.5615, "grad_norm": 3.5425660610198975, "grad_norm_var": 0.2869278786098448, "learning_rate": 0.0001, "loss": 1.6151, "loss/crossentropy": 2.3093535900115967, "loss/hidden": 1.34375, "loss/logits": 0.26443517208099365, "loss/reg": 0.0006955881253816187, "step": 4492 }, { "epoch": 0.561625, "grad_norm": 2.285921812057495, "grad_norm_var": 0.2960928434268773, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.5349080562591553, "loss/hidden": 1.0234375, "loss/logits": 0.17145785689353943, "loss/reg": 0.0006952505791559815, "step": 4493 }, { "epoch": 0.56175, "grad_norm": 2.317160129547119, "grad_norm_var": 0.309992291431656, "learning_rate": 0.0001, "loss": 1.2864, "loss/crossentropy": 2.5179524421691895, "loss/hidden": 1.078125, "loss/logits": 0.2013632357120514, "loss/reg": 0.0006948848604224622, "step": 4494 }, { "epoch": 0.561875, "grad_norm": 3.1483821868896484, "grad_norm_var": 0.30956952025532186, "learning_rate": 0.0001, "loss": 1.3575, "loss/crossentropy": 2.6103365421295166, "loss/hidden": 1.140625, "loss/logits": 0.20996689796447754, "loss/reg": 0.0006945776985958219, "step": 4495 }, { "epoch": 0.562, "grad_norm": 3.354868173599243, "grad_norm_var": 0.2231881885002688, "learning_rate": 0.0001, "loss": 1.6954, "loss/crossentropy": 2.5168941020965576, "loss/hidden": 1.390625, "loss/logits": 0.2978382110595703, "loss/reg": 0.0006942025502212346, "step": 4496 }, { "epoch": 0.562125, "grad_norm": 4.449245452880859, "grad_norm_var": 0.39385671571289244, "learning_rate": 0.0001, "loss": 1.6537, "loss/crossentropy": 2.2989501953125, "loss/hidden": 1.3828125, "loss/logits": 0.26392561197280884, "loss/reg": 0.000693834328558296, "step": 4497 }, { "epoch": 0.56225, "grad_norm": 4.908499240875244, "grad_norm_var": 0.6434434065606655, "learning_rate": 0.0001, "loss": 1.6632, "loss/crossentropy": 2.7167882919311523, "loss/hidden": 1.3671875, "loss/logits": 0.2891191840171814, "loss/reg": 0.0006934895063750446, "step": 4498 }, { "epoch": 0.562375, "grad_norm": 2.5053558349609375, "grad_norm_var": 0.641867538482101, "learning_rate": 0.0001, "loss": 1.251, "loss/crossentropy": 2.4930922985076904, "loss/hidden": 1.0703125, "loss/logits": 0.17372295260429382, "loss/reg": 0.0006931382231414318, "step": 4499 }, { "epoch": 0.5625, "grad_norm": 4.316573619842529, "grad_norm_var": 0.7349745211312455, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.2891299724578857, "loss/hidden": 1.1875, "loss/logits": 0.2042122185230255, "loss/reg": 0.0006927696522325277, "step": 4500 }, { "epoch": 0.562625, "grad_norm": 6.488615036010742, "grad_norm_var": 1.3698504822239725, "learning_rate": 0.0001, "loss": 1.7613, "loss/crossentropy": 2.540635347366333, "loss/hidden": 1.4921875, "loss/logits": 0.26223665475845337, "loss/reg": 0.0006923937471583486, "step": 4501 }, { "epoch": 0.56275, "grad_norm": 2.936908006668091, "grad_norm_var": 1.3398798059114738, "learning_rate": 0.0001, "loss": 1.2884, "loss/crossentropy": 2.727804183959961, "loss/hidden": 1.078125, "loss/logits": 0.2033381164073944, "loss/reg": 0.0006920029409229755, "step": 4502 }, { "epoch": 0.562875, "grad_norm": 4.481847763061523, "grad_norm_var": 1.3096952357003657, "learning_rate": 0.0001, "loss": 1.7047, "loss/crossentropy": 2.292308807373047, "loss/hidden": 1.390625, "loss/logits": 0.30711251497268677, "loss/reg": 0.0006916280253790319, "step": 4503 }, { "epoch": 0.563, "grad_norm": 7.710179328918457, "grad_norm_var": 2.4010468070321993, "learning_rate": 0.0001, "loss": 1.4129, "loss/crossentropy": 2.8124454021453857, "loss/hidden": 1.1875, "loss/logits": 0.21846038103103638, "loss/reg": 0.0006912278477102518, "step": 4504 }, { "epoch": 0.563125, "grad_norm": 3.1374881267547607, "grad_norm_var": 2.3523633414920964, "learning_rate": 0.0001, "loss": 1.3403, "loss/crossentropy": 2.56356143951416, "loss/hidden": 1.1484375, "loss/logits": 0.18493686616420746, "loss/reg": 0.000690847635269165, "step": 4505 }, { "epoch": 0.56325, "grad_norm": 3.543135404586792, "grad_norm_var": 2.255997075267877, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.4008758068084717, "loss/hidden": 1.1640625, "loss/logits": 0.1857180893421173, "loss/reg": 0.0006904760375618935, "step": 4506 }, { "epoch": 0.563375, "grad_norm": 3.3741295337677, "grad_norm_var": 2.200464167450309, "learning_rate": 0.0001, "loss": 1.5486, "loss/crossentropy": 2.595426082611084, "loss/hidden": 1.3125, "loss/logits": 0.22923415899276733, "loss/reg": 0.0006901058368384838, "step": 4507 }, { "epoch": 0.5635, "grad_norm": 3.699094533920288, "grad_norm_var": 2.1944040956340434, "learning_rate": 0.0001, "loss": 1.3066, "loss/crossentropy": 2.71610689163208, "loss/hidden": 1.1171875, "loss/logits": 0.18246811628341675, "loss/reg": 0.0006897106650285423, "step": 4508 }, { "epoch": 0.563625, "grad_norm": 2.385340452194214, "grad_norm_var": 2.1734126655070742, "learning_rate": 0.0001, "loss": 1.1413, "loss/crossentropy": 2.5832741260528564, "loss/hidden": 0.9765625, "loss/logits": 0.15780600905418396, "loss/reg": 0.0006893356330692768, "step": 4509 }, { "epoch": 0.56375, "grad_norm": 2.254789352416992, "grad_norm_var": 2.1870043189682593, "learning_rate": 0.0001, "loss": 1.1529, "loss/crossentropy": 2.5157976150512695, "loss/hidden": 0.98828125, "loss/logits": 0.15772180259227753, "loss/reg": 0.0006889314390718937, "step": 4510 }, { "epoch": 0.563875, "grad_norm": 2.882517099380493, "grad_norm_var": 2.218718313347457, "learning_rate": 0.0001, "loss": 1.464, "loss/crossentropy": 2.7163021564483643, "loss/hidden": 1.234375, "loss/logits": 0.2227695882320404, "loss/reg": 0.000688537023961544, "step": 4511 }, { "epoch": 0.564, "grad_norm": 2.469444751739502, "grad_norm_var": 2.332283989407267, "learning_rate": 0.0001, "loss": 1.1122, "loss/crossentropy": 2.530111312866211, "loss/hidden": 0.95703125, "loss/logits": 0.14826710522174835, "loss/reg": 0.0006881682202219963, "step": 4512 }, { "epoch": 0.564125, "grad_norm": 3.6196985244750977, "grad_norm_var": 2.308620045148433, "learning_rate": 0.0001, "loss": 1.621, "loss/crossentropy": 2.3394951820373535, "loss/hidden": 1.3203125, "loss/logits": 0.2937893569469452, "loss/reg": 0.0006877684500068426, "step": 4513 }, { "epoch": 0.56425, "grad_norm": 2.6924445629119873, "grad_norm_var": 2.2864232813926253, "learning_rate": 0.0001, "loss": 1.1686, "loss/crossentropy": 2.647494077682495, "loss/hidden": 0.98828125, "loss/logits": 0.17339608073234558, "loss/reg": 0.0006873994134366512, "step": 4514 }, { "epoch": 0.564375, "grad_norm": 2.8633244037628174, "grad_norm_var": 2.239508206226827, "learning_rate": 0.0001, "loss": 1.3076, "loss/crossentropy": 2.5798110961914062, "loss/hidden": 1.0859375, "loss/logits": 0.21476265788078308, "loss/reg": 0.0006869969074614346, "step": 4515 }, { "epoch": 0.5645, "grad_norm": 2.8772878646850586, "grad_norm_var": 2.2465246732678867, "learning_rate": 0.0001, "loss": 1.4729, "loss/crossentropy": 2.356633186340332, "loss/hidden": 1.2109375, "loss/logits": 0.2550662159919739, "loss/reg": 0.0006866454496048391, "step": 4516 }, { "epoch": 0.564625, "grad_norm": 3.257991313934326, "grad_norm_var": 1.6496153067575352, "learning_rate": 0.0001, "loss": 1.4467, "loss/crossentropy": 2.756434679031372, "loss/hidden": 1.1796875, "loss/logits": 0.2601250410079956, "loss/reg": 0.000686329382006079, "step": 4517 }, { "epoch": 0.56475, "grad_norm": 2.2729713916778564, "grad_norm_var": 1.7169750930154424, "learning_rate": 0.0001, "loss": 1.1786, "loss/crossentropy": 2.5760648250579834, "loss/hidden": 1.0, "loss/logits": 0.17176172137260437, "loss/reg": 0.0006860250141471624, "step": 4518 }, { "epoch": 0.564875, "grad_norm": 2.6303281784057617, "grad_norm_var": 1.6506060941670186, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.2181737422943115, "loss/hidden": 1.203125, "loss/logits": 0.21192814409732819, "loss/reg": 0.0006856513791717589, "step": 4519 }, { "epoch": 0.565, "grad_norm": 3.6165449619293213, "grad_norm_var": 0.2522735835412353, "learning_rate": 0.0001, "loss": 1.3247, "loss/crossentropy": 2.9296343326568604, "loss/hidden": 1.09375, "loss/logits": 0.22410619258880615, "loss/reg": 0.0006852814112789929, "step": 4520 }, { "epoch": 0.565125, "grad_norm": 2.830272912979126, "grad_norm_var": 0.25145647947833727, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.6217222213745117, "loss/hidden": 1.1328125, "loss/logits": 0.2339123636484146, "loss/reg": 0.0006849054479971528, "step": 4521 }, { "epoch": 0.56525, "grad_norm": 3.7443366050720215, "grad_norm_var": 0.2697823211472456, "learning_rate": 0.0001, "loss": 1.4139, "loss/crossentropy": 2.5947391986846924, "loss/hidden": 1.171875, "loss/logits": 0.23522931337356567, "loss/reg": 0.0006845341413281858, "step": 4522 }, { "epoch": 0.565375, "grad_norm": 2.613091230392456, "grad_norm_var": 0.26465946270059226, "learning_rate": 0.0001, "loss": 1.2163, "loss/crossentropy": 2.7802059650421143, "loss/hidden": 1.0390625, "loss/logits": 0.1703493893146515, "loss/reg": 0.0006841497379355133, "step": 4523 }, { "epoch": 0.5655, "grad_norm": 2.626009702682495, "grad_norm_var": 0.22506354304266338, "learning_rate": 0.0001, "loss": 1.337, "loss/crossentropy": 2.490262985229492, "loss/hidden": 1.109375, "loss/logits": 0.22081030905246735, "loss/reg": 0.0006837909459136426, "step": 4524 }, { "epoch": 0.565625, "grad_norm": 2.4620437622070312, "grad_norm_var": 0.22065586963003625, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.4771156311035156, "loss/hidden": 1.046875, "loss/logits": 0.1595613658428192, "loss/reg": 0.0006834240630269051, "step": 4525 }, { "epoch": 0.56575, "grad_norm": 4.680413722991943, "grad_norm_var": 0.3935971425514699, "learning_rate": 0.0001, "loss": 1.1643, "loss/crossentropy": 2.386408805847168, "loss/hidden": 1.0, "loss/logits": 0.15746569633483887, "loss/reg": 0.0006830891943536699, "step": 4526 }, { "epoch": 0.565875, "grad_norm": 2.5385818481445312, "grad_norm_var": 0.40677548539438796, "learning_rate": 0.0001, "loss": 1.4389, "loss/crossentropy": 2.541534900665283, "loss/hidden": 1.171875, "loss/logits": 0.2602420151233673, "loss/reg": 0.0006827231263741851, "step": 4527 }, { "epoch": 0.566, "grad_norm": 2.701918125152588, "grad_norm_var": 0.3941054552138456, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.517350196838379, "loss/hidden": 0.97265625, "loss/logits": 0.15299129486083984, "loss/reg": 0.0006823553703725338, "step": 4528 }, { "epoch": 0.566125, "grad_norm": 2.4812095165252686, "grad_norm_var": 0.38130457208174373, "learning_rate": 0.0001, "loss": 1.4929, "loss/crossentropy": 2.124282121658325, "loss/hidden": 1.2421875, "loss/logits": 0.2438463270664215, "loss/reg": 0.0006819934351369739, "step": 4529 }, { "epoch": 0.56625, "grad_norm": 2.771151542663574, "grad_norm_var": 0.3791930246732342, "learning_rate": 0.0001, "loss": 1.3731, "loss/crossentropy": 2.7082889080047607, "loss/hidden": 1.140625, "loss/logits": 0.22561095654964447, "loss/reg": 0.0006816537352278829, "step": 4530 }, { "epoch": 0.566375, "grad_norm": 2.378777265548706, "grad_norm_var": 0.39852803091065214, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.4184131622314453, "loss/hidden": 1.046875, "loss/logits": 0.19881518185138702, "loss/reg": 0.0006812852807343006, "step": 4531 }, { "epoch": 0.5665, "grad_norm": 2.715766668319702, "grad_norm_var": 0.40075935616718256, "learning_rate": 0.0001, "loss": 1.2581, "loss/crossentropy": 2.3417303562164307, "loss/hidden": 1.0625, "loss/logits": 0.18877288699150085, "loss/reg": 0.0006809026235714555, "step": 4532 }, { "epoch": 0.566625, "grad_norm": 2.7178573608398438, "grad_norm_var": 0.39285788228598906, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.5769307613372803, "loss/hidden": 0.96875, "loss/logits": 0.17061373591423035, "loss/reg": 0.0006805459852330387, "step": 4533 }, { "epoch": 0.56675, "grad_norm": 2.47206711769104, "grad_norm_var": 0.3797167106075108, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.4805266857147217, "loss/hidden": 0.98046875, "loss/logits": 0.15821900963783264, "loss/reg": 0.0006802155985496938, "step": 4534 }, { "epoch": 0.566875, "grad_norm": 2.326427459716797, "grad_norm_var": 0.39535335241871505, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.631990909576416, "loss/hidden": 1.1875, "loss/logits": 0.23799791932106018, "loss/reg": 0.000679884513374418, "step": 4535 }, { "epoch": 0.567, "grad_norm": 2.668260335922241, "grad_norm_var": 0.3552400048117221, "learning_rate": 0.0001, "loss": 1.7061, "loss/crossentropy": 2.2202181816101074, "loss/hidden": 1.3984375, "loss/logits": 0.3008308410644531, "loss/reg": 0.0006795180961489677, "step": 4536 }, { "epoch": 0.567125, "grad_norm": 3.2090702056884766, "grad_norm_var": 0.3659636334573726, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.4416332244873047, "loss/hidden": 1.1953125, "loss/logits": 0.21888506412506104, "loss/reg": 0.0006791749037802219, "step": 4537 }, { "epoch": 0.56725, "grad_norm": 2.5800728797912598, "grad_norm_var": 0.30706716728067984, "learning_rate": 0.0001, "loss": 1.2508, "loss/crossentropy": 2.4019668102264404, "loss/hidden": 1.0703125, "loss/logits": 0.17374247312545776, "loss/reg": 0.0006787932943552732, "step": 4538 }, { "epoch": 0.567375, "grad_norm": 2.65321946144104, "grad_norm_var": 0.30645444352724077, "learning_rate": 0.0001, "loss": 1.3777, "loss/crossentropy": 2.5312485694885254, "loss/hidden": 1.171875, "loss/logits": 0.19901293516159058, "loss/reg": 0.0006784661090932786, "step": 4539 }, { "epoch": 0.5675, "grad_norm": 2.463282346725464, "grad_norm_var": 0.310776410097431, "learning_rate": 0.0001, "loss": 1.4295, "loss/crossentropy": 2.3864710330963135, "loss/hidden": 1.1796875, "loss/logits": 0.24298325181007385, "loss/reg": 0.0006780776311643422, "step": 4540 }, { "epoch": 0.567625, "grad_norm": 2.2354609966278076, "grad_norm_var": 0.32234495226337667, "learning_rate": 0.0001, "loss": 1.1764, "loss/crossentropy": 2.4044361114501953, "loss/hidden": 1.015625, "loss/logits": 0.15399056673049927, "loss/reg": 0.000677644566167146, "step": 4541 }, { "epoch": 0.56775, "grad_norm": 4.858021259307861, "grad_norm_var": 0.3706322072779576, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 3.1015918254852295, "loss/hidden": 1.046875, "loss/logits": 0.17382332682609558, "loss/reg": 0.0006772046908736229, "step": 4542 }, { "epoch": 0.567875, "grad_norm": 4.4401655197143555, "grad_norm_var": 0.5466561460588679, "learning_rate": 0.0001, "loss": 1.3324, "loss/crossentropy": 2.76124906539917, "loss/hidden": 1.109375, "loss/logits": 0.21629029512405396, "loss/reg": 0.0006767660961486399, "step": 4543 }, { "epoch": 0.568, "grad_norm": 3.0327701568603516, "grad_norm_var": 0.5467646439608113, "learning_rate": 0.0001, "loss": 1.2941, "loss/crossentropy": 2.895354986190796, "loss/hidden": 1.0703125, "loss/logits": 0.21704907715320587, "loss/reg": 0.0006763568380847573, "step": 4544 }, { "epoch": 0.568125, "grad_norm": 2.949216604232788, "grad_norm_var": 0.5358671977074626, "learning_rate": 0.0001, "loss": 1.159, "loss/crossentropy": 2.4743778705596924, "loss/hidden": 0.99609375, "loss/logits": 0.1561919003725052, "loss/reg": 0.000675928546115756, "step": 4545 }, { "epoch": 0.56825, "grad_norm": 3.4650347232818604, "grad_norm_var": 0.5536246013188814, "learning_rate": 0.0001, "loss": 1.4915, "loss/crossentropy": 2.2577457427978516, "loss/hidden": 1.296875, "loss/logits": 0.18786539137363434, "loss/reg": 0.0006754905916750431, "step": 4546 }, { "epoch": 0.568375, "grad_norm": 2.9720401763916016, "grad_norm_var": 0.5306081643955688, "learning_rate": 0.0001, "loss": 1.6225, "loss/crossentropy": 2.0870327949523926, "loss/hidden": 1.34375, "loss/logits": 0.27198994159698486, "loss/reg": 0.0006751281907781959, "step": 4547 }, { "epoch": 0.5685, "grad_norm": 3.1952383518218994, "grad_norm_var": 0.527769576344023, "learning_rate": 0.0001, "loss": 1.6118, "loss/crossentropy": 2.176018714904785, "loss/hidden": 1.3125, "loss/logits": 0.29256346821784973, "loss/reg": 0.0006747650331817567, "step": 4548 }, { "epoch": 0.568625, "grad_norm": 2.3627617359161377, "grad_norm_var": 0.5497136111552983, "learning_rate": 0.0001, "loss": 1.1431, "loss/crossentropy": 2.778780221939087, "loss/hidden": 0.95703125, "loss/logits": 0.17934004962444305, "loss/reg": 0.0006743670091964304, "step": 4549 }, { "epoch": 0.56875, "grad_norm": 3.804546594619751, "grad_norm_var": 0.5681857811271639, "learning_rate": 0.0001, "loss": 1.2659, "loss/crossentropy": 2.702648162841797, "loss/hidden": 1.0703125, "loss/logits": 0.1888205111026764, "loss/reg": 0.0006739565869793296, "step": 4550 }, { "epoch": 0.568875, "grad_norm": 2.425992727279663, "grad_norm_var": 0.5588548478832043, "learning_rate": 0.0001, "loss": 1.2501, "loss/crossentropy": 2.7963550090789795, "loss/hidden": 1.0546875, "loss/logits": 0.18868650496006012, "loss/reg": 0.0006735465140081942, "step": 4551 }, { "epoch": 0.569, "grad_norm": 3.0151405334472656, "grad_norm_var": 0.5472303512748697, "learning_rate": 0.0001, "loss": 1.2346, "loss/crossentropy": 2.733168601989746, "loss/hidden": 1.0625, "loss/logits": 0.165408194065094, "loss/reg": 0.000673184113111347, "step": 4552 }, { "epoch": 0.569125, "grad_norm": 2.500715494155884, "grad_norm_var": 0.5686555508114746, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.5846877098083496, "loss/hidden": 0.98828125, "loss/logits": 0.14606232941150665, "loss/reg": 0.0006727694417349994, "step": 4553 }, { "epoch": 0.56925, "grad_norm": 3.070617437362671, "grad_norm_var": 0.5523309257221757, "learning_rate": 0.0001, "loss": 1.3786, "loss/crossentropy": 2.4041576385498047, "loss/hidden": 1.1796875, "loss/logits": 0.19216537475585938, "loss/reg": 0.000672333175316453, "step": 4554 }, { "epoch": 0.569375, "grad_norm": 6.281548023223877, "grad_norm_var": 1.1636967740307265, "learning_rate": 0.0001, "loss": 1.4573, "loss/crossentropy": 2.5990986824035645, "loss/hidden": 1.2734375, "loss/logits": 0.17713865637779236, "loss/reg": 0.0006719810189679265, "step": 4555 }, { "epoch": 0.5695, "grad_norm": 4.030311107635498, "grad_norm_var": 1.1387899040505163, "learning_rate": 0.0001, "loss": 1.5401, "loss/crossentropy": 2.7084012031555176, "loss/hidden": 1.2890625, "loss/logits": 0.24434879422187805, "loss/reg": 0.0006715742638334632, "step": 4556 }, { "epoch": 0.569625, "grad_norm": 3.7404942512512207, "grad_norm_var": 1.0436660821513508, "learning_rate": 0.0001, "loss": 1.5987, "loss/crossentropy": 2.1367123126983643, "loss/hidden": 1.3671875, "loss/logits": 0.22484201192855835, "loss/reg": 0.0006712178001180291, "step": 4557 }, { "epoch": 0.56975, "grad_norm": 6.347155570983887, "grad_norm_var": 1.450103362414514, "learning_rate": 0.0001, "loss": 2.0802, "loss/crossentropy": 2.8225643634796143, "loss/hidden": 1.5703125, "loss/logits": 0.5031678676605225, "loss/reg": 0.0006708729197271168, "step": 4558 }, { "epoch": 0.569875, "grad_norm": 4.020057678222656, "grad_norm_var": 1.4141908269467838, "learning_rate": 0.0001, "loss": 1.555, "loss/crossentropy": 2.975463628768921, "loss/hidden": 1.2890625, "loss/logits": 0.2591874599456787, "loss/reg": 0.0006705180858261883, "step": 4559 }, { "epoch": 0.57, "grad_norm": 3.7491190433502197, "grad_norm_var": 1.3943915286932278, "learning_rate": 0.0001, "loss": 1.3646, "loss/crossentropy": 2.7959625720977783, "loss/hidden": 1.1328125, "loss/logits": 0.2250719666481018, "loss/reg": 0.0006701573729515076, "step": 4560 }, { "epoch": 0.570125, "grad_norm": 3.0165765285491943, "grad_norm_var": 1.3886449834095456, "learning_rate": 0.0001, "loss": 1.5566, "loss/crossentropy": 2.339538097381592, "loss/hidden": 1.2890625, "loss/logits": 0.26079800724983215, "loss/reg": 0.0006698236102238297, "step": 4561 }, { "epoch": 0.57025, "grad_norm": 2.3503782749176025, "grad_norm_var": 1.4900482321991848, "learning_rate": 0.0001, "loss": 1.2691, "loss/crossentropy": 2.562039613723755, "loss/hidden": 1.078125, "loss/logits": 0.18425245583057404, "loss/reg": 0.0006695083575323224, "step": 4562 }, { "epoch": 0.570375, "grad_norm": 4.332911014556885, "grad_norm_var": 1.4999880393716123, "learning_rate": 0.0001, "loss": 1.3567, "loss/crossentropy": 2.82328200340271, "loss/hidden": 1.15625, "loss/logits": 0.19377896189689636, "loss/reg": 0.0006691445014439523, "step": 4563 }, { "epoch": 0.5705, "grad_norm": 2.731065034866333, "grad_norm_var": 1.5409940795993007, "learning_rate": 0.0001, "loss": 1.3453, "loss/crossentropy": 2.445399284362793, "loss/hidden": 1.140625, "loss/logits": 0.19800078868865967, "loss/reg": 0.0006688159774057567, "step": 4564 }, { "epoch": 0.570625, "grad_norm": 2.5727505683898926, "grad_norm_var": 1.508795289673814, "learning_rate": 0.0001, "loss": 1.3993, "loss/crossentropy": 2.4861156940460205, "loss/hidden": 1.1875, "loss/logits": 0.20508365333080292, "loss/reg": 0.0006684315158054233, "step": 4565 }, { "epoch": 0.57075, "grad_norm": 3.201946973800659, "grad_norm_var": 1.5170113916454728, "learning_rate": 0.0001, "loss": 1.4372, "loss/crossentropy": 2.5562515258789062, "loss/hidden": 1.1953125, "loss/logits": 0.23517760634422302, "loss/reg": 0.0006680747610516846, "step": 4566 }, { "epoch": 0.570875, "grad_norm": 2.7736434936523438, "grad_norm_var": 1.470763653904697, "learning_rate": 0.0001, "loss": 1.5062, "loss/crossentropy": 2.373710870742798, "loss/hidden": 1.203125, "loss/logits": 0.29641884565353394, "loss/reg": 0.000667732791043818, "step": 4567 }, { "epoch": 0.571, "grad_norm": 2.648906946182251, "grad_norm_var": 1.5081162276444464, "learning_rate": 0.0001, "loss": 1.7162, "loss/crossentropy": 2.351306676864624, "loss/hidden": 1.40625, "loss/logits": 0.3032344579696655, "loss/reg": 0.0006674024625681341, "step": 4568 }, { "epoch": 0.571125, "grad_norm": 3.3087940216064453, "grad_norm_var": 1.4320480209252515, "learning_rate": 0.0001, "loss": 1.5309, "loss/crossentropy": 2.986168622970581, "loss/hidden": 1.265625, "loss/logits": 0.2586209774017334, "loss/reg": 0.0006671114242635667, "step": 4569 }, { "epoch": 0.57125, "grad_norm": 2.940706729888916, "grad_norm_var": 1.4428963523413425, "learning_rate": 0.0001, "loss": 1.2355, "loss/crossentropy": 2.5358071327209473, "loss/hidden": 1.0390625, "loss/logits": 0.18974870443344116, "loss/reg": 0.0006668127025477588, "step": 4570 }, { "epoch": 0.571375, "grad_norm": 2.294044256210327, "grad_norm_var": 1.025799309383753, "learning_rate": 0.0001, "loss": 1.1997, "loss/crossentropy": 2.696993350982666, "loss/hidden": 1.0, "loss/logits": 0.19306594133377075, "loss/reg": 0.0006664526299573481, "step": 4571 }, { "epoch": 0.5715, "grad_norm": 2.732311487197876, "grad_norm_var": 1.0183237069104574, "learning_rate": 0.0001, "loss": 1.1351, "loss/crossentropy": 2.487356424331665, "loss/hidden": 0.96484375, "loss/logits": 0.16359573602676392, "loss/reg": 0.0006661189254373312, "step": 4572 }, { "epoch": 0.571625, "grad_norm": 2.5495405197143555, "grad_norm_var": 1.0366356808799688, "learning_rate": 0.0001, "loss": 1.3463, "loss/crossentropy": 2.5621156692504883, "loss/hidden": 1.125, "loss/logits": 0.21466657519340515, "loss/reg": 0.0006657653721049428, "step": 4573 }, { "epoch": 0.57175, "grad_norm": 14.361347198486328, "grad_norm_var": 8.389056650221827, "learning_rate": 0.0001, "loss": 2.7987, "loss/crossentropy": 3.215804100036621, "loss/hidden": 1.9375, "loss/logits": 0.8545449376106262, "loss/reg": 0.0006654601893387735, "step": 4574 }, { "epoch": 0.571875, "grad_norm": 3.545475959777832, "grad_norm_var": 8.3843999745377, "learning_rate": 0.0001, "loss": 1.7659, "loss/crossentropy": 2.8742599487304688, "loss/hidden": 1.40625, "loss/logits": 0.3530464768409729, "loss/reg": 0.0006650951108895242, "step": 4575 }, { "epoch": 0.572, "grad_norm": 3.1224350929260254, "grad_norm_var": 8.404368960868544, "learning_rate": 0.0001, "loss": 1.2457, "loss/crossentropy": 2.6685378551483154, "loss/hidden": 1.046875, "loss/logits": 0.19212856888771057, "loss/reg": 0.0006647362606599927, "step": 4576 }, { "epoch": 0.572125, "grad_norm": 3.3228015899658203, "grad_norm_var": 8.384155754793975, "learning_rate": 0.0001, "loss": 1.461, "loss/crossentropy": 2.466223955154419, "loss/hidden": 1.25, "loss/logits": 0.20437504351139069, "loss/reg": 0.0006644031964242458, "step": 4577 }, { "epoch": 0.57225, "grad_norm": 3.10194993019104, "grad_norm_var": 8.286788273049789, "learning_rate": 0.0001, "loss": 1.5248, "loss/crossentropy": 2.0861318111419678, "loss/hidden": 1.328125, "loss/logits": 0.1900414228439331, "loss/reg": 0.0006640985957346857, "step": 4578 }, { "epoch": 0.572375, "grad_norm": 2.8079020977020264, "grad_norm_var": 8.307777751842819, "learning_rate": 0.0001, "loss": 1.5226, "loss/crossentropy": 2.4287517070770264, "loss/hidden": 1.2578125, "loss/logits": 0.2581234574317932, "loss/reg": 0.0006637373007833958, "step": 4579 }, { "epoch": 0.5725, "grad_norm": 2.257938861846924, "grad_norm_var": 8.37822240526538, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.420178174972534, "loss/hidden": 0.890625, "loss/logits": 0.1591595709323883, "loss/reg": 0.0006634180899709463, "step": 4580 }, { "epoch": 0.572625, "grad_norm": 2.418351650238037, "grad_norm_var": 8.400785849528669, "learning_rate": 0.0001, "loss": 1.2734, "loss/crossentropy": 2.5240256786346436, "loss/hidden": 1.0859375, "loss/logits": 0.18087010085582733, "loss/reg": 0.0006631156429648399, "step": 4581 }, { "epoch": 0.57275, "grad_norm": 3.804875373840332, "grad_norm_var": 8.392571039370422, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.6363558769226074, "loss/hidden": 0.9609375, "loss/logits": 0.14409351348876953, "loss/reg": 0.0006627574912272394, "step": 4582 }, { "epoch": 0.572875, "grad_norm": 3.708010673522949, "grad_norm_var": 8.341142103574539, "learning_rate": 0.0001, "loss": 1.4718, "loss/crossentropy": 2.679391622543335, "loss/hidden": 1.25, "loss/logits": 0.21518541872501373, "loss/reg": 0.000662395847029984, "step": 4583 }, { "epoch": 0.573, "grad_norm": 2.9515485763549805, "grad_norm_var": 8.305145229599395, "learning_rate": 0.0001, "loss": 1.389, "loss/crossentropy": 2.5151233673095703, "loss/hidden": 1.15625, "loss/logits": 0.22609193623065948, "loss/reg": 0.000662078382447362, "step": 4584 }, { "epoch": 0.573125, "grad_norm": 3.215197801589966, "grad_norm_var": 8.310596664469998, "learning_rate": 0.0001, "loss": 1.3321, "loss/crossentropy": 2.541609764099121, "loss/hidden": 1.140625, "loss/logits": 0.18482854962348938, "loss/reg": 0.0006617623730562627, "step": 4585 }, { "epoch": 0.57325, "grad_norm": 2.214221715927124, "grad_norm_var": 8.416734718812979, "learning_rate": 0.0001, "loss": 1.2574, "loss/crossentropy": 2.592489004135132, "loss/hidden": 1.0703125, "loss/logits": 0.18044862151145935, "loss/reg": 0.000661406316794455, "step": 4586 }, { "epoch": 0.573375, "grad_norm": 2.2308106422424316, "grad_norm_var": 8.428421079403389, "learning_rate": 0.0001, "loss": 1.2065, "loss/crossentropy": 2.5134332180023193, "loss/hidden": 1.03125, "loss/logits": 0.16861511766910553, "loss/reg": 0.000661081459838897, "step": 4587 }, { "epoch": 0.5735, "grad_norm": 2.1046624183654785, "grad_norm_var": 8.52955157640459, "learning_rate": 0.0001, "loss": 1.1836, "loss/crossentropy": 2.5080223083496094, "loss/hidden": 1.0078125, "loss/logits": 0.16920042037963867, "loss/reg": 0.0006607570685446262, "step": 4588 }, { "epoch": 0.573625, "grad_norm": 2.417208194732666, "grad_norm_var": 8.549309800703274, "learning_rate": 0.0001, "loss": 1.2834, "loss/crossentropy": 2.4829084873199463, "loss/hidden": 1.109375, "loss/logits": 0.16741085052490234, "loss/reg": 0.0006604373338632286, "step": 4589 }, { "epoch": 0.57375, "grad_norm": 2.3250010013580322, "grad_norm_var": 0.33207409304964863, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.4887712001800537, "loss/hidden": 1.03125, "loss/logits": 0.18146194517612457, "loss/reg": 0.0006601277273148298, "step": 4590 }, { "epoch": 0.573875, "grad_norm": 3.1517975330352783, "grad_norm_var": 0.30508535179409646, "learning_rate": 0.0001, "loss": 1.3435, "loss/crossentropy": 2.4211935997009277, "loss/hidden": 1.1328125, "loss/logits": 0.20413261651992798, "loss/reg": 0.0006597711471840739, "step": 4591 }, { "epoch": 0.574, "grad_norm": 2.763134002685547, "grad_norm_var": 0.2987691675193901, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.757932186126709, "loss/hidden": 1.109375, "loss/logits": 0.18250706791877747, "loss/reg": 0.0006594700389541686, "step": 4592 }, { "epoch": 0.574125, "grad_norm": 3.176699638366699, "grad_norm_var": 0.289913381886106, "learning_rate": 0.0001, "loss": 1.5047, "loss/crossentropy": 2.588878870010376, "loss/hidden": 1.25, "loss/logits": 0.24811306595802307, "loss/reg": 0.0006591686396859586, "step": 4593 }, { "epoch": 0.57425, "grad_norm": 3.1080195903778076, "grad_norm_var": 0.29016767086726813, "learning_rate": 0.0001, "loss": 1.3752, "loss/crossentropy": 2.426819086074829, "loss/hidden": 1.1640625, "loss/logits": 0.2045620232820511, "loss/reg": 0.0006588617688976228, "step": 4594 }, { "epoch": 0.574375, "grad_norm": 2.4358057975769043, "grad_norm_var": 0.2979806651295005, "learning_rate": 0.0001, "loss": 1.2027, "loss/crossentropy": 2.2934622764587402, "loss/hidden": 1.0078125, "loss/logits": 0.18832868337631226, "loss/reg": 0.0006585046066902578, "step": 4595 }, { "epoch": 0.5745, "grad_norm": 2.475191354751587, "grad_norm_var": 0.28616417895651314, "learning_rate": 0.0001, "loss": 1.1575, "loss/crossentropy": 2.6290647983551025, "loss/hidden": 0.9921875, "loss/logits": 0.15873996913433075, "loss/reg": 0.0006581645575352013, "step": 4596 }, { "epoch": 0.574625, "grad_norm": 2.941347122192383, "grad_norm_var": 0.27795121704902975, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.566096067428589, "loss/hidden": 1.140625, "loss/logits": 0.21242989599704742, "loss/reg": 0.0006578482571057975, "step": 4597 }, { "epoch": 0.57475, "grad_norm": 2.9882423877716064, "grad_norm_var": 0.2117377492546152, "learning_rate": 0.0001, "loss": 1.3225, "loss/crossentropy": 2.6459507942199707, "loss/hidden": 1.109375, "loss/logits": 0.20653992891311646, "loss/reg": 0.0006575433653779328, "step": 4598 }, { "epoch": 0.574875, "grad_norm": 2.5250051021575928, "grad_norm_var": 0.15013539056205805, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 2.6347110271453857, "loss/hidden": 1.0859375, "loss/logits": 0.18494847416877747, "loss/reg": 0.0006571990670636296, "step": 4599 }, { "epoch": 0.575, "grad_norm": 2.4598135948181152, "grad_norm_var": 0.14803374811252373, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.229729175567627, "loss/hidden": 1.0703125, "loss/logits": 0.1660645604133606, "loss/reg": 0.0006569124525412917, "step": 4600 }, { "epoch": 0.575125, "grad_norm": 3.138890266418457, "grad_norm_var": 0.14273120081999552, "learning_rate": 0.0001, "loss": 1.6008, "loss/crossentropy": 2.5071816444396973, "loss/hidden": 1.3203125, "loss/logits": 0.2739551067352295, "loss/reg": 0.0006565492949448526, "step": 4601 }, { "epoch": 0.57525, "grad_norm": 2.279791831970215, "grad_norm_var": 0.13915952718353233, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.194221019744873, "loss/hidden": 1.0546875, "loss/logits": 0.16361859440803528, "loss/reg": 0.0006561943446286023, "step": 4602 }, { "epoch": 0.575375, "grad_norm": 2.88458514213562, "grad_norm_var": 0.12867112392399516, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.6736679077148438, "loss/hidden": 1.21875, "loss/logits": 0.1992703378200531, "loss/reg": 0.0006558726308867335, "step": 4603 }, { "epoch": 0.5755, "grad_norm": 2.636608600616455, "grad_norm_var": 0.1042414958676865, "learning_rate": 0.0001, "loss": 1.1268, "loss/crossentropy": 2.6721432209014893, "loss/hidden": 0.96875, "loss/logits": 0.15147705376148224, "loss/reg": 0.000655564886983484, "step": 4604 }, { "epoch": 0.575625, "grad_norm": 2.60263991355896, "grad_norm_var": 0.09861507696856033, "learning_rate": 0.0001, "loss": 1.2779, "loss/crossentropy": 2.6998512744903564, "loss/hidden": 1.0859375, "loss/logits": 0.18536710739135742, "loss/reg": 0.0006552488775923848, "step": 4605 }, { "epoch": 0.57575, "grad_norm": 3.114922046661377, "grad_norm_var": 0.09355860133960595, "learning_rate": 0.0001, "loss": 1.0344, "loss/crossentropy": 3.0431647300720215, "loss/hidden": 0.89453125, "loss/logits": 0.13335463404655457, "loss/reg": 0.0006549703539349139, "step": 4606 }, { "epoch": 0.575875, "grad_norm": 2.5470635890960693, "grad_norm_var": 0.08745702722376882, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.739468574523926, "loss/hidden": 0.99609375, "loss/logits": 0.16841436922550201, "loss/reg": 0.0006547055672854185, "step": 4607 }, { "epoch": 0.576, "grad_norm": 3.0285682678222656, "grad_norm_var": 0.0921533137610877, "learning_rate": 0.0001, "loss": 1.5321, "loss/crossentropy": 2.5303940773010254, "loss/hidden": 1.2734375, "loss/logits": 0.2521589994430542, "loss/reg": 0.000654465111438185, "step": 4608 }, { "epoch": 0.576125, "grad_norm": 2.682666063308716, "grad_norm_var": 0.08071335638589877, "learning_rate": 0.0001, "loss": 1.2631, "loss/crossentropy": 2.5071346759796143, "loss/hidden": 1.0625, "loss/logits": 0.1940315067768097, "loss/reg": 0.0006541087059304118, "step": 4609 }, { "epoch": 0.57625, "grad_norm": 4.220350742340088, "grad_norm_var": 0.21253976651723647, "learning_rate": 0.0001, "loss": 1.7932, "loss/crossentropy": 2.385481119155884, "loss/hidden": 1.5390625, "loss/logits": 0.24758288264274597, "loss/reg": 0.0006538328598253429, "step": 4610 }, { "epoch": 0.576375, "grad_norm": 2.378221035003662, "grad_norm_var": 0.21562078394835543, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.4394922256469727, "loss/hidden": 0.9765625, "loss/logits": 0.16493599116802216, "loss/reg": 0.0006535752909258008, "step": 4611 }, { "epoch": 0.5765, "grad_norm": 3.551015853881836, "grad_norm_var": 0.240435019915977, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.9033331871032715, "loss/hidden": 1.1796875, "loss/logits": 0.2039773166179657, "loss/reg": 0.0006532213883474469, "step": 4612 }, { "epoch": 0.576625, "grad_norm": 3.8183722496032715, "grad_norm_var": 0.29641488224865403, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.1728575229644775, "loss/hidden": 1.34375, "loss/logits": 0.22528573870658875, "loss/reg": 0.0006529578822664917, "step": 4613 }, { "epoch": 0.57675, "grad_norm": 2.487595319747925, "grad_norm_var": 0.3080955269421717, "learning_rate": 0.0001, "loss": 1.2953, "loss/crossentropy": 2.5076563358306885, "loss/hidden": 1.1015625, "loss/logits": 0.187259703874588, "loss/reg": 0.0006526995566673577, "step": 4614 }, { "epoch": 0.576875, "grad_norm": 4.95721435546875, "grad_norm_var": 0.5571039228756904, "learning_rate": 0.0001, "loss": 1.3705, "loss/crossentropy": 2.2007062435150146, "loss/hidden": 1.15625, "loss/logits": 0.20774805545806885, "loss/reg": 0.0006523983902297914, "step": 4615 }, { "epoch": 0.577, "grad_norm": 2.4585018157958984, "grad_norm_var": 0.5572071286173712, "learning_rate": 0.0001, "loss": 1.307, "loss/crossentropy": 2.4447827339172363, "loss/hidden": 1.1015625, "loss/logits": 0.19890260696411133, "loss/reg": 0.0006521317409351468, "step": 4616 }, { "epoch": 0.577125, "grad_norm": 2.6535496711730957, "grad_norm_var": 0.5661245231040272, "learning_rate": 0.0001, "loss": 1.3496, "loss/crossentropy": 2.5873663425445557, "loss/hidden": 1.140625, "loss/logits": 0.20248284935951233, "loss/reg": 0.0006518810405395925, "step": 4617 }, { "epoch": 0.57725, "grad_norm": 10.428425788879395, "grad_norm_var": 3.9131594532050333, "learning_rate": 0.0001, "loss": 1.25, "loss/crossentropy": 2.557337999343872, "loss/hidden": 1.0703125, "loss/logits": 0.1732219010591507, "loss/reg": 0.000651522190310061, "step": 4618 }, { "epoch": 0.577375, "grad_norm": 2.4153594970703125, "grad_norm_var": 3.9671834761637297, "learning_rate": 0.0001, "loss": 1.2325, "loss/crossentropy": 2.913679599761963, "loss/hidden": 1.03125, "loss/logits": 0.1947803497314453, "loss/reg": 0.0006511617102660239, "step": 4619 }, { "epoch": 0.5775, "grad_norm": 2.8080568313598633, "grad_norm_var": 3.94931074752382, "learning_rate": 0.0001, "loss": 1.2455, "loss/crossentropy": 2.7757327556610107, "loss/hidden": 1.046875, "loss/logits": 0.19212044775485992, "loss/reg": 0.0006508741062134504, "step": 4620 }, { "epoch": 0.577625, "grad_norm": 2.628504991531372, "grad_norm_var": 3.9462249798481093, "learning_rate": 0.0001, "loss": 1.1527, "loss/crossentropy": 2.5698959827423096, "loss/hidden": 0.984375, "loss/logits": 0.16179609298706055, "loss/reg": 0.0006505214842036366, "step": 4621 }, { "epoch": 0.57775, "grad_norm": 2.0951955318450928, "grad_norm_var": 4.065087567522325, "learning_rate": 0.0001, "loss": 1.1193, "loss/crossentropy": 2.535668134689331, "loss/hidden": 0.96484375, "loss/logits": 0.14794430136680603, "loss/reg": 0.0006501656025648117, "step": 4622 }, { "epoch": 0.577875, "grad_norm": 2.927222490310669, "grad_norm_var": 4.028483168056907, "learning_rate": 0.0001, "loss": 1.3669, "loss/crossentropy": 2.6391515731811523, "loss/hidden": 1.1328125, "loss/logits": 0.22754976153373718, "loss/reg": 0.0006498494767583907, "step": 4623 }, { "epoch": 0.578, "grad_norm": 3.3670709133148193, "grad_norm_var": 4.015668139477043, "learning_rate": 0.0001, "loss": 1.2687, "loss/crossentropy": 2.53467059135437, "loss/hidden": 1.078125, "loss/logits": 0.18407757580280304, "loss/reg": 0.0006495587876997888, "step": 4624 }, { "epoch": 0.578125, "grad_norm": 2.370218276977539, "grad_norm_var": 4.055500088654183, "learning_rate": 0.0001, "loss": 1.1663, "loss/crossentropy": 2.543036460876465, "loss/hidden": 1.0, "loss/logits": 0.15984173119068146, "loss/reg": 0.0006491952808573842, "step": 4625 }, { "epoch": 0.57825, "grad_norm": 2.4503896236419678, "grad_norm_var": 4.074880748100236, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.202603578567505, "loss/hidden": 1.078125, "loss/logits": 0.1604217290878296, "loss/reg": 0.0006488971994258463, "step": 4626 }, { "epoch": 0.578375, "grad_norm": 2.8367466926574707, "grad_norm_var": 4.027864920879351, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.62064790725708, "loss/hidden": 1.09375, "loss/logits": 0.21593736112117767, "loss/reg": 0.0006485393969342113, "step": 4627 }, { "epoch": 0.5785, "grad_norm": 7.225318908691406, "grad_norm_var": 4.950117641399542, "learning_rate": 0.0001, "loss": 1.5399, "loss/crossentropy": 2.152503490447998, "loss/hidden": 1.3046875, "loss/logits": 0.22877474129199982, "loss/reg": 0.0006482175085693598, "step": 4628 }, { "epoch": 0.578625, "grad_norm": 3.296053886413574, "grad_norm_var": 4.953387244180233, "learning_rate": 0.0001, "loss": 1.3872, "loss/crossentropy": 2.323751449584961, "loss/hidden": 1.15625, "loss/logits": 0.22447875142097473, "loss/reg": 0.0006479380535893142, "step": 4629 }, { "epoch": 0.57875, "grad_norm": 1.9421238899230957, "grad_norm_var": 5.052003638554553, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.473816156387329, "loss/hidden": 0.953125, "loss/logits": 0.14966681599617004, "loss/reg": 0.0006476659327745438, "step": 4630 }, { "epoch": 0.578875, "grad_norm": 2.274291753768921, "grad_norm_var": 4.999830698190627, "learning_rate": 0.0001, "loss": 1.2517, "loss/crossentropy": 2.5056753158569336, "loss/hidden": 1.0625, "loss/logits": 0.1827664077281952, "loss/reg": 0.0006473138346336782, "step": 4631 }, { "epoch": 0.579, "grad_norm": 3.1611931324005127, "grad_norm_var": 4.943786283665692, "learning_rate": 0.0001, "loss": 1.4204, "loss/crossentropy": 2.2271625995635986, "loss/hidden": 1.2109375, "loss/logits": 0.2029782384634018, "loss/reg": 0.0006470340886153281, "step": 4632 }, { "epoch": 0.579125, "grad_norm": 2.233492851257324, "grad_norm_var": 4.998300394154497, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.4525234699249268, "loss/hidden": 1.03125, "loss/logits": 0.1567961722612381, "loss/reg": 0.0006466872873716056, "step": 4633 }, { "epoch": 0.57925, "grad_norm": 7.602259159088135, "grad_norm_var": 2.850439712944726, "learning_rate": 0.0001, "loss": 1.9042, "loss/crossentropy": 2.267160177230835, "loss/hidden": 1.546875, "loss/logits": 0.3508527874946594, "loss/reg": 0.0006463872850872576, "step": 4634 }, { "epoch": 0.579375, "grad_norm": 3.751835346221924, "grad_norm_var": 2.817426781391869, "learning_rate": 0.0001, "loss": 1.2597, "loss/crossentropy": 2.449087142944336, "loss/hidden": 1.0625, "loss/logits": 0.19072309136390686, "loss/reg": 0.0006461034063249826, "step": 4635 }, { "epoch": 0.5795, "grad_norm": 2.5605738162994385, "grad_norm_var": 2.8378383299380365, "learning_rate": 0.0001, "loss": 1.2293, "loss/crossentropy": 2.6488780975341797, "loss/hidden": 1.0390625, "loss/logits": 0.1838100552558899, "loss/reg": 0.0006458122516050935, "step": 4636 }, { "epoch": 0.579625, "grad_norm": 3.5354647636413574, "grad_norm_var": 2.8086326145985536, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.26729679107666, "loss/hidden": 1.234375, "loss/logits": 0.19626551866531372, "loss/reg": 0.0006454665563069284, "step": 4637 }, { "epoch": 0.57975, "grad_norm": 2.8880178928375244, "grad_norm_var": 2.715078553131843, "learning_rate": 0.0001, "loss": 1.2323, "loss/crossentropy": 2.882366180419922, "loss/hidden": 1.0390625, "loss/logits": 0.18681839108467102, "loss/reg": 0.0006451678927987814, "step": 4638 }, { "epoch": 0.579875, "grad_norm": 4.744136333465576, "grad_norm_var": 2.8065320070761497, "learning_rate": 0.0001, "loss": 1.4222, "loss/crossentropy": 2.4375789165496826, "loss/hidden": 1.1640625, "loss/logits": 0.2516406178474426, "loss/reg": 0.0006448218482546508, "step": 4639 }, { "epoch": 0.58, "grad_norm": 3.7077646255493164, "grad_norm_var": 2.807069026148184, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.7084474563598633, "loss/hidden": 1.171875, "loss/logits": 0.24065282940864563, "loss/reg": 0.0006445383187383413, "step": 4640 }, { "epoch": 0.580125, "grad_norm": 3.0713050365448, "grad_norm_var": 2.7287912999794495, "learning_rate": 0.0001, "loss": 1.2608, "loss/crossentropy": 2.5310752391815186, "loss/hidden": 1.0859375, "loss/logits": 0.16838188469409943, "loss/reg": 0.0006442532758228481, "step": 4641 }, { "epoch": 0.58025, "grad_norm": 2.380626916885376, "grad_norm_var": 2.7396033300928053, "learning_rate": 0.0001, "loss": 1.2072, "loss/crossentropy": 2.699526309967041, "loss/hidden": 1.0078125, "loss/logits": 0.19298124313354492, "loss/reg": 0.000643935811240226, "step": 4642 }, { "epoch": 0.580375, "grad_norm": 2.4706833362579346, "grad_norm_var": 2.784045657889936, "learning_rate": 0.0001, "loss": 1.1639, "loss/crossentropy": 2.4858529567718506, "loss/hidden": 0.9921875, "loss/logits": 0.1652545928955078, "loss/reg": 0.000643593433778733, "step": 4643 }, { "epoch": 0.5805, "grad_norm": 3.0282297134399414, "grad_norm_var": 1.8298447069122514, "learning_rate": 0.0001, "loss": 1.4626, "loss/crossentropy": 2.2936015129089355, "loss/hidden": 1.2265625, "loss/logits": 0.22957974672317505, "loss/reg": 0.0006433242233470082, "step": 4644 }, { "epoch": 0.580625, "grad_norm": 8.294962882995605, "grad_norm_var": 3.395362502097339, "learning_rate": 0.0001, "loss": 1.8277, "loss/crossentropy": 2.220132350921631, "loss/hidden": 1.5703125, "loss/logits": 0.2509624660015106, "loss/reg": 0.0006430034409277141, "step": 4645 }, { "epoch": 0.58075, "grad_norm": 2.970308303833008, "grad_norm_var": 3.233752509270857, "learning_rate": 0.0001, "loss": 1.3214, "loss/crossentropy": 2.5036587715148926, "loss/hidden": 1.109375, "loss/logits": 0.20561806857585907, "loss/reg": 0.0006426598993130028, "step": 4646 }, { "epoch": 0.580875, "grad_norm": 2.508435010910034, "grad_norm_var": 3.193693713297608, "learning_rate": 0.0001, "loss": 1.3813, "loss/crossentropy": 2.5881357192993164, "loss/hidden": 1.15625, "loss/logits": 0.21867448091506958, "loss/reg": 0.0006423134473152459, "step": 4647 }, { "epoch": 0.581, "grad_norm": 2.7338249683380127, "grad_norm_var": 3.2347761171450027, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.8256001472473145, "loss/hidden": 0.99609375, "loss/logits": 0.15301626920700073, "loss/reg": 0.0006420139106921852, "step": 4648 }, { "epoch": 0.581125, "grad_norm": 2.8572258949279785, "grad_norm_var": 3.1408625169015294, "learning_rate": 0.0001, "loss": 1.4819, "loss/crossentropy": 2.582308769226074, "loss/hidden": 1.21875, "loss/logits": 0.25674575567245483, "loss/reg": 0.000641667633317411, "step": 4649 }, { "epoch": 0.58125, "grad_norm": 2.80285906791687, "grad_norm_var": 2.0795954972850033, "learning_rate": 0.0001, "loss": 1.3682, "loss/crossentropy": 2.259986639022827, "loss/hidden": 1.1484375, "loss/logits": 0.2133706510066986, "loss/reg": 0.0006413627415895462, "step": 4650 }, { "epoch": 0.581375, "grad_norm": 2.6211373805999756, "grad_norm_var": 2.105574443072294, "learning_rate": 0.0001, "loss": 1.1813, "loss/crossentropy": 2.692805528640747, "loss/hidden": 1.015625, "loss/logits": 0.15930116176605225, "loss/reg": 0.0006410317146219313, "step": 4651 }, { "epoch": 0.5815, "grad_norm": 2.7373361587524414, "grad_norm_var": 2.0895470224516877, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.3507421016693115, "loss/hidden": 1.09375, "loss/logits": 0.17878121137619019, "loss/reg": 0.0006407218752428889, "step": 4652 }, { "epoch": 0.581625, "grad_norm": 2.741372585296631, "grad_norm_var": 2.107682588335508, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.4924142360687256, "loss/hidden": 1.1484375, "loss/logits": 0.23594772815704346, "loss/reg": 0.0006404187297448516, "step": 4653 }, { "epoch": 0.58175, "grad_norm": 2.7200608253479004, "grad_norm_var": 2.11833333079659, "learning_rate": 0.0001, "loss": 1.3293, "loss/crossentropy": 2.5796916484832764, "loss/hidden": 1.1171875, "loss/logits": 0.20571675896644592, "loss/reg": 0.0006401249556802213, "step": 4654 }, { "epoch": 0.581875, "grad_norm": 2.9689550399780273, "grad_norm_var": 1.967412556474255, "learning_rate": 0.0001, "loss": 1.3354, "loss/crossentropy": 2.6272706985473633, "loss/hidden": 1.140625, "loss/logits": 0.188334658741951, "loss/reg": 0.0006398514378815889, "step": 4655 }, { "epoch": 0.582, "grad_norm": 2.694282293319702, "grad_norm_var": 1.9580544932952268, "learning_rate": 0.0001, "loss": 1.371, "loss/crossentropy": 2.242241859436035, "loss/hidden": 1.1484375, "loss/logits": 0.2161926031112671, "loss/reg": 0.000639581645373255, "step": 4656 }, { "epoch": 0.582125, "grad_norm": 2.388843059539795, "grad_norm_var": 1.9897843666635364, "learning_rate": 0.0001, "loss": 1.5271, "loss/crossentropy": 2.06567645072937, "loss/hidden": 1.2734375, "loss/logits": 0.2472807765007019, "loss/reg": 0.0006393597577698529, "step": 4657 }, { "epoch": 0.58225, "grad_norm": 2.196504592895508, "grad_norm_var": 2.0085188594640377, "learning_rate": 0.0001, "loss": 1.0891, "loss/crossentropy": 2.542485237121582, "loss/hidden": 0.94140625, "loss/logits": 0.14127019047737122, "loss/reg": 0.0006390196504071355, "step": 4658 }, { "epoch": 0.582375, "grad_norm": 3.6250271797180176, "grad_norm_var": 2.003261699516874, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.916038751602173, "loss/hidden": 1.15625, "loss/logits": 0.19720198214054108, "loss/reg": 0.0006387906614691019, "step": 4659 }, { "epoch": 0.5825, "grad_norm": 2.3115322589874268, "grad_norm_var": 2.0439517223896813, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.321695327758789, "loss/hidden": 1.109375, "loss/logits": 0.18914684653282166, "loss/reg": 0.0006384504376910627, "step": 4660 }, { "epoch": 0.582625, "grad_norm": 2.6319234371185303, "grad_norm_var": 0.10559040435935359, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.601752758026123, "loss/hidden": 1.03125, "loss/logits": 0.18728289008140564, "loss/reg": 0.0006381092243827879, "step": 4661 }, { "epoch": 0.58275, "grad_norm": 2.341745376586914, "grad_norm_var": 0.10925134944284665, "learning_rate": 0.0001, "loss": 1.2099, "loss/crossentropy": 2.434358596801758, "loss/hidden": 1.0234375, "loss/logits": 0.1800653636455536, "loss/reg": 0.0006378052057698369, "step": 4662 }, { "epoch": 0.582875, "grad_norm": 2.341581106185913, "grad_norm_var": 0.11480968299325482, "learning_rate": 0.0001, "loss": 1.1503, "loss/crossentropy": 2.3678412437438965, "loss/hidden": 0.98046875, "loss/logits": 0.16344693303108215, "loss/reg": 0.0006374584045261145, "step": 4663 }, { "epoch": 0.583, "grad_norm": 4.341050624847412, "grad_norm_var": 0.29001309320297347, "learning_rate": 0.0001, "loss": 2.1603, "loss/crossentropy": 2.572618246078491, "loss/hidden": 1.8203125, "loss/logits": 0.3336426019668579, "loss/reg": 0.000637140532489866, "step": 4664 }, { "epoch": 0.583125, "grad_norm": 3.0948147773742676, "grad_norm_var": 0.2963014651789194, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.650960683822632, "loss/hidden": 1.15625, "loss/logits": 0.19572153687477112, "loss/reg": 0.0006367935566231608, "step": 4665 }, { "epoch": 0.58325, "grad_norm": 2.570023536682129, "grad_norm_var": 0.29913341883591493, "learning_rate": 0.0001, "loss": 1.2296, "loss/crossentropy": 2.481837511062622, "loss/hidden": 1.0390625, "loss/logits": 0.1841709315776825, "loss/reg": 0.0006364603177644312, "step": 4666 }, { "epoch": 0.583375, "grad_norm": 2.7370505332946777, "grad_norm_var": 0.2976664958715711, "learning_rate": 0.0001, "loss": 1.3909, "loss/crossentropy": 2.4474353790283203, "loss/hidden": 1.15625, "loss/logits": 0.2282680720090866, "loss/reg": 0.0006361139239743352, "step": 4667 }, { "epoch": 0.5835, "grad_norm": 2.8040497303009033, "grad_norm_var": 0.2975862321847423, "learning_rate": 0.0001, "loss": 1.2609, "loss/crossentropy": 2.785749673843384, "loss/hidden": 1.1015625, "loss/logits": 0.15294674038887024, "loss/reg": 0.0006358035025186837, "step": 4668 }, { "epoch": 0.583625, "grad_norm": 2.6143417358398438, "grad_norm_var": 0.2992795396078499, "learning_rate": 0.0001, "loss": 1.1996, "loss/crossentropy": 2.7143678665161133, "loss/hidden": 1.0078125, "loss/logits": 0.18544641137123108, "loss/reg": 0.000635516073089093, "step": 4669 }, { "epoch": 0.58375, "grad_norm": 2.35081148147583, "grad_norm_var": 0.31044989530827605, "learning_rate": 0.0001, "loss": 1.0732, "loss/crossentropy": 2.5323424339294434, "loss/hidden": 0.92578125, "loss/logits": 0.1410677134990692, "loss/reg": 0.0006352057680487633, "step": 4670 }, { "epoch": 0.583875, "grad_norm": 2.5072147846221924, "grad_norm_var": 0.310343341201307, "learning_rate": 0.0001, "loss": 1.2831, "loss/crossentropy": 2.668748378753662, "loss/hidden": 1.078125, "loss/logits": 0.19859813153743744, "loss/reg": 0.0006348899914883077, "step": 4671 }, { "epoch": 0.584, "grad_norm": 3.01366925239563, "grad_norm_var": 0.31554168967480317, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.2803778648376465, "loss/hidden": 1.140625, "loss/logits": 0.1961110532283783, "loss/reg": 0.0006346572772599757, "step": 4672 }, { "epoch": 0.584125, "grad_norm": 3.0058212280273438, "grad_norm_var": 0.3102903918129338, "learning_rate": 0.0001, "loss": 1.6176, "loss/crossentropy": 2.2173070907592773, "loss/hidden": 1.3671875, "loss/logits": 0.2440624237060547, "loss/reg": 0.0006343870190903544, "step": 4673 }, { "epoch": 0.58425, "grad_norm": 3.8751094341278076, "grad_norm_var": 0.35570292178687335, "learning_rate": 0.0001, "loss": 1.3702, "loss/crossentropy": 2.5553789138793945, "loss/hidden": 1.15625, "loss/logits": 0.2076472043991089, "loss/reg": 0.0006340874824672937, "step": 4674 }, { "epoch": 0.584375, "grad_norm": 2.395458459854126, "grad_norm_var": 0.32893005682424153, "learning_rate": 0.0001, "loss": 1.1892, "loss/crossentropy": 2.582611560821533, "loss/hidden": 1.0, "loss/logits": 0.18283426761627197, "loss/reg": 0.0006338307866826653, "step": 4675 }, { "epoch": 0.5845, "grad_norm": 2.607210397720337, "grad_norm_var": 0.3148013342703498, "learning_rate": 0.0001, "loss": 1.4349, "loss/crossentropy": 2.3899710178375244, "loss/hidden": 1.2109375, "loss/logits": 0.2176152914762497, "loss/reg": 0.000633572752121836, "step": 4676 }, { "epoch": 0.584625, "grad_norm": 3.9262540340423584, "grad_norm_var": 0.3858425952014528, "learning_rate": 0.0001, "loss": 1.3536, "loss/crossentropy": 2.706650972366333, "loss/hidden": 1.125, "loss/logits": 0.22231543064117432, "loss/reg": 0.0006333195487968624, "step": 4677 }, { "epoch": 0.58475, "grad_norm": 2.402273178100586, "grad_norm_var": 0.3815025894385286, "learning_rate": 0.0001, "loss": 1.3605, "loss/crossentropy": 2.4344570636749268, "loss/hidden": 1.140625, "loss/logits": 0.21352441608905792, "loss/reg": 0.0006330383475869894, "step": 4678 }, { "epoch": 0.584875, "grad_norm": 2.3626298904418945, "grad_norm_var": 0.37993032055417736, "learning_rate": 0.0001, "loss": 1.1921, "loss/crossentropy": 2.568277597427368, "loss/hidden": 1.015625, "loss/logits": 0.17010287940502167, "loss/reg": 0.0006327080191113055, "step": 4679 }, { "epoch": 0.585, "grad_norm": 2.492293119430542, "grad_norm_var": 0.24153009011150364, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.5455281734466553, "loss/hidden": 1.125, "loss/logits": 0.167146235704422, "loss/reg": 0.0006323677953332663, "step": 4680 }, { "epoch": 0.585125, "grad_norm": 3.7730700969696045, "grad_norm_var": 0.29717486734959164, "learning_rate": 0.0001, "loss": 1.4341, "loss/crossentropy": 2.331550359725952, "loss/hidden": 1.234375, "loss/logits": 0.1933964490890503, "loss/reg": 0.0006320521351881325, "step": 4681 }, { "epoch": 0.58525, "grad_norm": 2.735081672668457, "grad_norm_var": 0.29293979429848277, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.8688902854919434, "loss/hidden": 0.99609375, "loss/logits": 0.17407768964767456, "loss/reg": 0.0006317704683169723, "step": 4682 }, { "epoch": 0.585375, "grad_norm": 2.6861777305603027, "grad_norm_var": 0.29386867932707117, "learning_rate": 0.0001, "loss": 1.3437, "loss/crossentropy": 2.6964738368988037, "loss/hidden": 1.140625, "loss/logits": 0.19675499200820923, "loss/reg": 0.0006314352503977716, "step": 4683 }, { "epoch": 0.5855, "grad_norm": 30.43675994873047, "grad_norm_var": 47.858664404507785, "learning_rate": 0.0001, "loss": 1.7687, "loss/crossentropy": 2.78887677192688, "loss/hidden": 1.4609375, "loss/logits": 0.3014136552810669, "loss/reg": 0.0006310963653959334, "step": 4684 }, { "epoch": 0.585625, "grad_norm": 2.8974359035491943, "grad_norm_var": 47.789703839118495, "learning_rate": 0.0001, "loss": 1.2554, "loss/crossentropy": 2.551180362701416, "loss/hidden": 1.0546875, "loss/logits": 0.1944343000650406, "loss/reg": 0.0006307560252025723, "step": 4685 }, { "epoch": 0.58575, "grad_norm": 2.922001361846924, "grad_norm_var": 47.63943157449671, "learning_rate": 0.0001, "loss": 1.347, "loss/crossentropy": 2.6626436710357666, "loss/hidden": 1.140625, "loss/logits": 0.20010551810264587, "loss/reg": 0.0006304056732915342, "step": 4686 }, { "epoch": 0.585875, "grad_norm": 2.2253870964050293, "grad_norm_var": 47.72406614747959, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.4485559463500977, "loss/hidden": 0.9375, "loss/logits": 0.14417949318885803, "loss/reg": 0.0006300712120719254, "step": 4687 }, { "epoch": 0.586, "grad_norm": 2.9811899662017822, "grad_norm_var": 47.73104419204231, "learning_rate": 0.0001, "loss": 1.1748, "loss/crossentropy": 2.6289620399475098, "loss/hidden": 1.015625, "loss/logits": 0.1529102474451065, "loss/reg": 0.0006297536310739815, "step": 4688 }, { "epoch": 0.586125, "grad_norm": 2.790989398956299, "grad_norm_var": 47.77981504997193, "learning_rate": 0.0001, "loss": 1.4835, "loss/crossentropy": 2.3812954425811768, "loss/hidden": 1.25, "loss/logits": 0.22716450691223145, "loss/reg": 0.0006294146878644824, "step": 4689 }, { "epoch": 0.58625, "grad_norm": 2.767373561859131, "grad_norm_var": 47.96273538026895, "learning_rate": 0.0001, "loss": 1.3111, "loss/crossentropy": 2.479719400405884, "loss/hidden": 1.1015625, "loss/logits": 0.2032855749130249, "loss/reg": 0.000629096757620573, "step": 4690 }, { "epoch": 0.586375, "grad_norm": 2.5073401927948, "grad_norm_var": 47.931748673820294, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.802661895751953, "loss/hidden": 1.0546875, "loss/logits": 0.19114983081817627, "loss/reg": 0.0006287907599471509, "step": 4691 }, { "epoch": 0.5865, "grad_norm": 2.4203710556030273, "grad_norm_var": 47.9818829572571, "learning_rate": 0.0001, "loss": 1.1436, "loss/crossentropy": 2.5566537380218506, "loss/hidden": 0.98046875, "loss/logits": 0.1568315178155899, "loss/reg": 0.0006284890114329755, "step": 4692 }, { "epoch": 0.586625, "grad_norm": 3.624335527420044, "grad_norm_var": 48.01149852716707, "learning_rate": 0.0001, "loss": 1.4881, "loss/crossentropy": 2.4991908073425293, "loss/hidden": 1.265625, "loss/logits": 0.21619540452957153, "loss/reg": 0.000628159730695188, "step": 4693 }, { "epoch": 0.58675, "grad_norm": 2.5397613048553467, "grad_norm_var": 47.974196648816516, "learning_rate": 0.0001, "loss": 1.3095, "loss/crossentropy": 2.515597105026245, "loss/hidden": 1.0703125, "loss/logits": 0.2329448014497757, "loss/reg": 0.0006278451764956117, "step": 4694 }, { "epoch": 0.586875, "grad_norm": 3.2661478519439697, "grad_norm_var": 47.76650998298054, "learning_rate": 0.0001, "loss": 1.358, "loss/crossentropy": 2.5030808448791504, "loss/hidden": 1.140625, "loss/logits": 0.21108534932136536, "loss/reg": 0.0006275131599977612, "step": 4695 }, { "epoch": 0.587, "grad_norm": 2.5214285850524902, "grad_norm_var": 47.758504890002676, "learning_rate": 0.0001, "loss": 1.0869, "loss/crossentropy": 2.9453036785125732, "loss/hidden": 0.9296875, "loss/logits": 0.15095344185829163, "loss/reg": 0.0006271915044635534, "step": 4696 }, { "epoch": 0.587125, "grad_norm": 2.2989156246185303, "grad_norm_var": 48.050656364332546, "learning_rate": 0.0001, "loss": 1.1791, "loss/crossentropy": 2.4186148643493652, "loss/hidden": 1.0078125, "loss/logits": 0.16506549715995789, "loss/reg": 0.0006268562283366919, "step": 4697 }, { "epoch": 0.58725, "grad_norm": 2.8585758209228516, "grad_norm_var": 48.02293894261126, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.503479242324829, "loss/hidden": 1.140625, "loss/logits": 0.183914452791214, "loss/reg": 0.0006265369011089206, "step": 4698 }, { "epoch": 0.587375, "grad_norm": 2.2863001823425293, "grad_norm_var": 48.12878795870019, "learning_rate": 0.0001, "loss": 1.1574, "loss/crossentropy": 2.584681749343872, "loss/hidden": 0.9921875, "loss/logits": 0.15891656279563904, "loss/reg": 0.000626209715846926, "step": 4699 }, { "epoch": 0.5875, "grad_norm": 2.3650875091552734, "grad_norm_var": 0.14814679981686987, "learning_rate": 0.0001, "loss": 1.1647, "loss/crossentropy": 2.244703769683838, "loss/hidden": 0.9921875, "loss/logits": 0.16630248725414276, "loss/reg": 0.0006259073270484805, "step": 4700 }, { "epoch": 0.587625, "grad_norm": 2.773437976837158, "grad_norm_var": 0.14591860970795759, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.7894773483276367, "loss/hidden": 1.1015625, "loss/logits": 0.1745486557483673, "loss/reg": 0.0006256120395846665, "step": 4701 }, { "epoch": 0.58775, "grad_norm": 2.2696006298065186, "grad_norm_var": 0.15292988816295647, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.6657819747924805, "loss/hidden": 1.0, "loss/logits": 0.17316454648971558, "loss/reg": 0.0006253327592276037, "step": 4702 }, { "epoch": 0.587875, "grad_norm": 2.498739719390869, "grad_norm_var": 0.1419048842253969, "learning_rate": 0.0001, "loss": 1.1893, "loss/crossentropy": 2.4745805263519287, "loss/hidden": 1.015625, "loss/logits": 0.1674148440361023, "loss/reg": 0.000625002256128937, "step": 4703 }, { "epoch": 0.588, "grad_norm": 2.773653984069824, "grad_norm_var": 0.13607152391491487, "learning_rate": 0.0001, "loss": 1.211, "loss/crossentropy": 2.6415460109710693, "loss/hidden": 1.03125, "loss/logits": 0.17352080345153809, "loss/reg": 0.0006247060373425484, "step": 4704 }, { "epoch": 0.588125, "grad_norm": 2.552037239074707, "grad_norm_var": 0.13547089723350988, "learning_rate": 0.0001, "loss": 1.0926, "loss/crossentropy": 2.3512487411499023, "loss/hidden": 0.91796875, "loss/logits": 0.1683613508939743, "loss/reg": 0.0006243722746148705, "step": 4705 }, { "epoch": 0.58825, "grad_norm": 3.5300796031951904, "grad_norm_var": 0.18425335657560116, "learning_rate": 0.0001, "loss": 1.4682, "loss/crossentropy": 1.8885351419448853, "loss/hidden": 1.2421875, "loss/logits": 0.21973538398742676, "loss/reg": 0.0006239715148694813, "step": 4706 }, { "epoch": 0.588375, "grad_norm": 2.652418613433838, "grad_norm_var": 0.1819801209074323, "learning_rate": 0.0001, "loss": 1.2377, "loss/crossentropy": 2.5963494777679443, "loss/hidden": 1.0546875, "loss/logits": 0.1767793893814087, "loss/reg": 0.0006236377521418035, "step": 4707 }, { "epoch": 0.5885, "grad_norm": 2.583242177963257, "grad_norm_var": 0.17752366715341522, "learning_rate": 0.0001, "loss": 1.1868, "loss/crossentropy": 2.5870718955993652, "loss/hidden": 1.0078125, "loss/logits": 0.1727292835712433, "loss/reg": 0.0006232992163859308, "step": 4708 }, { "epoch": 0.588625, "grad_norm": 2.1652774810791016, "grad_norm_var": 0.13311151185834263, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.6708269119262695, "loss/hidden": 0.93359375, "loss/logits": 0.14788281917572021, "loss/reg": 0.000622964755166322, "step": 4709 }, { "epoch": 0.58875, "grad_norm": 2.624234199523926, "grad_norm_var": 0.13264340762424898, "learning_rate": 0.0001, "loss": 1.2007, "loss/crossentropy": 2.612135410308838, "loss/hidden": 1.015625, "loss/logits": 0.1788371354341507, "loss/reg": 0.0006225847755558789, "step": 4710 }, { "epoch": 0.588875, "grad_norm": 3.441258192062378, "grad_norm_var": 0.14950144931681825, "learning_rate": 0.0001, "loss": 1.4552, "loss/crossentropy": 2.6218936443328857, "loss/hidden": 1.1953125, "loss/logits": 0.2536616623401642, "loss/reg": 0.0006222163210622966, "step": 4711 }, { "epoch": 0.589, "grad_norm": 2.6492156982421875, "grad_norm_var": 0.14855047164641041, "learning_rate": 0.0001, "loss": 1.1354, "loss/crossentropy": 2.8068649768829346, "loss/hidden": 0.96484375, "loss/logits": 0.16434694826602936, "loss/reg": 0.0006218744092620909, "step": 4712 }, { "epoch": 0.589125, "grad_norm": 4.911133766174316, "grad_norm_var": 0.4544458179639567, "learning_rate": 0.0001, "loss": 1.4475, "loss/crossentropy": 2.982008695602417, "loss/hidden": 1.21875, "loss/logits": 0.22251421213150024, "loss/reg": 0.0006215113098733127, "step": 4713 }, { "epoch": 0.58925, "grad_norm": 2.370255708694458, "grad_norm_var": 0.4660819999538146, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.4590132236480713, "loss/hidden": 1.0625, "loss/logits": 0.17259567975997925, "loss/reg": 0.0006211713189259171, "step": 4714 }, { "epoch": 0.589375, "grad_norm": 2.158547878265381, "grad_norm_var": 0.475475320314073, "learning_rate": 0.0001, "loss": 1.1562, "loss/crossentropy": 2.505344867706299, "loss/hidden": 0.9765625, "loss/logits": 0.17338553071022034, "loss/reg": 0.000620837789028883, "step": 4715 }, { "epoch": 0.5895, "grad_norm": 3.7225372791290283, "grad_norm_var": 0.5173758699222532, "learning_rate": 0.0001, "loss": 1.4007, "loss/crossentropy": 2.4692041873931885, "loss/hidden": 1.1484375, "loss/logits": 0.24609379470348358, "loss/reg": 0.0006204904057085514, "step": 4716 }, { "epoch": 0.589625, "grad_norm": 3.0797834396362305, "grad_norm_var": 0.5199209074926557, "learning_rate": 0.0001, "loss": 1.2824, "loss/crossentropy": 2.2620136737823486, "loss/hidden": 1.09375, "loss/logits": 0.18247854709625244, "loss/reg": 0.0006201678770594299, "step": 4717 }, { "epoch": 0.58975, "grad_norm": 3.2343201637268066, "grad_norm_var": 0.5003611463598605, "learning_rate": 0.0001, "loss": 1.4461, "loss/crossentropy": 2.6459105014801025, "loss/hidden": 1.203125, "loss/logits": 0.23678520321846008, "loss/reg": 0.0006198700866661966, "step": 4718 }, { "epoch": 0.589875, "grad_norm": 2.1833035945892334, "grad_norm_var": 0.5248933250137611, "learning_rate": 0.0001, "loss": 1.1485, "loss/crossentropy": 2.4665727615356445, "loss/hidden": 0.9765625, "loss/logits": 0.1657778024673462, "loss/reg": 0.0006195475580170751, "step": 4719 }, { "epoch": 0.59, "grad_norm": 2.276548385620117, "grad_norm_var": 0.5496704237390759, "learning_rate": 0.0001, "loss": 1.1588, "loss/crossentropy": 2.585287094116211, "loss/hidden": 0.98046875, "loss/logits": 0.17213648557662964, "loss/reg": 0.0006192470900714397, "step": 4720 }, { "epoch": 0.590125, "grad_norm": 3.2019426822662354, "grad_norm_var": 0.5473561821010224, "learning_rate": 0.0001, "loss": 1.321, "loss/crossentropy": 2.383382797241211, "loss/hidden": 1.125, "loss/logits": 0.18979762494564056, "loss/reg": 0.0006189087289385498, "step": 4721 }, { "epoch": 0.59025, "grad_norm": 3.194746971130371, "grad_norm_var": 0.5272860209415579, "learning_rate": 0.0001, "loss": 1.4473, "loss/crossentropy": 2.243533134460449, "loss/hidden": 1.21875, "loss/logits": 0.22241128981113434, "loss/reg": 0.0006185868987813592, "step": 4722 }, { "epoch": 0.590375, "grad_norm": 3.157177209854126, "grad_norm_var": 0.5263422109369019, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.4925549030303955, "loss/hidden": 1.0625, "loss/logits": 0.15062439441680908, "loss/reg": 0.0006182413198985159, "step": 4723 }, { "epoch": 0.5905, "grad_norm": 2.457512617111206, "grad_norm_var": 0.5332202692857206, "learning_rate": 0.0001, "loss": 1.2906, "loss/crossentropy": 2.6806676387786865, "loss/hidden": 1.09375, "loss/logits": 0.19063445925712585, "loss/reg": 0.0006178833427838981, "step": 4724 }, { "epoch": 0.590625, "grad_norm": 2.288226366043091, "grad_norm_var": 0.5216822962017422, "learning_rate": 0.0001, "loss": 1.2596, "loss/crossentropy": 2.7105891704559326, "loss/hidden": 1.0625, "loss/logits": 0.19093140959739685, "loss/reg": 0.0006175250164233148, "step": 4725 }, { "epoch": 0.59075, "grad_norm": 2.742800235748291, "grad_norm_var": 0.5176572246166681, "learning_rate": 0.0001, "loss": 1.377, "loss/crossentropy": 2.72799015045166, "loss/hidden": 1.15625, "loss/logits": 0.21460343897342682, "loss/reg": 0.0006171996355988085, "step": 4726 }, { "epoch": 0.590875, "grad_norm": 2.5042622089385986, "grad_norm_var": 0.5101350910508008, "learning_rate": 0.0001, "loss": 1.2047, "loss/crossentropy": 2.6351380348205566, "loss/hidden": 1.0234375, "loss/logits": 0.17508895695209503, "loss/reg": 0.0006168906693346798, "step": 4727 }, { "epoch": 0.591, "grad_norm": 2.874661684036255, "grad_norm_var": 0.5062761731810096, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.422654867172241, "loss/hidden": 1.3046875, "loss/logits": 0.2549160122871399, "loss/reg": 0.0006165467784740031, "step": 4728 }, { "epoch": 0.591125, "grad_norm": 2.1993727684020996, "grad_norm_var": 0.2377627383182033, "learning_rate": 0.0001, "loss": 1.1808, "loss/crossentropy": 2.6052629947662354, "loss/hidden": 1.0, "loss/logits": 0.17461246252059937, "loss/reg": 0.0006162136560305953, "step": 4729 }, { "epoch": 0.59125, "grad_norm": 2.9430198669433594, "grad_norm_var": 0.23095554766561388, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.3848702907562256, "loss/hidden": 1.1640625, "loss/logits": 0.2059720754623413, "loss/reg": 0.000615897006355226, "step": 4730 }, { "epoch": 0.591375, "grad_norm": 2.218881607055664, "grad_norm_var": 0.2263151325991572, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.644315004348755, "loss/hidden": 0.9453125, "loss/logits": 0.1543736457824707, "loss/reg": 0.0006155523005872965, "step": 4731 }, { "epoch": 0.5915, "grad_norm": 2.953497886657715, "grad_norm_var": 0.16534502343301155, "learning_rate": 0.0001, "loss": 1.5658, "loss/crossentropy": 2.4887900352478027, "loss/hidden": 1.3359375, "loss/logits": 0.2237081527709961, "loss/reg": 0.0006151851848699152, "step": 4732 }, { "epoch": 0.591625, "grad_norm": 2.886462450027466, "grad_norm_var": 0.1583909936872478, "learning_rate": 0.0001, "loss": 1.3255, "loss/crossentropy": 2.7448747158050537, "loss/hidden": 1.1328125, "loss/logits": 0.1865481436252594, "loss/reg": 0.0006148516549728811, "step": 4733 }, { "epoch": 0.59175, "grad_norm": 3.0639710426330566, "grad_norm_var": 0.14823425737461, "learning_rate": 0.0001, "loss": 1.202, "loss/crossentropy": 2.4721999168395996, "loss/hidden": 1.03125, "loss/logits": 0.16463586688041687, "loss/reg": 0.0006144935614429414, "step": 4734 }, { "epoch": 0.591875, "grad_norm": 2.548227071762085, "grad_norm_var": 0.13157974596424443, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.484050750732422, "loss/hidden": 1.046875, "loss/logits": 0.16976234316825867, "loss/reg": 0.0006141397170722485, "step": 4735 }, { "epoch": 0.592, "grad_norm": 2.5478291511535645, "grad_norm_var": 0.12015898139888653, "learning_rate": 0.0001, "loss": 1.2286, "loss/crossentropy": 2.3843271732330322, "loss/hidden": 1.0390625, "loss/logits": 0.18341785669326782, "loss/reg": 0.000613829237408936, "step": 4736 }, { "epoch": 0.592125, "grad_norm": 2.4568793773651123, "grad_norm_var": 0.10860729447758691, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.5800838470458984, "loss/hidden": 1.0859375, "loss/logits": 0.2065696120262146, "loss/reg": 0.0006135067669674754, "step": 4737 }, { "epoch": 0.59225, "grad_norm": 2.4811277389526367, "grad_norm_var": 0.09239458638562516, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.654041290283203, "loss/hidden": 1.09375, "loss/logits": 0.19212424755096436, "loss/reg": 0.0006131718982942402, "step": 4738 }, { "epoch": 0.592375, "grad_norm": 2.5448102951049805, "grad_norm_var": 0.07403289603246298, "learning_rate": 0.0001, "loss": 1.142, "loss/crossentropy": 2.647541046142578, "loss/hidden": 0.96875, "loss/logits": 0.16711601614952087, "loss/reg": 0.0006128384848125279, "step": 4739 }, { "epoch": 0.5925, "grad_norm": 2.7481484413146973, "grad_norm_var": 0.07352047646461415, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.89884090423584, "loss/hidden": 1.1640625, "loss/logits": 0.2133486270904541, "loss/reg": 0.000612483941949904, "step": 4740 }, { "epoch": 0.592625, "grad_norm": 2.6731925010681152, "grad_norm_var": 0.06548972896693286, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.7134451866149902, "loss/hidden": 0.9921875, "loss/logits": 0.1587049961090088, "loss/reg": 0.0006121675251051784, "step": 4741 }, { "epoch": 0.59275, "grad_norm": 3.224759578704834, "grad_norm_var": 0.08602262083331666, "learning_rate": 0.0001, "loss": 1.3326, "loss/crossentropy": 2.637308120727539, "loss/hidden": 1.125, "loss/logits": 0.20151393115520477, "loss/reg": 0.0006118376040831208, "step": 4742 }, { "epoch": 0.592875, "grad_norm": 2.367532730102539, "grad_norm_var": 0.09038244469750983, "learning_rate": 0.0001, "loss": 1.432, "loss/crossentropy": 2.3945140838623047, "loss/hidden": 1.171875, "loss/logits": 0.2540336549282074, "loss/reg": 0.0006115060532465577, "step": 4743 }, { "epoch": 0.593, "grad_norm": 2.974602699279785, "grad_norm_var": 0.09372361472637726, "learning_rate": 0.0001, "loss": 1.1582, "loss/crossentropy": 2.122929573059082, "loss/hidden": 1.0, "loss/logits": 0.1521187275648117, "loss/reg": 0.0006111579132266343, "step": 4744 }, { "epoch": 0.593125, "grad_norm": 2.3020381927490234, "grad_norm_var": 0.08784399989949611, "learning_rate": 0.0001, "loss": 1.1735, "loss/crossentropy": 2.3894612789154053, "loss/hidden": 1.0, "loss/logits": 0.16743674874305725, "loss/reg": 0.000610792194493115, "step": 4745 }, { "epoch": 0.59325, "grad_norm": 2.937204599380493, "grad_norm_var": 0.08764484042195875, "learning_rate": 0.0001, "loss": 1.2271, "loss/crossentropy": 2.4431040287017822, "loss/hidden": 1.046875, "loss/logits": 0.1740814596414566, "loss/reg": 0.0006104632630012929, "step": 4746 }, { "epoch": 0.593375, "grad_norm": 2.356630325317383, "grad_norm_var": 0.08030519353089005, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.6453065872192383, "loss/hidden": 0.9609375, "loss/logits": 0.15742652118206024, "loss/reg": 0.000610154471360147, "step": 4747 }, { "epoch": 0.5935, "grad_norm": 2.774251937866211, "grad_norm_var": 0.07605600775670211, "learning_rate": 0.0001, "loss": 1.3195, "loss/crossentropy": 2.4929916858673096, "loss/hidden": 1.1171875, "loss/logits": 0.19620802998542786, "loss/reg": 0.0006098212325014174, "step": 4748 }, { "epoch": 0.593625, "grad_norm": 2.999812364578247, "grad_norm_var": 0.07997211074308837, "learning_rate": 0.0001, "loss": 1.1384, "loss/crossentropy": 2.273632764816284, "loss/hidden": 0.97265625, "loss/logits": 0.15963001549243927, "loss/reg": 0.0006095262360759079, "step": 4749 }, { "epoch": 0.59375, "grad_norm": 3.5456316471099854, "grad_norm_var": 0.11864533574640153, "learning_rate": 0.0001, "loss": 1.6085, "loss/crossentropy": 2.4749929904937744, "loss/hidden": 1.3125, "loss/logits": 0.28986838459968567, "loss/reg": 0.000609253125730902, "step": 4750 }, { "epoch": 0.593875, "grad_norm": 3.1401748657226562, "grad_norm_var": 0.12717216095988892, "learning_rate": 0.0001, "loss": 1.3412, "loss/crossentropy": 2.5305702686309814, "loss/hidden": 1.140625, "loss/logits": 0.1944623589515686, "loss/reg": 0.0006089300732128322, "step": 4751 }, { "epoch": 0.594, "grad_norm": 2.5471010208129883, "grad_norm_var": 0.1271922744744491, "learning_rate": 0.0001, "loss": 1.1474, "loss/crossentropy": 2.8028292655944824, "loss/hidden": 0.98046875, "loss/logits": 0.1608017385005951, "loss/reg": 0.0006086592329666018, "step": 4752 }, { "epoch": 0.594125, "grad_norm": 2.6707985401153564, "grad_norm_var": 0.12156007784787164, "learning_rate": 0.0001, "loss": 1.2664, "loss/crossentropy": 2.473475933074951, "loss/hidden": 1.0703125, "loss/logits": 0.1899702250957489, "loss/reg": 0.0006083724438212812, "step": 4753 }, { "epoch": 0.59425, "grad_norm": 3.919084072113037, "grad_norm_var": 0.19579336139520745, "learning_rate": 0.0001, "loss": 1.4183, "loss/crossentropy": 2.832719326019287, "loss/hidden": 1.171875, "loss/logits": 0.24036309123039246, "loss/reg": 0.0006081114406697452, "step": 4754 }, { "epoch": 0.594375, "grad_norm": 2.880371332168579, "grad_norm_var": 0.18882459389007308, "learning_rate": 0.0001, "loss": 1.1479, "loss/crossentropy": 2.6270391941070557, "loss/hidden": 0.99609375, "loss/logits": 0.14573879539966583, "loss/reg": 0.0006078595179133117, "step": 4755 }, { "epoch": 0.5945, "grad_norm": 2.95410418510437, "grad_norm_var": 0.1878869945963885, "learning_rate": 0.0001, "loss": 1.4691, "loss/crossentropy": 2.7339324951171875, "loss/hidden": 1.203125, "loss/logits": 0.2599219083786011, "loss/reg": 0.0006075326818972826, "step": 4756 }, { "epoch": 0.594625, "grad_norm": 3.427762269973755, "grad_norm_var": 0.20148850889135375, "learning_rate": 0.0001, "loss": 1.493, "loss/crossentropy": 2.4372353553771973, "loss/hidden": 1.2734375, "loss/logits": 0.2135353535413742, "loss/reg": 0.0006072077085264027, "step": 4757 }, { "epoch": 0.59475, "grad_norm": 6.3114519119262695, "grad_norm_var": 0.9146298132119831, "learning_rate": 0.0001, "loss": 1.604, "loss/crossentropy": 2.861260175704956, "loss/hidden": 1.359375, "loss/logits": 0.238596573472023, "loss/reg": 0.0006069192895665765, "step": 4758 }, { "epoch": 0.594875, "grad_norm": 2.7214977741241455, "grad_norm_var": 0.8863913929215528, "learning_rate": 0.0001, "loss": 1.3351, "loss/crossentropy": 2.5678317546844482, "loss/hidden": 1.140625, "loss/logits": 0.1884123980998993, "loss/reg": 0.0006065932102501392, "step": 4759 }, { "epoch": 0.595, "grad_norm": 3.3868165016174316, "grad_norm_var": 0.8871564925436578, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.474128007888794, "loss/hidden": 1.125, "loss/logits": 0.19645963609218597, "loss/reg": 0.0006063548498786986, "step": 4760 }, { "epoch": 0.595125, "grad_norm": 2.433189630508423, "grad_norm_var": 0.872884500544586, "learning_rate": 0.0001, "loss": 1.2298, "loss/crossentropy": 2.5283725261688232, "loss/hidden": 1.0390625, "loss/logits": 0.1846432387828827, "loss/reg": 0.0006061461754143238, "step": 4761 }, { "epoch": 0.59525, "grad_norm": 5.368790149688721, "grad_norm_var": 1.1611546866017723, "learning_rate": 0.0001, "loss": 1.2691, "loss/crossentropy": 2.401467800140381, "loss/hidden": 1.0859375, "loss/logits": 0.17711195349693298, "loss/reg": 0.0006058522849343717, "step": 4762 }, { "epoch": 0.595375, "grad_norm": 2.958914041519165, "grad_norm_var": 1.1048699912004336, "learning_rate": 0.0001, "loss": 1.6214, "loss/crossentropy": 2.5516738891601562, "loss/hidden": 1.3203125, "loss/logits": 0.29501596093177795, "loss/reg": 0.0006056046113371849, "step": 4763 }, { "epoch": 0.5955, "grad_norm": 2.4550440311431885, "grad_norm_var": 1.1369125611242898, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.3269715309143066, "loss/hidden": 1.0546875, "loss/logits": 0.18997333943843842, "loss/reg": 0.0006053518154658377, "step": 4764 }, { "epoch": 0.595625, "grad_norm": 2.60552978515625, "grad_norm_var": 1.1654345221467108, "learning_rate": 0.0001, "loss": 1.2223, "loss/crossentropy": 2.6754257678985596, "loss/hidden": 1.0390625, "loss/logits": 0.17720946669578552, "loss/reg": 0.0006051292293705046, "step": 4765 }, { "epoch": 0.59575, "grad_norm": 4.039705753326416, "grad_norm_var": 1.194705944181744, "learning_rate": 0.0001, "loss": 1.6387, "loss/crossentropy": 1.8826484680175781, "loss/hidden": 1.4140625, "loss/logits": 0.21861271560192108, "loss/reg": 0.00060491036856547, "step": 4766 }, { "epoch": 0.595875, "grad_norm": 10.3914155960083, "grad_norm_var": 4.264806790905223, "learning_rate": 0.0001, "loss": 1.453, "loss/crossentropy": 2.8078043460845947, "loss/hidden": 1.21875, "loss/logits": 0.22815364599227905, "loss/reg": 0.0006046341732144356, "step": 4767 }, { "epoch": 0.596, "grad_norm": 3.3232829570770264, "grad_norm_var": 4.171040159647246, "learning_rate": 0.0001, "loss": 1.623, "loss/crossentropy": 2.1682183742523193, "loss/hidden": 1.34375, "loss/logits": 0.27317339181900024, "loss/reg": 0.0006044033216312528, "step": 4768 }, { "epoch": 0.596125, "grad_norm": 2.7759814262390137, "grad_norm_var": 4.1549768832567215, "learning_rate": 0.0001, "loss": 1.3457, "loss/crossentropy": 2.699039936065674, "loss/hidden": 1.1328125, "loss/logits": 0.20686957240104675, "loss/reg": 0.0006041803280822933, "step": 4769 }, { "epoch": 0.59625, "grad_norm": 2.4670603275299072, "grad_norm_var": 4.277645958847536, "learning_rate": 0.0001, "loss": 1.3449, "loss/crossentropy": 2.437436819076538, "loss/hidden": 1.140625, "loss/logits": 0.19823190569877625, "loss/reg": 0.000603954482357949, "step": 4770 }, { "epoch": 0.596375, "grad_norm": 2.789968252182007, "grad_norm_var": 4.289016405824762, "learning_rate": 0.0001, "loss": 1.5359, "loss/crossentropy": 2.3058576583862305, "loss/hidden": 1.2734375, "loss/logits": 0.2563984990119934, "loss/reg": 0.0006036364356987178, "step": 4771 }, { "epoch": 0.5965, "grad_norm": 2.541313886642456, "grad_norm_var": 4.344883358373151, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.756533622741699, "loss/hidden": 1.1328125, "loss/logits": 0.1847788393497467, "loss/reg": 0.0006033815443515778, "step": 4772 }, { "epoch": 0.596625, "grad_norm": 2.765990734100342, "grad_norm_var": 4.400675190589224, "learning_rate": 0.0001, "loss": 1.0891, "loss/crossentropy": 2.5219838619232178, "loss/hidden": 0.94921875, "loss/logits": 0.13389122486114502, "loss/reg": 0.0006030666991136968, "step": 4773 }, { "epoch": 0.59675, "grad_norm": 2.7974941730499268, "grad_norm_var": 3.952862425493955, "learning_rate": 0.0001, "loss": 1.2509, "loss/crossentropy": 2.6054537296295166, "loss/hidden": 1.0546875, "loss/logits": 0.19017067551612854, "loss/reg": 0.0006027444615028799, "step": 4774 }, { "epoch": 0.596875, "grad_norm": 3.068053960800171, "grad_norm_var": 3.924910187598461, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.9576988220214844, "loss/hidden": 1.2734375, "loss/logits": 0.22653287649154663, "loss/reg": 0.0006024567992426455, "step": 4775 }, { "epoch": 0.597, "grad_norm": 3.186460256576538, "grad_norm_var": 3.930724123478338, "learning_rate": 0.0001, "loss": 1.1807, "loss/crossentropy": 2.6703341007232666, "loss/hidden": 1.0078125, "loss/logits": 0.1668788194656372, "loss/reg": 0.0006021339213475585, "step": 4776 }, { "epoch": 0.597125, "grad_norm": 4.028050422668457, "grad_norm_var": 3.863265433570342, "learning_rate": 0.0001, "loss": 1.4, "loss/crossentropy": 2.5174453258514404, "loss/hidden": 1.1875, "loss/logits": 0.20646943151950836, "loss/reg": 0.0006018219282850623, "step": 4777 }, { "epoch": 0.59725, "grad_norm": 2.9257383346557617, "grad_norm_var": 3.659378548606247, "learning_rate": 0.0001, "loss": 1.3722, "loss/crossentropy": 2.638533592224121, "loss/hidden": 1.1484375, "loss/logits": 0.21770817041397095, "loss/reg": 0.0006014623213559389, "step": 4778 }, { "epoch": 0.597375, "grad_norm": 2.463714838027954, "grad_norm_var": 3.706799539485028, "learning_rate": 0.0001, "loss": 1.2748, "loss/crossentropy": 2.236125946044922, "loss/hidden": 1.078125, "loss/logits": 0.19069120287895203, "loss/reg": 0.0006011411896906793, "step": 4779 }, { "epoch": 0.5975, "grad_norm": 3.025319814682007, "grad_norm_var": 3.654205703938269, "learning_rate": 0.0001, "loss": 1.2849, "loss/crossentropy": 2.603207588195801, "loss/hidden": 1.09375, "loss/logits": 0.18509599566459656, "loss/reg": 0.000600758648943156, "step": 4780 }, { "epoch": 0.597625, "grad_norm": 3.5874366760253906, "grad_norm_var": 3.603945962860494, "learning_rate": 0.0001, "loss": 1.5165, "loss/crossentropy": 2.5957720279693604, "loss/hidden": 1.3203125, "loss/logits": 0.19016078114509583, "loss/reg": 0.0006004291353747249, "step": 4781 }, { "epoch": 0.59775, "grad_norm": 2.4295129776000977, "grad_norm_var": 3.6524951693641787, "learning_rate": 0.0001, "loss": 1.3011, "loss/crossentropy": 2.5825459957122803, "loss/hidden": 1.09375, "loss/logits": 0.20135995745658875, "loss/reg": 0.0006000997382216156, "step": 4782 }, { "epoch": 0.597875, "grad_norm": 2.793726682662964, "grad_norm_var": 0.18838031704912056, "learning_rate": 0.0001, "loss": 1.1171, "loss/crossentropy": 2.891960382461548, "loss/hidden": 0.95703125, "loss/logits": 0.15408726036548615, "loss/reg": 0.0005997293046675622, "step": 4783 }, { "epoch": 0.598, "grad_norm": 2.47297739982605, "grad_norm_var": 0.18961233955691342, "learning_rate": 0.0001, "loss": 1.3197, "loss/crossentropy": 2.394080877304077, "loss/hidden": 1.109375, "loss/logits": 0.20434343814849854, "loss/reg": 0.0005993624217808247, "step": 4784 }, { "epoch": 0.598125, "grad_norm": 2.3396763801574707, "grad_norm_var": 0.2077022200917857, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.910036563873291, "loss/hidden": 1.0078125, "loss/logits": 0.1696721613407135, "loss/reg": 0.0005989900673739612, "step": 4785 }, { "epoch": 0.59825, "grad_norm": 4.368083953857422, "grad_norm_var": 0.3351998717993295, "learning_rate": 0.0001, "loss": 1.6943, "loss/crossentropy": 2.4978280067443848, "loss/hidden": 1.453125, "loss/logits": 0.2351440191268921, "loss/reg": 0.000598667305894196, "step": 4786 }, { "epoch": 0.598375, "grad_norm": 3.0770161151885986, "grad_norm_var": 0.3333073467333802, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.44785213470459, "loss/hidden": 1.1953125, "loss/logits": 0.25825658440589905, "loss/reg": 0.0005983437295071781, "step": 4787 }, { "epoch": 0.5985, "grad_norm": 2.9402432441711426, "grad_norm_var": 0.31928639522167257, "learning_rate": 0.0001, "loss": 1.4396, "loss/crossentropy": 2.551346778869629, "loss/hidden": 1.2109375, "loss/logits": 0.22268305718898773, "loss/reg": 0.0005980234127491713, "step": 4788 }, { "epoch": 0.598625, "grad_norm": 4.258631229400635, "grad_norm_var": 0.40861047411666124, "learning_rate": 0.0001, "loss": 1.3314, "loss/crossentropy": 2.379242181777954, "loss/hidden": 1.15625, "loss/logits": 0.16916920244693756, "loss/reg": 0.000597693957388401, "step": 4789 }, { "epoch": 0.59875, "grad_norm": 2.4600636959075928, "grad_norm_var": 0.4297925549753463, "learning_rate": 0.0001, "loss": 1.1346, "loss/crossentropy": 2.6012067794799805, "loss/hidden": 0.97265625, "loss/logits": 0.15596172213554382, "loss/reg": 0.0005973785882815719, "step": 4790 }, { "epoch": 0.598875, "grad_norm": 2.734698534011841, "grad_norm_var": 0.4376708779808572, "learning_rate": 0.0001, "loss": 1.2425, "loss/crossentropy": 2.6106181144714355, "loss/hidden": 1.0625, "loss/logits": 0.17402637004852295, "loss/reg": 0.0005970748024992645, "step": 4791 }, { "epoch": 0.599, "grad_norm": 2.061224937438965, "grad_norm_var": 0.49906419844112954, "learning_rate": 0.0001, "loss": 1.1429, "loss/crossentropy": 2.26723051071167, "loss/hidden": 0.9765625, "loss/logits": 0.16033180058002472, "loss/reg": 0.0005967605975456536, "step": 4792 }, { "epoch": 0.599125, "grad_norm": 2.875535488128662, "grad_norm_var": 0.4237777977071798, "learning_rate": 0.0001, "loss": 1.4117, "loss/crossentropy": 2.4286675453186035, "loss/hidden": 1.1953125, "loss/logits": 0.21044857800006866, "loss/reg": 0.0005964524461887777, "step": 4793 }, { "epoch": 0.59925, "grad_norm": 2.8120903968811035, "grad_norm_var": 0.42458673092445615, "learning_rate": 0.0001, "loss": 1.3227, "loss/crossentropy": 2.528701066970825, "loss/hidden": 1.1328125, "loss/logits": 0.18393927812576294, "loss/reg": 0.0005961559363640845, "step": 4794 }, { "epoch": 0.599375, "grad_norm": 2.6969451904296875, "grad_norm_var": 0.41383619684836126, "learning_rate": 0.0001, "loss": 1.3236, "loss/crossentropy": 2.4815118312835693, "loss/hidden": 1.125, "loss/logits": 0.19260750710964203, "loss/reg": 0.00059585488634184, "step": 4795 }, { "epoch": 0.5995, "grad_norm": 2.6209754943847656, "grad_norm_var": 0.41909485675354075, "learning_rate": 0.0001, "loss": 1.6484, "loss/crossentropy": 2.0700392723083496, "loss/hidden": 1.375, "loss/logits": 0.2674342393875122, "loss/reg": 0.000595573044847697, "step": 4796 }, { "epoch": 0.599625, "grad_norm": 3.0742359161376953, "grad_norm_var": 0.3890677252553111, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.5666213035583496, "loss/hidden": 1.25, "loss/logits": 0.24078576266765594, "loss/reg": 0.0005953173968009651, "step": 4797 }, { "epoch": 0.59975, "grad_norm": 2.7261533737182617, "grad_norm_var": 0.3769088630164198, "learning_rate": 0.0001, "loss": 1.4612, "loss/crossentropy": 2.448683261871338, "loss/hidden": 1.203125, "loss/logits": 0.25216376781463623, "loss/reg": 0.0005950760096311569, "step": 4798 }, { "epoch": 0.599875, "grad_norm": 3.041327714920044, "grad_norm_var": 0.3774130543029171, "learning_rate": 0.0001, "loss": 1.2959, "loss/crossentropy": 2.697946548461914, "loss/hidden": 1.109375, "loss/logits": 0.18058064579963684, "loss/reg": 0.0005948347388766706, "step": 4799 }, { "epoch": 0.6, "grad_norm": 2.1899912357330322, "grad_norm_var": 0.3989073554102274, "learning_rate": 0.0001, "loss": 1.183, "loss/crossentropy": 2.554466724395752, "loss/hidden": 1.0078125, "loss/logits": 0.16921022534370422, "loss/reg": 0.0005945215816609561, "step": 4800 }, { "epoch": 0.600125, "grad_norm": 3.8676905632019043, "grad_norm_var": 0.43224398424006244, "learning_rate": 0.0001, "loss": 1.6969, "loss/crossentropy": 2.504239082336426, "loss/hidden": 1.4140625, "loss/logits": 0.2768888473510742, "loss/reg": 0.0005942654679529369, "step": 4801 }, { "epoch": 0.60025, "grad_norm": 2.3035457134246826, "grad_norm_var": 0.31868750997558665, "learning_rate": 0.0001, "loss": 1.2957, "loss/crossentropy": 2.5717153549194336, "loss/hidden": 1.0859375, "loss/logits": 0.20384037494659424, "loss/reg": 0.0005939487018622458, "step": 4802 }, { "epoch": 0.600375, "grad_norm": 2.4181036949157715, "grad_norm_var": 0.32664911663933344, "learning_rate": 0.0001, "loss": 1.2446, "loss/crossentropy": 2.7022974491119385, "loss/hidden": 1.046875, "loss/logits": 0.19178813695907593, "loss/reg": 0.0005936801317147911, "step": 4803 }, { "epoch": 0.6005, "grad_norm": 3.135193109512329, "grad_norm_var": 0.332212595041896, "learning_rate": 0.0001, "loss": 1.2458, "loss/crossentropy": 2.6275157928466797, "loss/hidden": 1.0703125, "loss/logits": 0.1695917844772339, "loss/reg": 0.0005934219225309789, "step": 4804 }, { "epoch": 0.600625, "grad_norm": 4.398707389831543, "grad_norm_var": 0.36012541346143034, "learning_rate": 0.0001, "loss": 1.209, "loss/crossentropy": 2.737250566482544, "loss/hidden": 1.0390625, "loss/logits": 0.1639731228351593, "loss/reg": 0.0005930961924605072, "step": 4805 }, { "epoch": 0.60075, "grad_norm": 4.186426162719727, "grad_norm_var": 0.45927983568299524, "learning_rate": 0.0001, "loss": 1.5499, "loss/crossentropy": 2.570542812347412, "loss/hidden": 1.2890625, "loss/logits": 0.2549266815185547, "loss/reg": 0.0005927775637246668, "step": 4806 }, { "epoch": 0.600875, "grad_norm": 3.078320264816284, "grad_norm_var": 0.4569589418080319, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.5313103199005127, "loss/hidden": 1.171875, "loss/logits": 0.17086535692214966, "loss/reg": 0.0005924517172388732, "step": 4807 }, { "epoch": 0.601, "grad_norm": 2.6753368377685547, "grad_norm_var": 0.40628944711898285, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.4746038913726807, "loss/hidden": 1.1171875, "loss/logits": 0.20493531227111816, "loss/reg": 0.0005921745905652642, "step": 4808 }, { "epoch": 0.601125, "grad_norm": 2.982870578765869, "grad_norm_var": 0.40513828045734285, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.4355225563049316, "loss/hidden": 1.078125, "loss/logits": 0.17244315147399902, "loss/reg": 0.0005918531096540391, "step": 4809 }, { "epoch": 0.60125, "grad_norm": 2.8318235874176025, "grad_norm_var": 0.404634020420815, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.951063394546509, "loss/hidden": 0.9609375, "loss/logits": 0.1497742086648941, "loss/reg": 0.0005915405927225947, "step": 4810 }, { "epoch": 0.601375, "grad_norm": 2.7232656478881836, "grad_norm_var": 0.4035638480907816, "learning_rate": 0.0001, "loss": 1.3895, "loss/crossentropy": 2.73553729057312, "loss/hidden": 1.171875, "loss/logits": 0.21174587309360504, "loss/reg": 0.0005912499036639929, "step": 4811 }, { "epoch": 0.6015, "grad_norm": 7.627694129943848, "grad_norm_var": 1.706647086327761, "learning_rate": 0.0001, "loss": 1.7023, "loss/crossentropy": 2.786592483520508, "loss/hidden": 1.46875, "loss/logits": 0.22762562334537506, "loss/reg": 0.0005909315077587962, "step": 4812 }, { "epoch": 0.601625, "grad_norm": 3.9479169845581055, "grad_norm_var": 1.7247009538815823, "learning_rate": 0.0001, "loss": 1.6139, "loss/crossentropy": 2.370030164718628, "loss/hidden": 1.3046875, "loss/logits": 0.30327093601226807, "loss/reg": 0.000590614858083427, "step": 4813 }, { "epoch": 0.60175, "grad_norm": 2.404336929321289, "grad_norm_var": 1.7593754323272794, "learning_rate": 0.0001, "loss": 1.1921, "loss/crossentropy": 2.6471784114837646, "loss/hidden": 1.015625, "loss/logits": 0.17058280110359192, "loss/reg": 0.0005903012934140861, "step": 4814 }, { "epoch": 0.601875, "grad_norm": 3.561838150024414, "grad_norm_var": 1.753964384367149, "learning_rate": 0.0001, "loss": 1.3601, "loss/crossentropy": 2.713881492614746, "loss/hidden": 1.15625, "loss/logits": 0.1979692280292511, "loss/reg": 0.0005900205578655005, "step": 4815 }, { "epoch": 0.602, "grad_norm": 2.467925786972046, "grad_norm_var": 1.7141069667925817, "learning_rate": 0.0001, "loss": 1.1521, "loss/crossentropy": 2.5921597480773926, "loss/hidden": 1.0078125, "loss/logits": 0.1383569836616516, "loss/reg": 0.0005897526862099767, "step": 4816 }, { "epoch": 0.602125, "grad_norm": 3.4941413402557373, "grad_norm_var": 1.7001909062758755, "learning_rate": 0.0001, "loss": 1.6889, "loss/crossentropy": 2.300841808319092, "loss/hidden": 1.3984375, "loss/logits": 0.2846064567565918, "loss/reg": 0.0005894380155950785, "step": 4817 }, { "epoch": 0.60225, "grad_norm": 2.838955879211426, "grad_norm_var": 1.6405589804398135, "learning_rate": 0.0001, "loss": 1.4243, "loss/crossentropy": 2.640784978866577, "loss/hidden": 1.2109375, "loss/logits": 0.20745612680912018, "loss/reg": 0.0005891203181818128, "step": 4818 }, { "epoch": 0.602375, "grad_norm": 2.4574530124664307, "grad_norm_var": 1.6353818964896079, "learning_rate": 0.0001, "loss": 1.3368, "loss/crossentropy": 2.6519646644592285, "loss/hidden": 1.140625, "loss/logits": 0.19028262794017792, "loss/reg": 0.0005888278828933835, "step": 4819 }, { "epoch": 0.6025, "grad_norm": 2.4777557849884033, "grad_norm_var": 1.6878667396382894, "learning_rate": 0.0001, "loss": 1.2715, "loss/crossentropy": 2.3762710094451904, "loss/hidden": 1.0703125, "loss/logits": 0.19531339406967163, "loss/reg": 0.0005885105929337442, "step": 4820 }, { "epoch": 0.602625, "grad_norm": 3.5128121376037598, "grad_norm_var": 1.6171402927491803, "learning_rate": 0.0001, "loss": 1.107, "loss/crossentropy": 2.5014400482177734, "loss/hidden": 0.93359375, "loss/logits": 0.1675378978252411, "loss/reg": 0.0005881834076717496, "step": 4821 }, { "epoch": 0.60275, "grad_norm": 2.421044111251831, "grad_norm_var": 1.6101732124420947, "learning_rate": 0.0001, "loss": 1.2403, "loss/crossentropy": 2.5866332054138184, "loss/hidden": 1.0546875, "loss/logits": 0.17972949147224426, "loss/reg": 0.0005878765368834138, "step": 4822 }, { "epoch": 0.602875, "grad_norm": 2.735649824142456, "grad_norm_var": 1.623938270426373, "learning_rate": 0.0001, "loss": 1.2706, "loss/crossentropy": 2.2333219051361084, "loss/hidden": 1.09375, "loss/logits": 0.170949786901474, "loss/reg": 0.0005875707720406353, "step": 4823 }, { "epoch": 0.603, "grad_norm": 3.323552370071411, "grad_norm_var": 1.6050653951162834, "learning_rate": 0.0001, "loss": 1.5539, "loss/crossentropy": 2.4837307929992676, "loss/hidden": 1.2890625, "loss/logits": 0.25895142555236816, "loss/reg": 0.0005872608162462711, "step": 4824 }, { "epoch": 0.603125, "grad_norm": 2.2753045558929443, "grad_norm_var": 1.6604315647211647, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.520749807357788, "loss/hidden": 1.1796875, "loss/logits": 0.2237476408481598, "loss/reg": 0.000586936017498374, "step": 4825 }, { "epoch": 0.60325, "grad_norm": 2.9035122394561768, "grad_norm_var": 1.657292421436159, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.9900126457214355, "loss/hidden": 1.25, "loss/logits": 0.21108433604240417, "loss/reg": 0.0005866258288733661, "step": 4826 }, { "epoch": 0.603375, "grad_norm": 3.175344228744507, "grad_norm_var": 1.641430795507453, "learning_rate": 0.0001, "loss": 1.3414, "loss/crossentropy": 2.919318199157715, "loss/hidden": 1.140625, "loss/logits": 0.19494445621967316, "loss/reg": 0.0005862872931174934, "step": 4827 }, { "epoch": 0.6035, "grad_norm": 2.657975673675537, "grad_norm_var": 0.2687539638928977, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.2542331218719482, "loss/hidden": 1.171875, "loss/logits": 0.19145530462265015, "loss/reg": 0.0005859831580892205, "step": 4828 }, { "epoch": 0.603625, "grad_norm": 2.721346855163574, "grad_norm_var": 0.1940162168879084, "learning_rate": 0.0001, "loss": 1.225, "loss/crossentropy": 2.695777654647827, "loss/hidden": 1.03125, "loss/logits": 0.18792682886123657, "loss/reg": 0.0005856486968696117, "step": 4829 }, { "epoch": 0.60375, "grad_norm": 2.5601959228515625, "grad_norm_var": 0.18649522094717572, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.5653951168060303, "loss/hidden": 0.94921875, "loss/logits": 0.15152794122695923, "loss/reg": 0.0005853482289239764, "step": 4830 }, { "epoch": 0.603875, "grad_norm": 3.1420130729675293, "grad_norm_var": 0.1576115549470349, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.609496593475342, "loss/hidden": 0.953125, "loss/logits": 0.14577338099479675, "loss/reg": 0.0005850301240570843, "step": 4831 }, { "epoch": 0.604, "grad_norm": 2.962820529937744, "grad_norm_var": 0.14950163382960657, "learning_rate": 0.0001, "loss": 1.3572, "loss/crossentropy": 2.568687915802002, "loss/hidden": 1.140625, "loss/logits": 0.21072155237197876, "loss/reg": 0.0005847078864462674, "step": 4832 }, { "epoch": 0.604125, "grad_norm": 2.797576427459717, "grad_norm_var": 0.12034953859203958, "learning_rate": 0.0001, "loss": 1.3118, "loss/crossentropy": 2.467916965484619, "loss/hidden": 1.1015625, "loss/logits": 0.20441851019859314, "loss/reg": 0.0005844228435307741, "step": 4833 }, { "epoch": 0.60425, "grad_norm": 2.3586924076080322, "grad_norm_var": 0.13292441552709963, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.512319803237915, "loss/hidden": 1.0390625, "loss/logits": 0.1758606731891632, "loss/reg": 0.0005841102101840079, "step": 4834 }, { "epoch": 0.604375, "grad_norm": 4.19402551651001, "grad_norm_var": 0.2466770464355098, "learning_rate": 0.0001, "loss": 1.4284, "loss/crossentropy": 2.4550182819366455, "loss/hidden": 1.2265625, "loss/logits": 0.19604751467704773, "loss/reg": 0.0005838022916577756, "step": 4835 }, { "epoch": 0.6045, "grad_norm": 2.5413978099823, "grad_norm_var": 0.24344285741762992, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.855466604232788, "loss/hidden": 1.1171875, "loss/logits": 0.19489505887031555, "loss/reg": 0.0005834982730448246, "step": 4836 }, { "epoch": 0.604625, "grad_norm": 3.588034152984619, "grad_norm_var": 0.2500159424396306, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.8206965923309326, "loss/hidden": 1.0234375, "loss/logits": 0.1542046070098877, "loss/reg": 0.0005832150927744806, "step": 4837 }, { "epoch": 0.60475, "grad_norm": 2.3512566089630127, "grad_norm_var": 0.2547528774361884, "learning_rate": 0.0001, "loss": 1.2167, "loss/crossentropy": 2.5672240257263184, "loss/hidden": 1.03125, "loss/logits": 0.1796046495437622, "loss/reg": 0.00058295315830037, "step": 4838 }, { "epoch": 0.604875, "grad_norm": 3.5047531127929688, "grad_norm_var": 0.27558258931863383, "learning_rate": 0.0001, "loss": 2.0985, "loss/crossentropy": 2.09775447845459, "loss/hidden": 1.640625, "loss/logits": 0.45205599069595337, "loss/reg": 0.0005826919223181903, "step": 4839 }, { "epoch": 0.605, "grad_norm": 2.915151834487915, "grad_norm_var": 0.26518187867893606, "learning_rate": 0.0001, "loss": 1.3946, "loss/crossentropy": 2.450901985168457, "loss/hidden": 1.171875, "loss/logits": 0.216878741979599, "loss/reg": 0.0005824507097713649, "step": 4840 }, { "epoch": 0.605125, "grad_norm": 4.423800945281982, "grad_norm_var": 0.37026474515027225, "learning_rate": 0.0001, "loss": 1.5509, "loss/crossentropy": 2.4897043704986572, "loss/hidden": 1.3203125, "loss/logits": 0.22479528188705444, "loss/reg": 0.0005821408121846616, "step": 4841 }, { "epoch": 0.60525, "grad_norm": 3.1904115676879883, "grad_norm_var": 0.3698105917720073, "learning_rate": 0.0001, "loss": 1.531, "loss/crossentropy": 2.4536125659942627, "loss/hidden": 1.296875, "loss/logits": 0.22829881310462952, "loss/reg": 0.000581899075768888, "step": 4842 }, { "epoch": 0.605375, "grad_norm": 2.600837469100952, "grad_norm_var": 0.38220121702039833, "learning_rate": 0.0001, "loss": 1.2737, "loss/crossentropy": 2.9362802505493164, "loss/hidden": 1.0703125, "loss/logits": 0.19752374291419983, "loss/reg": 0.0005816429038532078, "step": 4843 }, { "epoch": 0.6055, "grad_norm": 2.5390007495880127, "grad_norm_var": 0.38901747984859014, "learning_rate": 0.0001, "loss": 1.2462, "loss/crossentropy": 2.65464186668396, "loss/hidden": 1.0703125, "loss/logits": 0.1700541377067566, "loss/reg": 0.0005813289899379015, "step": 4844 }, { "epoch": 0.605625, "grad_norm": 2.74448561668396, "grad_norm_var": 0.38811579613535113, "learning_rate": 0.0001, "loss": 1.5166, "loss/crossentropy": 2.716486930847168, "loss/hidden": 1.2890625, "loss/logits": 0.2217557579278946, "loss/reg": 0.000581061583943665, "step": 4845 }, { "epoch": 0.60575, "grad_norm": 2.4446935653686523, "grad_norm_var": 0.3961216367007085, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.7549285888671875, "loss/hidden": 0.984375, "loss/logits": 0.16679216921329498, "loss/reg": 0.0005808053538203239, "step": 4846 }, { "epoch": 0.605875, "grad_norm": 2.171741247177124, "grad_norm_var": 0.4390058799539692, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.598789930343628, "loss/hidden": 1.0, "loss/logits": 0.1592966914176941, "loss/reg": 0.0005805613473057747, "step": 4847 }, { "epoch": 0.606, "grad_norm": 2.5048880577087402, "grad_norm_var": 0.4518205272595869, "learning_rate": 0.0001, "loss": 1.3961, "loss/crossentropy": 2.464707136154175, "loss/hidden": 1.2109375, "loss/logits": 0.17933544516563416, "loss/reg": 0.0005802561063319445, "step": 4848 }, { "epoch": 0.606125, "grad_norm": 3.4955477714538574, "grad_norm_var": 0.469998381441215, "learning_rate": 0.0001, "loss": 1.3446, "loss/crossentropy": 2.5483779907226562, "loss/hidden": 1.15625, "loss/logits": 0.182506263256073, "loss/reg": 0.0005799981881864369, "step": 4849 }, { "epoch": 0.60625, "grad_norm": 3.460005760192871, "grad_norm_var": 0.4555914611792121, "learning_rate": 0.0001, "loss": 1.6852, "loss/crossentropy": 2.3514773845672607, "loss/hidden": 1.3671875, "loss/logits": 0.3122570514678955, "loss/reg": 0.0005796912009827793, "step": 4850 }, { "epoch": 0.606375, "grad_norm": 2.7026941776275635, "grad_norm_var": 0.36549775586926975, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.5484800338745117, "loss/hidden": 1.171875, "loss/logits": 0.22770854830741882, "loss/reg": 0.0005793850286863744, "step": 4851 }, { "epoch": 0.6065, "grad_norm": 2.6019747257232666, "grad_norm_var": 0.3624376076579959, "learning_rate": 0.0001, "loss": 1.5455, "loss/crossentropy": 2.587616443634033, "loss/hidden": 1.265625, "loss/logits": 0.27410924434661865, "loss/reg": 0.0005790789145976305, "step": 4852 }, { "epoch": 0.606625, "grad_norm": 3.4892446994781494, "grad_norm_var": 0.3546757629535816, "learning_rate": 0.0001, "loss": 1.4592, "loss/crossentropy": 2.5940136909484863, "loss/hidden": 1.1640625, "loss/logits": 0.28938084840774536, "loss/reg": 0.0005787747795693576, "step": 4853 }, { "epoch": 0.60675, "grad_norm": 4.7057061195373535, "grad_norm_var": 0.5143464679771105, "learning_rate": 0.0001, "loss": 1.3213, "loss/crossentropy": 2.505807876586914, "loss/hidden": 1.125, "loss/logits": 0.190528005361557, "loss/reg": 0.0005784579552710056, "step": 4854 }, { "epoch": 0.606875, "grad_norm": 2.7182748317718506, "grad_norm_var": 0.509873207087804, "learning_rate": 0.0001, "loss": 1.4209, "loss/crossentropy": 2.5217838287353516, "loss/hidden": 1.203125, "loss/logits": 0.21199819445610046, "loss/reg": 0.000578167091589421, "step": 4855 }, { "epoch": 0.607, "grad_norm": 3.0080347061157227, "grad_norm_var": 0.5088132523321595, "learning_rate": 0.0001, "loss": 1.3775, "loss/crossentropy": 2.31107234954834, "loss/hidden": 1.1796875, "loss/logits": 0.19200482964515686, "loss/reg": 0.0005778308841399848, "step": 4856 }, { "epoch": 0.607125, "grad_norm": 2.1160852909088135, "grad_norm_var": 0.41897377217139986, "learning_rate": 0.0001, "loss": 1.2221, "loss/crossentropy": 2.7166566848754883, "loss/hidden": 1.03125, "loss/logits": 0.1850520372390747, "loss/reg": 0.0005775231984443963, "step": 4857 }, { "epoch": 0.60725, "grad_norm": 3.995722532272339, "grad_norm_var": 0.4900611947268762, "learning_rate": 0.0001, "loss": 1.4324, "loss/crossentropy": 3.0400564670562744, "loss/hidden": 1.2109375, "loss/logits": 0.21565383672714233, "loss/reg": 0.0005771698779426515, "step": 4858 }, { "epoch": 0.607375, "grad_norm": 2.5705971717834473, "grad_norm_var": 0.4915511190668949, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.5939011573791504, "loss/hidden": 1.078125, "loss/logits": 0.2016150802373886, "loss/reg": 0.0005768176051788032, "step": 4859 }, { "epoch": 0.6075, "grad_norm": 2.996443748474121, "grad_norm_var": 0.47929979475402057, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.287170886993408, "loss/hidden": 1.25, "loss/logits": 0.21020707488059998, "loss/reg": 0.0005764479865320027, "step": 4860 }, { "epoch": 0.607625, "grad_norm": 3.3019330501556396, "grad_norm_var": 0.4810022652100959, "learning_rate": 0.0001, "loss": 1.5158, "loss/crossentropy": 2.599637269973755, "loss/hidden": 1.28125, "loss/logits": 0.22878950834274292, "loss/reg": 0.000576143036596477, "step": 4861 }, { "epoch": 0.60775, "grad_norm": 2.7717251777648926, "grad_norm_var": 0.46270006824614235, "learning_rate": 0.0001, "loss": 1.5162, "loss/crossentropy": 2.599397897720337, "loss/hidden": 1.265625, "loss/logits": 0.24484074115753174, "loss/reg": 0.0005758387269452214, "step": 4862 }, { "epoch": 0.607875, "grad_norm": 2.897172689437866, "grad_norm_var": 0.4117867306908986, "learning_rate": 0.0001, "loss": 1.4089, "loss/crossentropy": 2.416867733001709, "loss/hidden": 1.203125, "loss/logits": 0.2000575214624405, "loss/reg": 0.0005755331949330866, "step": 4863 }, { "epoch": 0.608, "grad_norm": 3.483250141143799, "grad_norm_var": 0.3961319103912416, "learning_rate": 0.0001, "loss": 1.409, "loss/crossentropy": 2.432769775390625, "loss/hidden": 1.1875, "loss/logits": 0.2157767117023468, "loss/reg": 0.0005752359866164625, "step": 4864 }, { "epoch": 0.608125, "grad_norm": 2.6394050121307373, "grad_norm_var": 0.40188746525791974, "learning_rate": 0.0001, "loss": 1.3597, "loss/crossentropy": 2.3975517749786377, "loss/hidden": 1.125, "loss/logits": 0.22899147868156433, "loss/reg": 0.000574930221773684, "step": 4865 }, { "epoch": 0.60825, "grad_norm": 4.269050121307373, "grad_norm_var": 0.48258731452551135, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.4351754188537598, "loss/hidden": 1.2109375, "loss/logits": 0.20436157286167145, "loss/reg": 0.0005746245733462274, "step": 4866 }, { "epoch": 0.608375, "grad_norm": 3.28647518157959, "grad_norm_var": 0.4697156739710711, "learning_rate": 0.0001, "loss": 1.5236, "loss/crossentropy": 2.5955939292907715, "loss/hidden": 1.28125, "loss/logits": 0.23658019304275513, "loss/reg": 0.0005743327201344073, "step": 4867 }, { "epoch": 0.6085, "grad_norm": 2.6980485916137695, "grad_norm_var": 0.4629112859777988, "learning_rate": 0.0001, "loss": 1.16, "loss/crossentropy": 2.521527051925659, "loss/hidden": 0.99609375, "loss/logits": 0.15817809104919434, "loss/reg": 0.0005740588530898094, "step": 4868 }, { "epoch": 0.608625, "grad_norm": 4.896971702575684, "grad_norm_var": 0.644023560823256, "learning_rate": 0.0001, "loss": 1.8602, "loss/crossentropy": 2.648588180541992, "loss/hidden": 1.3046875, "loss/logits": 0.5497561693191528, "loss/reg": 0.000573753088247031, "step": 4869 }, { "epoch": 0.60875, "grad_norm": 2.815244436264038, "grad_norm_var": 0.5060523231661418, "learning_rate": 0.0001, "loss": 1.5802, "loss/crossentropy": 2.4603071212768555, "loss/hidden": 1.3203125, "loss/logits": 0.2541220188140869, "loss/reg": 0.0005734827136620879, "step": 4870 }, { "epoch": 0.608875, "grad_norm": 3.261972427368164, "grad_norm_var": 0.4929387670893334, "learning_rate": 0.0001, "loss": 1.4879, "loss/crossentropy": 2.4528965950012207, "loss/hidden": 1.265625, "loss/logits": 0.216497004032135, "loss/reg": 0.0005732220015488565, "step": 4871 }, { "epoch": 0.609, "grad_norm": 3.8257153034210205, "grad_norm_var": 0.5151049146884453, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 3.0202796459198, "loss/hidden": 1.1953125, "loss/logits": 0.20527935028076172, "loss/reg": 0.0005729150725528598, "step": 4872 }, { "epoch": 0.609125, "grad_norm": 3.0795865058898926, "grad_norm_var": 0.42885398159473026, "learning_rate": 0.0001, "loss": 1.3777, "loss/crossentropy": 2.2657155990600586, "loss/hidden": 1.1484375, "loss/logits": 0.22353091835975647, "loss/reg": 0.0005726063973270357, "step": 4873 }, { "epoch": 0.60925, "grad_norm": 2.274156093597412, "grad_norm_var": 0.4542399358508568, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.6730244159698486, "loss/hidden": 1.0, "loss/logits": 0.14795416593551636, "loss/reg": 0.0005723153590224683, "step": 4874 }, { "epoch": 0.609375, "grad_norm": 2.972896099090576, "grad_norm_var": 0.43103751065993356, "learning_rate": 0.0001, "loss": 1.452, "loss/crossentropy": 2.6191091537475586, "loss/hidden": 1.21875, "loss/logits": 0.22750836610794067, "loss/reg": 0.000572045857552439, "step": 4875 }, { "epoch": 0.6095, "grad_norm": 2.435404062271118, "grad_norm_var": 0.4671999966608676, "learning_rate": 0.0001, "loss": 1.4175, "loss/crossentropy": 2.5176289081573486, "loss/hidden": 1.1875, "loss/logits": 0.2242531180381775, "loss/reg": 0.0005717151798307896, "step": 4876 }, { "epoch": 0.609625, "grad_norm": 3.0944583415985107, "grad_norm_var": 0.46656743758119984, "learning_rate": 0.0001, "loss": 1.4239, "loss/crossentropy": 2.4569203853607178, "loss/hidden": 1.1796875, "loss/logits": 0.2385401427745819, "loss/reg": 0.0005714134895242751, "step": 4877 }, { "epoch": 0.60975, "grad_norm": 2.6232123374938965, "grad_norm_var": 0.4758096086392764, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.261754035949707, "loss/hidden": 1.0859375, "loss/logits": 0.17664192616939545, "loss/reg": 0.0005711409612558782, "step": 4878 }, { "epoch": 0.609875, "grad_norm": 2.62685489654541, "grad_norm_var": 0.4898337846296042, "learning_rate": 0.0001, "loss": 1.38, "loss/crossentropy": 2.5567781925201416, "loss/hidden": 1.1484375, "loss/logits": 0.2258489727973938, "loss/reg": 0.0005708496319130063, "step": 4879 }, { "epoch": 0.61, "grad_norm": 4.418285846710205, "grad_norm_var": 0.5869377787098754, "learning_rate": 0.0001, "loss": 1.6358, "loss/crossentropy": 2.700396776199341, "loss/hidden": 1.3359375, "loss/logits": 0.2941451668739319, "loss/reg": 0.0005705617950297892, "step": 4880 }, { "epoch": 0.610125, "grad_norm": 2.249842405319214, "grad_norm_var": 0.6255985441956565, "learning_rate": 0.0001, "loss": 1.205, "loss/crossentropy": 2.5387449264526367, "loss/hidden": 1.015625, "loss/logits": 0.18368348479270935, "loss/reg": 0.0005702344351448119, "step": 4881 }, { "epoch": 0.61025, "grad_norm": 3.137723684310913, "grad_norm_var": 0.5408275065514848, "learning_rate": 0.0001, "loss": 1.3665, "loss/crossentropy": 2.291440486907959, "loss/hidden": 1.15625, "loss/logits": 0.20451316237449646, "loss/reg": 0.0005699550965800881, "step": 4882 }, { "epoch": 0.610375, "grad_norm": 3.3822567462921143, "grad_norm_var": 0.5437050375588908, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.402799606323242, "loss/hidden": 1.0078125, "loss/logits": 0.147854283452034, "loss/reg": 0.0005696389125660062, "step": 4883 }, { "epoch": 0.6105, "grad_norm": 2.1652915477752686, "grad_norm_var": 0.5908519486112861, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.4977290630340576, "loss/hidden": 0.91015625, "loss/logits": 0.1582721620798111, "loss/reg": 0.0005693563725799322, "step": 4884 }, { "epoch": 0.610625, "grad_norm": 2.4552664756774902, "grad_norm_var": 0.3715281043040922, "learning_rate": 0.0001, "loss": 1.2241, "loss/crossentropy": 2.328693389892578, "loss/hidden": 1.0390625, "loss/logits": 0.17935118079185486, "loss/reg": 0.0005690674297511578, "step": 4885 }, { "epoch": 0.61075, "grad_norm": 2.4420666694641113, "grad_norm_var": 0.3857495653013584, "learning_rate": 0.0001, "loss": 1.2753, "loss/crossentropy": 2.4019405841827393, "loss/hidden": 1.0859375, "loss/logits": 0.1836622804403305, "loss/reg": 0.0005687947850674391, "step": 4886 }, { "epoch": 0.610875, "grad_norm": 5.014517784118652, "grad_norm_var": 0.6616390462728075, "learning_rate": 0.0001, "loss": 1.4734, "loss/crossentropy": 2.825758218765259, "loss/hidden": 1.234375, "loss/logits": 0.23329919576644897, "loss/reg": 0.0005685319192707539, "step": 4887 }, { "epoch": 0.611, "grad_norm": 3.8345227241516113, "grad_norm_var": 0.6625990526157683, "learning_rate": 0.0001, "loss": 1.638, "loss/crossentropy": 2.6242668628692627, "loss/hidden": 1.375, "loss/logits": 0.25734254717826843, "loss/reg": 0.000568268122151494, "step": 4888 }, { "epoch": 0.611125, "grad_norm": 2.686180591583252, "grad_norm_var": 0.6687738944205033, "learning_rate": 0.0001, "loss": 1.2219, "loss/crossentropy": 2.5829107761383057, "loss/hidden": 1.046875, "loss/logits": 0.1693723350763321, "loss/reg": 0.0005680079339072108, "step": 4889 }, { "epoch": 0.61125, "grad_norm": 2.891543388366699, "grad_norm_var": 0.6338090185311596, "learning_rate": 0.0001, "loss": 1.4416, "loss/crossentropy": 2.575266122817993, "loss/hidden": 1.21875, "loss/logits": 0.21718794107437134, "loss/reg": 0.0005677321460098028, "step": 4890 }, { "epoch": 0.611375, "grad_norm": 2.858391523361206, "grad_norm_var": 0.6354528942485973, "learning_rate": 0.0001, "loss": 1.4235, "loss/crossentropy": 3.0141220092773438, "loss/hidden": 1.1953125, "loss/logits": 0.2225499153137207, "loss/reg": 0.0005674305139109492, "step": 4891 }, { "epoch": 0.6115, "grad_norm": 2.1065378189086914, "grad_norm_var": 0.6678348482690921, "learning_rate": 0.0001, "loss": 1.1378, "loss/crossentropy": 2.8112924098968506, "loss/hidden": 0.9765625, "loss/logits": 0.15555253624916077, "loss/reg": 0.0005671342369168997, "step": 4892 }, { "epoch": 0.611625, "grad_norm": 2.923412561416626, "grad_norm_var": 0.6674905654233716, "learning_rate": 0.0001, "loss": 1.3962, "loss/crossentropy": 2.5851149559020996, "loss/hidden": 1.140625, "loss/logits": 0.2498873621225357, "loss/reg": 0.000566819217056036, "step": 4893 }, { "epoch": 0.61175, "grad_norm": 2.7929961681365967, "grad_norm_var": 0.6610230311752159, "learning_rate": 0.0001, "loss": 1.4643, "loss/crossentropy": 2.467019557952881, "loss/hidden": 1.2265625, "loss/logits": 0.23203502595424652, "loss/reg": 0.0005664590862579644, "step": 4894 }, { "epoch": 0.611875, "grad_norm": 2.01957106590271, "grad_norm_var": 0.7142142183494788, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.524292469024658, "loss/hidden": 0.9453125, "loss/logits": 0.16434916853904724, "loss/reg": 0.0005660839378833771, "step": 4895 }, { "epoch": 0.612, "grad_norm": 2.234097719192505, "grad_norm_var": 0.5880271979366831, "learning_rate": 0.0001, "loss": 1.2456, "loss/crossentropy": 2.6065046787261963, "loss/hidden": 1.0703125, "loss/logits": 0.16964180767536163, "loss/reg": 0.0005657178116962314, "step": 4896 }, { "epoch": 0.612125, "grad_norm": 2.744075059890747, "grad_norm_var": 0.5654160726078465, "learning_rate": 0.0001, "loss": 1.2972, "loss/crossentropy": 2.562670946121216, "loss/hidden": 1.1015625, "loss/logits": 0.190032958984375, "loss/reg": 0.0005653337575495243, "step": 4897 }, { "epoch": 0.61225, "grad_norm": 2.12542462348938, "grad_norm_var": 0.5913740823317215, "learning_rate": 0.0001, "loss": 1.1255, "loss/crossentropy": 2.3336620330810547, "loss/hidden": 0.9609375, "loss/logits": 0.15890008211135864, "loss/reg": 0.0005650356179103255, "step": 4898 }, { "epoch": 0.612375, "grad_norm": 2.19514799118042, "grad_norm_var": 0.5860653338623242, "learning_rate": 0.0001, "loss": 1.3144, "loss/crossentropy": 2.702824354171753, "loss/hidden": 1.109375, "loss/logits": 0.19940197467803955, "loss/reg": 0.0005647364887408912, "step": 4899 }, { "epoch": 0.6125, "grad_norm": 8.340240478515625, "grad_norm_var": 2.514076741362856, "learning_rate": 0.0001, "loss": 1.7286, "loss/crossentropy": 3.0090887546539307, "loss/hidden": 1.46875, "loss/logits": 0.2542518973350525, "loss/reg": 0.0005643838667310774, "step": 4900 }, { "epoch": 0.612625, "grad_norm": 2.1624019145965576, "grad_norm_var": 2.5447694677859847, "learning_rate": 0.0001, "loss": 1.1212, "loss/crossentropy": 2.347147226333618, "loss/hidden": 0.96875, "loss/logits": 0.14684635400772095, "loss/reg": 0.0005640863673761487, "step": 4901 }, { "epoch": 0.61275, "grad_norm": 2.4589107036590576, "grad_norm_var": 2.5433416928967136, "learning_rate": 0.0001, "loss": 1.2414, "loss/crossentropy": 2.5530507564544678, "loss/hidden": 1.046875, "loss/logits": 0.1889043003320694, "loss/reg": 0.000563787529245019, "step": 4902 }, { "epoch": 0.612875, "grad_norm": 2.3927738666534424, "grad_norm_var": 2.29905561177583, "learning_rate": 0.0001, "loss": 1.2991, "loss/crossentropy": 2.5562758445739746, "loss/hidden": 1.1171875, "loss/logits": 0.17628264427185059, "loss/reg": 0.0005635002162307501, "step": 4903 }, { "epoch": 0.613, "grad_norm": 2.126657485961914, "grad_norm_var": 2.2737628947001745, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.64697527885437, "loss/hidden": 0.98046875, "loss/logits": 0.15350303053855896, "loss/reg": 0.0005631986423395574, "step": 4904 }, { "epoch": 0.613125, "grad_norm": 2.0946993827819824, "grad_norm_var": 2.305878266779477, "learning_rate": 0.0001, "loss": 1.1442, "loss/crossentropy": 2.5729494094848633, "loss/hidden": 0.98828125, "loss/logits": 0.15028178691864014, "loss/reg": 0.0005629113293252885, "step": 4905 }, { "epoch": 0.61325, "grad_norm": 2.2689433097839355, "grad_norm_var": 2.3207775438949576, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.7864270210266113, "loss/hidden": 0.94921875, "loss/logits": 0.15274514257907867, "loss/reg": 0.0005626165657304227, "step": 4906 }, { "epoch": 0.613375, "grad_norm": 3.8118295669555664, "grad_norm_var": 2.392609312066164, "learning_rate": 0.0001, "loss": 1.5167, "loss/crossentropy": 2.148536205291748, "loss/hidden": 1.2890625, "loss/logits": 0.22205954790115356, "loss/reg": 0.0005624181940220296, "step": 4907 }, { "epoch": 0.6135, "grad_norm": 2.5969250202178955, "grad_norm_var": 2.362306608936653, "learning_rate": 0.0001, "loss": 1.3024, "loss/crossentropy": 2.8969709873199463, "loss/hidden": 1.0859375, "loss/logits": 0.2108776867389679, "loss/reg": 0.0005621202290058136, "step": 4908 }, { "epoch": 0.613625, "grad_norm": 2.315657138824463, "grad_norm_var": 2.3778634845568867, "learning_rate": 0.0001, "loss": 1.1215, "loss/crossentropy": 2.8404669761657715, "loss/hidden": 0.96875, "loss/logits": 0.1471683382987976, "loss/reg": 0.0005618815193884075, "step": 4909 }, { "epoch": 0.61375, "grad_norm": 2.3845748901367188, "grad_norm_var": 2.388263157729988, "learning_rate": 0.0001, "loss": 1.3462, "loss/crossentropy": 2.6396701335906982, "loss/hidden": 1.125, "loss/logits": 0.21553921699523926, "loss/reg": 0.0005616502603515983, "step": 4910 }, { "epoch": 0.613875, "grad_norm": 3.700953483581543, "grad_norm_var": 2.3973927135397104, "learning_rate": 0.0001, "loss": 1.6638, "loss/crossentropy": 2.2261037826538086, "loss/hidden": 1.4375, "loss/logits": 0.22071364521980286, "loss/reg": 0.0005613930989056826, "step": 4911 }, { "epoch": 0.614, "grad_norm": 3.3779470920562744, "grad_norm_var": 2.3818661132872267, "learning_rate": 0.0001, "loss": 1.3888, "loss/crossentropy": 2.4561421871185303, "loss/hidden": 1.2109375, "loss/logits": 0.17224271595478058, "loss/reg": 0.0005611312226392329, "step": 4912 }, { "epoch": 0.614125, "grad_norm": 4.479068756103516, "grad_norm_var": 2.5238535394861468, "learning_rate": 0.0001, "loss": 1.5685, "loss/crossentropy": 2.5365781784057617, "loss/hidden": 1.3515625, "loss/logits": 0.21131739020347595, "loss/reg": 0.0005608335486613214, "step": 4913 }, { "epoch": 0.61425, "grad_norm": 3.004300117492676, "grad_norm_var": 2.4635495302497774, "learning_rate": 0.0001, "loss": 1.3666, "loss/crossentropy": 2.611215114593506, "loss/hidden": 1.1484375, "loss/logits": 0.21257871389389038, "loss/reg": 0.0005605760379694402, "step": 4914 }, { "epoch": 0.614375, "grad_norm": 2.6984732151031494, "grad_norm_var": 2.418192695796707, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.1883089542388916, "loss/hidden": 1.203125, "loss/logits": 0.21903248131275177, "loss/reg": 0.0005602742894552648, "step": 4915 }, { "epoch": 0.6145, "grad_norm": 2.417009115219116, "grad_norm_var": 0.5027551170145195, "learning_rate": 0.0001, "loss": 1.2988, "loss/crossentropy": 2.364406108856201, "loss/hidden": 1.1015625, "loss/logits": 0.1916167438030243, "loss/reg": 0.0005599759169854224, "step": 4916 }, { "epoch": 0.614625, "grad_norm": 2.762349843978882, "grad_norm_var": 0.476791945631286, "learning_rate": 0.0001, "loss": 1.3369, "loss/crossentropy": 2.5730743408203125, "loss/hidden": 1.125, "loss/logits": 0.20631200075149536, "loss/reg": 0.00055970303947106, "step": 4917 }, { "epoch": 0.61475, "grad_norm": 2.5307722091674805, "grad_norm_var": 0.4737920029827289, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.4997546672821045, "loss/hidden": 1.0703125, "loss/logits": 0.2107049524784088, "loss/reg": 0.0005594396498054266, "step": 4918 }, { "epoch": 0.614875, "grad_norm": 3.475677490234375, "grad_norm_var": 0.4868159454333198, "learning_rate": 0.0001, "loss": 1.4269, "loss/crossentropy": 2.6693992614746094, "loss/hidden": 1.171875, "loss/logits": 0.24939867854118347, "loss/reg": 0.0005591970402747393, "step": 4919 }, { "epoch": 0.615, "grad_norm": 3.0609750747680664, "grad_norm_var": 0.44779310912546916, "learning_rate": 0.0001, "loss": 1.2638, "loss/crossentropy": 2.5154542922973633, "loss/hidden": 1.0703125, "loss/logits": 0.18792179226875305, "loss/reg": 0.0005589518696069717, "step": 4920 }, { "epoch": 0.615125, "grad_norm": 5.800595760345459, "grad_norm_var": 0.8903159635865372, "learning_rate": 0.0001, "loss": 2.1067, "loss/crossentropy": 2.3905189037323, "loss/hidden": 1.71875, "loss/logits": 0.38234490156173706, "loss/reg": 0.0005586519255302846, "step": 4921 }, { "epoch": 0.61525, "grad_norm": 5.030116558074951, "grad_norm_var": 1.0358721371629749, "learning_rate": 0.0001, "loss": 1.9745, "loss/crossentropy": 2.688788652420044, "loss/hidden": 1.609375, "loss/logits": 0.3595157563686371, "loss/reg": 0.0005583763704635203, "step": 4922 }, { "epoch": 0.615375, "grad_norm": 2.760629415512085, "grad_norm_var": 1.0388676493504732, "learning_rate": 0.0001, "loss": 1.34, "loss/crossentropy": 2.6615965366363525, "loss/hidden": 1.1328125, "loss/logits": 0.20165178179740906, "loss/reg": 0.0005581091390922666, "step": 4923 }, { "epoch": 0.6155, "grad_norm": 2.972385883331299, "grad_norm_var": 1.013745349036988, "learning_rate": 0.0001, "loss": 1.4004, "loss/crossentropy": 2.241541624069214, "loss/hidden": 1.203125, "loss/logits": 0.1916828602552414, "loss/reg": 0.0005578378913924098, "step": 4924 }, { "epoch": 0.615625, "grad_norm": 5.409776210784912, "grad_norm_var": 1.2067389947830878, "learning_rate": 0.0001, "loss": 1.5104, "loss/crossentropy": 2.447474956512451, "loss/hidden": 1.28125, "loss/logits": 0.2236139178276062, "loss/reg": 0.0005575845716521144, "step": 4925 }, { "epoch": 0.61575, "grad_norm": 3.0625851154327393, "grad_norm_var": 1.1353935032586757, "learning_rate": 0.0001, "loss": 1.6457, "loss/crossentropy": 2.647064208984375, "loss/hidden": 1.3515625, "loss/logits": 0.28860729932785034, "loss/reg": 0.0005573367234319448, "step": 4926 }, { "epoch": 0.615875, "grad_norm": 2.9702558517456055, "grad_norm_var": 1.1524954316605014, "learning_rate": 0.0001, "loss": 1.5302, "loss/crossentropy": 2.6717190742492676, "loss/hidden": 1.265625, "loss/logits": 0.25905126333236694, "loss/reg": 0.0005570978973992169, "step": 4927 }, { "epoch": 0.616, "grad_norm": 2.231405019760132, "grad_norm_var": 1.2515263767724558, "learning_rate": 0.0001, "loss": 1.322, "loss/crossentropy": 2.550546169281006, "loss/hidden": 1.125, "loss/logits": 0.19142864644527435, "loss/reg": 0.0005568023188970983, "step": 4928 }, { "epoch": 0.616125, "grad_norm": 3.174826145172119, "grad_norm_var": 1.1730880862580382, "learning_rate": 0.0001, "loss": 1.5011, "loss/crossentropy": 2.478537082672119, "loss/hidden": 1.25, "loss/logits": 0.2455374151468277, "loss/reg": 0.0005565644823946059, "step": 4929 }, { "epoch": 0.61625, "grad_norm": 2.339202642440796, "grad_norm_var": 1.2300734284653778, "learning_rate": 0.0001, "loss": 1.0613, "loss/crossentropy": 2.80464243888855, "loss/hidden": 0.91015625, "loss/logits": 0.1455819010734558, "loss/reg": 0.0005563375889323652, "step": 4930 }, { "epoch": 0.616375, "grad_norm": 8.22903060913086, "grad_norm_var": 2.7029399654023174, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.713233470916748, "loss/hidden": 1.1015625, "loss/logits": 0.16711124777793884, "loss/reg": 0.0005560611025430262, "step": 4931 }, { "epoch": 0.6165, "grad_norm": 2.9488089084625244, "grad_norm_var": 2.6339524647797625, "learning_rate": 0.0001, "loss": 1.326, "loss/crossentropy": 2.720275640487671, "loss/hidden": 1.1171875, "loss/logits": 0.20328159630298615, "loss/reg": 0.0005557855474762619, "step": 4932 }, { "epoch": 0.616625, "grad_norm": 2.4534895420074463, "grad_norm_var": 2.6773943032750434, "learning_rate": 0.0001, "loss": 1.2565, "loss/crossentropy": 2.6757986545562744, "loss/hidden": 1.0546875, "loss/logits": 0.19622473418712616, "loss/reg": 0.0005555375828407705, "step": 4933 }, { "epoch": 0.61675, "grad_norm": 3.0552282333374023, "grad_norm_var": 2.6160995678738943, "learning_rate": 0.0001, "loss": 1.6384, "loss/crossentropy": 2.2904632091522217, "loss/hidden": 1.3828125, "loss/logits": 0.2500396966934204, "loss/reg": 0.0005552431684918702, "step": 4934 }, { "epoch": 0.616875, "grad_norm": 3.04520583152771, "grad_norm_var": 2.6397492720599622, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.741863489151001, "loss/hidden": 1.1015625, "loss/logits": 0.2029910385608673, "loss/reg": 0.0005549644702114165, "step": 4935 }, { "epoch": 0.617, "grad_norm": 2.485229253768921, "grad_norm_var": 2.706377501638053, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 3.2309765815734863, "loss/hidden": 1.0234375, "loss/logits": 0.147535040974617, "loss/reg": 0.0005546616739593446, "step": 4936 }, { "epoch": 0.617125, "grad_norm": 2.8473825454711914, "grad_norm_var": 2.3940343003622035, "learning_rate": 0.0001, "loss": 1.346, "loss/crossentropy": 2.4097814559936523, "loss/hidden": 1.140625, "loss/logits": 0.1998603343963623, "loss/reg": 0.0005543545703403652, "step": 4937 }, { "epoch": 0.61725, "grad_norm": 2.753227710723877, "grad_norm_var": 2.234848637118397, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.386141777038574, "loss/hidden": 1.03125, "loss/logits": 0.19766053557395935, "loss/reg": 0.0005540624842979014, "step": 4938 }, { "epoch": 0.617375, "grad_norm": 2.80220627784729, "grad_norm_var": 2.23198788158021, "learning_rate": 0.0001, "loss": 1.1267, "loss/crossentropy": 2.60487961769104, "loss/hidden": 0.96875, "loss/logits": 0.15239843726158142, "loss/reg": 0.0005537591059692204, "step": 4939 }, { "epoch": 0.6175, "grad_norm": 2.1790518760681152, "grad_norm_var": 2.305847784743067, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.7491512298583984, "loss/hidden": 0.97265625, "loss/logits": 0.16691777110099792, "loss/reg": 0.0005534747615456581, "step": 4940 }, { "epoch": 0.617625, "grad_norm": 6.779530048370361, "grad_norm_var": 2.817709514025163, "learning_rate": 0.0001, "loss": 1.5253, "loss/crossentropy": 2.5956194400787354, "loss/hidden": 1.2265625, "loss/logits": 0.2931591868400574, "loss/reg": 0.0005531933275051415, "step": 4941 }, { "epoch": 0.61775, "grad_norm": 4.106756210327148, "grad_norm_var": 2.847955491236708, "learning_rate": 0.0001, "loss": 1.1935, "loss/crossentropy": 2.4026265144348145, "loss/hidden": 1.03125, "loss/logits": 0.15674808621406555, "loss/reg": 0.0005528946639969945, "step": 4942 }, { "epoch": 0.617875, "grad_norm": 3.5193865299224854, "grad_norm_var": 2.8353334343422487, "learning_rate": 0.0001, "loss": 1.5551, "loss/crossentropy": 2.529135227203369, "loss/hidden": 1.28125, "loss/logits": 0.2682899832725525, "loss/reg": 0.0005525964661501348, "step": 4943 }, { "epoch": 0.618, "grad_norm": 2.6186234951019287, "grad_norm_var": 2.7825963802642906, "learning_rate": 0.0001, "loss": 1.3331, "loss/crossentropy": 2.500404119491577, "loss/hidden": 1.1328125, "loss/logits": 0.19478866457939148, "loss/reg": 0.0005522929131984711, "step": 4944 }, { "epoch": 0.618125, "grad_norm": 1.9407151937484741, "grad_norm_var": 2.9244759424276814, "learning_rate": 0.0001, "loss": 1.0385, "loss/crossentropy": 2.6449999809265137, "loss/hidden": 0.89453125, "loss/logits": 0.13842251896858215, "loss/reg": 0.0005519767291843891, "step": 4945 }, { "epoch": 0.61825, "grad_norm": 4.238762855529785, "grad_norm_var": 2.88602360360321, "learning_rate": 0.0001, "loss": 1.4043, "loss/crossentropy": 2.4643030166625977, "loss/hidden": 1.2265625, "loss/logits": 0.17225658893585205, "loss/reg": 0.0005516897072084248, "step": 4946 }, { "epoch": 0.618375, "grad_norm": 2.606613874435425, "grad_norm_var": 1.3167260779290177, "learning_rate": 0.0001, "loss": 1.2819, "loss/crossentropy": 2.619479179382324, "loss/hidden": 1.0546875, "loss/logits": 0.22167310118675232, "loss/reg": 0.0005513797514140606, "step": 4947 }, { "epoch": 0.6185, "grad_norm": 2.364633083343506, "grad_norm_var": 1.3536294118464203, "learning_rate": 0.0001, "loss": 1.3173, "loss/crossentropy": 2.4849133491516113, "loss/hidden": 1.109375, "loss/logits": 0.2024294137954712, "loss/reg": 0.0005510967457666993, "step": 4948 }, { "epoch": 0.618625, "grad_norm": 2.91581130027771, "grad_norm_var": 1.3263801801186308, "learning_rate": 0.0001, "loss": 1.3633, "loss/crossentropy": 2.8399455547332764, "loss/hidden": 1.171875, "loss/logits": 0.18587136268615723, "loss/reg": 0.0005508185131475329, "step": 4949 }, { "epoch": 0.61875, "grad_norm": 2.0510663986206055, "grad_norm_var": 1.4009051079539152, "learning_rate": 0.0001, "loss": 1.214, "loss/crossentropy": 2.4491288661956787, "loss/hidden": 1.0390625, "loss/logits": 0.16943015158176422, "loss/reg": 0.0005505350418388844, "step": 4950 }, { "epoch": 0.618875, "grad_norm": 2.3399457931518555, "grad_norm_var": 1.4351123324713104, "learning_rate": 0.0001, "loss": 1.301, "loss/crossentropy": 2.187808036804199, "loss/hidden": 1.1015625, "loss/logits": 0.1939225047826767, "loss/reg": 0.0005502430140040815, "step": 4951 }, { "epoch": 0.619, "grad_norm": 2.9870474338531494, "grad_norm_var": 1.4141127553033608, "learning_rate": 0.0001, "loss": 1.3025, "loss/crossentropy": 2.555478096008301, "loss/hidden": 1.109375, "loss/logits": 0.1875842809677124, "loss/reg": 0.0005499743274413049, "step": 4952 }, { "epoch": 0.619125, "grad_norm": 2.349762201309204, "grad_norm_var": 1.4440727863086091, "learning_rate": 0.0001, "loss": 1.2571, "loss/crossentropy": 2.5831427574157715, "loss/hidden": 1.0703125, "loss/logits": 0.18124550580978394, "loss/reg": 0.0005497378297150135, "step": 4953 }, { "epoch": 0.61925, "grad_norm": 3.70003604888916, "grad_norm_var": 1.4645835397214293, "learning_rate": 0.0001, "loss": 2.0057, "loss/crossentropy": 1.8808538913726807, "loss/hidden": 1.5390625, "loss/logits": 0.4611845016479492, "loss/reg": 0.0005494721699506044, "step": 4954 }, { "epoch": 0.619375, "grad_norm": 2.459172248840332, "grad_norm_var": 1.4852725034329126, "learning_rate": 0.0001, "loss": 1.0965, "loss/crossentropy": 2.6243743896484375, "loss/hidden": 0.9453125, "loss/logits": 0.14564569294452667, "loss/reg": 0.0005491918418556452, "step": 4955 }, { "epoch": 0.6195, "grad_norm": 7.967751502990723, "grad_norm_var": 2.8901495087245435, "learning_rate": 0.0001, "loss": 1.2421, "loss/crossentropy": 2.5883536338806152, "loss/hidden": 1.0703125, "loss/logits": 0.16630947589874268, "loss/reg": 0.0005489016184583306, "step": 4956 }, { "epoch": 0.619625, "grad_norm": 2.8494207859039307, "grad_norm_var": 2.102455817661897, "learning_rate": 0.0001, "loss": 1.4918, "loss/crossentropy": 2.6971263885498047, "loss/hidden": 1.25, "loss/logits": 0.2362653911113739, "loss/reg": 0.0005486051086336374, "step": 4957 }, { "epoch": 0.61975, "grad_norm": 3.540822744369507, "grad_norm_var": 2.0531814366734986, "learning_rate": 0.0001, "loss": 1.4487, "loss/crossentropy": 2.562878131866455, "loss/hidden": 1.1875, "loss/logits": 0.2557227611541748, "loss/reg": 0.0005483588902279735, "step": 4958 }, { "epoch": 0.619875, "grad_norm": 7.452695369720459, "grad_norm_var": 3.212210512452151, "learning_rate": 0.0001, "loss": 1.4619, "loss/crossentropy": 2.395374298095703, "loss/hidden": 1.296875, "loss/logits": 0.15956051647663116, "loss/reg": 0.0005480868858285248, "step": 4959 }, { "epoch": 0.62, "grad_norm": 2.17000412940979, "grad_norm_var": 3.2714639700438437, "learning_rate": 0.0001, "loss": 1.1831, "loss/crossentropy": 2.4675443172454834, "loss/hidden": 1.0, "loss/logits": 0.17759773135185242, "loss/reg": 0.0005478295497596264, "step": 4960 }, { "epoch": 0.620125, "grad_norm": 3.152024984359741, "grad_norm_var": 3.1321835887962304, "learning_rate": 0.0001, "loss": 1.4403, "loss/crossentropy": 2.085671901702881, "loss/hidden": 1.2421875, "loss/logits": 0.19260652363300323, "loss/reg": 0.0005475407815538347, "step": 4961 }, { "epoch": 0.62025, "grad_norm": 2.5981404781341553, "grad_norm_var": 3.127125452895486, "learning_rate": 0.0001, "loss": 1.3438, "loss/crossentropy": 2.4713547229766846, "loss/hidden": 1.15625, "loss/logits": 0.18208745121955872, "loss/reg": 0.0005472704651765525, "step": 4962 }, { "epoch": 0.620375, "grad_norm": 2.2771992683410645, "grad_norm_var": 3.1662976149057, "learning_rate": 0.0001, "loss": 1.2225, "loss/crossentropy": 2.6252012252807617, "loss/hidden": 1.046875, "loss/logits": 0.1701851189136505, "loss/reg": 0.0005469817551784217, "step": 4963 }, { "epoch": 0.6205, "grad_norm": 2.7038536071777344, "grad_norm_var": 3.130121864727452, "learning_rate": 0.0001, "loss": 1.3889, "loss/crossentropy": 2.424913167953491, "loss/hidden": 1.1875, "loss/logits": 0.1959245800971985, "loss/reg": 0.0005467231967486441, "step": 4964 }, { "epoch": 0.620625, "grad_norm": 2.8267664909362793, "grad_norm_var": 3.135709136792216, "learning_rate": 0.0001, "loss": 1.4093, "loss/crossentropy": 2.7479541301727295, "loss/hidden": 1.203125, "loss/logits": 0.20067881047725677, "loss/reg": 0.0005464740097522736, "step": 4965 }, { "epoch": 0.62075, "grad_norm": 2.332639455795288, "grad_norm_var": 3.0923073503630034, "learning_rate": 0.0001, "loss": 1.3454, "loss/crossentropy": 2.5609183311462402, "loss/hidden": 1.125, "loss/logits": 0.21495093405246735, "loss/reg": 0.0005461996188387275, "step": 4966 }, { "epoch": 0.620875, "grad_norm": 3.246527910232544, "grad_norm_var": 3.0207720985137065, "learning_rate": 0.0001, "loss": 1.3187, "loss/crossentropy": 2.2175519466400146, "loss/hidden": 1.1328125, "loss/logits": 0.18042564392089844, "loss/reg": 0.0005459077656269073, "step": 4967 }, { "epoch": 0.621, "grad_norm": 3.6334073543548584, "grad_norm_var": 3.010142675607174, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.7237327098846436, "loss/hidden": 1.21875, "loss/logits": 0.2573617100715637, "loss/reg": 0.0005455960636027157, "step": 4968 }, { "epoch": 0.621125, "grad_norm": 2.449659824371338, "grad_norm_var": 2.9960614419943026, "learning_rate": 0.0001, "loss": 1.3589, "loss/crossentropy": 2.1880946159362793, "loss/hidden": 1.140625, "loss/logits": 0.21278893947601318, "loss/reg": 0.0005453195190057158, "step": 4969 }, { "epoch": 0.62125, "grad_norm": 2.257798671722412, "grad_norm_var": 3.0799074232646944, "learning_rate": 0.0001, "loss": 1.0912, "loss/crossentropy": 2.467087507247925, "loss/hidden": 0.9296875, "loss/logits": 0.15609470009803772, "loss/reg": 0.0005450189928524196, "step": 4970 }, { "epoch": 0.621375, "grad_norm": 3.195885419845581, "grad_norm_var": 3.02437287897978, "learning_rate": 0.0001, "loss": 1.6244, "loss/crossentropy": 2.53643798828125, "loss/hidden": 1.359375, "loss/logits": 0.2596060633659363, "loss/reg": 0.0005447437870316207, "step": 4971 }, { "epoch": 0.6215, "grad_norm": 2.6923084259033203, "grad_norm_var": 1.562037512164922, "learning_rate": 0.0001, "loss": 1.2012, "loss/crossentropy": 2.783494710922241, "loss/hidden": 1.03125, "loss/logits": 0.16452351212501526, "loss/reg": 0.0005445019924081862, "step": 4972 }, { "epoch": 0.621625, "grad_norm": 2.9837582111358643, "grad_norm_var": 1.558924363622975, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.579230546951294, "loss/hidden": 1.234375, "loss/logits": 0.2293362319469452, "loss/reg": 0.0005442381370812654, "step": 4973 }, { "epoch": 0.62175, "grad_norm": 3.1515915393829346, "grad_norm_var": 1.5452349804222325, "learning_rate": 0.0001, "loss": 1.4645, "loss/crossentropy": 2.8038628101348877, "loss/hidden": 1.2109375, "loss/logits": 0.24810640513896942, "loss/reg": 0.0005439876695163548, "step": 4974 }, { "epoch": 0.621875, "grad_norm": 2.867612361907959, "grad_norm_var": 0.17999815549210113, "learning_rate": 0.0001, "loss": 1.5939, "loss/crossentropy": 2.2588303089141846, "loss/hidden": 1.3671875, "loss/logits": 0.22128160297870636, "loss/reg": 0.0005437021609395742, "step": 4975 }, { "epoch": 0.622, "grad_norm": 2.977447509765625, "grad_norm_var": 0.15467614764045645, "learning_rate": 0.0001, "loss": 1.2284, "loss/crossentropy": 2.604496479034424, "loss/hidden": 1.046875, "loss/logits": 0.17606210708618164, "loss/reg": 0.0005434139166027308, "step": 4976 }, { "epoch": 0.622125, "grad_norm": 2.9938836097717285, "grad_norm_var": 0.14953692469480656, "learning_rate": 0.0001, "loss": 1.6052, "loss/crossentropy": 2.655782699584961, "loss/hidden": 1.3203125, "loss/logits": 0.2794850468635559, "loss/reg": 0.0005431256140582263, "step": 4977 }, { "epoch": 0.62225, "grad_norm": 2.6136205196380615, "grad_norm_var": 0.14908514843266, "learning_rate": 0.0001, "loss": 1.2703, "loss/crossentropy": 2.469048500061035, "loss/hidden": 1.09375, "loss/logits": 0.17116305232048035, "loss/reg": 0.0005428439471870661, "step": 4978 }, { "epoch": 0.622375, "grad_norm": 2.2439520359039307, "grad_norm_var": 0.15158371289595132, "learning_rate": 0.0001, "loss": 1.2786, "loss/crossentropy": 2.4852683544158936, "loss/hidden": 1.0703125, "loss/logits": 0.2028830647468567, "loss/reg": 0.0005425706040114164, "step": 4979 }, { "epoch": 0.6225, "grad_norm": 2.510223865509033, "grad_norm_var": 0.15700740829626436, "learning_rate": 0.0001, "loss": 1.3322, "loss/crossentropy": 2.5004351139068604, "loss/hidden": 1.109375, "loss/logits": 0.2173686921596527, "loss/reg": 0.0005422804388217628, "step": 4980 }, { "epoch": 0.622625, "grad_norm": 2.6179513931274414, "grad_norm_var": 0.15929555643209123, "learning_rate": 0.0001, "loss": 1.2688, "loss/crossentropy": 2.408979892730713, "loss/hidden": 1.0703125, "loss/logits": 0.1930854618549347, "loss/reg": 0.000542005873285234, "step": 4981 }, { "epoch": 0.62275, "grad_norm": 3.0977301597595215, "grad_norm_var": 0.14840668010433714, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.8418331146240234, "loss/hidden": 1.1328125, "loss/logits": 0.21051013469696045, "loss/reg": 0.0005417463835328817, "step": 4982 }, { "epoch": 0.622875, "grad_norm": 2.273543357849121, "grad_norm_var": 0.15559295258558828, "learning_rate": 0.0001, "loss": 1.3372, "loss/crossentropy": 2.582071304321289, "loss/hidden": 1.1328125, "loss/logits": 0.19902080297470093, "loss/reg": 0.0005414910847321153, "step": 4983 }, { "epoch": 0.623, "grad_norm": 2.2438156604766846, "grad_norm_var": 0.11909062870261759, "learning_rate": 0.0001, "loss": 1.2352, "loss/crossentropy": 2.7186272144317627, "loss/hidden": 1.0546875, "loss/logits": 0.17511454224586487, "loss/reg": 0.0005411949823610485, "step": 4984 }, { "epoch": 0.623125, "grad_norm": 2.3704049587249756, "grad_norm_var": 0.12210933879369629, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.578062057495117, "loss/hidden": 1.109375, "loss/logits": 0.21462640166282654, "loss/reg": 0.000540881184861064, "step": 4985 }, { "epoch": 0.62325, "grad_norm": 2.7225797176361084, "grad_norm_var": 0.10862723704264378, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.387460470199585, "loss/hidden": 1.15625, "loss/logits": 0.2006010115146637, "loss/reg": 0.0005405499832704663, "step": 4986 }, { "epoch": 0.623375, "grad_norm": 2.59094500541687, "grad_norm_var": 0.09329802242593056, "learning_rate": 0.0001, "loss": 1.4758, "loss/crossentropy": 2.711782217025757, "loss/hidden": 1.234375, "loss/logits": 0.2360600233078003, "loss/reg": 0.0005402121460065246, "step": 4987 }, { "epoch": 0.6235, "grad_norm": 2.3170294761657715, "grad_norm_var": 0.10170747841076017, "learning_rate": 0.0001, "loss": 1.1656, "loss/crossentropy": 2.493044137954712, "loss/hidden": 0.98828125, "loss/logits": 0.17190653085708618, "loss/reg": 0.0005398475332185626, "step": 4988 }, { "epoch": 0.623625, "grad_norm": 3.405662775039673, "grad_norm_var": 0.13098880211180794, "learning_rate": 0.0001, "loss": 1.4146, "loss/crossentropy": 2.2370951175689697, "loss/hidden": 1.2109375, "loss/logits": 0.19830486178398132, "loss/reg": 0.0005394754698500037, "step": 4989 }, { "epoch": 0.62375, "grad_norm": 2.743506908416748, "grad_norm_var": 0.11613848012945573, "learning_rate": 0.0001, "loss": 1.4574, "loss/crossentropy": 2.4089930057525635, "loss/hidden": 1.21875, "loss/logits": 0.23329396545886993, "loss/reg": 0.0005390917067416012, "step": 4990 }, { "epoch": 0.623875, "grad_norm": 3.6254470348358154, "grad_norm_var": 0.17282229398171392, "learning_rate": 0.0001, "loss": 1.4109, "loss/crossentropy": 2.93365216255188, "loss/hidden": 1.125, "loss/logits": 0.2805187404155731, "loss/reg": 0.0005388056742958724, "step": 4991 }, { "epoch": 0.624, "grad_norm": 2.3916385173797607, "grad_norm_var": 0.1733209701091461, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.4066271781921387, "loss/hidden": 1.109375, "loss/logits": 0.21327364444732666, "loss/reg": 0.0005385206895880401, "step": 4992 }, { "epoch": 0.624125, "grad_norm": 2.8669750690460205, "grad_norm_var": 0.168891450954348, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.4358341693878174, "loss/hidden": 1.125, "loss/logits": 0.1876077651977539, "loss/reg": 0.0005381813389249146, "step": 4993 }, { "epoch": 0.62425, "grad_norm": 2.323716402053833, "grad_norm_var": 0.17611822675356734, "learning_rate": 0.0001, "loss": 1.1389, "loss/crossentropy": 2.7564315795898438, "loss/hidden": 0.97265625, "loss/logits": 0.16084694862365723, "loss/reg": 0.0005378050846047699, "step": 4994 }, { "epoch": 0.624375, "grad_norm": 2.279208183288574, "grad_norm_var": 0.1743032788747956, "learning_rate": 0.0001, "loss": 1.2356, "loss/crossentropy": 2.6019973754882812, "loss/hidden": 1.0390625, "loss/logits": 0.1911901831626892, "loss/reg": 0.0005375213222578168, "step": 4995 }, { "epoch": 0.6245, "grad_norm": 2.324799060821533, "grad_norm_var": 0.17987758528996642, "learning_rate": 0.0001, "loss": 1.3767, "loss/crossentropy": 2.7621936798095703, "loss/hidden": 1.1875, "loss/logits": 0.18386799097061157, "loss/reg": 0.0005371645675040781, "step": 4996 }, { "epoch": 0.624625, "grad_norm": 2.4591751098632812, "grad_norm_var": 0.18186037493414703, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.5723395347595215, "loss/hidden": 0.9765625, "loss/logits": 0.18314939737319946, "loss/reg": 0.0005368839483708143, "step": 4997 }, { "epoch": 0.62475, "grad_norm": 2.578076124191284, "grad_norm_var": 0.1661404077781981, "learning_rate": 0.0001, "loss": 1.3582, "loss/crossentropy": 2.658134698867798, "loss/hidden": 1.140625, "loss/logits": 0.21220415830612183, "loss/reg": 0.0005365489632822573, "step": 4998 }, { "epoch": 0.624875, "grad_norm": 2.862993001937866, "grad_norm_var": 0.16260883171552767, "learning_rate": 0.0001, "loss": 1.3707, "loss/crossentropy": 2.54771089553833, "loss/hidden": 1.171875, "loss/logits": 0.19346684217453003, "loss/reg": 0.0005362670635804534, "step": 4999 }, { "epoch": 0.625, "grad_norm": 2.6886775493621826, "grad_norm_var": 0.15197493367687662, "learning_rate": 0.0001, "loss": 1.4083, "loss/crossentropy": 2.536412477493286, "loss/hidden": 1.1953125, "loss/logits": 0.20759883522987366, "loss/reg": 0.0005359887727536261, "step": 5000 }, { "epoch": 0.625125, "grad_norm": 2.173192262649536, "grad_norm_var": 0.16200558353296302, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.6688363552093506, "loss/hidden": 0.93359375, "loss/logits": 0.1460009068250656, "loss/reg": 0.0005356702022254467, "step": 5001 }, { "epoch": 0.62525, "grad_norm": 2.5002565383911133, "grad_norm_var": 0.162857397305307, "learning_rate": 0.0001, "loss": 1.3119, "loss/crossentropy": 2.539886474609375, "loss/hidden": 1.109375, "loss/logits": 0.1971282958984375, "loss/reg": 0.0005353468586690724, "step": 5002 }, { "epoch": 0.625375, "grad_norm": 2.454914093017578, "grad_norm_var": 0.16478043318490995, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.576460599899292, "loss/hidden": 1.0859375, "loss/logits": 0.17361682653427124, "loss/reg": 0.000534996681381017, "step": 5003 }, { "epoch": 0.6255, "grad_norm": 2.3305697441101074, "grad_norm_var": 0.16423642533440722, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.6183841228485107, "loss/hidden": 1.125, "loss/logits": 0.19884878396987915, "loss/reg": 0.0005346306134015322, "step": 5004 }, { "epoch": 0.625625, "grad_norm": 2.4461283683776855, "grad_norm_var": 0.1219746281482145, "learning_rate": 0.0001, "loss": 1.2255, "loss/crossentropy": 2.504101276397705, "loss/hidden": 1.046875, "loss/logits": 0.17329788208007812, "loss/reg": 0.0005342492368072271, "step": 5005 }, { "epoch": 0.62575, "grad_norm": 2.400207281112671, "grad_norm_var": 0.12119622667640115, "learning_rate": 0.0001, "loss": 1.2144, "loss/crossentropy": 2.453693151473999, "loss/hidden": 1.03125, "loss/logits": 0.17779508233070374, "loss/reg": 0.0005339618073776364, "step": 5006 }, { "epoch": 0.625875, "grad_norm": 2.026522159576416, "grad_norm_var": 0.05045390545200519, "learning_rate": 0.0001, "loss": 1.1753, "loss/crossentropy": 2.081284523010254, "loss/hidden": 1.015625, "loss/logits": 0.15432702004909515, "loss/reg": 0.0005336086032912135, "step": 5007 }, { "epoch": 0.626, "grad_norm": 2.69169282913208, "grad_norm_var": 0.05397847879995439, "learning_rate": 0.0001, "loss": 1.4648, "loss/crossentropy": 2.326686382293701, "loss/hidden": 1.234375, "loss/logits": 0.22512081265449524, "loss/reg": 0.0005332567961886525, "step": 5008 }, { "epoch": 0.626125, "grad_norm": 2.576397657394409, "grad_norm_var": 0.04360203996723702, "learning_rate": 0.0001, "loss": 1.2867, "loss/crossentropy": 2.428075075149536, "loss/hidden": 1.09375, "loss/logits": 0.18758901953697205, "loss/reg": 0.000532975303940475, "step": 5009 }, { "epoch": 0.62625, "grad_norm": 2.31620717048645, "grad_norm_var": 0.043726779765735795, "learning_rate": 0.0001, "loss": 1.1205, "loss/crossentropy": 2.3769290447235107, "loss/hidden": 0.96484375, "loss/logits": 0.1503528356552124, "loss/reg": 0.0005326452082954347, "step": 5010 }, { "epoch": 0.626375, "grad_norm": 2.7987914085388184, "grad_norm_var": 0.049161568292790274, "learning_rate": 0.0001, "loss": 1.3286, "loss/crossentropy": 2.608154535293579, "loss/hidden": 1.1328125, "loss/logits": 0.19049546122550964, "loss/reg": 0.0005323568475432694, "step": 5011 }, { "epoch": 0.6265, "grad_norm": 2.0328097343444824, "grad_norm_var": 0.06040737985483678, "learning_rate": 0.0001, "loss": 1.2486, "loss/crossentropy": 2.7035179138183594, "loss/hidden": 1.0546875, "loss/logits": 0.18858999013900757, "loss/reg": 0.0005320453201420605, "step": 5012 }, { "epoch": 0.626625, "grad_norm": 2.406262159347534, "grad_norm_var": 0.06057787261439671, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.3519928455352783, "loss/hidden": 1.046875, "loss/logits": 0.16747893393039703, "loss/reg": 0.0005317269242368639, "step": 5013 }, { "epoch": 0.62675, "grad_norm": 4.077879428863525, "grad_norm_var": 0.22573177173395392, "learning_rate": 0.0001, "loss": 1.4982, "loss/crossentropy": 2.7202751636505127, "loss/hidden": 1.234375, "loss/logits": 0.25851911306381226, "loss/reg": 0.0005314233712852001, "step": 5014 }, { "epoch": 0.626875, "grad_norm": 2.347374677658081, "grad_norm_var": 0.2207592809039357, "learning_rate": 0.0001, "loss": 1.3269, "loss/crossentropy": 2.5925893783569336, "loss/hidden": 1.109375, "loss/logits": 0.2122023105621338, "loss/reg": 0.0005311404238454998, "step": 5015 }, { "epoch": 0.627, "grad_norm": 3.0474603176116943, "grad_norm_var": 0.23702956665169736, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.6081910133361816, "loss/hidden": 1.2109375, "loss/logits": 0.17407673597335815, "loss/reg": 0.000530834193341434, "step": 5016 }, { "epoch": 0.627125, "grad_norm": 6.469361305236816, "grad_norm_var": 1.1809579869165938, "learning_rate": 0.0001, "loss": 1.3583, "loss/crossentropy": 2.4296584129333496, "loss/hidden": 1.15625, "loss/logits": 0.1967136561870575, "loss/reg": 0.0005305051454342902, "step": 5017 }, { "epoch": 0.62725, "grad_norm": 2.634432554244995, "grad_norm_var": 1.176583390208456, "learning_rate": 0.0001, "loss": 1.3239, "loss/crossentropy": 2.529200792312622, "loss/hidden": 1.1328125, "loss/logits": 0.18579792976379395, "loss/reg": 0.000530207937117666, "step": 5018 }, { "epoch": 0.627375, "grad_norm": 3.404893159866333, "grad_norm_var": 1.18724261133315, "learning_rate": 0.0001, "loss": 1.5197, "loss/crossentropy": 2.8444321155548096, "loss/hidden": 1.28125, "loss/logits": 0.23317666351795197, "loss/reg": 0.0005298717296682298, "step": 5019 }, { "epoch": 0.6275, "grad_norm": 2.3151774406433105, "grad_norm_var": 1.1883756537124015, "learning_rate": 0.0001, "loss": 1.0751, "loss/crossentropy": 2.4723622798919678, "loss/hidden": 0.92578125, "loss/logits": 0.1440669298171997, "loss/reg": 0.0005295376176945865, "step": 5020 }, { "epoch": 0.627625, "grad_norm": 6.513567924499512, "grad_norm_var": 1.9900765627772936, "learning_rate": 0.0001, "loss": 1.4448, "loss/crossentropy": 2.532794237136841, "loss/hidden": 1.265625, "loss/logits": 0.17392930388450623, "loss/reg": 0.0005292606656439602, "step": 5021 }, { "epoch": 0.62775, "grad_norm": 2.832745313644409, "grad_norm_var": 1.9597567804267024, "learning_rate": 0.0001, "loss": 1.5042, "loss/crossentropy": 2.2533299922943115, "loss/hidden": 1.2734375, "loss/logits": 0.2255171239376068, "loss/reg": 0.0005289779510349035, "step": 5022 }, { "epoch": 0.627875, "grad_norm": 3.7823522090911865, "grad_norm_var": 1.8880824031423058, "learning_rate": 0.0001, "loss": 1.288, "loss/crossentropy": 2.830096483230591, "loss/hidden": 1.078125, "loss/logits": 0.20461347699165344, "loss/reg": 0.000528674281667918, "step": 5023 }, { "epoch": 0.628, "grad_norm": 5.636371612548828, "grad_norm_var": 2.2047524442272466, "learning_rate": 0.0001, "loss": 1.3604, "loss/crossentropy": 2.2842531204223633, "loss/hidden": 1.1875, "loss/logits": 0.16757792234420776, "loss/reg": 0.000528356060385704, "step": 5024 }, { "epoch": 0.628125, "grad_norm": 3.145840883255005, "grad_norm_var": 2.1587276825525064, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.646454095840454, "loss/hidden": 1.125, "loss/logits": 0.20607605576515198, "loss/reg": 0.0005280792829580605, "step": 5025 }, { "epoch": 0.62825, "grad_norm": 2.7483439445495605, "grad_norm_var": 2.103049787781537, "learning_rate": 0.0001, "loss": 1.4575, "loss/crossentropy": 2.395620822906494, "loss/hidden": 1.2265625, "loss/logits": 0.2256191372871399, "loss/reg": 0.0005278020398691297, "step": 5026 }, { "epoch": 0.628375, "grad_norm": 2.7479169368743896, "grad_norm_var": 2.1080501378842045, "learning_rate": 0.0001, "loss": 1.5147, "loss/crossentropy": 2.502549409866333, "loss/hidden": 1.28125, "loss/logits": 0.22812913358211517, "loss/reg": 0.0005274826544336975, "step": 5027 }, { "epoch": 0.6285, "grad_norm": 2.253476142883301, "grad_norm_var": 2.0676629704273197, "learning_rate": 0.0001, "loss": 1.2273, "loss/crossentropy": 2.6329345703125, "loss/hidden": 1.046875, "loss/logits": 0.17520323395729065, "loss/reg": 0.000527159427292645, "step": 5028 }, { "epoch": 0.628625, "grad_norm": 3.1405653953552246, "grad_norm_var": 1.9920542922725872, "learning_rate": 0.0001, "loss": 1.4966, "loss/crossentropy": 2.415097951889038, "loss/hidden": 1.25, "loss/logits": 0.24129429459571838, "loss/reg": 0.0005268380045890808, "step": 5029 }, { "epoch": 0.62875, "grad_norm": 2.9411144256591797, "grad_norm_var": 1.9956296636451034, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.5109505653381348, "loss/hidden": 1.1640625, "loss/logits": 0.22359582781791687, "loss/reg": 0.0005265563959255815, "step": 5030 }, { "epoch": 0.628875, "grad_norm": 2.8404643535614014, "grad_norm_var": 1.9352063445058083, "learning_rate": 0.0001, "loss": 1.2813, "loss/crossentropy": 2.7974860668182373, "loss/hidden": 1.09375, "loss/logits": 0.18230897188186646, "loss/reg": 0.0005262440536171198, "step": 5031 }, { "epoch": 0.629, "grad_norm": 2.664606809616089, "grad_norm_var": 1.9689169792095744, "learning_rate": 0.0001, "loss": 1.4277, "loss/crossentropy": 2.4033122062683105, "loss/hidden": 1.2109375, "loss/logits": 0.21154119074344635, "loss/reg": 0.0005259308964014053, "step": 5032 }, { "epoch": 0.629125, "grad_norm": 2.205617904663086, "grad_norm_var": 1.419587828256574, "learning_rate": 0.0001, "loss": 1.2474, "loss/crossentropy": 2.2554681301116943, "loss/hidden": 1.0703125, "loss/logits": 0.17179113626480103, "loss/reg": 0.0005256090080365539, "step": 5033 }, { "epoch": 0.62925, "grad_norm": 2.5189366340637207, "grad_norm_var": 1.429715651584242, "learning_rate": 0.0001, "loss": 1.2132, "loss/crossentropy": 2.864790439605713, "loss/hidden": 1.03125, "loss/logits": 0.1766846776008606, "loss/reg": 0.0005252904375083745, "step": 5034 }, { "epoch": 0.629375, "grad_norm": 2.8188209533691406, "grad_norm_var": 1.4375750853973766, "learning_rate": 0.0001, "loss": 1.3996, "loss/crossentropy": 2.2679951190948486, "loss/hidden": 1.1875, "loss/logits": 0.2068983018398285, "loss/reg": 0.0005250159301795065, "step": 5035 }, { "epoch": 0.6295, "grad_norm": 10.175372123718262, "grad_norm_var": 4.377836819397839, "learning_rate": 0.0001, "loss": 1.482, "loss/crossentropy": 2.835876941680908, "loss/hidden": 1.296875, "loss/logits": 0.17990973591804504, "loss/reg": 0.000524749280884862, "step": 5036 }, { "epoch": 0.629625, "grad_norm": 2.3314313888549805, "grad_norm_var": 3.89393054126915, "learning_rate": 0.0001, "loss": 1.3596, "loss/crossentropy": 2.451582431793213, "loss/hidden": 1.1484375, "loss/logits": 0.205934077501297, "loss/reg": 0.0005245005013421178, "step": 5037 }, { "epoch": 0.62975, "grad_norm": 2.7197513580322266, "grad_norm_var": 3.9036362575385453, "learning_rate": 0.0001, "loss": 1.3131, "loss/crossentropy": 2.338655710220337, "loss/hidden": 1.125, "loss/logits": 0.18287312984466553, "loss/reg": 0.0005242621409706771, "step": 5038 }, { "epoch": 0.629875, "grad_norm": 3.3357372283935547, "grad_norm_var": 3.894342795596454, "learning_rate": 0.0001, "loss": 1.3464, "loss/crossentropy": 2.445064067840576, "loss/hidden": 1.140625, "loss/logits": 0.2004949450492859, "loss/reg": 0.0005240346654318273, "step": 5039 }, { "epoch": 0.63, "grad_norm": 2.353886127471924, "grad_norm_var": 3.5841770064431926, "learning_rate": 0.0001, "loss": 1.243, "loss/crossentropy": 2.4403977394104004, "loss/hidden": 1.046875, "loss/logits": 0.19091176986694336, "loss/reg": 0.0005238197627477348, "step": 5040 }, { "epoch": 0.630125, "grad_norm": 2.626847743988037, "grad_norm_var": 3.6036430422389265, "learning_rate": 0.0001, "loss": 1.2838, "loss/crossentropy": 2.498660087585449, "loss/hidden": 1.09375, "loss/logits": 0.18478964269161224, "loss/reg": 0.0005236261058598757, "step": 5041 }, { "epoch": 0.63025, "grad_norm": 2.589484691619873, "grad_norm_var": 3.613758181118415, "learning_rate": 0.0001, "loss": 1.2528, "loss/crossentropy": 2.6799468994140625, "loss/hidden": 1.0703125, "loss/logits": 0.17721867561340332, "loss/reg": 0.0005234424024820328, "step": 5042 }, { "epoch": 0.630375, "grad_norm": 22.624225616455078, "grad_norm_var": 27.262417302453493, "learning_rate": 0.0001, "loss": 1.2538, "loss/crossentropy": 2.3099939823150635, "loss/hidden": 1.0859375, "loss/logits": 0.16266754269599915, "loss/reg": 0.0005232709809206426, "step": 5043 }, { "epoch": 0.6305, "grad_norm": 5.992685794830322, "grad_norm_var": 27.074190217990115, "learning_rate": 0.0001, "loss": 1.433, "loss/crossentropy": 2.2757129669189453, "loss/hidden": 1.21875, "loss/logits": 0.20899216830730438, "loss/reg": 0.0005229943781159818, "step": 5044 }, { "epoch": 0.630625, "grad_norm": 2.728982448577881, "grad_norm_var": 27.165827015113063, "learning_rate": 0.0001, "loss": 1.3726, "loss/crossentropy": 2.4777474403381348, "loss/hidden": 1.15625, "loss/logits": 0.21113763749599457, "loss/reg": 0.0005228310474194586, "step": 5045 }, { "epoch": 0.63075, "grad_norm": 2.8746185302734375, "grad_norm_var": 27.180738084621794, "learning_rate": 0.0001, "loss": 1.3475, "loss/crossentropy": 2.645772933959961, "loss/hidden": 1.1328125, "loss/logits": 0.20947246253490448, "loss/reg": 0.0005226575303822756, "step": 5046 }, { "epoch": 0.630875, "grad_norm": 2.9318487644195557, "grad_norm_var": 27.159972000021696, "learning_rate": 0.0001, "loss": 1.315, "loss/crossentropy": 2.447939872741699, "loss/hidden": 1.125, "loss/logits": 0.1847275197505951, "loss/reg": 0.0005224990891292691, "step": 5047 }, { "epoch": 0.631, "grad_norm": 2.291654586791992, "grad_norm_var": 27.26457355824977, "learning_rate": 0.0001, "loss": 1.13, "loss/crossentropy": 2.5929484367370605, "loss/hidden": 0.96484375, "loss/logits": 0.159932941198349, "loss/reg": 0.0005223361076787114, "step": 5048 }, { "epoch": 0.631125, "grad_norm": 2.5116751194000244, "grad_norm_var": 27.17394342064155, "learning_rate": 0.0001, "loss": 1.3602, "loss/crossentropy": 2.6929996013641357, "loss/hidden": 1.1484375, "loss/logits": 0.20652911067008972, "loss/reg": 0.0005221616593189538, "step": 5049 }, { "epoch": 0.63125, "grad_norm": 3.280377149581909, "grad_norm_var": 27.000003952557357, "learning_rate": 0.0001, "loss": 1.3428, "loss/crossentropy": 2.622410297393799, "loss/hidden": 1.140625, "loss/logits": 0.19695377349853516, "loss/reg": 0.0005220046732574701, "step": 5050 }, { "epoch": 0.631375, "grad_norm": 2.4856362342834473, "grad_norm_var": 27.08770136298849, "learning_rate": 0.0001, "loss": 1.2621, "loss/crossentropy": 2.615776777267456, "loss/hidden": 1.0703125, "loss/logits": 0.1865202784538269, "loss/reg": 0.0005217258003540337, "step": 5051 }, { "epoch": 0.6315, "grad_norm": 6.010196685791016, "grad_norm_var": 25.08449760855853, "learning_rate": 0.0001, "loss": 1.4127, "loss/crossentropy": 2.583848714828491, "loss/hidden": 1.234375, "loss/logits": 0.1731245070695877, "loss/reg": 0.0005214445409364998, "step": 5052 }, { "epoch": 0.631625, "grad_norm": 2.708799362182617, "grad_norm_var": 24.991552262124003, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 1.9002983570098877, "loss/hidden": 1.2890625, "loss/logits": 0.2412625253200531, "loss/reg": 0.0005211685202084482, "step": 5053 }, { "epoch": 0.63175, "grad_norm": 2.5043365955352783, "grad_norm_var": 25.042113690723628, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.6799755096435547, "loss/hidden": 1.125, "loss/logits": 0.19419299066066742, "loss/reg": 0.0005209168884903193, "step": 5054 }, { "epoch": 0.631875, "grad_norm": 2.316530227661133, "grad_norm_var": 25.247001897408616, "learning_rate": 0.0001, "loss": 1.2848, "loss/crossentropy": 2.575704336166382, "loss/hidden": 1.1015625, "loss/logits": 0.17802339792251587, "loss/reg": 0.0005206351634114981, "step": 5055 }, { "epoch": 0.632, "grad_norm": 2.651268482208252, "grad_norm_var": 25.175285069665513, "learning_rate": 0.0001, "loss": 1.1911, "loss/crossentropy": 2.612311840057373, "loss/hidden": 1.015625, "loss/logits": 0.17023947834968567, "loss/reg": 0.0005203831824474037, "step": 5056 }, { "epoch": 0.632125, "grad_norm": 3.619611978530884, "grad_norm_var": 25.01268788999951, "learning_rate": 0.0001, "loss": 1.685, "loss/crossentropy": 2.412377119064331, "loss/hidden": 1.4609375, "loss/logits": 0.21886754035949707, "loss/reg": 0.0005201092571951449, "step": 5057 }, { "epoch": 0.63225, "grad_norm": 3.0377862453460693, "grad_norm_var": 24.918066690887947, "learning_rate": 0.0001, "loss": 1.2403, "loss/crossentropy": 2.474714756011963, "loss/hidden": 1.0703125, "loss/logits": 0.16479182243347168, "loss/reg": 0.0005198326543904841, "step": 5058 }, { "epoch": 0.632375, "grad_norm": 2.77370285987854, "grad_norm_var": 1.3392095912908148, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.513166904449463, "loss/hidden": 1.0625, "loss/logits": 0.16732698678970337, "loss/reg": 0.0005195558187551796, "step": 5059 }, { "epoch": 0.6325, "grad_norm": 2.471597194671631, "grad_norm_var": 0.7888899348528163, "learning_rate": 0.0001, "loss": 1.2557, "loss/crossentropy": 2.396678924560547, "loss/hidden": 1.078125, "loss/logits": 0.17234951257705688, "loss/reg": 0.0005192662356421351, "step": 5060 }, { "epoch": 0.632625, "grad_norm": 2.996093988418579, "grad_norm_var": 0.7854807752691073, "learning_rate": 0.0001, "loss": 1.2618, "loss/crossentropy": 2.5212090015411377, "loss/hidden": 1.0703125, "loss/logits": 0.18628990650177002, "loss/reg": 0.0005189375369809568, "step": 5061 }, { "epoch": 0.63275, "grad_norm": 3.7189955711364746, "grad_norm_var": 0.8196850135461886, "learning_rate": 0.0001, "loss": 1.4176, "loss/crossentropy": 3.022876739501953, "loss/hidden": 1.2109375, "loss/logits": 0.20151013135910034, "loss/reg": 0.0005186534253880382, "step": 5062 }, { "epoch": 0.632875, "grad_norm": 2.974019765853882, "grad_norm_var": 0.8193039815728668, "learning_rate": 0.0001, "loss": 1.4856, "loss/crossentropy": 2.6187729835510254, "loss/hidden": 1.2265625, "loss/logits": 0.2538515627384186, "loss/reg": 0.0005183318280614913, "step": 5063 }, { "epoch": 0.633, "grad_norm": 2.536607503890991, "grad_norm_var": 0.7992001613010338, "learning_rate": 0.0001, "loss": 1.139, "loss/crossentropy": 2.3780019283294678, "loss/hidden": 0.984375, "loss/logits": 0.14942169189453125, "loss/reg": 0.0005179807194508612, "step": 5064 }, { "epoch": 0.633125, "grad_norm": 2.4373528957366943, "grad_norm_var": 0.8047544157470305, "learning_rate": 0.0001, "loss": 1.3362, "loss/crossentropy": 2.4543612003326416, "loss/hidden": 1.1484375, "loss/logits": 0.1825745701789856, "loss/reg": 0.0005176045233383775, "step": 5065 }, { "epoch": 0.63325, "grad_norm": 3.0422794818878174, "grad_norm_var": 0.800434155104843, "learning_rate": 0.0001, "loss": 1.1684, "loss/crossentropy": 2.7168428897857666, "loss/hidden": 1.0078125, "loss/logits": 0.15545246005058289, "loss/reg": 0.000517322332598269, "step": 5066 }, { "epoch": 0.633375, "grad_norm": 2.675471544265747, "grad_norm_var": 0.7892166751103598, "learning_rate": 0.0001, "loss": 1.2263, "loss/crossentropy": 2.7302112579345703, "loss/hidden": 1.0546875, "loss/logits": 0.16647346317768097, "loss/reg": 0.0005170151707716286, "step": 5067 }, { "epoch": 0.6335, "grad_norm": 23.712797164916992, "grad_norm_var": 27.410682345973346, "learning_rate": 0.0001, "loss": 1.3775, "loss/crossentropy": 2.345010280609131, "loss/hidden": 1.1953125, "loss/logits": 0.17697905004024506, "loss/reg": 0.0005167056806385517, "step": 5068 }, { "epoch": 0.633625, "grad_norm": 3.142698287963867, "grad_norm_var": 27.339876480621285, "learning_rate": 0.0001, "loss": 1.3667, "loss/crossentropy": 2.4437897205352783, "loss/hidden": 1.171875, "loss/logits": 0.18967381119728088, "loss/reg": 0.0005164352478459477, "step": 5069 }, { "epoch": 0.63375, "grad_norm": 2.580296754837036, "grad_norm_var": 27.323436130223115, "learning_rate": 0.0001, "loss": 1.3015, "loss/crossentropy": 2.615811347961426, "loss/hidden": 1.109375, "loss/logits": 0.18691349029541016, "loss/reg": 0.0005161683075129986, "step": 5070 }, { "epoch": 0.633875, "grad_norm": 2.5371739864349365, "grad_norm_var": 27.27201179605261, "learning_rate": 0.0001, "loss": 1.2518, "loss/crossentropy": 2.6959424018859863, "loss/hidden": 1.0625, "loss/logits": 0.1841207891702652, "loss/reg": 0.000515866675414145, "step": 5071 }, { "epoch": 0.634, "grad_norm": 4.52121639251709, "grad_norm_var": 27.10897027943701, "learning_rate": 0.0001, "loss": 1.4312, "loss/crossentropy": 2.4034411907196045, "loss/hidden": 1.2109375, "loss/logits": 0.21514348685741425, "loss/reg": 0.0005155931576155126, "step": 5072 }, { "epoch": 0.634125, "grad_norm": 2.294417381286621, "grad_norm_var": 27.338702364336232, "learning_rate": 0.0001, "loss": 1.1688, "loss/crossentropy": 2.7020933628082275, "loss/hidden": 0.98828125, "loss/logits": 0.17539094388484955, "loss/reg": 0.0005153018282726407, "step": 5073 }, { "epoch": 0.63425, "grad_norm": 2.6290814876556396, "grad_norm_var": 27.41333598365214, "learning_rate": 0.0001, "loss": 1.2985, "loss/crossentropy": 2.795491933822632, "loss/hidden": 1.09375, "loss/logits": 0.19959227740764618, "loss/reg": 0.000514959916472435, "step": 5074 }, { "epoch": 0.634375, "grad_norm": 4.604924201965332, "grad_norm_var": 27.27705654573621, "learning_rate": 0.0001, "loss": 1.4891, "loss/crossentropy": 2.5977494716644287, "loss/hidden": 1.265625, "loss/logits": 0.21829542517662048, "loss/reg": 0.0005146401235833764, "step": 5075 }, { "epoch": 0.6345, "grad_norm": 2.517012596130371, "grad_norm_var": 27.266085375771105, "learning_rate": 0.0001, "loss": 1.3941, "loss/crossentropy": 2.453739643096924, "loss/hidden": 1.171875, "loss/logits": 0.2170887291431427, "loss/reg": 0.0005143771995790303, "step": 5076 }, { "epoch": 0.634625, "grad_norm": 2.40828013420105, "grad_norm_var": 27.39046452961297, "learning_rate": 0.0001, "loss": 1.1425, "loss/crossentropy": 2.647333860397339, "loss/hidden": 0.96484375, "loss/logits": 0.172492116689682, "loss/reg": 0.0005141178262419999, "step": 5077 }, { "epoch": 0.63475, "grad_norm": 2.9582407474517822, "grad_norm_var": 27.482606882459216, "learning_rate": 0.0001, "loss": 1.5406, "loss/crossentropy": 2.667685031890869, "loss/hidden": 1.234375, "loss/logits": 0.30104827880859375, "loss/reg": 0.0005138475680723786, "step": 5078 }, { "epoch": 0.634875, "grad_norm": 4.939992427825928, "grad_norm_var": 27.396714206786704, "learning_rate": 0.0001, "loss": 1.3469, "loss/crossentropy": 2.5789103507995605, "loss/hidden": 1.140625, "loss/logits": 0.20114775002002716, "loss/reg": 0.0005135745159350336, "step": 5079 }, { "epoch": 0.635, "grad_norm": 2.3017282485961914, "grad_norm_var": 27.456830998924108, "learning_rate": 0.0001, "loss": 1.2564, "loss/crossentropy": 2.4447121620178223, "loss/hidden": 1.078125, "loss/logits": 0.17311710119247437, "loss/reg": 0.0005133126978762448, "step": 5080 }, { "epoch": 0.635125, "grad_norm": 3.612041473388672, "grad_norm_var": 27.24641340904396, "learning_rate": 0.0001, "loss": 1.6472, "loss/crossentropy": 2.7537331581115723, "loss/hidden": 1.3828125, "loss/logits": 0.25922805070877075, "loss/reg": 0.0005130463978275657, "step": 5081 }, { "epoch": 0.63525, "grad_norm": 2.029895782470703, "grad_norm_var": 27.494397288460323, "learning_rate": 0.0001, "loss": 1.1035, "loss/crossentropy": 2.4797043800354004, "loss/hidden": 0.953125, "loss/logits": 0.1452726125717163, "loss/reg": 0.0005127845797687769, "step": 5082 }, { "epoch": 0.635375, "grad_norm": 2.5657920837402344, "grad_norm_var": 27.51951417726907, "learning_rate": 0.0001, "loss": 1.1066, "loss/crossentropy": 2.6382389068603516, "loss/hidden": 0.95703125, "loss/logits": 0.14444413781166077, "loss/reg": 0.0005125009338371456, "step": 5083 }, { "epoch": 0.6355, "grad_norm": 2.691979169845581, "grad_norm_var": 0.8242961908715529, "learning_rate": 0.0001, "loss": 1.2313, "loss/crossentropy": 2.2464771270751953, "loss/hidden": 1.0625, "loss/logits": 0.16370633244514465, "loss/reg": 0.0005121990689076483, "step": 5084 }, { "epoch": 0.635625, "grad_norm": 3.210042953491211, "grad_norm_var": 0.8256731010703068, "learning_rate": 0.0001, "loss": 1.4558, "loss/crossentropy": 3.022054672241211, "loss/hidden": 1.234375, "loss/logits": 0.21628451347351074, "loss/reg": 0.0005119331181049347, "step": 5085 }, { "epoch": 0.63575, "grad_norm": 3.158437490463257, "grad_norm_var": 0.8122731852614067, "learning_rate": 0.0001, "loss": 1.711, "loss/crossentropy": 2.233520746231079, "loss/hidden": 1.375, "loss/logits": 0.33092251420021057, "loss/reg": 0.0005116112297400832, "step": 5086 }, { "epoch": 0.635875, "grad_norm": 2.4463376998901367, "grad_norm_var": 0.8191364305939731, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.504575490951538, "loss/hidden": 1.140625, "loss/logits": 0.19743302464485168, "loss/reg": 0.0005112785729579628, "step": 5087 }, { "epoch": 0.636, "grad_norm": 2.703399896621704, "grad_norm_var": 0.6704327047737858, "learning_rate": 0.0001, "loss": 1.1524, "loss/crossentropy": 2.7109973430633545, "loss/hidden": 0.98828125, "loss/logits": 0.15899059176445007, "loss/reg": 0.0005109831690788269, "step": 5088 }, { "epoch": 0.636125, "grad_norm": 4.830543518066406, "grad_norm_var": 0.8534569044259153, "learning_rate": 0.0001, "loss": 1.9454, "loss/crossentropy": 2.7993946075439453, "loss/hidden": 1.640625, "loss/logits": 0.2996511459350586, "loss/reg": 0.0005107104661874473, "step": 5089 }, { "epoch": 0.63625, "grad_norm": 2.322005271911621, "grad_norm_var": 0.8786512226279529, "learning_rate": 0.0001, "loss": 1.1914, "loss/crossentropy": 2.281465768814087, "loss/hidden": 1.03125, "loss/logits": 0.1550629734992981, "loss/reg": 0.0005104435258544981, "step": 5090 }, { "epoch": 0.636375, "grad_norm": 3.018836259841919, "grad_norm_var": 0.7136653772043767, "learning_rate": 0.0001, "loss": 1.6424, "loss/crossentropy": 2.548292636871338, "loss/hidden": 1.359375, "loss/logits": 0.277885377407074, "loss/reg": 0.000510144978761673, "step": 5091 }, { "epoch": 0.6365, "grad_norm": 2.658566951751709, "grad_norm_var": 0.706138570505137, "learning_rate": 0.0001, "loss": 1.3049, "loss/crossentropy": 2.457908868789673, "loss/hidden": 1.109375, "loss/logits": 0.19039186835289001, "loss/reg": 0.0005098793772049248, "step": 5092 }, { "epoch": 0.636625, "grad_norm": 3.4953501224517822, "grad_norm_var": 0.6955340845467153, "learning_rate": 0.0001, "loss": 1.6236, "loss/crossentropy": 2.2519772052764893, "loss/hidden": 1.390625, "loss/logits": 0.22783984243869781, "loss/reg": 0.0005095942760817707, "step": 5093 }, { "epoch": 0.63675, "grad_norm": 2.3407626152038574, "grad_norm_var": 0.7276554211530762, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.6515281200408936, "loss/hidden": 0.9765625, "loss/logits": 0.16006308794021606, "loss/reg": 0.0005093307117931545, "step": 5094 }, { "epoch": 0.636875, "grad_norm": 2.4877736568450928, "grad_norm_var": 0.4758426728056217, "learning_rate": 0.0001, "loss": 1.2009, "loss/crossentropy": 2.5335075855255127, "loss/hidden": 1.046875, "loss/logits": 0.14897051453590393, "loss/reg": 0.0005090605118311942, "step": 5095 }, { "epoch": 0.637, "grad_norm": 3.8838279247283936, "grad_norm_var": 0.5130207805498569, "learning_rate": 0.0001, "loss": 1.3139, "loss/crossentropy": 3.100192070007324, "loss/hidden": 1.0859375, "loss/logits": 0.2228861004114151, "loss/reg": 0.0005087615572847426, "step": 5096 }, { "epoch": 0.637125, "grad_norm": 2.6293835639953613, "grad_norm_var": 0.488723446924363, "learning_rate": 0.0001, "loss": 1.3716, "loss/crossentropy": 2.453946828842163, "loss/hidden": 1.15625, "loss/logits": 0.21022838354110718, "loss/reg": 0.0005084642907604575, "step": 5097 }, { "epoch": 0.63725, "grad_norm": 10.651998519897461, "grad_norm_var": 4.129490255991262, "learning_rate": 0.0001, "loss": 1.7473, "loss/crossentropy": 2.744757652282715, "loss/hidden": 1.4921875, "loss/logits": 0.2500014007091522, "loss/reg": 0.0005081415292806923, "step": 5098 }, { "epoch": 0.637375, "grad_norm": 3.177091121673584, "grad_norm_var": 4.081311631244849, "learning_rate": 0.0001, "loss": 1.7642, "loss/crossentropy": 2.2663798332214355, "loss/hidden": 1.46875, "loss/logits": 0.2903444170951843, "loss/reg": 0.0005078361718915403, "step": 5099 }, { "epoch": 0.6375, "grad_norm": 2.3659377098083496, "grad_norm_var": 4.122284123016568, "learning_rate": 0.0001, "loss": 1.2056, "loss/crossentropy": 2.6140024662017822, "loss/hidden": 1.03125, "loss/logits": 0.16930124163627625, "loss/reg": 0.0005075650988146663, "step": 5100 }, { "epoch": 0.637625, "grad_norm": 3.3458151817321777, "grad_norm_var": 4.118888327992189, "learning_rate": 0.0001, "loss": 1.3209, "loss/crossentropy": 2.3657383918762207, "loss/hidden": 1.140625, "loss/logits": 0.17517036199569702, "loss/reg": 0.0005072879139333963, "step": 5101 }, { "epoch": 0.63775, "grad_norm": 3.6010186672210693, "grad_norm_var": 4.112759652573845, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.5407118797302246, "loss/hidden": 1.1640625, "loss/logits": 0.19986924529075623, "loss/reg": 0.0005070214974693954, "step": 5102 }, { "epoch": 0.637875, "grad_norm": 3.755084276199341, "grad_norm_var": 4.036398148891836, "learning_rate": 0.0001, "loss": 1.809, "loss/crossentropy": 2.286653995513916, "loss/hidden": 1.4609375, "loss/logits": 0.3429538607597351, "loss/reg": 0.0005067247548140585, "step": 5103 }, { "epoch": 0.638, "grad_norm": 2.20984148979187, "grad_norm_var": 4.109258412958022, "learning_rate": 0.0001, "loss": 1.2237, "loss/crossentropy": 2.652029514312744, "loss/hidden": 1.03125, "loss/logits": 0.18738701939582825, "loss/reg": 0.0005064418655820191, "step": 5104 }, { "epoch": 0.638125, "grad_norm": 2.533512592315674, "grad_norm_var": 4.046336467020783, "learning_rate": 0.0001, "loss": 1.2365, "loss/crossentropy": 2.5735015869140625, "loss/hidden": 1.0546875, "loss/logits": 0.17670659720897675, "loss/reg": 0.0005061431438662112, "step": 5105 }, { "epoch": 0.63825, "grad_norm": 2.710113286972046, "grad_norm_var": 3.99971851229822, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.4236743450164795, "loss/hidden": 1.140625, "loss/logits": 0.2094956934452057, "loss/reg": 0.0005058220122009516, "step": 5106 }, { "epoch": 0.638375, "grad_norm": 2.179696559906006, "grad_norm_var": 4.089625908458348, "learning_rate": 0.0001, "loss": 1.1526, "loss/crossentropy": 2.3775041103363037, "loss/hidden": 0.984375, "loss/logits": 0.16317762434482574, "loss/reg": 0.000505502859596163, "step": 5107 }, { "epoch": 0.6385, "grad_norm": 2.1832938194274902, "grad_norm_var": 4.149245958411658, "learning_rate": 0.0001, "loss": 1.2462, "loss/crossentropy": 2.4062111377716064, "loss/hidden": 1.0625, "loss/logits": 0.17866000533103943, "loss/reg": 0.0005052354536019266, "step": 5108 }, { "epoch": 0.638625, "grad_norm": 2.8180792331695557, "grad_norm_var": 4.164509560624048, "learning_rate": 0.0001, "loss": 1.5511, "loss/crossentropy": 2.3625166416168213, "loss/hidden": 1.3203125, "loss/logits": 0.2257830798625946, "loss/reg": 0.0005049873725511134, "step": 5109 }, { "epoch": 0.63875, "grad_norm": 3.0819716453552246, "grad_norm_var": 4.103594774887385, "learning_rate": 0.0001, "loss": 1.3369, "loss/crossentropy": 2.8977394104003906, "loss/hidden": 1.1484375, "loss/logits": 0.18345627188682556, "loss/reg": 0.0005047039594501257, "step": 5110 }, { "epoch": 0.638875, "grad_norm": 2.558230400085449, "grad_norm_var": 4.0957966018986545, "learning_rate": 0.0001, "loss": 1.2161, "loss/crossentropy": 2.389404773712158, "loss/hidden": 1.0390625, "loss/logits": 0.17200583219528198, "loss/reg": 0.000504427938722074, "step": 5111 }, { "epoch": 0.639, "grad_norm": 2.2083373069763184, "grad_norm_var": 4.153179765605644, "learning_rate": 0.0001, "loss": 1.2627, "loss/crossentropy": 2.4298479557037354, "loss/hidden": 1.0546875, "loss/logits": 0.20301291346549988, "loss/reg": 0.0005041343392804265, "step": 5112 }, { "epoch": 0.639125, "grad_norm": 2.5616302490234375, "grad_norm_var": 4.1590784924124895, "learning_rate": 0.0001, "loss": 1.2077, "loss/crossentropy": 2.708958387374878, "loss/hidden": 1.03125, "loss/logits": 0.1714297980070114, "loss/reg": 0.0005038422532379627, "step": 5113 }, { "epoch": 0.63925, "grad_norm": 2.3116414546966553, "grad_norm_var": 0.27124563177069283, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.6458988189697266, "loss/hidden": 1.140625, "loss/logits": 0.21635600924491882, "loss/reg": 0.0005035361973568797, "step": 5114 }, { "epoch": 0.639375, "grad_norm": 2.8223469257354736, "grad_norm_var": 0.2577311151264228, "learning_rate": 0.0001, "loss": 1.294, "loss/crossentropy": 3.228630542755127, "loss/hidden": 1.09375, "loss/logits": 0.19520412385463715, "loss/reg": 0.0005032765329815447, "step": 5115 }, { "epoch": 0.6395, "grad_norm": 1.9360452890396118, "grad_norm_var": 0.2885964616436434, "learning_rate": 0.0001, "loss": 1.1293, "loss/crossentropy": 2.335219144821167, "loss/hidden": 0.96875, "loss/logits": 0.15548814833164215, "loss/reg": 0.0005030336324125528, "step": 5116 }, { "epoch": 0.639625, "grad_norm": 2.9464335441589355, "grad_norm_var": 0.2628995073998704, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.3817079067230225, "loss/hidden": 1.1875, "loss/logits": 0.21741178631782532, "loss/reg": 0.0005027567967772484, "step": 5117 }, { "epoch": 0.63975, "grad_norm": 2.817227363586426, "grad_norm_var": 0.20202121440513418, "learning_rate": 0.0001, "loss": 1.1574, "loss/crossentropy": 2.738774538040161, "loss/hidden": 0.9921875, "loss/logits": 0.160221666097641, "loss/reg": 0.0005024697165936232, "step": 5118 }, { "epoch": 0.639875, "grad_norm": 3.8553414344787598, "grad_norm_var": 0.21806218567493066, "learning_rate": 0.0001, "loss": 1.8026, "loss/crossentropy": 2.6818783283233643, "loss/hidden": 1.5078125, "loss/logits": 0.2897658348083496, "loss/reg": 0.0005021601682528853, "step": 5119 }, { "epoch": 0.64, "grad_norm": 4.015835285186768, "grad_norm_var": 0.3259503693648615, "learning_rate": 0.0001, "loss": 1.2456, "loss/crossentropy": 3.043262481689453, "loss/hidden": 1.078125, "loss/logits": 0.1624840497970581, "loss/reg": 0.000501849630381912, "step": 5120 }, { "epoch": 0.640125, "grad_norm": 7.182591438293457, "grad_norm_var": 1.560457329726601, "learning_rate": 0.0001, "loss": 2.1135, "loss/crossentropy": 2.725367784500122, "loss/hidden": 1.7265625, "loss/logits": 0.38195642828941345, "loss/reg": 0.0005015315255150199, "step": 5121 }, { "epoch": 0.64025, "grad_norm": 3.6878437995910645, "grad_norm_var": 1.5808754080160885, "learning_rate": 0.0001, "loss": 1.8076, "loss/crossentropy": 3.227931261062622, "loss/hidden": 1.4453125, "loss/logits": 0.3572898507118225, "loss/reg": 0.0005012640613131225, "step": 5122 }, { "epoch": 0.640375, "grad_norm": 4.723109722137451, "grad_norm_var": 1.6822770078247107, "learning_rate": 0.0001, "loss": 1.219, "loss/crossentropy": 2.5556604862213135, "loss/hidden": 1.0703125, "loss/logits": 0.14366495609283447, "loss/reg": 0.0005009453161619604, "step": 5123 }, { "epoch": 0.6405, "grad_norm": 3.1094002723693848, "grad_norm_var": 1.6064021911997959, "learning_rate": 0.0001, "loss": 1.3904, "loss/crossentropy": 2.386810302734375, "loss/hidden": 1.171875, "loss/logits": 0.2135646641254425, "loss/reg": 0.0005006203427910805, "step": 5124 }, { "epoch": 0.640625, "grad_norm": 2.9842448234558105, "grad_norm_var": 1.5976777282162715, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 2.588193655014038, "loss/hidden": 1.140625, "loss/logits": 0.19749455153942108, "loss/reg": 0.0005003067199140787, "step": 5125 }, { "epoch": 0.64075, "grad_norm": 2.6486287117004395, "grad_norm_var": 1.6220198886619888, "learning_rate": 0.0001, "loss": 1.7069, "loss/crossentropy": 2.064786434173584, "loss/hidden": 1.40625, "loss/logits": 0.29564523696899414, "loss/reg": 0.000500036810990423, "step": 5126 }, { "epoch": 0.640875, "grad_norm": 2.274152994155884, "grad_norm_var": 1.6541390583391438, "learning_rate": 0.0001, "loss": 1.3552, "loss/crossentropy": 2.596956491470337, "loss/hidden": 1.140625, "loss/logits": 0.2096271514892578, "loss/reg": 0.0004997726646251976, "step": 5127 }, { "epoch": 0.641, "grad_norm": 2.5470659732818604, "grad_norm_var": 1.6140252608224006, "learning_rate": 0.0001, "loss": 1.2689, "loss/crossentropy": 2.663341522216797, "loss/hidden": 1.0703125, "loss/logits": 0.1936332732439041, "loss/reg": 0.0004995138151571155, "step": 5128 }, { "epoch": 0.641125, "grad_norm": 2.886382818222046, "grad_norm_var": 1.5896639170004865, "learning_rate": 0.0001, "loss": 1.2848, "loss/crossentropy": 2.8203670978546143, "loss/hidden": 1.0859375, "loss/logits": 0.1939113438129425, "loss/reg": 0.0004992350586690009, "step": 5129 }, { "epoch": 0.64125, "grad_norm": 2.3910932540893555, "grad_norm_var": 1.5796224410075042, "learning_rate": 0.0001, "loss": 1.2017, "loss/crossentropy": 2.8257460594177246, "loss/hidden": 1.03125, "loss/logits": 0.16541257500648499, "loss/reg": 0.0004989809822291136, "step": 5130 }, { "epoch": 0.641375, "grad_norm": 2.159123420715332, "grad_norm_var": 1.6495061310916195, "learning_rate": 0.0001, "loss": 1.215, "loss/crossentropy": 2.619267702102661, "loss/hidden": 1.0390625, "loss/logits": 0.17090250551700592, "loss/reg": 0.0004987437860108912, "step": 5131 }, { "epoch": 0.6415, "grad_norm": 3.6242575645446777, "grad_norm_var": 1.5295557866439422, "learning_rate": 0.0001, "loss": 1.7226, "loss/crossentropy": 2.3628530502319336, "loss/hidden": 1.4140625, "loss/logits": 0.303516685962677, "loss/reg": 0.0004984820843674242, "step": 5132 }, { "epoch": 0.641625, "grad_norm": 2.8428053855895996, "grad_norm_var": 1.536021326131627, "learning_rate": 0.0001, "loss": 1.2058, "loss/crossentropy": 2.8129045963287354, "loss/hidden": 1.0078125, "loss/logits": 0.1929636299610138, "loss/reg": 0.0004982483224011958, "step": 5133 }, { "epoch": 0.64175, "grad_norm": 2.3136231899261475, "grad_norm_var": 1.5882723480422016, "learning_rate": 0.0001, "loss": 1.2229, "loss/crossentropy": 2.5950982570648193, "loss/hidden": 1.046875, "loss/logits": 0.17100678384304047, "loss/reg": 0.0004980195662938058, "step": 5134 }, { "epoch": 0.641875, "grad_norm": 3.364037036895752, "grad_norm_var": 1.5688036748142915, "learning_rate": 0.0001, "loss": 1.3256, "loss/crossentropy": 2.812877893447876, "loss/hidden": 1.125, "loss/logits": 0.1956014186143875, "loss/reg": 0.0004978045471943915, "step": 5135 }, { "epoch": 0.642, "grad_norm": 2.8039660453796387, "grad_norm_var": 1.5444637903775922, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.4708750247955322, "loss/hidden": 1.1484375, "loss/logits": 0.20758356153964996, "loss/reg": 0.0004975510528311133, "step": 5136 }, { "epoch": 0.642125, "grad_norm": 2.7393760681152344, "grad_norm_var": 0.4316226859689541, "learning_rate": 0.0001, "loss": 1.4169, "loss/crossentropy": 2.345625162124634, "loss/hidden": 1.2109375, "loss/logits": 0.20095863938331604, "loss/reg": 0.0004972858005203307, "step": 5137 }, { "epoch": 0.64225, "grad_norm": 2.308830738067627, "grad_norm_var": 0.41365194220795637, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.796586036682129, "loss/hidden": 1.015625, "loss/logits": 0.1760481595993042, "loss/reg": 0.0004970442969352007, "step": 5138 }, { "epoch": 0.642375, "grad_norm": 2.2503998279571533, "grad_norm_var": 0.18071580162273312, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.16933012008667, "loss/hidden": 1.078125, "loss/logits": 0.19284602999687195, "loss/reg": 0.0004968110588379204, "step": 5139 }, { "epoch": 0.6425, "grad_norm": 2.7979438304901123, "grad_norm_var": 0.16990023550905847, "learning_rate": 0.0001, "loss": 1.357, "loss/crossentropy": 2.635906457901001, "loss/hidden": 1.140625, "loss/logits": 0.21144485473632812, "loss/reg": 0.000496627006214112, "step": 5140 }, { "epoch": 0.642625, "grad_norm": 3.0625033378601074, "grad_norm_var": 0.17342116716797495, "learning_rate": 0.0001, "loss": 1.2606, "loss/crossentropy": 2.3482744693756104, "loss/hidden": 1.0859375, "loss/logits": 0.16973236203193665, "loss/reg": 0.0004963643732480705, "step": 5141 }, { "epoch": 0.64275, "grad_norm": 2.6400468349456787, "grad_norm_var": 0.17347126350776018, "learning_rate": 0.0001, "loss": 1.2344, "loss/crossentropy": 2.4530110359191895, "loss/hidden": 1.0390625, "loss/logits": 0.19038966298103333, "loss/reg": 0.0004961794475093484, "step": 5142 }, { "epoch": 0.642875, "grad_norm": 2.3625407218933105, "grad_norm_var": 0.16908410000450355, "learning_rate": 0.0001, "loss": 1.2006, "loss/crossentropy": 2.52653431892395, "loss/hidden": 1.0234375, "loss/logits": 0.1721627116203308, "loss/reg": 0.000495915359351784, "step": 5143 }, { "epoch": 0.643, "grad_norm": 2.5446977615356445, "grad_norm_var": 0.16913064922060977, "learning_rate": 0.0001, "loss": 1.4796, "loss/crossentropy": 2.5442330837249756, "loss/hidden": 1.21875, "loss/logits": 0.25586214661598206, "loss/reg": 0.0004956519114784896, "step": 5144 }, { "epoch": 0.643125, "grad_norm": 2.88698410987854, "grad_norm_var": 0.16914615756937199, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.5848820209503174, "loss/hidden": 1.1796875, "loss/logits": 0.24177680909633636, "loss/reg": 0.0004953862517140806, "step": 5145 }, { "epoch": 0.64325, "grad_norm": 2.9080569744110107, "grad_norm_var": 0.16502117842256137, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.3834664821624756, "loss/hidden": 1.125, "loss/logits": 0.16732367873191833, "loss/reg": 0.0004951059236191213, "step": 5146 }, { "epoch": 0.643375, "grad_norm": 2.307844638824463, "grad_norm_var": 0.1551711131480251, "learning_rate": 0.0001, "loss": 1.2491, "loss/crossentropy": 2.472900867462158, "loss/hidden": 1.0625, "loss/logits": 0.18161331117153168, "loss/reg": 0.0004948489367961884, "step": 5147 }, { "epoch": 0.6435, "grad_norm": 2.142005205154419, "grad_norm_var": 0.11671513461741347, "learning_rate": 0.0001, "loss": 1.1806, "loss/crossentropy": 2.572748899459839, "loss/hidden": 1.0, "loss/logits": 0.17562462389469147, "loss/reg": 0.0004945840919390321, "step": 5148 }, { "epoch": 0.643625, "grad_norm": 3.851710557937622, "grad_norm_var": 0.20731493053725292, "learning_rate": 0.0001, "loss": 1.8246, "loss/crossentropy": 2.8883450031280518, "loss/hidden": 1.4921875, "loss/logits": 0.3274795711040497, "loss/reg": 0.0004942975356243551, "step": 5149 }, { "epoch": 0.64375, "grad_norm": 3.5250766277313232, "grad_norm_var": 0.23577706941659546, "learning_rate": 0.0001, "loss": 1.672, "loss/crossentropy": 2.401146650314331, "loss/hidden": 1.4296875, "loss/logits": 0.23737545311450958, "loss/reg": 0.0004940395592711866, "step": 5150 }, { "epoch": 0.643875, "grad_norm": 4.433433532714844, "grad_norm_var": 0.3903854776627583, "learning_rate": 0.0001, "loss": 1.5661, "loss/crossentropy": 2.4976308345794678, "loss/hidden": 1.3203125, "loss/logits": 0.24082845449447632, "loss/reg": 0.0004937876365147531, "step": 5151 }, { "epoch": 0.644, "grad_norm": 2.5987093448638916, "grad_norm_var": 0.39421930565954594, "learning_rate": 0.0001, "loss": 1.2614, "loss/crossentropy": 2.604088306427002, "loss/hidden": 1.046875, "loss/logits": 0.20957592129707336, "loss/reg": 0.0004935157485306263, "step": 5152 }, { "epoch": 0.644125, "grad_norm": 3.1131179332733154, "grad_norm_var": 0.3981838377936053, "learning_rate": 0.0001, "loss": 1.3198, "loss/crossentropy": 2.707632064819336, "loss/hidden": 1.125, "loss/logits": 0.18982860445976257, "loss/reg": 0.0004932382726110518, "step": 5153 }, { "epoch": 0.64425, "grad_norm": 3.435094118118286, "grad_norm_var": 0.39493987247301615, "learning_rate": 0.0001, "loss": 1.5109, "loss/crossentropy": 2.33650279045105, "loss/hidden": 1.296875, "loss/logits": 0.2091112732887268, "loss/reg": 0.0004929821006953716, "step": 5154 }, { "epoch": 0.644375, "grad_norm": 2.4468061923980713, "grad_norm_var": 0.3795862625973728, "learning_rate": 0.0001, "loss": 1.3064, "loss/crossentropy": 2.5348849296569824, "loss/hidden": 1.125, "loss/logits": 0.17651310563087463, "loss/reg": 0.0004927053232677281, "step": 5155 }, { "epoch": 0.6445, "grad_norm": 4.098026275634766, "grad_norm_var": 0.46042049102348226, "learning_rate": 0.0001, "loss": 1.5718, "loss/crossentropy": 2.626354694366455, "loss/hidden": 1.3125, "loss/logits": 0.25438952445983887, "loss/reg": 0.0004924043896608055, "step": 5156 }, { "epoch": 0.644625, "grad_norm": 3.3599681854248047, "grad_norm_var": 0.4675457299647699, "learning_rate": 0.0001, "loss": 1.4833, "loss/crossentropy": 2.6352806091308594, "loss/hidden": 1.2109375, "loss/logits": 0.2674857974052429, "loss/reg": 0.0004921218496747315, "step": 5157 }, { "epoch": 0.64475, "grad_norm": 2.787388324737549, "grad_norm_var": 0.4610279459644552, "learning_rate": 0.0001, "loss": 1.157, "loss/crossentropy": 2.5337777137756348, "loss/hidden": 1.0078125, "loss/logits": 0.14428307116031647, "loss/reg": 0.0004918101476505399, "step": 5158 }, { "epoch": 0.644875, "grad_norm": 3.0240979194641113, "grad_norm_var": 0.42773436346157967, "learning_rate": 0.0001, "loss": 1.423, "loss/crossentropy": 2.450546979904175, "loss/hidden": 1.1953125, "loss/logits": 0.22275590896606445, "loss/reg": 0.0004914866876788437, "step": 5159 }, { "epoch": 0.645, "grad_norm": 2.2992875576019287, "grad_norm_var": 0.4493886032714196, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.5551578998565674, "loss/hidden": 1.1015625, "loss/logits": 0.1852865219116211, "loss/reg": 0.0004911599680781364, "step": 5160 }, { "epoch": 0.645125, "grad_norm": 2.8911292552948, "grad_norm_var": 0.44928515518647505, "learning_rate": 0.0001, "loss": 1.3252, "loss/crossentropy": 2.443142890930176, "loss/hidden": 1.125, "loss/logits": 0.19528764486312866, "loss/reg": 0.0004909041454084218, "step": 5161 }, { "epoch": 0.64525, "grad_norm": 5.080987930297852, "grad_norm_var": 0.6956256498491292, "learning_rate": 0.0001, "loss": 1.4013, "loss/crossentropy": 2.6162731647491455, "loss/hidden": 1.203125, "loss/logits": 0.19328327476978302, "loss/reg": 0.000490644306410104, "step": 5162 }, { "epoch": 0.645375, "grad_norm": 2.619845390319824, "grad_norm_var": 0.66408974816134, "learning_rate": 0.0001, "loss": 1.5392, "loss/crossentropy": 2.244044780731201, "loss/hidden": 1.3125, "loss/logits": 0.22176669538021088, "loss/reg": 0.0004903867957182229, "step": 5163 }, { "epoch": 0.6455, "grad_norm": 2.3556268215179443, "grad_norm_var": 0.6359051548399044, "learning_rate": 0.0001, "loss": 1.2797, "loss/crossentropy": 2.5231869220733643, "loss/hidden": 1.0859375, "loss/logits": 0.1888793408870697, "loss/reg": 0.0004901342326775193, "step": 5164 }, { "epoch": 0.645625, "grad_norm": 2.0347917079925537, "grad_norm_var": 0.695255239303346, "learning_rate": 0.0001, "loss": 1.1178, "loss/crossentropy": 2.499645233154297, "loss/hidden": 0.9609375, "loss/logits": 0.15195827186107635, "loss/reg": 0.000489904370624572, "step": 5165 }, { "epoch": 0.64575, "grad_norm": 2.213557004928589, "grad_norm_var": 0.7339293107563566, "learning_rate": 0.0001, "loss": 1.1106, "loss/crossentropy": 2.387768507003784, "loss/hidden": 0.94140625, "loss/logits": 0.16430838406085968, "loss/reg": 0.0004896690952591598, "step": 5166 }, { "epoch": 0.645875, "grad_norm": 2.2562458515167236, "grad_norm_var": 0.6284416389243982, "learning_rate": 0.0001, "loss": 1.3573, "loss/crossentropy": 2.1698272228240967, "loss/hidden": 1.140625, "loss/logits": 0.2117382287979126, "loss/reg": 0.0004894161829724908, "step": 5167 }, { "epoch": 0.646, "grad_norm": 3.632619619369507, "grad_norm_var": 0.6518682946731512, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.4103193283081055, "loss/hidden": 1.125, "loss/logits": 0.1772119402885437, "loss/reg": 0.0004891889984719455, "step": 5168 }, { "epoch": 0.646125, "grad_norm": 2.5295865535736084, "grad_norm_var": 0.6626402280365293, "learning_rate": 0.0001, "loss": 1.1998, "loss/crossentropy": 2.7897772789001465, "loss/hidden": 1.03125, "loss/logits": 0.16364499926567078, "loss/reg": 0.000488972757011652, "step": 5169 }, { "epoch": 0.64625, "grad_norm": 2.795243501663208, "grad_norm_var": 0.646123723772511, "learning_rate": 0.0001, "loss": 1.3353, "loss/crossentropy": 2.3873825073242188, "loss/hidden": 1.140625, "loss/logits": 0.18974840641021729, "loss/reg": 0.0004887152463197708, "step": 5170 }, { "epoch": 0.646375, "grad_norm": 2.125919818878174, "grad_norm_var": 0.6720164645828445, "learning_rate": 0.0001, "loss": 1.2349, "loss/crossentropy": 2.9298245906829834, "loss/hidden": 1.0546875, "loss/logits": 0.17529751360416412, "loss/reg": 0.0004884784575551748, "step": 5171 }, { "epoch": 0.6465, "grad_norm": 3.1638801097869873, "grad_norm_var": 0.575036504407861, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5804829597473145, "loss/hidden": 1.203125, "loss/logits": 0.21698275208473206, "loss/reg": 0.0004882793000433594, "step": 5172 }, { "epoch": 0.646625, "grad_norm": 3.0045597553253174, "grad_norm_var": 0.5574919088605438, "learning_rate": 0.0001, "loss": 1.1452, "loss/crossentropy": 2.7656409740448, "loss/hidden": 0.98046875, "loss/logits": 0.15980589389801025, "loss/reg": 0.00048808346036821604, "step": 5173 }, { "epoch": 0.64675, "grad_norm": 2.717473030090332, "grad_norm_var": 0.5579235884002778, "learning_rate": 0.0001, "loss": 1.2373, "loss/crossentropy": 2.5951874256134033, "loss/hidden": 1.0625, "loss/logits": 0.16988062858581543, "loss/reg": 0.00048784830141812563, "step": 5174 }, { "epoch": 0.646875, "grad_norm": 3.0602993965148926, "grad_norm_var": 0.5591038247200228, "learning_rate": 0.0001, "loss": 1.3906, "loss/crossentropy": 2.655095338821411, "loss/hidden": 1.15625, "loss/logits": 0.2294430136680603, "loss/reg": 0.000487622048240155, "step": 5175 }, { "epoch": 0.647, "grad_norm": 2.9475865364074707, "grad_norm_var": 0.5421928916060618, "learning_rate": 0.0001, "loss": 1.4659, "loss/crossentropy": 2.504615306854248, "loss/hidden": 1.21875, "loss/logits": 0.24229462444782257, "loss/reg": 0.00048740938655100763, "step": 5176 }, { "epoch": 0.647125, "grad_norm": 2.5821826457977295, "grad_norm_var": 0.5460248176007999, "learning_rate": 0.0001, "loss": 1.4498, "loss/crossentropy": 2.771841526031494, "loss/hidden": 1.21875, "loss/logits": 0.22621293365955353, "loss/reg": 0.00048715833690948784, "step": 5177 }, { "epoch": 0.64725, "grad_norm": 3.0345113277435303, "grad_norm_var": 0.19084470485188731, "learning_rate": 0.0001, "loss": 1.7173, "loss/crossentropy": 2.2017674446105957, "loss/hidden": 1.46875, "loss/logits": 0.24367572367191315, "loss/reg": 0.00048689794493839145, "step": 5178 }, { "epoch": 0.647375, "grad_norm": 3.0718424320220947, "grad_norm_var": 0.19925778223771337, "learning_rate": 0.0001, "loss": 1.4732, "loss/crossentropy": 2.5842015743255615, "loss/hidden": 1.2421875, "loss/logits": 0.2261475920677185, "loss/reg": 0.00048663775669410825, "step": 5179 }, { "epoch": 0.6475, "grad_norm": 2.5418243408203125, "grad_norm_var": 0.19236938084567304, "learning_rate": 0.0001, "loss": 1.4376, "loss/crossentropy": 2.4082486629486084, "loss/hidden": 1.203125, "loss/logits": 0.2295657992362976, "loss/reg": 0.00048643359332345426, "step": 5180 }, { "epoch": 0.647625, "grad_norm": 2.5781702995300293, "grad_norm_var": 0.16030951474846517, "learning_rate": 0.0001, "loss": 1.4054, "loss/crossentropy": 2.508256673812866, "loss/hidden": 1.1953125, "loss/logits": 0.2052079439163208, "loss/reg": 0.00048617750871926546, "step": 5181 }, { "epoch": 0.64775, "grad_norm": 2.170836925506592, "grad_norm_var": 0.16357012151405925, "learning_rate": 0.0001, "loss": 1.0479, "loss/crossentropy": 2.2779572010040283, "loss/hidden": 0.91015625, "loss/logits": 0.13290968537330627, "loss/reg": 0.000485950120491907, "step": 5182 }, { "epoch": 0.647875, "grad_norm": 3.321007490158081, "grad_norm_var": 0.162442040005021, "learning_rate": 0.0001, "loss": 1.309, "loss/crossentropy": 2.5389273166656494, "loss/hidden": 1.1171875, "loss/logits": 0.1869564652442932, "loss/reg": 0.0004857091698795557, "step": 5183 }, { "epoch": 0.648, "grad_norm": 2.7751221656799316, "grad_norm_var": 0.11661495204850686, "learning_rate": 0.0001, "loss": 1.5532, "loss/crossentropy": 2.3829362392425537, "loss/hidden": 1.28125, "loss/logits": 0.2671353816986084, "loss/reg": 0.00048548952327109873, "step": 5184 }, { "epoch": 0.648125, "grad_norm": 2.289011240005493, "grad_norm_var": 0.12814447648591265, "learning_rate": 0.0001, "loss": 1.2132, "loss/crossentropy": 2.461151123046875, "loss/hidden": 1.0234375, "loss/logits": 0.18490515649318695, "loss/reg": 0.00048523227451369166, "step": 5185 }, { "epoch": 0.64825, "grad_norm": 2.6051506996154785, "grad_norm_var": 0.1295405037745996, "learning_rate": 0.0001, "loss": 1.2714, "loss/crossentropy": 2.4736618995666504, "loss/hidden": 1.0859375, "loss/logits": 0.18063730001449585, "loss/reg": 0.0004849778488278389, "step": 5186 }, { "epoch": 0.648375, "grad_norm": 2.2339768409729004, "grad_norm_var": 0.12128833897486, "learning_rate": 0.0001, "loss": 1.2974, "loss/crossentropy": 2.537332057952881, "loss/hidden": 1.1015625, "loss/logits": 0.1910131573677063, "loss/reg": 0.00048475636867806315, "step": 5187 }, { "epoch": 0.6485, "grad_norm": 2.717109203338623, "grad_norm_var": 0.10947175215065195, "learning_rate": 0.0001, "loss": 1.2832, "loss/crossentropy": 2.3444406986236572, "loss/hidden": 1.109375, "loss/logits": 0.1690056025981903, "loss/reg": 0.00048450729809701443, "step": 5188 }, { "epoch": 0.648625, "grad_norm": 3.607417345046997, "grad_norm_var": 0.15440334965074196, "learning_rate": 0.0001, "loss": 1.4304, "loss/crossentropy": 2.565886974334717, "loss/hidden": 1.2421875, "loss/logits": 0.18338046967983246, "loss/reg": 0.0004842884954996407, "step": 5189 }, { "epoch": 0.64875, "grad_norm": 7.735137462615967, "grad_norm_var": 1.6956011333460757, "learning_rate": 0.0001, "loss": 1.3128, "loss/crossentropy": 2.523268222808838, "loss/hidden": 1.140625, "loss/logits": 0.16733668744564056, "loss/reg": 0.00048407886060886085, "step": 5190 }, { "epoch": 0.648875, "grad_norm": 2.15339732170105, "grad_norm_var": 1.7493211873020185, "learning_rate": 0.0001, "loss": 1.1313, "loss/crossentropy": 2.6655101776123047, "loss/hidden": 0.96875, "loss/logits": 0.1576632857322693, "loss/reg": 0.0004838251043111086, "step": 5191 }, { "epoch": 0.649, "grad_norm": 2.5935885906219482, "grad_norm_var": 1.7607018799101146, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.3216471672058105, "loss/hidden": 1.078125, "loss/logits": 0.1784905046224594, "loss/reg": 0.0004836208245251328, "step": 5192 }, { "epoch": 0.649125, "grad_norm": 3.7090728282928467, "grad_norm_var": 1.777194972972499, "learning_rate": 0.0001, "loss": 1.2966, "loss/crossentropy": 2.7078912258148193, "loss/hidden": 1.1171875, "loss/logits": 0.1745830774307251, "loss/reg": 0.00048337128828279674, "step": 5193 }, { "epoch": 0.64925, "grad_norm": 2.198735237121582, "grad_norm_var": 1.8249269530726657, "learning_rate": 0.0001, "loss": 1.2506, "loss/crossentropy": 2.3670780658721924, "loss/hidden": 1.0625, "loss/logits": 0.1832929104566574, "loss/reg": 0.00048312320723198354, "step": 5194 }, { "epoch": 0.649375, "grad_norm": 3.240511417388916, "grad_norm_var": 1.8278970675558932, "learning_rate": 0.0001, "loss": 1.4743, "loss/crossentropy": 1.973451018333435, "loss/hidden": 1.2890625, "loss/logits": 0.18044418096542358, "loss/reg": 0.00048286429955624044, "step": 5195 }, { "epoch": 0.6495, "grad_norm": 2.3494796752929688, "grad_norm_var": 1.8427131606683578, "learning_rate": 0.0001, "loss": 1.3251, "loss/crossentropy": 2.4329187870025635, "loss/hidden": 1.109375, "loss/logits": 0.21086204051971436, "loss/reg": 0.0004826227086596191, "step": 5196 }, { "epoch": 0.649625, "grad_norm": 2.723385810852051, "grad_norm_var": 1.8355275539567364, "learning_rate": 0.0001, "loss": 1.2981, "loss/crossentropy": 2.384138345718384, "loss/hidden": 1.1171875, "loss/logits": 0.17613208293914795, "loss/reg": 0.0004823628696613014, "step": 5197 }, { "epoch": 0.64975, "grad_norm": 4.772950649261475, "grad_norm_var": 1.9618667560352188, "learning_rate": 0.0001, "loss": 1.751, "loss/crossentropy": 2.8865087032318115, "loss/hidden": 1.421875, "loss/logits": 0.3243442177772522, "loss/reg": 0.0004821417387574911, "step": 5198 }, { "epoch": 0.649875, "grad_norm": 3.138916254043579, "grad_norm_var": 1.960735693903764, "learning_rate": 0.0001, "loss": 1.3776, "loss/crossentropy": 2.667487859725952, "loss/hidden": 1.171875, "loss/logits": 0.200912743806839, "loss/reg": 0.0004819270398002118, "step": 5199 }, { "epoch": 0.65, "grad_norm": 2.5965301990509033, "grad_norm_var": 1.9723150729131265, "learning_rate": 0.0001, "loss": 1.4138, "loss/crossentropy": 2.313232183456421, "loss/hidden": 1.1875, "loss/logits": 0.22151952981948853, "loss/reg": 0.0004817004082724452, "step": 5200 }, { "epoch": 0.650125, "grad_norm": 3.0431463718414307, "grad_norm_var": 1.9196250498810379, "learning_rate": 0.0001, "loss": 1.5137, "loss/crossentropy": 2.769928216934204, "loss/hidden": 1.25, "loss/logits": 0.25883859395980835, "loss/reg": 0.00048147368943318725, "step": 5201 }, { "epoch": 0.65025, "grad_norm": 3.4975733757019043, "grad_norm_var": 1.8969952586089938, "learning_rate": 0.0001, "loss": 1.476, "loss/crossentropy": 2.488945245742798, "loss/hidden": 1.25, "loss/logits": 0.22115829586982727, "loss/reg": 0.0004812408587895334, "step": 5202 }, { "epoch": 0.650375, "grad_norm": 2.469485282897949, "grad_norm_var": 1.8679472827105976, "learning_rate": 0.0001, "loss": 1.1281, "loss/crossentropy": 2.4963667392730713, "loss/hidden": 0.9765625, "loss/logits": 0.1467246413230896, "loss/reg": 0.00048101975698955357, "step": 5203 }, { "epoch": 0.6505, "grad_norm": 2.2283456325531006, "grad_norm_var": 1.919831232380386, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.586665153503418, "loss/hidden": 0.96484375, "loss/logits": 0.15775027871131897, "loss/reg": 0.00048080208944156766, "step": 5204 }, { "epoch": 0.650625, "grad_norm": 3.7916312217712402, "grad_norm_var": 1.9306424502809607, "learning_rate": 0.0001, "loss": 1.5286, "loss/crossentropy": 2.4080846309661865, "loss/hidden": 1.2890625, "loss/logits": 0.23469951748847961, "loss/reg": 0.0004805464413948357, "step": 5205 }, { "epoch": 0.65075, "grad_norm": 2.815204620361328, "grad_norm_var": 0.511208379835625, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.6409101486206055, "loss/hidden": 1.1171875, "loss/logits": 0.17534008622169495, "loss/reg": 0.00048032597987912595, "step": 5206 }, { "epoch": 0.650875, "grad_norm": 2.864673614501953, "grad_norm_var": 0.4665578615098828, "learning_rate": 0.0001, "loss": 1.4907, "loss/crossentropy": 2.6199798583984375, "loss/hidden": 1.2421875, "loss/logits": 0.2437501847743988, "loss/reg": 0.0004800737660843879, "step": 5207 }, { "epoch": 0.651, "grad_norm": 3.2921037673950195, "grad_norm_var": 0.4590083705656011, "learning_rate": 0.0001, "loss": 1.1812, "loss/crossentropy": 2.465456962585449, "loss/hidden": 1.0078125, "loss/logits": 0.16859382390975952, "loss/reg": 0.0004798133159056306, "step": 5208 }, { "epoch": 0.651125, "grad_norm": 2.7156262397766113, "grad_norm_var": 0.43282633507632984, "learning_rate": 0.0001, "loss": 1.2194, "loss/crossentropy": 2.328638792037964, "loss/hidden": 1.0703125, "loss/logits": 0.14426806569099426, "loss/reg": 0.0004795394779648632, "step": 5209 }, { "epoch": 0.65125, "grad_norm": 3.5621836185455322, "grad_norm_var": 0.406322344760383, "learning_rate": 0.0001, "loss": 1.4199, "loss/crossentropy": 2.735100746154785, "loss/hidden": 1.203125, "loss/logits": 0.21198531985282898, "loss/reg": 0.0004792946274392307, "step": 5210 }, { "epoch": 0.651375, "grad_norm": 2.8739173412323, "grad_norm_var": 0.4063315726792619, "learning_rate": 0.0001, "loss": 1.2631, "loss/crossentropy": 2.3374931812286377, "loss/hidden": 1.0703125, "loss/logits": 0.18796005845069885, "loss/reg": 0.00047907012049108744, "step": 5211 }, { "epoch": 0.6515, "grad_norm": 3.4273245334625244, "grad_norm_var": 0.37884974046714676, "learning_rate": 0.0001, "loss": 1.3974, "loss/crossentropy": 2.7719478607177734, "loss/hidden": 1.203125, "loss/logits": 0.1894676387310028, "loss/reg": 0.00047881013597361743, "step": 5212 }, { "epoch": 0.651625, "grad_norm": 2.4117729663848877, "grad_norm_var": 0.40111946892584077, "learning_rate": 0.0001, "loss": 1.1873, "loss/crossentropy": 2.5654048919677734, "loss/hidden": 1.0078125, "loss/logits": 0.1747175008058548, "loss/reg": 0.00047855472075752914, "step": 5213 }, { "epoch": 0.65175, "grad_norm": 4.9393486976623535, "grad_norm_var": 0.4401034949883221, "learning_rate": 0.0001, "loss": 2.119, "loss/crossentropy": 2.5878076553344727, "loss/hidden": 1.7421875, "loss/logits": 0.3720424175262451, "loss/reg": 0.0004783076874446124, "step": 5214 }, { "epoch": 0.651875, "grad_norm": 2.83488392829895, "grad_norm_var": 0.4444748872623033, "learning_rate": 0.0001, "loss": 1.2653, "loss/crossentropy": 2.3555779457092285, "loss/hidden": 1.0703125, "loss/logits": 0.19015967845916748, "loss/reg": 0.00047805311623960733, "step": 5215 }, { "epoch": 0.652, "grad_norm": 13.944315910339355, "grad_norm_var": 7.753311752804865, "learning_rate": 0.0001, "loss": 3.1658, "loss/crossentropy": 2.384009838104248, "loss/hidden": 2.21875, "loss/logits": 0.9422811269760132, "loss/reg": 0.00047781263128854334, "step": 5216 }, { "epoch": 0.652125, "grad_norm": 2.705442428588867, "grad_norm_var": 7.794269541657217, "learning_rate": 0.0001, "loss": 1.2423, "loss/crossentropy": 2.3683977127075195, "loss/hidden": 1.0625, "loss/logits": 0.17504100501537323, "loss/reg": 0.00047756588901393116, "step": 5217 }, { "epoch": 0.65225, "grad_norm": 2.745706558227539, "grad_norm_var": 7.857248790320292, "learning_rate": 0.0001, "loss": 1.3138, "loss/crossentropy": 2.638909101486206, "loss/hidden": 1.09375, "loss/logits": 0.2152615785598755, "loss/reg": 0.00047733058454468846, "step": 5218 }, { "epoch": 0.652375, "grad_norm": 3.332033157348633, "grad_norm_var": 7.759197993818597, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 2.585336208343506, "loss/hidden": 1.15625, "loss/logits": 0.19641201198101044, "loss/reg": 0.0004771050007548183, "step": 5219 }, { "epoch": 0.6525, "grad_norm": 2.023881196975708, "grad_norm_var": 7.804119626673991, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.372755527496338, "loss/hidden": 0.99609375, "loss/logits": 0.14531582593917847, "loss/reg": 0.0004768501385115087, "step": 5220 }, { "epoch": 0.652625, "grad_norm": 2.48661732673645, "grad_norm_var": 7.906362620108448, "learning_rate": 0.0001, "loss": 1.2012, "loss/crossentropy": 2.3908636569976807, "loss/hidden": 1.015625, "loss/logits": 0.18085423111915588, "loss/reg": 0.0004766063066199422, "step": 5221 }, { "epoch": 0.65275, "grad_norm": 3.5553338527679443, "grad_norm_var": 7.8546720393134475, "learning_rate": 0.0001, "loss": 1.5125, "loss/crossentropy": 2.2479894161224365, "loss/hidden": 1.3125, "loss/logits": 0.19523324072360992, "loss/reg": 0.0004763673641718924, "step": 5222 }, { "epoch": 0.652875, "grad_norm": 3.0024614334106445, "grad_norm_var": 7.839920729565665, "learning_rate": 0.0001, "loss": 1.2169, "loss/crossentropy": 2.487412214279175, "loss/hidden": 1.046875, "loss/logits": 0.16531068086624146, "loss/reg": 0.00047611971967853606, "step": 5223 }, { "epoch": 0.653, "grad_norm": 2.857985734939575, "grad_norm_var": 7.877671553798408, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.6366636753082275, "loss/hidden": 1.0390625, "loss/logits": 0.1959173083305359, "loss/reg": 0.00047586814616806805, "step": 5224 }, { "epoch": 0.653125, "grad_norm": 2.6207306385040283, "grad_norm_var": 7.890862463156482, "learning_rate": 0.0001, "loss": 1.5247, "loss/crossentropy": 2.2198245525360107, "loss/hidden": 1.2734375, "loss/logits": 0.24650175869464874, "loss/reg": 0.0004756096750497818, "step": 5225 }, { "epoch": 0.65325, "grad_norm": 3.61238431930542, "grad_norm_var": 7.890045657523023, "learning_rate": 0.0001, "loss": 1.188, "loss/crossentropy": 2.7101356983184814, "loss/hidden": 1.03125, "loss/logits": 0.15197260677814484, "loss/reg": 0.0004753608664032072, "step": 5226 }, { "epoch": 0.653375, "grad_norm": 2.4905858039855957, "grad_norm_var": 7.942007681995347, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.4028687477111816, "loss/hidden": 1.109375, "loss/logits": 0.19759929180145264, "loss/reg": 0.00047511147568002343, "step": 5227 }, { "epoch": 0.6535, "grad_norm": 4.469687461853027, "grad_norm_var": 7.973835417595067, "learning_rate": 0.0001, "loss": 1.513, "loss/crossentropy": 2.3213841915130615, "loss/hidden": 1.3125, "loss/logits": 0.1957988142967224, "loss/reg": 0.000474862870760262, "step": 5228 }, { "epoch": 0.653625, "grad_norm": 2.3497707843780518, "grad_norm_var": 7.9851558898209625, "learning_rate": 0.0001, "loss": 1.1546, "loss/crossentropy": 2.614278793334961, "loss/hidden": 0.98828125, "loss/logits": 0.16157861053943634, "loss/reg": 0.0004746138583868742, "step": 5229 }, { "epoch": 0.65375, "grad_norm": 3.3068368434906006, "grad_norm_var": 7.892448656686611, "learning_rate": 0.0001, "loss": 1.2511, "loss/crossentropy": 2.5600478649139404, "loss/hidden": 1.0703125, "loss/logits": 0.17602242529392242, "loss/reg": 0.00047437477041967213, "step": 5230 }, { "epoch": 0.653875, "grad_norm": 2.61301589012146, "grad_norm_var": 7.9195249232719025, "learning_rate": 0.0001, "loss": 1.1558, "loss/crossentropy": 2.543168067932129, "loss/hidden": 0.98828125, "loss/logits": 0.16278992593288422, "loss/reg": 0.000474132364615798, "step": 5231 }, { "epoch": 0.654, "grad_norm": 2.972012519836426, "grad_norm_var": 0.357780103023984, "learning_rate": 0.0001, "loss": 1.5356, "loss/crossentropy": 2.2661826610565186, "loss/hidden": 1.2578125, "loss/logits": 0.2730242609977722, "loss/reg": 0.00047392590204253793, "step": 5232 }, { "epoch": 0.654125, "grad_norm": 3.385655164718628, "grad_norm_var": 0.36483270981415006, "learning_rate": 0.0001, "loss": 1.4446, "loss/crossentropy": 2.493088483810425, "loss/hidden": 1.2421875, "loss/logits": 0.19770009815692902, "loss/reg": 0.00047372200060635805, "step": 5233 }, { "epoch": 0.65425, "grad_norm": 2.3500349521636963, "grad_norm_var": 0.387455005721668, "learning_rate": 0.0001, "loss": 1.2731, "loss/crossentropy": 2.759765863418579, "loss/hidden": 1.0859375, "loss/logits": 0.1824702024459839, "loss/reg": 0.0004734951362479478, "step": 5234 }, { "epoch": 0.654375, "grad_norm": 2.4549832344055176, "grad_norm_var": 0.39252998921263044, "learning_rate": 0.0001, "loss": 1.3461, "loss/crossentropy": 2.433236598968506, "loss/hidden": 1.1484375, "loss/logits": 0.19289562106132507, "loss/reg": 0.0004732463858090341, "step": 5235 }, { "epoch": 0.6545, "grad_norm": 10.367582321166992, "grad_norm_var": 3.758370708592389, "learning_rate": 0.0001, "loss": 1.5597, "loss/crossentropy": 2.8678834438323975, "loss/hidden": 1.296875, "loss/logits": 0.2580886781215668, "loss/reg": 0.00047302618622779846, "step": 5236 }, { "epoch": 0.654625, "grad_norm": 5.031167507171631, "grad_norm_var": 3.8426446265702867, "learning_rate": 0.0001, "loss": 1.9347, "loss/crossentropy": 2.7445762157440186, "loss/hidden": 1.6171875, "loss/logits": 0.3128131031990051, "loss/reg": 0.00047281774459406734, "step": 5237 }, { "epoch": 0.65475, "grad_norm": 2.3881993293762207, "grad_norm_var": 3.933179210149285, "learning_rate": 0.0001, "loss": 1.3678, "loss/crossentropy": 2.422515869140625, "loss/hidden": 1.171875, "loss/logits": 0.1912132203578949, "loss/reg": 0.000472608080599457, "step": 5238 }, { "epoch": 0.654875, "grad_norm": 5.961392402648926, "grad_norm_var": 4.2773588861753336, "learning_rate": 0.0001, "loss": 1.7014, "loss/crossentropy": 2.6122918128967285, "loss/hidden": 1.46875, "loss/logits": 0.22789399325847626, "loss/reg": 0.0004724248719867319, "step": 5239 }, { "epoch": 0.655, "grad_norm": 2.8883795738220215, "grad_norm_var": 4.27399623864513, "learning_rate": 0.0001, "loss": 1.3891, "loss/crossentropy": 2.506220817565918, "loss/hidden": 1.15625, "loss/logits": 0.22810272872447968, "loss/reg": 0.0004722537414636463, "step": 5240 }, { "epoch": 0.655125, "grad_norm": 4.3049235343933105, "grad_norm_var": 4.208042096619541, "learning_rate": 0.0001, "loss": 1.2808, "loss/crossentropy": 2.590343713760376, "loss/hidden": 1.1015625, "loss/logits": 0.1744767427444458, "loss/reg": 0.0004720134602393955, "step": 5241 }, { "epoch": 0.65525, "grad_norm": 3.131474018096924, "grad_norm_var": 4.235114500362118, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.587616205215454, "loss/hidden": 1.0703125, "loss/logits": 0.1948910802602768, "loss/reg": 0.00047176957014016807, "step": 5242 }, { "epoch": 0.655375, "grad_norm": 2.52453351020813, "grad_norm_var": 4.229354219429865, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.7975990772247314, "loss/hidden": 1.09375, "loss/logits": 0.18226705491542816, "loss/reg": 0.0004715296090580523, "step": 5243 }, { "epoch": 0.6555, "grad_norm": 2.586200714111328, "grad_norm_var": 4.278180478399724, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.475921630859375, "loss/hidden": 1.03125, "loss/logits": 0.18529503047466278, "loss/reg": 0.0004713098460342735, "step": 5244 }, { "epoch": 0.655625, "grad_norm": 20.137168884277344, "grad_norm_var": 20.936917178198183, "learning_rate": 0.0001, "loss": 1.3835, "loss/crossentropy": 2.535867929458618, "loss/hidden": 1.1953125, "loss/logits": 0.18348944187164307, "loss/reg": 0.0004710921202786267, "step": 5245 }, { "epoch": 0.65575, "grad_norm": 3.9405174255371094, "grad_norm_var": 20.837949121591706, "learning_rate": 0.0001, "loss": 1.8494, "loss/crossentropy": 2.4065937995910645, "loss/hidden": 1.5390625, "loss/logits": 0.3056221008300781, "loss/reg": 0.00047085442929528654, "step": 5246 }, { "epoch": 0.655875, "grad_norm": 3.131988286972046, "grad_norm_var": 20.70242512932556, "learning_rate": 0.0001, "loss": 1.3318, "loss/crossentropy": 2.527498483657837, "loss/hidden": 1.1171875, "loss/logits": 0.2099296748638153, "loss/reg": 0.0004706361214630306, "step": 5247 }, { "epoch": 0.656, "grad_norm": 2.3125925064086914, "grad_norm_var": 20.894479357256674, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.448211669921875, "loss/hidden": 1.0, "loss/logits": 0.14087837934494019, "loss/reg": 0.0004703960148617625, "step": 5248 }, { "epoch": 0.656125, "grad_norm": 4.202197074890137, "grad_norm_var": 20.78150910732659, "learning_rate": 0.0001, "loss": 1.2383, "loss/crossentropy": 2.5311880111694336, "loss/hidden": 1.03125, "loss/logits": 0.2023012340068817, "loss/reg": 0.00047016574535518885, "step": 5249 }, { "epoch": 0.65625, "grad_norm": 2.4136340618133545, "grad_norm_var": 20.760502436566803, "learning_rate": 0.0001, "loss": 1.2755, "loss/crossentropy": 2.522026300430298, "loss/hidden": 1.078125, "loss/logits": 0.19268059730529785, "loss/reg": 0.0004699447890743613, "step": 5250 }, { "epoch": 0.656375, "grad_norm": 2.2620866298675537, "grad_norm_var": 20.824711169256656, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.6152353286743164, "loss/hidden": 1.03125, "loss/logits": 0.16911329329013824, "loss/reg": 0.0004696899850387126, "step": 5251 }, { "epoch": 0.6565, "grad_norm": 2.5382626056671143, "grad_norm_var": 18.89495470221063, "learning_rate": 0.0001, "loss": 1.378, "loss/crossentropy": 2.3972525596618652, "loss/hidden": 1.171875, "loss/logits": 0.2014172226190567, "loss/reg": 0.0004694721137639135, "step": 5252 }, { "epoch": 0.656625, "grad_norm": 2.6456851959228516, "grad_norm_var": 19.037033139775907, "learning_rate": 0.0001, "loss": 1.4722, "loss/crossentropy": 2.351311683654785, "loss/hidden": 1.25, "loss/logits": 0.2175287902355194, "loss/reg": 0.00046927895164117217, "step": 5253 }, { "epoch": 0.65675, "grad_norm": 3.086866617202759, "grad_norm_var": 18.897776861340905, "learning_rate": 0.0001, "loss": 1.3893, "loss/crossentropy": 2.408646583557129, "loss/hidden": 1.1875, "loss/logits": 0.19711902737617493, "loss/reg": 0.0004690898349508643, "step": 5254 }, { "epoch": 0.656875, "grad_norm": 2.4864540100097656, "grad_norm_var": 18.86151189879487, "learning_rate": 0.0001, "loss": 1.2461, "loss/crossentropy": 2.47487211227417, "loss/hidden": 1.046875, "loss/logits": 0.19457748532295227, "loss/reg": 0.00046884079347364604, "step": 5255 }, { "epoch": 0.657, "grad_norm": 2.8532750606536865, "grad_norm_var": 18.86696543620233, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.5158588886260986, "loss/hidden": 1.1796875, "loss/logits": 0.20190775394439697, "loss/reg": 0.00046859041322022676, "step": 5256 }, { "epoch": 0.657125, "grad_norm": 2.7092983722686768, "grad_norm_var": 18.96863697354406, "learning_rate": 0.0001, "loss": 1.2642, "loss/crossentropy": 2.6252801418304443, "loss/hidden": 1.078125, "loss/logits": 0.1814270317554474, "loss/reg": 0.0004683342995122075, "step": 5257 }, { "epoch": 0.65725, "grad_norm": 2.5568349361419678, "grad_norm_var": 19.050850796737922, "learning_rate": 0.0001, "loss": 1.427, "loss/crossentropy": 2.489044427871704, "loss/hidden": 1.1953125, "loss/logits": 0.22697106003761292, "loss/reg": 0.00046810737694613636, "step": 5258 }, { "epoch": 0.657375, "grad_norm": 3.0233421325683594, "grad_norm_var": 18.974973712330353, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.02358078956604, "loss/hidden": 1.203125, "loss/logits": 0.20135164260864258, "loss/reg": 0.00046788406325504184, "step": 5259 }, { "epoch": 0.6575, "grad_norm": 2.6063859462738037, "grad_norm_var": 18.971381446841402, "learning_rate": 0.0001, "loss": 1.3967, "loss/crossentropy": 2.6539816856384277, "loss/hidden": 1.15625, "loss/logits": 0.23572859168052673, "loss/reg": 0.0004676726821344346, "step": 5260 }, { "epoch": 0.657625, "grad_norm": 2.2957499027252197, "grad_norm_var": 0.31558048238700753, "learning_rate": 0.0001, "loss": 1.2899, "loss/crossentropy": 2.6140289306640625, "loss/hidden": 1.078125, "loss/logits": 0.20712876319885254, "loss/reg": 0.00046746505540795624, "step": 5261 }, { "epoch": 0.65775, "grad_norm": 1.9575836658477783, "grad_norm_var": 0.2641711921447758, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.4713449478149414, "loss/hidden": 1.015625, "loss/logits": 0.17654184997081757, "loss/reg": 0.00046722133993171155, "step": 5262 }, { "epoch": 0.657875, "grad_norm": 2.0622646808624268, "grad_norm_var": 0.27302628802305906, "learning_rate": 0.0001, "loss": 1.4313, "loss/crossentropy": 2.116300582885742, "loss/hidden": 1.1875, "loss/logits": 0.23915763199329376, "loss/reg": 0.00046700105303898454, "step": 5263 }, { "epoch": 0.658, "grad_norm": 2.3435957431793213, "grad_norm_var": 0.27179171096820387, "learning_rate": 0.0001, "loss": 1.3725, "loss/crossentropy": 2.24882173538208, "loss/hidden": 1.171875, "loss/logits": 0.19598081707954407, "loss/reg": 0.000466760277049616, "step": 5264 }, { "epoch": 0.658125, "grad_norm": 2.6779322624206543, "grad_norm_var": 0.09701378562227773, "learning_rate": 0.0001, "loss": 1.2866, "loss/crossentropy": 2.3002548217773438, "loss/hidden": 1.1171875, "loss/logits": 0.16471028327941895, "loss/reg": 0.0004665449960157275, "step": 5265 }, { "epoch": 0.65825, "grad_norm": 2.9162838459014893, "grad_norm_var": 0.10484157813047261, "learning_rate": 0.0001, "loss": 1.3837, "loss/crossentropy": 2.402135133743286, "loss/hidden": 1.1484375, "loss/logits": 0.23058053851127625, "loss/reg": 0.00046633006422780454, "step": 5266 }, { "epoch": 0.658375, "grad_norm": 2.363130569458008, "grad_norm_var": 0.10141392689273872, "learning_rate": 0.0001, "loss": 1.2699, "loss/crossentropy": 2.3783674240112305, "loss/hidden": 1.0859375, "loss/logits": 0.1793087124824524, "loss/reg": 0.0004660900740418583, "step": 5267 }, { "epoch": 0.6585, "grad_norm": 3.3506696224212646, "grad_norm_var": 0.13920648367446778, "learning_rate": 0.0001, "loss": 1.4476, "loss/crossentropy": 2.4612693786621094, "loss/hidden": 1.21875, "loss/logits": 0.22420671582221985, "loss/reg": 0.00046585031668655574, "step": 5268 }, { "epoch": 0.658625, "grad_norm": 3.4596946239471436, "grad_norm_var": 0.18330328243603541, "learning_rate": 0.0001, "loss": 1.6035, "loss/crossentropy": 2.275355100631714, "loss/hidden": 1.3671875, "loss/logits": 0.23169337213039398, "loss/reg": 0.00046560238115489483, "step": 5269 }, { "epoch": 0.65875, "grad_norm": 2.832716703414917, "grad_norm_var": 0.17327626452456477, "learning_rate": 0.0001, "loss": 1.546, "loss/crossentropy": 2.592271089553833, "loss/hidden": 1.2890625, "loss/logits": 0.252238929271698, "loss/reg": 0.0004653638752643019, "step": 5270 }, { "epoch": 0.658875, "grad_norm": 2.5709264278411865, "grad_norm_var": 0.17181319887994823, "learning_rate": 0.0001, "loss": 1.4264, "loss/crossentropy": 2.5509605407714844, "loss/hidden": 1.21875, "loss/logits": 0.20300482213497162, "loss/reg": 0.000465118937427178, "step": 5271 }, { "epoch": 0.659, "grad_norm": 5.5269999504089355, "grad_norm_var": 0.6870768189869939, "learning_rate": 0.0001, "loss": 1.8371, "loss/crossentropy": 2.353480100631714, "loss/hidden": 1.5390625, "loss/logits": 0.2934322953224182, "loss/reg": 0.00046487379586324096, "step": 5272 }, { "epoch": 0.659125, "grad_norm": 10.11805534362793, "grad_norm_var": 4.000090301817397, "learning_rate": 0.0001, "loss": 1.7815, "loss/crossentropy": 2.802718162536621, "loss/hidden": 1.46875, "loss/logits": 0.30805838108062744, "loss/reg": 0.0004646445158869028, "step": 5273 }, { "epoch": 0.65925, "grad_norm": 3.1610682010650635, "grad_norm_var": 3.9637302735991584, "learning_rate": 0.0001, "loss": 1.3979, "loss/crossentropy": 1.8927568197250366, "loss/hidden": 1.1875, "loss/logits": 0.20580336451530457, "loss/reg": 0.00046439701691269875, "step": 5274 }, { "epoch": 0.659375, "grad_norm": 5.916834354400635, "grad_norm_var": 4.369018501972139, "learning_rate": 0.0001, "loss": 2.026, "loss/crossentropy": 2.696779727935791, "loss/hidden": 1.5703125, "loss/logits": 0.4510304033756256, "loss/reg": 0.00046413892414420843, "step": 5275 }, { "epoch": 0.6595, "grad_norm": 2.3451242446899414, "grad_norm_var": 4.404761670658059, "learning_rate": 0.0001, "loss": 1.2063, "loss/crossentropy": 2.453455924987793, "loss/hidden": 1.03125, "loss/logits": 0.1704503893852234, "loss/reg": 0.00046389925410039723, "step": 5276 }, { "epoch": 0.659625, "grad_norm": 3.178346872329712, "grad_norm_var": 4.312477666707605, "learning_rate": 0.0001, "loss": 1.3639, "loss/crossentropy": 2.285336494445801, "loss/hidden": 1.15625, "loss/logits": 0.20296624302864075, "loss/reg": 0.000463654228951782, "step": 5277 }, { "epoch": 0.65975, "grad_norm": 2.625746965408325, "grad_norm_var": 4.19861894489834, "learning_rate": 0.0001, "loss": 1.0949, "loss/crossentropy": 2.7237274646759033, "loss/hidden": 0.9453125, "loss/logits": 0.1449621319770813, "loss/reg": 0.0004634273936972022, "step": 5278 }, { "epoch": 0.659875, "grad_norm": 2.7090964317321777, "grad_norm_var": 4.092959423704788, "learning_rate": 0.0001, "loss": 1.3372, "loss/crossentropy": 2.6196722984313965, "loss/hidden": 1.140625, "loss/logits": 0.1919824182987213, "loss/reg": 0.000463159813079983, "step": 5279 }, { "epoch": 0.66, "grad_norm": 2.6626853942871094, "grad_norm_var": 4.044549487445283, "learning_rate": 0.0001, "loss": 1.3309, "loss/crossentropy": 2.278923273086548, "loss/hidden": 1.1171875, "loss/logits": 0.209104523062706, "loss/reg": 0.00046289508463814855, "step": 5280 }, { "epoch": 0.660125, "grad_norm": 2.513728141784668, "grad_norm_var": 4.067537963785836, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.4568376541137695, "loss/hidden": 1.0859375, "loss/logits": 0.17092739045619965, "loss/reg": 0.00046264578122645617, "step": 5281 }, { "epoch": 0.66025, "grad_norm": 5.200710296630859, "grad_norm_var": 4.173052427577526, "learning_rate": 0.0001, "loss": 1.3852, "loss/crossentropy": 2.7520060539245605, "loss/hidden": 1.171875, "loss/logits": 0.2087387889623642, "loss/reg": 0.0004624006396625191, "step": 5282 }, { "epoch": 0.660375, "grad_norm": 3.215156078338623, "grad_norm_var": 4.057068653747061, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.3519976139068604, "loss/hidden": 1.2265625, "loss/logits": 0.2206091284751892, "loss/reg": 0.00046216233749873936, "step": 5283 }, { "epoch": 0.6605, "grad_norm": 2.8621318340301514, "grad_norm_var": 4.103646168546646, "learning_rate": 0.0001, "loss": 1.4692, "loss/crossentropy": 2.6647145748138428, "loss/hidden": 1.2578125, "loss/logits": 0.2068023979663849, "loss/reg": 0.00046194266178645194, "step": 5284 }, { "epoch": 0.660625, "grad_norm": 2.4932563304901123, "grad_norm_var": 4.206670061749463, "learning_rate": 0.0001, "loss": 1.4451, "loss/crossentropy": 2.4398720264434814, "loss/hidden": 1.171875, "loss/logits": 0.26859623193740845, "loss/reg": 0.00046172263682819903, "step": 5285 }, { "epoch": 0.66075, "grad_norm": 2.740652322769165, "grad_norm_var": 4.2184079627642745, "learning_rate": 0.0001, "loss": 1.3613, "loss/crossentropy": 2.8006467819213867, "loss/hidden": 1.1484375, "loss/logits": 0.20822763442993164, "loss/reg": 0.00046148241381160915, "step": 5286 }, { "epoch": 0.660875, "grad_norm": 2.4252676963806152, "grad_norm_var": 4.242439391897514, "learning_rate": 0.0001, "loss": 1.0871, "loss/crossentropy": 2.195422410964966, "loss/hidden": 0.9296875, "loss/logits": 0.15284115076065063, "loss/reg": 0.00046121771447360516, "step": 5287 }, { "epoch": 0.661, "grad_norm": 2.164792537689209, "grad_norm_var": 4.143798302119657, "learning_rate": 0.0001, "loss": 1.1296, "loss/crossentropy": 2.5274832248687744, "loss/hidden": 0.97265625, "loss/logits": 0.15231981873512268, "loss/reg": 0.00046098875463940203, "step": 5288 }, { "epoch": 0.661125, "grad_norm": 2.6451358795166016, "grad_norm_var": 1.0606376006734914, "learning_rate": 0.0001, "loss": 1.2404, "loss/crossentropy": 2.561119318008423, "loss/hidden": 1.0234375, "loss/logits": 0.21233737468719482, "loss/reg": 0.00046076500439085066, "step": 5289 }, { "epoch": 0.66125, "grad_norm": 3.2035140991210938, "grad_norm_var": 1.0613576606242037, "learning_rate": 0.0001, "loss": 1.4067, "loss/crossentropy": 2.381636142730713, "loss/hidden": 1.1875, "loss/logits": 0.2145482897758484, "loss/reg": 0.0004605454741977155, "step": 5290 }, { "epoch": 0.661375, "grad_norm": 2.530794382095337, "grad_norm_var": 0.486524598290732, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.7948646545410156, "loss/hidden": 1.0859375, "loss/logits": 0.19295290112495422, "loss/reg": 0.0004603037377819419, "step": 5291 }, { "epoch": 0.6615, "grad_norm": 3.6229724884033203, "grad_norm_var": 0.5034530018954749, "learning_rate": 0.0001, "loss": 1.3574, "loss/crossentropy": 3.0054373741149902, "loss/hidden": 1.140625, "loss/logits": 0.2121742069721222, "loss/reg": 0.00046009276411496103, "step": 5292 }, { "epoch": 0.661625, "grad_norm": 2.6386218070983887, "grad_norm_var": 0.5034007195513366, "learning_rate": 0.0001, "loss": 1.3103, "loss/crossentropy": 2.8318943977355957, "loss/hidden": 1.109375, "loss/logits": 0.19628813862800598, "loss/reg": 0.0004598494851961732, "step": 5293 }, { "epoch": 0.66175, "grad_norm": 2.609143018722534, "grad_norm_var": 0.504004942822024, "learning_rate": 0.0001, "loss": 1.3007, "loss/crossentropy": 2.6158194541931152, "loss/hidden": 1.109375, "loss/logits": 0.1866869479417801, "loss/reg": 0.00045962943113408983, "step": 5294 }, { "epoch": 0.661875, "grad_norm": 3.7944188117980957, "grad_norm_var": 0.5514679176345678, "learning_rate": 0.0001, "loss": 1.2509, "loss/crossentropy": 2.803081512451172, "loss/hidden": 1.0546875, "loss/logits": 0.19161981344223022, "loss/reg": 0.0004594200581777841, "step": 5295 }, { "epoch": 0.662, "grad_norm": 3.6002533435821533, "grad_norm_var": 0.5695297329679752, "learning_rate": 0.0001, "loss": 1.3638, "loss/crossentropy": 2.7973721027374268, "loss/hidden": 1.15625, "loss/logits": 0.20291835069656372, "loss/reg": 0.00045921807759441435, "step": 5296 }, { "epoch": 0.662125, "grad_norm": 3.3301682472229004, "grad_norm_var": 0.5564830336284293, "learning_rate": 0.0001, "loss": 1.2589, "loss/crossentropy": 2.6643614768981934, "loss/hidden": 1.0703125, "loss/logits": 0.18401961028575897, "loss/reg": 0.00045903329737484455, "step": 5297 }, { "epoch": 0.66225, "grad_norm": 28.716524124145508, "grad_norm_var": 41.80772362406089, "learning_rate": 0.0001, "loss": 1.3799, "loss/crossentropy": 2.3374881744384766, "loss/hidden": 1.1875, "loss/logits": 0.18778568506240845, "loss/reg": 0.00045884071732871234, "step": 5298 }, { "epoch": 0.662375, "grad_norm": 2.275423288345337, "grad_norm_var": 42.02854752992906, "learning_rate": 0.0001, "loss": 1.1011, "loss/crossentropy": 2.5324060916900635, "loss/hidden": 0.95703125, "loss/logits": 0.13951267302036285, "loss/reg": 0.0004585980495903641, "step": 5299 }, { "epoch": 0.6625, "grad_norm": 2.2338249683380127, "grad_norm_var": 42.18861531354093, "learning_rate": 0.0001, "loss": 1.2049, "loss/crossentropy": 2.7113494873046875, "loss/hidden": 1.03125, "loss/logits": 0.16910767555236816, "loss/reg": 0.00045841760584153235, "step": 5300 }, { "epoch": 0.662625, "grad_norm": 2.8575656414031982, "grad_norm_var": 42.10239440432376, "learning_rate": 0.0001, "loss": 1.3398, "loss/crossentropy": 2.767899990081787, "loss/hidden": 1.140625, "loss/logits": 0.19455108046531677, "loss/reg": 0.0004582372203003615, "step": 5301 }, { "epoch": 0.66275, "grad_norm": 2.4183335304260254, "grad_norm_var": 42.18285598985622, "learning_rate": 0.0001, "loss": 1.2989, "loss/crossentropy": 2.587294101715088, "loss/hidden": 1.0859375, "loss/logits": 0.2084307074546814, "loss/reg": 0.00045800008228980005, "step": 5302 }, { "epoch": 0.662875, "grad_norm": 2.8901052474975586, "grad_norm_var": 42.07138721263517, "learning_rate": 0.0001, "loss": 1.5017, "loss/crossentropy": 2.5259766578674316, "loss/hidden": 1.2578125, "loss/logits": 0.23928505182266235, "loss/reg": 0.00045782438246533275, "step": 5303 }, { "epoch": 0.663, "grad_norm": 2.317484140396118, "grad_norm_var": 42.025898190784, "learning_rate": 0.0001, "loss": 1.3718, "loss/crossentropy": 2.485806465148926, "loss/hidden": 1.1640625, "loss/logits": 0.2031121850013733, "loss/reg": 0.00045762068475596607, "step": 5304 }, { "epoch": 0.663125, "grad_norm": 2.1729071140289307, "grad_norm_var": 42.15538262838205, "learning_rate": 0.0001, "loss": 1.1136, "loss/crossentropy": 2.3840219974517822, "loss/hidden": 0.95703125, "loss/logits": 0.15204299986362457, "loss/reg": 0.00045747350668534636, "step": 5305 }, { "epoch": 0.66325, "grad_norm": 2.4701192378997803, "grad_norm_var": 42.31096189808251, "learning_rate": 0.0001, "loss": 1.3317, "loss/crossentropy": 2.478489875793457, "loss/hidden": 1.125, "loss/logits": 0.20216184854507446, "loss/reg": 0.0004572462639771402, "step": 5306 }, { "epoch": 0.663375, "grad_norm": 2.616339683532715, "grad_norm_var": 42.29004296407937, "learning_rate": 0.0001, "loss": 1.5899, "loss/crossentropy": 2.359524965286255, "loss/hidden": 1.3125, "loss/logits": 0.27281293272972107, "loss/reg": 0.0004570800520014018, "step": 5307 }, { "epoch": 0.6635, "grad_norm": 2.385524272918701, "grad_norm_var": 42.515645308836866, "learning_rate": 0.0001, "loss": 1.0858, "loss/crossentropy": 2.6694822311401367, "loss/hidden": 0.9375, "loss/logits": 0.14377281069755554, "loss/reg": 0.00045693141873925924, "step": 5308 }, { "epoch": 0.663625, "grad_norm": 2.447209596633911, "grad_norm_var": 42.561176529971874, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.521389961242676, "loss/hidden": 1.0078125, "loss/logits": 0.17084261775016785, "loss/reg": 0.00045677772141061723, "step": 5309 }, { "epoch": 0.66375, "grad_norm": 2.5744824409484863, "grad_norm_var": 42.56916261890438, "learning_rate": 0.0001, "loss": 1.1226, "loss/crossentropy": 2.65447998046875, "loss/hidden": 0.96484375, "loss/logits": 0.15314367413520813, "loss/reg": 0.00045666101505048573, "step": 5310 }, { "epoch": 0.663875, "grad_norm": 2.698861837387085, "grad_norm_var": 42.72077547052029, "learning_rate": 0.0001, "loss": 1.2086, "loss/crossentropy": 2.5830135345458984, "loss/hidden": 1.0390625, "loss/logits": 0.16500937938690186, "loss/reg": 0.00045643141493201256, "step": 5311 }, { "epoch": 0.664, "grad_norm": 12.505603790283203, "grad_norm_var": 46.90547794815793, "learning_rate": 0.0001, "loss": 2.2268, "loss/crossentropy": 2.104893684387207, "loss/hidden": 1.7578125, "loss/logits": 0.46439725160598755, "loss/reg": 0.00045627504005096853, "step": 5312 }, { "epoch": 0.664125, "grad_norm": 2.663820743560791, "grad_norm_var": 47.064431766126695, "learning_rate": 0.0001, "loss": 1.167, "loss/crossentropy": 2.5825209617614746, "loss/hidden": 1.0078125, "loss/logits": 0.15467441082000732, "loss/reg": 0.0004560365923680365, "step": 5313 }, { "epoch": 0.66425, "grad_norm": 3.3199844360351562, "grad_norm_var": 6.272042281431344, "learning_rate": 0.0001, "loss": 1.7793, "loss/crossentropy": 2.5747294425964355, "loss/hidden": 1.484375, "loss/logits": 0.29039520025253296, "loss/reg": 0.0004557970678433776, "step": 5314 }, { "epoch": 0.664375, "grad_norm": 3.955622911453247, "grad_norm_var": 6.246288739115379, "learning_rate": 0.0001, "loss": 1.4873, "loss/crossentropy": 2.602041006088257, "loss/hidden": 1.2578125, "loss/logits": 0.22495043277740479, "loss/reg": 0.000455580186098814, "step": 5315 }, { "epoch": 0.6645, "grad_norm": 2.7766709327697754, "grad_norm_var": 6.188768575500836, "learning_rate": 0.0001, "loss": 1.5787, "loss/crossentropy": 2.5361313819885254, "loss/hidden": 1.3046875, "loss/logits": 0.2694621682167053, "loss/reg": 0.00045535227400250733, "step": 5316 }, { "epoch": 0.664625, "grad_norm": 2.806126356124878, "grad_norm_var": 6.192084428960646, "learning_rate": 0.0001, "loss": 1.2459, "loss/crossentropy": 2.6851398944854736, "loss/hidden": 1.0546875, "loss/logits": 0.1866580843925476, "loss/reg": 0.00045516519458033144, "step": 5317 }, { "epoch": 0.66475, "grad_norm": 2.244271993637085, "grad_norm_var": 6.214757860705928, "learning_rate": 0.0001, "loss": 1.3279, "loss/crossentropy": 2.3996243476867676, "loss/hidden": 1.140625, "loss/logits": 0.18273897469043732, "loss/reg": 0.00045493862126022577, "step": 5318 }, { "epoch": 0.664875, "grad_norm": 2.308149576187134, "grad_norm_var": 6.267949182823374, "learning_rate": 0.0001, "loss": 1.1333, "loss/crossentropy": 2.4416017532348633, "loss/hidden": 0.97265625, "loss/logits": 0.1561460793018341, "loss/reg": 0.0004547339922282845, "step": 5319 }, { "epoch": 0.665, "grad_norm": 3.2636053562164307, "grad_norm_var": 6.204184368199285, "learning_rate": 0.0001, "loss": 1.4231, "loss/crossentropy": 2.443971872329712, "loss/hidden": 1.234375, "loss/logits": 0.18417704105377197, "loss/reg": 0.0004544945841189474, "step": 5320 }, { "epoch": 0.665125, "grad_norm": 2.723546028137207, "grad_norm_var": 6.138506936500789, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.662248134613037, "loss/hidden": 1.125, "loss/logits": 0.1635037362575531, "loss/reg": 0.0004543080576695502, "step": 5321 }, { "epoch": 0.66525, "grad_norm": 2.787825107574463, "grad_norm_var": 6.107119615691008, "learning_rate": 0.0001, "loss": 1.1087, "loss/crossentropy": 2.660428047180176, "loss/hidden": 0.97265625, "loss/logits": 0.1314929723739624, "loss/reg": 0.00045407499419525266, "step": 5322 }, { "epoch": 0.665375, "grad_norm": 3.2844440937042236, "grad_norm_var": 6.067003135777103, "learning_rate": 0.0001, "loss": 1.3986, "loss/crossentropy": 2.5088388919830322, "loss/hidden": 1.15625, "loss/logits": 0.23777800798416138, "loss/reg": 0.00045390811283141375, "step": 5323 }, { "epoch": 0.6655, "grad_norm": 2.60115647315979, "grad_norm_var": 6.040120773224521, "learning_rate": 0.0001, "loss": 1.3985, "loss/crossentropy": 2.3189697265625, "loss/hidden": 1.1875, "loss/logits": 0.20647010207176208, "loss/reg": 0.0004536685300990939, "step": 5324 }, { "epoch": 0.665625, "grad_norm": 3.3378052711486816, "grad_norm_var": 5.972386811843031, "learning_rate": 0.0001, "loss": 1.5052, "loss/crossentropy": 2.894184112548828, "loss/hidden": 1.25, "loss/logits": 0.250666081905365, "loss/reg": 0.0004534890758804977, "step": 5325 }, { "epoch": 0.66575, "grad_norm": 2.326862096786499, "grad_norm_var": 6.006470536554494, "learning_rate": 0.0001, "loss": 1.2941, "loss/crossentropy": 2.352461814880371, "loss/hidden": 1.0859375, "loss/logits": 0.20359328389167786, "loss/reg": 0.0004532611055765301, "step": 5326 }, { "epoch": 0.665875, "grad_norm": 2.7144296169281006, "grad_norm_var": 6.004874085507625, "learning_rate": 0.0001, "loss": 1.3377, "loss/crossentropy": 2.5437347888946533, "loss/hidden": 1.140625, "loss/logits": 0.192535862326622, "loss/reg": 0.00045302981743589044, "step": 5327 }, { "epoch": 0.666, "grad_norm": 2.2008466720581055, "grad_norm_var": 0.2355791314422028, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.45443058013916, "loss/hidden": 1.046875, "loss/logits": 0.18600840866565704, "loss/reg": 0.0004528329009190202, "step": 5328 }, { "epoch": 0.666125, "grad_norm": 2.5731253623962402, "grad_norm_var": 0.238129373043288, "learning_rate": 0.0001, "loss": 1.2933, "loss/crossentropy": 2.316206216812134, "loss/hidden": 1.1015625, "loss/logits": 0.18720197677612305, "loss/reg": 0.00045259209582582116, "step": 5329 }, { "epoch": 0.66625, "grad_norm": 3.0777347087860107, "grad_norm_var": 0.225858605275702, "learning_rate": 0.0001, "loss": 1.3235, "loss/crossentropy": 2.3852949142456055, "loss/hidden": 1.140625, "loss/logits": 0.17836213111877441, "loss/reg": 0.0004523676179815084, "step": 5330 }, { "epoch": 0.666375, "grad_norm": 2.4016175270080566, "grad_norm_var": 0.13970579459153948, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.304971218109131, "loss/hidden": 0.94921875, "loss/logits": 0.15082383155822754, "loss/reg": 0.00045213187695480883, "step": 5331 }, { "epoch": 0.6665, "grad_norm": 2.6640431880950928, "grad_norm_var": 0.13956143429268858, "learning_rate": 0.0001, "loss": 1.3933, "loss/crossentropy": 2.635286808013916, "loss/hidden": 1.1875, "loss/logits": 0.20127511024475098, "loss/reg": 0.00045190504170022905, "step": 5332 }, { "epoch": 0.666625, "grad_norm": 2.650421142578125, "grad_norm_var": 0.13902341676694593, "learning_rate": 0.0001, "loss": 1.336, "loss/crossentropy": 2.6121504306793213, "loss/hidden": 1.1328125, "loss/logits": 0.1987154483795166, "loss/reg": 0.0004516627814155072, "step": 5333 }, { "epoch": 0.66675, "grad_norm": 2.5734705924987793, "grad_norm_var": 0.12590336345942793, "learning_rate": 0.0001, "loss": 1.1343, "loss/crossentropy": 2.6010353565216064, "loss/hidden": 0.98046875, "loss/logits": 0.14929088950157166, "loss/reg": 0.00045142287854105234, "step": 5334 }, { "epoch": 0.666875, "grad_norm": 5.2128005027771, "grad_norm_var": 0.494459811233101, "learning_rate": 0.0001, "loss": 1.3404, "loss/crossentropy": 2.7840640544891357, "loss/hidden": 1.125, "loss/logits": 0.2109326273202896, "loss/reg": 0.00045116685214452446, "step": 5335 }, { "epoch": 0.667, "grad_norm": 2.313035488128662, "grad_norm_var": 0.5047998096487428, "learning_rate": 0.0001, "loss": 1.2549, "loss/crossentropy": 2.33984637260437, "loss/hidden": 1.09375, "loss/logits": 0.15659353137016296, "loss/reg": 0.000450930732768029, "step": 5336 }, { "epoch": 0.667125, "grad_norm": 2.2982425689697266, "grad_norm_var": 0.5227199828281376, "learning_rate": 0.0001, "loss": 1.1787, "loss/crossentropy": 2.4949960708618164, "loss/hidden": 1.0, "loss/logits": 0.17416705191135406, "loss/reg": 0.0004506927216425538, "step": 5337 }, { "epoch": 0.66725, "grad_norm": 2.4009838104248047, "grad_norm_var": 0.5334031481807784, "learning_rate": 0.0001, "loss": 1.3453, "loss/crossentropy": 2.468522548675537, "loss/hidden": 1.1328125, "loss/logits": 0.20800238847732544, "loss/reg": 0.000450435618404299, "step": 5338 }, { "epoch": 0.667375, "grad_norm": 2.496230363845825, "grad_norm_var": 0.5202105298485994, "learning_rate": 0.0001, "loss": 1.3204, "loss/crossentropy": 2.461984157562256, "loss/hidden": 1.125, "loss/logits": 0.19091251492500305, "loss/reg": 0.0004502017982304096, "step": 5339 }, { "epoch": 0.6675, "grad_norm": 2.88578724861145, "grad_norm_var": 0.5199980743240596, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.42002272605896, "loss/hidden": 1.09375, "loss/logits": 0.17645590007305145, "loss/reg": 0.000449955346994102, "step": 5340 }, { "epoch": 0.667625, "grad_norm": 2.5179529190063477, "grad_norm_var": 0.49862346086390136, "learning_rate": 0.0001, "loss": 1.2491, "loss/crossentropy": 2.724069833755493, "loss/hidden": 1.0703125, "loss/logits": 0.1742675006389618, "loss/reg": 0.0004497337795328349, "step": 5341 }, { "epoch": 0.66775, "grad_norm": 3.006403923034668, "grad_norm_var": 0.493066923439797, "learning_rate": 0.0001, "loss": 1.6412, "loss/crossentropy": 2.4319851398468018, "loss/hidden": 1.328125, "loss/logits": 0.3085916340351105, "loss/reg": 0.0004494724271353334, "step": 5342 }, { "epoch": 0.667875, "grad_norm": 3.2237625122070312, "grad_norm_var": 0.5069196956480094, "learning_rate": 0.0001, "loss": 1.276, "loss/crossentropy": 2.536109685897827, "loss/hidden": 1.1015625, "loss/logits": 0.16992777585983276, "loss/reg": 0.0004491982108447701, "step": 5343 }, { "epoch": 0.668, "grad_norm": 3.587311029434204, "grad_norm_var": 0.5198088564387753, "learning_rate": 0.0001, "loss": 1.3172, "loss/crossentropy": 2.5548460483551025, "loss/hidden": 1.109375, "loss/logits": 0.20331169664859772, "loss/reg": 0.0004489192506298423, "step": 5344 }, { "epoch": 0.668125, "grad_norm": 3.252134084701538, "grad_norm_var": 0.5219570608221374, "learning_rate": 0.0001, "loss": 1.3556, "loss/crossentropy": 2.6632044315338135, "loss/hidden": 1.1484375, "loss/logits": 0.202676460146904, "loss/reg": 0.0004486379912123084, "step": 5345 }, { "epoch": 0.66825, "grad_norm": 2.9492533206939697, "grad_norm_var": 0.5201174072191094, "learning_rate": 0.0001, "loss": 1.2634, "loss/crossentropy": 2.4222962856292725, "loss/hidden": 1.0859375, "loss/logits": 0.17301438748836517, "loss/reg": 0.00044840501504950225, "step": 5346 }, { "epoch": 0.668375, "grad_norm": 4.640530109405518, "grad_norm_var": 0.6840109312201917, "learning_rate": 0.0001, "loss": 1.5821, "loss/crossentropy": 2.599160671234131, "loss/hidden": 1.3046875, "loss/logits": 0.27296727895736694, "loss/reg": 0.0004481346404645592, "step": 5347 }, { "epoch": 0.6685, "grad_norm": 2.698345422744751, "grad_norm_var": 0.6823557326182936, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.6194424629211426, "loss/hidden": 1.078125, "loss/logits": 0.20128296315670013, "loss/reg": 0.00044790550600737333, "step": 5348 }, { "epoch": 0.668625, "grad_norm": 3.327086925506592, "grad_norm_var": 0.6754484783802042, "learning_rate": 0.0001, "loss": 1.3549, "loss/crossentropy": 2.886118173599243, "loss/hidden": 1.1484375, "loss/logits": 0.2020239233970642, "loss/reg": 0.000447675323812291, "step": 5349 }, { "epoch": 0.66875, "grad_norm": 4.937624454498291, "grad_norm_var": 0.8630707357369971, "learning_rate": 0.0001, "loss": 1.4004, "loss/crossentropy": 2.747856616973877, "loss/hidden": 1.2109375, "loss/logits": 0.18499839305877686, "loss/reg": 0.0004474426677916199, "step": 5350 }, { "epoch": 0.668875, "grad_norm": 3.729525089263916, "grad_norm_var": 0.609272946106276, "learning_rate": 0.0001, "loss": 1.323, "loss/crossentropy": 2.849842071533203, "loss/hidden": 1.1171875, "loss/logits": 0.20133747160434723, "loss/reg": 0.0004471908323466778, "step": 5351 }, { "epoch": 0.669, "grad_norm": 3.46600604057312, "grad_norm_var": 0.5649953957179925, "learning_rate": 0.0001, "loss": 1.429, "loss/crossentropy": 2.536418914794922, "loss/hidden": 1.21875, "loss/logits": 0.20579880475997925, "loss/reg": 0.0004469565465115011, "step": 5352 }, { "epoch": 0.669125, "grad_norm": 4.050957202911377, "grad_norm_var": 0.5430873542704847, "learning_rate": 0.0001, "loss": 1.4068, "loss/crossentropy": 2.8850791454315186, "loss/hidden": 1.140625, "loss/logits": 0.2617546021938324, "loss/reg": 0.0004467372491490096, "step": 5353 }, { "epoch": 0.66925, "grad_norm": 2.706890821456909, "grad_norm_var": 0.5113243896957708, "learning_rate": 0.0001, "loss": 1.3494, "loss/crossentropy": 2.599914789199829, "loss/hidden": 1.1484375, "loss/logits": 0.19653800129890442, "loss/reg": 0.0004465313977561891, "step": 5354 }, { "epoch": 0.669375, "grad_norm": 2.404632329940796, "grad_norm_var": 0.5221811236835407, "learning_rate": 0.0001, "loss": 1.1458, "loss/crossentropy": 2.512092351913452, "loss/hidden": 0.984375, "loss/logits": 0.15696024894714355, "loss/reg": 0.00044635709491558373, "step": 5355 }, { "epoch": 0.6695, "grad_norm": 3.7249858379364014, "grad_norm_var": 0.5157639256703987, "learning_rate": 0.0001, "loss": 1.76, "loss/crossentropy": 1.9872874021530151, "loss/hidden": 1.4921875, "loss/logits": 0.2633439898490906, "loss/reg": 0.0004461316275410354, "step": 5356 }, { "epoch": 0.669625, "grad_norm": 2.4210948944091797, "grad_norm_var": 0.5275988386009393, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.5990381240844727, "loss/hidden": 1.1640625, "loss/logits": 0.24798685312271118, "loss/reg": 0.00044595703366212547, "step": 5357 }, { "epoch": 0.66975, "grad_norm": 2.642622232437134, "grad_norm_var": 0.5541319956201958, "learning_rate": 0.0001, "loss": 1.2973, "loss/crossentropy": 2.129321336746216, "loss/hidden": 1.125, "loss/logits": 0.16786924004554749, "loss/reg": 0.00044572888873517513, "step": 5358 }, { "epoch": 0.669875, "grad_norm": 2.3087058067321777, "grad_norm_var": 0.6231081114801893, "learning_rate": 0.0001, "loss": 1.298, "loss/crossentropy": 2.7680020332336426, "loss/hidden": 1.09375, "loss/logits": 0.19978129863739014, "loss/reg": 0.0004454957088455558, "step": 5359 }, { "epoch": 0.67, "grad_norm": 2.453461170196533, "grad_norm_var": 0.6604741626555399, "learning_rate": 0.0001, "loss": 1.2325, "loss/crossentropy": 2.6278154850006104, "loss/hidden": 1.0390625, "loss/logits": 0.18895316123962402, "loss/reg": 0.0004452932917047292, "step": 5360 }, { "epoch": 0.670125, "grad_norm": 2.490751266479492, "grad_norm_var": 0.6946734581708405, "learning_rate": 0.0001, "loss": 1.2911, "loss/crossentropy": 2.7308547496795654, "loss/hidden": 1.1015625, "loss/logits": 0.18510624766349792, "loss/reg": 0.0004450663982424885, "step": 5361 }, { "epoch": 0.67025, "grad_norm": 3.361971378326416, "grad_norm_var": 0.6923724368182765, "learning_rate": 0.0001, "loss": 1.2636, "loss/crossentropy": 2.6310391426086426, "loss/hidden": 1.09375, "loss/logits": 0.165365070104599, "loss/reg": 0.0004448446852620691, "step": 5362 }, { "epoch": 0.670375, "grad_norm": 3.627227544784546, "grad_norm_var": 0.5633155071029324, "learning_rate": 0.0001, "loss": 1.4242, "loss/crossentropy": 2.675806999206543, "loss/hidden": 1.2109375, "loss/logits": 0.20880906283855438, "loss/reg": 0.0004446312668733299, "step": 5363 }, { "epoch": 0.6705, "grad_norm": 3.2136592864990234, "grad_norm_var": 0.5490863700765725, "learning_rate": 0.0001, "loss": 1.242, "loss/crossentropy": 2.7349441051483154, "loss/hidden": 1.0625, "loss/logits": 0.17501530051231384, "loss/reg": 0.00044443446677178144, "step": 5364 }, { "epoch": 0.670625, "grad_norm": 2.5498502254486084, "grad_norm_var": 0.571516687556531, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.5311403274536133, "loss/hidden": 1.0390625, "loss/logits": 0.15690335631370544, "loss/reg": 0.0004442111530806869, "step": 5365 }, { "epoch": 0.67075, "grad_norm": 2.679511785507202, "grad_norm_var": 0.3461536433646378, "learning_rate": 0.0001, "loss": 1.4345, "loss/crossentropy": 2.4642646312713623, "loss/hidden": 1.21875, "loss/logits": 0.21133987605571747, "loss/reg": 0.0004439998883754015, "step": 5366 }, { "epoch": 0.670875, "grad_norm": 4.421633720397949, "grad_norm_var": 0.4443832559020611, "learning_rate": 0.0001, "loss": 1.5507, "loss/crossentropy": 2.5406763553619385, "loss/hidden": 1.3125, "loss/logits": 0.23374657332897186, "loss/reg": 0.0004437816678546369, "step": 5367 }, { "epoch": 0.671, "grad_norm": 79.3426513671875, "grad_norm_var": 364.65669118827367, "learning_rate": 0.0001, "loss": 1.4568, "loss/crossentropy": 2.500840902328491, "loss/hidden": 1.28125, "loss/logits": 0.1711573451757431, "loss/reg": 0.000443570053903386, "step": 5368 }, { "epoch": 0.671125, "grad_norm": 3.339012622833252, "grad_norm_var": 365.041882134627, "learning_rate": 0.0001, "loss": 1.2011, "loss/crossentropy": 2.8079521656036377, "loss/hidden": 1.03125, "loss/logits": 0.16542333364486694, "loss/reg": 0.0004433691210579127, "step": 5369 }, { "epoch": 0.67125, "grad_norm": 3.3998615741729736, "grad_norm_var": 364.607729441227, "learning_rate": 0.0001, "loss": 1.1737, "loss/crossentropy": 2.556600332260132, "loss/hidden": 1.015625, "loss/logits": 0.15365390479564667, "loss/reg": 0.00044313547550700605, "step": 5370 }, { "epoch": 0.671375, "grad_norm": 3.246858596801758, "grad_norm_var": 364.04911712320217, "learning_rate": 0.0001, "loss": 1.6875, "loss/crossentropy": 2.5674238204956055, "loss/hidden": 1.375, "loss/logits": 0.3080388307571411, "loss/reg": 0.00044291646918281913, "step": 5371 }, { "epoch": 0.6715, "grad_norm": 4.129364013671875, "grad_norm_var": 363.8381959955001, "learning_rate": 0.0001, "loss": 1.4808, "loss/crossentropy": 2.599928379058838, "loss/hidden": 1.25, "loss/logits": 0.22640308737754822, "loss/reg": 0.0004427223466336727, "step": 5372 }, { "epoch": 0.671625, "grad_norm": 2.300924062728882, "grad_norm_var": 363.9261129763709, "learning_rate": 0.0001, "loss": 1.2702, "loss/crossentropy": 1.8088818788528442, "loss/hidden": 1.078125, "loss/logits": 0.18760734796524048, "loss/reg": 0.00044251300278119743, "step": 5373 }, { "epoch": 0.67175, "grad_norm": 2.905777931213379, "grad_norm_var": 363.74792928082013, "learning_rate": 0.0001, "loss": 1.6023, "loss/crossentropy": 2.286130905151367, "loss/hidden": 1.328125, "loss/logits": 0.2697668671607971, "loss/reg": 0.0004423448699526489, "step": 5374 }, { "epoch": 0.671875, "grad_norm": 3.3158631324768066, "grad_norm_var": 363.0657627440716, "learning_rate": 0.0001, "loss": 1.6565, "loss/crossentropy": 1.928989052772522, "loss/hidden": 1.375, "loss/logits": 0.27706336975097656, "loss/reg": 0.00044219486881047487, "step": 5375 }, { "epoch": 0.672, "grad_norm": 3.2706809043884277, "grad_norm_var": 362.51145722013865, "learning_rate": 0.0001, "loss": 1.5819, "loss/crossentropy": 2.4750547409057617, "loss/hidden": 1.3515625, "loss/logits": 0.22589200735092163, "loss/reg": 0.00044205630547367036, "step": 5376 }, { "epoch": 0.672125, "grad_norm": 2.319202423095703, "grad_norm_var": 362.63873244563786, "learning_rate": 0.0001, "loss": 1.2657, "loss/crossentropy": 2.3651175498962402, "loss/hidden": 1.078125, "loss/logits": 0.1831604540348053, "loss/reg": 0.00044191800407133996, "step": 5377 }, { "epoch": 0.67225, "grad_norm": 3.53363299369812, "grad_norm_var": 362.53524188289333, "learning_rate": 0.0001, "loss": 1.5756, "loss/crossentropy": 2.1099536418914795, "loss/hidden": 1.390625, "loss/logits": 0.18053171038627625, "loss/reg": 0.00044177204836159945, "step": 5378 }, { "epoch": 0.672375, "grad_norm": 2.841738700866699, "grad_norm_var": 363.0291260958093, "learning_rate": 0.0001, "loss": 1.4693, "loss/crossentropy": 2.442708969116211, "loss/hidden": 1.25, "loss/logits": 0.21484294533729553, "loss/reg": 0.00044154608622193336, "step": 5379 }, { "epoch": 0.6725, "grad_norm": 3.0274600982666016, "grad_norm_var": 363.14827521807575, "learning_rate": 0.0001, "loss": 1.272, "loss/crossentropy": 2.705087661743164, "loss/hidden": 1.0703125, "loss/logits": 0.19725587964057922, "loss/reg": 0.00044131811591796577, "step": 5380 }, { "epoch": 0.672625, "grad_norm": 3.2435977458953857, "grad_norm_var": 362.682173349106, "learning_rate": 0.0001, "loss": 1.318, "loss/crossentropy": 2.3712284564971924, "loss/hidden": 1.125, "loss/logits": 0.18863721191883087, "loss/reg": 0.00044108941801823676, "step": 5381 }, { "epoch": 0.67275, "grad_norm": 2.5728161334991455, "grad_norm_var": 362.7579679846458, "learning_rate": 0.0001, "loss": 1.4164, "loss/crossentropy": 2.475926399230957, "loss/hidden": 1.21875, "loss/logits": 0.19319912791252136, "loss/reg": 0.00044087052810937166, "step": 5382 }, { "epoch": 0.672875, "grad_norm": 3.6886887550354004, "grad_norm_var": 363.13642426248146, "learning_rate": 0.0001, "loss": 1.6656, "loss/crossentropy": 2.3885841369628906, "loss/hidden": 1.3515625, "loss/logits": 0.3096636235713959, "loss/reg": 0.00044063362292945385, "step": 5383 }, { "epoch": 0.673, "grad_norm": 2.664027214050293, "grad_norm_var": 0.24550004572687362, "learning_rate": 0.0001, "loss": 1.4209, "loss/crossentropy": 2.515713691711426, "loss/hidden": 1.203125, "loss/logits": 0.213333398103714, "loss/reg": 0.00044040716602467, "step": 5384 }, { "epoch": 0.673125, "grad_norm": 3.1485276222229004, "grad_norm_var": 0.24201407884947887, "learning_rate": 0.0001, "loss": 1.4518, "loss/crossentropy": 2.564122438430786, "loss/hidden": 1.21875, "loss/logits": 0.22861117124557495, "loss/reg": 0.0004401783808134496, "step": 5385 }, { "epoch": 0.67325, "grad_norm": 2.9468090534210205, "grad_norm_var": 0.2367629381977089, "learning_rate": 0.0001, "loss": 1.5508, "loss/crossentropy": 2.6194264888763428, "loss/hidden": 1.2734375, "loss/logits": 0.2729443907737732, "loss/reg": 0.0004399404861032963, "step": 5386 }, { "epoch": 0.673375, "grad_norm": 2.0255582332611084, "grad_norm_var": 0.301552765333318, "learning_rate": 0.0001, "loss": 1.1038, "loss/crossentropy": 2.4863121509552, "loss/hidden": 0.94921875, "loss/logits": 0.15017934143543243, "loss/reg": 0.0004396943550091237, "step": 5387 }, { "epoch": 0.6735, "grad_norm": 2.398064374923706, "grad_norm_var": 0.22724527071173928, "learning_rate": 0.0001, "loss": 1.1179, "loss/crossentropy": 2.4138054847717285, "loss/hidden": 0.96875, "loss/logits": 0.1447570025920868, "loss/reg": 0.00043946332880295813, "step": 5388 }, { "epoch": 0.673625, "grad_norm": 3.2087316513061523, "grad_norm_var": 0.20772719981273344, "learning_rate": 0.0001, "loss": 1.4999, "loss/crossentropy": 2.4630112648010254, "loss/hidden": 1.234375, "loss/logits": 0.26112091541290283, "loss/reg": 0.0004392281698528677, "step": 5389 }, { "epoch": 0.67375, "grad_norm": 8.38025951385498, "grad_norm_var": 2.0526221008718255, "learning_rate": 0.0001, "loss": 1.6922, "loss/crossentropy": 2.879201889038086, "loss/hidden": 1.40625, "loss/logits": 0.28158318996429443, "loss/reg": 0.0004389994719531387, "step": 5390 }, { "epoch": 0.673875, "grad_norm": 3.2772390842437744, "grad_norm_var": 2.0525646568673985, "learning_rate": 0.0001, "loss": 1.1585, "loss/crossentropy": 2.5543901920318604, "loss/hidden": 1.0, "loss/logits": 0.15407365560531616, "loss/reg": 0.00043877342250198126, "step": 5391 }, { "epoch": 0.674, "grad_norm": 2.55635929107666, "grad_norm_var": 2.0857422297887274, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.598320722579956, "loss/hidden": 0.96875, "loss/logits": 0.1533835381269455, "loss/reg": 0.0004385084321256727, "step": 5392 }, { "epoch": 0.674125, "grad_norm": 3.983628273010254, "grad_norm_var": 2.0546413197700337, "learning_rate": 0.0001, "loss": 1.7538, "loss/crossentropy": 2.4968719482421875, "loss/hidden": 1.4921875, "loss/logits": 0.25721877813339233, "loss/reg": 0.00043827391345985234, "step": 5393 }, { "epoch": 0.67425, "grad_norm": 2.393146276473999, "grad_norm_var": 2.1070339605735264, "learning_rate": 0.0001, "loss": 1.3354, "loss/crossentropy": 2.3950600624084473, "loss/hidden": 1.125, "loss/logits": 0.2060486376285553, "loss/reg": 0.000438010785728693, "step": 5394 }, { "epoch": 0.674375, "grad_norm": 3.1863231658935547, "grad_norm_var": 2.094673574189259, "learning_rate": 0.0001, "loss": 1.376, "loss/crossentropy": 2.316434860229492, "loss/hidden": 1.1875, "loss/logits": 0.18415965139865875, "loss/reg": 0.0004377415170893073, "step": 5395 }, { "epoch": 0.6745, "grad_norm": 2.7028045654296875, "grad_norm_var": 2.1127914940530594, "learning_rate": 0.0001, "loss": 1.3983, "loss/crossentropy": 2.5944783687591553, "loss/hidden": 1.1640625, "loss/logits": 0.22986876964569092, "loss/reg": 0.0004374673590064049, "step": 5396 }, { "epoch": 0.674625, "grad_norm": 4.658191680908203, "grad_norm_var": 2.2322119560287508, "learning_rate": 0.0001, "loss": 1.6843, "loss/crossentropy": 2.550144910812378, "loss/hidden": 1.3828125, "loss/logits": 0.2971251308917999, "loss/reg": 0.00043724162969738245, "step": 5397 }, { "epoch": 0.67475, "grad_norm": 5.299314498901367, "grad_norm_var": 2.409948317167493, "learning_rate": 0.0001, "loss": 1.7407, "loss/crossentropy": 2.4723591804504395, "loss/hidden": 1.4765625, "loss/logits": 0.2597990930080414, "loss/reg": 0.00043696165084838867, "step": 5398 }, { "epoch": 0.674875, "grad_norm": 5.428415298461914, "grad_norm_var": 2.635377673940337, "learning_rate": 0.0001, "loss": 1.6018, "loss/crossentropy": 2.6283481121063232, "loss/hidden": 1.3984375, "loss/logits": 0.19902899861335754, "loss/reg": 0.00043669139267876744, "step": 5399 }, { "epoch": 0.675, "grad_norm": 3.0214309692382812, "grad_norm_var": 2.5968005961084106, "learning_rate": 0.0001, "loss": 1.4623, "loss/crossentropy": 2.661827564239502, "loss/hidden": 1.234375, "loss/logits": 0.22357907891273499, "loss/reg": 0.000436457950854674, "step": 5400 }, { "epoch": 0.675125, "grad_norm": 2.685438394546509, "grad_norm_var": 2.6419962940686292, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.725126028060913, "loss/hidden": 1.1015625, "loss/logits": 0.1870783269405365, "loss/reg": 0.00043622576049529016, "step": 5401 }, { "epoch": 0.67525, "grad_norm": 3.763516664505005, "grad_norm_var": 2.6088007886160614, "learning_rate": 0.0001, "loss": 1.4291, "loss/crossentropy": 2.46301531791687, "loss/hidden": 1.21875, "loss/logits": 0.20599912106990814, "loss/reg": 0.0004359595477581024, "step": 5402 }, { "epoch": 0.675375, "grad_norm": 4.266615867614746, "grad_norm_var": 2.4266857604311873, "learning_rate": 0.0001, "loss": 1.475, "loss/crossentropy": 2.371729612350464, "loss/hidden": 1.2265625, "loss/logits": 0.24410545825958252, "loss/reg": 0.00043567593093030155, "step": 5403 }, { "epoch": 0.6755, "grad_norm": 2.732332706451416, "grad_norm_var": 2.3700455596922096, "learning_rate": 0.0001, "loss": 1.2142, "loss/crossentropy": 2.6602907180786133, "loss/hidden": 1.0390625, "loss/logits": 0.17077139019966125, "loss/reg": 0.00043542301864363253, "step": 5404 }, { "epoch": 0.675625, "grad_norm": 2.357802391052246, "grad_norm_var": 2.4876582431398333, "learning_rate": 0.0001, "loss": 1.4165, "loss/crossentropy": 2.405435562133789, "loss/hidden": 1.1875, "loss/logits": 0.22465795278549194, "loss/reg": 0.0004351481329649687, "step": 5405 }, { "epoch": 0.67575, "grad_norm": 5.211427688598633, "grad_norm_var": 1.1772117429674835, "learning_rate": 0.0001, "loss": 1.5393, "loss/crossentropy": 2.279665946960449, "loss/hidden": 1.3671875, "loss/logits": 0.16778263449668884, "loss/reg": 0.0004349234513938427, "step": 5406 }, { "epoch": 0.675875, "grad_norm": 3.029139280319214, "grad_norm_var": 1.19157860303397, "learning_rate": 0.0001, "loss": 1.6278, "loss/crossentropy": 2.4744632244110107, "loss/hidden": 1.375, "loss/logits": 0.2484326958656311, "loss/reg": 0.0004346879431977868, "step": 5407 }, { "epoch": 0.676, "grad_norm": 2.6524596214294434, "grad_norm_var": 1.1790428067508212, "learning_rate": 0.0001, "loss": 1.1992, "loss/crossentropy": 2.3964970111846924, "loss/hidden": 1.03125, "loss/logits": 0.16358616948127747, "loss/reg": 0.00043445266783237457, "step": 5408 }, { "epoch": 0.676125, "grad_norm": 3.5723719596862793, "grad_norm_var": 1.1677961711813916, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.5274109840393066, "loss/hidden": 1.1796875, "loss/logits": 0.232657790184021, "loss/reg": 0.00043422140879556537, "step": 5409 }, { "epoch": 0.67625, "grad_norm": 3.408264398574829, "grad_norm_var": 1.074261455871844, "learning_rate": 0.0001, "loss": 1.5806, "loss/crossentropy": 2.3832130432128906, "loss/hidden": 1.34375, "loss/logits": 0.23252132534980774, "loss/reg": 0.0004339804290793836, "step": 5410 }, { "epoch": 0.676375, "grad_norm": 3.5329806804656982, "grad_norm_var": 1.0615658548715088, "learning_rate": 0.0001, "loss": 1.5558, "loss/crossentropy": 2.5562639236450195, "loss/hidden": 1.328125, "loss/logits": 0.22338107228279114, "loss/reg": 0.000433720531873405, "step": 5411 }, { "epoch": 0.6765, "grad_norm": 3.380483865737915, "grad_norm_var": 1.005120596988676, "learning_rate": 0.0001, "loss": 1.9745, "loss/crossentropy": 2.474592685699463, "loss/hidden": 1.609375, "loss/logits": 0.36079204082489014, "loss/reg": 0.00043349593761377037, "step": 5412 }, { "epoch": 0.676625, "grad_norm": 2.4904584884643555, "grad_norm_var": 1.0182555791267305, "learning_rate": 0.0001, "loss": 1.4883, "loss/crossentropy": 2.616684675216675, "loss/hidden": 1.25, "loss/logits": 0.2339901626110077, "loss/reg": 0.00043327719322405756, "step": 5413 }, { "epoch": 0.67675, "grad_norm": 3.071848154067993, "grad_norm_var": 0.8094198131010517, "learning_rate": 0.0001, "loss": 1.564, "loss/crossentropy": 3.032489061355591, "loss/hidden": 1.296875, "loss/logits": 0.2627839148044586, "loss/reg": 0.00043307969463057816, "step": 5414 }, { "epoch": 0.676875, "grad_norm": 3.109337568283081, "grad_norm_var": 0.5223068707101561, "learning_rate": 0.0001, "loss": 1.4766, "loss/crossentropy": 2.219552993774414, "loss/hidden": 1.25, "loss/logits": 0.2222641408443451, "loss/reg": 0.00043284965795464814, "step": 5415 }, { "epoch": 0.677, "grad_norm": 2.311537265777588, "grad_norm_var": 0.5771296895744087, "learning_rate": 0.0001, "loss": 1.2921, "loss/crossentropy": 2.405669927597046, "loss/hidden": 1.109375, "loss/logits": 0.17842912673950195, "loss/reg": 0.0004326407506596297, "step": 5416 }, { "epoch": 0.677125, "grad_norm": 2.751059055328369, "grad_norm_var": 0.5726910830738421, "learning_rate": 0.0001, "loss": 1.2564, "loss/crossentropy": 2.5346100330352783, "loss/hidden": 1.0703125, "loss/logits": 0.18176595866680145, "loss/reg": 0.00043241993989795446, "step": 5417 }, { "epoch": 0.67725, "grad_norm": 3.035167932510376, "grad_norm_var": 0.553802478632146, "learning_rate": 0.0001, "loss": 1.3349, "loss/crossentropy": 2.3653640747070312, "loss/hidden": 1.140625, "loss/logits": 0.1899435967206955, "loss/reg": 0.00043218862265348434, "step": 5418 }, { "epoch": 0.677375, "grad_norm": 2.8122527599334717, "grad_norm_var": 0.47569295497267583, "learning_rate": 0.0001, "loss": 1.2727, "loss/crossentropy": 2.722940683364868, "loss/hidden": 1.078125, "loss/logits": 0.1902797520160675, "loss/reg": 0.0004319427243899554, "step": 5419 }, { "epoch": 0.6775, "grad_norm": 2.8701188564300537, "grad_norm_var": 0.47028691033917364, "learning_rate": 0.0001, "loss": 1.4122, "loss/crossentropy": 2.6840362548828125, "loss/hidden": 1.2109375, "loss/logits": 0.19694039225578308, "loss/reg": 0.00043169158743694425, "step": 5420 }, { "epoch": 0.677625, "grad_norm": 2.552515745162964, "grad_norm_var": 0.45339305797938984, "learning_rate": 0.0001, "loss": 1.4305, "loss/crossentropy": 2.099841594696045, "loss/hidden": 1.1796875, "loss/logits": 0.2465479075908661, "loss/reg": 0.00043145849485881627, "step": 5421 }, { "epoch": 0.67775, "grad_norm": 2.6515355110168457, "grad_norm_var": 0.1463717845677896, "learning_rate": 0.0001, "loss": 1.3278, "loss/crossentropy": 2.4627621173858643, "loss/hidden": 1.1328125, "loss/logits": 0.19069699943065643, "loss/reg": 0.0004311972006689757, "step": 5422 }, { "epoch": 0.677875, "grad_norm": 2.7159440517425537, "grad_norm_var": 0.14927997679944457, "learning_rate": 0.0001, "loss": 1.1232, "loss/crossentropy": 2.5871620178222656, "loss/hidden": 0.953125, "loss/logits": 0.1657763421535492, "loss/reg": 0.0004309428622946143, "step": 5423 }, { "epoch": 0.678, "grad_norm": 2.878547191619873, "grad_norm_var": 0.14403601654562787, "learning_rate": 0.0001, "loss": 1.3504, "loss/crossentropy": 2.6477489471435547, "loss/hidden": 1.1484375, "loss/logits": 0.19764818251132965, "loss/reg": 0.0004307157942093909, "step": 5424 }, { "epoch": 0.678125, "grad_norm": 3.1728551387786865, "grad_norm_var": 0.12067376534269153, "learning_rate": 0.0001, "loss": 1.4109, "loss/crossentropy": 2.4884824752807617, "loss/hidden": 1.1875, "loss/logits": 0.21913091838359833, "loss/reg": 0.00043049242231063545, "step": 5425 }, { "epoch": 0.67825, "grad_norm": 2.401153087615967, "grad_norm_var": 0.11870999160264452, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.6700668334960938, "loss/hidden": 1.0234375, "loss/logits": 0.1819230616092682, "loss/reg": 0.0004302791494410485, "step": 5426 }, { "epoch": 0.678375, "grad_norm": 3.4147253036499023, "grad_norm_var": 0.1089509861347833, "learning_rate": 0.0001, "loss": 1.6903, "loss/crossentropy": 2.578042507171631, "loss/hidden": 1.390625, "loss/logits": 0.29538822174072266, "loss/reg": 0.0004300376458559185, "step": 5427 }, { "epoch": 0.6785, "grad_norm": 2.570852041244507, "grad_norm_var": 0.09278558571356044, "learning_rate": 0.0001, "loss": 1.522, "loss/crossentropy": 2.4294822216033936, "loss/hidden": 1.2734375, "loss/logits": 0.24428033828735352, "loss/reg": 0.0004297937557566911, "step": 5428 }, { "epoch": 0.678625, "grad_norm": 3.3362808227539062, "grad_norm_var": 0.10252026189180938, "learning_rate": 0.0001, "loss": 1.2026, "loss/crossentropy": 2.9162685871124268, "loss/hidden": 1.03125, "loss/logits": 0.1670786738395691, "loss/reg": 0.0004295586550142616, "step": 5429 }, { "epoch": 0.67875, "grad_norm": 2.9991984367370605, "grad_norm_var": 0.10073491500702308, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.748307943344116, "loss/hidden": 1.0546875, "loss/logits": 0.17730626463890076, "loss/reg": 0.0004293296951800585, "step": 5430 }, { "epoch": 0.678875, "grad_norm": 2.889746904373169, "grad_norm_var": 0.09612462668682914, "learning_rate": 0.0001, "loss": 1.3863, "loss/crossentropy": 2.6907846927642822, "loss/hidden": 1.15625, "loss/logits": 0.22571545839309692, "loss/reg": 0.0004290785873308778, "step": 5431 }, { "epoch": 0.679, "grad_norm": 3.103224277496338, "grad_norm_var": 0.08001880167472564, "learning_rate": 0.0001, "loss": 1.3294, "loss/crossentropy": 2.999793767929077, "loss/hidden": 1.125, "loss/logits": 0.20012786984443665, "loss/reg": 0.00042883161222562194, "step": 5432 }, { "epoch": 0.679125, "grad_norm": 2.7626876831054688, "grad_norm_var": 0.07982004734691657, "learning_rate": 0.0001, "loss": 1.184, "loss/crossentropy": 2.759852647781372, "loss/hidden": 1.015625, "loss/logits": 0.16413095593452454, "loss/reg": 0.0004285997129045427, "step": 5433 }, { "epoch": 0.67925, "grad_norm": 2.675163507461548, "grad_norm_var": 0.08073251459825978, "learning_rate": 0.0001, "loss": 1.5603, "loss/crossentropy": 2.3591206073760986, "loss/hidden": 1.328125, "loss/logits": 0.22787362337112427, "loss/reg": 0.00042833772022277117, "step": 5434 }, { "epoch": 0.679375, "grad_norm": 2.940769672393799, "grad_norm_var": 0.08089650183934485, "learning_rate": 0.0001, "loss": 1.315, "loss/crossentropy": 2.3365612030029297, "loss/hidden": 1.125, "loss/logits": 0.1857677400112152, "loss/reg": 0.0004280720022507012, "step": 5435 }, { "epoch": 0.6795, "grad_norm": 3.134026050567627, "grad_norm_var": 0.08521993379715316, "learning_rate": 0.0001, "loss": 1.1928, "loss/crossentropy": 2.852461576461792, "loss/hidden": 1.03125, "loss/logits": 0.1572399139404297, "loss/reg": 0.00042784446850419044, "step": 5436 }, { "epoch": 0.679625, "grad_norm": 2.447585105895996, "grad_norm_var": 0.09059409053401714, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.7245078086853027, "loss/hidden": 1.1015625, "loss/logits": 0.2305382341146469, "loss/reg": 0.00042757834307849407, "step": 5437 }, { "epoch": 0.67975, "grad_norm": 2.933460235595703, "grad_norm_var": 0.08694013189743993, "learning_rate": 0.0001, "loss": 1.1234, "loss/crossentropy": 2.608306407928467, "loss/hidden": 0.97265625, "loss/logits": 0.1464659571647644, "loss/reg": 0.0004272922524251044, "step": 5438 }, { "epoch": 0.679875, "grad_norm": 4.45463752746582, "grad_norm_var": 0.23355671087648489, "learning_rate": 0.0001, "loss": 1.5334, "loss/crossentropy": 2.2235097885131836, "loss/hidden": 1.3046875, "loss/logits": 0.22440658509731293, "loss/reg": 0.0004270592180546373, "step": 5439 }, { "epoch": 0.68, "grad_norm": 2.9124622344970703, "grad_norm_var": 0.233046912642043, "learning_rate": 0.0001, "loss": 1.189, "loss/crossentropy": 2.733024835586548, "loss/hidden": 1.0234375, "loss/logits": 0.1612912118434906, "loss/reg": 0.00042679638136178255, "step": 5440 }, { "epoch": 0.680125, "grad_norm": 2.1599583625793457, "grad_norm_var": 0.27508104531363314, "learning_rate": 0.0001, "loss": 1.2191, "loss/crossentropy": 2.569704055786133, "loss/hidden": 1.03125, "loss/logits": 0.18359801173210144, "loss/reg": 0.00042656168807297945, "step": 5441 }, { "epoch": 0.68025, "grad_norm": 2.9614884853363037, "grad_norm_var": 0.25399858301458667, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.4460577964782715, "loss/hidden": 1.15625, "loss/logits": 0.21306048333644867, "loss/reg": 0.00042630621464923024, "step": 5442 }, { "epoch": 0.680375, "grad_norm": 3.185661792755127, "grad_norm_var": 0.24403172065740364, "learning_rate": 0.0001, "loss": 1.6358, "loss/crossentropy": 2.5314016342163086, "loss/hidden": 1.375, "loss/logits": 0.25649452209472656, "loss/reg": 0.0004260746936779469, "step": 5443 }, { "epoch": 0.6805, "grad_norm": 6.540060520172119, "grad_norm_var": 1.019201370377851, "learning_rate": 0.0001, "loss": 1.9931, "loss/crossentropy": 2.4772446155548096, "loss/hidden": 1.640625, "loss/logits": 0.34821775555610657, "loss/reg": 0.00042583709000609815, "step": 5444 }, { "epoch": 0.680625, "grad_norm": 2.7520055770874023, "grad_norm_var": 1.0310718120279487, "learning_rate": 0.0001, "loss": 1.3657, "loss/crossentropy": 2.6015262603759766, "loss/hidden": 1.140625, "loss/logits": 0.22085824608802795, "loss/reg": 0.0004255745152477175, "step": 5445 }, { "epoch": 0.68075, "grad_norm": 2.5274672508239746, "grad_norm_var": 1.0562423867805022, "learning_rate": 0.0001, "loss": 1.1585, "loss/crossentropy": 2.450608491897583, "loss/hidden": 0.99609375, "loss/logits": 0.15814492106437683, "loss/reg": 0.0004253205261193216, "step": 5446 }, { "epoch": 0.680875, "grad_norm": 12.35372543334961, "grad_norm_var": 6.327314399404917, "learning_rate": 0.0001, "loss": 1.5989, "loss/crossentropy": 2.856175184249878, "loss/hidden": 1.359375, "loss/logits": 0.23524026572704315, "loss/reg": 0.0004250943020451814, "step": 5447 }, { "epoch": 0.681, "grad_norm": 3.3181214332580566, "grad_norm_var": 6.311947342675307, "learning_rate": 0.0001, "loss": 1.3684, "loss/crossentropy": 2.319237470626831, "loss/hidden": 1.1640625, "loss/logits": 0.20008964836597443, "loss/reg": 0.00042486871825531125, "step": 5448 }, { "epoch": 0.681125, "grad_norm": 3.455117702484131, "grad_norm_var": 6.250418860549621, "learning_rate": 0.0001, "loss": 1.9791, "loss/crossentropy": 2.521967887878418, "loss/hidden": 1.640625, "loss/logits": 0.33426809310913086, "loss/reg": 0.00042464458965696394, "step": 5449 }, { "epoch": 0.68125, "grad_norm": 2.545179605484009, "grad_norm_var": 6.270917293768563, "learning_rate": 0.0001, "loss": 1.2881, "loss/crossentropy": 2.3981826305389404, "loss/hidden": 1.109375, "loss/logits": 0.17447076737880707, "loss/reg": 0.0004244238662067801, "step": 5450 }, { "epoch": 0.681375, "grad_norm": 2.623030424118042, "grad_norm_var": 6.313156640153352, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.3044955730438232, "loss/hidden": 1.046875, "loss/logits": 0.22454717755317688, "loss/reg": 0.00042418212979100645, "step": 5451 }, { "epoch": 0.6815, "grad_norm": 2.5895488262176514, "grad_norm_var": 6.377782230491872, "learning_rate": 0.0001, "loss": 1.3216, "loss/crossentropy": 2.6253139972686768, "loss/hidden": 1.1171875, "loss/logits": 0.2001282423734665, "loss/reg": 0.0004239593108650297, "step": 5452 }, { "epoch": 0.681625, "grad_norm": 2.568340301513672, "grad_norm_var": 6.357965814984018, "learning_rate": 0.0001, "loss": 1.2592, "loss/crossentropy": 2.530377149581909, "loss/hidden": 1.0859375, "loss/logits": 0.16903427243232727, "loss/reg": 0.0004237276443745941, "step": 5453 }, { "epoch": 0.68175, "grad_norm": 2.6254935264587402, "grad_norm_var": 6.39711519045131, "learning_rate": 0.0001, "loss": 1.594, "loss/crossentropy": 2.4893639087677, "loss/hidden": 1.3203125, "loss/logits": 0.2694794237613678, "loss/reg": 0.0004234940279275179, "step": 5454 }, { "epoch": 0.681875, "grad_norm": 2.6455984115600586, "grad_norm_var": 6.425244119565044, "learning_rate": 0.0001, "loss": 1.355, "loss/crossentropy": 2.6694252490997314, "loss/hidden": 1.140625, "loss/logits": 0.21016961336135864, "loss/reg": 0.0004232541541568935, "step": 5455 }, { "epoch": 0.682, "grad_norm": 2.5469460487365723, "grad_norm_var": 6.467599025003724, "learning_rate": 0.0001, "loss": 1.4092, "loss/crossentropy": 2.508185863494873, "loss/hidden": 1.1640625, "loss/logits": 0.24086111783981323, "loss/reg": 0.0004230269114486873, "step": 5456 }, { "epoch": 0.682125, "grad_norm": 2.6017539501190186, "grad_norm_var": 6.395715411155684, "learning_rate": 0.0001, "loss": 1.2991, "loss/crossentropy": 2.2846646308898926, "loss/hidden": 1.125, "loss/logits": 0.1698942631483078, "loss/reg": 0.00042281096102669835, "step": 5457 }, { "epoch": 0.68225, "grad_norm": 3.1235287189483643, "grad_norm_var": 6.38323774363394, "learning_rate": 0.0001, "loss": 1.3224, "loss/crossentropy": 2.899484872817993, "loss/hidden": 1.1171875, "loss/logits": 0.20099115371704102, "loss/reg": 0.0004225989105179906, "step": 5458 }, { "epoch": 0.682375, "grad_norm": 3.7674739360809326, "grad_norm_var": 6.370305030518647, "learning_rate": 0.0001, "loss": 1.4126, "loss/crossentropy": 2.570025682449341, "loss/hidden": 1.171875, "loss/logits": 0.23649919033050537, "loss/reg": 0.00042239928734488785, "step": 5459 }, { "epoch": 0.6825, "grad_norm": 2.8423941135406494, "grad_norm_var": 5.805638134164261, "learning_rate": 0.0001, "loss": 1.292, "loss/crossentropy": 2.1048660278320312, "loss/hidden": 1.109375, "loss/logits": 0.17837616801261902, "loss/reg": 0.00042220312752760947, "step": 5460 }, { "epoch": 0.682625, "grad_norm": 3.0086936950683594, "grad_norm_var": 5.7865395218266125, "learning_rate": 0.0001, "loss": 1.3194, "loss/crossentropy": 2.5657637119293213, "loss/hidden": 1.125, "loss/logits": 0.1901462972164154, "loss/reg": 0.00042201403994113207, "step": 5461 }, { "epoch": 0.68275, "grad_norm": 2.283416271209717, "grad_norm_var": 5.820164295759216, "learning_rate": 0.0001, "loss": 1.2373, "loss/crossentropy": 2.5785443782806396, "loss/hidden": 1.046875, "loss/logits": 0.1862044483423233, "loss/reg": 0.000421845936216414, "step": 5462 }, { "epoch": 0.682875, "grad_norm": 3.1286489963531494, "grad_norm_var": 0.1641798910339721, "learning_rate": 0.0001, "loss": 1.2571, "loss/crossentropy": 2.5902292728424072, "loss/hidden": 1.078125, "loss/logits": 0.17476552724838257, "loss/reg": 0.0004216840898152441, "step": 5463 }, { "epoch": 0.683, "grad_norm": 4.20621395111084, "grad_norm_var": 0.26836314074565354, "learning_rate": 0.0001, "loss": 1.219, "loss/crossentropy": 2.900623321533203, "loss/hidden": 1.0703125, "loss/logits": 0.14443746209144592, "loss/reg": 0.0004215455846861005, "step": 5464 }, { "epoch": 0.683125, "grad_norm": 2.936481237411499, "grad_norm_var": 0.24748486248742338, "learning_rate": 0.0001, "loss": 1.3554, "loss/crossentropy": 2.7692065238952637, "loss/hidden": 1.140625, "loss/logits": 0.21058803796768188, "loss/reg": 0.00042132349335588515, "step": 5465 }, { "epoch": 0.68325, "grad_norm": 2.7394216060638428, "grad_norm_var": 0.24123180244091955, "learning_rate": 0.0001, "loss": 1.4609, "loss/crossentropy": 2.73134446144104, "loss/hidden": 1.234375, "loss/logits": 0.2223510891199112, "loss/reg": 0.0004211013438180089, "step": 5466 }, { "epoch": 0.683375, "grad_norm": 3.3720974922180176, "grad_norm_var": 0.2496557219376039, "learning_rate": 0.0001, "loss": 1.3189, "loss/crossentropy": 2.76102352142334, "loss/hidden": 1.1171875, "loss/logits": 0.19749626517295837, "loss/reg": 0.00042087867041118443, "step": 5467 }, { "epoch": 0.6835, "grad_norm": 3.1647143363952637, "grad_norm_var": 0.24371460474388465, "learning_rate": 0.0001, "loss": 1.6191, "loss/crossentropy": 2.3441827297210693, "loss/hidden": 1.375, "loss/logits": 0.23989973962306976, "loss/reg": 0.0004206575104035437, "step": 5468 }, { "epoch": 0.683625, "grad_norm": 3.521568775177002, "grad_norm_var": 0.24912768567356972, "learning_rate": 0.0001, "loss": 1.2807, "loss/crossentropy": 2.4562556743621826, "loss/hidden": 1.109375, "loss/logits": 0.16708973050117493, "loss/reg": 0.0004204510187264532, "step": 5469 }, { "epoch": 0.68375, "grad_norm": 2.6003687381744385, "grad_norm_var": 0.2505294362528796, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.6166062355041504, "loss/hidden": 0.9921875, "loss/logits": 0.15218926966190338, "loss/reg": 0.00042023436981253326, "step": 5470 }, { "epoch": 0.683875, "grad_norm": 2.390033483505249, "grad_norm_var": 0.267729983822111, "learning_rate": 0.0001, "loss": 1.3709, "loss/crossentropy": 2.333648443222046, "loss/hidden": 1.1640625, "loss/logits": 0.2026568353176117, "loss/reg": 0.00042002202826552093, "step": 5471 }, { "epoch": 0.684, "grad_norm": 4.188377857208252, "grad_norm_var": 0.3337718982552168, "learning_rate": 0.0001, "loss": 1.7483, "loss/crossentropy": 2.454590082168579, "loss/hidden": 1.5, "loss/logits": 0.24408692121505737, "loss/reg": 0.00041981201502494514, "step": 5472 }, { "epoch": 0.684125, "grad_norm": 2.682875394821167, "grad_norm_var": 0.32860803622126367, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.6603896617889404, "loss/hidden": 1.1484375, "loss/logits": 0.18888312578201294, "loss/reg": 0.0004195940273348242, "step": 5473 }, { "epoch": 0.68425, "grad_norm": 2.6177210807800293, "grad_norm_var": 0.34451318432244593, "learning_rate": 0.0001, "loss": 1.2175, "loss/crossentropy": 2.6436362266540527, "loss/hidden": 1.0390625, "loss/logits": 0.17423918843269348, "loss/reg": 0.0004193309578113258, "step": 5474 }, { "epoch": 0.684375, "grad_norm": 3.314690113067627, "grad_norm_var": 0.31646623244470573, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.500065326690674, "loss/hidden": 1.140625, "loss/logits": 0.18342848122119904, "loss/reg": 0.00041911049629561603, "step": 5475 }, { "epoch": 0.6845, "grad_norm": 2.4798214435577393, "grad_norm_var": 0.33531610260034933, "learning_rate": 0.0001, "loss": 1.2543, "loss/crossentropy": 2.4932732582092285, "loss/hidden": 1.0625, "loss/logits": 0.1876411736011505, "loss/reg": 0.00041889818385243416, "step": 5476 }, { "epoch": 0.684625, "grad_norm": 2.2866921424865723, "grad_norm_var": 0.3708810386676189, "learning_rate": 0.0001, "loss": 1.1107, "loss/crossentropy": 2.726917266845703, "loss/hidden": 0.9453125, "loss/logits": 0.1611875295639038, "loss/reg": 0.00041869276901707053, "step": 5477 }, { "epoch": 0.68475, "grad_norm": 3.3589437007904053, "grad_norm_var": 0.341196240887272, "learning_rate": 0.0001, "loss": 1.4292, "loss/crossentropy": 2.3774776458740234, "loss/hidden": 1.203125, "loss/logits": 0.22188818454742432, "loss/reg": 0.0004184679710306227, "step": 5478 }, { "epoch": 0.684875, "grad_norm": 3.441188335418701, "grad_norm_var": 0.3500873564512306, "learning_rate": 0.0001, "loss": 1.1873, "loss/crossentropy": 2.8608152866363525, "loss/hidden": 1.015625, "loss/logits": 0.1675388216972351, "loss/reg": 0.00041824622894637287, "step": 5479 }, { "epoch": 0.685, "grad_norm": 2.6225075721740723, "grad_norm_var": 0.26931284506677167, "learning_rate": 0.0001, "loss": 1.2291, "loss/crossentropy": 2.2235870361328125, "loss/hidden": 1.0703125, "loss/logits": 0.15462011098861694, "loss/reg": 0.0004180260293651372, "step": 5480 }, { "epoch": 0.685125, "grad_norm": 2.3669610023498535, "grad_norm_var": 0.29306755909984283, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.527008295059204, "loss/hidden": 1.125, "loss/logits": 0.18053549528121948, "loss/reg": 0.00041779284947551787, "step": 5481 }, { "epoch": 0.68525, "grad_norm": 3.223559617996216, "grad_norm_var": 0.2943335707714633, "learning_rate": 0.0001, "loss": 1.3689, "loss/crossentropy": 2.4829256534576416, "loss/hidden": 1.1640625, "loss/logits": 0.2006581723690033, "loss/reg": 0.00041759369196370244, "step": 5482 }, { "epoch": 0.685375, "grad_norm": 2.3329198360443115, "grad_norm_var": 0.3070842254664586, "learning_rate": 0.0001, "loss": 1.2064, "loss/crossentropy": 2.345205783843994, "loss/hidden": 1.03125, "loss/logits": 0.17092736065387726, "loss/reg": 0.0004173771885689348, "step": 5483 }, { "epoch": 0.6855, "grad_norm": 5.2479681968688965, "grad_norm_var": 0.6485102614667727, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 3.0325088500976562, "loss/hidden": 1.3671875, "loss/logits": 0.2451724410057068, "loss/reg": 0.0004171617329120636, "step": 5484 }, { "epoch": 0.685625, "grad_norm": 2.9830245971679688, "grad_norm_var": 0.632220099921782, "learning_rate": 0.0001, "loss": 1.1924, "loss/crossentropy": 2.543621063232422, "loss/hidden": 1.0078125, "loss/logits": 0.18039390444755554, "loss/reg": 0.000416945549659431, "step": 5485 }, { "epoch": 0.68575, "grad_norm": 2.622441291809082, "grad_norm_var": 0.6310491124809602, "learning_rate": 0.0001, "loss": 1.1785, "loss/crossentropy": 2.3108136653900146, "loss/hidden": 1.015625, "loss/logits": 0.15875503420829773, "loss/reg": 0.0004167147562839091, "step": 5486 }, { "epoch": 0.685875, "grad_norm": 2.662191867828369, "grad_norm_var": 0.613181909906357, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.491955280303955, "loss/hidden": 0.98046875, "loss/logits": 0.15946689248085022, "loss/reg": 0.00041646038880571723, "step": 5487 }, { "epoch": 0.686, "grad_norm": 2.337700366973877, "grad_norm_var": 0.5406649555997639, "learning_rate": 0.0001, "loss": 1.3717, "loss/crossentropy": 2.279362201690674, "loss/hidden": 1.171875, "loss/logits": 0.19563224911689758, "loss/reg": 0.00041625264566391706, "step": 5488 }, { "epoch": 0.686125, "grad_norm": 2.6881520748138428, "grad_norm_var": 0.540505968125986, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.4910645484924316, "loss/hidden": 1.15625, "loss/logits": 0.19574174284934998, "loss/reg": 0.00041601029806770384, "step": 5489 }, { "epoch": 0.68625, "grad_norm": 3.6895298957824707, "grad_norm_var": 0.5702988605943932, "learning_rate": 0.0001, "loss": 1.8018, "loss/crossentropy": 2.0317025184631348, "loss/hidden": 1.5546875, "loss/logits": 0.24294427037239075, "loss/reg": 0.00041575098293833435, "step": 5490 }, { "epoch": 0.686375, "grad_norm": 2.865417242050171, "grad_norm_var": 0.562784010978073, "learning_rate": 0.0001, "loss": 1.4425, "loss/crossentropy": 2.8348467350006104, "loss/hidden": 1.203125, "loss/logits": 0.23524519801139832, "loss/reg": 0.0004155354399699718, "step": 5491 }, { "epoch": 0.6865, "grad_norm": 2.753690004348755, "grad_norm_var": 0.5502822263978099, "learning_rate": 0.0001, "loss": 1.3537, "loss/crossentropy": 2.572721481323242, "loss/hidden": 1.125, "loss/logits": 0.22449779510498047, "loss/reg": 0.0004153063928242773, "step": 5492 }, { "epoch": 0.686625, "grad_norm": 5.790607929229736, "grad_norm_var": 0.999471234685915, "learning_rate": 0.0001, "loss": 1.7403, "loss/crossentropy": 2.3601672649383545, "loss/hidden": 1.46875, "loss/logits": 0.2674051523208618, "loss/reg": 0.0004150601744186133, "step": 5493 }, { "epoch": 0.68675, "grad_norm": 3.1955373287200928, "grad_norm_var": 0.9973867850247785, "learning_rate": 0.0001, "loss": 1.4234, "loss/crossentropy": 2.3595571517944336, "loss/hidden": 1.2109375, "loss/logits": 0.20833662152290344, "loss/reg": 0.0004148033040110022, "step": 5494 }, { "epoch": 0.686875, "grad_norm": 3.445220708847046, "grad_norm_var": 0.9975301311623807, "learning_rate": 0.0001, "loss": 1.4556, "loss/crossentropy": 2.501699447631836, "loss/hidden": 1.234375, "loss/logits": 0.21708932518959045, "loss/reg": 0.00041455161408521235, "step": 5495 }, { "epoch": 0.687, "grad_norm": 2.30682373046875, "grad_norm_var": 1.027085865610513, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.1869280338287354, "loss/hidden": 1.125, "loss/logits": 0.18544495105743408, "loss/reg": 0.0004142920079175383, "step": 5496 }, { "epoch": 0.687125, "grad_norm": 2.854933738708496, "grad_norm_var": 0.9905669045040002, "learning_rate": 0.0001, "loss": 1.2284, "loss/crossentropy": 2.7007081508636475, "loss/hidden": 1.046875, "loss/logits": 0.17736028134822845, "loss/reg": 0.00041407649405300617, "step": 5497 }, { "epoch": 0.68725, "grad_norm": 3.8383750915527344, "grad_norm_var": 1.017149227769004, "learning_rate": 0.0001, "loss": 1.5534, "loss/crossentropy": 2.6087379455566406, "loss/hidden": 1.3125, "loss/logits": 0.23676510155200958, "loss/reg": 0.00041383475763723254, "step": 5498 }, { "epoch": 0.687375, "grad_norm": 3.9282705783843994, "grad_norm_var": 0.9862700713885791, "learning_rate": 0.0001, "loss": 1.6175, "loss/crossentropy": 2.183044672012329, "loss/hidden": 1.390625, "loss/logits": 0.22269654273986816, "loss/reg": 0.00041358169983141124, "step": 5499 }, { "epoch": 0.6875, "grad_norm": 3.253253221511841, "grad_norm_var": 0.7236784084094247, "learning_rate": 0.0001, "loss": 1.2394, "loss/crossentropy": 2.5703794956207275, "loss/hidden": 1.0546875, "loss/logits": 0.18060234189033508, "loss/reg": 0.0004133678739890456, "step": 5500 }, { "epoch": 0.687625, "grad_norm": 3.0988667011260986, "grad_norm_var": 0.721151158133182, "learning_rate": 0.0001, "loss": 1.334, "loss/crossentropy": 2.4942405223846436, "loss/hidden": 1.140625, "loss/logits": 0.1891992837190628, "loss/reg": 0.00041315105045214295, "step": 5501 }, { "epoch": 0.68775, "grad_norm": 2.530358076095581, "grad_norm_var": 0.7288727771724682, "learning_rate": 0.0001, "loss": 1.3136, "loss/crossentropy": 2.5047006607055664, "loss/hidden": 1.109375, "loss/logits": 0.20007270574569702, "loss/reg": 0.0004129312583245337, "step": 5502 }, { "epoch": 0.687875, "grad_norm": 2.484694719314575, "grad_norm_var": 0.7436273567497277, "learning_rate": 0.0001, "loss": 1.2271, "loss/crossentropy": 2.7548744678497314, "loss/hidden": 1.046875, "loss/logits": 0.17608337104320526, "loss/reg": 0.000412695633713156, "step": 5503 }, { "epoch": 0.688, "grad_norm": 3.576840400695801, "grad_norm_var": 0.6985569911005844, "learning_rate": 0.0001, "loss": 1.5269, "loss/crossentropy": 2.3654022216796875, "loss/hidden": 1.265625, "loss/logits": 0.257143497467041, "loss/reg": 0.00041248893830925226, "step": 5504 }, { "epoch": 0.688125, "grad_norm": 4.303353309631348, "grad_norm_var": 0.7365663240407512, "learning_rate": 0.0001, "loss": 1.3121, "loss/crossentropy": 2.4240882396698, "loss/hidden": 1.1328125, "loss/logits": 0.17513635754585266, "loss/reg": 0.0004122876562178135, "step": 5505 }, { "epoch": 0.68825, "grad_norm": 2.81614351272583, "grad_norm_var": 0.7470010512050121, "learning_rate": 0.0001, "loss": 1.6824, "loss/crossentropy": 2.077568769454956, "loss/hidden": 1.359375, "loss/logits": 0.31894204020500183, "loss/reg": 0.00041208736365661025, "step": 5506 }, { "epoch": 0.688375, "grad_norm": 3.193471908569336, "grad_norm_var": 0.7340557395323979, "learning_rate": 0.0001, "loss": 1.3512, "loss/crossentropy": 2.7577896118164062, "loss/hidden": 1.140625, "loss/logits": 0.20648784935474396, "loss/reg": 0.00041187246097251773, "step": 5507 }, { "epoch": 0.6885, "grad_norm": 3.341780424118042, "grad_norm_var": 0.7100385054110385, "learning_rate": 0.0001, "loss": 1.4471, "loss/crossentropy": 2.6314797401428223, "loss/hidden": 1.1484375, "loss/logits": 0.2945113182067871, "loss/reg": 0.0004116392519790679, "step": 5508 }, { "epoch": 0.688625, "grad_norm": 3.1299052238464355, "grad_norm_var": 0.294615781568397, "learning_rate": 0.0001, "loss": 1.3364, "loss/crossentropy": 2.424250364303589, "loss/hidden": 1.1484375, "loss/logits": 0.1838647872209549, "loss/reg": 0.0004114351759199053, "step": 5509 }, { "epoch": 0.68875, "grad_norm": 2.666785955429077, "grad_norm_var": 0.3128350853488579, "learning_rate": 0.0001, "loss": 1.3099, "loss/crossentropy": 2.3799023628234863, "loss/hidden": 1.1328125, "loss/logits": 0.17293982207775116, "loss/reg": 0.0004111995513085276, "step": 5510 }, { "epoch": 0.688875, "grad_norm": 3.9698190689086914, "grad_norm_var": 0.3490714623819388, "learning_rate": 0.0001, "loss": 1.3108, "loss/crossentropy": 2.8301498889923096, "loss/hidden": 1.1328125, "loss/logits": 0.17391088604927063, "loss/reg": 0.00041099879308603704, "step": 5511 }, { "epoch": 0.689, "grad_norm": 2.326845169067383, "grad_norm_var": 0.34669653014959917, "learning_rate": 0.0001, "loss": 1.2346, "loss/crossentropy": 2.4966092109680176, "loss/hidden": 1.0546875, "loss/logits": 0.1758403182029724, "loss/reg": 0.00041076287743635476, "step": 5512 }, { "epoch": 0.689125, "grad_norm": 2.6217610836029053, "grad_norm_var": 0.3610435507254503, "learning_rate": 0.0001, "loss": 1.3184, "loss/crossentropy": 2.503147840499878, "loss/hidden": 1.125, "loss/logits": 0.18934360146522522, "loss/reg": 0.00041052643791772425, "step": 5513 }, { "epoch": 0.68925, "grad_norm": 3.004610061645508, "grad_norm_var": 0.33269387707024783, "learning_rate": 0.0001, "loss": 1.3304, "loss/crossentropy": 2.6351006031036377, "loss/hidden": 1.125, "loss/logits": 0.20129269361495972, "loss/reg": 0.0004102734092157334, "step": 5514 }, { "epoch": 0.689375, "grad_norm": 2.361711025238037, "grad_norm_var": 0.32151421370998773, "learning_rate": 0.0001, "loss": 1.2144, "loss/crossentropy": 2.5194931030273438, "loss/hidden": 1.0234375, "loss/logits": 0.1868700087070465, "loss/reg": 0.00041001802310347557, "step": 5515 }, { "epoch": 0.6895, "grad_norm": 2.5887155532836914, "grad_norm_var": 0.33044217111746926, "learning_rate": 0.0001, "loss": 1.3132, "loss/crossentropy": 2.4275314807891846, "loss/hidden": 1.1328125, "loss/logits": 0.1762542873620987, "loss/reg": 0.0004097373748663813, "step": 5516 }, { "epoch": 0.689625, "grad_norm": 3.8031439781188965, "grad_norm_var": 0.3706345980481745, "learning_rate": 0.0001, "loss": 1.7691, "loss/crossentropy": 2.10662579536438, "loss/hidden": 1.4453125, "loss/logits": 0.31966444849967957, "loss/reg": 0.0004095205513294786, "step": 5517 }, { "epoch": 0.68975, "grad_norm": 2.352830410003662, "grad_norm_var": 0.3847860202981186, "learning_rate": 0.0001, "loss": 1.2437, "loss/crossentropy": 2.8105781078338623, "loss/hidden": 1.0625, "loss/logits": 0.17711596190929413, "loss/reg": 0.0004092364979442209, "step": 5518 }, { "epoch": 0.689875, "grad_norm": 2.987183094024658, "grad_norm_var": 0.3637709787725932, "learning_rate": 0.0001, "loss": 1.4502, "loss/crossentropy": 2.601163625717163, "loss/hidden": 1.203125, "loss/logits": 0.2429393082857132, "loss/reg": 0.00040902261389419436, "step": 5519 }, { "epoch": 0.69, "grad_norm": 2.909416437149048, "grad_norm_var": 0.34609054808285294, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.4843389987945557, "loss/hidden": 1.0859375, "loss/logits": 0.18466702103614807, "loss/reg": 0.0004087655106559396, "step": 5520 }, { "epoch": 0.690125, "grad_norm": 3.220181465148926, "grad_norm_var": 0.23459255815825203, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.482584238052368, "loss/hidden": 1.3671875, "loss/logits": 0.24033337831497192, "loss/reg": 0.0004085048276465386, "step": 5521 }, { "epoch": 0.69025, "grad_norm": 3.095963478088379, "grad_norm_var": 0.23427226123909578, "learning_rate": 0.0001, "loss": 1.4991, "loss/crossentropy": 2.2297990322113037, "loss/hidden": 1.2578125, "loss/logits": 0.23723135888576508, "loss/reg": 0.00040829050703905523, "step": 5522 }, { "epoch": 0.690375, "grad_norm": 4.246294021606445, "grad_norm_var": 0.3344447061261803, "learning_rate": 0.0001, "loss": 1.4159, "loss/crossentropy": 2.559274911880493, "loss/hidden": 1.203125, "loss/logits": 0.20873601734638214, "loss/reg": 0.0004080451326444745, "step": 5523 }, { "epoch": 0.6905, "grad_norm": 3.926528215408325, "grad_norm_var": 0.3794076633254832, "learning_rate": 0.0001, "loss": 1.7014, "loss/crossentropy": 2.418762683868408, "loss/hidden": 1.453125, "loss/logits": 0.24418139457702637, "loss/reg": 0.00040783605072647333, "step": 5524 }, { "epoch": 0.690625, "grad_norm": 2.3276751041412354, "grad_norm_var": 0.41383628182922294, "learning_rate": 0.0001, "loss": 1.2141, "loss/crossentropy": 2.432154655456543, "loss/hidden": 1.0390625, "loss/logits": 0.17100711166858673, "loss/reg": 0.0004075978649780154, "step": 5525 }, { "epoch": 0.69075, "grad_norm": 2.544187068939209, "grad_norm_var": 0.42064090875234494, "learning_rate": 0.0001, "loss": 1.3548, "loss/crossentropy": 2.8702738285064697, "loss/hidden": 1.140625, "loss/logits": 0.2100559026002884, "loss/reg": 0.0004073545860592276, "step": 5526 }, { "epoch": 0.690875, "grad_norm": 2.3339600563049316, "grad_norm_var": 0.38027203513504354, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.5530083179473877, "loss/hidden": 1.1953125, "loss/logits": 0.2312021553516388, "loss/reg": 0.00040712661575526, "step": 5527 }, { "epoch": 0.691, "grad_norm": 2.4631497859954834, "grad_norm_var": 0.3707316219086444, "learning_rate": 0.0001, "loss": 1.403, "loss/crossentropy": 2.413064479827881, "loss/hidden": 1.171875, "loss/logits": 0.22701285779476166, "loss/reg": 0.0004069133719895035, "step": 5528 }, { "epoch": 0.691125, "grad_norm": 2.9071743488311768, "grad_norm_var": 0.3643133102115691, "learning_rate": 0.0001, "loss": 1.4118, "loss/crossentropy": 2.6901044845581055, "loss/hidden": 1.171875, "loss/logits": 0.23582980036735535, "loss/reg": 0.00040670367889106274, "step": 5529 }, { "epoch": 0.69125, "grad_norm": 2.4783332347869873, "grad_norm_var": 0.3772335787501021, "learning_rate": 0.0001, "loss": 1.4898, "loss/crossentropy": 2.5630013942718506, "loss/hidden": 1.2421875, "loss/logits": 0.24354460835456848, "loss/reg": 0.0004064767272211611, "step": 5530 }, { "epoch": 0.691375, "grad_norm": 2.9825706481933594, "grad_norm_var": 0.35600730038185396, "learning_rate": 0.0001, "loss": 1.3988, "loss/crossentropy": 2.5326037406921387, "loss/hidden": 1.1640625, "loss/logits": 0.2306891679763794, "loss/reg": 0.00040624645771458745, "step": 5531 }, { "epoch": 0.6915, "grad_norm": 2.566065788269043, "grad_norm_var": 0.35712426057374763, "learning_rate": 0.0001, "loss": 1.2663, "loss/crossentropy": 2.5791549682617188, "loss/hidden": 1.09375, "loss/logits": 0.16846491396427155, "loss/reg": 0.000406035193009302, "step": 5532 }, { "epoch": 0.691625, "grad_norm": 6.77375602722168, "grad_norm_var": 1.2479424128649412, "learning_rate": 0.0001, "loss": 1.3522, "loss/crossentropy": 2.3890557289123535, "loss/hidden": 1.1875, "loss/logits": 0.16062113642692566, "loss/reg": 0.0004057992191519588, "step": 5533 }, { "epoch": 0.69175, "grad_norm": 3.119889736175537, "grad_norm_var": 1.2050060262401294, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.757611036300659, "loss/hidden": 1.0234375, "loss/logits": 0.1932968944311142, "loss/reg": 0.000405587546993047, "step": 5534 }, { "epoch": 0.691875, "grad_norm": 3.1654250621795654, "grad_norm_var": 1.2024057963967851, "learning_rate": 0.0001, "loss": 1.3312, "loss/crossentropy": 2.4472551345825195, "loss/hidden": 1.125, "loss/logits": 0.20215316116809845, "loss/reg": 0.000405383063480258, "step": 5535 }, { "epoch": 0.692, "grad_norm": 2.650033950805664, "grad_norm_var": 1.216359009505579, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.7487049102783203, "loss/hidden": 0.9765625, "loss/logits": 0.16899126768112183, "loss/reg": 0.00040516204899176955, "step": 5536 }, { "epoch": 0.692125, "grad_norm": 2.901576280593872, "grad_norm_var": 1.2207871527802068, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.5277833938598633, "loss/hidden": 1.2421875, "loss/logits": 0.24098409712314606, "loss/reg": 0.00040495983557775617, "step": 5537 }, { "epoch": 0.69225, "grad_norm": 2.3175954818725586, "grad_norm_var": 1.2647969039006264, "learning_rate": 0.0001, "loss": 1.211, "loss/crossentropy": 2.690112352371216, "loss/hidden": 1.0390625, "loss/logits": 0.16784214973449707, "loss/reg": 0.00040477141737937927, "step": 5538 }, { "epoch": 0.692375, "grad_norm": 2.4726054668426514, "grad_norm_var": 1.1918713239302472, "learning_rate": 0.0001, "loss": 1.2394, "loss/crossentropy": 2.6067423820495605, "loss/hidden": 1.0625, "loss/logits": 0.17287321388721466, "loss/reg": 0.0004045840760227293, "step": 5539 }, { "epoch": 0.6925, "grad_norm": 2.426508903503418, "grad_norm_var": 1.146323483143853, "learning_rate": 0.0001, "loss": 1.2193, "loss/crossentropy": 2.544769048690796, "loss/hidden": 1.0390625, "loss/logits": 0.17616134881973267, "loss/reg": 0.00040439044823870063, "step": 5540 }, { "epoch": 0.692625, "grad_norm": 11.014348030090332, "grad_norm_var": 5.19737813501619, "learning_rate": 0.0001, "loss": 1.6129, "loss/crossentropy": 2.460659980773926, "loss/hidden": 1.3984375, "loss/logits": 0.21044471859931946, "loss/reg": 0.0004042104119434953, "step": 5541 }, { "epoch": 0.69275, "grad_norm": 3.49969220161438, "grad_norm_var": 5.139698285183154, "learning_rate": 0.0001, "loss": 1.5803, "loss/crossentropy": 2.550494909286499, "loss/hidden": 1.3671875, "loss/logits": 0.20907960832118988, "loss/reg": 0.0004040282219648361, "step": 5542 }, { "epoch": 0.692875, "grad_norm": 2.433881998062134, "grad_norm_var": 5.124726722901671, "learning_rate": 0.0001, "loss": 1.4534, "loss/crossentropy": 2.5615029335021973, "loss/hidden": 1.2109375, "loss/logits": 0.238406240940094, "loss/reg": 0.00040381166036240757, "step": 5543 }, { "epoch": 0.693, "grad_norm": 15.42405891418457, "grad_norm_var": 13.813353850711579, "learning_rate": 0.0001, "loss": 2.2892, "loss/crossentropy": 2.8580057621002197, "loss/hidden": 1.8125, "loss/logits": 0.4726327657699585, "loss/reg": 0.0004036256286781281, "step": 5544 }, { "epoch": 0.693125, "grad_norm": 4.025717258453369, "grad_norm_var": 13.680716522901294, "learning_rate": 0.0001, "loss": 1.5401, "loss/crossentropy": 2.7682390213012695, "loss/hidden": 1.3046875, "loss/logits": 0.23133417963981628, "loss/reg": 0.0004034128214698285, "step": 5545 }, { "epoch": 0.69325, "grad_norm": 3.056849956512451, "grad_norm_var": 13.554118494700992, "learning_rate": 0.0001, "loss": 1.0681, "loss/crossentropy": 2.3346989154815674, "loss/hidden": 0.92578125, "loss/logits": 0.13829562067985535, "loss/reg": 0.00040319946128875017, "step": 5546 }, { "epoch": 0.693375, "grad_norm": 3.3694405555725098, "grad_norm_var": 13.488969856896986, "learning_rate": 0.0001, "loss": 1.5045, "loss/crossentropy": 2.349416732788086, "loss/hidden": 1.2734375, "loss/logits": 0.22703590989112854, "loss/reg": 0.0004030179406981915, "step": 5547 }, { "epoch": 0.6935, "grad_norm": 2.5791146755218506, "grad_norm_var": 13.48570083592354, "learning_rate": 0.0001, "loss": 1.2084, "loss/crossentropy": 2.68235182762146, "loss/hidden": 1.015625, "loss/logits": 0.18872559070587158, "loss/reg": 0.0004028100520372391, "step": 5548 }, { "epoch": 0.693625, "grad_norm": 2.2620599269866943, "grad_norm_var": 13.361182490716024, "learning_rate": 0.0001, "loss": 1.1281, "loss/crossentropy": 2.8063087463378906, "loss/hidden": 0.953125, "loss/logits": 0.17091238498687744, "loss/reg": 0.000402626144932583, "step": 5549 }, { "epoch": 0.69375, "grad_norm": 4.299837112426758, "grad_norm_var": 13.28300156402611, "learning_rate": 0.0001, "loss": 1.5416, "loss/crossentropy": 2.5237224102020264, "loss/hidden": 1.28125, "loss/logits": 0.25631216168403625, "loss/reg": 0.00040244244155474007, "step": 5550 }, { "epoch": 0.693875, "grad_norm": 2.6455607414245605, "grad_norm_var": 13.3746316673169, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.5146124362945557, "loss/hidden": 1.0546875, "loss/logits": 0.1775951236486435, "loss/reg": 0.0004022637440357357, "step": 5551 }, { "epoch": 0.694, "grad_norm": 3.3872413635253906, "grad_norm_var": 13.255147039493197, "learning_rate": 0.0001, "loss": 1.3805, "loss/crossentropy": 2.5031204223632812, "loss/hidden": 1.1796875, "loss/logits": 0.19680237770080566, "loss/reg": 0.00040208708378486335, "step": 5552 }, { "epoch": 0.694125, "grad_norm": 3.2348897457122803, "grad_norm_var": 13.20184183528341, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.394880771636963, "loss/hidden": 1.21875, "loss/logits": 0.2022775113582611, "loss/reg": 0.0004018846957478672, "step": 5553 }, { "epoch": 0.69425, "grad_norm": 3.185516834259033, "grad_norm_var": 13.022048567706308, "learning_rate": 0.0001, "loss": 1.3486, "loss/crossentropy": 2.517847776412964, "loss/hidden": 1.15625, "loss/logits": 0.18836769461631775, "loss/reg": 0.0004016738967038691, "step": 5554 }, { "epoch": 0.694375, "grad_norm": 3.4038889408111572, "grad_norm_var": 12.845329688973036, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 3.127033233642578, "loss/hidden": 1.2734375, "loss/logits": 0.20973274111747742, "loss/reg": 0.0004014935693703592, "step": 5555 }, { "epoch": 0.6945, "grad_norm": 2.508147716522217, "grad_norm_var": 12.82436744497605, "learning_rate": 0.0001, "loss": 1.3396, "loss/crossentropy": 2.5500411987304688, "loss/hidden": 1.125, "loss/logits": 0.21054738759994507, "loss/reg": 0.0004013177822344005, "step": 5556 }, { "epoch": 0.694625, "grad_norm": 3.480465888977051, "grad_norm_var": 9.723220247134995, "learning_rate": 0.0001, "loss": 1.1948, "loss/crossentropy": 3.0959415435791016, "loss/hidden": 1.03125, "loss/logits": 0.15958905220031738, "loss/reg": 0.00040109187830239534, "step": 5557 }, { "epoch": 0.69475, "grad_norm": 2.153228521347046, "grad_norm_var": 9.912844592154881, "learning_rate": 0.0001, "loss": 1.1231, "loss/crossentropy": 2.3887367248535156, "loss/hidden": 0.96484375, "loss/logits": 0.15421921014785767, "loss/reg": 0.00040085960063152015, "step": 5558 }, { "epoch": 0.694875, "grad_norm": 2.29073166847229, "grad_norm_var": 9.940975320679504, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.7591772079467773, "loss/hidden": 0.9609375, "loss/logits": 0.14490020275115967, "loss/reg": 0.00040062764310278, "step": 5559 }, { "epoch": 0.695, "grad_norm": 3.2774977684020996, "grad_norm_var": 0.38780779885842853, "learning_rate": 0.0001, "loss": 1.3108, "loss/crossentropy": 2.4618120193481445, "loss/hidden": 1.1328125, "loss/logits": 0.17403028905391693, "loss/reg": 0.00040038488805294037, "step": 5560 }, { "epoch": 0.695125, "grad_norm": 5.256815433502197, "grad_norm_var": 0.6389982366427451, "learning_rate": 0.0001, "loss": 1.3877, "loss/crossentropy": 2.6327641010284424, "loss/hidden": 1.1953125, "loss/logits": 0.18840667605400085, "loss/reg": 0.00040017336141318083, "step": 5561 }, { "epoch": 0.69525, "grad_norm": 2.4482779502868652, "grad_norm_var": 0.6696600092514396, "learning_rate": 0.0001, "loss": 1.3009, "loss/crossentropy": 2.3507120609283447, "loss/hidden": 1.125, "loss/logits": 0.1719055473804474, "loss/reg": 0.00039994390681385994, "step": 5562 }, { "epoch": 0.695375, "grad_norm": 3.991633653640747, "grad_norm_var": 0.7152604495364602, "learning_rate": 0.0001, "loss": 1.5244, "loss/crossentropy": 2.483867883682251, "loss/hidden": 1.3203125, "loss/logits": 0.2000509798526764, "loss/reg": 0.0003997303720097989, "step": 5563 }, { "epoch": 0.6955, "grad_norm": 16.118450164794922, "grad_norm_var": 11.141219315937592, "learning_rate": 0.0001, "loss": 1.2857, "loss/crossentropy": 2.362740993499756, "loss/hidden": 1.1015625, "loss/logits": 0.18014955520629883, "loss/reg": 0.0003995215520262718, "step": 5564 }, { "epoch": 0.695625, "grad_norm": 2.3870162963867188, "grad_norm_var": 11.113297698747479, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.591388702392578, "loss/hidden": 1.0703125, "loss/logits": 0.20215675234794617, "loss/reg": 0.00039930257480591536, "step": 5565 }, { "epoch": 0.69575, "grad_norm": 2.476295232772827, "grad_norm_var": 11.249278762333253, "learning_rate": 0.0001, "loss": 1.1282, "loss/crossentropy": 2.547999858856201, "loss/hidden": 0.97265625, "loss/logits": 0.15153008699417114, "loss/reg": 0.00039907105383463204, "step": 5566 }, { "epoch": 0.695875, "grad_norm": 3.0509166717529297, "grad_norm_var": 11.192270461673054, "learning_rate": 0.0001, "loss": 1.4491, "loss/crossentropy": 2.8192243576049805, "loss/hidden": 1.2265625, "loss/logits": 0.21854132413864136, "loss/reg": 0.00039886520244181156, "step": 5567 }, { "epoch": 0.696, "grad_norm": 2.500783681869507, "grad_norm_var": 11.303842866931648, "learning_rate": 0.0001, "loss": 1.4236, "loss/crossentropy": 2.313400983810425, "loss/hidden": 1.21875, "loss/logits": 0.20083212852478027, "loss/reg": 0.00039864680729806423, "step": 5568 }, { "epoch": 0.696125, "grad_norm": 3.084709644317627, "grad_norm_var": 11.317775414092974, "learning_rate": 0.0001, "loss": 1.5472, "loss/crossentropy": 2.2037062644958496, "loss/hidden": 1.3125, "loss/logits": 0.23072680830955505, "loss/reg": 0.00039844808634370565, "step": 5569 }, { "epoch": 0.69625, "grad_norm": 2.444201707839966, "grad_norm_var": 11.417889837625632, "learning_rate": 0.0001, "loss": 1.3739, "loss/crossentropy": 2.4223687648773193, "loss/hidden": 1.15625, "loss/logits": 0.2136361300945282, "loss/reg": 0.0003982492198701948, "step": 5570 }, { "epoch": 0.696375, "grad_norm": 3.483283042907715, "grad_norm_var": 11.414042278942269, "learning_rate": 0.0001, "loss": 1.4731, "loss/crossentropy": 2.4822659492492676, "loss/hidden": 1.2421875, "loss/logits": 0.22696179151535034, "loss/reg": 0.0003980512556154281, "step": 5571 }, { "epoch": 0.6965, "grad_norm": 3.3292980194091797, "grad_norm_var": 11.31370137762802, "learning_rate": 0.0001, "loss": 1.54, "loss/crossentropy": 2.2392640113830566, "loss/hidden": 1.3125, "loss/logits": 0.22349873185157776, "loss/reg": 0.0003978393506258726, "step": 5572 }, { "epoch": 0.696625, "grad_norm": 2.323638916015625, "grad_norm_var": 11.456013782802549, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.4840517044067383, "loss/hidden": 0.96875, "loss/logits": 0.14865343272686005, "loss/reg": 0.0003976241569034755, "step": 5573 }, { "epoch": 0.69675, "grad_norm": 2.1289737224578857, "grad_norm_var": 11.461339132696875, "learning_rate": 0.0001, "loss": 1.0931, "loss/crossentropy": 2.675004720687866, "loss/hidden": 0.94921875, "loss/logits": 0.13987161219120026, "loss/reg": 0.0003974198189098388, "step": 5574 }, { "epoch": 0.696875, "grad_norm": 4.501442909240723, "grad_norm_var": 11.3257399530074, "learning_rate": 0.0001, "loss": 1.5355, "loss/crossentropy": 2.3885691165924072, "loss/hidden": 1.2890625, "loss/logits": 0.24244412779808044, "loss/reg": 0.0003972063132096082, "step": 5575 }, { "epoch": 0.697, "grad_norm": 2.159271478652954, "grad_norm_var": 11.500462509738181, "learning_rate": 0.0001, "loss": 1.2021, "loss/crossentropy": 2.8525633811950684, "loss/hidden": 1.015625, "loss/logits": 0.1825087070465088, "loss/reg": 0.00039698166074231267, "step": 5576 }, { "epoch": 0.697125, "grad_norm": 2.3877112865448, "grad_norm_var": 11.478806576000366, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.456425905227661, "loss/hidden": 1.015625, "loss/logits": 0.15837126970291138, "loss/reg": 0.00039674065192230046, "step": 5577 }, { "epoch": 0.69725, "grad_norm": 2.5694167613983154, "grad_norm_var": 11.45989386382093, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.8302180767059326, "loss/hidden": 1.078125, "loss/logits": 0.16940517723560333, "loss/reg": 0.00039647327503189445, "step": 5578 }, { "epoch": 0.697375, "grad_norm": 2.288757085800171, "grad_norm_var": 11.571183644710912, "learning_rate": 0.0001, "loss": 1.3748, "loss/crossentropy": 2.571776866912842, "loss/hidden": 1.1640625, "loss/logits": 0.20680516958236694, "loss/reg": 0.0003961949551012367, "step": 5579 }, { "epoch": 0.6975, "grad_norm": 3.806220293045044, "grad_norm_var": 0.45741473968066065, "learning_rate": 0.0001, "loss": 1.3908, "loss/crossentropy": 2.4294633865356445, "loss/hidden": 1.203125, "loss/logits": 0.1837429702281952, "loss/reg": 0.0003959217283409089, "step": 5580 }, { "epoch": 0.697625, "grad_norm": 2.4519996643066406, "grad_norm_var": 0.4540343586287631, "learning_rate": 0.0001, "loss": 1.2125, "loss/crossentropy": 2.4730961322784424, "loss/hidden": 1.046875, "loss/logits": 0.16166679561138153, "loss/reg": 0.00039571229717694223, "step": 5581 }, { "epoch": 0.69775, "grad_norm": 2.324561595916748, "grad_norm_var": 0.4622585729545326, "learning_rate": 0.0001, "loss": 1.3448, "loss/crossentropy": 2.3990743160247803, "loss/hidden": 1.140625, "loss/logits": 0.20024581253528595, "loss/reg": 0.00039550711517222226, "step": 5582 }, { "epoch": 0.697875, "grad_norm": 2.120513439178467, "grad_norm_var": 0.4855073647037292, "learning_rate": 0.0001, "loss": 1.1308, "loss/crossentropy": 2.671304702758789, "loss/hidden": 0.97265625, "loss/logits": 0.15419261157512665, "loss/reg": 0.0003952443948946893, "step": 5583 }, { "epoch": 0.698, "grad_norm": 2.419395685195923, "grad_norm_var": 0.4885612148231748, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 2.4397037029266357, "loss/hidden": 1.1015625, "loss/logits": 0.18815910816192627, "loss/reg": 0.00039499468402937055, "step": 5584 }, { "epoch": 0.698125, "grad_norm": 3.6896743774414062, "grad_norm_var": 0.5393237781307411, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.656951904296875, "loss/hidden": 1.1484375, "loss/logits": 0.1799333542585373, "loss/reg": 0.000394727336242795, "step": 5585 }, { "epoch": 0.69825, "grad_norm": 2.755342721939087, "grad_norm_var": 0.5315774686969803, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.9815828800201416, "loss/hidden": 1.1171875, "loss/logits": 0.1793486773967743, "loss/reg": 0.00039443926652893424, "step": 5586 }, { "epoch": 0.698375, "grad_norm": 2.9405765533447266, "grad_norm_var": 0.5002690523460084, "learning_rate": 0.0001, "loss": 1.3169, "loss/crossentropy": 2.357609987258911, "loss/hidden": 1.1171875, "loss/logits": 0.19581273198127747, "loss/reg": 0.00039412896148860455, "step": 5587 }, { "epoch": 0.6985, "grad_norm": 2.1860780715942383, "grad_norm_var": 0.49552636445556575, "learning_rate": 0.0001, "loss": 1.2113, "loss/crossentropy": 2.5779616832733154, "loss/hidden": 1.03125, "loss/logits": 0.17609381675720215, "loss/reg": 0.0003939214802812785, "step": 5588 }, { "epoch": 0.698625, "grad_norm": 2.1856369972229004, "grad_norm_var": 0.5034733961931873, "learning_rate": 0.0001, "loss": 1.2982, "loss/crossentropy": 2.0742738246917725, "loss/hidden": 1.109375, "loss/logits": 0.18486806750297546, "loss/reg": 0.00039371862658299506, "step": 5589 }, { "epoch": 0.69875, "grad_norm": 3.3325355052948, "grad_norm_var": 0.505225785030756, "learning_rate": 0.0001, "loss": 1.2633, "loss/crossentropy": 2.328824043273926, "loss/hidden": 1.0859375, "loss/logits": 0.17339614033699036, "loss/reg": 0.0003934884734917432, "step": 5590 }, { "epoch": 0.698875, "grad_norm": 2.8440299034118652, "grad_norm_var": 0.291511292352349, "learning_rate": 0.0001, "loss": 1.2937, "loss/crossentropy": 2.5226919651031494, "loss/hidden": 1.109375, "loss/logits": 0.1803685575723648, "loss/reg": 0.00039328992716036737, "step": 5591 }, { "epoch": 0.699, "grad_norm": 2.7066073417663574, "grad_norm_var": 0.2741408644811502, "learning_rate": 0.0001, "loss": 1.3239, "loss/crossentropy": 2.8737804889678955, "loss/hidden": 1.125, "loss/logits": 0.19498354196548462, "loss/reg": 0.0003930708044208586, "step": 5592 }, { "epoch": 0.699125, "grad_norm": 2.4233014583587646, "grad_norm_var": 0.27279474025685485, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.471951723098755, "loss/hidden": 1.109375, "loss/logits": 0.189534991979599, "loss/reg": 0.0003928521473426372, "step": 5593 }, { "epoch": 0.69925, "grad_norm": 2.6511738300323486, "grad_norm_var": 0.27189486659761025, "learning_rate": 0.0001, "loss": 1.3154, "loss/crossentropy": 2.5666444301605225, "loss/hidden": 1.125, "loss/logits": 0.18646445870399475, "loss/reg": 0.0003926323843188584, "step": 5594 }, { "epoch": 0.699375, "grad_norm": 2.6886417865753174, "grad_norm_var": 0.26020771671646775, "learning_rate": 0.0001, "loss": 1.48, "loss/crossentropy": 2.4496917724609375, "loss/hidden": 1.2421875, "loss/logits": 0.23387524485588074, "loss/reg": 0.00039239003672264516, "step": 5595 }, { "epoch": 0.6995, "grad_norm": 2.4120559692382812, "grad_norm_var": 0.17984572621796435, "learning_rate": 0.0001, "loss": 1.093, "loss/crossentropy": 2.8721683025360107, "loss/hidden": 0.94140625, "loss/logits": 0.1477087140083313, "loss/reg": 0.00039216261939145625, "step": 5596 }, { "epoch": 0.699625, "grad_norm": 2.6257317066192627, "grad_norm_var": 0.17753343966658097, "learning_rate": 0.0001, "loss": 1.2548, "loss/crossentropy": 2.5272085666656494, "loss/hidden": 1.078125, "loss/logits": 0.17271146178245544, "loss/reg": 0.0003919171867892146, "step": 5597 }, { "epoch": 0.69975, "grad_norm": 2.598174810409546, "grad_norm_var": 0.17055454176850068, "learning_rate": 0.0001, "loss": 1.141, "loss/crossentropy": 2.6258721351623535, "loss/hidden": 0.96875, "loss/logits": 0.16837427020072937, "loss/reg": 0.00039165763882920146, "step": 5598 }, { "epoch": 0.699875, "grad_norm": 4.903883457183838, "grad_norm_var": 0.4540876315967495, "learning_rate": 0.0001, "loss": 1.6571, "loss/crossentropy": 2.7409121990203857, "loss/hidden": 1.375, "loss/logits": 0.2781774401664734, "loss/reg": 0.0003913920954801142, "step": 5599 }, { "epoch": 0.7, "grad_norm": 2.232440710067749, "grad_norm_var": 0.46663647255342083, "learning_rate": 0.0001, "loss": 1.297, "loss/crossentropy": 2.304394483566284, "loss/hidden": 1.125, "loss/logits": 0.16810132563114166, "loss/reg": 0.00039118618587963283, "step": 5600 }, { "epoch": 0.700125, "grad_norm": 2.3737502098083496, "grad_norm_var": 0.4228877667052141, "learning_rate": 0.0001, "loss": 1.3303, "loss/crossentropy": 2.358027935028076, "loss/hidden": 1.1328125, "loss/logits": 0.19356687366962433, "loss/reg": 0.0003909511142410338, "step": 5601 }, { "epoch": 0.70025, "grad_norm": 2.408777952194214, "grad_norm_var": 0.4297431449355031, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.726747512817383, "loss/hidden": 0.99609375, "loss/logits": 0.16813813149929047, "loss/reg": 0.00039075518725439906, "step": 5602 }, { "epoch": 0.700375, "grad_norm": 2.9588005542755127, "grad_norm_var": 0.43030087660297694, "learning_rate": 0.0001, "loss": 1.6419, "loss/crossentropy": 2.540415048599243, "loss/hidden": 1.3828125, "loss/logits": 0.2551679313182831, "loss/reg": 0.00039053874206729233, "step": 5603 }, { "epoch": 0.7005, "grad_norm": 2.706190586090088, "grad_norm_var": 0.4101312285664484, "learning_rate": 0.0001, "loss": 1.1569, "loss/crossentropy": 2.6654856204986572, "loss/hidden": 0.99609375, "loss/logits": 0.15690621733665466, "loss/reg": 0.0003903376345988363, "step": 5604 }, { "epoch": 0.700625, "grad_norm": 2.4875590801239014, "grad_norm_var": 0.3929792232318581, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.438206672668457, "loss/hidden": 1.0625, "loss/logits": 0.1895267367362976, "loss/reg": 0.00039010337786749005, "step": 5605 }, { "epoch": 0.70075, "grad_norm": 3.1305348873138428, "grad_norm_var": 0.38043513873188883, "learning_rate": 0.0001, "loss": 1.5095, "loss/crossentropy": 2.388439655303955, "loss/hidden": 1.2578125, "loss/logits": 0.24781909584999084, "loss/reg": 0.0003898801514878869, "step": 5606 }, { "epoch": 0.700875, "grad_norm": 3.1645774841308594, "grad_norm_var": 0.39047076510104545, "learning_rate": 0.0001, "loss": 1.5182, "loss/crossentropy": 2.520219087600708, "loss/hidden": 1.2578125, "loss/logits": 0.2564418911933899, "loss/reg": 0.0003896776761393994, "step": 5607 }, { "epoch": 0.701, "grad_norm": 2.547895908355713, "grad_norm_var": 0.3935878842460009, "learning_rate": 0.0001, "loss": 1.2478, "loss/crossentropy": 2.6143839359283447, "loss/hidden": 1.0625, "loss/logits": 0.18137343227863312, "loss/reg": 0.0003894589899573475, "step": 5608 }, { "epoch": 0.701125, "grad_norm": 2.6366403102874756, "grad_norm_var": 0.3865821462809957, "learning_rate": 0.0001, "loss": 1.3812, "loss/crossentropy": 2.470318555831909, "loss/hidden": 1.171875, "loss/logits": 0.2053987979888916, "loss/reg": 0.0003892551176249981, "step": 5609 }, { "epoch": 0.70125, "grad_norm": 3.1570487022399902, "grad_norm_var": 0.3936897454810243, "learning_rate": 0.0001, "loss": 1.3145, "loss/crossentropy": 2.6970317363739014, "loss/hidden": 1.109375, "loss/logits": 0.20125392079353333, "loss/reg": 0.0003890383231919259, "step": 5610 }, { "epoch": 0.701375, "grad_norm": 4.142799377441406, "grad_norm_var": 0.5014397498661453, "learning_rate": 0.0001, "loss": 1.4561, "loss/crossentropy": 2.3719234466552734, "loss/hidden": 1.1953125, "loss/logits": 0.2569223940372467, "loss/reg": 0.0003888326755259186, "step": 5611 }, { "epoch": 0.7015, "grad_norm": 2.332122325897217, "grad_norm_var": 0.5070973655632126, "learning_rate": 0.0001, "loss": 1.277, "loss/crossentropy": 2.5079617500305176, "loss/hidden": 1.078125, "loss/logits": 0.19502034783363342, "loss/reg": 0.0003886097692884505, "step": 5612 }, { "epoch": 0.701625, "grad_norm": 3.8904454708099365, "grad_norm_var": 0.5607437039178715, "learning_rate": 0.0001, "loss": 1.538, "loss/crossentropy": 2.622279405593872, "loss/hidden": 1.28125, "loss/logits": 0.25288641452789307, "loss/reg": 0.00038841430796310306, "step": 5613 }, { "epoch": 0.70175, "grad_norm": 2.704127550125122, "grad_norm_var": 0.5560586509628251, "learning_rate": 0.0001, "loss": 1.5253, "loss/crossentropy": 2.427088737487793, "loss/hidden": 1.265625, "loss/logits": 0.2557724714279175, "loss/reg": 0.0003881995507981628, "step": 5614 }, { "epoch": 0.701875, "grad_norm": 3.417328119277954, "grad_norm_var": 0.3140551755787193, "learning_rate": 0.0001, "loss": 2.1088, "loss/crossentropy": 2.1916682720184326, "loss/hidden": 1.6953125, "loss/logits": 0.4096325635910034, "loss/reg": 0.00038800088805146515, "step": 5615 }, { "epoch": 0.702, "grad_norm": 2.752232551574707, "grad_norm_var": 0.2851480393017785, "learning_rate": 0.0001, "loss": 1.2514, "loss/crossentropy": 2.445293426513672, "loss/hidden": 1.0625, "loss/logits": 0.18505337834358215, "loss/reg": 0.00038779518217779696, "step": 5616 }, { "epoch": 0.702125, "grad_norm": 3.049131155014038, "grad_norm_var": 0.26395531339210243, "learning_rate": 0.0001, "loss": 1.2925, "loss/crossentropy": 2.460162878036499, "loss/hidden": 1.0859375, "loss/logits": 0.2027306854724884, "loss/reg": 0.0003875967231579125, "step": 5617 }, { "epoch": 0.70225, "grad_norm": 2.4640250205993652, "grad_norm_var": 0.26002751764640003, "learning_rate": 0.0001, "loss": 1.3476, "loss/crossentropy": 2.48246431350708, "loss/hidden": 1.140625, "loss/logits": 0.20308585464954376, "loss/reg": 0.00038737780414521694, "step": 5618 }, { "epoch": 0.702375, "grad_norm": 4.383622646331787, "grad_norm_var": 0.38452746844445923, "learning_rate": 0.0001, "loss": 1.5365, "loss/crossentropy": 2.886321544647217, "loss/hidden": 1.2890625, "loss/logits": 0.24358531832695007, "loss/reg": 0.0003871400549542159, "step": 5619 }, { "epoch": 0.7025, "grad_norm": 2.767573356628418, "grad_norm_var": 0.381864038818105, "learning_rate": 0.0001, "loss": 1.2068, "loss/crossentropy": 2.4721148014068604, "loss/hidden": 1.03125, "loss/logits": 0.1716577410697937, "loss/reg": 0.0003869329229928553, "step": 5620 }, { "epoch": 0.702625, "grad_norm": 2.6379518508911133, "grad_norm_var": 0.3717140647035118, "learning_rate": 0.0001, "loss": 1.2901, "loss/crossentropy": 2.618375778198242, "loss/hidden": 1.109375, "loss/logits": 0.17689216136932373, "loss/reg": 0.00038669368950650096, "step": 5621 }, { "epoch": 0.70275, "grad_norm": 3.0119879245758057, "grad_norm_var": 0.3716929245599573, "learning_rate": 0.0001, "loss": 1.2921, "loss/crossentropy": 2.388140916824341, "loss/hidden": 1.1015625, "loss/logits": 0.18666890263557434, "loss/reg": 0.00038649776251986623, "step": 5622 }, { "epoch": 0.702875, "grad_norm": 2.4927759170532227, "grad_norm_var": 0.3910899730327543, "learning_rate": 0.0001, "loss": 1.2271, "loss/crossentropy": 2.611123561859131, "loss/hidden": 1.03125, "loss/logits": 0.191981703042984, "loss/reg": 0.00038632145151495934, "step": 5623 }, { "epoch": 0.703, "grad_norm": 2.484717845916748, "grad_norm_var": 0.3953519699273168, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.500091791152954, "loss/hidden": 1.125, "loss/logits": 0.18902955949306488, "loss/reg": 0.000386133324354887, "step": 5624 }, { "epoch": 0.703125, "grad_norm": 3.016356945037842, "grad_norm_var": 0.3849401068954684, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.599017858505249, "loss/hidden": 1.0546875, "loss/logits": 0.19400669634342194, "loss/reg": 0.00038593384670093656, "step": 5625 }, { "epoch": 0.70325, "grad_norm": 2.757164716720581, "grad_norm_var": 0.388907613890864, "learning_rate": 0.0001, "loss": 1.2381, "loss/crossentropy": 2.756190538406372, "loss/hidden": 1.0625, "loss/logits": 0.1717298924922943, "loss/reg": 0.00038573655183427036, "step": 5626 }, { "epoch": 0.703375, "grad_norm": 3.5778567790985107, "grad_norm_var": 0.32420587845060683, "learning_rate": 0.0001, "loss": 1.3202, "loss/crossentropy": 2.497340440750122, "loss/hidden": 1.125, "loss/logits": 0.19137847423553467, "loss/reg": 0.0003855507238768041, "step": 5627 }, { "epoch": 0.7035, "grad_norm": 2.8026418685913086, "grad_norm_var": 0.29716446791353496, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.6143875122070312, "loss/hidden": 1.1640625, "loss/logits": 0.20345285534858704, "loss/reg": 0.0003853728121612221, "step": 5628 }, { "epoch": 0.703625, "grad_norm": 3.1028411388397217, "grad_norm_var": 0.24380325496761632, "learning_rate": 0.0001, "loss": 1.4133, "loss/crossentropy": 2.7999114990234375, "loss/hidden": 1.1875, "loss/logits": 0.22196871042251587, "loss/reg": 0.0003852102381642908, "step": 5629 }, { "epoch": 0.70375, "grad_norm": 3.2838377952575684, "grad_norm_var": 0.2447285317179838, "learning_rate": 0.0001, "loss": 1.2444, "loss/crossentropy": 2.5341384410858154, "loss/hidden": 1.0625, "loss/logits": 0.17808926105499268, "loss/reg": 0.00038506268174387515, "step": 5630 }, { "epoch": 0.703875, "grad_norm": 3.054190158843994, "grad_norm_var": 0.2327701881402104, "learning_rate": 0.0001, "loss": 1.261, "loss/crossentropy": 2.547927141189575, "loss/hidden": 1.09375, "loss/logits": 0.16337519884109497, "loss/reg": 0.00038486934499815106, "step": 5631 }, { "epoch": 0.704, "grad_norm": 3.088590621948242, "grad_norm_var": 0.2297415603206178, "learning_rate": 0.0001, "loss": 1.5306, "loss/crossentropy": 2.5232903957366943, "loss/hidden": 1.2734375, "loss/logits": 0.25330013036727905, "loss/reg": 0.0003847070911433548, "step": 5632 }, { "epoch": 0.704125, "grad_norm": 3.143901824951172, "grad_norm_var": 0.2309432624953909, "learning_rate": 0.0001, "loss": 1.369, "loss/crossentropy": 2.614309072494507, "loss/hidden": 1.1328125, "loss/logits": 0.2323620319366455, "loss/reg": 0.0003845147439278662, "step": 5633 }, { "epoch": 0.70425, "grad_norm": 2.5409669876098633, "grad_norm_var": 0.22576983125350125, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.379822254180908, "loss/hidden": 1.15625, "loss/logits": 0.2025821954011917, "loss/reg": 0.0003843198937829584, "step": 5634 }, { "epoch": 0.704375, "grad_norm": 2.5403175354003906, "grad_norm_var": 0.1003299133906836, "learning_rate": 0.0001, "loss": 1.3393, "loss/crossentropy": 2.609943389892578, "loss/hidden": 1.1328125, "loss/logits": 0.20266571640968323, "loss/reg": 0.0003841435827780515, "step": 5635 }, { "epoch": 0.7045, "grad_norm": 2.4235479831695557, "grad_norm_var": 0.11352526421381957, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.5970187187194824, "loss/hidden": 1.1171875, "loss/logits": 0.21942687034606934, "loss/reg": 0.0003839491109829396, "step": 5636 }, { "epoch": 0.704625, "grad_norm": 2.9117884635925293, "grad_norm_var": 0.10964900727449252, "learning_rate": 0.0001, "loss": 1.1097, "loss/crossentropy": 2.534849166870117, "loss/hidden": 0.9609375, "loss/logits": 0.14496402442455292, "loss/reg": 0.0003837979747913778, "step": 5637 }, { "epoch": 0.70475, "grad_norm": 3.0415213108062744, "grad_norm_var": 0.11018548681799961, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.5937235355377197, "loss/hidden": 1.0, "loss/logits": 0.16423794627189636, "loss/reg": 0.0003836004179902375, "step": 5638 }, { "epoch": 0.704875, "grad_norm": 3.0675041675567627, "grad_norm_var": 0.10028032636110519, "learning_rate": 0.0001, "loss": 1.3328, "loss/crossentropy": 2.705106496810913, "loss/hidden": 1.1328125, "loss/logits": 0.1961762011051178, "loss/reg": 0.0003834415110759437, "step": 5639 }, { "epoch": 0.705, "grad_norm": 2.4080557823181152, "grad_norm_var": 0.10517214930417348, "learning_rate": 0.0001, "loss": 1.1786, "loss/crossentropy": 2.643638849258423, "loss/hidden": 1.0234375, "loss/logits": 0.15129563212394714, "loss/reg": 0.0003832859802059829, "step": 5640 }, { "epoch": 0.705125, "grad_norm": 2.137929677963257, "grad_norm_var": 0.14241437235540175, "learning_rate": 0.0001, "loss": 1.2217, "loss/crossentropy": 2.561596632003784, "loss/hidden": 1.046875, "loss/logits": 0.17102035880088806, "loss/reg": 0.0003830902569461614, "step": 5641 }, { "epoch": 0.70525, "grad_norm": 2.7303593158721924, "grad_norm_var": 0.14285421812394147, "learning_rate": 0.0001, "loss": 1.2095, "loss/crossentropy": 2.379709005355835, "loss/hidden": 1.0390625, "loss/logits": 0.16661138832569122, "loss/reg": 0.00038290530210360885, "step": 5642 }, { "epoch": 0.705375, "grad_norm": 2.5105299949645996, "grad_norm_var": 0.11274754295761795, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.632866382598877, "loss/hidden": 1.09375, "loss/logits": 0.18121938407421112, "loss/reg": 0.0003827050677500665, "step": 5643 }, { "epoch": 0.7055, "grad_norm": 2.2706708908081055, "grad_norm_var": 0.13019635533523038, "learning_rate": 0.0001, "loss": 1.1977, "loss/crossentropy": 2.650648832321167, "loss/hidden": 1.0078125, "loss/logits": 0.18609429895877838, "loss/reg": 0.0003825248568318784, "step": 5644 }, { "epoch": 0.705625, "grad_norm": 2.644606351852417, "grad_norm_var": 0.12274185418196643, "learning_rate": 0.0001, "loss": 1.5148, "loss/crossentropy": 2.6995437145233154, "loss/hidden": 1.2734375, "loss/logits": 0.23755627870559692, "loss/reg": 0.0003823251463472843, "step": 5645 }, { "epoch": 0.70575, "grad_norm": 3.0209484100341797, "grad_norm_var": 0.10790741042751412, "learning_rate": 0.0001, "loss": 1.8455, "loss/crossentropy": 2.046595573425293, "loss/hidden": 1.4921875, "loss/logits": 0.34946268796920776, "loss/reg": 0.00038212843355722725, "step": 5646 }, { "epoch": 0.705875, "grad_norm": 2.9170427322387695, "grad_norm_var": 0.10298952387190259, "learning_rate": 0.0001, "loss": 1.3442, "loss/crossentropy": 2.862168550491333, "loss/hidden": 1.1328125, "loss/logits": 0.20760458707809448, "loss/reg": 0.00038194385706447065, "step": 5647 }, { "epoch": 0.706, "grad_norm": 4.927490711212158, "grad_norm_var": 0.4065753565883436, "learning_rate": 0.0001, "loss": 1.5028, "loss/crossentropy": 2.5274503231048584, "loss/hidden": 1.28125, "loss/logits": 0.217777281999588, "loss/reg": 0.00038175785448402166, "step": 5648 }, { "epoch": 0.706125, "grad_norm": 3.616629123687744, "grad_norm_var": 0.440496304937119, "learning_rate": 0.0001, "loss": 1.559, "loss/crossentropy": 2.715053081512451, "loss/hidden": 1.3515625, "loss/logits": 0.20365993678569794, "loss/reg": 0.00038155732909217477, "step": 5649 }, { "epoch": 0.70625, "grad_norm": 2.9936044216156006, "grad_norm_var": 0.43423611466605294, "learning_rate": 0.0001, "loss": 1.362, "loss/crossentropy": 2.257342576980591, "loss/hidden": 1.1640625, "loss/logits": 0.19414649903774261, "loss/reg": 0.0003813653311226517, "step": 5650 }, { "epoch": 0.706375, "grad_norm": 3.811659097671509, "grad_norm_var": 0.47680058154575133, "learning_rate": 0.0001, "loss": 1.6195, "loss/crossentropy": 2.73901629447937, "loss/hidden": 1.3359375, "loss/logits": 0.27976956963539124, "loss/reg": 0.00038116206997074187, "step": 5651 }, { "epoch": 0.7065, "grad_norm": 2.686845064163208, "grad_norm_var": 0.46213846100495176, "learning_rate": 0.0001, "loss": 1.328, "loss/crossentropy": 2.451949119567871, "loss/hidden": 1.140625, "loss/logits": 0.18357178568840027, "loss/reg": 0.0003809937334153801, "step": 5652 }, { "epoch": 0.706625, "grad_norm": 2.571565866470337, "grad_norm_var": 0.47251592946392, "learning_rate": 0.0001, "loss": 1.0661, "loss/crossentropy": 2.837015151977539, "loss/hidden": 0.91796875, "loss/logits": 0.14434637129306793, "loss/reg": 0.0003807951870840043, "step": 5653 }, { "epoch": 0.70675, "grad_norm": 2.6581711769104004, "grad_norm_var": 0.4775242326869185, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.6924257278442383, "loss/hidden": 1.046875, "loss/logits": 0.16896724700927734, "loss/reg": 0.00038061925442889333, "step": 5654 }, { "epoch": 0.706875, "grad_norm": 3.239328622817993, "grad_norm_var": 0.48238562951052394, "learning_rate": 0.0001, "loss": 1.5496, "loss/crossentropy": 2.688408613204956, "loss/hidden": 1.3203125, "loss/logits": 0.22547391057014465, "loss/reg": 0.0003804440493695438, "step": 5655 }, { "epoch": 0.707, "grad_norm": 2.391291618347168, "grad_norm_var": 0.48360693740685956, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.2866854667663574, "loss/hidden": 1.109375, "loss/logits": 0.1821998655796051, "loss/reg": 0.00038025606772862375, "step": 5656 }, { "epoch": 0.707125, "grad_norm": 3.456374406814575, "grad_norm_var": 0.45027816249166414, "learning_rate": 0.0001, "loss": 1.6165, "loss/crossentropy": 2.3923959732055664, "loss/hidden": 1.2890625, "loss/logits": 0.3236483931541443, "loss/reg": 0.00038005359238013625, "step": 5657 }, { "epoch": 0.70725, "grad_norm": 2.5456459522247314, "grad_norm_var": 0.45973967197487203, "learning_rate": 0.0001, "loss": 1.253, "loss/crossentropy": 2.611318349838257, "loss/hidden": 1.078125, "loss/logits": 0.17112302780151367, "loss/reg": 0.0003798868565354496, "step": 5658 }, { "epoch": 0.707375, "grad_norm": 2.2263803482055664, "grad_norm_var": 0.483951700783904, "learning_rate": 0.0001, "loss": 1.1477, "loss/crossentropy": 2.505796432495117, "loss/hidden": 0.98828125, "loss/logits": 0.15563899278640747, "loss/reg": 0.00037973994039930403, "step": 5659 }, { "epoch": 0.7075, "grad_norm": 2.8813133239746094, "grad_norm_var": 0.44798637640610556, "learning_rate": 0.0001, "loss": 1.4049, "loss/crossentropy": 2.4729599952697754, "loss/hidden": 1.1875, "loss/logits": 0.2135801762342453, "loss/reg": 0.0003795425873249769, "step": 5660 }, { "epoch": 0.707625, "grad_norm": 2.7096028327941895, "grad_norm_var": 0.444851530055111, "learning_rate": 0.0001, "loss": 1.3762, "loss/crossentropy": 2.6639912128448486, "loss/hidden": 1.171875, "loss/logits": 0.20048552751541138, "loss/reg": 0.00037940277252346277, "step": 5661 }, { "epoch": 0.70775, "grad_norm": 2.816119909286499, "grad_norm_var": 0.4480177227698088, "learning_rate": 0.0001, "loss": 1.1997, "loss/crossentropy": 2.559699535369873, "loss/hidden": 1.046875, "loss/logits": 0.14899981021881104, "loss/reg": 0.0003792571369558573, "step": 5662 }, { "epoch": 0.707875, "grad_norm": 2.6368253231048584, "grad_norm_var": 0.4570734435394492, "learning_rate": 0.0001, "loss": 1.4592, "loss/crossentropy": 2.351889133453369, "loss/hidden": 1.234375, "loss/logits": 0.22105160355567932, "loss/reg": 0.0003791071940213442, "step": 5663 }, { "epoch": 0.708, "grad_norm": 3.4506916999816895, "grad_norm_var": 0.21592433634158753, "learning_rate": 0.0001, "loss": 1.3937, "loss/crossentropy": 2.449294090270996, "loss/hidden": 1.203125, "loss/logits": 0.1868312656879425, "loss/reg": 0.00037897267611697316, "step": 5664 }, { "epoch": 0.708125, "grad_norm": 2.946615695953369, "grad_norm_var": 0.18159219792844372, "learning_rate": 0.0001, "loss": 1.2062, "loss/crossentropy": 2.317211866378784, "loss/hidden": 1.0390625, "loss/logits": 0.16336211562156677, "loss/reg": 0.00037877168506383896, "step": 5665 }, { "epoch": 0.70825, "grad_norm": 2.519148588180542, "grad_norm_var": 0.18824558446219514, "learning_rate": 0.0001, "loss": 1.5614, "loss/crossentropy": 2.4812073707580566, "loss/hidden": 1.3046875, "loss/logits": 0.25295019149780273, "loss/reg": 0.00037863239413127303, "step": 5666 }, { "epoch": 0.708375, "grad_norm": 2.991147041320801, "grad_norm_var": 0.12475760520909773, "learning_rate": 0.0001, "loss": 1.3671, "loss/crossentropy": 2.5050466060638428, "loss/hidden": 1.171875, "loss/logits": 0.1914440244436264, "loss/reg": 0.00037849502405151725, "step": 5667 }, { "epoch": 0.7085, "grad_norm": 3.0592241287231445, "grad_norm_var": 0.12803235802989335, "learning_rate": 0.0001, "loss": 1.3533, "loss/crossentropy": 2.721823215484619, "loss/hidden": 1.15625, "loss/logits": 0.19329936802387238, "loss/reg": 0.0003783311112783849, "step": 5668 }, { "epoch": 0.708625, "grad_norm": 2.55865478515625, "grad_norm_var": 0.12846823890894257, "learning_rate": 0.0001, "loss": 1.3432, "loss/crossentropy": 2.5041537284851074, "loss/hidden": 1.140625, "loss/logits": 0.19884279370307922, "loss/reg": 0.0003781299164984375, "step": 5669 }, { "epoch": 0.70875, "grad_norm": 2.394390344619751, "grad_norm_var": 0.13843509404703294, "learning_rate": 0.0001, "loss": 1.2294, "loss/crossentropy": 2.6264326572418213, "loss/hidden": 1.046875, "loss/logits": 0.17869889736175537, "loss/reg": 0.00037797511322423816, "step": 5670 }, { "epoch": 0.708875, "grad_norm": 2.1285085678100586, "grad_norm_var": 0.15069713179726232, "learning_rate": 0.0001, "loss": 1.2589, "loss/crossentropy": 2.5580484867095947, "loss/hidden": 1.0625, "loss/logits": 0.19258099794387817, "loss/reg": 0.00037783593870699406, "step": 5671 }, { "epoch": 0.709, "grad_norm": 2.9025659561157227, "grad_norm_var": 0.1438089408197423, "learning_rate": 0.0001, "loss": 1.6756, "loss/crossentropy": 2.651326894760132, "loss/hidden": 1.3671875, "loss/logits": 0.30462533235549927, "loss/reg": 0.0003776684170588851, "step": 5672 }, { "epoch": 0.709125, "grad_norm": 2.848186731338501, "grad_norm_var": 0.11077738609638553, "learning_rate": 0.0001, "loss": 1.2201, "loss/crossentropy": 2.518198013305664, "loss/hidden": 1.046875, "loss/logits": 0.16948901116847992, "loss/reg": 0.00037746725138276815, "step": 5673 }, { "epoch": 0.70925, "grad_norm": 2.699270725250244, "grad_norm_var": 0.10855942818554519, "learning_rate": 0.0001, "loss": 1.3578, "loss/crossentropy": 2.59915828704834, "loss/hidden": 1.109375, "loss/logits": 0.24463121592998505, "loss/reg": 0.00037731509655714035, "step": 5674 }, { "epoch": 0.709375, "grad_norm": 2.3717944622039795, "grad_norm_var": 0.10000913332708687, "learning_rate": 0.0001, "loss": 1.293, "loss/crossentropy": 2.4993419647216797, "loss/hidden": 1.109375, "loss/logits": 0.17986154556274414, "loss/reg": 0.0003771586634684354, "step": 5675 }, { "epoch": 0.7095, "grad_norm": 2.8315932750701904, "grad_norm_var": 0.09925751008534742, "learning_rate": 0.0001, "loss": 1.4911, "loss/crossentropy": 2.2964906692504883, "loss/hidden": 1.2734375, "loss/logits": 0.2138507068157196, "loss/reg": 0.00037700074608437717, "step": 5676 }, { "epoch": 0.709625, "grad_norm": 2.3275809288024902, "grad_norm_var": 0.110004610845728, "learning_rate": 0.0001, "loss": 1.3594, "loss/crossentropy": 2.4330248832702637, "loss/hidden": 1.140625, "loss/logits": 0.2149728387594223, "loss/reg": 0.00037684018025174737, "step": 5677 }, { "epoch": 0.70975, "grad_norm": 2.5306575298309326, "grad_norm_var": 0.11134953701631099, "learning_rate": 0.0001, "loss": 1.2667, "loss/crossentropy": 2.43477725982666, "loss/hidden": 1.078125, "loss/logits": 0.18483155965805054, "loss/reg": 0.00037666078424081206, "step": 5678 }, { "epoch": 0.709875, "grad_norm": 2.4156229496002197, "grad_norm_var": 0.11626514853933803, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.632476329803467, "loss/hidden": 1.0703125, "loss/logits": 0.1928207278251648, "loss/reg": 0.00037646316923201084, "step": 5679 }, { "epoch": 0.71, "grad_norm": 3.988574743270874, "grad_norm_var": 0.18919104743157125, "learning_rate": 0.0001, "loss": 1.6438, "loss/crossentropy": 2.8597028255462646, "loss/hidden": 1.3515625, "loss/logits": 0.28847697377204895, "loss/reg": 0.00037627998972311616, "step": 5680 }, { "epoch": 0.710125, "grad_norm": 2.6224138736724854, "grad_norm_var": 0.18594686512134306, "learning_rate": 0.0001, "loss": 1.2057, "loss/crossentropy": 3.1605892181396484, "loss/hidden": 1.0234375, "loss/logits": 0.17851832509040833, "loss/reg": 0.0003760818799491972, "step": 5681 }, { "epoch": 0.71025, "grad_norm": 2.7672622203826904, "grad_norm_var": 0.18383354696532, "learning_rate": 0.0001, "loss": 1.4967, "loss/crossentropy": 2.369900941848755, "loss/hidden": 1.25, "loss/logits": 0.24290262162685394, "loss/reg": 0.0003759200335480273, "step": 5682 }, { "epoch": 0.710375, "grad_norm": 7.553741455078125, "grad_norm_var": 1.6530027310759823, "learning_rate": 0.0001, "loss": 1.283, "loss/crossentropy": 2.800811290740967, "loss/hidden": 1.1171875, "loss/logits": 0.16208630800247192, "loss/reg": 0.0003757603990379721, "step": 5683 }, { "epoch": 0.7105, "grad_norm": 2.7660460472106934, "grad_norm_var": 1.6560598265344935, "learning_rate": 0.0001, "loss": 1.3231, "loss/crossentropy": 2.728396415710449, "loss/hidden": 1.125, "loss/logits": 0.19430798292160034, "loss/reg": 0.00037559837801381946, "step": 5684 }, { "epoch": 0.710625, "grad_norm": 7.599856853485107, "grad_norm_var": 2.9600771219690705, "learning_rate": 0.0001, "loss": 1.8716, "loss/crossentropy": 2.3188931941986084, "loss/hidden": 1.53125, "loss/logits": 0.33657360076904297, "loss/reg": 0.0003753999772015959, "step": 5685 }, { "epoch": 0.71075, "grad_norm": 2.650937080383301, "grad_norm_var": 2.933324170466622, "learning_rate": 0.0001, "loss": 1.3642, "loss/crossentropy": 2.5786943435668945, "loss/hidden": 1.15625, "loss/logits": 0.2041967213153839, "loss/reg": 0.0003752127813640982, "step": 5686 }, { "epoch": 0.710875, "grad_norm": 2.4803502559661865, "grad_norm_var": 2.8855040071911082, "learning_rate": 0.0001, "loss": 1.2721, "loss/crossentropy": 2.525028705596924, "loss/hidden": 1.09375, "loss/logits": 0.17460569739341736, "loss/reg": 0.00037505110958591104, "step": 5687 }, { "epoch": 0.711, "grad_norm": 2.365394353866577, "grad_norm_var": 2.934494893543255, "learning_rate": 0.0001, "loss": 1.4657, "loss/crossentropy": 2.250152111053467, "loss/hidden": 1.234375, "loss/logits": 0.22758044302463531, "loss/reg": 0.00037487532244995236, "step": 5688 }, { "epoch": 0.711125, "grad_norm": 4.990616798400879, "grad_norm_var": 3.091962248704097, "learning_rate": 0.0001, "loss": 1.6802, "loss/crossentropy": 2.474379539489746, "loss/hidden": 1.3828125, "loss/logits": 0.2935987710952759, "loss/reg": 0.0003746793663594872, "step": 5689 }, { "epoch": 0.71125, "grad_norm": 2.8003604412078857, "grad_norm_var": 3.082682879062524, "learning_rate": 0.0001, "loss": 1.2164, "loss/crossentropy": 2.9018681049346924, "loss/hidden": 1.03125, "loss/logits": 0.1814047396183014, "loss/reg": 0.0003744862915482372, "step": 5690 }, { "epoch": 0.711375, "grad_norm": 2.7389514446258545, "grad_norm_var": 3.0387451585959826, "learning_rate": 0.0001, "loss": 1.3934, "loss/crossentropy": 2.597708225250244, "loss/hidden": 1.171875, "loss/logits": 0.21774005889892578, "loss/reg": 0.0003742809931281954, "step": 5691 }, { "epoch": 0.7115, "grad_norm": 2.655517816543579, "grad_norm_var": 3.0555384050309375, "learning_rate": 0.0001, "loss": 1.2273, "loss/crossentropy": 2.4063162803649902, "loss/hidden": 1.0546875, "loss/logits": 0.16883526742458344, "loss/reg": 0.00037405354669317603, "step": 5692 }, { "epoch": 0.711625, "grad_norm": 2.4380414485931396, "grad_norm_var": 3.0397203333778973, "learning_rate": 0.0001, "loss": 1.1543, "loss/crossentropy": 2.72413969039917, "loss/hidden": 1.0, "loss/logits": 0.1505393534898758, "loss/reg": 0.0003738535742741078, "step": 5693 }, { "epoch": 0.71175, "grad_norm": 2.4541678428649902, "grad_norm_var": 3.049566785437839, "learning_rate": 0.0001, "loss": 1.2559, "loss/crossentropy": 2.6617660522460938, "loss/hidden": 1.0859375, "loss/logits": 0.16626830399036407, "loss/reg": 0.0003736311919055879, "step": 5694 }, { "epoch": 0.711875, "grad_norm": 2.0965566635131836, "grad_norm_var": 3.100167735288382, "learning_rate": 0.0001, "loss": 1.2207, "loss/crossentropy": 2.343813419342041, "loss/hidden": 1.0390625, "loss/logits": 0.1779192090034485, "loss/reg": 0.00037342729046940804, "step": 5695 }, { "epoch": 0.712, "grad_norm": 2.9223740100860596, "grad_norm_var": 3.0925985892151875, "learning_rate": 0.0001, "loss": 1.5055, "loss/crossentropy": 2.5674257278442383, "loss/hidden": 1.2734375, "loss/logits": 0.22831273078918457, "loss/reg": 0.0003732294135261327, "step": 5696 }, { "epoch": 0.712125, "grad_norm": 3.174813985824585, "grad_norm_var": 3.0566881360896025, "learning_rate": 0.0001, "loss": 1.6213, "loss/crossentropy": 2.433300733566284, "loss/hidden": 1.34375, "loss/logits": 0.2738567590713501, "loss/reg": 0.0003730466414708644, "step": 5697 }, { "epoch": 0.71225, "grad_norm": 2.603998899459839, "grad_norm_var": 3.072202597916176, "learning_rate": 0.0001, "loss": 1.2615, "loss/crossentropy": 2.560908555984497, "loss/hidden": 1.078125, "loss/logits": 0.17963871359825134, "loss/reg": 0.0003728584561031312, "step": 5698 }, { "epoch": 0.712375, "grad_norm": 2.5758402347564697, "grad_norm_var": 1.8595079261378251, "learning_rate": 0.0001, "loss": 1.156, "loss/crossentropy": 2.553062677383423, "loss/hidden": 1.0, "loss/logits": 0.1522318720817566, "loss/reg": 0.00037267792504280806, "step": 5699 }, { "epoch": 0.7125, "grad_norm": 2.692561388015747, "grad_norm_var": 1.8629422453812365, "learning_rate": 0.0001, "loss": 1.2202, "loss/crossentropy": 2.4336562156677246, "loss/hidden": 1.0625, "loss/logits": 0.15401065349578857, "loss/reg": 0.00037247364525683224, "step": 5700 }, { "epoch": 0.712625, "grad_norm": 2.367158889770508, "grad_norm_var": 0.4190605492498603, "learning_rate": 0.0001, "loss": 1.3246, "loss/crossentropy": 2.208888292312622, "loss/hidden": 1.1328125, "loss/logits": 0.18801695108413696, "loss/reg": 0.0003722697729244828, "step": 5701 }, { "epoch": 0.71275, "grad_norm": 2.5294392108917236, "grad_norm_var": 0.4215956856128855, "learning_rate": 0.0001, "loss": 1.266, "loss/crossentropy": 2.6590356826782227, "loss/hidden": 1.078125, "loss/logits": 0.18416786193847656, "loss/reg": 0.00037209875881671906, "step": 5702 }, { "epoch": 0.712875, "grad_norm": 3.3605356216430664, "grad_norm_var": 0.43920563236719445, "learning_rate": 0.0001, "loss": 1.4373, "loss/crossentropy": 2.7114882469177246, "loss/hidden": 1.2265625, "loss/logits": 0.2070598304271698, "loss/reg": 0.00037193557363934815, "step": 5703 }, { "epoch": 0.713, "grad_norm": 3.005321979522705, "grad_norm_var": 0.42789724627552533, "learning_rate": 0.0001, "loss": 1.801, "loss/crossentropy": 1.9861658811569214, "loss/hidden": 1.453125, "loss/logits": 0.34417062997817993, "loss/reg": 0.0003717677900567651, "step": 5704 }, { "epoch": 0.713125, "grad_norm": 2.657390594482422, "grad_norm_var": 0.09843762613896463, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.8208463191986084, "loss/hidden": 1.09375, "loss/logits": 0.18353816866874695, "loss/reg": 0.00037159561179578304, "step": 5705 }, { "epoch": 0.71325, "grad_norm": 4.640601634979248, "grad_norm_var": 0.3366652147282572, "learning_rate": 0.0001, "loss": 1.4306, "loss/crossentropy": 2.856410026550293, "loss/hidden": 1.0859375, "loss/logits": 0.34097886085510254, "loss/reg": 0.0003714081540238112, "step": 5706 }, { "epoch": 0.713375, "grad_norm": 2.941333532333374, "grad_norm_var": 0.3373867357209176, "learning_rate": 0.0001, "loss": 1.3613, "loss/crossentropy": 2.6913082599639893, "loss/hidden": 1.1484375, "loss/logits": 0.20917707681655884, "loss/reg": 0.00037125012022443116, "step": 5707 }, { "epoch": 0.7135, "grad_norm": 4.095263957977295, "grad_norm_var": 0.43541817911081654, "learning_rate": 0.0001, "loss": 1.506, "loss/crossentropy": 2.7709624767303467, "loss/hidden": 1.3046875, "loss/logits": 0.19764190912246704, "loss/reg": 0.00037106493255123496, "step": 5708 }, { "epoch": 0.713625, "grad_norm": 2.2941195964813232, "grad_norm_var": 0.4457639422117154, "learning_rate": 0.0001, "loss": 1.1199, "loss/crossentropy": 2.4326698780059814, "loss/hidden": 0.95703125, "loss/logits": 0.15912145376205444, "loss/reg": 0.00037088667158968747, "step": 5709 }, { "epoch": 0.71375, "grad_norm": 2.437795400619507, "grad_norm_var": 0.446755509961546, "learning_rate": 0.0001, "loss": 1.1835, "loss/crossentropy": 2.5594732761383057, "loss/hidden": 1.015625, "loss/logits": 0.1641993522644043, "loss/reg": 0.00037068992969579995, "step": 5710 }, { "epoch": 0.713875, "grad_norm": 4.025079250335693, "grad_norm_var": 0.4726896455702537, "learning_rate": 0.0001, "loss": 1.7462, "loss/crossentropy": 2.972163677215576, "loss/hidden": 1.453125, "loss/logits": 0.2893938422203064, "loss/reg": 0.0003705074777826667, "step": 5711 }, { "epoch": 0.714, "grad_norm": 2.932145357131958, "grad_norm_var": 0.47256812592897646, "learning_rate": 0.0001, "loss": 1.5491, "loss/crossentropy": 2.5293805599212646, "loss/hidden": 1.296875, "loss/logits": 0.24854236841201782, "loss/reg": 0.0003703366091940552, "step": 5712 }, { "epoch": 0.714125, "grad_norm": 2.6184656620025635, "grad_norm_var": 0.4804913985520573, "learning_rate": 0.0001, "loss": 1.3989, "loss/crossentropy": 2.506767988204956, "loss/hidden": 1.171875, "loss/logits": 0.2233162522315979, "loss/reg": 0.0003701469977386296, "step": 5713 }, { "epoch": 0.71425, "grad_norm": 2.7178218364715576, "grad_norm_var": 0.4755027319711092, "learning_rate": 0.0001, "loss": 1.5339, "loss/crossentropy": 2.2329671382904053, "loss/hidden": 1.2734375, "loss/logits": 0.2568070590496063, "loss/reg": 0.0003699749067891389, "step": 5714 }, { "epoch": 0.714375, "grad_norm": 2.7111470699310303, "grad_norm_var": 0.4691177950876956, "learning_rate": 0.0001, "loss": 1.4017, "loss/crossentropy": 2.697866201400757, "loss/hidden": 1.1953125, "loss/logits": 0.20269858837127686, "loss/reg": 0.0003697677457239479, "step": 5715 }, { "epoch": 0.7145, "grad_norm": 2.7465569972991943, "grad_norm_var": 0.467074856944496, "learning_rate": 0.0001, "loss": 1.2045, "loss/crossentropy": 2.420863151550293, "loss/hidden": 1.0546875, "loss/logits": 0.14609871804714203, "loss/reg": 0.0003695569175761193, "step": 5716 }, { "epoch": 0.714625, "grad_norm": 4.398207187652588, "grad_norm_var": 0.5521627121597966, "learning_rate": 0.0001, "loss": 1.8508, "loss/crossentropy": 1.850888729095459, "loss/hidden": 1.6171875, "loss/logits": 0.22987419366836548, "loss/reg": 0.00036933502997271717, "step": 5717 }, { "epoch": 0.71475, "grad_norm": 24.481433868408203, "grad_norm_var": 28.906779016075227, "learning_rate": 0.0001, "loss": 1.224, "loss/crossentropy": 2.675092935562134, "loss/hidden": 1.0625, "loss/logits": 0.15782737731933594, "loss/reg": 0.0003691082529257983, "step": 5718 }, { "epoch": 0.714875, "grad_norm": 2.7329981327056885, "grad_norm_var": 29.027063209589837, "learning_rate": 0.0001, "loss": 1.5746, "loss/crossentropy": 2.5178370475769043, "loss/hidden": 1.3046875, "loss/logits": 0.26617974042892456, "loss/reg": 0.00036887687747366726, "step": 5719 }, { "epoch": 0.715, "grad_norm": 2.902642011642456, "grad_norm_var": 29.047702422327735, "learning_rate": 0.0001, "loss": 1.3681, "loss/crossentropy": 2.751828670501709, "loss/hidden": 1.1875, "loss/logits": 0.17690420150756836, "loss/reg": 0.0003686138370539993, "step": 5720 }, { "epoch": 0.715125, "grad_norm": 2.5569441318511963, "grad_norm_var": 29.07245251530308, "learning_rate": 0.0001, "loss": 1.166, "loss/crossentropy": 2.5573737621307373, "loss/hidden": 1.0, "loss/logits": 0.16236111521720886, "loss/reg": 0.00036842667032033205, "step": 5721 }, { "epoch": 0.71525, "grad_norm": 2.5911929607391357, "grad_norm_var": 29.283430505608262, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.584099054336548, "loss/hidden": 1.1171875, "loss/logits": 0.19959846138954163, "loss/reg": 0.0003682394162751734, "step": 5722 }, { "epoch": 0.715375, "grad_norm": 3.6241378784179688, "grad_norm_var": 29.186695463971677, "learning_rate": 0.0001, "loss": 1.5059, "loss/crossentropy": 1.9644908905029297, "loss/hidden": 1.265625, "loss/logits": 0.23659175634384155, "loss/reg": 0.00036803740658797324, "step": 5723 }, { "epoch": 0.7155, "grad_norm": 2.7626824378967285, "grad_norm_var": 29.34589552870665, "learning_rate": 0.0001, "loss": 1.3039, "loss/crossentropy": 2.228987455368042, "loss/hidden": 1.109375, "loss/logits": 0.19086912274360657, "loss/reg": 0.0003677844360936433, "step": 5724 }, { "epoch": 0.715625, "grad_norm": 2.8101556301116943, "grad_norm_var": 29.225671233986038, "learning_rate": 0.0001, "loss": 1.5422, "loss/crossentropy": 2.44899845123291, "loss/hidden": 1.3203125, "loss/logits": 0.2181798666715622, "loss/reg": 0.000367596629075706, "step": 5725 }, { "epoch": 0.71575, "grad_norm": 2.601062774658203, "grad_norm_var": 29.186459616116696, "learning_rate": 0.0001, "loss": 1.3057, "loss/crossentropy": 2.782904624938965, "loss/hidden": 1.1171875, "loss/logits": 0.18487858772277832, "loss/reg": 0.00036736761103384197, "step": 5726 }, { "epoch": 0.715875, "grad_norm": 3.8171699047088623, "grad_norm_var": 29.19749739380756, "learning_rate": 0.0001, "loss": 1.7324, "loss/crossentropy": 2.262608528137207, "loss/hidden": 1.46875, "loss/logits": 0.2599869966506958, "loss/reg": 0.00036712168366648257, "step": 5727 }, { "epoch": 0.716, "grad_norm": 2.856574296951294, "grad_norm_var": 29.21176597895488, "learning_rate": 0.0001, "loss": 1.2796, "loss/crossentropy": 2.6004457473754883, "loss/hidden": 1.0859375, "loss/logits": 0.19000020623207092, "loss/reg": 0.00036686682142317295, "step": 5728 }, { "epoch": 0.716125, "grad_norm": 2.356333017349243, "grad_norm_var": 29.27511412695848, "learning_rate": 0.0001, "loss": 1.3039, "loss/crossentropy": 2.367638349533081, "loss/hidden": 1.109375, "loss/logits": 0.19085419178009033, "loss/reg": 0.0003665951080620289, "step": 5729 }, { "epoch": 0.71625, "grad_norm": 2.577641010284424, "grad_norm_var": 29.305759135580065, "learning_rate": 0.0001, "loss": 1.3636, "loss/crossentropy": 2.4965403079986572, "loss/hidden": 1.171875, "loss/logits": 0.18810787796974182, "loss/reg": 0.0003663328825496137, "step": 5730 }, { "epoch": 0.716375, "grad_norm": 2.4351539611816406, "grad_norm_var": 29.368360063869186, "learning_rate": 0.0001, "loss": 1.2684, "loss/crossentropy": 2.521620750427246, "loss/hidden": 1.0703125, "loss/logits": 0.19443081319332123, "loss/reg": 0.0003660721704363823, "step": 5731 }, { "epoch": 0.7165, "grad_norm": 2.505187749862671, "grad_norm_var": 29.420890547734228, "learning_rate": 0.0001, "loss": 1.1992, "loss/crossentropy": 2.3392248153686523, "loss/hidden": 1.0390625, "loss/logits": 0.1564978063106537, "loss/reg": 0.00036585042835213244, "step": 5732 }, { "epoch": 0.716625, "grad_norm": 2.8398141860961914, "grad_norm_var": 29.542005598197782, "learning_rate": 0.0001, "loss": 1.5044, "loss/crossentropy": 2.4533817768096924, "loss/hidden": 1.2734375, "loss/logits": 0.22730302810668945, "loss/reg": 0.00036559486761689186, "step": 5733 }, { "epoch": 0.71675, "grad_norm": 2.75813364982605, "grad_norm_var": 0.15634230747659691, "learning_rate": 0.0001, "loss": 1.2673, "loss/crossentropy": 2.67130708694458, "loss/hidden": 1.078125, "loss/logits": 0.18547874689102173, "loss/reg": 0.0003654052270576358, "step": 5734 }, { "epoch": 0.716875, "grad_norm": 2.3956034183502197, "grad_norm_var": 0.1662682180852689, "learning_rate": 0.0001, "loss": 1.3701, "loss/crossentropy": 2.425265073776245, "loss/hidden": 1.1640625, "loss/logits": 0.20235227048397064, "loss/reg": 0.00036514000385068357, "step": 5735 }, { "epoch": 0.717, "grad_norm": 3.565467596054077, "grad_norm_var": 0.20506027901016352, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.854248285293579, "loss/hidden": 1.1796875, "loss/logits": 0.26830488443374634, "loss/reg": 0.0003649081918410957, "step": 5736 }, { "epoch": 0.717125, "grad_norm": 2.7023470401763916, "grad_norm_var": 0.20136265072077605, "learning_rate": 0.0001, "loss": 1.2649, "loss/crossentropy": 2.508511781692505, "loss/hidden": 1.0703125, "loss/logits": 0.19097790122032166, "loss/reg": 0.0003646640107035637, "step": 5737 }, { "epoch": 0.71725, "grad_norm": 2.97951602935791, "grad_norm_var": 0.19868598146873992, "learning_rate": 0.0001, "loss": 1.4979, "loss/crossentropy": 2.421482801437378, "loss/hidden": 1.2421875, "loss/logits": 0.25204789638519287, "loss/reg": 0.00036442340933717787, "step": 5738 }, { "epoch": 0.717375, "grad_norm": 4.831191062927246, "grad_norm_var": 0.41446810663437056, "learning_rate": 0.0001, "loss": 1.3323, "loss/crossentropy": 2.5508930683135986, "loss/hidden": 1.15625, "loss/logits": 0.17244485020637512, "loss/reg": 0.0003642434603534639, "step": 5739 }, { "epoch": 0.7175, "grad_norm": 3.5134918689727783, "grad_norm_var": 0.43348833675981857, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.4572179317474365, "loss/hidden": 1.0625, "loss/logits": 0.19154798984527588, "loss/reg": 0.000364012987120077, "step": 5740 }, { "epoch": 0.717625, "grad_norm": 2.1128249168395996, "grad_norm_var": 0.47888650398652405, "learning_rate": 0.0001, "loss": 1.2972, "loss/crossentropy": 2.2697207927703857, "loss/hidden": 1.109375, "loss/logits": 0.18416211009025574, "loss/reg": 0.00036378938239067793, "step": 5741 }, { "epoch": 0.71775, "grad_norm": 2.4992082118988037, "grad_norm_var": 0.4839744936038831, "learning_rate": 0.0001, "loss": 1.5833, "loss/crossentropy": 2.31099534034729, "loss/hidden": 1.3125, "loss/logits": 0.2671615183353424, "loss/reg": 0.0003635583561845124, "step": 5742 }, { "epoch": 0.717875, "grad_norm": 2.6057119369506836, "grad_norm_var": 0.4310426059213169, "learning_rate": 0.0001, "loss": 1.3321, "loss/crossentropy": 2.2931487560272217, "loss/hidden": 1.125, "loss/logits": 0.20349088311195374, "loss/reg": 0.00036332072340883315, "step": 5743 }, { "epoch": 0.718, "grad_norm": 4.687112808227539, "grad_norm_var": 0.6430804050003645, "learning_rate": 0.0001, "loss": 1.7466, "loss/crossentropy": 2.4156126976013184, "loss/hidden": 1.4453125, "loss/logits": 0.2976236343383789, "loss/reg": 0.0003630835562944412, "step": 5744 }, { "epoch": 0.718125, "grad_norm": 3.0812907218933105, "grad_norm_var": 0.6175484373609206, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.6850814819335938, "loss/hidden": 1.109375, "loss/logits": 0.19411899149417877, "loss/reg": 0.0003628984559327364, "step": 5745 }, { "epoch": 0.71825, "grad_norm": 3.504058361053467, "grad_norm_var": 0.6183257796956352, "learning_rate": 0.0001, "loss": 1.3719, "loss/crossentropy": 2.624849557876587, "loss/hidden": 1.171875, "loss/logits": 0.19639447331428528, "loss/reg": 0.0003627150726970285, "step": 5746 }, { "epoch": 0.718375, "grad_norm": 4.889694690704346, "grad_norm_var": 0.789231468827061, "learning_rate": 0.0001, "loss": 1.5132, "loss/crossentropy": 2.4722864627838135, "loss/hidden": 1.3046875, "loss/logits": 0.20483845472335815, "loss/reg": 0.0003624896053224802, "step": 5747 }, { "epoch": 0.7185, "grad_norm": 2.3323779106140137, "grad_norm_var": 0.8074970714389025, "learning_rate": 0.0001, "loss": 1.2402, "loss/crossentropy": 2.4992592334747314, "loss/hidden": 1.0546875, "loss/logits": 0.18187940120697021, "loss/reg": 0.00036230424302630126, "step": 5748 }, { "epoch": 0.718625, "grad_norm": 3.479903221130371, "grad_norm_var": 0.8018421532802668, "learning_rate": 0.0001, "loss": 1.3097, "loss/crossentropy": 2.4811923503875732, "loss/hidden": 1.1171875, "loss/logits": 0.18893253803253174, "loss/reg": 0.00036206966615282, "step": 5749 }, { "epoch": 0.71875, "grad_norm": 2.753910541534424, "grad_norm_var": 0.802118044312207, "learning_rate": 0.0001, "loss": 1.4545, "loss/crossentropy": 2.5713188648223877, "loss/hidden": 1.234375, "loss/logits": 0.21654322743415833, "loss/reg": 0.0003618276969064027, "step": 5750 }, { "epoch": 0.718875, "grad_norm": 2.2514877319335938, "grad_norm_var": 0.8197541093129378, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.705150604248047, "loss/hidden": 1.03125, "loss/logits": 0.1755264401435852, "loss/reg": 0.00036162862670607865, "step": 5751 }, { "epoch": 0.719, "grad_norm": 2.3359086513519287, "grad_norm_var": 0.8603685437886118, "learning_rate": 0.0001, "loss": 1.3058, "loss/crossentropy": 2.5658555030822754, "loss/hidden": 1.09375, "loss/logits": 0.20845401287078857, "loss/reg": 0.0003613839508034289, "step": 5752 }, { "epoch": 0.719125, "grad_norm": 3.5554027557373047, "grad_norm_var": 0.8537959969145426, "learning_rate": 0.0001, "loss": 1.6001, "loss/crossentropy": 2.5004594326019287, "loss/hidden": 1.3828125, "loss/logits": 0.21365897357463837, "loss/reg": 0.0003611399151850492, "step": 5753 }, { "epoch": 0.71925, "grad_norm": 4.079704284667969, "grad_norm_var": 0.8951500285798087, "learning_rate": 0.0001, "loss": 1.7205, "loss/crossentropy": 1.9921120405197144, "loss/hidden": 1.484375, "loss/logits": 0.23251482844352722, "loss/reg": 0.0003609427949413657, "step": 5754 }, { "epoch": 0.719375, "grad_norm": 5.513491153717041, "grad_norm_var": 1.0651736846382653, "learning_rate": 0.0001, "loss": 1.7621, "loss/crossentropy": 2.6408398151397705, "loss/hidden": 1.421875, "loss/logits": 0.336586058139801, "loss/reg": 0.0003606978280004114, "step": 5755 }, { "epoch": 0.7195, "grad_norm": 2.5355772972106934, "grad_norm_var": 1.1003302770316916, "learning_rate": 0.0001, "loss": 1.3859, "loss/crossentropy": 2.4636189937591553, "loss/hidden": 1.1796875, "loss/logits": 0.20256280899047852, "loss/reg": 0.0003604759695008397, "step": 5756 }, { "epoch": 0.719625, "grad_norm": 5.446800231933594, "grad_norm_var": 1.283486332237002, "learning_rate": 0.0001, "loss": 1.3728, "loss/crossentropy": 2.5620102882385254, "loss/hidden": 1.171875, "loss/logits": 0.1973361372947693, "loss/reg": 0.00036027946043759584, "step": 5757 }, { "epoch": 0.71975, "grad_norm": 2.454723358154297, "grad_norm_var": 1.2893798137490058, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.389644145965576, "loss/hidden": 1.125, "loss/logits": 0.1922318935394287, "loss/reg": 0.0003600687487050891, "step": 5758 }, { "epoch": 0.719875, "grad_norm": 2.3590869903564453, "grad_norm_var": 1.3215755737901111, "learning_rate": 0.0001, "loss": 1.1706, "loss/crossentropy": 2.789262533187866, "loss/hidden": 1.015625, "loss/logits": 0.15132831037044525, "loss/reg": 0.00035988056333735585, "step": 5759 }, { "epoch": 0.72, "grad_norm": 2.6318182945251465, "grad_norm_var": 1.2476095428796419, "learning_rate": 0.0001, "loss": 1.3223, "loss/crossentropy": 2.4788248538970947, "loss/hidden": 1.1171875, "loss/logits": 0.2015315294265747, "loss/reg": 0.00035970553290098906, "step": 5760 }, { "epoch": 0.720125, "grad_norm": 4.247242450714111, "grad_norm_var": 1.2946367806792956, "learning_rate": 0.0001, "loss": 1.6365, "loss/crossentropy": 2.560563325881958, "loss/hidden": 1.375, "loss/logits": 0.25794732570648193, "loss/reg": 0.0003595038433559239, "step": 5761 }, { "epoch": 0.72025, "grad_norm": 3.1223225593566895, "grad_norm_var": 1.2983563909853688, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.6948444843292236, "loss/hidden": 1.1171875, "loss/logits": 0.2084653228521347, "loss/reg": 0.0003593252331484109, "step": 5762 }, { "epoch": 0.720375, "grad_norm": 3.5839788913726807, "grad_norm_var": 1.1410960406921116, "learning_rate": 0.0001, "loss": 1.7101, "loss/crossentropy": 2.332798480987549, "loss/hidden": 1.40625, "loss/logits": 0.3002195656299591, "loss/reg": 0.00035915206535719335, "step": 5763 }, { "epoch": 0.7205, "grad_norm": 3.752647638320923, "grad_norm_var": 1.0853070552102126, "learning_rate": 0.0001, "loss": 1.458, "loss/crossentropy": 2.4965763092041016, "loss/hidden": 1.2265625, "loss/logits": 0.22788339853286743, "loss/reg": 0.0003590118431020528, "step": 5764 }, { "epoch": 0.720625, "grad_norm": 3.030879020690918, "grad_norm_var": 1.0920171082838348, "learning_rate": 0.0001, "loss": 1.3018, "loss/crossentropy": 2.793853521347046, "loss/hidden": 1.125, "loss/logits": 0.17325782775878906, "loss/reg": 0.00035882025258615613, "step": 5765 }, { "epoch": 0.72075, "grad_norm": 9.203132629394531, "grad_norm_var": 3.176016179428961, "learning_rate": 0.0001, "loss": 1.8809, "loss/crossentropy": 2.650239944458008, "loss/hidden": 1.5859375, "loss/logits": 0.29140618443489075, "loss/reg": 0.00035867837141267955, "step": 5766 }, { "epoch": 0.720875, "grad_norm": 3.0232269763946533, "grad_norm_var": 3.058375105113521, "learning_rate": 0.0001, "loss": 1.3627, "loss/crossentropy": 2.647555112838745, "loss/hidden": 1.140625, "loss/logits": 0.21845968067646027, "loss/reg": 0.0003584854130167514, "step": 5767 }, { "epoch": 0.721, "grad_norm": 2.8914756774902344, "grad_norm_var": 2.9688609788647318, "learning_rate": 0.0001, "loss": 1.2694, "loss/crossentropy": 2.6094343662261963, "loss/hidden": 1.078125, "loss/logits": 0.18769600987434387, "loss/reg": 0.000358328630682081, "step": 5768 }, { "epoch": 0.721125, "grad_norm": 3.335010051727295, "grad_norm_var": 2.980244282961062, "learning_rate": 0.0001, "loss": 1.5188, "loss/crossentropy": 2.324448585510254, "loss/hidden": 1.3125, "loss/logits": 0.20268398523330688, "loss/reg": 0.0003581807832233608, "step": 5769 }, { "epoch": 0.72125, "grad_norm": 2.5124571323394775, "grad_norm_var": 3.080681350594125, "learning_rate": 0.0001, "loss": 1.5375, "loss/crossentropy": 2.441929340362549, "loss/hidden": 1.3046875, "loss/logits": 0.22923922538757324, "loss/reg": 0.00035799172474071383, "step": 5770 }, { "epoch": 0.721375, "grad_norm": 2.659370183944702, "grad_norm_var": 2.910240837468552, "learning_rate": 0.0001, "loss": 1.4906, "loss/crossentropy": 2.6951446533203125, "loss/hidden": 1.2421875, "loss/logits": 0.24487558007240295, "loss/reg": 0.000357835233444348, "step": 5771 }, { "epoch": 0.7215, "grad_norm": 2.788163661956787, "grad_norm_var": 2.8800859934539895, "learning_rate": 0.0001, "loss": 1.3043, "loss/crossentropy": 2.73773455619812, "loss/hidden": 1.125, "loss/logits": 0.17569778859615326, "loss/reg": 0.00035769285750575364, "step": 5772 }, { "epoch": 0.721625, "grad_norm": 2.5156047344207764, "grad_norm_var": 2.681680655935577, "learning_rate": 0.0001, "loss": 1.2131, "loss/crossentropy": 2.4380264282226562, "loss/hidden": 1.046875, "loss/logits": 0.16267549991607666, "loss/reg": 0.0003575009759515524, "step": 5773 }, { "epoch": 0.72175, "grad_norm": 3.3178393840789795, "grad_norm_var": 2.621534442092911, "learning_rate": 0.0001, "loss": 1.6737, "loss/crossentropy": 2.2742342948913574, "loss/hidden": 1.3671875, "loss/logits": 0.3028985261917114, "loss/reg": 0.0003573299909476191, "step": 5774 }, { "epoch": 0.721875, "grad_norm": 3.2020394802093506, "grad_norm_var": 2.544919046937479, "learning_rate": 0.0001, "loss": 1.2771, "loss/crossentropy": 2.7629852294921875, "loss/hidden": 1.0859375, "loss/logits": 0.18754467368125916, "loss/reg": 0.0003571631386876106, "step": 5775 }, { "epoch": 0.722, "grad_norm": 2.6907873153686523, "grad_norm_var": 2.5384000961816393, "learning_rate": 0.0001, "loss": 1.3271, "loss/crossentropy": 2.432802677154541, "loss/hidden": 1.1328125, "loss/logits": 0.19072461128234863, "loss/reg": 0.00035697457497008145, "step": 5776 }, { "epoch": 0.722125, "grad_norm": 2.413527727127075, "grad_norm_var": 2.563967565341439, "learning_rate": 0.0001, "loss": 1.3277, "loss/crossentropy": 2.3465073108673096, "loss/hidden": 1.109375, "loss/logits": 0.21474689245224, "loss/reg": 0.0003567654639482498, "step": 5777 }, { "epoch": 0.72225, "grad_norm": 3.188591718673706, "grad_norm_var": 2.5619859609613664, "learning_rate": 0.0001, "loss": 1.6529, "loss/crossentropy": 2.3550021648406982, "loss/hidden": 1.3671875, "loss/logits": 0.28219419717788696, "loss/reg": 0.0003565794322639704, "step": 5778 }, { "epoch": 0.722375, "grad_norm": 2.3342881202697754, "grad_norm_var": 2.625905048137459, "learning_rate": 0.0001, "loss": 1.3259, "loss/crossentropy": 2.461946487426758, "loss/hidden": 1.125, "loss/logits": 0.19729074835777283, "loss/reg": 0.0003563994832802564, "step": 5779 }, { "epoch": 0.7225, "grad_norm": 2.8239519596099854, "grad_norm_var": 2.6242171192053014, "learning_rate": 0.0001, "loss": 1.3005, "loss/crossentropy": 2.530243396759033, "loss/hidden": 1.1328125, "loss/logits": 0.1641232669353485, "loss/reg": 0.00035622387076728046, "step": 5780 }, { "epoch": 0.722625, "grad_norm": 2.969609260559082, "grad_norm_var": 2.626206244338933, "learning_rate": 0.0001, "loss": 1.6743, "loss/crossentropy": 2.19881534576416, "loss/hidden": 1.390625, "loss/logits": 0.2801201045513153, "loss/reg": 0.00035603210562840104, "step": 5781 }, { "epoch": 0.72275, "grad_norm": 6.316041946411133, "grad_norm_var": 0.8523809542108133, "learning_rate": 0.0001, "loss": 2.0067, "loss/crossentropy": 2.247415065765381, "loss/hidden": 1.5390625, "loss/logits": 0.4640933871269226, "loss/reg": 0.0003558431926649064, "step": 5782 }, { "epoch": 0.722875, "grad_norm": 2.5488364696502686, "grad_norm_var": 0.8688592346410395, "learning_rate": 0.0001, "loss": 1.2104, "loss/crossentropy": 2.4141085147857666, "loss/hidden": 1.046875, "loss/logits": 0.1599734127521515, "loss/reg": 0.000355644675437361, "step": 5783 }, { "epoch": 0.723, "grad_norm": 2.294776439666748, "grad_norm_var": 0.9022705545250439, "learning_rate": 0.0001, "loss": 1.3179, "loss/crossentropy": 2.342129707336426, "loss/hidden": 1.125, "loss/logits": 0.1893795281648636, "loss/reg": 0.0003554400464054197, "step": 5784 }, { "epoch": 0.723125, "grad_norm": 2.909126043319702, "grad_norm_var": 0.8942670044471385, "learning_rate": 0.0001, "loss": 1.349, "loss/crossentropy": 2.357543468475342, "loss/hidden": 1.1328125, "loss/logits": 0.21261516213417053, "loss/reg": 0.00035525631392374635, "step": 5785 }, { "epoch": 0.72325, "grad_norm": 2.6049036979675293, "grad_norm_var": 0.8891883381790134, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.157963752746582, "loss/hidden": 1.125, "loss/logits": 0.15532733500003815, "loss/reg": 0.000355065189069137, "step": 5786 }, { "epoch": 0.723375, "grad_norm": 4.008474826812744, "grad_norm_var": 0.9464212877614945, "learning_rate": 0.0001, "loss": 1.289, "loss/crossentropy": 2.5804286003112793, "loss/hidden": 1.1015625, "loss/logits": 0.18385082483291626, "loss/reg": 0.0003548190288711339, "step": 5787 }, { "epoch": 0.7235, "grad_norm": 2.784261465072632, "grad_norm_var": 0.9465625866576864, "learning_rate": 0.0001, "loss": 1.4124, "loss/crossentropy": 2.4447803497314453, "loss/hidden": 1.203125, "loss/logits": 0.20574700832366943, "loss/reg": 0.00035462953383103013, "step": 5788 }, { "epoch": 0.723625, "grad_norm": 3.3968617916107178, "grad_norm_var": 0.9314082192454879, "learning_rate": 0.0001, "loss": 1.886, "loss/crossentropy": 2.3634748458862305, "loss/hidden": 1.515625, "loss/logits": 0.3668377995491028, "loss/reg": 0.00035444367676973343, "step": 5789 }, { "epoch": 0.72375, "grad_norm": 2.597902297973633, "grad_norm_var": 0.9441152026886509, "learning_rate": 0.0001, "loss": 1.2998, "loss/crossentropy": 2.554722547531128, "loss/hidden": 1.109375, "loss/logits": 0.18684709072113037, "loss/reg": 0.00035425618989393115, "step": 5790 }, { "epoch": 0.723875, "grad_norm": 2.7634201049804688, "grad_norm_var": 0.948285720682573, "learning_rate": 0.0001, "loss": 1.2532, "loss/crossentropy": 2.4910571575164795, "loss/hidden": 1.078125, "loss/logits": 0.17155171930789948, "loss/reg": 0.0003540589823387563, "step": 5791 }, { "epoch": 0.724, "grad_norm": 2.8233799934387207, "grad_norm_var": 0.9432048586997058, "learning_rate": 0.0001, "loss": 1.3155, "loss/crossentropy": 2.658449172973633, "loss/hidden": 1.1328125, "loss/logits": 0.1791212558746338, "loss/reg": 0.00035388072137720883, "step": 5792 }, { "epoch": 0.724125, "grad_norm": 2.216360092163086, "grad_norm_var": 0.9623305587965044, "learning_rate": 0.0001, "loss": 1.1091, "loss/crossentropy": 2.334512948989868, "loss/hidden": 0.9765625, "loss/logits": 0.1290445476770401, "loss/reg": 0.0003537074662744999, "step": 5793 }, { "epoch": 0.72425, "grad_norm": 2.7778899669647217, "grad_norm_var": 0.9645332271317926, "learning_rate": 0.0001, "loss": 1.3562, "loss/crossentropy": 2.4621026515960693, "loss/hidden": 1.140625, "loss/logits": 0.21201764047145844, "loss/reg": 0.0003535384021233767, "step": 5794 }, { "epoch": 0.724375, "grad_norm": 2.257072687149048, "grad_norm_var": 0.9718690731142068, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.6049745082855225, "loss/hidden": 1.09375, "loss/logits": 0.2210577428340912, "loss/reg": 0.0003533657582011074, "step": 5795 }, { "epoch": 0.7245, "grad_norm": 4.267026424407959, "grad_norm_var": 1.0670328687848563, "learning_rate": 0.0001, "loss": 1.3234, "loss/crossentropy": 2.591435670852661, "loss/hidden": 1.1171875, "loss/logits": 0.20270425081253052, "loss/reg": 0.00035319558810442686, "step": 5796 }, { "epoch": 0.724625, "grad_norm": 2.845338821411133, "grad_norm_var": 1.0700922243531135, "learning_rate": 0.0001, "loss": 1.311, "loss/crossentropy": 2.096904754638672, "loss/hidden": 1.1015625, "loss/logits": 0.20589277148246765, "loss/reg": 0.00035300804302096367, "step": 5797 }, { "epoch": 0.72475, "grad_norm": 3.259796142578125, "grad_norm_var": 0.33855038643240787, "learning_rate": 0.0001, "loss": 1.4174, "loss/crossentropy": 2.281235456466675, "loss/hidden": 1.2265625, "loss/logits": 0.18727287650108337, "loss/reg": 0.00035284331534057856, "step": 5798 }, { "epoch": 0.724875, "grad_norm": 2.6268022060394287, "grad_norm_var": 0.33530876555479666, "learning_rate": 0.0001, "loss": 1.4485, "loss/crossentropy": 2.3747358322143555, "loss/hidden": 1.234375, "loss/logits": 0.21061909198760986, "loss/reg": 0.0003526851942297071, "step": 5799 }, { "epoch": 0.725, "grad_norm": 3.577906370162964, "grad_norm_var": 0.3343090417278534, "learning_rate": 0.0001, "loss": 1.3469, "loss/crossentropy": 2.5986757278442383, "loss/hidden": 1.140625, "loss/logits": 0.20275937020778656, "loss/reg": 0.0003524967178236693, "step": 5800 }, { "epoch": 0.725125, "grad_norm": 2.899341583251953, "grad_norm_var": 0.33441046496947896, "learning_rate": 0.0001, "loss": 1.4172, "loss/crossentropy": 2.446784734725952, "loss/hidden": 1.1875, "loss/logits": 0.22621336579322815, "loss/reg": 0.00035233862581662834, "step": 5801 }, { "epoch": 0.72525, "grad_norm": 3.517775535583496, "grad_norm_var": 0.3406351819109574, "learning_rate": 0.0001, "loss": 1.424, "loss/crossentropy": 2.4028208255767822, "loss/hidden": 1.21875, "loss/logits": 0.20168934762477875, "loss/reg": 0.0003521901380736381, "step": 5802 }, { "epoch": 0.725375, "grad_norm": 2.8422160148620605, "grad_norm_var": 0.2748480206820346, "learning_rate": 0.0001, "loss": 1.4348, "loss/crossentropy": 2.5622966289520264, "loss/hidden": 1.203125, "loss/logits": 0.2281184047460556, "loss/reg": 0.0003520009049680084, "step": 5803 }, { "epoch": 0.7255, "grad_norm": 2.1715996265411377, "grad_norm_var": 0.3131400587293418, "learning_rate": 0.0001, "loss": 1.1994, "loss/crossentropy": 2.5747973918914795, "loss/hidden": 1.0234375, "loss/logits": 0.17242954671382904, "loss/reg": 0.00035181958810426295, "step": 5804 }, { "epoch": 0.725625, "grad_norm": 2.7852296829223633, "grad_norm_var": 0.2982475396714295, "learning_rate": 0.0001, "loss": 1.425, "loss/crossentropy": 2.5607850551605225, "loss/hidden": 1.203125, "loss/logits": 0.21831008791923523, "loss/reg": 0.0003516597207635641, "step": 5805 }, { "epoch": 0.72575, "grad_norm": 2.8760323524475098, "grad_norm_var": 0.29227551868548457, "learning_rate": 0.0001, "loss": 1.3951, "loss/crossentropy": 2.648505926132202, "loss/hidden": 1.1875, "loss/logits": 0.2040904462337494, "loss/reg": 0.0003514729905873537, "step": 5806 }, { "epoch": 0.725875, "grad_norm": 3.195047378540039, "grad_norm_var": 0.29567364333193474, "learning_rate": 0.0001, "loss": 1.5103, "loss/crossentropy": 2.2724082469940186, "loss/hidden": 1.3046875, "loss/logits": 0.20207901298999786, "loss/reg": 0.0003512997063808143, "step": 5807 }, { "epoch": 0.726, "grad_norm": 2.8235950469970703, "grad_norm_var": 0.29567048361795567, "learning_rate": 0.0001, "loss": 1.1511, "loss/crossentropy": 2.649785280227661, "loss/hidden": 0.99609375, "loss/logits": 0.15153461694717407, "loss/reg": 0.0003511276445351541, "step": 5808 }, { "epoch": 0.726125, "grad_norm": 2.775355815887451, "grad_norm_var": 0.26173571408325313, "learning_rate": 0.0001, "loss": 1.3648, "loss/crossentropy": 2.737452507019043, "loss/hidden": 1.15625, "loss/logits": 0.20503708720207214, "loss/reg": 0.0003509355883579701, "step": 5809 }, { "epoch": 0.72625, "grad_norm": 2.4782919883728027, "grad_norm_var": 0.27496488981439504, "learning_rate": 0.0001, "loss": 1.2441, "loss/crossentropy": 2.7315049171447754, "loss/hidden": 1.0703125, "loss/logits": 0.17025741934776306, "loss/reg": 0.00035073357867076993, "step": 5810 }, { "epoch": 0.726375, "grad_norm": 2.6684603691101074, "grad_norm_var": 0.2475395343539804, "learning_rate": 0.0001, "loss": 1.4811, "loss/crossentropy": 2.6623623371124268, "loss/hidden": 1.2421875, "loss/logits": 0.23540465533733368, "loss/reg": 0.0003505020576994866, "step": 5811 }, { "epoch": 0.7265, "grad_norm": 3.1885862350463867, "grad_norm_var": 0.1345342263889068, "learning_rate": 0.0001, "loss": 1.6526, "loss/crossentropy": 2.5080626010894775, "loss/hidden": 1.390625, "loss/logits": 0.25849512219429016, "loss/reg": 0.00035024393582716584, "step": 5812 }, { "epoch": 0.726625, "grad_norm": 2.373570203781128, "grad_norm_var": 0.1523993910151906, "learning_rate": 0.0001, "loss": 1.4508, "loss/crossentropy": 2.433903455734253, "loss/hidden": 1.203125, "loss/logits": 0.24420614540576935, "loss/reg": 0.0003500577586237341, "step": 5813 }, { "epoch": 0.72675, "grad_norm": 2.3500311374664307, "grad_norm_var": 0.15790427147543346, "learning_rate": 0.0001, "loss": 1.2192, "loss/crossentropy": 2.4833922386169434, "loss/hidden": 1.0390625, "loss/logits": 0.17667415738105774, "loss/reg": 0.00034986063838005066, "step": 5814 }, { "epoch": 0.726875, "grad_norm": 2.558826208114624, "grad_norm_var": 0.15996101344712746, "learning_rate": 0.0001, "loss": 1.1979, "loss/crossentropy": 2.663522481918335, "loss/hidden": 1.03125, "loss/logits": 0.16312161087989807, "loss/reg": 0.00034967210376635194, "step": 5815 }, { "epoch": 0.727, "grad_norm": 2.9323034286499023, "grad_norm_var": 0.12056516895126208, "learning_rate": 0.0001, "loss": 1.1691, "loss/crossentropy": 2.852360725402832, "loss/hidden": 0.99609375, "loss/logits": 0.16952945291996002, "loss/reg": 0.0003494948905427009, "step": 5816 }, { "epoch": 0.727125, "grad_norm": 4.377594947814941, "grad_norm_var": 0.28120330289222295, "learning_rate": 0.0001, "loss": 1.4148, "loss/crossentropy": 2.6891262531280518, "loss/hidden": 1.1953125, "loss/logits": 0.2159792184829712, "loss/reg": 0.0003493047843221575, "step": 5817 }, { "epoch": 0.72725, "grad_norm": 3.1099114418029785, "grad_norm_var": 0.2563544824397885, "learning_rate": 0.0001, "loss": 1.8792, "loss/crossentropy": 2.3514020442962646, "loss/hidden": 1.546875, "loss/logits": 0.3287960886955261, "loss/reg": 0.0003491241077426821, "step": 5818 }, { "epoch": 0.727375, "grad_norm": 2.619551420211792, "grad_norm_var": 0.2595110872522201, "learning_rate": 0.0001, "loss": 1.4278, "loss/crossentropy": 2.4791524410247803, "loss/hidden": 1.203125, "loss/logits": 0.22116973996162415, "loss/reg": 0.0003489228547550738, "step": 5819 }, { "epoch": 0.7275, "grad_norm": 3.297621011734009, "grad_norm_var": 0.23986921245285822, "learning_rate": 0.0001, "loss": 1.1654, "loss/crossentropy": 2.507911205291748, "loss/hidden": 1.0078125, "loss/logits": 0.15405365824699402, "loss/reg": 0.0003487396170385182, "step": 5820 }, { "epoch": 0.727625, "grad_norm": 2.7898149490356445, "grad_norm_var": 0.2397999770649276, "learning_rate": 0.0001, "loss": 1.428, "loss/crossentropy": 2.442533016204834, "loss/hidden": 1.21875, "loss/logits": 0.20580677688121796, "loss/reg": 0.00034854240948334336, "step": 5821 }, { "epoch": 0.72775, "grad_norm": 3.252408027648926, "grad_norm_var": 0.24740509066485833, "learning_rate": 0.0001, "loss": 1.3733, "loss/crossentropy": 2.810051202774048, "loss/hidden": 1.1640625, "loss/logits": 0.20574405789375305, "loss/reg": 0.00034833731479011476, "step": 5822 }, { "epoch": 0.727875, "grad_norm": 3.000748634338379, "grad_norm_var": 0.24275398697475883, "learning_rate": 0.0001, "loss": 1.4115, "loss/crossentropy": 2.593646287918091, "loss/hidden": 1.1953125, "loss/logits": 0.21274632215499878, "loss/reg": 0.0003481580060906708, "step": 5823 }, { "epoch": 0.728, "grad_norm": 2.446887731552124, "grad_norm_var": 0.25607829731645687, "learning_rate": 0.0001, "loss": 1.1807, "loss/crossentropy": 2.9798054695129395, "loss/hidden": 1.0, "loss/logits": 0.17721468210220337, "loss/reg": 0.0003479847509879619, "step": 5824 }, { "epoch": 0.728125, "grad_norm": 2.8059816360473633, "grad_norm_var": 0.2556738892707093, "learning_rate": 0.0001, "loss": 1.3386, "loss/crossentropy": 2.0668869018554688, "loss/hidden": 1.140625, "loss/logits": 0.1945306360721588, "loss/reg": 0.0003477917052805424, "step": 5825 }, { "epoch": 0.72825, "grad_norm": 3.3924720287323, "grad_norm_var": 0.2576426730882737, "learning_rate": 0.0001, "loss": 1.3105, "loss/crossentropy": 2.247586488723755, "loss/hidden": 1.1328125, "loss/logits": 0.1742526888847351, "loss/reg": 0.00034761137794703245, "step": 5826 }, { "epoch": 0.728375, "grad_norm": 10.421845436096191, "grad_norm_var": 3.7260538695940775, "learning_rate": 0.0001, "loss": 2.1057, "loss/crossentropy": 2.207242488861084, "loss/hidden": 1.84375, "loss/logits": 0.2585008144378662, "loss/reg": 0.0003474344266578555, "step": 5827 }, { "epoch": 0.7285, "grad_norm": 3.0405614376068115, "grad_norm_var": 3.7322350899467276, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.6673004627227783, "loss/hidden": 1.0078125, "loss/logits": 0.16101299226284027, "loss/reg": 0.00034725424484349787, "step": 5828 }, { "epoch": 0.728625, "grad_norm": 4.260777950286865, "grad_norm_var": 3.6907330589911793, "learning_rate": 0.0001, "loss": 1.2281, "loss/crossentropy": 2.4530506134033203, "loss/hidden": 1.0703125, "loss/logits": 0.1543082594871521, "loss/reg": 0.0003470552619546652, "step": 5829 }, { "epoch": 0.72875, "grad_norm": 4.192967414855957, "grad_norm_var": 3.6103377721063716, "learning_rate": 0.0001, "loss": 1.3884, "loss/crossentropy": 2.7927775382995605, "loss/hidden": 1.1875, "loss/logits": 0.197437584400177, "loss/reg": 0.0003468738286755979, "step": 5830 }, { "epoch": 0.728875, "grad_norm": 3.798435926437378, "grad_norm_var": 3.524990834237807, "learning_rate": 0.0001, "loss": 1.5035, "loss/crossentropy": 2.5427515506744385, "loss/hidden": 1.265625, "loss/logits": 0.23438668251037598, "loss/reg": 0.0003466843336354941, "step": 5831 }, { "epoch": 0.729, "grad_norm": 3.8597657680511475, "grad_norm_var": 3.4796451830874786, "learning_rate": 0.0001, "loss": 1.5874, "loss/crossentropy": 2.5832245349884033, "loss/hidden": 1.3515625, "loss/logits": 0.23235365748405457, "loss/reg": 0.0003465036570560187, "step": 5832 }, { "epoch": 0.729125, "grad_norm": 2.765606641769409, "grad_norm_var": 3.516126344229746, "learning_rate": 0.0001, "loss": 1.2932, "loss/crossentropy": 2.726531744003296, "loss/hidden": 1.09375, "loss/logits": 0.19601640105247498, "loss/reg": 0.0003463376488070935, "step": 5833 }, { "epoch": 0.72925, "grad_norm": 2.7269084453582764, "grad_norm_var": 3.554966987248673, "learning_rate": 0.0001, "loss": 1.4246, "loss/crossentropy": 2.637570381164551, "loss/hidden": 1.2109375, "loss/logits": 0.2101864218711853, "loss/reg": 0.00034619783400557935, "step": 5834 }, { "epoch": 0.729375, "grad_norm": 3.1013143062591553, "grad_norm_var": 3.5021886181626503, "learning_rate": 0.0001, "loss": 1.5426, "loss/crossentropy": 2.741814136505127, "loss/hidden": 1.3125, "loss/logits": 0.22664053738117218, "loss/reg": 0.0003460595617070794, "step": 5835 }, { "epoch": 0.7295, "grad_norm": 3.587531089782715, "grad_norm_var": 3.491998627843652, "learning_rate": 0.0001, "loss": 1.3093, "loss/crossentropy": 2.53836727142334, "loss/hidden": 1.1328125, "loss/logits": 0.17306053638458252, "loss/reg": 0.00034592769225127995, "step": 5836 }, { "epoch": 0.729625, "grad_norm": 2.3415491580963135, "grad_norm_var": 3.5598697356725055, "learning_rate": 0.0001, "loss": 1.3394, "loss/crossentropy": 2.5274956226348877, "loss/hidden": 1.125, "loss/logits": 0.21098892390727997, "loss/reg": 0.0003457418642938137, "step": 5837 }, { "epoch": 0.72975, "grad_norm": 5.945925235748291, "grad_norm_var": 3.8571475257390504, "learning_rate": 0.0001, "loss": 1.6861, "loss/crossentropy": 2.732320547103882, "loss/hidden": 1.3515625, "loss/logits": 0.3310815691947937, "loss/reg": 0.00034558374318294227, "step": 5838 }, { "epoch": 0.729875, "grad_norm": 2.6978883743286133, "grad_norm_var": 3.8973995538381336, "learning_rate": 0.0001, "loss": 1.6114, "loss/crossentropy": 2.546891212463379, "loss/hidden": 1.328125, "loss/logits": 0.27982646226882935, "loss/reg": 0.00034540038905106485, "step": 5839 }, { "epoch": 0.73, "grad_norm": 2.7873010635375977, "grad_norm_var": 3.8415629311523607, "learning_rate": 0.0001, "loss": 1.1638, "loss/crossentropy": 2.56611704826355, "loss/hidden": 1.0, "loss/logits": 0.1603727489709854, "loss/reg": 0.0003452190721873194, "step": 5840 }, { "epoch": 0.730125, "grad_norm": 2.57248854637146, "grad_norm_var": 3.877719966342573, "learning_rate": 0.0001, "loss": 1.22, "loss/crossentropy": 2.6887991428375244, "loss/hidden": 1.0390625, "loss/logits": 0.1774541139602661, "loss/reg": 0.0003450515796430409, "step": 5841 }, { "epoch": 0.73025, "grad_norm": 2.5245282649993896, "grad_norm_var": 3.976979205631831, "learning_rate": 0.0001, "loss": 1.226, "loss/crossentropy": 2.4269022941589355, "loss/hidden": 1.0546875, "loss/logits": 0.1678306758403778, "loss/reg": 0.00034489904646761715, "step": 5842 }, { "epoch": 0.730375, "grad_norm": 4.1797637939453125, "grad_norm_var": 0.8919073603845142, "learning_rate": 0.0001, "loss": 1.5805, "loss/crossentropy": 2.6343398094177246, "loss/hidden": 1.3125, "loss/logits": 0.264553427696228, "loss/reg": 0.00034472529659979045, "step": 5843 }, { "epoch": 0.7305, "grad_norm": 2.848374605178833, "grad_norm_var": 0.9033997032002141, "learning_rate": 0.0001, "loss": 1.3656, "loss/crossentropy": 2.297710418701172, "loss/hidden": 1.171875, "loss/logits": 0.19026324152946472, "loss/reg": 0.00034454712294973433, "step": 5844 }, { "epoch": 0.730625, "grad_norm": 2.1812777519226074, "grad_norm_var": 0.9313851120390474, "learning_rate": 0.0001, "loss": 1.2396, "loss/crossentropy": 2.599980354309082, "loss/hidden": 1.0625, "loss/logits": 0.17362308502197266, "loss/reg": 0.00034436630085110664, "step": 5845 }, { "epoch": 0.73075, "grad_norm": 3.676880121231079, "grad_norm_var": 0.8836246841376806, "learning_rate": 0.0001, "loss": 1.4323, "loss/crossentropy": 2.8791229724884033, "loss/hidden": 1.234375, "loss/logits": 0.1944769024848938, "loss/reg": 0.00034421044983901083, "step": 5846 }, { "epoch": 0.730875, "grad_norm": 2.6519789695739746, "grad_norm_var": 0.8780738399863232, "learning_rate": 0.0001, "loss": 1.2451, "loss/crossentropy": 2.390490770339966, "loss/hidden": 1.0625, "loss/logits": 0.1791098564863205, "loss/reg": 0.0003440588479861617, "step": 5847 }, { "epoch": 0.731, "grad_norm": 2.5238258838653564, "grad_norm_var": 0.8637389710027262, "learning_rate": 0.0001, "loss": 1.4006, "loss/crossentropy": 2.5409886837005615, "loss/hidden": 1.171875, "loss/logits": 0.22533497214317322, "loss/reg": 0.0003438782296143472, "step": 5848 }, { "epoch": 0.731125, "grad_norm": 2.168118953704834, "grad_norm_var": 0.9102663014389046, "learning_rate": 0.0001, "loss": 1.1461, "loss/crossentropy": 2.4215140342712402, "loss/hidden": 0.97265625, "loss/logits": 0.17002424597740173, "loss/reg": 0.00034371024230495095, "step": 5849 }, { "epoch": 0.73125, "grad_norm": 2.5402512550354004, "grad_norm_var": 0.9200425470136128, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 2.6109843254089355, "loss/hidden": 1.15625, "loss/logits": 0.2461174875497818, "loss/reg": 0.00034355511888861656, "step": 5850 }, { "epoch": 0.731375, "grad_norm": 2.488236427307129, "grad_norm_var": 0.9369331111353328, "learning_rate": 0.0001, "loss": 1.3876, "loss/crossentropy": 2.3451547622680664, "loss/hidden": 1.15625, "loss/logits": 0.22793009877204895, "loss/reg": 0.00034337781835347414, "step": 5851 }, { "epoch": 0.7315, "grad_norm": 3.311028003692627, "grad_norm_var": 0.9193963526871348, "learning_rate": 0.0001, "loss": 1.4637, "loss/crossentropy": 2.5624747276306152, "loss/hidden": 1.25, "loss/logits": 0.21030648052692413, "loss/reg": 0.0003431989171076566, "step": 5852 }, { "epoch": 0.731625, "grad_norm": 2.968933582305908, "grad_norm_var": 0.8918476584764446, "learning_rate": 0.0001, "loss": 1.2338, "loss/crossentropy": 2.799880266189575, "loss/hidden": 1.0703125, "loss/logits": 0.16004934906959534, "loss/reg": 0.000343008287018165, "step": 5853 }, { "epoch": 0.73175, "grad_norm": 2.744405508041382, "grad_norm_var": 0.2767128609600339, "learning_rate": 0.0001, "loss": 1.3267, "loss/crossentropy": 2.50356388092041, "loss/hidden": 1.140625, "loss/logits": 0.1826406717300415, "loss/reg": 0.0003428068885114044, "step": 5854 }, { "epoch": 0.731875, "grad_norm": 2.032357692718506, "grad_norm_var": 0.3138192314342168, "learning_rate": 0.0001, "loss": 1.0723, "loss/crossentropy": 2.4482672214508057, "loss/hidden": 0.93359375, "loss/logits": 0.1352362334728241, "loss/reg": 0.0003426059556659311, "step": 5855 }, { "epoch": 0.732, "grad_norm": 2.927070379257202, "grad_norm_var": 0.31550267883544975, "learning_rate": 0.0001, "loss": 1.4506, "loss/crossentropy": 2.3541576862335205, "loss/hidden": 1.2265625, "loss/logits": 0.2206108272075653, "loss/reg": 0.00034238427178934216, "step": 5856 }, { "epoch": 0.732125, "grad_norm": 2.1834144592285156, "grad_norm_var": 0.3352733445077007, "learning_rate": 0.0001, "loss": 1.2883, "loss/crossentropy": 2.58019757270813, "loss/hidden": 1.1015625, "loss/logits": 0.1832742989063263, "loss/reg": 0.00034220764064230025, "step": 5857 }, { "epoch": 0.73225, "grad_norm": 35.953670501708984, "grad_norm_var": 69.18832303007929, "learning_rate": 0.0001, "loss": 1.3415, "loss/crossentropy": 2.663177728652954, "loss/hidden": 1.140625, "loss/logits": 0.1974789798259735, "loss/reg": 0.000341985170962289, "step": 5858 }, { "epoch": 0.732375, "grad_norm": 3.3783888816833496, "grad_norm_var": 69.29860343076122, "learning_rate": 0.0001, "loss": 1.6152, "loss/crossentropy": 2.898686170578003, "loss/hidden": 1.28125, "loss/logits": 0.33055558800697327, "loss/reg": 0.0003417843545321375, "step": 5859 }, { "epoch": 0.7325, "grad_norm": 4.306084632873535, "grad_norm_var": 69.0547844938545, "learning_rate": 0.0001, "loss": 1.7756, "loss/crossentropy": 2.5933785438537598, "loss/hidden": 1.5, "loss/logits": 0.2721554934978485, "loss/reg": 0.0003416058316361159, "step": 5860 }, { "epoch": 0.732625, "grad_norm": 2.5145819187164307, "grad_norm_var": 68.94191743242565, "learning_rate": 0.0001, "loss": 1.2419, "loss/crossentropy": 2.1762781143188477, "loss/hidden": 1.0703125, "loss/logits": 0.16818919777870178, "loss/reg": 0.0003414298116695136, "step": 5861 }, { "epoch": 0.73275, "grad_norm": 2.961345672607422, "grad_norm_var": 69.09042454170375, "learning_rate": 0.0001, "loss": 1.558, "loss/crossentropy": 2.4764034748077393, "loss/hidden": 1.3203125, "loss/logits": 0.2342929095029831, "loss/reg": 0.0003412387450225651, "step": 5862 }, { "epoch": 0.732875, "grad_norm": 2.1547136306762695, "grad_norm_var": 69.25183487854429, "learning_rate": 0.0001, "loss": 1.2702, "loss/crossentropy": 2.646944999694824, "loss/hidden": 1.078125, "loss/logits": 0.18866059184074402, "loss/reg": 0.00034105454687960446, "step": 5863 }, { "epoch": 0.733, "grad_norm": 2.376105785369873, "grad_norm_var": 69.29846902294832, "learning_rate": 0.0001, "loss": 1.2347, "loss/crossentropy": 2.161098003387451, "loss/hidden": 1.0390625, "loss/logits": 0.19222982227802277, "loss/reg": 0.0003408784978091717, "step": 5864 }, { "epoch": 0.733125, "grad_norm": 2.285945415496826, "grad_norm_var": 69.25778442392856, "learning_rate": 0.0001, "loss": 1.2669, "loss/crossentropy": 2.608652114868164, "loss/hidden": 1.0703125, "loss/logits": 0.1931852400302887, "loss/reg": 0.0003407025069463998, "step": 5865 }, { "epoch": 0.73325, "grad_norm": 2.509399175643921, "grad_norm_var": 69.26722359300396, "learning_rate": 0.0001, "loss": 1.2138, "loss/crossentropy": 2.561769723892212, "loss/hidden": 1.03125, "loss/logits": 0.17911896109580994, "loss/reg": 0.0003405007300898433, "step": 5866 }, { "epoch": 0.733375, "grad_norm": 2.318239212036133, "grad_norm_var": 69.32184777529243, "learning_rate": 0.0001, "loss": 1.1519, "loss/crossentropy": 2.481893539428711, "loss/hidden": 0.98828125, "loss/logits": 0.1601789891719818, "loss/reg": 0.00034030131064355373, "step": 5867 }, { "epoch": 0.7335, "grad_norm": 2.7473559379577637, "grad_norm_var": 69.45420162556805, "learning_rate": 0.0001, "loss": 1.2775, "loss/crossentropy": 3.1755504608154297, "loss/hidden": 1.0859375, "loss/logits": 0.18819063901901245, "loss/reg": 0.0003400914429221302, "step": 5868 }, { "epoch": 0.733625, "grad_norm": 3.70042085647583, "grad_norm_var": 69.31172667627173, "learning_rate": 0.0001, "loss": 1.4232, "loss/crossentropy": 2.548888921737671, "loss/hidden": 1.203125, "loss/logits": 0.21663488447666168, "loss/reg": 0.0003398776170797646, "step": 5869 }, { "epoch": 0.73375, "grad_norm": 2.5285134315490723, "grad_norm_var": 69.37433934026396, "learning_rate": 0.0001, "loss": 1.241, "loss/crossentropy": 2.486875057220459, "loss/hidden": 1.0625, "loss/logits": 0.1751430481672287, "loss/reg": 0.0003397009277250618, "step": 5870 }, { "epoch": 0.733875, "grad_norm": 2.4417717456817627, "grad_norm_var": 69.23346924775059, "learning_rate": 0.0001, "loss": 1.1544, "loss/crossentropy": 2.600083827972412, "loss/hidden": 0.99609375, "loss/logits": 0.15488238632678986, "loss/reg": 0.0003395034000277519, "step": 5871 }, { "epoch": 0.734, "grad_norm": 2.239774703979492, "grad_norm_var": 69.43741629616095, "learning_rate": 0.0001, "loss": 1.1005, "loss/crossentropy": 2.5327818393707275, "loss/hidden": 0.94921875, "loss/logits": 0.14786562323570251, "loss/reg": 0.0003392936778254807, "step": 5872 }, { "epoch": 0.734125, "grad_norm": 2.1733505725860596, "grad_norm_var": 69.4409168995172, "learning_rate": 0.0001, "loss": 1.3, "loss/crossentropy": 2.5810866355895996, "loss/hidden": 1.109375, "loss/logits": 0.18721260130405426, "loss/reg": 0.0003391086938790977, "step": 5873 }, { "epoch": 0.73425, "grad_norm": 4.101447582244873, "grad_norm_var": 0.48685469185568686, "learning_rate": 0.0001, "loss": 1.8043, "loss/crossentropy": 2.376476287841797, "loss/hidden": 1.5390625, "loss/logits": 0.2618522644042969, "loss/reg": 0.0003389383782632649, "step": 5874 }, { "epoch": 0.734375, "grad_norm": 2.6705636978149414, "grad_norm_var": 0.46321277582470943, "learning_rate": 0.0001, "loss": 1.2083, "loss/crossentropy": 2.4850213527679443, "loss/hidden": 1.046875, "loss/logits": 0.15802717208862305, "loss/reg": 0.0003387660544831306, "step": 5875 }, { "epoch": 0.7345, "grad_norm": 2.2911691665649414, "grad_norm_var": 0.29940226143968685, "learning_rate": 0.0001, "loss": 1.0969, "loss/crossentropy": 2.596632242202759, "loss/hidden": 0.9453125, "loss/logits": 0.1482231169939041, "loss/reg": 0.00033860738039948046, "step": 5876 }, { "epoch": 0.734625, "grad_norm": 2.1077311038970947, "grad_norm_var": 0.31578739453759586, "learning_rate": 0.0001, "loss": 1.2192, "loss/crossentropy": 2.5696728229522705, "loss/hidden": 1.03125, "loss/logits": 0.1845846325159073, "loss/reg": 0.0003384467854630202, "step": 5877 }, { "epoch": 0.73475, "grad_norm": 2.2413716316223145, "grad_norm_var": 0.3135442088342605, "learning_rate": 0.0001, "loss": 1.1487, "loss/crossentropy": 2.5496485233306885, "loss/hidden": 0.984375, "loss/logits": 0.16096720099449158, "loss/reg": 0.00033827105653472245, "step": 5878 }, { "epoch": 0.734875, "grad_norm": 4.534578323364258, "grad_norm_var": 0.5403558179051591, "learning_rate": 0.0001, "loss": 1.5263, "loss/crossentropy": 3.328578472137451, "loss/hidden": 1.296875, "loss/logits": 0.22607408463954926, "loss/reg": 0.00033810827881097794, "step": 5879 }, { "epoch": 0.735, "grad_norm": 2.244715929031372, "grad_norm_var": 0.5471831301380118, "learning_rate": 0.0001, "loss": 1.2585, "loss/crossentropy": 2.5329370498657227, "loss/hidden": 1.0703125, "loss/logits": 0.18477723002433777, "loss/reg": 0.0003379371191840619, "step": 5880 }, { "epoch": 0.735125, "grad_norm": 2.389770269393921, "grad_norm_var": 0.5421800393332963, "learning_rate": 0.0001, "loss": 1.139, "loss/crossentropy": 2.720810651779175, "loss/hidden": 0.98828125, "loss/logits": 0.14731337130069733, "loss/reg": 0.0003377666580490768, "step": 5881 }, { "epoch": 0.73525, "grad_norm": 2.430854082107544, "grad_norm_var": 0.5445880189987965, "learning_rate": 0.0001, "loss": 1.3494, "loss/crossentropy": 2.4895286560058594, "loss/hidden": 1.1484375, "loss/logits": 0.19761110842227936, "loss/reg": 0.0003375930537004024, "step": 5882 }, { "epoch": 0.735375, "grad_norm": 3.8101112842559814, "grad_norm_var": 0.6082317750105193, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.2767553329467773, "loss/hidden": 1.40625, "loss/logits": 0.20111408829689026, "loss/reg": 0.000337436591507867, "step": 5883 }, { "epoch": 0.7355, "grad_norm": 2.2720115184783936, "grad_norm_var": 0.6251100212621926, "learning_rate": 0.0001, "loss": 1.263, "loss/crossentropy": 2.6437177658081055, "loss/hidden": 1.078125, "loss/logits": 0.18147742748260498, "loss/reg": 0.00033726744004525244, "step": 5884 }, { "epoch": 0.735625, "grad_norm": 2.3541975021362305, "grad_norm_var": 0.56978133355598, "learning_rate": 0.0001, "loss": 1.3106, "loss/crossentropy": 2.623655319213867, "loss/hidden": 1.125, "loss/logits": 0.18222568929195404, "loss/reg": 0.00033710015122778714, "step": 5885 }, { "epoch": 0.73575, "grad_norm": 3.382026433944702, "grad_norm_var": 0.5984140622538383, "learning_rate": 0.0001, "loss": 1.4327, "loss/crossentropy": 2.3121297359466553, "loss/hidden": 1.234375, "loss/logits": 0.19490866363048553, "loss/reg": 0.0003369629557710141, "step": 5886 }, { "epoch": 0.735875, "grad_norm": 2.4861981868743896, "grad_norm_var": 0.5968280755817656, "learning_rate": 0.0001, "loss": 1.2794, "loss/crossentropy": 2.317279100418091, "loss/hidden": 1.1015625, "loss/logits": 0.1744520664215088, "loss/reg": 0.0003368280886206776, "step": 5887 }, { "epoch": 0.736, "grad_norm": 3.4888675212860107, "grad_norm_var": 0.6121785873863133, "learning_rate": 0.0001, "loss": 1.6889, "loss/crossentropy": 2.517758846282959, "loss/hidden": 1.328125, "loss/logits": 0.357358455657959, "loss/reg": 0.00033669063122943044, "step": 5888 }, { "epoch": 0.736125, "grad_norm": 3.1903774738311768, "grad_norm_var": 0.5903323928610651, "learning_rate": 0.0001, "loss": 1.4872, "loss/crossentropy": 2.663097381591797, "loss/hidden": 1.25, "loss/logits": 0.23385955393314362, "loss/reg": 0.0003365654847584665, "step": 5889 }, { "epoch": 0.73625, "grad_norm": 2.20845365524292, "grad_norm_var": 0.5046789110551534, "learning_rate": 0.0001, "loss": 1.4268, "loss/crossentropy": 2.421767234802246, "loss/hidden": 1.21875, "loss/logits": 0.20469602942466736, "loss/reg": 0.0003363903670106083, "step": 5890 }, { "epoch": 0.736375, "grad_norm": 2.9888088703155518, "grad_norm_var": 0.5073650599082211, "learning_rate": 0.0001, "loss": 1.4299, "loss/crossentropy": 2.376645803451538, "loss/hidden": 1.1875, "loss/logits": 0.23900865018367767, "loss/reg": 0.0003362286079209298, "step": 5891 }, { "epoch": 0.7365, "grad_norm": 2.8762218952178955, "grad_norm_var": 0.4909122030634298, "learning_rate": 0.0001, "loss": 1.2962, "loss/crossentropy": 2.6552491188049316, "loss/hidden": 1.109375, "loss/logits": 0.18348003923892975, "loss/reg": 0.00033609315869398415, "step": 5892 }, { "epoch": 0.736625, "grad_norm": 3.5547149181365967, "grad_norm_var": 0.48572453201744187, "learning_rate": 0.0001, "loss": 1.4607, "loss/crossentropy": 2.9978458881378174, "loss/hidden": 1.2109375, "loss/logits": 0.24640411138534546, "loss/reg": 0.000335968128638342, "step": 5893 }, { "epoch": 0.73675, "grad_norm": 2.5020625591278076, "grad_norm_var": 0.46696314595790583, "learning_rate": 0.0001, "loss": 1.194, "loss/crossentropy": 2.741952657699585, "loss/hidden": 1.015625, "loss/logits": 0.1750544011592865, "loss/reg": 0.00033579563023522496, "step": 5894 }, { "epoch": 0.736875, "grad_norm": 2.5688724517822266, "grad_norm_var": 0.285192870795553, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.575575113296509, "loss/hidden": 1.0546875, "loss/logits": 0.18465888500213623, "loss/reg": 0.0003356541274115443, "step": 5895 }, { "epoch": 0.737, "grad_norm": 2.4900758266448975, "grad_norm_var": 0.2708953192523701, "learning_rate": 0.0001, "loss": 1.2626, "loss/crossentropy": 2.8225557804107666, "loss/hidden": 1.078125, "loss/logits": 0.18108026683330536, "loss/reg": 0.00033548130886629224, "step": 5896 }, { "epoch": 0.737125, "grad_norm": 3.1728310585021973, "grad_norm_var": 0.26512452522834545, "learning_rate": 0.0001, "loss": 1.6888, "loss/crossentropy": 2.7279675006866455, "loss/hidden": 1.390625, "loss/logits": 0.294812947511673, "loss/reg": 0.0003353363135829568, "step": 5897 }, { "epoch": 0.73725, "grad_norm": 2.5473368167877197, "grad_norm_var": 0.25929126458221674, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.440840482711792, "loss/hidden": 1.0078125, "loss/logits": 0.16989538073539734, "loss/reg": 0.00033520825672894716, "step": 5898 }, { "epoch": 0.737375, "grad_norm": 2.797945737838745, "grad_norm_var": 0.196221787239665, "learning_rate": 0.0001, "loss": 1.2894, "loss/crossentropy": 2.8853793144226074, "loss/hidden": 1.1015625, "loss/logits": 0.18443891406059265, "loss/reg": 0.0003350943443365395, "step": 5899 }, { "epoch": 0.7375, "grad_norm": 2.8407716751098633, "grad_norm_var": 0.17601602834387578, "learning_rate": 0.0001, "loss": 1.4432, "loss/crossentropy": 2.5316739082336426, "loss/hidden": 1.203125, "loss/logits": 0.2366899698972702, "loss/reg": 0.00033498459379188716, "step": 5900 }, { "epoch": 0.737625, "grad_norm": 3.0048787593841553, "grad_norm_var": 0.16027771274403904, "learning_rate": 0.0001, "loss": 1.3164, "loss/crossentropy": 2.468306303024292, "loss/hidden": 1.1171875, "loss/logits": 0.195832759141922, "loss/reg": 0.00033484786399640143, "step": 5901 }, { "epoch": 0.73775, "grad_norm": 2.6156129837036133, "grad_norm_var": 0.1458188233912966, "learning_rate": 0.0001, "loss": 1.3142, "loss/crossentropy": 2.7285871505737305, "loss/hidden": 1.1171875, "loss/logits": 0.19367733597755432, "loss/reg": 0.000334745564032346, "step": 5902 }, { "epoch": 0.737875, "grad_norm": 3.7274820804595947, "grad_norm_var": 0.18465828405345305, "learning_rate": 0.0001, "loss": 1.6681, "loss/crossentropy": 2.477309226989746, "loss/hidden": 1.390625, "loss/logits": 0.27411723136901855, "loss/reg": 0.00033456942765042186, "step": 5903 }, { "epoch": 0.738, "grad_norm": 2.9732773303985596, "grad_norm_var": 0.1615442055414308, "learning_rate": 0.0001, "loss": 1.3387, "loss/crossentropy": 2.700324058532715, "loss/hidden": 1.15625, "loss/logits": 0.1791386604309082, "loss/reg": 0.0003344540309626609, "step": 5904 }, { "epoch": 0.738125, "grad_norm": 2.597656011581421, "grad_norm_var": 0.1588724912149054, "learning_rate": 0.0001, "loss": 1.5396, "loss/crossentropy": 2.4982504844665527, "loss/hidden": 1.2734375, "loss/logits": 0.262789249420166, "loss/reg": 0.00033434154465794563, "step": 5905 }, { "epoch": 0.73825, "grad_norm": 2.921417474746704, "grad_norm_var": 0.1304459375032053, "learning_rate": 0.0001, "loss": 1.5882, "loss/crossentropy": 2.4708197116851807, "loss/hidden": 1.328125, "loss/logits": 0.2567143440246582, "loss/reg": 0.000334229291183874, "step": 5906 }, { "epoch": 0.738375, "grad_norm": 3.702824592590332, "grad_norm_var": 0.17207360980867534, "learning_rate": 0.0001, "loss": 1.6807, "loss/crossentropy": 2.457385778427124, "loss/hidden": 1.4375, "loss/logits": 0.2398131638765335, "loss/reg": 0.00033408578019589186, "step": 5907 }, { "epoch": 0.7385, "grad_norm": 2.7318997383117676, "grad_norm_var": 0.17442708087327502, "learning_rate": 0.0001, "loss": 1.561, "loss/crossentropy": 2.3423120975494385, "loss/hidden": 1.3203125, "loss/logits": 0.23730884492397308, "loss/reg": 0.00033390658791176975, "step": 5908 }, { "epoch": 0.738625, "grad_norm": 2.181257486343384, "grad_norm_var": 0.17643178006601754, "learning_rate": 0.0001, "loss": 1.1994, "loss/crossentropy": 2.533052921295166, "loss/hidden": 1.015625, "loss/logits": 0.18046541512012482, "loss/reg": 0.000333727162797004, "step": 5909 }, { "epoch": 0.73875, "grad_norm": 3.0709311962127686, "grad_norm_var": 0.17132766851500872, "learning_rate": 0.0001, "loss": 1.6464, "loss/crossentropy": 2.2391395568847656, "loss/hidden": 1.3828125, "loss/logits": 0.26029500365257263, "loss/reg": 0.00033357017673552036, "step": 5910 }, { "epoch": 0.738875, "grad_norm": 2.3272650241851807, "grad_norm_var": 0.18472715141458002, "learning_rate": 0.0001, "loss": 1.3696, "loss/crossentropy": 2.511070489883423, "loss/hidden": 1.1640625, "loss/logits": 0.20223771035671234, "loss/reg": 0.0003334154898766428, "step": 5911 }, { "epoch": 0.739, "grad_norm": 2.492403507232666, "grad_norm_var": 0.18461377798860212, "learning_rate": 0.0001, "loss": 1.3507, "loss/crossentropy": 2.5843262672424316, "loss/hidden": 1.140625, "loss/logits": 0.20678028464317322, "loss/reg": 0.00033326848642900586, "step": 5912 }, { "epoch": 0.739125, "grad_norm": 4.045313835144043, "grad_norm_var": 0.2689765099970136, "learning_rate": 0.0001, "loss": 1.6459, "loss/crossentropy": 2.2589032649993896, "loss/hidden": 1.40625, "loss/logits": 0.23632849752902985, "loss/reg": 0.0003330893232487142, "step": 5913 }, { "epoch": 0.73925, "grad_norm": 2.2831811904907227, "grad_norm_var": 0.28615114360105903, "learning_rate": 0.0001, "loss": 1.31, "loss/crossentropy": 2.433950424194336, "loss/hidden": 1.109375, "loss/logits": 0.19731780886650085, "loss/reg": 0.00033291304134763777, "step": 5914 }, { "epoch": 0.739375, "grad_norm": 3.1256115436553955, "grad_norm_var": 0.2886373262664807, "learning_rate": 0.0001, "loss": 1.4613, "loss/crossentropy": 2.802358865737915, "loss/hidden": 1.25, "loss/logits": 0.2080082893371582, "loss/reg": 0.00033274994348175824, "step": 5915 }, { "epoch": 0.7395, "grad_norm": 3.245595932006836, "grad_norm_var": 0.29486738367060616, "learning_rate": 0.0001, "loss": 1.3419, "loss/crossentropy": 2.177114486694336, "loss/hidden": 1.15625, "loss/logits": 0.1823142021894455, "loss/reg": 0.0003325957804918289, "step": 5916 }, { "epoch": 0.739625, "grad_norm": 8.950243949890137, "grad_norm_var": 2.5551808002670744, "learning_rate": 0.0001, "loss": 1.7129, "loss/crossentropy": 2.2064614295959473, "loss/hidden": 1.4765625, "loss/logits": 0.23300740122795105, "loss/reg": 0.0003324368444737047, "step": 5917 }, { "epoch": 0.73975, "grad_norm": 2.2778289318084717, "grad_norm_var": 2.5936756462482395, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.6093106269836426, "loss/hidden": 1.09375, "loss/logits": 0.19855250418186188, "loss/reg": 0.00033227563835680485, "step": 5918 }, { "epoch": 0.739875, "grad_norm": 4.91441011428833, "grad_norm_var": 2.750819811327665, "learning_rate": 0.0001, "loss": 1.7629, "loss/crossentropy": 2.6736953258514404, "loss/hidden": 1.46875, "loss/logits": 0.2907862365245819, "loss/reg": 0.0003321039548609406, "step": 5919 }, { "epoch": 0.74, "grad_norm": 2.9028098583221436, "grad_norm_var": 2.7548113159764616, "learning_rate": 0.0001, "loss": 1.3298, "loss/crossentropy": 2.5945184230804443, "loss/hidden": 1.1328125, "loss/logits": 0.19364646077156067, "loss/reg": 0.0003319478128105402, "step": 5920 }, { "epoch": 0.740125, "grad_norm": 2.256972074508667, "grad_norm_var": 2.7967247628523477, "learning_rate": 0.0001, "loss": 1.1236, "loss/crossentropy": 2.4407641887664795, "loss/hidden": 0.95703125, "loss/logits": 0.16321362555027008, "loss/reg": 0.0003317914088256657, "step": 5921 }, { "epoch": 0.74025, "grad_norm": 2.3372268676757812, "grad_norm_var": 2.8506100974956587, "learning_rate": 0.0001, "loss": 1.1876, "loss/crossentropy": 2.4240896701812744, "loss/hidden": 1.015625, "loss/logits": 0.16863176226615906, "loss/reg": 0.00033161992905661464, "step": 5922 }, { "epoch": 0.740375, "grad_norm": 3.884465217590332, "grad_norm_var": 2.8623587982825063, "learning_rate": 0.0001, "loss": 1.2788, "loss/crossentropy": 2.568887948989868, "loss/hidden": 1.078125, "loss/logits": 0.1973555088043213, "loss/reg": 0.0003314699570182711, "step": 5923 }, { "epoch": 0.7405, "grad_norm": 3.9537575244903564, "grad_norm_var": 2.8608000411277885, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.7056522369384766, "loss/hidden": 1.296875, "loss/logits": 0.23465165495872498, "loss/reg": 0.00033131829695776105, "step": 5924 }, { "epoch": 0.740625, "grad_norm": 2.350067615509033, "grad_norm_var": 2.8353616509785655, "learning_rate": 0.0001, "loss": 1.2411, "loss/crossentropy": 2.4559946060180664, "loss/hidden": 1.078125, "loss/logits": 0.15964707732200623, "loss/reg": 0.0003311441105324775, "step": 5925 }, { "epoch": 0.74075, "grad_norm": 3.0229170322418213, "grad_norm_var": 2.837619633663058, "learning_rate": 0.0001, "loss": 1.2221, "loss/crossentropy": 2.5713443756103516, "loss/hidden": 1.046875, "loss/logits": 0.17191079258918762, "loss/reg": 0.0003309783060103655, "step": 5926 }, { "epoch": 0.740875, "grad_norm": 3.0989155769348145, "grad_norm_var": 2.764657175796814, "learning_rate": 0.0001, "loss": 1.467, "loss/crossentropy": 2.630729913711548, "loss/hidden": 1.25, "loss/logits": 0.21370351314544678, "loss/reg": 0.0003308179439045489, "step": 5927 }, { "epoch": 0.741, "grad_norm": 2.714155435562134, "grad_norm_var": 2.739525059236797, "learning_rate": 0.0001, "loss": 1.4391, "loss/crossentropy": 2.5607059001922607, "loss/hidden": 1.203125, "loss/logits": 0.23262834548950195, "loss/reg": 0.00033063144655898213, "step": 5928 }, { "epoch": 0.741125, "grad_norm": 2.5810906887054443, "grad_norm_var": 2.75929359616651, "learning_rate": 0.0001, "loss": 1.4153, "loss/crossentropy": 2.548133373260498, "loss/hidden": 1.1875, "loss/logits": 0.2245035469532013, "loss/reg": 0.0003304328420199454, "step": 5929 }, { "epoch": 0.74125, "grad_norm": 2.268876075744629, "grad_norm_var": 2.7613768546602224, "learning_rate": 0.0001, "loss": 1.2014, "loss/crossentropy": 2.954787254333496, "loss/hidden": 1.0234375, "loss/logits": 0.17467030882835388, "loss/reg": 0.0003302340628579259, "step": 5930 }, { "epoch": 0.741375, "grad_norm": 2.6860134601593018, "grad_norm_var": 2.7876506993080805, "learning_rate": 0.0001, "loss": 1.4776, "loss/crossentropy": 2.4631283283233643, "loss/hidden": 1.25, "loss/logits": 0.22429159283638, "loss/reg": 0.00033006430021487176, "step": 5931 }, { "epoch": 0.7415, "grad_norm": 2.669621706008911, "grad_norm_var": 2.8156604129783678, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.7005295753479004, "loss/hidden": 1.140625, "loss/logits": 0.23258845508098602, "loss/reg": 0.00032986936275847256, "step": 5932 }, { "epoch": 0.741625, "grad_norm": 2.179711103439331, "grad_norm_var": 0.5838933539489128, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.6073689460754395, "loss/hidden": 0.96484375, "loss/logits": 0.15840458869934082, "loss/reg": 0.00032966682920232415, "step": 5933 }, { "epoch": 0.74175, "grad_norm": 2.7498018741607666, "grad_norm_var": 0.5598472005477485, "learning_rate": 0.0001, "loss": 1.4516, "loss/crossentropy": 2.40354585647583, "loss/hidden": 1.21875, "loss/logits": 0.2295638620853424, "loss/reg": 0.00032949564047157764, "step": 5934 }, { "epoch": 0.741875, "grad_norm": 2.745124578475952, "grad_norm_var": 0.27440342490601943, "learning_rate": 0.0001, "loss": 1.2826, "loss/crossentropy": 2.6387088298797607, "loss/hidden": 1.1015625, "loss/logits": 0.1777149885892868, "loss/reg": 0.0003293130430392921, "step": 5935 }, { "epoch": 0.742, "grad_norm": 2.3229498863220215, "grad_norm_var": 0.2855440752036808, "learning_rate": 0.0001, "loss": 1.2552, "loss/crossentropy": 2.54856276512146, "loss/hidden": 1.09375, "loss/logits": 0.1582050621509552, "loss/reg": 0.0003291401662863791, "step": 5936 }, { "epoch": 0.742125, "grad_norm": 2.2141273021698, "grad_norm_var": 0.2884116220168304, "learning_rate": 0.0001, "loss": 1.361, "loss/crossentropy": 2.163135290145874, "loss/hidden": 1.1640625, "loss/logits": 0.19367504119873047, "loss/reg": 0.0003289373416919261, "step": 5937 }, { "epoch": 0.74225, "grad_norm": 2.8455557823181152, "grad_norm_var": 0.27752183908298683, "learning_rate": 0.0001, "loss": 1.9654, "loss/crossentropy": 2.3098607063293457, "loss/hidden": 1.578125, "loss/logits": 0.3840157687664032, "loss/reg": 0.0003287659783381969, "step": 5938 }, { "epoch": 0.742375, "grad_norm": 2.8573460578918457, "grad_norm_var": 0.19055138937906502, "learning_rate": 0.0001, "loss": 1.2479, "loss/crossentropy": 2.5088791847229004, "loss/hidden": 1.0859375, "loss/logits": 0.15865036845207214, "loss/reg": 0.00032855678000487387, "step": 5939 }, { "epoch": 0.7425, "grad_norm": 2.585496187210083, "grad_norm_var": 0.07951551483052223, "learning_rate": 0.0001, "loss": 1.584, "loss/crossentropy": 2.2506895065307617, "loss/hidden": 1.3203125, "loss/logits": 0.2604162096977234, "loss/reg": 0.000328382127918303, "step": 5940 }, { "epoch": 0.742625, "grad_norm": 2.360110282897949, "grad_norm_var": 0.07916273529835109, "learning_rate": 0.0001, "loss": 1.3384, "loss/crossentropy": 2.539308547973633, "loss/hidden": 1.1328125, "loss/logits": 0.20229515433311462, "loss/reg": 0.0003282217076048255, "step": 5941 }, { "epoch": 0.74275, "grad_norm": 2.5365190505981445, "grad_norm_var": 0.06774505087071982, "learning_rate": 0.0001, "loss": 1.3301, "loss/crossentropy": 2.726167917251587, "loss/hidden": 1.1171875, "loss/logits": 0.20966455340385437, "loss/reg": 0.00032801873749122024, "step": 5942 }, { "epoch": 0.742875, "grad_norm": 2.7181553840637207, "grad_norm_var": 0.05089154896293901, "learning_rate": 0.0001, "loss": 1.3094, "loss/crossentropy": 2.8459270000457764, "loss/hidden": 1.1015625, "loss/logits": 0.20450995862483978, "loss/reg": 0.0003278119256719947, "step": 5943 }, { "epoch": 0.743, "grad_norm": 4.848911285400391, "grad_norm_var": 0.37826527091098494, "learning_rate": 0.0001, "loss": 1.8638, "loss/crossentropy": 2.153369665145874, "loss/hidden": 1.5625, "loss/logits": 0.29804515838623047, "loss/reg": 0.00032764452043920755, "step": 5944 }, { "epoch": 0.743125, "grad_norm": 2.43951153755188, "grad_norm_var": 0.38172664910837945, "learning_rate": 0.0001, "loss": 1.2614, "loss/crossentropy": 2.352330446243286, "loss/hidden": 1.0859375, "loss/logits": 0.17217367887496948, "loss/reg": 0.00032745624775998294, "step": 5945 }, { "epoch": 0.74325, "grad_norm": 2.2651445865631104, "grad_norm_var": 0.38193666355456507, "learning_rate": 0.0001, "loss": 1.1936, "loss/crossentropy": 2.3231472969055176, "loss/hidden": 1.015625, "loss/logits": 0.17474409937858582, "loss/reg": 0.00032725869095884264, "step": 5946 }, { "epoch": 0.743375, "grad_norm": 2.9711334705352783, "grad_norm_var": 0.38690372826665087, "learning_rate": 0.0001, "loss": 1.6523, "loss/crossentropy": 2.6402854919433594, "loss/hidden": 1.3515625, "loss/logits": 0.29742902517318726, "loss/reg": 0.0003270643937867135, "step": 5947 }, { "epoch": 0.7435, "grad_norm": 2.450713872909546, "grad_norm_var": 0.39098468384746426, "learning_rate": 0.0001, "loss": 1.31, "loss/crossentropy": 2.713430881500244, "loss/hidden": 1.109375, "loss/logits": 0.19733192026615143, "loss/reg": 0.00032690452644601464, "step": 5948 }, { "epoch": 0.743625, "grad_norm": 2.284217357635498, "grad_norm_var": 0.3845130141519803, "learning_rate": 0.0001, "loss": 1.2769, "loss/crossentropy": 2.2952022552490234, "loss/hidden": 1.0859375, "loss/logits": 0.18769782781600952, "loss/reg": 0.0003267539432272315, "step": 5949 }, { "epoch": 0.74375, "grad_norm": 2.837104082107544, "grad_norm_var": 0.38557284698703903, "learning_rate": 0.0001, "loss": 1.3966, "loss/crossentropy": 2.305765390396118, "loss/hidden": 1.1953125, "loss/logits": 0.19806094467639923, "loss/reg": 0.00032659192220307887, "step": 5950 }, { "epoch": 0.743875, "grad_norm": 2.2562801837921143, "grad_norm_var": 0.39790174870997724, "learning_rate": 0.0001, "loss": 1.4371, "loss/crossentropy": 2.1445233821868896, "loss/hidden": 1.2265625, "loss/logits": 0.20728346705436707, "loss/reg": 0.0003264171537011862, "step": 5951 }, { "epoch": 0.744, "grad_norm": 2.6837167739868164, "grad_norm_var": 0.389122106276424, "learning_rate": 0.0001, "loss": 1.3898, "loss/crossentropy": 2.5559496879577637, "loss/hidden": 1.15625, "loss/logits": 0.23027169704437256, "loss/reg": 0.00032622841536067426, "step": 5952 }, { "epoch": 0.744125, "grad_norm": 2.426508665084839, "grad_norm_var": 0.3782638504504272, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.4925501346588135, "loss/hidden": 1.0390625, "loss/logits": 0.16357171535491943, "loss/reg": 0.0003260860394220799, "step": 5953 }, { "epoch": 0.74425, "grad_norm": 5.0438337326049805, "grad_norm_var": 0.7199046856791853, "learning_rate": 0.0001, "loss": 1.8056, "loss/crossentropy": 2.6331818103790283, "loss/hidden": 1.5, "loss/logits": 0.30231595039367676, "loss/reg": 0.00032594570075161755, "step": 5954 }, { "epoch": 0.744375, "grad_norm": 2.738022565841675, "grad_norm_var": 0.720642593862582, "learning_rate": 0.0001, "loss": 1.3212, "loss/crossentropy": 2.576456308364868, "loss/hidden": 1.109375, "loss/logits": 0.2086060345172882, "loss/reg": 0.00032577497768215835, "step": 5955 }, { "epoch": 0.7445, "grad_norm": 2.6881680488586426, "grad_norm_var": 0.7178127853279382, "learning_rate": 0.0001, "loss": 1.3598, "loss/crossentropy": 2.5243372917175293, "loss/hidden": 1.140625, "loss/logits": 0.21593965590000153, "loss/reg": 0.0003256101335864514, "step": 5956 }, { "epoch": 0.744625, "grad_norm": 3.2282326221466064, "grad_norm_var": 0.7085863173739253, "learning_rate": 0.0001, "loss": 1.2779, "loss/crossentropy": 2.37178897857666, "loss/hidden": 1.09375, "loss/logits": 0.1809050440788269, "loss/reg": 0.00032544220448471606, "step": 5957 }, { "epoch": 0.74475, "grad_norm": 2.579267978668213, "grad_norm_var": 0.7066229832756389, "learning_rate": 0.0001, "loss": 1.2918, "loss/crossentropy": 2.41471529006958, "loss/hidden": 1.1171875, "loss/logits": 0.17132364213466644, "loss/reg": 0.0003252972674090415, "step": 5958 }, { "epoch": 0.744875, "grad_norm": 2.5531651973724365, "grad_norm_var": 0.7124057001344303, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.389686107635498, "loss/hidden": 1.2265625, "loss/logits": 0.23014959692955017, "loss/reg": 0.0003251373127568513, "step": 5959 }, { "epoch": 0.745, "grad_norm": 3.9295237064361572, "grad_norm_var": 0.5255153377371014, "learning_rate": 0.0001, "loss": 1.2956, "loss/crossentropy": 2.7612991333007812, "loss/hidden": 1.09375, "loss/logits": 0.19860482215881348, "loss/reg": 0.00032495753839612007, "step": 5960 }, { "epoch": 0.745125, "grad_norm": 4.330012321472168, "grad_norm_var": 0.6489712791257489, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.2265918254852295, "loss/hidden": 1.4453125, "loss/logits": 0.22785601019859314, "loss/reg": 0.00032480264781042933, "step": 5961 }, { "epoch": 0.74525, "grad_norm": 2.9176435470581055, "grad_norm_var": 0.6156449513918438, "learning_rate": 0.0001, "loss": 1.365, "loss/crossentropy": 2.5704097747802734, "loss/hidden": 1.171875, "loss/logits": 0.1898791790008545, "loss/reg": 0.0003246459236834198, "step": 5962 }, { "epoch": 0.745375, "grad_norm": 3.0494964122772217, "grad_norm_var": 0.61578098519127, "learning_rate": 0.0001, "loss": 1.3295, "loss/crossentropy": 2.622544050216675, "loss/hidden": 1.1328125, "loss/logits": 0.19347788393497467, "loss/reg": 0.0003244674007873982, "step": 5963 }, { "epoch": 0.7455, "grad_norm": 2.5948495864868164, "grad_norm_var": 0.6065281122822768, "learning_rate": 0.0001, "loss": 1.3443, "loss/crossentropy": 2.2072644233703613, "loss/hidden": 1.140625, "loss/logits": 0.20039603114128113, "loss/reg": 0.0003242897801101208, "step": 5964 }, { "epoch": 0.745625, "grad_norm": 2.2284281253814697, "grad_norm_var": 0.6121121422222735, "learning_rate": 0.0001, "loss": 1.344, "loss/crossentropy": 2.5461528301239014, "loss/hidden": 1.15625, "loss/logits": 0.18453630805015564, "loss/reg": 0.00032410499989055097, "step": 5965 }, { "epoch": 0.74575, "grad_norm": 2.80876088142395, "grad_norm_var": 0.6127978498057167, "learning_rate": 0.0001, "loss": 1.3857, "loss/crossentropy": 2.9131550788879395, "loss/hidden": 1.1640625, "loss/logits": 0.21841441094875336, "loss/reg": 0.0003239178622607142, "step": 5966 }, { "epoch": 0.745875, "grad_norm": 2.7107560634613037, "grad_norm_var": 0.5804283418960245, "learning_rate": 0.0001, "loss": 1.3456, "loss/crossentropy": 2.4584732055664062, "loss/hidden": 1.15625, "loss/logits": 0.18615064024925232, "loss/reg": 0.000323754531564191, "step": 5967 }, { "epoch": 0.746, "grad_norm": 2.7357795238494873, "grad_norm_var": 0.578180772497413, "learning_rate": 0.0001, "loss": 1.4105, "loss/crossentropy": 2.4138050079345703, "loss/hidden": 1.1953125, "loss/logits": 0.21191397309303284, "loss/reg": 0.0003235965850763023, "step": 5968 }, { "epoch": 0.746125, "grad_norm": 2.5564987659454346, "grad_norm_var": 0.5686878287847965, "learning_rate": 0.0001, "loss": 1.6317, "loss/crossentropy": 2.5190439224243164, "loss/hidden": 1.3359375, "loss/logits": 0.2925615906715393, "loss/reg": 0.00032344512874260545, "step": 5969 }, { "epoch": 0.74625, "grad_norm": 2.585026264190674, "grad_norm_var": 0.29068184791165, "learning_rate": 0.0001, "loss": 1.4362, "loss/crossentropy": 2.5016257762908936, "loss/hidden": 1.234375, "loss/logits": 0.1985553652048111, "loss/reg": 0.000323243613820523, "step": 5970 }, { "epoch": 0.746375, "grad_norm": 3.3838491439819336, "grad_norm_var": 0.30369756109990337, "learning_rate": 0.0001, "loss": 1.3271, "loss/crossentropy": 2.4760708808898926, "loss/hidden": 1.1171875, "loss/logits": 0.2066890001296997, "loss/reg": 0.00032305470085702837, "step": 5971 }, { "epoch": 0.7465, "grad_norm": 2.9483323097229004, "grad_norm_var": 0.2995402718201999, "learning_rate": 0.0001, "loss": 1.4708, "loss/crossentropy": 2.4371140003204346, "loss/hidden": 1.2578125, "loss/logits": 0.20974424481391907, "loss/reg": 0.00032288566580973566, "step": 5972 }, { "epoch": 0.746625, "grad_norm": 3.0666584968566895, "grad_norm_var": 0.2950965881041005, "learning_rate": 0.0001, "loss": 1.4476, "loss/crossentropy": 2.332296371459961, "loss/hidden": 1.25, "loss/logits": 0.19436904788017273, "loss/reg": 0.00032266994821839035, "step": 5973 }, { "epoch": 0.74675, "grad_norm": 3.222269058227539, "grad_norm_var": 0.29034238510546306, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.252399444580078, "loss/hidden": 1.25, "loss/logits": 0.20466290414333344, "loss/reg": 0.00032249497598968446, "step": 5974 }, { "epoch": 0.746875, "grad_norm": 2.1345162391662598, "grad_norm_var": 0.32491676550672516, "learning_rate": 0.0001, "loss": 1.1876, "loss/crossentropy": 2.5106287002563477, "loss/hidden": 1.015625, "loss/logits": 0.16874536871910095, "loss/reg": 0.00032230690703727305, "step": 5975 }, { "epoch": 0.747, "grad_norm": 4.442264556884766, "grad_norm_var": 0.40830353328640284, "learning_rate": 0.0001, "loss": 1.8193, "loss/crossentropy": 2.0524110794067383, "loss/hidden": 1.5234375, "loss/logits": 0.292635440826416, "loss/reg": 0.00032211176585406065, "step": 5976 }, { "epoch": 0.747125, "grad_norm": 2.962439775466919, "grad_norm_var": 0.27942963065930043, "learning_rate": 0.0001, "loss": 1.2757, "loss/crossentropy": 2.6633663177490234, "loss/hidden": 1.0859375, "loss/logits": 0.18651911616325378, "loss/reg": 0.00032189604826271534, "step": 5977 }, { "epoch": 0.74725, "grad_norm": 2.180654287338257, "grad_norm_var": 0.31132094586148845, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.3197295665740967, "loss/hidden": 1.0, "loss/logits": 0.15819315612316132, "loss/reg": 0.00032168556936085224, "step": 5978 }, { "epoch": 0.747375, "grad_norm": 2.63704252243042, "grad_norm_var": 0.31101862083133536, "learning_rate": 0.0001, "loss": 1.2756, "loss/crossentropy": 2.4831109046936035, "loss/hidden": 1.0859375, "loss/logits": 0.18639925122261047, "loss/reg": 0.0003214754688087851, "step": 5979 }, { "epoch": 0.7475, "grad_norm": 2.669703245162964, "grad_norm_var": 0.30907296853391225, "learning_rate": 0.0001, "loss": 1.4275, "loss/crossentropy": 2.3359830379486084, "loss/hidden": 1.1953125, "loss/logits": 0.2289370745420456, "loss/reg": 0.00032124583958648145, "step": 5980 }, { "epoch": 0.747625, "grad_norm": 39.53132247924805, "grad_norm_var": 84.28832625517589, "learning_rate": 0.0001, "loss": 1.6107, "loss/crossentropy": 2.466092348098755, "loss/hidden": 1.3671875, "loss/logits": 0.2402658462524414, "loss/reg": 0.00032101295073516667, "step": 5981 }, { "epoch": 0.74775, "grad_norm": 4.307476997375488, "grad_norm_var": 83.95866705167579, "learning_rate": 0.0001, "loss": 1.6519, "loss/crossentropy": 2.3100764751434326, "loss/hidden": 1.3671875, "loss/logits": 0.28154343366622925, "loss/reg": 0.00032083355472423136, "step": 5982 }, { "epoch": 0.747875, "grad_norm": 3.3333828449249268, "grad_norm_var": 83.77170887485453, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.769599437713623, "loss/hidden": 1.09375, "loss/logits": 0.17954295873641968, "loss/reg": 0.00032066876883618534, "step": 5983 }, { "epoch": 0.748, "grad_norm": 3.1329689025878906, "grad_norm_var": 83.64611155671847, "learning_rate": 0.0001, "loss": 1.3724, "loss/crossentropy": 2.4463210105895996, "loss/hidden": 1.1640625, "loss/logits": 0.20514662563800812, "loss/reg": 0.0003205043321941048, "step": 5984 }, { "epoch": 0.748125, "grad_norm": 2.31221079826355, "grad_norm_var": 83.73980125549244, "learning_rate": 0.0001, "loss": 1.295, "loss/crossentropy": 2.893684148788452, "loss/hidden": 1.1015625, "loss/logits": 0.19018878042697906, "loss/reg": 0.00032034009927883744, "step": 5985 }, { "epoch": 0.74825, "grad_norm": 3.1054232120513916, "grad_norm_var": 83.56812785075347, "learning_rate": 0.0001, "loss": 1.5521, "loss/crossentropy": 2.6269516944885254, "loss/hidden": 1.2734375, "loss/logits": 0.27542686462402344, "loss/reg": 0.00032018873025663197, "step": 5986 }, { "epoch": 0.748375, "grad_norm": 2.402522087097168, "grad_norm_var": 83.88369712995032, "learning_rate": 0.0001, "loss": 1.296, "loss/crossentropy": 2.4600188732147217, "loss/hidden": 1.109375, "loss/logits": 0.1834680438041687, "loss/reg": 0.0003200437349732965, "step": 5987 }, { "epoch": 0.7485, "grad_norm": 2.2611641883850098, "grad_norm_var": 84.12632263214445, "learning_rate": 0.0001, "loss": 1.3116, "loss/crossentropy": 2.4636988639831543, "loss/hidden": 1.109375, "loss/logits": 0.19899678230285645, "loss/reg": 0.00031988267437554896, "step": 5988 }, { "epoch": 0.748625, "grad_norm": 2.9049880504608154, "grad_norm_var": 84.17461899978571, "learning_rate": 0.0001, "loss": 1.5767, "loss/crossentropy": 2.5166800022125244, "loss/hidden": 1.3359375, "loss/logits": 0.23754189908504486, "loss/reg": 0.00031972696888260543, "step": 5989 }, { "epoch": 0.74875, "grad_norm": 2.5923912525177, "grad_norm_var": 84.36729929166749, "learning_rate": 0.0001, "loss": 1.4838, "loss/crossentropy": 2.45926833152771, "loss/hidden": 1.25, "loss/logits": 0.23059165477752686, "loss/reg": 0.0003195702738594264, "step": 5990 }, { "epoch": 0.748875, "grad_norm": 13.425292015075684, "grad_norm_var": 87.74725010508931, "learning_rate": 0.0001, "loss": 1.9469, "loss/crossentropy": 2.3516526222229004, "loss/hidden": 1.6640625, "loss/logits": 0.27965977787971497, "loss/reg": 0.00031939681502990425, "step": 5991 }, { "epoch": 0.749, "grad_norm": 5.072973251342773, "grad_norm_var": 87.65056929965418, "learning_rate": 0.0001, "loss": 1.283, "loss/crossentropy": 3.099637031555176, "loss/hidden": 1.109375, "loss/logits": 0.17041867971420288, "loss/reg": 0.00031922367634251714, "step": 5992 }, { "epoch": 0.749125, "grad_norm": 2.7701213359832764, "grad_norm_var": 87.72889949069642, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.8283655643463135, "loss/hidden": 1.21875, "loss/logits": 0.18846796452999115, "loss/reg": 0.00031904433853924274, "step": 5993 }, { "epoch": 0.74925, "grad_norm": 2.5575144290924072, "grad_norm_var": 87.5501336281732, "learning_rate": 0.0001, "loss": 1.2851, "loss/crossentropy": 2.425816774368286, "loss/hidden": 1.1015625, "loss/logits": 0.18036875128746033, "loss/reg": 0.0003188636910635978, "step": 5994 }, { "epoch": 0.749375, "grad_norm": 2.5059916973114014, "grad_norm_var": 87.60889539673728, "learning_rate": 0.0001, "loss": 1.3765, "loss/crossentropy": 2.678098678588867, "loss/hidden": 1.15625, "loss/logits": 0.21709442138671875, "loss/reg": 0.0003186646499671042, "step": 5995 }, { "epoch": 0.7495, "grad_norm": 2.319871187210083, "grad_norm_var": 87.76863435631192, "learning_rate": 0.0001, "loss": 1.2362, "loss/crossentropy": 2.7199866771698, "loss/hidden": 1.0625, "loss/logits": 0.17056018114089966, "loss/reg": 0.00031846726778894663, "step": 5996 }, { "epoch": 0.749625, "grad_norm": 2.6254985332489014, "grad_norm_var": 7.445610339348475, "learning_rate": 0.0001, "loss": 1.2546, "loss/crossentropy": 2.404189348220825, "loss/hidden": 1.0625, "loss/logits": 0.18888741731643677, "loss/reg": 0.0003182803629897535, "step": 5997 }, { "epoch": 0.74975, "grad_norm": 2.324106454849243, "grad_norm_var": 7.504870771854759, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.164024829864502, "loss/hidden": 1.125, "loss/logits": 0.17462413012981415, "loss/reg": 0.000318065081955865, "step": 5998 }, { "epoch": 0.749875, "grad_norm": 2.18009090423584, "grad_norm_var": 7.610223839524435, "learning_rate": 0.0001, "loss": 1.3427, "loss/crossentropy": 2.3608083724975586, "loss/hidden": 1.1328125, "loss/logits": 0.20673716068267822, "loss/reg": 0.0003178539336659014, "step": 5999 }, { "epoch": 0.75, "grad_norm": 2.7074661254882812, "grad_norm_var": 7.637019510577631, "learning_rate": 0.0001, "loss": 1.5568, "loss/crossentropy": 2.5729610919952393, "loss/hidden": 1.328125, "loss/logits": 0.22552789747714996, "loss/reg": 0.0003176866448484361, "step": 6000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.86435169386496e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }