{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000125, "grad_norm": 4.097814559936523, "learning_rate": 1.0000000000000002e-06, "loss": 1.1655, "loss/crossentropy": 2.343535900115967, "loss/hidden": 0.9296875, "loss/logits": 0.17379230260849, "loss/reg": 0.006198255345225334, "step": 1 }, { "epoch": 0.00025, "grad_norm": 3.662576913833618, "learning_rate": 2.0000000000000003e-06, "loss": 1.4973, "loss/crossentropy": 2.318769931793213, "loss/hidden": 1.1875, "loss/logits": 0.24786217510700226, "loss/reg": 0.006198255345225334, "step": 2 }, { "epoch": 0.000375, "grad_norm": 2.8296749591827393, "learning_rate": 3e-06, "loss": 1.2258, "loss/crossentropy": 2.4907937049865723, "loss/hidden": 0.97265625, "loss/logits": 0.19112952053546906, "loss/reg": 0.006198245566338301, "step": 3 }, { "epoch": 0.0005, "grad_norm": 3.057624578475952, "learning_rate": 4.000000000000001e-06, "loss": 1.1136, "loss/crossentropy": 2.744520902633667, "loss/hidden": 0.890625, "loss/logits": 0.16101403534412384, "loss/reg": 0.006198232993483543, "step": 4 }, { "epoch": 0.000625, "grad_norm": 2.7055587768554688, "learning_rate": 5e-06, "loss": 1.1943, "loss/crossentropy": 2.5722062587738037, "loss/hidden": 0.94921875, "loss/logits": 0.18310005962848663, "loss/reg": 0.0061982134357094765, "step": 5 }, { "epoch": 0.00075, "grad_norm": 3.789276361465454, "learning_rate": 6e-06, "loss": 1.247, "loss/crossentropy": 2.613312005996704, "loss/hidden": 1.0078125, "loss/logits": 0.17725251615047455, "loss/reg": 0.006198191549628973, "step": 6 }, { "epoch": 0.000875, "grad_norm": 3.997910499572754, "learning_rate": 7.000000000000001e-06, "loss": 1.4206, "loss/crossentropy": 2.4207534790039062, "loss/hidden": 1.125, "loss/logits": 0.2336406409740448, "loss/reg": 0.006198164541274309, "step": 7 }, { "epoch": 0.001, "grad_norm": 2.5986244678497314, "learning_rate": 8.000000000000001e-06, "loss": 1.0878, "loss/crossentropy": 2.536424160003662, "loss/hidden": 0.8671875, "loss/logits": 0.1585812270641327, "loss/reg": 0.006198132876306772, "step": 8 }, { "epoch": 0.001125, "grad_norm": 2.2757976055145264, "learning_rate": 9e-06, "loss": 1.1175, "loss/crossentropy": 2.745281219482422, "loss/hidden": 0.89453125, "loss/logits": 0.16094230115413666, "loss/reg": 0.006198094692081213, "step": 9 }, { "epoch": 0.00125, "grad_norm": 2.261094808578491, "learning_rate": 1e-05, "loss": 1.0803, "loss/crossentropy": 2.3173577785491943, "loss/hidden": 0.8671875, "loss/logits": 0.15108685195446014, "loss/reg": 0.0061980499885976315, "step": 10 }, { "epoch": 0.001375, "grad_norm": 21.777265548706055, "learning_rate": 1.1000000000000001e-05, "loss": 2.0501, "loss/crossentropy": 3.2122714519500732, "loss/hidden": 1.7109375, "loss/logits": 0.27713608741760254, "loss/reg": 0.006198008079081774, "step": 11 }, { "epoch": 0.0015, "grad_norm": 2.5655505657196045, "learning_rate": 1.2e-05, "loss": 1.151, "loss/crossentropy": 2.706430196762085, "loss/hidden": 0.8984375, "loss/logits": 0.19056561589241028, "loss/reg": 0.0061979577876627445, "step": 12 }, { "epoch": 0.001625, "grad_norm": 2.403053045272827, "learning_rate": 1.3000000000000001e-05, "loss": 1.0719, "loss/crossentropy": 2.0466296672821045, "loss/hidden": 0.88671875, "loss/logits": 0.12316589802503586, "loss/reg": 0.0061978911980986595, "step": 13 }, { "epoch": 0.00175, "grad_norm": 3.840881586074829, "learning_rate": 1.4000000000000001e-05, "loss": 1.5441, "loss/crossentropy": 2.3191423416137695, "loss/hidden": 1.234375, "loss/logits": 0.24779079854488373, "loss/reg": 0.00619781669229269, "step": 14 }, { "epoch": 0.001875, "grad_norm": 2.557331085205078, "learning_rate": 1.5e-05, "loss": 0.9444, "loss/crossentropy": 2.6370084285736084, "loss/hidden": 0.76953125, "loss/logits": 0.11287336051464081, "loss/reg": 0.006197733338922262, "step": 15 }, { "epoch": 0.002, "grad_norm": 3.1850404739379883, "grad_norm_var": 22.31061335402559, "learning_rate": 1.6000000000000003e-05, "loss": 1.3213, "loss/crossentropy": 2.676577091217041, "loss/hidden": 1.0546875, "loss/logits": 0.2046227753162384, "loss/reg": 0.006197639741003513, "step": 16 }, { "epoch": 0.002125, "grad_norm": 2.2587289810180664, "grad_norm_var": 22.553268201402446, "learning_rate": 1.7000000000000003e-05, "loss": 1.0312, "loss/crossentropy": 2.4961040019989014, "loss/hidden": 0.8203125, "loss/logits": 0.148894801735878, "loss/reg": 0.006197560112923384, "step": 17 }, { "epoch": 0.00225, "grad_norm": 3.3259811401367188, "grad_norm_var": 22.58044614452358, "learning_rate": 1.8e-05, "loss": 1.3626, "loss/crossentropy": 2.5914387702941895, "loss/hidden": 1.046875, "loss/logits": 0.25370728969573975, "loss/reg": 0.006197475362569094, "step": 18 }, { "epoch": 0.002375, "grad_norm": 2.468914747238159, "grad_norm_var": 22.649171856957494, "learning_rate": 1.9e-05, "loss": 1.1683, "loss/crossentropy": 2.6096584796905518, "loss/hidden": 0.921875, "loss/logits": 0.18447336554527283, "loss/reg": 0.00619738781824708, "step": 19 }, { "epoch": 0.0025, "grad_norm": 2.3097646236419678, "grad_norm_var": 22.784756315801523, "learning_rate": 2e-05, "loss": 1.1605, "loss/crossentropy": 2.299048662185669, "loss/hidden": 0.9375, "loss/logits": 0.16106057167053223, "loss/reg": 0.006197274662554264, "step": 20 }, { "epoch": 0.002625, "grad_norm": 2.1111207008361816, "grad_norm_var": 22.911025462198744, "learning_rate": 2.1e-05, "loss": 0.939, "loss/crossentropy": 2.547258138656616, "loss/hidden": 0.75, "loss/logits": 0.12698382139205933, "loss/reg": 0.006197154987603426, "step": 21 }, { "epoch": 0.00275, "grad_norm": 2.4918222427368164, "grad_norm_var": 23.049732177187614, "learning_rate": 2.2000000000000003e-05, "loss": 1.2047, "loss/crossentropy": 2.2802374362945557, "loss/hidden": 0.953125, "loss/logits": 0.18965375423431396, "loss/reg": 0.006197045091539621, "step": 22 }, { "epoch": 0.002875, "grad_norm": 3.3273494243621826, "grad_norm_var": 23.069242834486193, "learning_rate": 2.3000000000000003e-05, "loss": 1.2554, "loss/crossentropy": 2.3062734603881836, "loss/hidden": 1.0078125, "loss/logits": 0.18566077947616577, "loss/reg": 0.006196921691298485, "step": 23 }, { "epoch": 0.003, "grad_norm": 2.5644068717956543, "grad_norm_var": 23.075070365271714, "learning_rate": 2.4e-05, "loss": 1.2266, "loss/crossentropy": 2.460878372192383, "loss/hidden": 0.98046875, "loss/logits": 0.18418912589550018, "loss/reg": 0.006196786183863878, "step": 24 }, { "epoch": 0.003125, "grad_norm": 2.3506264686584473, "grad_norm_var": 23.059636834121356, "learning_rate": 2.5e-05, "loss": 1.0205, "loss/crossentropy": 2.4281811714172363, "loss/hidden": 0.82421875, "loss/logits": 0.13434948027133942, "loss/reg": 0.0061966474168002605, "step": 25 }, { "epoch": 0.00325, "grad_norm": 2.25004506111145, "grad_norm_var": 23.062003716592635, "learning_rate": 2.6000000000000002e-05, "loss": 1.1133, "loss/crossentropy": 2.326843500137329, "loss/hidden": 0.9140625, "loss/logits": 0.13725802302360535, "loss/reg": 0.006196498870849609, "step": 26 }, { "epoch": 0.003375, "grad_norm": 2.283770799636841, "grad_norm_var": 0.2469546323472817, "learning_rate": 2.7000000000000002e-05, "loss": 1.1459, "loss/crossentropy": 2.3002493381500244, "loss/hidden": 0.9140625, "loss/logits": 0.16987068951129913, "loss/reg": 0.006196335889399052, "step": 27 }, { "epoch": 0.0035, "grad_norm": 2.805088758468628, "grad_norm_var": 0.24805442740468303, "learning_rate": 2.8000000000000003e-05, "loss": 1.0272, "loss/crossentropy": 2.510472536087036, "loss/hidden": 0.8359375, "loss/logits": 0.12927240133285522, "loss/reg": 0.006196176633238792, "step": 28 }, { "epoch": 0.003625, "grad_norm": 2.0331132411956787, "grad_norm_var": 0.2692014993258605, "learning_rate": 2.9e-05, "loss": 1.0913, "loss/crossentropy": 2.51584529876709, "loss/hidden": 0.87109375, "loss/logits": 0.15820594131946564, "loss/reg": 0.006195997819304466, "step": 29 }, { "epoch": 0.00375, "grad_norm": 2.1523566246032715, "grad_norm_var": 0.17596421900176604, "learning_rate": 3e-05, "loss": 1.0026, "loss/crossentropy": 2.704220771789551, "loss/hidden": 0.796875, "loss/logits": 0.14372289180755615, "loss/reg": 0.0061958180740475655, "step": 30 }, { "epoch": 0.003875, "grad_norm": 2.6658694744110107, "grad_norm_var": 0.1771001402109505, "learning_rate": 3.1e-05, "loss": 1.122, "loss/crossentropy": 2.4840426445007324, "loss/hidden": 0.89453125, "loss/logits": 0.1655040979385376, "loss/reg": 0.006195634603500366, "step": 31 }, { "epoch": 0.004, "grad_norm": 2.813079595565796, "grad_norm_var": 0.153583095436327, "learning_rate": 3.2000000000000005e-05, "loss": 1.0653, "loss/crossentropy": 2.442962646484375, "loss/hidden": 0.859375, "loss/logits": 0.14400474727153778, "loss/reg": 0.00619542459025979, "step": 32 }, { "epoch": 0.004125, "grad_norm": 2.4273953437805176, "grad_norm_var": 0.1496371777315666, "learning_rate": 3.3e-05, "loss": 1.1025, "loss/crossentropy": 2.515721559524536, "loss/hidden": 0.89453125, "loss/logits": 0.1460331827402115, "loss/reg": 0.006195210851728916, "step": 33 }, { "epoch": 0.00425, "grad_norm": 2.0594100952148438, "grad_norm_var": 0.11442956053255457, "learning_rate": 3.4000000000000007e-05, "loss": 1.118, "loss/crossentropy": 2.5347506999969482, "loss/hidden": 0.8984375, "loss/logits": 0.15760375559329987, "loss/reg": 0.006195001769810915, "step": 34 }, { "epoch": 0.004375, "grad_norm": 2.497893810272217, "grad_norm_var": 0.11457586733464495, "learning_rate": 3.5e-05, "loss": 1.2359, "loss/crossentropy": 1.7681002616882324, "loss/hidden": 1.0390625, "loss/logits": 0.13490143418312073, "loss/reg": 0.006194803398102522, "step": 35 }, { "epoch": 0.0045, "grad_norm": 3.3231709003448486, "grad_norm_var": 0.16029457606237638, "learning_rate": 3.6e-05, "loss": 1.3588, "loss/crossentropy": 2.729518175125122, "loss/hidden": 1.09375, "loss/logits": 0.20313453674316406, "loss/reg": 0.00619460316374898, "step": 36 }, { "epoch": 0.004625, "grad_norm": 2.5542962551116943, "grad_norm_var": 0.14901290879942408, "learning_rate": 3.7e-05, "loss": 1.1671, "loss/crossentropy": 2.3359429836273193, "loss/hidden": 0.9296875, "loss/logits": 0.17546769976615906, "loss/reg": 0.006194361485540867, "step": 37 }, { "epoch": 0.00475, "grad_norm": 3.5138309001922607, "grad_norm_var": 0.2080724542279834, "learning_rate": 3.8e-05, "loss": 1.2044, "loss/crossentropy": 2.447890520095825, "loss/hidden": 0.96484375, "loss/logits": 0.17756858468055725, "loss/reg": 0.0061941081658005714, "step": 38 }, { "epoch": 0.004875, "grad_norm": 3.813410758972168, "grad_norm_var": 0.2698887106917669, "learning_rate": 3.9000000000000006e-05, "loss": 1.0819, "loss/crossentropy": 2.766765832901001, "loss/hidden": 0.88671875, "loss/logits": 0.13325469195842743, "loss/reg": 0.006193886045366526, "step": 39 }, { "epoch": 0.005, "grad_norm": 3.1502718925476074, "grad_norm_var": 0.2860816910243668, "learning_rate": 4e-05, "loss": 1.3622, "loss/crossentropy": 2.3325388431549072, "loss/hidden": 1.109375, "loss/logits": 0.19087004661560059, "loss/reg": 0.006193609442561865, "step": 40 }, { "epoch": 0.005125, "grad_norm": 2.422366142272949, "grad_norm_var": 0.28336421674108553, "learning_rate": 4.1e-05, "loss": 1.2212, "loss/crossentropy": 2.3002498149871826, "loss/hidden": 0.96875, "loss/logits": 0.19054222106933594, "loss/reg": 0.00619333703070879, "step": 41 }, { "epoch": 0.00525, "grad_norm": 2.7353622913360596, "grad_norm_var": 0.2707266796228128, "learning_rate": 4.2e-05, "loss": 1.0549, "loss/crossentropy": 2.0319221019744873, "loss/hidden": 0.87890625, "loss/logits": 0.1140664741396904, "loss/reg": 0.006193041335791349, "step": 42 }, { "epoch": 0.005375, "grad_norm": 1.9425387382507324, "grad_norm_var": 0.2970857034274398, "learning_rate": 4.3e-05, "loss": 1.0366, "loss/crossentropy": 2.431666374206543, "loss/hidden": 0.83203125, "loss/logits": 0.1426728069782257, "loss/reg": 0.006192733999341726, "step": 43 }, { "epoch": 0.0055, "grad_norm": 2.7009642124176025, "grad_norm_var": 0.2960522402202514, "learning_rate": 4.4000000000000006e-05, "loss": 0.9824, "loss/crossentropy": 2.391608476638794, "loss/hidden": 0.78515625, "loss/logits": 0.13533324003219604, "loss/reg": 0.006192411296069622, "step": 44 }, { "epoch": 0.005625, "grad_norm": 2.6632983684539795, "grad_norm_var": 0.2669107471214488, "learning_rate": 4.5e-05, "loss": 1.1067, "loss/crossentropy": 2.7733116149902344, "loss/hidden": 0.87109375, "loss/logits": 0.1736893653869629, "loss/reg": 0.006192059256136417, "step": 45 }, { "epoch": 0.00575, "grad_norm": 2.1037468910217285, "grad_norm_var": 0.2707032714108967, "learning_rate": 4.600000000000001e-05, "loss": 0.9831, "loss/crossentropy": 2.4606895446777344, "loss/hidden": 0.7890625, "loss/logits": 0.13213258981704712, "loss/reg": 0.006191718857735395, "step": 46 }, { "epoch": 0.005875, "grad_norm": 2.1911983489990234, "grad_norm_var": 0.28768473978113296, "learning_rate": 4.7e-05, "loss": 0.9509, "loss/crossentropy": 2.6825270652770996, "loss/hidden": 0.76953125, "loss/logits": 0.11942489445209503, "loss/reg": 0.006191306747496128, "step": 47 }, { "epoch": 0.006, "grad_norm": 3.2640700340270996, "grad_norm_var": 0.30827796768009724, "learning_rate": 4.8e-05, "loss": 1.0346, "loss/crossentropy": 2.3665199279785156, "loss/hidden": 0.83203125, "loss/logits": 0.14068934321403503, "loss/reg": 0.0061909533105790615, "step": 48 }, { "epoch": 0.006125, "grad_norm": 2.259894847869873, "grad_norm_var": 0.3163475179157634, "learning_rate": 4.9e-05, "loss": 0.9647, "loss/crossentropy": 2.4414587020874023, "loss/hidden": 0.79296875, "loss/logits": 0.10987477004528046, "loss/reg": 0.0061905342154204845, "step": 49 }, { "epoch": 0.00625, "grad_norm": 2.7616565227508545, "grad_norm_var": 0.28721415330329, "learning_rate": 5e-05, "loss": 1.019, "loss/crossentropy": 2.0829460620880127, "loss/hidden": 0.83984375, "loss/logits": 0.11724002659320831, "loss/reg": 0.0061900559812784195, "step": 50 }, { "epoch": 0.006375, "grad_norm": 2.7897861003875732, "grad_norm_var": 0.28297568806904866, "learning_rate": 5.1000000000000006e-05, "loss": 0.853, "loss/crossentropy": 2.5636909008026123, "loss/hidden": 0.6953125, "loss/logits": 0.09577471762895584, "loss/reg": 0.00618965458124876, "step": 51 }, { "epoch": 0.0065, "grad_norm": 2.3134403228759766, "grad_norm_var": 0.2711290924819705, "learning_rate": 5.2000000000000004e-05, "loss": 1.0497, "loss/crossentropy": 2.440258026123047, "loss/hidden": 0.83984375, "loss/logits": 0.14791719615459442, "loss/reg": 0.006189141888171434, "step": 52 }, { "epoch": 0.006625, "grad_norm": 2.2032997608184814, "grad_norm_var": 0.2855897568404882, "learning_rate": 5.300000000000001e-05, "loss": 0.9934, "loss/crossentropy": 2.4747955799102783, "loss/hidden": 0.796875, "loss/logits": 0.13461169600486755, "loss/reg": 0.006188610102981329, "step": 53 }, { "epoch": 0.00675, "grad_norm": 2.267400026321411, "grad_norm_var": 0.24358579758792467, "learning_rate": 5.4000000000000005e-05, "loss": 1.1149, "loss/crossentropy": 2.705127477645874, "loss/hidden": 0.89453125, "loss/logits": 0.1585235595703125, "loss/reg": 0.0061880191788077354, "step": 54 }, { "epoch": 0.006875, "grad_norm": 2.281036853790283, "grad_norm_var": 0.14220569464836952, "learning_rate": 5.500000000000001e-05, "loss": 0.9642, "loss/crossentropy": 2.545010805130005, "loss/hidden": 0.78515625, "loss/logits": 0.11717304587364197, "loss/reg": 0.006187579594552517, "step": 55 }, { "epoch": 0.007, "grad_norm": 4.942420959472656, "grad_norm_var": 0.4975759650139497, "learning_rate": 5.6000000000000006e-05, "loss": 1.1237, "loss/crossentropy": 2.7698795795440674, "loss/hidden": 0.91796875, "loss/logits": 0.14385326206684113, "loss/reg": 0.006187067367136478, "step": 56 }, { "epoch": 0.007125, "grad_norm": 2.4213955402374268, "grad_norm_var": 0.4976009733976563, "learning_rate": 5.6999999999999996e-05, "loss": 1.0386, "loss/crossentropy": 2.572023868560791, "loss/hidden": 0.84765625, "loss/logits": 0.12909512221813202, "loss/reg": 0.006186594720929861, "step": 57 }, { "epoch": 0.00725, "grad_norm": 2.15891695022583, "grad_norm_var": 0.5091253321428854, "learning_rate": 5.8e-05, "loss": 0.961, "loss/crossentropy": 2.283557415008545, "loss/hidden": 0.7734375, "loss/logits": 0.12568500638008118, "loss/reg": 0.006185955833643675, "step": 58 }, { "epoch": 0.007375, "grad_norm": 2.36811900138855, "grad_norm_var": 0.48432608682591366, "learning_rate": 5.9e-05, "loss": 0.8386, "loss/crossentropy": 2.453810453414917, "loss/hidden": 0.6796875, "loss/logits": 0.09709502756595612, "loss/reg": 0.0061853062361478806, "step": 59 }, { "epoch": 0.0075, "grad_norm": 2.591327667236328, "grad_norm_var": 0.4836842483889178, "learning_rate": 6e-05, "loss": 1.033, "loss/crossentropy": 2.8110511302948, "loss/hidden": 0.81640625, "loss/logits": 0.1547423005104065, "loss/reg": 0.006184632424265146, "step": 60 }, { "epoch": 0.007625, "grad_norm": 2.0103816986083984, "grad_norm_var": 0.5047142009615214, "learning_rate": 6.1e-05, "loss": 0.9296, "loss/crossentropy": 2.15134334564209, "loss/hidden": 0.7578125, "loss/logits": 0.1099701076745987, "loss/reg": 0.0061841062270104885, "step": 61 }, { "epoch": 0.00775, "grad_norm": 1.80124831199646, "grad_norm_var": 0.5287549745746596, "learning_rate": 6.2e-05, "loss": 0.9266, "loss/crossentropy": 2.7054479122161865, "loss/hidden": 0.7421875, "loss/logits": 0.12253857403993607, "loss/reg": 0.0061835781671106815, "step": 62 }, { "epoch": 0.007875, "grad_norm": 2.277440309524536, "grad_norm_var": 0.5252193383179133, "learning_rate": 6.3e-05, "loss": 0.914, "loss/crossentropy": 2.6631381511688232, "loss/hidden": 0.734375, "loss/logits": 0.1177992895245552, "loss/reg": 0.0061830319464206696, "step": 63 }, { "epoch": 0.008, "grad_norm": 3.3314151763916016, "grad_norm_var": 0.531964164332922, "learning_rate": 6.400000000000001e-05, "loss": 1.29, "loss/crossentropy": 2.1269633769989014, "loss/hidden": 1.0625, "loss/logits": 0.16565865278244019, "loss/reg": 0.006182366982102394, "step": 64 }, { "epoch": 0.008125, "grad_norm": 4.333358287811279, "grad_norm_var": 0.7208240839518936, "learning_rate": 6.500000000000001e-05, "loss": 1.1615, "loss/crossentropy": 2.714442491531372, "loss/hidden": 0.94140625, "loss/logits": 0.15825161337852478, "loss/reg": 0.006181675940752029, "step": 65 }, { "epoch": 0.00825, "grad_norm": 2.853740930557251, "grad_norm_var": 0.7223776199927481, "learning_rate": 6.6e-05, "loss": 1.062, "loss/crossentropy": 2.2147135734558105, "loss/hidden": 0.8515625, "loss/logits": 0.14859826862812042, "loss/reg": 0.006180979777127504, "step": 66 }, { "epoch": 0.008375, "grad_norm": 2.8853657245635986, "grad_norm_var": 0.7242961395218184, "learning_rate": 6.7e-05, "loss": 0.9533, "loss/crossentropy": 2.619598388671875, "loss/hidden": 0.7734375, "loss/logits": 0.11804014444351196, "loss/reg": 0.006180332973599434, "step": 67 }, { "epoch": 0.0085, "grad_norm": 2.725229501724243, "grad_norm_var": 0.7142181363616674, "learning_rate": 6.800000000000001e-05, "loss": 1.1308, "loss/crossentropy": 2.4091367721557617, "loss/hidden": 0.90234375, "loss/logits": 0.16662752628326416, "loss/reg": 0.006179714575409889, "step": 68 }, { "epoch": 0.008625, "grad_norm": 2.93643856048584, "grad_norm_var": 0.6977178730278022, "learning_rate": 6.9e-05, "loss": 1.1414, "loss/crossentropy": 2.509793281555176, "loss/hidden": 0.90234375, "loss/logits": 0.17730477452278137, "loss/reg": 0.0061789220198988914, "step": 69 }, { "epoch": 0.00875, "grad_norm": 2.4086973667144775, "grad_norm_var": 0.6896555586144653, "learning_rate": 7e-05, "loss": 0.9852, "loss/crossentropy": 2.7080371379852295, "loss/hidden": 0.7890625, "loss/logits": 0.1343374401330948, "loss/reg": 0.0061781019903719425, "step": 70 }, { "epoch": 0.008875, "grad_norm": 1.9355547428131104, "grad_norm_var": 0.7196579708330165, "learning_rate": 7.1e-05, "loss": 0.9176, "loss/crossentropy": 2.451488494873047, "loss/hidden": 0.7421875, "loss/logits": 0.11365102231502533, "loss/reg": 0.006177456583827734, "step": 71 }, { "epoch": 0.009, "grad_norm": 2.273902654647827, "grad_norm_var": 0.38422972669649574, "learning_rate": 7.2e-05, "loss": 1.0112, "loss/crossentropy": 2.4479947090148926, "loss/hidden": 0.8125, "loss/logits": 0.13690924644470215, "loss/reg": 0.006176764145493507, "step": 72 }, { "epoch": 0.009125, "grad_norm": 3.385849952697754, "grad_norm_var": 0.4217084598233742, "learning_rate": 7.3e-05, "loss": 1.3992, "loss/crossentropy": 2.3916804790496826, "loss/hidden": 1.1484375, "loss/logits": 0.18896484375, "loss/reg": 0.006176079623401165, "step": 73 }, { "epoch": 0.00925, "grad_norm": 1.893932580947876, "grad_norm_var": 0.44317594415441114, "learning_rate": 7.4e-05, "loss": 0.9357, "loss/crossentropy": 2.3809518814086914, "loss/hidden": 0.74609375, "loss/logits": 0.12787015736103058, "loss/reg": 0.00617539556697011, "step": 74 }, { "epoch": 0.009375, "grad_norm": 2.431032657623291, "grad_norm_var": 0.4412621914582907, "learning_rate": 7.500000000000001e-05, "loss": 1.0796, "loss/crossentropy": 2.5346295833587646, "loss/hidden": 0.86328125, "loss/logits": 0.1545613557100296, "loss/reg": 0.006174764130264521, "step": 75 }, { "epoch": 0.0095, "grad_norm": 2.2421321868896484, "grad_norm_var": 0.45066905079875685, "learning_rate": 7.6e-05, "loss": 0.9869, "loss/crossentropy": 2.756843090057373, "loss/hidden": 0.796875, "loss/logits": 0.1282375454902649, "loss/reg": 0.006174163427203894, "step": 76 }, { "epoch": 0.009625, "grad_norm": 2.7022979259490967, "grad_norm_var": 0.4254703741989109, "learning_rate": 7.7e-05, "loss": 1.2503, "loss/crossentropy": 2.0696699619293213, "loss/hidden": 1.015625, "loss/logits": 0.1729813814163208, "loss/reg": 0.006173421163111925, "step": 77 }, { "epoch": 0.00975, "grad_norm": 2.501106023788452, "grad_norm_var": 0.37677934250983375, "learning_rate": 7.800000000000001e-05, "loss": 1.0516, "loss/crossentropy": 2.629380941390991, "loss/hidden": 0.83984375, "loss/logits": 0.15003597736358643, "loss/reg": 0.006172672379761934, "step": 78 }, { "epoch": 0.009875, "grad_norm": 2.137601137161255, "grad_norm_var": 0.3857841035513881, "learning_rate": 7.900000000000001e-05, "loss": 0.9388, "loss/crossentropy": 2.6841280460357666, "loss/hidden": 0.75, "loss/logits": 0.12706515192985535, "loss/reg": 0.006171974819153547, "step": 79 }, { "epoch": 0.01, "grad_norm": 4.655951976776123, "grad_norm_var": 0.6093991769416703, "learning_rate": 8e-05, "loss": 1.2659, "loss/crossentropy": 2.4634439945220947, "loss/hidden": 1.0390625, "loss/logits": 0.16511483490467072, "loss/reg": 0.006171175744384527, "step": 80 }, { "epoch": 0.010125, "grad_norm": 2.2418179512023926, "grad_norm_var": 0.44652068466097317, "learning_rate": 8.1e-05, "loss": 1.0773, "loss/crossentropy": 2.479743480682373, "loss/hidden": 0.87890625, "loss/logits": 0.1366729438304901, "loss/reg": 0.006170437205582857, "step": 81 }, { "epoch": 0.01025, "grad_norm": 2.0470192432403564, "grad_norm_var": 0.4640077865797357, "learning_rate": 8.2e-05, "loss": 0.8599, "loss/crossentropy": 2.440803050994873, "loss/hidden": 0.68359375, "loss/logits": 0.11458206921815872, "loss/reg": 0.0061697582714259624, "step": 82 }, { "epoch": 0.010375, "grad_norm": 2.0131125450134277, "grad_norm_var": 0.47694604476552793, "learning_rate": 8.3e-05, "loss": 0.8585, "loss/crossentropy": 2.480877637863159, "loss/hidden": 0.6875, "loss/logits": 0.10927767306566238, "loss/reg": 0.006169027183204889, "step": 83 }, { "epoch": 0.0105, "grad_norm": 2.2644267082214355, "grad_norm_var": 0.47842071328175656, "learning_rate": 8.4e-05, "loss": 0.8351, "loss/crossentropy": 2.693246841430664, "loss/hidden": 0.67578125, "loss/logits": 0.09764716029167175, "loss/reg": 0.006168315652757883, "step": 84 }, { "epoch": 0.010625, "grad_norm": 3.1729207038879395, "grad_norm_var": 0.4955376038232837, "learning_rate": 8.5e-05, "loss": 1.2314, "loss/crossentropy": 2.3339309692382812, "loss/hidden": 1.015625, "loss/logits": 0.15408015251159668, "loss/reg": 0.006167604587972164, "step": 85 }, { "epoch": 0.01075, "grad_norm": 2.281872510910034, "grad_norm_var": 0.4984116504809473, "learning_rate": 8.6e-05, "loss": 1.1113, "loss/crossentropy": 2.410794258117676, "loss/hidden": 0.8828125, "loss/logits": 0.16686803102493286, "loss/reg": 0.0061669000424444675, "step": 86 }, { "epoch": 0.010875, "grad_norm": 2.701244354248047, "grad_norm_var": 0.4762769450482454, "learning_rate": 8.7e-05, "loss": 0.9115, "loss/crossentropy": 2.5270962715148926, "loss/hidden": 0.73046875, "loss/logits": 0.11935658752918243, "loss/reg": 0.0061660343781113625, "step": 87 }, { "epoch": 0.011, "grad_norm": 2.0738677978515625, "grad_norm_var": 0.4863854399313406, "learning_rate": 8.800000000000001e-05, "loss": 0.9634, "loss/crossentropy": 2.625903844833374, "loss/hidden": 0.7734375, "loss/logits": 0.12826378643512726, "loss/reg": 0.006165289785712957, "step": 88 }, { "epoch": 0.011125, "grad_norm": 2.827744245529175, "grad_norm_var": 0.44340376520124375, "learning_rate": 8.900000000000001e-05, "loss": 1.0134, "loss/crossentropy": 2.2436654567718506, "loss/hidden": 0.80078125, "loss/logits": 0.15097512304782867, "loss/reg": 0.006164397578686476, "step": 89 }, { "epoch": 0.01125, "grad_norm": 2.412203788757324, "grad_norm_var": 0.4174983019540292, "learning_rate": 9e-05, "loss": 0.9541, "loss/crossentropy": 2.4847052097320557, "loss/hidden": 0.78515625, "loss/logits": 0.10735376924276352, "loss/reg": 0.006163434591144323, "step": 90 }, { "epoch": 0.011375, "grad_norm": 2.385309934616089, "grad_norm_var": 0.41831854842319344, "learning_rate": 9.1e-05, "loss": 1.0455, "loss/crossentropy": 2.1011688709259033, "loss/hidden": 0.828125, "loss/logits": 0.15577414631843567, "loss/reg": 0.0061626131646335125, "step": 91 }, { "epoch": 0.0115, "grad_norm": 2.779266595840454, "grad_norm_var": 0.4149256226543306, "learning_rate": 9.200000000000001e-05, "loss": 0.9782, "loss/crossentropy": 2.770954132080078, "loss/hidden": 0.78125, "loss/logits": 0.13530117273330688, "loss/reg": 0.006161784287542105, "step": 92 }, { "epoch": 0.011625, "grad_norm": 2.816206216812134, "grad_norm_var": 0.41767206123470924, "learning_rate": 9.300000000000001e-05, "loss": 1.2584, "loss/crossentropy": 2.4919488430023193, "loss/hidden": 1.0234375, "loss/logits": 0.17335021495819092, "loss/reg": 0.006160792429000139, "step": 93 }, { "epoch": 0.01175, "grad_norm": 2.1000349521636963, "grad_norm_var": 0.4320504871954351, "learning_rate": 9.4e-05, "loss": 0.9293, "loss/crossentropy": 2.6951355934143066, "loss/hidden": 0.7421875, "loss/logits": 0.12551091611385345, "loss/reg": 0.006159830838441849, "step": 94 }, { "epoch": 0.011875, "grad_norm": 2.6696228981018066, "grad_norm_var": 0.4199965621062515, "learning_rate": 9.5e-05, "loss": 1.0491, "loss/crossentropy": 2.6532485485076904, "loss/hidden": 0.83984375, "loss/logits": 0.14771661162376404, "loss/reg": 0.006158801261335611, "step": 95 }, { "epoch": 0.012, "grad_norm": 2.308758020401001, "grad_norm_var": 0.11782165750081125, "learning_rate": 9.6e-05, "loss": 1.1178, "loss/crossentropy": 2.38185977935791, "loss/hidden": 0.90625, "loss/logits": 0.1499352604150772, "loss/reg": 0.006157839670777321, "step": 96 }, { "epoch": 0.012125, "grad_norm": 2.4204304218292236, "grad_norm_var": 0.11501335190634426, "learning_rate": 9.7e-05, "loss": 1.092, "loss/crossentropy": 2.4358534812927246, "loss/hidden": 0.86328125, "loss/logits": 0.16712763905525208, "loss/reg": 0.006156752817332745, "step": 97 }, { "epoch": 0.01225, "grad_norm": 3.7184524536132812, "grad_norm_var": 0.198780236272727, "learning_rate": 9.8e-05, "loss": 1.4311, "loss/crossentropy": 2.1283679008483887, "loss/hidden": 1.171875, "loss/logits": 0.1976230889558792, "loss/reg": 0.006155804730951786, "step": 98 }, { "epoch": 0.012375, "grad_norm": 3.2656571865081787, "grad_norm_var": 0.20565265002658914, "learning_rate": 9.900000000000001e-05, "loss": 1.017, "loss/crossentropy": 2.6715664863586426, "loss/hidden": 0.80078125, "loss/logits": 0.15465494990348816, "loss/reg": 0.006154791917651892, "step": 99 }, { "epoch": 0.0125, "grad_norm": 2.915663719177246, "grad_norm_var": 0.19977570339779593, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.5455305576324463, "loss/hidden": 0.77734375, "loss/logits": 0.1410846710205078, "loss/reg": 0.0061536673456430435, "step": 100 }, { "epoch": 0.012625, "grad_norm": 3.3153059482574463, "grad_norm_var": 0.2104372314148539, "learning_rate": 0.0001, "loss": 1.1039, "loss/crossentropy": 2.455479621887207, "loss/hidden": 0.90625, "loss/logits": 0.13615351915359497, "loss/reg": 0.0061526307836174965, "step": 101 }, { "epoch": 0.01275, "grad_norm": 2.40315318107605, "grad_norm_var": 0.20480568897691, "learning_rate": 0.0001, "loss": 0.9588, "loss/crossentropy": 2.6359853744506836, "loss/hidden": 0.76953125, "loss/logits": 0.1277719885110855, "loss/reg": 0.006151493173092604, "step": 102 }, { "epoch": 0.012875, "grad_norm": 3.625624895095825, "grad_norm_var": 0.25903479701245613, "learning_rate": 0.0001, "loss": 1.2481, "loss/crossentropy": 2.0148656368255615, "loss/hidden": 1.046875, "loss/logits": 0.13969773054122925, "loss/reg": 0.006150420755147934, "step": 103 }, { "epoch": 0.013, "grad_norm": 2.497906446456909, "grad_norm_var": 0.23191354079432358, "learning_rate": 0.0001, "loss": 1.0603, "loss/crossentropy": 2.3493525981903076, "loss/hidden": 0.86328125, "loss/logits": 0.13548779487609863, "loss/reg": 0.006149281747639179, "step": 104 }, { "epoch": 0.013125, "grad_norm": 3.258059501647949, "grad_norm_var": 0.24629299643454275, "learning_rate": 0.0001, "loss": 0.9497, "loss/crossentropy": 2.6988418102264404, "loss/hidden": 0.7734375, "loss/logits": 0.11473990976810455, "loss/reg": 0.006148339249193668, "step": 105 }, { "epoch": 0.01325, "grad_norm": 3.1279666423797607, "grad_norm_var": 0.24075672502018505, "learning_rate": 0.0001, "loss": 1.1195, "loss/crossentropy": 2.578716278076172, "loss/hidden": 0.875, "loss/logits": 0.18304204940795898, "loss/reg": 0.006147205363959074, "step": 106 }, { "epoch": 0.013375, "grad_norm": 2.760901927947998, "grad_norm_var": 0.22627915570051277, "learning_rate": 0.0001, "loss": 0.9369, "loss/crossentropy": 2.5835328102111816, "loss/hidden": 0.75, "loss/logits": 0.12544697523117065, "loss/reg": 0.006146106868982315, "step": 107 }, { "epoch": 0.0135, "grad_norm": 3.2917559146881104, "grad_norm_var": 0.23622539643692994, "learning_rate": 0.0001, "loss": 1.1437, "loss/crossentropy": 2.6001460552215576, "loss/hidden": 0.91796875, "loss/logits": 0.16428819298744202, "loss/reg": 0.006144997663795948, "step": 108 }, { "epoch": 0.013625, "grad_norm": 3.3908517360687256, "grad_norm_var": 0.2499864352593607, "learning_rate": 0.0001, "loss": 1.0747, "loss/crossentropy": 2.6003377437591553, "loss/hidden": 0.87109375, "loss/logits": 0.14213082194328308, "loss/reg": 0.00614393362775445, "step": 109 }, { "epoch": 0.01375, "grad_norm": 2.7455620765686035, "grad_norm_var": 0.2035723185991922, "learning_rate": 0.0001, "loss": 1.1844, "loss/crossentropy": 2.446432113647461, "loss/hidden": 0.94921875, "loss/logits": 0.17372827231884003, "loss/reg": 0.00614282488822937, "step": 110 }, { "epoch": 0.013875, "grad_norm": 2.899392604827881, "grad_norm_var": 0.1972949454934593, "learning_rate": 0.0001, "loss": 1.0314, "loss/crossentropy": 2.4233920574188232, "loss/hidden": 0.83984375, "loss/logits": 0.13018067181110382, "loss/reg": 0.00614172825589776, "step": 111 }, { "epoch": 0.014, "grad_norm": 2.204866647720337, "grad_norm_var": 0.20749751086427656, "learning_rate": 0.0001, "loss": 0.9867, "loss/crossentropy": 2.4006736278533936, "loss/hidden": 0.79296875, "loss/logits": 0.13233302533626556, "loss/reg": 0.006140332669019699, "step": 112 }, { "epoch": 0.014125, "grad_norm": 2.5094263553619385, "grad_norm_var": 0.20123279411857975, "learning_rate": 0.0001, "loss": 1.2429, "loss/crossentropy": 2.2730560302734375, "loss/hidden": 1.0078125, "loss/logits": 0.1737476885318756, "loss/reg": 0.006138913799077272, "step": 113 }, { "epoch": 0.01425, "grad_norm": 2.590543031692505, "grad_norm_var": 0.17204464736018749, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.5709896087646484, "loss/hidden": 0.79296875, "loss/logits": 0.1542350947856903, "loss/reg": 0.0061377594247460365, "step": 114 }, { "epoch": 0.014375, "grad_norm": 2.5024876594543457, "grad_norm_var": 0.17379926494707643, "learning_rate": 0.0001, "loss": 1.0309, "loss/crossentropy": 2.539165496826172, "loss/hidden": 0.828125, "loss/logits": 0.14142319560050964, "loss/reg": 0.006136584095656872, "step": 115 }, { "epoch": 0.0145, "grad_norm": 3.2216732501983643, "grad_norm_var": 0.18121036366206128, "learning_rate": 0.0001, "loss": 0.9404, "loss/crossentropy": 2.7685325145721436, "loss/hidden": 0.765625, "loss/logits": 0.1133967787027359, "loss/reg": 0.006135319825261831, "step": 116 }, { "epoch": 0.014625, "grad_norm": 2.3834009170532227, "grad_norm_var": 0.18346146088524526, "learning_rate": 0.0001, "loss": 1.1432, "loss/crossentropy": 2.4507999420166016, "loss/hidden": 0.92578125, "loss/logits": 0.1561031937599182, "loss/reg": 0.006133983377367258, "step": 117 }, { "epoch": 0.01475, "grad_norm": 2.4703636169433594, "grad_norm_var": 0.17984383474256424, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.3506076335906982, "loss/hidden": 0.84765625, "loss/logits": 0.14511807262897491, "loss/reg": 0.006132753100246191, "step": 118 }, { "epoch": 0.014875, "grad_norm": 2.5960817337036133, "grad_norm_var": 0.13859654880591943, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.427006244659424, "loss/hidden": 0.96875, "loss/logits": 0.1855170726776123, "loss/reg": 0.006131566129624844, "step": 119 }, { "epoch": 0.015, "grad_norm": 2.908734083175659, "grad_norm_var": 0.13379147574996655, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.4075210094451904, "loss/hidden": 0.81640625, "loss/logits": 0.13592825829982758, "loss/reg": 0.006130332592874765, "step": 120 }, { "epoch": 0.015125, "grad_norm": 3.450002670288086, "grad_norm_var": 0.147717685364636, "learning_rate": 0.0001, "loss": 1.1584, "loss/crossentropy": 2.446925640106201, "loss/hidden": 0.92578125, "loss/logits": 0.17129938304424286, "loss/reg": 0.0061291721649467945, "step": 121 }, { "epoch": 0.01525, "grad_norm": 2.941195011138916, "grad_norm_var": 0.14212594790061886, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.5499086380004883, "loss/hidden": 0.87109375, "loss/logits": 0.1672220528125763, "loss/reg": 0.006127914879471064, "step": 122 }, { "epoch": 0.015375, "grad_norm": 2.951799154281616, "grad_norm_var": 0.14330143067309015, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.654383420944214, "loss/hidden": 0.87109375, "loss/logits": 0.15379250049591064, "loss/reg": 0.006126696243882179, "step": 123 }, { "epoch": 0.0155, "grad_norm": 2.5093131065368652, "grad_norm_var": 0.13194533540905293, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.4646618366241455, "loss/hidden": 0.87890625, "loss/logits": 0.15029752254486084, "loss/reg": 0.006125394720584154, "step": 124 }, { "epoch": 0.015625, "grad_norm": 2.357142448425293, "grad_norm_var": 0.11277765633995311, "learning_rate": 0.0001, "loss": 1.0794, "loss/crossentropy": 2.4590322971343994, "loss/hidden": 0.87109375, "loss/logits": 0.1471107453107834, "loss/reg": 0.0061240773648023605, "step": 125 }, { "epoch": 0.01575, "grad_norm": 2.0443954467773438, "grad_norm_var": 0.13949059079901172, "learning_rate": 0.0001, "loss": 1.0064, "loss/crossentropy": 2.6105568408966064, "loss/hidden": 0.80859375, "loss/logits": 0.13658249378204346, "loss/reg": 0.006122750695794821, "step": 126 }, { "epoch": 0.015875, "grad_norm": 2.334003448486328, "grad_norm_var": 0.1413326038540049, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.3226428031921387, "loss/hidden": 0.8984375, "loss/logits": 0.16836631298065186, "loss/reg": 0.006121381651610136, "step": 127 }, { "epoch": 0.016, "grad_norm": 2.6693766117095947, "grad_norm_var": 0.12889249481462456, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.5844597816467285, "loss/hidden": 0.84765625, "loss/logits": 0.1388963758945465, "loss/reg": 0.006120136007666588, "step": 128 }, { "epoch": 0.016125, "grad_norm": 3.935439348220825, "grad_norm_var": 0.22878447427120438, "learning_rate": 0.0001, "loss": 1.1726, "loss/crossentropy": 2.7213780879974365, "loss/hidden": 0.9375, "loss/logits": 0.1738772690296173, "loss/reg": 0.006118897348642349, "step": 129 }, { "epoch": 0.01625, "grad_norm": 3.463432788848877, "grad_norm_var": 0.25882213944617144, "learning_rate": 0.0001, "loss": 1.0898, "loss/crossentropy": 2.3635873794555664, "loss/hidden": 0.8828125, "loss/logits": 0.1457763910293579, "loss/reg": 0.006117486394941807, "step": 130 }, { "epoch": 0.016375, "grad_norm": 3.779526948928833, "grad_norm_var": 0.31074183113488135, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.316762924194336, "loss/hidden": 0.98046875, "loss/logits": 0.16614478826522827, "loss/reg": 0.006116243079304695, "step": 131 }, { "epoch": 0.0165, "grad_norm": 2.7554008960723877, "grad_norm_var": 0.3028391023812749, "learning_rate": 0.0001, "loss": 0.9769, "loss/crossentropy": 2.458954095840454, "loss/hidden": 0.7890625, "loss/logits": 0.12667913734912872, "loss/reg": 0.006114880088716745, "step": 132 }, { "epoch": 0.016625, "grad_norm": 2.342526435852051, "grad_norm_var": 0.30546929082944035, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.6329517364501953, "loss/hidden": 0.890625, "loss/logits": 0.161947563290596, "loss/reg": 0.0061136274598538876, "step": 133 }, { "epoch": 0.01675, "grad_norm": 2.2754058837890625, "grad_norm_var": 0.31756495416411024, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.2747550010681152, "loss/hidden": 0.94921875, "loss/logits": 0.15994513034820557, "loss/reg": 0.006112351547926664, "step": 134 }, { "epoch": 0.016875, "grad_norm": 3.1313912868499756, "grad_norm_var": 0.3186282278045513, "learning_rate": 0.0001, "loss": 1.2333, "loss/crossentropy": 2.4932894706726074, "loss/hidden": 0.99609375, "loss/logits": 0.17612434923648834, "loss/reg": 0.006111042574048042, "step": 135 }, { "epoch": 0.017, "grad_norm": 3.960482358932495, "grad_norm_var": 0.39381746513703864, "learning_rate": 0.0001, "loss": 1.3101, "loss/crossentropy": 2.581660747528076, "loss/hidden": 1.0625, "loss/logits": 0.18646802008152008, "loss/reg": 0.006109676789492369, "step": 136 }, { "epoch": 0.017125, "grad_norm": 2.7605810165405273, "grad_norm_var": 0.37584340109069647, "learning_rate": 0.0001, "loss": 0.8792, "loss/crossentropy": 2.6490936279296875, "loss/hidden": 0.703125, "loss/logits": 0.1150316372513771, "loss/reg": 0.006108277477324009, "step": 137 }, { "epoch": 0.01725, "grad_norm": 2.6196203231811523, "grad_norm_var": 0.38003486499210315, "learning_rate": 0.0001, "loss": 0.955, "loss/crossentropy": 2.633441209793091, "loss/hidden": 0.76953125, "loss/logits": 0.1244344562292099, "loss/reg": 0.006106934975832701, "step": 138 }, { "epoch": 0.017375, "grad_norm": 4.534512519836426, "grad_norm_var": 0.554255985026353, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.2204151153564453, "loss/hidden": 1.1796875, "loss/logits": 0.1696874350309372, "loss/reg": 0.0061056241393089294, "step": 139 }, { "epoch": 0.0175, "grad_norm": 2.192370653152466, "grad_norm_var": 0.5798771099829023, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.375506639480591, "loss/hidden": 0.921875, "loss/logits": 0.14694982767105103, "loss/reg": 0.0061043244786560535, "step": 140 }, { "epoch": 0.017625, "grad_norm": 4.368403911590576, "grad_norm_var": 0.6744588881998081, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.3692545890808105, "loss/hidden": 1.03125, "loss/logits": 0.18568292260169983, "loss/reg": 0.006102937273681164, "step": 141 }, { "epoch": 0.01775, "grad_norm": 2.2753779888153076, "grad_norm_var": 0.6461169960118004, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.470676898956299, "loss/hidden": 0.82421875, "loss/logits": 0.14231771230697632, "loss/reg": 0.006101653911173344, "step": 142 }, { "epoch": 0.017875, "grad_norm": 2.6550562381744385, "grad_norm_var": 0.6203099666067883, "learning_rate": 0.0001, "loss": 0.8712, "loss/crossentropy": 2.8198063373565674, "loss/hidden": 0.69921875, "loss/logits": 0.11099085956811905, "loss/reg": 0.006100376136600971, "step": 143 }, { "epoch": 0.018, "grad_norm": 2.8701858520507812, "grad_norm_var": 0.6111015072729884, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.413463830947876, "loss/hidden": 0.96484375, "loss/logits": 0.15351834893226624, "loss/reg": 0.006099053658545017, "step": 144 }, { "epoch": 0.018125, "grad_norm": 2.2347958087921143, "grad_norm_var": 0.6069563505613275, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.446056604385376, "loss/hidden": 0.8671875, "loss/logits": 0.1550455242395401, "loss/reg": 0.006097796373069286, "step": 145 }, { "epoch": 0.01825, "grad_norm": 2.60143780708313, "grad_norm_var": 0.6017061449507364, "learning_rate": 0.0001, "loss": 1.1216, "loss/crossentropy": 2.2890260219573975, "loss/hidden": 0.8984375, "loss/logits": 0.16223573684692383, "loss/reg": 0.006096460856497288, "step": 146 }, { "epoch": 0.018375, "grad_norm": 3.656100273132324, "grad_norm_var": 0.5891684064627459, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.2077646255493164, "loss/hidden": 1.0546875, "loss/logits": 0.16024138033390045, "loss/reg": 0.006095105782151222, "step": 147 }, { "epoch": 0.0185, "grad_norm": 2.8190999031066895, "grad_norm_var": 0.5877513730221795, "learning_rate": 0.0001, "loss": 1.1416, "loss/crossentropy": 2.4892842769622803, "loss/hidden": 0.9140625, "loss/logits": 0.1665700376033783, "loss/reg": 0.0060938019305467606, "step": 148 }, { "epoch": 0.018625, "grad_norm": 2.6578848361968994, "grad_norm_var": 0.568168306773175, "learning_rate": 0.0001, "loss": 1.1443, "loss/crossentropy": 2.3138527870178223, "loss/hidden": 0.93359375, "loss/logits": 0.14977282285690308, "loss/reg": 0.006092346739023924, "step": 149 }, { "epoch": 0.01875, "grad_norm": 2.656559944152832, "grad_norm_var": 0.5416540961853636, "learning_rate": 0.0001, "loss": 0.9868, "loss/crossentropy": 2.7701377868652344, "loss/hidden": 0.796875, "loss/logits": 0.12901648879051208, "loss/reg": 0.006090943701565266, "step": 150 }, { "epoch": 0.018875, "grad_norm": 1.9359983205795288, "grad_norm_var": 0.6099613145708634, "learning_rate": 0.0001, "loss": 0.9127, "loss/crossentropy": 2.55560040473938, "loss/hidden": 0.73828125, "loss/logits": 0.11351295560598373, "loss/reg": 0.00608965614810586, "step": 151 }, { "epoch": 0.019, "grad_norm": 3.7978732585906982, "grad_norm_var": 0.5891613317586338, "learning_rate": 0.0001, "loss": 1.2275, "loss/crossentropy": 2.4227731227874756, "loss/hidden": 0.98828125, "loss/logits": 0.17836451530456543, "loss/reg": 0.006088252179324627, "step": 152 }, { "epoch": 0.019125, "grad_norm": 2.8193647861480713, "grad_norm_var": 0.588169020521083, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.474368095397949, "loss/hidden": 0.80078125, "loss/logits": 0.11225409805774689, "loss/reg": 0.006086937617510557, "step": 153 }, { "epoch": 0.01925, "grad_norm": 2.2882325649261475, "grad_norm_var": 0.6082348956957436, "learning_rate": 0.0001, "loss": 1.0395, "loss/crossentropy": 2.3776350021362305, "loss/hidden": 0.82421875, "loss/logits": 0.15443992614746094, "loss/reg": 0.0060854703187942505, "step": 154 }, { "epoch": 0.019375, "grad_norm": 2.006150245666504, "grad_norm_var": 0.4559805309993303, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.7556076049804688, "loss/hidden": 0.78515625, "loss/logits": 0.13019207119941711, "loss/reg": 0.006084186024963856, "step": 155 }, { "epoch": 0.0195, "grad_norm": 2.8143231868743896, "grad_norm_var": 0.43477030174237014, "learning_rate": 0.0001, "loss": 1.1927, "loss/crossentropy": 2.652045249938965, "loss/hidden": 0.94140625, "loss/logits": 0.19042611122131348, "loss/reg": 0.00608274107798934, "step": 156 }, { "epoch": 0.019625, "grad_norm": 2.957540988922119, "grad_norm_var": 0.2601037584282233, "learning_rate": 0.0001, "loss": 1.0641, "loss/crossentropy": 2.546213150024414, "loss/hidden": 0.86328125, "loss/logits": 0.14000022411346436, "loss/reg": 0.006081291940063238, "step": 157 }, { "epoch": 0.01975, "grad_norm": 2.625493288040161, "grad_norm_var": 0.24839219907499052, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.5120432376861572, "loss/hidden": 0.81640625, "loss/logits": 0.13474689424037933, "loss/reg": 0.006079958751797676, "step": 158 }, { "epoch": 0.019875, "grad_norm": 2.6614878177642822, "grad_norm_var": 0.2483457330217589, "learning_rate": 0.0001, "loss": 0.9873, "loss/crossentropy": 2.312061071395874, "loss/hidden": 0.80859375, "loss/logits": 0.11790065467357635, "loss/reg": 0.006078665144741535, "step": 159 }, { "epoch": 0.02, "grad_norm": 2.6204919815063477, "grad_norm_var": 0.24699792562249925, "learning_rate": 0.0001, "loss": 1.0488, "loss/crossentropy": 2.505072593688965, "loss/hidden": 0.84375, "loss/logits": 0.14428117871284485, "loss/reg": 0.006077310536056757, "step": 160 }, { "epoch": 0.020125, "grad_norm": 3.107072591781616, "grad_norm_var": 0.24079003208151678, "learning_rate": 0.0001, "loss": 1.1736, "loss/crossentropy": 2.6514599323272705, "loss/hidden": 0.96484375, "loss/logits": 0.1480400413274765, "loss/reg": 0.006076075602322817, "step": 161 }, { "epoch": 0.02025, "grad_norm": 2.669001817703247, "grad_norm_var": 0.23972287159530806, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.4616479873657227, "loss/hidden": 0.9765625, "loss/logits": 0.15933012962341309, "loss/reg": 0.006074720993638039, "step": 162 }, { "epoch": 0.020375, "grad_norm": 2.5872421264648438, "grad_norm_var": 0.1828196031273113, "learning_rate": 0.0001, "loss": 1.0551, "loss/crossentropy": 2.5483999252319336, "loss/hidden": 0.83984375, "loss/logits": 0.1544739305973053, "loss/reg": 0.006073469761759043, "step": 163 }, { "epoch": 0.0205, "grad_norm": 2.3342509269714355, "grad_norm_var": 0.1891007671877621, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.610344171524048, "loss/hidden": 0.90234375, "loss/logits": 0.17876723408699036, "loss/reg": 0.006072178483009338, "step": 164 }, { "epoch": 0.020625, "grad_norm": 2.548274278640747, "grad_norm_var": 0.18986337395058156, "learning_rate": 0.0001, "loss": 0.9512, "loss/crossentropy": 2.747725009918213, "loss/hidden": 0.7734375, "loss/logits": 0.11706214398145676, "loss/reg": 0.00607073912397027, "step": 165 }, { "epoch": 0.02075, "grad_norm": 2.666066884994507, "grad_norm_var": 0.18987501227134793, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.3086578845977783, "loss/hidden": 0.83984375, "loss/logits": 0.1551416665315628, "loss/reg": 0.006069260183721781, "step": 166 }, { "epoch": 0.020875, "grad_norm": 3.363084554672241, "grad_norm_var": 0.18083982986582872, "learning_rate": 0.0001, "loss": 0.9886, "loss/crossentropy": 2.7422661781311035, "loss/hidden": 0.79296875, "loss/logits": 0.13497118651866913, "loss/reg": 0.006067754700779915, "step": 167 }, { "epoch": 0.021, "grad_norm": 2.717400550842285, "grad_norm_var": 0.10163689874761227, "learning_rate": 0.0001, "loss": 1.2413, "loss/crossentropy": 2.341296672821045, "loss/hidden": 1.0078125, "loss/logits": 0.17277640104293823, "loss/reg": 0.006066245958209038, "step": 168 }, { "epoch": 0.021125, "grad_norm": 2.2773897647857666, "grad_norm_var": 0.10949759007257095, "learning_rate": 0.0001, "loss": 0.9531, "loss/crossentropy": 2.492532968521118, "loss/hidden": 0.76953125, "loss/logits": 0.12295819818973541, "loss/reg": 0.006064848508685827, "step": 169 }, { "epoch": 0.02125, "grad_norm": 2.7625067234039307, "grad_norm_var": 0.1012976809853086, "learning_rate": 0.0001, "loss": 1.0102, "loss/crossentropy": 2.3799381256103516, "loss/hidden": 0.80859375, "loss/logits": 0.140989288687706, "loss/reg": 0.0060633583925664425, "step": 170 }, { "epoch": 0.021375, "grad_norm": 3.713162899017334, "grad_norm_var": 0.1323542313667114, "learning_rate": 0.0001, "loss": 1.0173, "loss/crossentropy": 2.7296385765075684, "loss/hidden": 0.80078125, "loss/logits": 0.1559314727783203, "loss/reg": 0.006062004715204239, "step": 171 }, { "epoch": 0.0215, "grad_norm": 2.8448026180267334, "grad_norm_var": 0.13256580340874963, "learning_rate": 0.0001, "loss": 1.0945, "loss/crossentropy": 2.211848497390747, "loss/hidden": 0.87890625, "loss/logits": 0.15503031015396118, "loss/reg": 0.006060663145035505, "step": 172 }, { "epoch": 0.021625, "grad_norm": 2.951566696166992, "grad_norm_var": 0.13242537871232402, "learning_rate": 0.0001, "loss": 1.243, "loss/crossentropy": 2.6379833221435547, "loss/hidden": 0.96484375, "loss/logits": 0.21754613518714905, "loss/reg": 0.00605935649946332, "step": 173 }, { "epoch": 0.02175, "grad_norm": 2.6862404346466064, "grad_norm_var": 0.13142011502921586, "learning_rate": 0.0001, "loss": 1.0053, "loss/crossentropy": 2.3807766437530518, "loss/hidden": 0.80078125, "loss/logits": 0.14393460750579834, "loss/reg": 0.006058130878955126, "step": 174 }, { "epoch": 0.021875, "grad_norm": 2.5145609378814697, "grad_norm_var": 0.13512780159794507, "learning_rate": 0.0001, "loss": 1.0609, "loss/crossentropy": 2.4608380794525146, "loss/hidden": 0.85546875, "loss/logits": 0.14485566318035126, "loss/reg": 0.006056922487914562, "step": 175 }, { "epoch": 0.022, "grad_norm": 3.23178768157959, "grad_norm_var": 0.14607750168249728, "learning_rate": 0.0001, "loss": 1.1294, "loss/crossentropy": 2.9791719913482666, "loss/hidden": 0.91796875, "loss/logits": 0.1508345603942871, "loss/reg": 0.006055623292922974, "step": 176 }, { "epoch": 0.022125, "grad_norm": 2.7397234439849854, "grad_norm_var": 0.14000512423072375, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.4559919834136963, "loss/hidden": 0.86328125, "loss/logits": 0.1340080350637436, "loss/reg": 0.0060544307343661785, "step": 177 }, { "epoch": 0.02225, "grad_norm": 2.6637048721313477, "grad_norm_var": 0.14009088002925954, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.3794586658477783, "loss/hidden": 0.86328125, "loss/logits": 0.15214313566684723, "loss/reg": 0.0060530174523591995, "step": 178 }, { "epoch": 0.022375, "grad_norm": 2.0105221271514893, "grad_norm_var": 0.17628626628935157, "learning_rate": 0.0001, "loss": 0.9703, "loss/crossentropy": 2.3926336765289307, "loss/hidden": 0.77734375, "loss/logits": 0.13244566321372986, "loss/reg": 0.0060517978854477406, "step": 179 }, { "epoch": 0.0225, "grad_norm": 2.571902275085449, "grad_norm_var": 0.16659277386996318, "learning_rate": 0.0001, "loss": 1.0739, "loss/crossentropy": 2.7502923011779785, "loss/hidden": 0.8515625, "loss/logits": 0.16181406378746033, "loss/reg": 0.006050686351954937, "step": 180 }, { "epoch": 0.022625, "grad_norm": 2.700366973876953, "grad_norm_var": 0.1636147823311904, "learning_rate": 0.0001, "loss": 1.0113, "loss/crossentropy": 2.502389669418335, "loss/hidden": 0.8125, "loss/logits": 0.138347327709198, "loss/reg": 0.006049246061593294, "step": 181 }, { "epoch": 0.02275, "grad_norm": 2.7259435653686523, "grad_norm_var": 0.1629618050893432, "learning_rate": 0.0001, "loss": 1.0192, "loss/crossentropy": 2.2493560314178467, "loss/hidden": 0.82421875, "loss/logits": 0.1344609260559082, "loss/reg": 0.006048021838068962, "step": 182 }, { "epoch": 0.022875, "grad_norm": 4.930091857910156, "grad_norm_var": 0.43832731745023895, "learning_rate": 0.0001, "loss": 1.1874, "loss/crossentropy": 2.649231433868408, "loss/hidden": 0.94140625, "loss/logits": 0.1855432242155075, "loss/reg": 0.006046844646334648, "step": 183 }, { "epoch": 0.023, "grad_norm": 2.288604259490967, "grad_norm_var": 0.4589782783160859, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 3.0482568740844727, "loss/hidden": 0.8203125, "loss/logits": 0.15461647510528564, "loss/reg": 0.006045445334166288, "step": 184 }, { "epoch": 0.023125, "grad_norm": 2.7902991771698, "grad_norm_var": 0.4362058684835667, "learning_rate": 0.0001, "loss": 1.0744, "loss/crossentropy": 2.726069211959839, "loss/hidden": 0.8359375, "loss/logits": 0.17799492180347443, "loss/reg": 0.006044231820851564, "step": 185 }, { "epoch": 0.02325, "grad_norm": 3.597017526626587, "grad_norm_var": 0.46633972017124825, "learning_rate": 0.0001, "loss": 1.0985, "loss/crossentropy": 2.200692892074585, "loss/hidden": 0.8984375, "loss/logits": 0.13961729407310486, "loss/reg": 0.006042772904038429, "step": 186 }, { "epoch": 0.023375, "grad_norm": 2.969062566757202, "grad_norm_var": 0.42374272593361867, "learning_rate": 0.0001, "loss": 1.2314, "loss/crossentropy": 2.3744540214538574, "loss/hidden": 0.96875, "loss/logits": 0.20225511491298676, "loss/reg": 0.006041594315320253, "step": 187 }, { "epoch": 0.0235, "grad_norm": 3.2257020473480225, "grad_norm_var": 0.4305906329857976, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.442505121231079, "loss/hidden": 0.875, "loss/logits": 0.16284233331680298, "loss/reg": 0.006040407810360193, "step": 188 }, { "epoch": 0.023625, "grad_norm": 3.670443058013916, "grad_norm_var": 0.4666515285365591, "learning_rate": 0.0001, "loss": 1.2391, "loss/crossentropy": 2.533158540725708, "loss/hidden": 0.98046875, "loss/logits": 0.19827201962471008, "loss/reg": 0.0060392809100449085, "step": 189 }, { "epoch": 0.02375, "grad_norm": 7.53206729888916, "grad_norm_var": 1.7591779439754056, "learning_rate": 0.0001, "loss": 1.1689, "loss/crossentropy": 2.3104734420776367, "loss/hidden": 0.96875, "loss/logits": 0.13976144790649414, "loss/reg": 0.006038178689777851, "step": 190 }, { "epoch": 0.023875, "grad_norm": 4.658889293670654, "grad_norm_var": 1.833400975261701, "learning_rate": 0.0001, "loss": 1.3266, "loss/crossentropy": 2.286229133605957, "loss/hidden": 1.1015625, "loss/logits": 0.16465552151203156, "loss/reg": 0.006036726757884026, "step": 191 }, { "epoch": 0.024, "grad_norm": 3.2109904289245605, "grad_norm_var": 1.8338781863373583, "learning_rate": 0.0001, "loss": 1.278, "loss/crossentropy": 2.5849151611328125, "loss/hidden": 1.0078125, "loss/logits": 0.20983844995498657, "loss/reg": 0.006035543512552977, "step": 192 }, { "epoch": 0.024125, "grad_norm": 2.556408643722534, "grad_norm_var": 1.8519417466969637, "learning_rate": 0.0001, "loss": 1.0335, "loss/crossentropy": 2.635669231414795, "loss/hidden": 0.8359375, "loss/logits": 0.13721294701099396, "loss/reg": 0.006034051068127155, "step": 193 }, { "epoch": 0.02425, "grad_norm": 3.4185855388641357, "grad_norm_var": 1.8153229069184569, "learning_rate": 0.0001, "loss": 1.0115, "loss/crossentropy": 2.3127341270446777, "loss/hidden": 0.828125, "loss/logits": 0.12303752452135086, "loss/reg": 0.00603274954482913, "step": 194 }, { "epoch": 0.024375, "grad_norm": 3.639681816101074, "grad_norm_var": 1.6731808292397734, "learning_rate": 0.0001, "loss": 1.2374, "loss/crossentropy": 2.4363749027252197, "loss/hidden": 0.98046875, "loss/logits": 0.19659578800201416, "loss/reg": 0.006031363736838102, "step": 195 }, { "epoch": 0.0245, "grad_norm": 3.266385078430176, "grad_norm_var": 1.614572274352353, "learning_rate": 0.0001, "loss": 1.19, "loss/crossentropy": 2.2824337482452393, "loss/hidden": 0.9609375, "loss/logits": 0.16878634691238403, "loss/reg": 0.006029782351106405, "step": 196 }, { "epoch": 0.024625, "grad_norm": 3.0692105293273926, "grad_norm_var": 1.5801212385016838, "learning_rate": 0.0001, "loss": 1.1495, "loss/crossentropy": 2.518056631088257, "loss/hidden": 0.921875, "loss/logits": 0.16731634736061096, "loss/reg": 0.006028252653777599, "step": 197 }, { "epoch": 0.02475, "grad_norm": 3.390202283859253, "grad_norm_var": 1.530565626963321, "learning_rate": 0.0001, "loss": 1.1783, "loss/crossentropy": 2.3565316200256348, "loss/hidden": 0.9375, "loss/logits": 0.18055224418640137, "loss/reg": 0.006026738323271275, "step": 198 }, { "epoch": 0.024875, "grad_norm": 2.524461030960083, "grad_norm_var": 1.4779304822181976, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.3489255905151367, "loss/hidden": 0.88671875, "loss/logits": 0.1480264812707901, "loss/reg": 0.006025230046361685, "step": 199 }, { "epoch": 0.025, "grad_norm": 2.8753433227539062, "grad_norm_var": 1.4056158732497617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.379971504211426, "loss/hidden": 0.90625, "loss/logits": 0.17312359809875488, "loss/reg": 0.0060236188583076, "step": 200 }, { "epoch": 0.025125, "grad_norm": 2.2297983169555664, "grad_norm_var": 1.4801331513155804, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.401499032974243, "loss/hidden": 0.9296875, "loss/logits": 0.1743072271347046, "loss/reg": 0.006021994166076183, "step": 201 }, { "epoch": 0.02525, "grad_norm": 2.7430193424224854, "grad_norm_var": 1.5134885749372204, "learning_rate": 0.0001, "loss": 1.3503, "loss/crossentropy": 2.3397345542907715, "loss/hidden": 1.09375, "loss/logits": 0.1963859498500824, "loss/reg": 0.006020485423505306, "step": 202 }, { "epoch": 0.025375, "grad_norm": 3.3862688541412354, "grad_norm_var": 1.4983780502999742, "learning_rate": 0.0001, "loss": 1.3154, "loss/crossentropy": 2.3259048461914062, "loss/hidden": 1.09375, "loss/logits": 0.1614416241645813, "loss/reg": 0.0060190120711922646, "step": 203 }, { "epoch": 0.0255, "grad_norm": 2.554938316345215, "grad_norm_var": 1.547662147741073, "learning_rate": 0.0001, "loss": 1.1147, "loss/crossentropy": 2.559544801712036, "loss/hidden": 0.890625, "loss/logits": 0.16388913989067078, "loss/reg": 0.006017730105668306, "step": 204 }, { "epoch": 0.025625, "grad_norm": 2.6290361881256104, "grad_norm_var": 1.5807281675134672, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.7080090045928955, "loss/hidden": 0.828125, "loss/logits": 0.16068041324615479, "loss/reg": 0.006016433704644442, "step": 205 }, { "epoch": 0.02575, "grad_norm": 2.234259605407715, "grad_norm_var": 0.38456120947827777, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.3816347122192383, "loss/hidden": 0.8359375, "loss/logits": 0.14315146207809448, "loss/reg": 0.0060149249620735645, "step": 206 }, { "epoch": 0.025875, "grad_norm": 2.810352325439453, "grad_norm_var": 0.19522907990381644, "learning_rate": 0.0001, "loss": 1.1385, "loss/crossentropy": 2.6245384216308594, "loss/hidden": 0.90625, "loss/logits": 0.17206540703773499, "loss/reg": 0.006013684440404177, "step": 207 }, { "epoch": 0.026, "grad_norm": 2.198707342147827, "grad_norm_var": 0.21847125065788287, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.3812787532806396, "loss/hidden": 0.796875, "loss/logits": 0.119233138859272, "loss/reg": 0.006012204568833113, "step": 208 }, { "epoch": 0.026125, "grad_norm": 2.5001378059387207, "grad_norm_var": 0.22083751043745087, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.5999109745025635, "loss/hidden": 0.984375, "loss/logits": 0.20815744996070862, "loss/reg": 0.006010920740664005, "step": 209 }, { "epoch": 0.02625, "grad_norm": 3.175185203552246, "grad_norm_var": 0.20582482438127556, "learning_rate": 0.0001, "loss": 1.239, "loss/crossentropy": 2.3893682956695557, "loss/hidden": 1.0234375, "loss/logits": 0.15550163388252258, "loss/reg": 0.006009369157254696, "step": 210 }, { "epoch": 0.026375, "grad_norm": 3.482342481613159, "grad_norm_var": 0.19031657232839597, "learning_rate": 0.0001, "loss": 1.1572, "loss/crossentropy": 2.382542848587036, "loss/hidden": 0.94921875, "loss/logits": 0.14788678288459778, "loss/reg": 0.006007815711200237, "step": 211 }, { "epoch": 0.0265, "grad_norm": 2.285135507583618, "grad_norm_var": 0.19168098803167197, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.552724838256836, "loss/hidden": 0.78125, "loss/logits": 0.1254206746816635, "loss/reg": 0.006006232462823391, "step": 212 }, { "epoch": 0.026625, "grad_norm": 2.991971969604492, "grad_norm_var": 0.1888233667670041, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.472437620162964, "loss/hidden": 0.9296875, "loss/logits": 0.15750399231910706, "loss/reg": 0.0060045006684958935, "step": 213 }, { "epoch": 0.02675, "grad_norm": 2.3775179386138916, "grad_norm_var": 0.1665701003974154, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.294337749481201, "loss/hidden": 0.95703125, "loss/logits": 0.17671090364456177, "loss/reg": 0.006002978887408972, "step": 214 }, { "epoch": 0.026875, "grad_norm": 2.2992701530456543, "grad_norm_var": 0.17463199132661936, "learning_rate": 0.0001, "loss": 1.2097, "loss/crossentropy": 2.3843300342559814, "loss/hidden": 0.9609375, "loss/logits": 0.18876615166664124, "loss/reg": 0.006001432426273823, "step": 215 }, { "epoch": 0.027, "grad_norm": 2.4926228523254395, "grad_norm_var": 0.17347807328228151, "learning_rate": 0.0001, "loss": 1.3156, "loss/crossentropy": 2.326836585998535, "loss/hidden": 1.0625, "loss/logits": 0.19308596849441528, "loss/reg": 0.005999880842864513, "step": 216 }, { "epoch": 0.027125, "grad_norm": 2.552459478378296, "grad_norm_var": 0.16193263198218044, "learning_rate": 0.0001, "loss": 1.1424, "loss/crossentropy": 2.6629388332366943, "loss/hidden": 0.91015625, "loss/logits": 0.1722826063632965, "loss/reg": 0.005998372100293636, "step": 217 }, { "epoch": 0.02725, "grad_norm": 2.866387128829956, "grad_norm_var": 0.16409192036900605, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.8154890537261963, "loss/hidden": 0.80078125, "loss/logits": 0.15349115431308746, "loss/reg": 0.005996840540319681, "step": 218 }, { "epoch": 0.027375, "grad_norm": 2.77524471282959, "grad_norm_var": 0.12966566207502767, "learning_rate": 0.0001, "loss": 1.4111, "loss/crossentropy": 2.4509928226470947, "loss/hidden": 1.1015625, "loss/logits": 0.249616801738739, "loss/reg": 0.005995343904942274, "step": 219 }, { "epoch": 0.0275, "grad_norm": 2.887923240661621, "grad_norm_var": 0.13285907347625023, "learning_rate": 0.0001, "loss": 1.2886, "loss/crossentropy": 2.4280507564544678, "loss/hidden": 1.0234375, "loss/logits": 0.20519307255744934, "loss/reg": 0.005993579979985952, "step": 220 }, { "epoch": 0.027625, "grad_norm": 2.5383920669555664, "grad_norm_var": 0.1337457284607846, "learning_rate": 0.0001, "loss": 1.3292, "loss/crossentropy": 2.0803585052490234, "loss/hidden": 1.09375, "loss/logits": 0.17551109194755554, "loss/reg": 0.005991705227643251, "step": 221 }, { "epoch": 0.02775, "grad_norm": 2.639490842819214, "grad_norm_var": 0.12131687494494538, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.293325901031494, "loss/hidden": 0.8515625, "loss/logits": 0.14782238006591797, "loss/reg": 0.005989882629364729, "step": 222 }, { "epoch": 0.027875, "grad_norm": 2.4396984577178955, "grad_norm_var": 0.12344012810124999, "learning_rate": 0.0001, "loss": 1.0587, "loss/crossentropy": 2.7268667221069336, "loss/hidden": 0.84765625, "loss/logits": 0.15114662051200867, "loss/reg": 0.0059883627109229565, "step": 223 }, { "epoch": 0.028, "grad_norm": 2.227886438369751, "grad_norm_var": 0.12171264621671582, "learning_rate": 0.0001, "loss": 1.0087, "loss/crossentropy": 2.4431943893432617, "loss/hidden": 0.81640625, "loss/logits": 0.13243696093559265, "loss/reg": 0.005986812058836222, "step": 224 }, { "epoch": 0.028125, "grad_norm": 3.690627098083496, "grad_norm_var": 0.18519755428341872, "learning_rate": 0.0001, "loss": 1.0732, "loss/crossentropy": 2.4630942344665527, "loss/hidden": 0.875, "loss/logits": 0.13830721378326416, "loss/reg": 0.005985158029943705, "step": 225 }, { "epoch": 0.02825, "grad_norm": 3.377890110015869, "grad_norm_var": 0.19972658805784155, "learning_rate": 0.0001, "loss": 1.1848, "loss/crossentropy": 2.2899203300476074, "loss/hidden": 0.9609375, "loss/logits": 0.16401749849319458, "loss/reg": 0.005983633920550346, "step": 226 }, { "epoch": 0.028375, "grad_norm": 2.7600386142730713, "grad_norm_var": 0.16135214723361363, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.8077659606933594, "loss/hidden": 0.8203125, "loss/logits": 0.14218226075172424, "loss/reg": 0.005982026923447847, "step": 227 }, { "epoch": 0.0285, "grad_norm": 2.3397345542907715, "grad_norm_var": 0.15851713921701366, "learning_rate": 0.0001, "loss": 1.077, "loss/crossentropy": 2.438030958175659, "loss/hidden": 0.875, "loss/logits": 0.14217695593833923, "loss/reg": 0.005980519577860832, "step": 228 }, { "epoch": 0.028625, "grad_norm": 2.744401216506958, "grad_norm_var": 0.15282793193407448, "learning_rate": 0.0001, "loss": 1.1967, "loss/crossentropy": 2.557457447052002, "loss/hidden": 0.97265625, "loss/logits": 0.16425767540931702, "loss/reg": 0.005979116074740887, "step": 229 }, { "epoch": 0.02875, "grad_norm": 2.4241418838500977, "grad_norm_var": 0.15103305834679168, "learning_rate": 0.0001, "loss": 1.0402, "loss/crossentropy": 2.743885040283203, "loss/hidden": 0.828125, "loss/logits": 0.15231972932815552, "loss/reg": 0.005977709777653217, "step": 230 }, { "epoch": 0.028875, "grad_norm": 2.0828442573547363, "grad_norm_var": 0.16526500993595217, "learning_rate": 0.0001, "loss": 0.9747, "loss/crossentropy": 2.719327688217163, "loss/hidden": 0.78125, "loss/logits": 0.133681058883667, "loss/reg": 0.005976095795631409, "step": 231 }, { "epoch": 0.029, "grad_norm": 2.127495527267456, "grad_norm_var": 0.18259721536013085, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.8147058486938477, "loss/hidden": 0.85546875, "loss/logits": 0.14354225993156433, "loss/reg": 0.005974431522190571, "step": 232 }, { "epoch": 0.029125, "grad_norm": 4.263195991516113, "grad_norm_var": 0.34219781045772657, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.5414481163024902, "loss/hidden": 0.96484375, "loss/logits": 0.1478062868118286, "loss/reg": 0.005972826853394508, "step": 233 }, { "epoch": 0.02925, "grad_norm": 2.9974324703216553, "grad_norm_var": 0.34510225788824467, "learning_rate": 0.0001, "loss": 1.3152, "loss/crossentropy": 2.697648763656616, "loss/hidden": 1.0546875, "loss/logits": 0.20080995559692383, "loss/reg": 0.005971227772533894, "step": 234 }, { "epoch": 0.029375, "grad_norm": 3.4798855781555176, "grad_norm_var": 0.37664835069757197, "learning_rate": 0.0001, "loss": 1.2096, "loss/crossentropy": 2.3990559577941895, "loss/hidden": 0.95703125, "loss/logits": 0.19287389516830444, "loss/reg": 0.005969603545963764, "step": 235 }, { "epoch": 0.0295, "grad_norm": 2.43911075592041, "grad_norm_var": 0.3848032740432508, "learning_rate": 0.0001, "loss": 1.0658, "loss/crossentropy": 1.966374158859253, "loss/hidden": 0.875, "loss/logits": 0.13115233182907104, "loss/reg": 0.005967943929135799, "step": 236 }, { "epoch": 0.029625, "grad_norm": 3.7423646450042725, "grad_norm_var": 0.4356891905379257, "learning_rate": 0.0001, "loss": 1.2397, "loss/crossentropy": 2.718675374984741, "loss/hidden": 0.9921875, "loss/logits": 0.18789833784103394, "loss/reg": 0.00596608454361558, "step": 237 }, { "epoch": 0.02975, "grad_norm": 3.328033924102783, "grad_norm_var": 0.4449827328026664, "learning_rate": 0.0001, "loss": 1.5581, "loss/crossentropy": 2.272303819656372, "loss/hidden": 1.2421875, "loss/logits": 0.2562662661075592, "loss/reg": 0.005964066833257675, "step": 238 }, { "epoch": 0.029875, "grad_norm": 2.8761045932769775, "grad_norm_var": 0.42986649641521024, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.6973013877868652, "loss/hidden": 0.91796875, "loss/logits": 0.16159963607788086, "loss/reg": 0.005962541792541742, "step": 239 }, { "epoch": 0.03, "grad_norm": 2.4458563327789307, "grad_norm_var": 0.4123921579785623, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.5731561183929443, "loss/hidden": 0.9375, "loss/logits": 0.18093177676200867, "loss/reg": 0.005961006972938776, "step": 240 }, { "epoch": 0.030125, "grad_norm": 2.4645614624023438, "grad_norm_var": 0.3844441578530656, "learning_rate": 0.0001, "loss": 1.0932, "loss/crossentropy": 2.648738145828247, "loss/hidden": 0.890625, "loss/logits": 0.14302745461463928, "loss/reg": 0.005959144793450832, "step": 241 }, { "epoch": 0.03025, "grad_norm": 3.0715034008026123, "grad_norm_var": 0.3694944025754277, "learning_rate": 0.0001, "loss": 1.1916, "loss/crossentropy": 2.4820139408111572, "loss/hidden": 0.94921875, "loss/logits": 0.18278783559799194, "loss/reg": 0.005957332905381918, "step": 242 }, { "epoch": 0.030375, "grad_norm": 2.479677677154541, "grad_norm_var": 0.37773887013444374, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.614309549331665, "loss/hidden": 0.87109375, "loss/logits": 0.14808647334575653, "loss/reg": 0.005955492611974478, "step": 243 }, { "epoch": 0.0305, "grad_norm": 3.0970399379730225, "grad_norm_var": 0.36391299171458796, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.2731809616088867, "loss/hidden": 0.95703125, "loss/logits": 0.18210504949092865, "loss/reg": 0.00595364673063159, "step": 244 }, { "epoch": 0.030625, "grad_norm": 2.388214588165283, "grad_norm_var": 0.37823356386532864, "learning_rate": 0.0001, "loss": 1.1283, "loss/crossentropy": 2.532259225845337, "loss/hidden": 0.91015625, "loss/logits": 0.15858401358127594, "loss/reg": 0.005952049978077412, "step": 245 }, { "epoch": 0.03075, "grad_norm": 2.97310733795166, "grad_norm_var": 0.36540629077152076, "learning_rate": 0.0001, "loss": 1.1177, "loss/crossentropy": 2.5206258296966553, "loss/hidden": 0.89453125, "loss/logits": 0.16365137696266174, "loss/reg": 0.005950110498815775, "step": 246 }, { "epoch": 0.030875, "grad_norm": 2.15498423576355, "grad_norm_var": 0.3579579158371985, "learning_rate": 0.0001, "loss": 1.1046, "loss/crossentropy": 2.478773832321167, "loss/hidden": 0.8828125, "loss/logits": 0.162343829870224, "loss/reg": 0.005948282778263092, "step": 247 }, { "epoch": 0.031, "grad_norm": 2.3404128551483154, "grad_norm_var": 0.338987407645584, "learning_rate": 0.0001, "loss": 1.1555, "loss/crossentropy": 2.1949751377105713, "loss/hidden": 0.93359375, "loss/logits": 0.1624409407377243, "loss/reg": 0.005946675315499306, "step": 248 }, { "epoch": 0.031125, "grad_norm": 2.8813085556030273, "grad_norm_var": 0.20879640313171802, "learning_rate": 0.0001, "loss": 1.1599, "loss/crossentropy": 2.556128978729248, "loss/hidden": 0.9296875, "loss/logits": 0.1707805097103119, "loss/reg": 0.005944731179624796, "step": 249 }, { "epoch": 0.03125, "grad_norm": 3.309937000274658, "grad_norm_var": 0.22219010027481143, "learning_rate": 0.0001, "loss": 1.0939, "loss/crossentropy": 2.4590022563934326, "loss/hidden": 0.88671875, "loss/logits": 0.14774294197559357, "loss/reg": 0.005942681338638067, "step": 250 }, { "epoch": 0.031375, "grad_norm": 3.1676676273345947, "grad_norm_var": 0.201728293925846, "learning_rate": 0.0001, "loss": 1.3162, "loss/crossentropy": 2.419811487197876, "loss/hidden": 1.015625, "loss/logits": 0.24120670557022095, "loss/reg": 0.005940672475844622, "step": 251 }, { "epoch": 0.0315, "grad_norm": 2.6006832122802734, "grad_norm_var": 0.1951007002723287, "learning_rate": 0.0001, "loss": 1.3903, "loss/crossentropy": 2.170666456222534, "loss/hidden": 1.140625, "loss/logits": 0.19024603068828583, "loss/reg": 0.005938523914664984, "step": 252 }, { "epoch": 0.031625, "grad_norm": 2.4954755306243896, "grad_norm_var": 0.14101991304577552, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.262831449508667, "loss/hidden": 0.93359375, "loss/logits": 0.1535283327102661, "loss/reg": 0.00593681400641799, "step": 253 }, { "epoch": 0.03175, "grad_norm": 2.339406728744507, "grad_norm_var": 0.12652605714113535, "learning_rate": 0.0001, "loss": 0.984, "loss/crossentropy": 2.2793617248535156, "loss/hidden": 0.796875, "loss/logits": 0.12778240442276, "loss/reg": 0.005935273133218288, "step": 254 }, { "epoch": 0.031875, "grad_norm": 2.3391647338867188, "grad_norm_var": 0.131427049667937, "learning_rate": 0.0001, "loss": 1.0622, "loss/crossentropy": 2.4579379558563232, "loss/hidden": 0.83984375, "loss/logits": 0.16299216449260712, "loss/reg": 0.0059331608936190605, "step": 255 }, { "epoch": 0.032, "grad_norm": 2.3896231651306152, "grad_norm_var": 0.13322512800125588, "learning_rate": 0.0001, "loss": 1.057, "loss/crossentropy": 2.8022475242614746, "loss/hidden": 0.85546875, "loss/logits": 0.14219465851783752, "loss/reg": 0.005931555759161711, "step": 256 }, { "epoch": 0.032125, "grad_norm": 2.125249147415161, "grad_norm_var": 0.14907278605534582, "learning_rate": 0.0001, "loss": 1.0611, "loss/crossentropy": 2.33644700050354, "loss/hidden": 0.8515625, "loss/logits": 0.15020999312400818, "loss/reg": 0.005930029321461916, "step": 257 }, { "epoch": 0.03225, "grad_norm": 2.521933078765869, "grad_norm_var": 0.13593429417580463, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.512619733810425, "loss/hidden": 0.8203125, "loss/logits": 0.16396166384220123, "loss/reg": 0.00592817785218358, "step": 258 }, { "epoch": 0.032375, "grad_norm": 2.5966317653656006, "grad_norm_var": 0.13490910688263208, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.248013734817505, "loss/hidden": 0.91015625, "loss/logits": 0.16364812850952148, "loss/reg": 0.00592625979334116, "step": 259 }, { "epoch": 0.0325, "grad_norm": 2.2045137882232666, "grad_norm_var": 0.12644607438415487, "learning_rate": 0.0001, "loss": 1.0015, "loss/crossentropy": 2.3253698348999023, "loss/hidden": 0.796875, "loss/logits": 0.14540287852287292, "loss/reg": 0.005924653727561235, "step": 260 }, { "epoch": 0.032625, "grad_norm": 2.4450156688690186, "grad_norm_var": 0.1254090419850094, "learning_rate": 0.0001, "loss": 0.9932, "loss/crossentropy": 2.2374210357666016, "loss/hidden": 0.80078125, "loss/logits": 0.13316848874092102, "loss/reg": 0.005922792013734579, "step": 261 }, { "epoch": 0.03275, "grad_norm": 7.747511863708496, "grad_norm_var": 1.8160510254643325, "learning_rate": 0.0001, "loss": 1.2542, "loss/crossentropy": 2.8747429847717285, "loss/hidden": 1.0234375, "loss/logits": 0.17151576280593872, "loss/reg": 0.005921173375099897, "step": 262 }, { "epoch": 0.032875, "grad_norm": 2.1854233741760254, "grad_norm_var": 1.8132730792650582, "learning_rate": 0.0001, "loss": 1.0069, "loss/crossentropy": 2.4989960193634033, "loss/hidden": 0.8125, "loss/logits": 0.13518914580345154, "loss/reg": 0.005919379647821188, "step": 263 }, { "epoch": 0.033, "grad_norm": 3.5132219791412354, "grad_norm_var": 1.8186749991604263, "learning_rate": 0.0001, "loss": 1.054, "loss/crossentropy": 2.497178316116333, "loss/hidden": 0.84765625, "loss/logits": 0.1471494734287262, "loss/reg": 0.005917761009186506, "step": 264 }, { "epoch": 0.033125, "grad_norm": 4.302145481109619, "grad_norm_var": 1.9358282916849012, "learning_rate": 0.0001, "loss": 1.3123, "loss/crossentropy": 2.1725542545318604, "loss/hidden": 1.0859375, "loss/logits": 0.16722658276557922, "loss/reg": 0.0059160212986171246, "step": 265 }, { "epoch": 0.03325, "grad_norm": 2.3225510120391846, "grad_norm_var": 1.9582913809461102, "learning_rate": 0.0001, "loss": 1.0153, "loss/crossentropy": 2.6670029163360596, "loss/hidden": 0.80859375, "loss/logits": 0.1475904881954193, "loss/reg": 0.0059142098762094975, "step": 266 }, { "epoch": 0.033375, "grad_norm": 5.196990013122559, "grad_norm_var": 2.27294427304937, "learning_rate": 0.0001, "loss": 1.1665, "loss/crossentropy": 2.6792731285095215, "loss/hidden": 0.94140625, "loss/logits": 0.1659836769104004, "loss/reg": 0.00591221172362566, "step": 267 }, { "epoch": 0.0335, "grad_norm": 3.5144336223602295, "grad_norm_var": 2.26638445070385, "learning_rate": 0.0001, "loss": 1.2502, "loss/crossentropy": 2.2949023246765137, "loss/hidden": 1.0234375, "loss/logits": 0.1677004098892212, "loss/reg": 0.005910532083362341, "step": 268 }, { "epoch": 0.033625, "grad_norm": 2.861222267150879, "grad_norm_var": 2.2433162495019436, "learning_rate": 0.0001, "loss": 1.3308, "loss/crossentropy": 2.5955142974853516, "loss/hidden": 1.0703125, "loss/logits": 0.2013990730047226, "loss/reg": 0.005908492021262646, "step": 269 }, { "epoch": 0.03375, "grad_norm": 2.964390754699707, "grad_norm_var": 2.1991134738974947, "learning_rate": 0.0001, "loss": 1.0975, "loss/crossentropy": 2.483924150466919, "loss/hidden": 0.8828125, "loss/logits": 0.15562227368354797, "loss/reg": 0.005906403064727783, "step": 270 }, { "epoch": 0.033875, "grad_norm": 2.75604510307312, "grad_norm_var": 2.1620222961988325, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.39125394821167, "loss/hidden": 0.9765625, "loss/logits": 0.18403753638267517, "loss/reg": 0.00590470340102911, "step": 271 }, { "epoch": 0.034, "grad_norm": 2.360309362411499, "grad_norm_var": 2.165352535939727, "learning_rate": 0.0001, "loss": 1.0194, "loss/crossentropy": 2.530670404434204, "loss/hidden": 0.8046875, "loss/logits": 0.15565866231918335, "loss/reg": 0.005902664735913277, "step": 272 }, { "epoch": 0.034125, "grad_norm": 2.496027946472168, "grad_norm_var": 2.1195219252368287, "learning_rate": 0.0001, "loss": 1.2228, "loss/crossentropy": 2.7535252571105957, "loss/hidden": 0.9609375, "loss/logits": 0.20284873247146606, "loss/reg": 0.005900639574974775, "step": 273 }, { "epoch": 0.03425, "grad_norm": 2.854250431060791, "grad_norm_var": 2.0941964139517344, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.134964942932129, "loss/hidden": 0.9296875, "loss/logits": 0.15002194046974182, "loss/reg": 0.005898929201066494, "step": 274 }, { "epoch": 0.034375, "grad_norm": 4.497798442840576, "grad_norm_var": 2.149396374832277, "learning_rate": 0.0001, "loss": 1.2312, "loss/crossentropy": 2.3270835876464844, "loss/hidden": 0.99609375, "loss/logits": 0.17617599666118622, "loss/reg": 0.0058972095139324665, "step": 275 }, { "epoch": 0.0345, "grad_norm": 2.321152448654175, "grad_norm_var": 2.1318278315927155, "learning_rate": 0.0001, "loss": 1.1523, "loss/crossentropy": 1.858445644378662, "loss/hidden": 0.94921875, "loss/logits": 0.14408603310585022, "loss/reg": 0.005895303096622229, "step": 276 }, { "epoch": 0.034625, "grad_norm": 2.4426257610321045, "grad_norm_var": 2.1321312734782243, "learning_rate": 0.0001, "loss": 1.0267, "loss/crossentropy": 2.4483628273010254, "loss/hidden": 0.82421875, "loss/logits": 0.1435263752937317, "loss/reg": 0.005893299821764231, "step": 277 }, { "epoch": 0.03475, "grad_norm": 2.144637107849121, "grad_norm_var": 0.843351985629086, "learning_rate": 0.0001, "loss": 1.0517, "loss/crossentropy": 2.237915277481079, "loss/hidden": 0.8515625, "loss/logits": 0.14119011163711548, "loss/reg": 0.005891298409551382, "step": 278 }, { "epoch": 0.034875, "grad_norm": 2.32000732421875, "grad_norm_var": 0.8290445100225684, "learning_rate": 0.0001, "loss": 1.0462, "loss/crossentropy": 2.6588850021362305, "loss/hidden": 0.83203125, "loss/logits": 0.1552983820438385, "loss/reg": 0.0058892290107905865, "step": 279 }, { "epoch": 0.035, "grad_norm": 3.3390939235687256, "grad_norm_var": 0.820283282746707, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.5243186950683594, "loss/hidden": 0.953125, "loss/logits": 0.1817275732755661, "loss/reg": 0.00588742271065712, "step": 280 }, { "epoch": 0.035125, "grad_norm": 3.1800894737243652, "grad_norm_var": 0.7106469411621028, "learning_rate": 0.0001, "loss": 1.1937, "loss/crossentropy": 2.556126832962036, "loss/hidden": 0.953125, "loss/logits": 0.18167603015899658, "loss/reg": 0.005885709077119827, "step": 281 }, { "epoch": 0.03525, "grad_norm": 4.466390132904053, "grad_norm_var": 0.8119073339313209, "learning_rate": 0.0001, "loss": 1.27, "loss/crossentropy": 2.5671539306640625, "loss/hidden": 0.984375, "loss/logits": 0.2267427146434784, "loss/reg": 0.0058837407268583775, "step": 282 }, { "epoch": 0.035375, "grad_norm": 3.2809953689575195, "grad_norm_var": 0.5074810718943117, "learning_rate": 0.0001, "loss": 1.1245, "loss/crossentropy": 2.1554338932037354, "loss/hidden": 0.9140625, "loss/logits": 0.1516391634941101, "loss/reg": 0.005881770513951778, "step": 283 }, { "epoch": 0.0355, "grad_norm": 2.9982316493988037, "grad_norm_var": 0.48786559613454966, "learning_rate": 0.0001, "loss": 1.1286, "loss/crossentropy": 2.6773006916046143, "loss/hidden": 0.90625, "loss/logits": 0.1635606288909912, "loss/reg": 0.005880062934011221, "step": 284 }, { "epoch": 0.035625, "grad_norm": 2.387657880783081, "grad_norm_var": 0.5078162485774572, "learning_rate": 0.0001, "loss": 1.1214, "loss/crossentropy": 2.4741320610046387, "loss/hidden": 0.8984375, "loss/logits": 0.1641697734594345, "loss/reg": 0.0058782072737813, "step": 285 }, { "epoch": 0.03575, "grad_norm": 271.6628112792969, "grad_norm_var": 4514.324895160767, "learning_rate": 0.0001, "loss": 1.6171, "loss/crossentropy": 2.5766143798828125, "loss/hidden": 1.375, "loss/logits": 0.1833469420671463, "loss/reg": 0.005876271054148674, "step": 286 }, { "epoch": 0.035875, "grad_norm": 3.545677900314331, "grad_norm_var": 4512.577903953303, "learning_rate": 0.0001, "loss": 1.1466, "loss/crossentropy": 2.5389881134033203, "loss/hidden": 0.88671875, "loss/logits": 0.20117658376693726, "loss/reg": 0.005874336697161198, "step": 287 }, { "epoch": 0.036, "grad_norm": 2.9219233989715576, "grad_norm_var": 4511.294050983276, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.3270509243011475, "loss/hidden": 0.8828125, "loss/logits": 0.17058232426643372, "loss/reg": 0.005872361361980438, "step": 288 }, { "epoch": 0.036125, "grad_norm": 2.831878423690796, "grad_norm_var": 4510.526061571783, "learning_rate": 0.0001, "loss": 1.148, "loss/crossentropy": 2.4853744506835938, "loss/hidden": 0.91796875, "loss/logits": 0.17128118872642517, "loss/reg": 0.005870639346539974, "step": 289 }, { "epoch": 0.03625, "grad_norm": 2.284134864807129, "grad_norm_var": 4511.83639181831, "learning_rate": 0.0001, "loss": 1.0599, "loss/crossentropy": 2.3107759952545166, "loss/hidden": 0.8515625, "loss/logits": 0.14969472587108612, "loss/reg": 0.005868903826922178, "step": 290 }, { "epoch": 0.036375, "grad_norm": 2.2008161544799805, "grad_norm_var": 4516.84932017332, "learning_rate": 0.0001, "loss": 1.0902, "loss/crossentropy": 2.4265358448028564, "loss/hidden": 0.86328125, "loss/logits": 0.1682073473930359, "loss/reg": 0.0058671231381595135, "step": 291 }, { "epoch": 0.0365, "grad_norm": 2.6285743713378906, "grad_norm_var": 4516.145108725088, "learning_rate": 0.0001, "loss": 1.2494, "loss/crossentropy": 2.372230291366577, "loss/hidden": 0.98046875, "loss/logits": 0.2102714478969574, "loss/reg": 0.005865375977009535, "step": 292 }, { "epoch": 0.036625, "grad_norm": 2.6784040927886963, "grad_norm_var": 4515.607170253259, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.6276440620422363, "loss/hidden": 0.875, "loss/logits": 0.14159329235553741, "loss/reg": 0.005863656289875507, "step": 293 }, { "epoch": 0.03675, "grad_norm": 2.6373047828674316, "grad_norm_var": 4514.470495103465, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.70892333984375, "loss/hidden": 0.9453125, "loss/logits": 0.16546514630317688, "loss/reg": 0.005862091202288866, "step": 294 }, { "epoch": 0.036875, "grad_norm": 2.384430170059204, "grad_norm_var": 4514.321377312488, "learning_rate": 0.0001, "loss": 1.2472, "loss/crossentropy": 2.1273090839385986, "loss/hidden": 1.0, "loss/logits": 0.18860690295696259, "loss/reg": 0.005860424134880304, "step": 295 }, { "epoch": 0.037, "grad_norm": 2.5959692001342773, "grad_norm_var": 4515.978398966678, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.7293522357940674, "loss/hidden": 0.8203125, "loss/logits": 0.1587076485157013, "loss/reg": 0.0058588446117937565, "step": 296 }, { "epoch": 0.037125, "grad_norm": 2.2753238677978516, "grad_norm_var": 4518.0185669920775, "learning_rate": 0.0001, "loss": 1.0063, "loss/crossentropy": 2.4602949619293213, "loss/hidden": 0.8125, "loss/logits": 0.13525693118572235, "loss/reg": 0.005857320036739111, "step": 297 }, { "epoch": 0.03725, "grad_norm": 3.009300708770752, "grad_norm_var": 4521.093589717446, "learning_rate": 0.0001, "loss": 1.2573, "loss/crossentropy": 2.8883349895477295, "loss/hidden": 0.9921875, "loss/logits": 0.20657645165920258, "loss/reg": 0.005855792202055454, "step": 298 }, { "epoch": 0.037375, "grad_norm": 2.700221538543701, "grad_norm_var": 4522.372179334166, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.5446314811706543, "loss/hidden": 0.90234375, "loss/logits": 0.19479964673519135, "loss/reg": 0.005854278337210417, "step": 299 }, { "epoch": 0.0375, "grad_norm": 2.3786559104919434, "grad_norm_var": 4523.758055495688, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.469960927963257, "loss/hidden": 0.90234375, "loss/logits": 0.16156738996505737, "loss/reg": 0.00585273839533329, "step": 300 }, { "epoch": 0.037625, "grad_norm": 2.7032158374786377, "grad_norm_var": 4523.046593599144, "learning_rate": 0.0001, "loss": 1.1947, "loss/crossentropy": 2.7451162338256836, "loss/hidden": 0.94140625, "loss/logits": 0.19476984441280365, "loss/reg": 0.0058509958907961845, "step": 301 }, { "epoch": 0.03775, "grad_norm": 2.507664442062378, "grad_norm_var": 0.11250867537391755, "learning_rate": 0.0001, "loss": 0.9899, "loss/crossentropy": 2.53341007232666, "loss/hidden": 0.796875, "loss/logits": 0.1345081329345703, "loss/reg": 0.005849248263984919, "step": 302 }, { "epoch": 0.037875, "grad_norm": 3.027892589569092, "grad_norm_var": 0.06692647718721882, "learning_rate": 0.0001, "loss": 1.0973, "loss/crossentropy": 2.7899296283721924, "loss/hidden": 0.890625, "loss/logits": 0.1482122391462326, "loss/reg": 0.005847662687301636, "step": 303 }, { "epoch": 0.038, "grad_norm": 2.1617183685302734, "grad_norm_var": 0.07146536810277529, "learning_rate": 0.0001, "loss": 0.969, "loss/crossentropy": 2.4700305461883545, "loss/hidden": 0.78125, "loss/logits": 0.12925508618354797, "loss/reg": 0.005846073850989342, "step": 304 }, { "epoch": 0.038125, "grad_norm": 2.3791332244873047, "grad_norm_var": 0.06803597239225306, "learning_rate": 0.0001, "loss": 1.1912, "loss/crossentropy": 2.4171202182769775, "loss/hidden": 0.9453125, "loss/logits": 0.18739524483680725, "loss/reg": 0.005844476167112589, "step": 305 }, { "epoch": 0.03825, "grad_norm": 2.7622976303100586, "grad_norm_var": 0.06636088237049004, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.5030367374420166, "loss/hidden": 0.8359375, "loss/logits": 0.18643516302108765, "loss/reg": 0.005842759273946285, "step": 306 }, { "epoch": 0.038375, "grad_norm": 2.4079246520996094, "grad_norm_var": 0.059000676657357566, "learning_rate": 0.0001, "loss": 1.0359, "loss/crossentropy": 2.381542682647705, "loss/hidden": 0.828125, "loss/logits": 0.1493588387966156, "loss/reg": 0.0058412267826497555, "step": 307 }, { "epoch": 0.0385, "grad_norm": 2.5356478691101074, "grad_norm_var": 0.058906038923372726, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4928808212280273, "loss/hidden": 0.875, "loss/logits": 0.15363982319831848, "loss/reg": 0.0058394852094352245, "step": 308 }, { "epoch": 0.038625, "grad_norm": 2.4036688804626465, "grad_norm_var": 0.0597099908353601, "learning_rate": 0.0001, "loss": 0.986, "loss/crossentropy": 2.5816946029663086, "loss/hidden": 0.7890625, "loss/logits": 0.13851355016231537, "loss/reg": 0.005837727338075638, "step": 309 }, { "epoch": 0.03875, "grad_norm": 2.630572557449341, "grad_norm_var": 0.05963840398777146, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.140015125274658, "loss/hidden": 0.828125, "loss/logits": 0.14680367708206177, "loss/reg": 0.005835913587361574, "step": 310 }, { "epoch": 0.038875, "grad_norm": 2.3641905784606934, "grad_norm_var": 0.06012154861927167, "learning_rate": 0.0001, "loss": 1.0947, "loss/crossentropy": 2.3300833702087402, "loss/hidden": 0.8828125, "loss/logits": 0.15358075499534607, "loss/reg": 0.005834224168211222, "step": 311 }, { "epoch": 0.039, "grad_norm": 2.215728759765625, "grad_norm_var": 0.06696490679455162, "learning_rate": 0.0001, "loss": 1.1411, "loss/crossentropy": 2.4583277702331543, "loss/hidden": 0.9140625, "loss/logits": 0.1687404215335846, "loss/reg": 0.005832599475979805, "step": 312 }, { "epoch": 0.039125, "grad_norm": 2.8934550285339355, "grad_norm_var": 0.06994228066174794, "learning_rate": 0.0001, "loss": 1.2763, "loss/crossentropy": 2.409702777862549, "loss/hidden": 1.0390625, "loss/logits": 0.17889352142810822, "loss/reg": 0.005831001792103052, "step": 313 }, { "epoch": 0.03925, "grad_norm": 8.741681098937988, "grad_norm_var": 2.4613182467650705, "learning_rate": 0.0001, "loss": 1.1972, "loss/crossentropy": 2.3858492374420166, "loss/hidden": 0.96875, "loss/logits": 0.1701970100402832, "loss/reg": 0.005829236935824156, "step": 314 }, { "epoch": 0.039375, "grad_norm": 7.412417411804199, "grad_norm_var": 3.707354176329111, "learning_rate": 0.0001, "loss": 1.3096, "loss/crossentropy": 2.3804125785827637, "loss/hidden": 1.1015625, "loss/logits": 0.149795800447464, "loss/reg": 0.005827469285577536, "step": 315 }, { "epoch": 0.0395, "grad_norm": 3.1443870067596436, "grad_norm_var": 3.6580641482995806, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.481820583343506, "loss/hidden": 0.90234375, "loss/logits": 0.1759084165096283, "loss/reg": 0.005825776606798172, "step": 316 }, { "epoch": 0.039625, "grad_norm": 2.8567562103271484, "grad_norm_var": 3.6479706732170993, "learning_rate": 0.0001, "loss": 1.0023, "loss/crossentropy": 2.5141823291778564, "loss/hidden": 0.80078125, "loss/logits": 0.14331723749637604, "loss/reg": 0.005824015475809574, "step": 317 }, { "epoch": 0.03975, "grad_norm": 2.2817444801330566, "grad_norm_var": 3.674359828489624, "learning_rate": 0.0001, "loss": 1.0893, "loss/crossentropy": 2.184128999710083, "loss/hidden": 0.875, "loss/logits": 0.15605026483535767, "loss/reg": 0.00582248717546463, "step": 318 }, { "epoch": 0.039875, "grad_norm": 2.249969005584717, "grad_norm_var": 3.736641439481692, "learning_rate": 0.0001, "loss": 1.008, "loss/crossentropy": 2.768484354019165, "loss/hidden": 0.80078125, "loss/logits": 0.14897163212299347, "loss/reg": 0.00582079216837883, "step": 319 }, { "epoch": 0.04, "grad_norm": 2.6358306407928467, "grad_norm_var": 3.684102068428194, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.301954507827759, "loss/hidden": 1.015625, "loss/logits": 0.24987459182739258, "loss/reg": 0.005819002632051706, "step": 320 }, { "epoch": 0.040125, "grad_norm": 2.353457450866699, "grad_norm_var": 3.6871065280104496, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.379765272140503, "loss/hidden": 0.89453125, "loss/logits": 0.15680107474327087, "loss/reg": 0.005817302968353033, "step": 321 }, { "epoch": 0.04025, "grad_norm": 2.4568967819213867, "grad_norm_var": 3.712514538750317, "learning_rate": 0.0001, "loss": 0.9706, "loss/crossentropy": 2.380795955657959, "loss/hidden": 0.77734375, "loss/logits": 0.13508911430835724, "loss/reg": 0.005815597716718912, "step": 322 }, { "epoch": 0.040375, "grad_norm": 3.207794189453125, "grad_norm_var": 3.6654654630236734, "learning_rate": 0.0001, "loss": 1.3668, "loss/crossentropy": 1.949703574180603, "loss/hidden": 1.1171875, "loss/logits": 0.19150257110595703, "loss/reg": 0.005813860800117254, "step": 323 }, { "epoch": 0.0405, "grad_norm": 3.156318187713623, "grad_norm_var": 3.6284383166396252, "learning_rate": 0.0001, "loss": 1.2742, "loss/crossentropy": 2.1970410346984863, "loss/hidden": 1.0, "loss/logits": 0.21606677770614624, "loss/reg": 0.005812041461467743, "step": 324 }, { "epoch": 0.040625, "grad_norm": 2.556889533996582, "grad_norm_var": 3.611332493108523, "learning_rate": 0.0001, "loss": 0.9529, "loss/crossentropy": 2.7647974491119385, "loss/hidden": 0.7578125, "loss/logits": 0.1369488537311554, "loss/reg": 0.00581031059846282, "step": 325 }, { "epoch": 0.04075, "grad_norm": 2.2634167671203613, "grad_norm_var": 3.653624545749698, "learning_rate": 0.0001, "loss": 1.0757, "loss/crossentropy": 2.334134340286255, "loss/hidden": 0.859375, "loss/logits": 0.1581987738609314, "loss/reg": 0.005808570422232151, "step": 326 }, { "epoch": 0.040875, "grad_norm": 2.3521125316619873, "grad_norm_var": 3.6551397839485555, "learning_rate": 0.0001, "loss": 0.9965, "loss/crossentropy": 2.78828763961792, "loss/hidden": 0.79296875, "loss/logits": 0.1454332172870636, "loss/reg": 0.005806888919323683, "step": 327 }, { "epoch": 0.041, "grad_norm": 3.0836093425750732, "grad_norm_var": 3.5768996944618254, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.2781612873077393, "loss/hidden": 0.9609375, "loss/logits": 0.1747758537530899, "loss/reg": 0.005805303808301687, "step": 328 }, { "epoch": 0.041125, "grad_norm": 3.6110970973968506, "grad_norm_var": 3.5651235487558246, "learning_rate": 0.0001, "loss": 1.1693, "loss/crossentropy": 2.812913417816162, "loss/hidden": 0.9375, "loss/logits": 0.17377659678459167, "loss/reg": 0.005803780164569616, "step": 329 }, { "epoch": 0.04125, "grad_norm": 2.5020155906677246, "grad_norm_var": 1.552569952590708, "learning_rate": 0.0001, "loss": 1.0862, "loss/crossentropy": 2.6585140228271484, "loss/hidden": 0.86328125, "loss/logits": 0.16489718854427338, "loss/reg": 0.005802258383482695, "step": 330 }, { "epoch": 0.041375, "grad_norm": 2.383924961090088, "grad_norm_var": 0.17978007457456116, "learning_rate": 0.0001, "loss": 1.1592, "loss/crossentropy": 2.4862210750579834, "loss/hidden": 0.94921875, "loss/logits": 0.15199331939220428, "loss/reg": 0.005800731014460325, "step": 331 }, { "epoch": 0.0415, "grad_norm": 2.187321424484253, "grad_norm_var": 0.17949311071790794, "learning_rate": 0.0001, "loss": 1.0507, "loss/crossentropy": 2.6380603313446045, "loss/hidden": 0.84765625, "loss/logits": 0.14507073163986206, "loss/reg": 0.005798923317342997, "step": 332 }, { "epoch": 0.041625, "grad_norm": 2.21768856048584, "grad_norm_var": 0.18601193201957902, "learning_rate": 0.0001, "loss": 1.1027, "loss/crossentropy": 2.3925793170928955, "loss/hidden": 0.875, "loss/logits": 0.16972869634628296, "loss/reg": 0.00579707371070981, "step": 333 }, { "epoch": 0.04175, "grad_norm": 2.682497262954712, "grad_norm_var": 0.17937770683656615, "learning_rate": 0.0001, "loss": 1.3272, "loss/crossentropy": 2.3586106300354004, "loss/hidden": 1.078125, "loss/logits": 0.1911502480506897, "loss/reg": 0.005795224104076624, "step": 334 }, { "epoch": 0.041875, "grad_norm": 3.0983307361602783, "grad_norm_var": 0.1826395003188658, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.436326265335083, "loss/hidden": 0.91796875, "loss/logits": 0.1915540099143982, "loss/reg": 0.005793258547782898, "step": 335 }, { "epoch": 0.042, "grad_norm": 6.251674652099609, "grad_norm_var": 0.982431631272856, "learning_rate": 0.0001, "loss": 1.6879, "loss/crossentropy": 2.3841142654418945, "loss/hidden": 1.265625, "loss/logits": 0.3643344044685364, "loss/reg": 0.0057912725023925304, "step": 336 }, { "epoch": 0.042125, "grad_norm": 3.0111782550811768, "grad_norm_var": 0.9617308564996427, "learning_rate": 0.0001, "loss": 1.3497, "loss/crossentropy": 2.430532217025757, "loss/hidden": 1.0703125, "loss/logits": 0.2214677333831787, "loss/reg": 0.00578899122774601, "step": 337 }, { "epoch": 0.04225, "grad_norm": 2.4221205711364746, "grad_norm_var": 0.9640415151512265, "learning_rate": 0.0001, "loss": 1.0955, "loss/crossentropy": 2.4376015663146973, "loss/hidden": 0.890625, "loss/logits": 0.1470467746257782, "loss/reg": 0.005786662455648184, "step": 338 }, { "epoch": 0.042375, "grad_norm": 2.615758180618286, "grad_norm_var": 0.9645524062068328, "learning_rate": 0.0001, "loss": 1.0887, "loss/crossentropy": 2.5318005084991455, "loss/hidden": 0.875, "loss/logits": 0.15580901503562927, "loss/reg": 0.0057848175056278706, "step": 339 }, { "epoch": 0.0425, "grad_norm": 2.857177972793579, "grad_norm_var": 0.9599117798964886, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.4260058403015137, "loss/hidden": 0.89453125, "loss/logits": 0.16291844844818115, "loss/reg": 0.005782809574157, "step": 340 }, { "epoch": 0.042625, "grad_norm": 2.4030630588531494, "grad_norm_var": 0.9680393035693963, "learning_rate": 0.0001, "loss": 1.2054, "loss/crossentropy": 2.3009443283081055, "loss/hidden": 0.953125, "loss/logits": 0.194431871175766, "loss/reg": 0.005780525505542755, "step": 341 }, { "epoch": 0.04275, "grad_norm": 2.264251470565796, "grad_norm_var": 0.9679716782722624, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.597288131713867, "loss/hidden": 0.8203125, "loss/logits": 0.14457917213439941, "loss/reg": 0.005778233055025339, "step": 342 }, { "epoch": 0.042875, "grad_norm": 2.2368180751800537, "grad_norm_var": 0.9767866404468121, "learning_rate": 0.0001, "loss": 0.943, "loss/crossentropy": 2.4534237384796143, "loss/hidden": 0.7578125, "loss/logits": 0.12742644548416138, "loss/reg": 0.005776000674813986, "step": 343 }, { "epoch": 0.043, "grad_norm": 2.469120979309082, "grad_norm_var": 0.9824165851632264, "learning_rate": 0.0001, "loss": 1.0531, "loss/crossentropy": 2.793834686279297, "loss/hidden": 0.83984375, "loss/logits": 0.15554235875606537, "loss/reg": 0.005774145945906639, "step": 344 }, { "epoch": 0.043125, "grad_norm": 2.8334686756134033, "grad_norm_var": 0.9387961568478952, "learning_rate": 0.0001, "loss": 0.9467, "loss/crossentropy": 2.678666830062866, "loss/hidden": 0.7578125, "loss/logits": 0.13116785883903503, "loss/reg": 0.005771928001195192, "step": 345 }, { "epoch": 0.04325, "grad_norm": 7.863356590270996, "grad_norm_var": 2.5385263322105893, "learning_rate": 0.0001, "loss": 1.4695, "loss/crossentropy": 2.613318920135498, "loss/hidden": 1.2734375, "loss/logits": 0.13832132518291473, "loss/reg": 0.005770097486674786, "step": 346 }, { "epoch": 0.043375, "grad_norm": 2.763582468032837, "grad_norm_var": 2.510660987467067, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.846453905105591, "loss/hidden": 0.90625, "loss/logits": 0.166295126080513, "loss/reg": 0.0057678911834955215, "step": 347 }, { "epoch": 0.0435, "grad_norm": 3.600456714630127, "grad_norm_var": 2.4567056984087676, "learning_rate": 0.0001, "loss": 1.2108, "loss/crossentropy": 2.515092372894287, "loss/hidden": 0.96875, "loss/logits": 0.18436874449253082, "loss/reg": 0.005765695124864578, "step": 348 }, { "epoch": 0.043625, "grad_norm": 4.2698073387146, "grad_norm_var": 2.4444505062987636, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.3673834800720215, "loss/hidden": 0.8984375, "loss/logits": 0.16628439724445343, "loss/reg": 0.005763507913798094, "step": 349 }, { "epoch": 0.04375, "grad_norm": 2.962045192718506, "grad_norm_var": 2.42435544256402, "learning_rate": 0.0001, "loss": 1.079, "loss/crossentropy": 2.9470205307006836, "loss/hidden": 0.83203125, "loss/logits": 0.1893935650587082, "loss/reg": 0.005761242005974054, "step": 350 }, { "epoch": 0.043875, "grad_norm": 3.0306880474090576, "grad_norm_var": 2.427092851603572, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.3637542724609375, "loss/hidden": 0.83203125, "loss/logits": 0.13047108054161072, "loss/reg": 0.0057592191733419895, "step": 351 }, { "epoch": 0.044, "grad_norm": 2.599585771560669, "grad_norm_var": 1.855493477227511, "learning_rate": 0.0001, "loss": 0.9429, "loss/crossentropy": 2.9222559928894043, "loss/hidden": 0.7578125, "loss/logits": 0.12747693061828613, "loss/reg": 0.005757040809839964, "step": 352 }, { "epoch": 0.044125, "grad_norm": 2.4723081588745117, "grad_norm_var": 1.882729557078295, "learning_rate": 0.0001, "loss": 1.2276, "loss/crossentropy": 2.5835001468658447, "loss/hidden": 0.94921875, "loss/logits": 0.220790833234787, "loss/reg": 0.005754764657467604, "step": 353 }, { "epoch": 0.04425, "grad_norm": 2.5266165733337402, "grad_norm_var": 1.873911870827686, "learning_rate": 0.0001, "loss": 1.1879, "loss/crossentropy": 2.4273722171783447, "loss/hidden": 0.97265625, "loss/logits": 0.15772980451583862, "loss/reg": 0.005752884317189455, "step": 354 }, { "epoch": 0.044375, "grad_norm": 2.8139867782592773, "grad_norm_var": 1.8632913443851133, "learning_rate": 0.0001, "loss": 1.2803, "loss/crossentropy": 2.591078996658325, "loss/hidden": 1.0234375, "loss/logits": 0.19931599497795105, "loss/reg": 0.0057507967576384544, "step": 355 }, { "epoch": 0.0445, "grad_norm": 2.0173490047454834, "grad_norm_var": 1.9371277324683585, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.415416955947876, "loss/hidden": 0.80859375, "loss/logits": 0.14050991833209991, "loss/reg": 0.005748571362346411, "step": 356 }, { "epoch": 0.044625, "grad_norm": 3.5304269790649414, "grad_norm_var": 1.916250206343263, "learning_rate": 0.0001, "loss": 1.2665, "loss/crossentropy": 2.7149741649627686, "loss/hidden": 1.0390625, "loss/logits": 0.16997796297073364, "loss/reg": 0.005746254697442055, "step": 357 }, { "epoch": 0.04475, "grad_norm": 47.96537399291992, "grad_norm_var": 127.11164707702224, "learning_rate": 0.0001, "loss": 1.4579, "loss/crossentropy": 2.7637100219726562, "loss/hidden": 1.2265625, "loss/logits": 0.17390823364257812, "loss/reg": 0.005744417663663626, "step": 358 }, { "epoch": 0.044875, "grad_norm": 2.253833055496216, "grad_norm_var": 127.10313415769795, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.3016419410705566, "loss/hidden": 0.9140625, "loss/logits": 0.16676074266433716, "loss/reg": 0.005742207169532776, "step": 359 }, { "epoch": 0.045, "grad_norm": 3.2059576511383057, "grad_norm_var": 126.79034824550331, "learning_rate": 0.0001, "loss": 1.2389, "loss/crossentropy": 2.624589204788208, "loss/hidden": 1.0, "loss/logits": 0.18154433369636536, "loss/reg": 0.005740353371948004, "step": 360 }, { "epoch": 0.045125, "grad_norm": 2.456129789352417, "grad_norm_var": 126.9607902891753, "learning_rate": 0.0001, "loss": 1.0342, "loss/crossentropy": 2.500290870666504, "loss/hidden": 0.83203125, "loss/logits": 0.14475134015083313, "loss/reg": 0.005738324951380491, "step": 361 }, { "epoch": 0.04525, "grad_norm": 3.081372022628784, "grad_norm_var": 127.21513938268541, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.3305118083953857, "loss/hidden": 0.8984375, "loss/logits": 0.15346962213516235, "loss/reg": 0.0057361493818461895, "step": 362 }, { "epoch": 0.045375, "grad_norm": 2.2634801864624023, "grad_norm_var": 127.4280286195785, "learning_rate": 0.0001, "loss": 1.0956, "loss/crossentropy": 2.4553990364074707, "loss/hidden": 0.875, "loss/logits": 0.16324618458747864, "loss/reg": 0.005734298378229141, "step": 363 }, { "epoch": 0.0455, "grad_norm": 3.9597907066345215, "grad_norm_var": 127.3359579534097, "learning_rate": 0.0001, "loss": 1.3557, "loss/crossentropy": 2.6449685096740723, "loss/hidden": 1.078125, "loss/logits": 0.2202637791633606, "loss/reg": 0.005732398014515638, "step": 364 }, { "epoch": 0.045625, "grad_norm": 2.7794013023376465, "grad_norm_var": 127.76159157574789, "learning_rate": 0.0001, "loss": 1.0787, "loss/crossentropy": 2.3118059635162354, "loss/hidden": 0.86328125, "loss/logits": 0.1581302285194397, "loss/reg": 0.005730301141738892, "step": 365 }, { "epoch": 0.04575, "grad_norm": 4.7589192390441895, "grad_norm_var": 127.32661229099328, "learning_rate": 0.0001, "loss": 1.3244, "loss/crossentropy": 2.5914306640625, "loss/hidden": 1.078125, "loss/logits": 0.18898184597492218, "loss/reg": 0.005728167947381735, "step": 366 }, { "epoch": 0.045875, "grad_norm": 4.024761199951172, "grad_norm_var": 127.03030673720949, "learning_rate": 0.0001, "loss": 1.421, "loss/crossentropy": 2.083667755126953, "loss/hidden": 1.1640625, "loss/logits": 0.1997053027153015, "loss/reg": 0.005726283416152, "step": 367 }, { "epoch": 0.046, "grad_norm": 2.9291043281555176, "grad_norm_var": 126.89672944049376, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.7017500400543213, "loss/hidden": 0.90625, "loss/logits": 0.1686232089996338, "loss/reg": 0.005724436603486538, "step": 368 }, { "epoch": 0.046125, "grad_norm": 2.289379119873047, "grad_norm_var": 126.98034912166224, "learning_rate": 0.0001, "loss": 1.0433, "loss/crossentropy": 2.404045581817627, "loss/hidden": 0.8359375, "loss/logits": 0.1501048356294632, "loss/reg": 0.005722455680370331, "step": 369 }, { "epoch": 0.04625, "grad_norm": 2.5955307483673096, "grad_norm_var": 126.95053618311779, "learning_rate": 0.0001, "loss": 1.1052, "loss/crossentropy": 2.555497407913208, "loss/hidden": 0.87890625, "loss/logits": 0.16912290453910828, "loss/reg": 0.0057206167839467525, "step": 370 }, { "epoch": 0.046375, "grad_norm": 2.5631515979766846, "grad_norm_var": 127.05459572518181, "learning_rate": 0.0001, "loss": 1.0105, "loss/crossentropy": 2.3253824710845947, "loss/hidden": 0.80859375, "loss/logits": 0.14470672607421875, "loss/reg": 0.005718756001442671, "step": 371 }, { "epoch": 0.0465, "grad_norm": 2.8995003700256348, "grad_norm_var": 126.65924311218065, "learning_rate": 0.0001, "loss": 1.0727, "loss/crossentropy": 2.5171523094177246, "loss/hidden": 0.859375, "loss/logits": 0.15616215765476227, "loss/reg": 0.005716769490391016, "step": 372 }, { "epoch": 0.046625, "grad_norm": 2.4674322605133057, "grad_norm_var": 127.0582358856119, "learning_rate": 0.0001, "loss": 0.9544, "loss/crossentropy": 2.426679849624634, "loss/hidden": 0.765625, "loss/logits": 0.13166998326778412, "loss/reg": 0.005714884493499994, "step": 373 }, { "epoch": 0.04675, "grad_norm": 2.1486146450042725, "grad_norm_var": 0.5554253140062239, "learning_rate": 0.0001, "loss": 1.0123, "loss/crossentropy": 2.3567564487457275, "loss/hidden": 0.8203125, "loss/logits": 0.1348218023777008, "loss/reg": 0.0057129692286252975, "step": 374 }, { "epoch": 0.046875, "grad_norm": 2.4249770641326904, "grad_norm_var": 0.5421168003854054, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.575383424758911, "loss/hidden": 0.80078125, "loss/logits": 0.1425924003124237, "loss/reg": 0.005710979457944632, "step": 375 }, { "epoch": 0.047, "grad_norm": 3.9449760913848877, "grad_norm_var": 0.6036429091311817, "learning_rate": 0.0001, "loss": 1.1428, "loss/crossentropy": 2.5839173793792725, "loss/hidden": 0.94921875, "loss/logits": 0.13653349876403809, "loss/reg": 0.0057089440524578094, "step": 376 }, { "epoch": 0.047125, "grad_norm": 2.3119592666625977, "grad_norm_var": 0.6148998912723904, "learning_rate": 0.0001, "loss": 1.088, "loss/crossentropy": 2.492663860321045, "loss/hidden": 0.859375, "loss/logits": 0.1715661883354187, "loss/reg": 0.005707095842808485, "step": 377 }, { "epoch": 0.04725, "grad_norm": 3.586817979812622, "grad_norm_var": 0.6386998540868449, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.8210177421569824, "loss/hidden": 0.87890625, "loss/logits": 0.15476316213607788, "loss/reg": 0.005705154500901699, "step": 378 }, { "epoch": 0.047375, "grad_norm": 2.805647850036621, "grad_norm_var": 0.6040650287121667, "learning_rate": 0.0001, "loss": 1.0792, "loss/crossentropy": 2.54019832611084, "loss/hidden": 0.859375, "loss/logits": 0.16280022263526917, "loss/reg": 0.005703243892639875, "step": 379 }, { "epoch": 0.0475, "grad_norm": 2.7932748794555664, "grad_norm_var": 0.5445939245804574, "learning_rate": 0.0001, "loss": 1.4621, "loss/crossentropy": 2.2343437671661377, "loss/hidden": 1.1953125, "loss/logits": 0.20978981256484985, "loss/reg": 0.005701290909200907, "step": 380 }, { "epoch": 0.047625, "grad_norm": 2.661917209625244, "grad_norm_var": 0.5482497924242672, "learning_rate": 0.0001, "loss": 0.9746, "loss/crossentropy": 2.782052516937256, "loss/hidden": 0.78125, "loss/logits": 0.13640211522579193, "loss/reg": 0.0056994096376001835, "step": 381 }, { "epoch": 0.04775, "grad_norm": 2.4914302825927734, "grad_norm_var": 0.3228126995822395, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.166295051574707, "loss/hidden": 0.91015625, "loss/logits": 0.1589164137840271, "loss/reg": 0.005697426851838827, "step": 382 }, { "epoch": 0.047875, "grad_norm": 2.961653709411621, "grad_norm_var": 0.22106978564282992, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.5477302074432373, "loss/hidden": 0.8828125, "loss/logits": 0.16730068624019623, "loss/reg": 0.005695413798093796, "step": 383 }, { "epoch": 0.048, "grad_norm": 2.9396286010742188, "grad_norm_var": 0.22133896443579198, "learning_rate": 0.0001, "loss": 1.0254, "loss/crossentropy": 2.555258274078369, "loss/hidden": 0.828125, "loss/logits": 0.1403425633907318, "loss/reg": 0.005693417973816395, "step": 384 }, { "epoch": 0.048125, "grad_norm": 2.8298912048339844, "grad_norm_var": 0.20691636732209961, "learning_rate": 0.0001, "loss": 1.195, "loss/crossentropy": 2.472844362258911, "loss/hidden": 0.984375, "loss/logits": 0.15367946028709412, "loss/reg": 0.005691539496183395, "step": 385 }, { "epoch": 0.04825, "grad_norm": 15.47062873840332, "grad_norm_var": 10.256501481265339, "learning_rate": 0.0001, "loss": 1.4448, "loss/crossentropy": 2.521524667739868, "loss/hidden": 1.203125, "loss/logits": 0.1847420334815979, "loss/reg": 0.005689616315066814, "step": 386 }, { "epoch": 0.048375, "grad_norm": 2.455294370651245, "grad_norm_var": 10.271871141002237, "learning_rate": 0.0001, "loss": 1.1018, "loss/crossentropy": 2.309390068054199, "loss/hidden": 0.89453125, "loss/logits": 0.15039557218551636, "loss/reg": 0.005687698721885681, "step": 387 }, { "epoch": 0.0485, "grad_norm": 3.23420786857605, "grad_norm_var": 10.248744715041969, "learning_rate": 0.0001, "loss": 1.2879, "loss/crossentropy": 2.4902544021606445, "loss/hidden": 1.015625, "loss/logits": 0.2154603898525238, "loss/reg": 0.005685731768608093, "step": 388 }, { "epoch": 0.048625, "grad_norm": 2.660858631134033, "grad_norm_var": 10.221989434520331, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.31535267829895, "loss/hidden": 0.8359375, "loss/logits": 0.13224059343338013, "loss/reg": 0.005683773662894964, "step": 389 }, { "epoch": 0.04875, "grad_norm": 2.4209847450256348, "grad_norm_var": 10.173641089965429, "learning_rate": 0.0001, "loss": 0.9974, "loss/crossentropy": 2.1761093139648438, "loss/hidden": 0.8125, "loss/logits": 0.12805956602096558, "loss/reg": 0.005681932438164949, "step": 390 }, { "epoch": 0.048875, "grad_norm": 3.108008623123169, "grad_norm_var": 10.09354551501582, "learning_rate": 0.0001, "loss": 0.979, "loss/crossentropy": 2.721165657043457, "loss/hidden": 0.78125, "loss/logits": 0.14099523425102234, "loss/reg": 0.005679869093000889, "step": 391 }, { "epoch": 0.049, "grad_norm": 2.6531527042388916, "grad_norm_var": 10.150022289467502, "learning_rate": 0.0001, "loss": 1.1723, "loss/crossentropy": 2.518146514892578, "loss/hidden": 0.9375, "loss/logits": 0.17805764079093933, "loss/reg": 0.005677856504917145, "step": 392 }, { "epoch": 0.049125, "grad_norm": 2.2534499168395996, "grad_norm_var": 10.160179916565673, "learning_rate": 0.0001, "loss": 1.1292, "loss/crossentropy": 2.633385181427002, "loss/hidden": 0.91015625, "loss/logits": 0.16230204701423645, "loss/reg": 0.005675735417753458, "step": 393 }, { "epoch": 0.04925, "grad_norm": 2.9424333572387695, "grad_norm_var": 10.185797665159741, "learning_rate": 0.0001, "loss": 1.4214, "loss/crossentropy": 2.62923002243042, "loss/hidden": 1.15625, "loss/logits": 0.20838308334350586, "loss/reg": 0.00567356962710619, "step": 394 }, { "epoch": 0.049375, "grad_norm": 2.622178792953491, "grad_norm_var": 10.20593051221178, "learning_rate": 0.0001, "loss": 0.9697, "loss/crossentropy": 2.5544826984405518, "loss/hidden": 0.78125, "loss/logits": 0.13172510266304016, "loss/reg": 0.005671407096087933, "step": 395 }, { "epoch": 0.0495, "grad_norm": 2.635505199432373, "grad_norm_var": 10.223008906743342, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.5959105491638184, "loss/hidden": 0.75390625, "loss/logits": 0.12239634245634079, "loss/reg": 0.0056692929938435555, "step": 396 }, { "epoch": 0.049625, "grad_norm": 2.6063406467437744, "grad_norm_var": 10.229570355797922, "learning_rate": 0.0001, "loss": 1.0478, "loss/crossentropy": 2.719916343688965, "loss/hidden": 0.83984375, "loss/logits": 0.15127256512641907, "loss/reg": 0.0056673381477594376, "step": 397 }, { "epoch": 0.04975, "grad_norm": 2.589893102645874, "grad_norm_var": 10.216701025853546, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.3730130195617676, "loss/hidden": 0.90234375, "loss/logits": 0.16749918460845947, "loss/reg": 0.0056652189232409, "step": 398 }, { "epoch": 0.049875, "grad_norm": 2.1503751277923584, "grad_norm_var": 10.318666846324161, "learning_rate": 0.0001, "loss": 1.1685, "loss/crossentropy": 2.2147741317749023, "loss/hidden": 0.92578125, "loss/logits": 0.1860472559928894, "loss/reg": 0.005663097370415926, "step": 399 }, { "epoch": 0.05, "grad_norm": 3.6945109367370605, "grad_norm_var": 10.300567557859127, "learning_rate": 0.0001, "loss": 1.1272, "loss/crossentropy": 2.4212143421173096, "loss/hidden": 0.921875, "loss/logits": 0.1487593650817871, "loss/reg": 0.005661314353346825, "step": 400 }, { "epoch": 0.050125, "grad_norm": 3.7444777488708496, "grad_norm_var": 10.268632820538057, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.5369904041290283, "loss/hidden": 0.90625, "loss/logits": 0.15929211676120758, "loss/reg": 0.005659462418407202, "step": 401 }, { "epoch": 0.05025, "grad_norm": 5.121776580810547, "grad_norm_var": 0.5518050614602837, "learning_rate": 0.0001, "loss": 1.4671, "loss/crossentropy": 2.2371129989624023, "loss/hidden": 1.2109375, "loss/logits": 0.19960111379623413, "loss/reg": 0.005657529458403587, "step": 402 }, { "epoch": 0.050375, "grad_norm": 28.607572555541992, "grad_norm_var": 41.63994308721723, "learning_rate": 0.0001, "loss": 1.1515, "loss/crossentropy": 2.84385347366333, "loss/hidden": 0.90234375, "loss/logits": 0.19263674318790436, "loss/reg": 0.005655454937368631, "step": 403 }, { "epoch": 0.0505, "grad_norm": 2.38948655128479, "grad_norm_var": 41.834466994087045, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.2518088817596436, "loss/hidden": 0.8984375, "loss/logits": 0.13791221380233765, "loss/reg": 0.005653408356010914, "step": 404 }, { "epoch": 0.050625, "grad_norm": 6.887917518615723, "grad_norm_var": 41.907583648135414, "learning_rate": 0.0001, "loss": 1.2522, "loss/crossentropy": 2.8729405403137207, "loss/hidden": 1.046875, "loss/logits": 0.14880970120429993, "loss/reg": 0.005651514511555433, "step": 405 }, { "epoch": 0.05075, "grad_norm": 3.2420449256896973, "grad_norm_var": 41.69182027548524, "learning_rate": 0.0001, "loss": 1.2031, "loss/crossentropy": 2.598705530166626, "loss/hidden": 0.98046875, "loss/logits": 0.16617505252361298, "loss/reg": 0.005649634636938572, "step": 406 }, { "epoch": 0.050875, "grad_norm": 2.3294692039489746, "grad_norm_var": 41.9082544413822, "learning_rate": 0.0001, "loss": 1.0316, "loss/crossentropy": 2.7743589878082275, "loss/hidden": 0.84375, "loss/logits": 0.13134868443012238, "loss/reg": 0.005647764541208744, "step": 407 }, { "epoch": 0.051, "grad_norm": 2.3849406242370605, "grad_norm_var": 41.988788990047645, "learning_rate": 0.0001, "loss": 1.1579, "loss/crossentropy": 2.2934722900390625, "loss/hidden": 0.9375, "loss/logits": 0.16397064924240112, "loss/reg": 0.00564591446891427, "step": 408 }, { "epoch": 0.051125, "grad_norm": 2.616523504257202, "grad_norm_var": 41.875558070811756, "learning_rate": 0.0001, "loss": 0.9281, "loss/crossentropy": 2.617312431335449, "loss/hidden": 0.7734375, "loss/logits": 0.09819567203521729, "loss/reg": 0.005644225515425205, "step": 409 }, { "epoch": 0.05125, "grad_norm": 2.302281141281128, "grad_norm_var": 42.058469053043055, "learning_rate": 0.0001, "loss": 1.0583, "loss/crossentropy": 2.8029561042785645, "loss/hidden": 0.859375, "loss/logits": 0.14253735542297363, "loss/reg": 0.005642317235469818, "step": 410 }, { "epoch": 0.051375, "grad_norm": 2.1521739959716797, "grad_norm_var": 42.20532780726832, "learning_rate": 0.0001, "loss": 0.996, "loss/crossentropy": 2.5798304080963135, "loss/hidden": 0.80078125, "loss/logits": 0.13881272077560425, "loss/reg": 0.005640234332531691, "step": 411 }, { "epoch": 0.0515, "grad_norm": 4.3292155265808105, "grad_norm_var": 41.914794683811124, "learning_rate": 0.0001, "loss": 1.3517, "loss/crossentropy": 2.4219868183135986, "loss/hidden": 1.0390625, "loss/logits": 0.2562292516231537, "loss/reg": 0.005638125352561474, "step": 412 }, { "epoch": 0.051625, "grad_norm": 19.01975440979004, "grad_norm_var": 53.903843358167165, "learning_rate": 0.0001, "loss": 1.3283, "loss/crossentropy": 2.2926077842712402, "loss/hidden": 1.078125, "loss/logits": 0.19380658864974976, "loss/reg": 0.005636140704154968, "step": 413 }, { "epoch": 0.05175, "grad_norm": 2.859027862548828, "grad_norm_var": 53.791467006877085, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 2.429117441177368, "loss/hidden": 0.90234375, "loss/logits": 0.1528070569038391, "loss/reg": 0.005634027067571878, "step": 414 }, { "epoch": 0.051875, "grad_norm": 2.385204792022705, "grad_norm_var": 53.67862289213027, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.710325002670288, "loss/hidden": 0.81640625, "loss/logits": 0.1458669900894165, "loss/reg": 0.005631967913359404, "step": 415 }, { "epoch": 0.052, "grad_norm": 2.3011677265167236, "grad_norm_var": 54.20582073402194, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.485734701156616, "loss/hidden": 0.87109375, "loss/logits": 0.1569264829158783, "loss/reg": 0.0056300037540495396, "step": 416 }, { "epoch": 0.052125, "grad_norm": 2.7714357376098633, "grad_norm_var": 54.53064815195892, "learning_rate": 0.0001, "loss": 1.0741, "loss/crossentropy": 2.6249403953552246, "loss/hidden": 0.85546875, "loss/logits": 0.1623522937297821, "loss/reg": 0.0056281075812876225, "step": 417 }, { "epoch": 0.05225, "grad_norm": 2.376473903656006, "grad_norm_var": 55.22478277620113, "learning_rate": 0.0001, "loss": 1.2116, "loss/crossentropy": 2.5150105953216553, "loss/hidden": 0.95703125, "loss/logits": 0.19830524921417236, "loss/reg": 0.005626222584396601, "step": 418 }, { "epoch": 0.052375, "grad_norm": 2.6247470378875732, "grad_norm_var": 17.572360223815615, "learning_rate": 0.0001, "loss": 1.172, "loss/crossentropy": 2.7201685905456543, "loss/hidden": 0.9453125, "loss/logits": 0.17042091488838196, "loss/reg": 0.005624283570796251, "step": 419 }, { "epoch": 0.0525, "grad_norm": 49.02815628051758, "grad_norm_var": 143.90483482694842, "learning_rate": 0.0001, "loss": 5.3824, "loss/crossentropy": 2.692047357559204, "loss/hidden": 4.84375, "loss/logits": 0.48245739936828613, "loss/reg": 0.005622203927487135, "step": 420 }, { "epoch": 0.052625, "grad_norm": 2.6867082118988037, "grad_norm_var": 144.9870986829453, "learning_rate": 0.0001, "loss": 1.2507, "loss/crossentropy": 2.404517412185669, "loss/hidden": 1.0, "loss/logits": 0.19445687532424927, "loss/reg": 0.005620268173515797, "step": 421 }, { "epoch": 0.05275, "grad_norm": 4.397704124450684, "grad_norm_var": 144.55498651709914, "learning_rate": 0.0001, "loss": 1.4596, "loss/crossentropy": 2.1510226726531982, "loss/hidden": 1.2109375, "loss/logits": 0.19246640801429749, "loss/reg": 0.005618296563625336, "step": 422 }, { "epoch": 0.052875, "grad_norm": 4.239573955535889, "grad_norm_var": 143.68003611616095, "learning_rate": 0.0001, "loss": 1.3275, "loss/crossentropy": 2.686849355697632, "loss/hidden": 1.09375, "loss/logits": 0.17758557200431824, "loss/reg": 0.005616751033812761, "step": 423 }, { "epoch": 0.053, "grad_norm": 2.749202251434326, "grad_norm_var": 143.4748837350726, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.8104846477508545, "loss/hidden": 0.8828125, "loss/logits": 0.1437493860721588, "loss/reg": 0.005615332629531622, "step": 424 }, { "epoch": 0.053125, "grad_norm": 2.459291458129883, "grad_norm_var": 143.5641839570371, "learning_rate": 0.0001, "loss": 1.0548, "loss/crossentropy": 2.5806379318237305, "loss/hidden": 0.8515625, "loss/logits": 0.14714661240577698, "loss/reg": 0.005613364279270172, "step": 425 }, { "epoch": 0.05325, "grad_norm": 2.294171094894409, "grad_norm_var": 143.56904366210821, "learning_rate": 0.0001, "loss": 1.1486, "loss/crossentropy": 2.6366002559661865, "loss/hidden": 0.90234375, "loss/logits": 0.19014191627502441, "loss/reg": 0.005611394997686148, "step": 426 }, { "epoch": 0.053375, "grad_norm": 2.2255382537841797, "grad_norm_var": 143.52399251007708, "learning_rate": 0.0001, "loss": 1.0752, "loss/crossentropy": 2.542306661605835, "loss/hidden": 0.875, "loss/logits": 0.14408408105373383, "loss/reg": 0.005609368905425072, "step": 427 }, { "epoch": 0.0535, "grad_norm": 3.5708723068237305, "grad_norm_var": 143.80942972780392, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.2636356353759766, "loss/hidden": 0.8828125, "loss/logits": 0.14744916558265686, "loss/reg": 0.005607361439615488, "step": 428 }, { "epoch": 0.053625, "grad_norm": 2.9189610481262207, "grad_norm_var": 133.66980873374825, "learning_rate": 0.0001, "loss": 0.9895, "loss/crossentropy": 2.7651426792144775, "loss/hidden": 0.78515625, "loss/logits": 0.1482805609703064, "loss/reg": 0.005605428479611874, "step": 429 }, { "epoch": 0.05375, "grad_norm": 3.2735564708709717, "grad_norm_var": 133.5211490137515, "learning_rate": 0.0001, "loss": 1.2363, "loss/crossentropy": 2.248082399368286, "loss/hidden": 0.98046875, "loss/logits": 0.19977417588233948, "loss/reg": 0.0056034415028989315, "step": 430 }, { "epoch": 0.053875, "grad_norm": 3.5670769214630127, "grad_norm_var": 133.0752341056661, "learning_rate": 0.0001, "loss": 1.2766, "loss/crossentropy": 2.500338554382324, "loss/hidden": 1.0234375, "loss/logits": 0.19719059765338898, "loss/reg": 0.005601502023637295, "step": 431 }, { "epoch": 0.054, "grad_norm": 2.2697787284851074, "grad_norm_var": 133.0901180807591, "learning_rate": 0.0001, "loss": 0.9931, "loss/crossentropy": 2.6418793201446533, "loss/hidden": 0.7890625, "loss/logits": 0.14799568057060242, "loss/reg": 0.005599519703537226, "step": 432 }, { "epoch": 0.054125, "grad_norm": 3.220383405685425, "grad_norm_var": 132.91898234062202, "learning_rate": 0.0001, "loss": 1.2515, "loss/crossentropy": 2.5073025226593018, "loss/hidden": 1.0390625, "loss/logits": 0.15643876791000366, "loss/reg": 0.005597477313131094, "step": 433 }, { "epoch": 0.05425, "grad_norm": 3.2845206260681152, "grad_norm_var": 132.5476800488924, "learning_rate": 0.0001, "loss": 1.1441, "loss/crossentropy": 2.509037971496582, "loss/hidden": 0.9296875, "loss/logits": 0.15849418938159943, "loss/reg": 0.005595567170530558, "step": 434 }, { "epoch": 0.054375, "grad_norm": 2.254239320755005, "grad_norm_var": 132.71932731242507, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.567584991455078, "loss/hidden": 0.78125, "loss/logits": 0.14433184266090393, "loss/reg": 0.005593593697994947, "step": 435 }, { "epoch": 0.0545, "grad_norm": 3.2273480892181396, "grad_norm_var": 0.4676980414191933, "learning_rate": 0.0001, "loss": 1.1645, "loss/crossentropy": 2.3639349937438965, "loss/hidden": 0.94921875, "loss/logits": 0.15934088826179504, "loss/reg": 0.0055916691198945045, "step": 436 }, { "epoch": 0.054625, "grad_norm": 2.6044058799743652, "grad_norm_var": 0.47199755801328347, "learning_rate": 0.0001, "loss": 1.1033, "loss/crossentropy": 2.539247989654541, "loss/hidden": 0.8984375, "loss/logits": 0.14898554980754852, "loss/reg": 0.005589775741100311, "step": 437 }, { "epoch": 0.05475, "grad_norm": 2.9674391746520996, "grad_norm_var": 0.3399405404704983, "learning_rate": 0.0001, "loss": 1.252, "loss/crossentropy": 2.5642499923706055, "loss/hidden": 0.9921875, "loss/logits": 0.20391228795051575, "loss/reg": 0.005587900057435036, "step": 438 }, { "epoch": 0.054875, "grad_norm": 2.4164047241210938, "grad_norm_var": 0.23308679379454797, "learning_rate": 0.0001, "loss": 1.1939, "loss/crossentropy": 2.3462696075439453, "loss/hidden": 0.93359375, "loss/logits": 0.2044137418270111, "loss/reg": 0.005585688166320324, "step": 439 }, { "epoch": 0.055, "grad_norm": 2.7590599060058594, "grad_norm_var": 0.2329847653181711, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.775485038757324, "loss/hidden": 0.84375, "loss/logits": 0.13808496296405792, "loss/reg": 0.0055835009552538395, "step": 440 }, { "epoch": 0.055125, "grad_norm": 2.7251267433166504, "grad_norm_var": 0.224188675724659, "learning_rate": 0.0001, "loss": 1.0001, "loss/crossentropy": 2.4934420585632324, "loss/hidden": 0.80859375, "loss/logits": 0.1357189267873764, "loss/reg": 0.005581483710557222, "step": 441 }, { "epoch": 0.05525, "grad_norm": 2.4774584770202637, "grad_norm_var": 0.21273704839308963, "learning_rate": 0.0001, "loss": 1.2166, "loss/crossentropy": 2.426271438598633, "loss/hidden": 0.95703125, "loss/logits": 0.20375394821166992, "loss/reg": 0.0055792308412492275, "step": 442 }, { "epoch": 0.055375, "grad_norm": 3.2236833572387695, "grad_norm_var": 0.1905493662305197, "learning_rate": 0.0001, "loss": 1.1724, "loss/crossentropy": 2.9799797534942627, "loss/hidden": 0.92578125, "loss/logits": 0.19083930552005768, "loss/reg": 0.005577271804213524, "step": 443 }, { "epoch": 0.0555, "grad_norm": 2.5997183322906494, "grad_norm_var": 0.16554225723918894, "learning_rate": 0.0001, "loss": 1.126, "loss/crossentropy": 2.2098257541656494, "loss/hidden": 0.92578125, "loss/logits": 0.14447355270385742, "loss/reg": 0.005575183313339949, "step": 444 }, { "epoch": 0.055625, "grad_norm": 2.5179152488708496, "grad_norm_var": 0.1725392629592297, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.0029213428497314, "loss/hidden": 0.98046875, "loss/logits": 0.1655960977077484, "loss/reg": 0.005572900176048279, "step": 445 }, { "epoch": 0.05575, "grad_norm": 2.5075204372406006, "grad_norm_var": 0.16460110044899826, "learning_rate": 0.0001, "loss": 1.0614, "loss/crossentropy": 2.3672924041748047, "loss/hidden": 0.85546875, "loss/logits": 0.15021467208862305, "loss/reg": 0.005570439621806145, "step": 446 }, { "epoch": 0.055875, "grad_norm": 2.441183567047119, "grad_norm_var": 0.12700610259855102, "learning_rate": 0.0001, "loss": 0.9323, "loss/crossentropy": 2.311056137084961, "loss/hidden": 0.7578125, "loss/logits": 0.11881721019744873, "loss/reg": 0.00556844100356102, "step": 447 }, { "epoch": 0.056, "grad_norm": 2.6724319458007812, "grad_norm_var": 0.11304803744365562, "learning_rate": 0.0001, "loss": 1.0937, "loss/crossentropy": 2.562101364135742, "loss/hidden": 0.8671875, "loss/logits": 0.1708334982395172, "loss/reg": 0.005566492676734924, "step": 448 }, { "epoch": 0.056125, "grad_norm": 2.196300506591797, "grad_norm_var": 0.11350312697665288, "learning_rate": 0.0001, "loss": 0.9882, "loss/crossentropy": 2.4227116107940674, "loss/hidden": 0.80078125, "loss/logits": 0.13182450830936432, "loss/reg": 0.00556437112390995, "step": 449 }, { "epoch": 0.05625, "grad_norm": 2.912667989730835, "grad_norm_var": 0.0921566818687341, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 1.9439491033554077, "loss/hidden": 1.109375, "loss/logits": 0.2070913016796112, "loss/reg": 0.0055623650550842285, "step": 450 }, { "epoch": 0.056375, "grad_norm": 2.011991500854492, "grad_norm_var": 0.10881512213368959, "learning_rate": 0.0001, "loss": 1.0172, "loss/crossentropy": 2.498812675476074, "loss/hidden": 0.81640625, "loss/logits": 0.14521706104278564, "loss/reg": 0.005560221150517464, "step": 451 }, { "epoch": 0.0565, "grad_norm": 2.2709267139434814, "grad_norm_var": 0.0912508163184422, "learning_rate": 0.0001, "loss": 1.1384, "loss/crossentropy": 2.320579767227173, "loss/hidden": 0.9140625, "loss/logits": 0.16879746317863464, "loss/reg": 0.005558326840400696, "step": 452 }, { "epoch": 0.056625, "grad_norm": 2.954127788543701, "grad_norm_var": 0.09996231296479816, "learning_rate": 0.0001, "loss": 1.2415, "loss/crossentropy": 2.483376979827881, "loss/hidden": 0.99609375, "loss/logits": 0.18988527357578278, "loss/reg": 0.005556488875299692, "step": 453 }, { "epoch": 0.05675, "grad_norm": 2.442729949951172, "grad_norm_var": 0.0916992305907788, "learning_rate": 0.0001, "loss": 1.0533, "loss/crossentropy": 2.414472818374634, "loss/hidden": 0.84765625, "loss/logits": 0.1501239389181137, "loss/reg": 0.005554646719247103, "step": 454 }, { "epoch": 0.056875, "grad_norm": 2.598292589187622, "grad_norm_var": 0.09002796513685567, "learning_rate": 0.0001, "loss": 0.9797, "loss/crossentropy": 2.8175811767578125, "loss/hidden": 0.78515625, "loss/logits": 0.13899990916252136, "loss/reg": 0.005552831571549177, "step": 455 }, { "epoch": 0.057, "grad_norm": 2.284618616104126, "grad_norm_var": 0.09289234998963139, "learning_rate": 0.0001, "loss": 1.1767, "loss/crossentropy": 2.5178730487823486, "loss/hidden": 0.953125, "loss/logits": 0.1680239588022232, "loss/reg": 0.005550856236368418, "step": 456 }, { "epoch": 0.057125, "grad_norm": 2.9749691486358643, "grad_norm_var": 0.10255115779464533, "learning_rate": 0.0001, "loss": 1.146, "loss/crossentropy": 2.6965036392211914, "loss/hidden": 0.89453125, "loss/logits": 0.19602364301681519, "loss/reg": 0.005548745859414339, "step": 457 }, { "epoch": 0.05725, "grad_norm": 2.4419991970062256, "grad_norm_var": 0.10305738190390912, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.507200241088867, "loss/hidden": 0.87890625, "loss/logits": 0.14385411143302917, "loss/reg": 0.005546758882701397, "step": 458 }, { "epoch": 0.057375, "grad_norm": 2.41898250579834, "grad_norm_var": 0.07293072023693033, "learning_rate": 0.0001, "loss": 1.0665, "loss/crossentropy": 2.4068796634674072, "loss/hidden": 0.87109375, "loss/logits": 0.13996180891990662, "loss/reg": 0.005544655025005341, "step": 459 }, { "epoch": 0.0575, "grad_norm": 3.584895372390747, "grad_norm_var": 0.1446675774892469, "learning_rate": 0.0001, "loss": 1.419, "loss/crossentropy": 2.4029970169067383, "loss/hidden": 1.15625, "loss/logits": 0.20734865963459015, "loss/reg": 0.005542535334825516, "step": 460 }, { "epoch": 0.057625, "grad_norm": 2.5190699100494385, "grad_norm_var": 0.14465856873481447, "learning_rate": 0.0001, "loss": 1.0687, "loss/crossentropy": 2.632817268371582, "loss/hidden": 0.84375, "loss/logits": 0.16959112882614136, "loss/reg": 0.005540382582694292, "step": 461 }, { "epoch": 0.05775, "grad_norm": 3.293412446975708, "grad_norm_var": 0.1759751166057581, "learning_rate": 0.0001, "loss": 1.2079, "loss/crossentropy": 1.8526346683502197, "loss/hidden": 0.984375, "loss/logits": 0.16817334294319153, "loss/reg": 0.005538390018045902, "step": 462 }, { "epoch": 0.057875, "grad_norm": 2.090097665786743, "grad_norm_var": 0.1923380804679141, "learning_rate": 0.0001, "loss": 1.0403, "loss/crossentropy": 2.7256767749786377, "loss/hidden": 0.83984375, "loss/logits": 0.14509689807891846, "loss/reg": 0.005536381620913744, "step": 463 }, { "epoch": 0.058, "grad_norm": 2.367372751235962, "grad_norm_var": 0.19537989350592183, "learning_rate": 0.0001, "loss": 0.967, "loss/crossentropy": 2.440683603286743, "loss/hidden": 0.78125, "loss/logits": 0.13041679561138153, "loss/reg": 0.005534291733056307, "step": 464 }, { "epoch": 0.058125, "grad_norm": 2.5434730052948, "grad_norm_var": 0.18491306851457617, "learning_rate": 0.0001, "loss": 1.1396, "loss/crossentropy": 2.811406373977661, "loss/hidden": 0.91015625, "loss/logits": 0.1740744560956955, "loss/reg": 0.005532294511795044, "step": 465 }, { "epoch": 0.05825, "grad_norm": 2.613758087158203, "grad_norm_var": 0.17830906169392974, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.5138356685638428, "loss/hidden": 0.828125, "loss/logits": 0.1479034125804901, "loss/reg": 0.005530340131372213, "step": 466 }, { "epoch": 0.058375, "grad_norm": 3.6053991317749023, "grad_norm_var": 0.21458171164135606, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 1.9949983358383179, "loss/hidden": 1.0, "loss/logits": 0.155661940574646, "loss/reg": 0.0055284383706748486, "step": 467 }, { "epoch": 0.0585, "grad_norm": 2.2574644088745117, "grad_norm_var": 0.21534123971961966, "learning_rate": 0.0001, "loss": 1.08, "loss/crossentropy": 2.514662981033325, "loss/hidden": 0.859375, "loss/logits": 0.16538314521312714, "loss/reg": 0.005526562221348286, "step": 468 }, { "epoch": 0.058625, "grad_norm": 2.2614095211029053, "grad_norm_var": 0.2206521084247221, "learning_rate": 0.0001, "loss": 1.2297, "loss/crossentropy": 2.4910507202148438, "loss/hidden": 0.98046875, "loss/logits": 0.19400066137313843, "loss/reg": 0.005524714011698961, "step": 469 }, { "epoch": 0.05875, "grad_norm": 3.083524465560913, "grad_norm_var": 0.22915168035201153, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.5548853874206543, "loss/hidden": 0.92578125, "loss/logits": 0.19151920080184937, "loss/reg": 0.005522689316421747, "step": 470 }, { "epoch": 0.058875, "grad_norm": 2.6530709266662598, "grad_norm_var": 0.2287156357176549, "learning_rate": 0.0001, "loss": 0.9819, "loss/crossentropy": 2.5769848823547363, "loss/hidden": 0.79296875, "loss/logits": 0.1337730437517166, "loss/reg": 0.00552078802138567, "step": 471 }, { "epoch": 0.059, "grad_norm": 2.857489585876465, "grad_norm_var": 0.21848469951039154, "learning_rate": 0.0001, "loss": 1.2335, "loss/crossentropy": 2.6933629512786865, "loss/hidden": 0.98828125, "loss/logits": 0.19003306329250336, "loss/reg": 0.005518974736332893, "step": 472 }, { "epoch": 0.059125, "grad_norm": 1.960106372833252, "grad_norm_var": 0.24874750636482734, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.534855365753174, "loss/hidden": 0.7890625, "loss/logits": 0.13338381052017212, "loss/reg": 0.005517229437828064, "step": 473 }, { "epoch": 0.05925, "grad_norm": 2.787822961807251, "grad_norm_var": 0.24619457779295406, "learning_rate": 0.0001, "loss": 1.0858, "loss/crossentropy": 2.396390438079834, "loss/hidden": 0.88671875, "loss/logits": 0.14397624135017395, "loss/reg": 0.005515479948371649, "step": 474 }, { "epoch": 0.059375, "grad_norm": 2.3396122455596924, "grad_norm_var": 0.24936205040752385, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.6306259632110596, "loss/hidden": 0.83984375, "loss/logits": 0.14426180720329285, "loss/reg": 0.005513759795576334, "step": 475 }, { "epoch": 0.0595, "grad_norm": 2.367551803588867, "grad_norm_var": 0.19447740210993794, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.342672348022461, "loss/hidden": 0.890625, "loss/logits": 0.16136375069618225, "loss/reg": 0.0055120959877967834, "step": 476 }, { "epoch": 0.059625, "grad_norm": 2.3029873371124268, "grad_norm_var": 0.19972845357339655, "learning_rate": 0.0001, "loss": 0.9785, "loss/crossentropy": 2.725276231765747, "loss/hidden": 0.796875, "loss/logits": 0.12647491693496704, "loss/reg": 0.0055101178586483, "step": 477 }, { "epoch": 0.05975, "grad_norm": 2.3109138011932373, "grad_norm_var": 0.1674590503375268, "learning_rate": 0.0001, "loss": 1.012, "loss/crossentropy": 2.6665799617767334, "loss/hidden": 0.81640625, "loss/logits": 0.14054208993911743, "loss/reg": 0.005508116912096739, "step": 478 }, { "epoch": 0.059875, "grad_norm": 2.8778023719787598, "grad_norm_var": 0.1605488706137739, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.599010705947876, "loss/hidden": 0.80078125, "loss/logits": 0.14697444438934326, "loss/reg": 0.0055063748732209206, "step": 479 }, { "epoch": 0.06, "grad_norm": 2.7762978076934814, "grad_norm_var": 0.15971446982347573, "learning_rate": 0.0001, "loss": 1.1492, "loss/crossentropy": 2.6345436573028564, "loss/hidden": 0.9296875, "loss/logits": 0.1645045280456543, "loss/reg": 0.005504653323441744, "step": 480 }, { "epoch": 0.060125, "grad_norm": 3.0745112895965576, "grad_norm_var": 0.1733429982183973, "learning_rate": 0.0001, "loss": 1.2914, "loss/crossentropy": 2.1021008491516113, "loss/hidden": 1.0546875, "loss/logits": 0.18168240785598755, "loss/reg": 0.005502650048583746, "step": 481 }, { "epoch": 0.06025, "grad_norm": 2.5635828971862793, "grad_norm_var": 0.17362979402171655, "learning_rate": 0.0001, "loss": 1.1746, "loss/crossentropy": 2.599754810333252, "loss/hidden": 0.9453125, "loss/logits": 0.1743006557226181, "loss/reg": 0.005500909872353077, "step": 482 }, { "epoch": 0.060375, "grad_norm": 2.982170343399048, "grad_norm_var": 0.11685041441696337, "learning_rate": 0.0001, "loss": 1.084, "loss/crossentropy": 2.780411958694458, "loss/hidden": 0.875, "loss/logits": 0.15399503707885742, "loss/reg": 0.005499421618878841, "step": 483 }, { "epoch": 0.0605, "grad_norm": 6.475743770599365, "grad_norm_var": 1.0413639393420129, "learning_rate": 0.0001, "loss": 2.1473, "loss/crossentropy": 2.3867931365966797, "loss/hidden": 1.703125, "loss/logits": 0.38922837376594543, "loss/reg": 0.005497433710843325, "step": 484 }, { "epoch": 0.060625, "grad_norm": 2.522434711456299, "grad_norm_var": 1.024975132918582, "learning_rate": 0.0001, "loss": 1.0915, "loss/crossentropy": 2.741684675216675, "loss/hidden": 0.88671875, "loss/logits": 0.14987404644489288, "loss/reg": 0.0054954588413238525, "step": 485 }, { "epoch": 0.06075, "grad_norm": 2.6852359771728516, "grad_norm_var": 1.0236023483547378, "learning_rate": 0.0001, "loss": 1.0905, "loss/crossentropy": 2.2552525997161865, "loss/hidden": 0.8984375, "loss/logits": 0.13711076974868774, "loss/reg": 0.005493887234479189, "step": 486 }, { "epoch": 0.060875, "grad_norm": 6.048346996307373, "grad_norm_var": 1.65671866064532, "learning_rate": 0.0001, "loss": 1.4058, "loss/crossentropy": 3.1526873111724854, "loss/hidden": 1.0625, "loss/logits": 0.2884060740470886, "loss/reg": 0.005492268595844507, "step": 487 }, { "epoch": 0.061, "grad_norm": 5.24729061126709, "grad_norm_var": 1.9496829900519608, "learning_rate": 0.0001, "loss": 1.5487, "loss/crossentropy": 2.391798496246338, "loss/hidden": 1.234375, "loss/logits": 0.2594112157821655, "loss/reg": 0.0054903156124055386, "step": 488 }, { "epoch": 0.061125, "grad_norm": 3.4879932403564453, "grad_norm_var": 1.8414378354073275, "learning_rate": 0.0001, "loss": 1.2408, "loss/crossentropy": 2.3853161334991455, "loss/hidden": 1.015625, "loss/logits": 0.1702655553817749, "loss/reg": 0.005488729570060968, "step": 489 }, { "epoch": 0.06125, "grad_norm": 2.416243076324463, "grad_norm_var": 1.875598350696971, "learning_rate": 0.0001, "loss": 1.0646, "loss/crossentropy": 2.310605049133301, "loss/hidden": 0.86328125, "loss/logits": 0.146418958902359, "loss/reg": 0.005487216170877218, "step": 490 }, { "epoch": 0.061375, "grad_norm": 2.9619152545928955, "grad_norm_var": 1.8217813283025472, "learning_rate": 0.0001, "loss": 1.2577, "loss/crossentropy": 2.3735132217407227, "loss/hidden": 1.015625, "loss/logits": 0.18721503019332886, "loss/reg": 0.005485245026648045, "step": 491 }, { "epoch": 0.0615, "grad_norm": 2.9602112770080566, "grad_norm_var": 1.7685642295810833, "learning_rate": 0.0001, "loss": 1.1274, "loss/crossentropy": 2.6420083045959473, "loss/hidden": 0.90234375, "loss/logits": 0.17025524377822876, "loss/reg": 0.005483296699821949, "step": 492 }, { "epoch": 0.061625, "grad_norm": 2.5772223472595215, "grad_norm_var": 1.7347667738241757, "learning_rate": 0.0001, "loss": 1.1004, "loss/crossentropy": 2.4166319370269775, "loss/hidden": 0.890625, "loss/logits": 0.15491390228271484, "loss/reg": 0.005481342785060406, "step": 493 }, { "epoch": 0.06175, "grad_norm": 2.6494603157043457, "grad_norm_var": 1.693988292922673, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.7021005153656006, "loss/hidden": 0.8671875, "loss/logits": 0.1542307734489441, "loss/reg": 0.005479689687490463, "step": 494 }, { "epoch": 0.061875, "grad_norm": 2.065351963043213, "grad_norm_var": 1.7911776893626628, "learning_rate": 0.0001, "loss": 1.015, "loss/crossentropy": 2.4842755794525146, "loss/hidden": 0.8203125, "loss/logits": 0.13995476067066193, "loss/reg": 0.005478002596646547, "step": 495 }, { "epoch": 0.062, "grad_norm": 2.650660753250122, "grad_norm_var": 1.8016636980513454, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.3899097442626953, "loss/hidden": 0.94921875, "loss/logits": 0.16591498255729675, "loss/reg": 0.005476430524140596, "step": 496 }, { "epoch": 0.062125, "grad_norm": 3.412050724029541, "grad_norm_var": 1.7970375838694677, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4459383487701416, "loss/hidden": 0.94140625, "loss/logits": 0.20212361216545105, "loss/reg": 0.005474465899169445, "step": 497 }, { "epoch": 0.06225, "grad_norm": 2.7389674186706543, "grad_norm_var": 1.7804152177025587, "learning_rate": 0.0001, "loss": 1.1076, "loss/crossentropy": 2.6794888973236084, "loss/hidden": 0.90625, "loss/logits": 0.1465749740600586, "loss/reg": 0.005472847726196051, "step": 498 }, { "epoch": 0.062375, "grad_norm": 20.56003761291504, "grad_norm_var": 20.18846043733062, "learning_rate": 0.0001, "loss": 1.0568, "loss/crossentropy": 2.527268409729004, "loss/hidden": 0.859375, "loss/logits": 0.14275437593460083, "loss/reg": 0.005471326876431704, "step": 499 }, { "epoch": 0.0625, "grad_norm": 2.9909119606018066, "grad_norm_var": 20.013739807194945, "learning_rate": 0.0001, "loss": 1.0002, "loss/crossentropy": 2.311053991317749, "loss/hidden": 0.80859375, "loss/logits": 0.13688521087169647, "loss/reg": 0.005469587165862322, "step": 500 }, { "epoch": 0.062625, "grad_norm": 2.6013972759246826, "grad_norm_var": 19.995957990662593, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.4651801586151123, "loss/hidden": 0.94921875, "loss/logits": 0.17647257447242737, "loss/reg": 0.005467594135552645, "step": 501 }, { "epoch": 0.06275, "grad_norm": 3.800658702850342, "grad_norm_var": 19.840506630980723, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.3829755783081055, "loss/hidden": 0.8125, "loss/logits": 0.1257120817899704, "loss/reg": 0.005465896334499121, "step": 502 }, { "epoch": 0.062875, "grad_norm": 2.5520195960998535, "grad_norm_var": 19.800229612103674, "learning_rate": 0.0001, "loss": 0.9335, "loss/crossentropy": 2.318957567214966, "loss/hidden": 0.73828125, "loss/logits": 0.14062564074993134, "loss/reg": 0.005463926587253809, "step": 503 }, { "epoch": 0.063, "grad_norm": 2.2392287254333496, "grad_norm_var": 19.907422060176042, "learning_rate": 0.0001, "loss": 0.9748, "loss/crossentropy": 2.6403889656066895, "loss/hidden": 0.79296875, "loss/logits": 0.12718063592910767, "loss/reg": 0.005461950786411762, "step": 504 }, { "epoch": 0.063125, "grad_norm": 59.44195556640625, "grad_norm_var": 212.38825001063205, "learning_rate": 0.0001, "loss": 1.0377, "loss/crossentropy": 2.3318405151367188, "loss/hidden": 0.859375, "loss/logits": 0.12373203039169312, "loss/reg": 0.005460206884890795, "step": 505 }, { "epoch": 0.06325, "grad_norm": 2.711045265197754, "grad_norm_var": 212.19724917857505, "learning_rate": 0.0001, "loss": 1.0909, "loss/crossentropy": 2.5284109115600586, "loss/hidden": 0.8984375, "loss/logits": 0.13792011141777039, "loss/reg": 0.0054582892917096615, "step": 506 }, { "epoch": 0.063375, "grad_norm": 2.2481954097747803, "grad_norm_var": 212.65447803299938, "learning_rate": 0.0001, "loss": 0.9413, "loss/crossentropy": 2.5293643474578857, "loss/hidden": 0.7578125, "loss/logits": 0.12894126772880554, "loss/reg": 0.005456262268126011, "step": 507 }, { "epoch": 0.0635, "grad_norm": 2.7053897380828857, "grad_norm_var": 212.80895755175322, "learning_rate": 0.0001, "loss": 1.0773, "loss/crossentropy": 2.506075143814087, "loss/hidden": 0.87109375, "loss/logits": 0.15166552364826202, "loss/reg": 0.005454184953123331, "step": 508 }, { "epoch": 0.063625, "grad_norm": 2.2455244064331055, "grad_norm_var": 213.0278691549671, "learning_rate": 0.0001, "loss": 0.9768, "loss/crossentropy": 2.6757209300994873, "loss/hidden": 0.7890625, "loss/logits": 0.13317805528640747, "loss/reg": 0.005452104844152927, "step": 509 }, { "epoch": 0.06375, "grad_norm": 2.599388360977173, "grad_norm_var": 213.05941324718256, "learning_rate": 0.0001, "loss": 0.9486, "loss/crossentropy": 2.5610482692718506, "loss/hidden": 0.76953125, "loss/logits": 0.1245487853884697, "loss/reg": 0.005449967924505472, "step": 510 }, { "epoch": 0.063875, "grad_norm": 2.648411989212036, "grad_norm_var": 212.67000591016287, "learning_rate": 0.0001, "loss": 1.0344, "loss/crossentropy": 2.4520514011383057, "loss/hidden": 0.86328125, "loss/logits": 0.11664330959320068, "loss/reg": 0.005447922740131617, "step": 511 }, { "epoch": 0.064, "grad_norm": 2.380727767944336, "grad_norm_var": 212.84492196466834, "learning_rate": 0.0001, "loss": 1.0266, "loss/crossentropy": 2.1030967235565186, "loss/hidden": 0.83984375, "loss/logits": 0.13228739798069, "loss/reg": 0.0054458137601614, "step": 512 }, { "epoch": 0.064125, "grad_norm": 2.198631763458252, "grad_norm_var": 213.5768536641417, "learning_rate": 0.0001, "loss": 1.1228, "loss/crossentropy": 2.3845298290252686, "loss/hidden": 0.90625, "loss/logits": 0.16212098300457, "loss/reg": 0.0054436735808849335, "step": 513 }, { "epoch": 0.06425, "grad_norm": 2.8499302864074707, "grad_norm_var": 213.51026966359936, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.8064866065979004, "loss/hidden": 0.89453125, "loss/logits": 0.16508570313453674, "loss/reg": 0.0054416959173977375, "step": 514 }, { "epoch": 0.064375, "grad_norm": 2.5610392093658447, "grad_norm_var": 201.9317150765609, "learning_rate": 0.0001, "loss": 0.9877, "loss/crossentropy": 2.331996202468872, "loss/hidden": 0.79296875, "loss/logits": 0.14031504094600677, "loss/reg": 0.005439713131636381, "step": 515 }, { "epoch": 0.0645, "grad_norm": 2.0442378520965576, "grad_norm_var": 202.38943138060174, "learning_rate": 0.0001, "loss": 1.0106, "loss/crossentropy": 2.4245524406433105, "loss/hidden": 0.8203125, "loss/logits": 0.13589094579219818, "loss/reg": 0.0054377601481974125, "step": 516 }, { "epoch": 0.064625, "grad_norm": 2.7959375381469727, "grad_norm_var": 202.30067826507621, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.4928247928619385, "loss/hidden": 0.9375, "loss/logits": 0.154384583234787, "loss/reg": 0.005435979925096035, "step": 517 }, { "epoch": 0.06475, "grad_norm": 2.5521795749664307, "grad_norm_var": 202.78524814255965, "learning_rate": 0.0001, "loss": 1.1979, "loss/crossentropy": 2.4683828353881836, "loss/hidden": 0.96484375, "loss/logits": 0.17875435948371887, "loss/reg": 0.005434305872768164, "step": 518 }, { "epoch": 0.064875, "grad_norm": 3.04142165184021, "grad_norm_var": 202.5720686279479, "learning_rate": 0.0001, "loss": 1.0734, "loss/crossentropy": 2.5951480865478516, "loss/hidden": 0.87890625, "loss/logits": 0.14021140336990356, "loss/reg": 0.005432285368442535, "step": 519 }, { "epoch": 0.065, "grad_norm": 2.7776551246643066, "grad_norm_var": 202.31453305561996, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.4622325897216797, "loss/hidden": 0.85546875, "loss/logits": 0.14482024312019348, "loss/reg": 0.0054303682409226894, "step": 520 }, { "epoch": 0.065125, "grad_norm": 3.091510057449341, "grad_norm_var": 0.0909682998746592, "learning_rate": 0.0001, "loss": 1.2902, "loss/crossentropy": 2.1909449100494385, "loss/hidden": 1.0703125, "loss/logits": 0.1655557006597519, "loss/reg": 0.005428609903901815, "step": 521 }, { "epoch": 0.06525, "grad_norm": 2.5140562057495117, "grad_norm_var": 0.09023274223209772, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.4252545833587646, "loss/hidden": 0.8125, "loss/logits": 0.163617342710495, "loss/reg": 0.005426718853414059, "step": 522 }, { "epoch": 0.065375, "grad_norm": 2.4871199131011963, "grad_norm_var": 0.08328167859519695, "learning_rate": 0.0001, "loss": 1.0136, "loss/crossentropy": 2.8315296173095703, "loss/hidden": 0.80859375, "loss/logits": 0.15072785317897797, "loss/reg": 0.0054249088279902935, "step": 523 }, { "epoch": 0.0655, "grad_norm": 2.5452873706817627, "grad_norm_var": 0.082491431169228, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.4235100746154785, "loss/hidden": 0.953125, "loss/logits": 0.1673499345779419, "loss/reg": 0.005423161666840315, "step": 524 }, { "epoch": 0.065625, "grad_norm": 2.5849223136901855, "grad_norm_var": 0.0744047548688132, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.517444133758545, "loss/hidden": 0.9375, "loss/logits": 0.16202498972415924, "loss/reg": 0.005421151407063007, "step": 525 }, { "epoch": 0.06575, "grad_norm": 2.5873005390167236, "grad_norm_var": 0.07442217159387093, "learning_rate": 0.0001, "loss": 1.0671, "loss/crossentropy": 2.467041254043579, "loss/hidden": 0.8671875, "loss/logits": 0.1457323431968689, "loss/reg": 0.0054191285744309425, "step": 526 }, { "epoch": 0.065875, "grad_norm": 2.362294912338257, "grad_norm_var": 0.07783568042826777, "learning_rate": 0.0001, "loss": 1.0313, "loss/crossentropy": 2.1884958744049072, "loss/hidden": 0.8359375, "loss/logits": 0.14114870131015778, "loss/reg": 0.005417390260845423, "step": 527 }, { "epoch": 0.066, "grad_norm": 2.9457032680511475, "grad_norm_var": 0.08233057116115011, "learning_rate": 0.0001, "loss": 1.1121, "loss/crossentropy": 2.4786124229431152, "loss/hidden": 0.9140625, "loss/logits": 0.14386197924613953, "loss/reg": 0.005415752530097961, "step": 528 }, { "epoch": 0.066125, "grad_norm": 2.302025556564331, "grad_norm_var": 0.07717323196560505, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.3902788162231445, "loss/hidden": 0.81640625, "loss/logits": 0.15102702379226685, "loss/reg": 0.005413680803030729, "step": 529 }, { "epoch": 0.06625, "grad_norm": 2.2427210807800293, "grad_norm_var": 0.08222220602995639, "learning_rate": 0.0001, "loss": 1.0503, "loss/crossentropy": 2.5187177658081055, "loss/hidden": 0.85546875, "loss/logits": 0.1406846046447754, "loss/reg": 0.005411935038864613, "step": 530 }, { "epoch": 0.066375, "grad_norm": 2.215160846710205, "grad_norm_var": 0.09102156065525453, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.7218358516693115, "loss/hidden": 0.8515625, "loss/logits": 0.15077106654644012, "loss/reg": 0.00541025260463357, "step": 531 }, { "epoch": 0.0665, "grad_norm": 49.282718658447266, "grad_norm_var": 136.25864998814177, "learning_rate": 0.0001, "loss": 1.0971, "loss/crossentropy": 2.2880303859710693, "loss/hidden": 0.90625, "loss/logits": 0.136735200881958, "loss/reg": 0.005408551078289747, "step": 532 }, { "epoch": 0.066625, "grad_norm": 2.9009287357330322, "grad_norm_var": 136.22119824556128, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.069505214691162, "loss/hidden": 0.96875, "loss/logits": 0.18224555253982544, "loss/reg": 0.00540671544149518, "step": 533 }, { "epoch": 0.06675, "grad_norm": 3.366948127746582, "grad_norm_var": 135.93950988587417, "learning_rate": 0.0001, "loss": 1.2696, "loss/crossentropy": 2.5054562091827393, "loss/hidden": 1.0234375, "loss/logits": 0.19209496676921844, "loss/reg": 0.005404717288911343, "step": 534 }, { "epoch": 0.066875, "grad_norm": 2.380380868911743, "grad_norm_var": 136.19039047350162, "learning_rate": 0.0001, "loss": 1.0532, "loss/crossentropy": 2.4644546508789062, "loss/hidden": 0.85546875, "loss/logits": 0.14370107650756836, "loss/reg": 0.005402736831456423, "step": 535 }, { "epoch": 0.067, "grad_norm": 2.2630538940429688, "grad_norm_var": 136.3962470934156, "learning_rate": 0.0001, "loss": 1.106, "loss/crossentropy": 2.362761974334717, "loss/hidden": 0.88671875, "loss/logits": 0.16529247164726257, "loss/reg": 0.005401079077273607, "step": 536 }, { "epoch": 0.067125, "grad_norm": 2.2985637187957764, "grad_norm_var": 136.69066191681563, "learning_rate": 0.0001, "loss": 1.1188, "loss/crossentropy": 2.2513458728790283, "loss/hidden": 0.90625, "loss/logits": 0.158562570810318, "loss/reg": 0.005399197805672884, "step": 537 }, { "epoch": 0.06725, "grad_norm": 2.9178526401519775, "grad_norm_var": 136.54251636267432, "learning_rate": 0.0001, "loss": 1.2523, "loss/crossentropy": 2.4714279174804688, "loss/hidden": 1.0, "loss/logits": 0.1983010470867157, "loss/reg": 0.005397453438490629, "step": 538 }, { "epoch": 0.067375, "grad_norm": 2.1185367107391357, "grad_norm_var": 136.69809974879476, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.5675318241119385, "loss/hidden": 0.83203125, "loss/logits": 0.135833740234375, "loss/reg": 0.005395461805164814, "step": 539 }, { "epoch": 0.0675, "grad_norm": 2.471010684967041, "grad_norm_var": 136.72728236316837, "learning_rate": 0.0001, "loss": 0.9973, "loss/crossentropy": 2.434710741043091, "loss/hidden": 0.80078125, "loss/logits": 0.1425883173942566, "loss/reg": 0.005393547471612692, "step": 540 }, { "epoch": 0.067625, "grad_norm": 2.24953556060791, "grad_norm_var": 136.86254598209106, "learning_rate": 0.0001, "loss": 1.031, "loss/crossentropy": 2.2015278339385986, "loss/hidden": 0.83984375, "loss/logits": 0.13719773292541504, "loss/reg": 0.005391509272158146, "step": 541 }, { "epoch": 0.06775, "grad_norm": 3.6081595420837402, "grad_norm_var": 136.54053740800046, "learning_rate": 0.0001, "loss": 1.4308, "loss/crossentropy": 2.6074843406677246, "loss/hidden": 1.1328125, "loss/logits": 0.244051992893219, "loss/reg": 0.0053895004093647, "step": 542 }, { "epoch": 0.067875, "grad_norm": 2.293153762817383, "grad_norm_var": 136.5697192568711, "learning_rate": 0.0001, "loss": 1.0019, "loss/crossentropy": 2.604656934738159, "loss/hidden": 0.8125, "loss/logits": 0.13555657863616943, "loss/reg": 0.005387555807828903, "step": 543 }, { "epoch": 0.068, "grad_norm": 3.9054906368255615, "grad_norm_var": 136.30156429508213, "learning_rate": 0.0001, "loss": 1.2067, "loss/crossentropy": 2.723475456237793, "loss/hidden": 1.0078125, "loss/logits": 0.14498497545719147, "loss/reg": 0.005385412368923426, "step": 544 }, { "epoch": 0.068125, "grad_norm": 2.571354389190674, "grad_norm_var": 136.18942504783266, "learning_rate": 0.0001, "loss": 1.1473, "loss/crossentropy": 2.686601400375366, "loss/hidden": 0.87109375, "loss/logits": 0.22237557172775269, "loss/reg": 0.005383248440921307, "step": 545 }, { "epoch": 0.06825, "grad_norm": 2.71347975730896, "grad_norm_var": 135.99456491905767, "learning_rate": 0.0001, "loss": 1.2265, "loss/crossentropy": 2.3059206008911133, "loss/hidden": 0.99609375, "loss/logits": 0.17658907175064087, "loss/reg": 0.00538119999691844, "step": 546 }, { "epoch": 0.068375, "grad_norm": 2.2055041790008545, "grad_norm_var": 135.99892540184646, "learning_rate": 0.0001, "loss": 1.227, "loss/crossentropy": 2.1022770404815674, "loss/hidden": 1.0078125, "loss/logits": 0.165423184633255, "loss/reg": 0.00537898438051343, "step": 547 }, { "epoch": 0.0685, "grad_norm": 2.9577088356018066, "grad_norm_var": 0.2900975003793434, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.7894651889801025, "loss/hidden": 0.8359375, "loss/logits": 0.13254427909851074, "loss/reg": 0.0053769489750266075, "step": 548 }, { "epoch": 0.068625, "grad_norm": 2.289632797241211, "grad_norm_var": 0.2971860973100412, "learning_rate": 0.0001, "loss": 0.9706, "loss/crossentropy": 1.9662660360336304, "loss/hidden": 0.796875, "loss/logits": 0.11997567117214203, "loss/reg": 0.005374929867684841, "step": 549 }, { "epoch": 0.06875, "grad_norm": 2.7140934467315674, "grad_norm_var": 0.26256089477725764, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.600255012512207, "loss/hidden": 0.875, "loss/logits": 0.15900644659996033, "loss/reg": 0.005373071413487196, "step": 550 }, { "epoch": 0.068875, "grad_norm": 2.7848618030548096, "grad_norm_var": 0.25973690827483153, "learning_rate": 0.0001, "loss": 1.1564, "loss/crossentropy": 2.347534656524658, "loss/hidden": 0.9296875, "loss/logits": 0.1730039119720459, "loss/reg": 0.005370850209146738, "step": 551 }, { "epoch": 0.069, "grad_norm": 2.1625566482543945, "grad_norm_var": 0.26552124449597064, "learning_rate": 0.0001, "loss": 1.0454, "loss/crossentropy": 2.766214609146118, "loss/hidden": 0.84765625, "loss/logits": 0.1440483182668686, "loss/reg": 0.005368667654693127, "step": 552 }, { "epoch": 0.069125, "grad_norm": 2.8881192207336426, "grad_norm_var": 0.2602997020069113, "learning_rate": 0.0001, "loss": 1.0958, "loss/crossentropy": 2.808046817779541, "loss/hidden": 0.89453125, "loss/logits": 0.14757747948169708, "loss/reg": 0.005366665776818991, "step": 553 }, { "epoch": 0.06925, "grad_norm": 2.6187915802001953, "grad_norm_var": 0.2563330715515538, "learning_rate": 0.0001, "loss": 1.2873, "loss/crossentropy": 2.5937459468841553, "loss/hidden": 1.03125, "loss/logits": 0.20239150524139404, "loss/reg": 0.005364455748349428, "step": 554 }, { "epoch": 0.069375, "grad_norm": 2.6418044567108154, "grad_norm_var": 0.23570370249948383, "learning_rate": 0.0001, "loss": 1.1877, "loss/crossentropy": 2.5635995864868164, "loss/hidden": 0.9609375, "loss/logits": 0.17318235337734222, "loss/reg": 0.005362290423363447, "step": 555 }, { "epoch": 0.0695, "grad_norm": 2.79367733001709, "grad_norm_var": 0.2326946034348102, "learning_rate": 0.0001, "loss": 1.2018, "loss/crossentropy": 2.13350510597229, "loss/hidden": 1.0, "loss/logits": 0.14820048213005066, "loss/reg": 0.005360215436667204, "step": 556 }, { "epoch": 0.069625, "grad_norm": 3.387333869934082, "grad_norm_var": 0.2433911623755942, "learning_rate": 0.0001, "loss": 1.0866, "loss/crossentropy": 2.4682462215423584, "loss/hidden": 0.859375, "loss/logits": 0.17359653115272522, "loss/reg": 0.005357977002859116, "step": 557 }, { "epoch": 0.06975, "grad_norm": 2.9143950939178467, "grad_norm_var": 0.19718877969401258, "learning_rate": 0.0001, "loss": 1.1093, "loss/crossentropy": 2.6049532890319824, "loss/hidden": 0.87890625, "loss/logits": 0.1768435537815094, "loss/reg": 0.005355944857001305, "step": 558 }, { "epoch": 0.069875, "grad_norm": 2.495455741882324, "grad_norm_var": 0.18769030937939207, "learning_rate": 0.0001, "loss": 1.0522, "loss/crossentropy": 2.2471513748168945, "loss/hidden": 0.8203125, "loss/logits": 0.17833727598190308, "loss/reg": 0.005353772081434727, "step": 559 }, { "epoch": 0.07, "grad_norm": 3.548495054244995, "grad_norm_var": 0.140786672247814, "learning_rate": 0.0001, "loss": 1.2284, "loss/crossentropy": 2.0618865489959717, "loss/hidden": 0.984375, "loss/logits": 0.1905221790075302, "loss/reg": 0.005351651925593615, "step": 560 }, { "epoch": 0.070125, "grad_norm": 2.8271002769470215, "grad_norm_var": 0.1394493347625937, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.392343282699585, "loss/hidden": 0.87109375, "loss/logits": 0.1387328803539276, "loss/reg": 0.005349620711058378, "step": 561 }, { "epoch": 0.07025, "grad_norm": 2.02376389503479, "grad_norm_var": 0.17221200465598158, "learning_rate": 0.0001, "loss": 1.0841, "loss/crossentropy": 2.26953125, "loss/hidden": 0.87890625, "loss/logits": 0.15167732536792755, "loss/reg": 0.005347614176571369, "step": 562 }, { "epoch": 0.070375, "grad_norm": 3.3085341453552246, "grad_norm_var": 0.17503849488183504, "learning_rate": 0.0001, "loss": 0.9333, "loss/crossentropy": 2.278923511505127, "loss/hidden": 0.76171875, "loss/logits": 0.11809547245502472, "loss/reg": 0.005345623474568129, "step": 563 }, { "epoch": 0.0705, "grad_norm": 6.067057132720947, "grad_norm_var": 0.8561705035714908, "learning_rate": 0.0001, "loss": 1.6764, "loss/crossentropy": 2.513504981994629, "loss/hidden": 1.421875, "loss/logits": 0.20107370615005493, "loss/reg": 0.00534354243427515, "step": 564 }, { "epoch": 0.070625, "grad_norm": 2.2686562538146973, "grad_norm_var": 0.8580914081280743, "learning_rate": 0.0001, "loss": 1.0927, "loss/crossentropy": 2.2088499069213867, "loss/hidden": 0.88671875, "loss/logits": 0.15257079899311066, "loss/reg": 0.005341436248272657, "step": 565 }, { "epoch": 0.07075, "grad_norm": 3.1334891319274902, "grad_norm_var": 0.8550377421403706, "learning_rate": 0.0001, "loss": 1.3017, "loss/crossentropy": 2.117647886276245, "loss/hidden": 1.0703125, "loss/logits": 0.17795339226722717, "loss/reg": 0.005339318886399269, "step": 566 }, { "epoch": 0.070875, "grad_norm": 4.6122727394104, "grad_norm_var": 1.0134023805364716, "learning_rate": 0.0001, "loss": 1.4641, "loss/crossentropy": 2.7603936195373535, "loss/hidden": 1.140625, "loss/logits": 0.27013444900512695, "loss/reg": 0.005337177775800228, "step": 567 }, { "epoch": 0.071, "grad_norm": 2.583162307739258, "grad_norm_var": 0.9715659491999304, "learning_rate": 0.0001, "loss": 1.2051, "loss/crossentropy": 2.380053758621216, "loss/hidden": 0.9765625, "loss/logits": 0.1751583367586136, "loss/reg": 0.005335117690265179, "step": 568 }, { "epoch": 0.071125, "grad_norm": 2.2990646362304688, "grad_norm_var": 1.0124076074311148, "learning_rate": 0.0001, "loss": 1.1188, "loss/crossentropy": 2.587369203567505, "loss/hidden": 0.8984375, "loss/logits": 0.16701750457286835, "loss/reg": 0.005332810804247856, "step": 569 }, { "epoch": 0.07125, "grad_norm": 3.1470165252685547, "grad_norm_var": 0.9962936596825245, "learning_rate": 0.0001, "loss": 1.0239, "loss/crossentropy": 2.492009162902832, "loss/hidden": 0.79296875, "loss/logits": 0.17766177654266357, "loss/reg": 0.0053307050839066505, "step": 570 }, { "epoch": 0.071375, "grad_norm": 2.8765156269073486, "grad_norm_var": 0.9845149270166076, "learning_rate": 0.0001, "loss": 1.1206, "loss/crossentropy": 2.714184045791626, "loss/hidden": 0.91015625, "loss/logits": 0.15715843439102173, "loss/reg": 0.005328655708581209, "step": 571 }, { "epoch": 0.0715, "grad_norm": 3.696258068084717, "grad_norm_var": 0.9934068745617035, "learning_rate": 0.0001, "loss": 1.7557, "loss/crossentropy": 2.322871685028076, "loss/hidden": 1.40625, "loss/logits": 0.2961430847644806, "loss/reg": 0.005326449871063232, "step": 572 }, { "epoch": 0.071625, "grad_norm": 2.3756725788116455, "grad_norm_var": 1.0320075552341808, "learning_rate": 0.0001, "loss": 1.0069, "loss/crossentropy": 2.446782350540161, "loss/hidden": 0.80859375, "loss/logits": 0.14505374431610107, "loss/reg": 0.005324224475771189, "step": 573 }, { "epoch": 0.07175, "grad_norm": 3.0114002227783203, "grad_norm_var": 1.0297287032782758, "learning_rate": 0.0001, "loss": 1.0473, "loss/crossentropy": 2.815708637237549, "loss/hidden": 0.84765625, "loss/logits": 0.146395742893219, "loss/reg": 0.005321910604834557, "step": 574 }, { "epoch": 0.071875, "grad_norm": 2.7700350284576416, "grad_norm_var": 1.0107660796879199, "learning_rate": 0.0001, "loss": 0.9849, "loss/crossentropy": 2.579951286315918, "loss/hidden": 0.79296875, "loss/logits": 0.13869163393974304, "loss/reg": 0.005319789983332157, "step": 575 }, { "epoch": 0.072, "grad_norm": 2.1934142112731934, "grad_norm_var": 1.0552091073780958, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.7635080814361572, "loss/hidden": 0.83203125, "loss/logits": 0.1523815095424652, "loss/reg": 0.005317789036780596, "step": 576 }, { "epoch": 0.072125, "grad_norm": 2.203432321548462, "grad_norm_var": 1.1000992612664597, "learning_rate": 0.0001, "loss": 1.0214, "loss/crossentropy": 2.2990267276763916, "loss/hidden": 0.81640625, "loss/logits": 0.15180249512195587, "loss/reg": 0.005315590649843216, "step": 577 }, { "epoch": 0.07225, "grad_norm": 2.7597663402557373, "grad_norm_var": 1.0346594183063509, "learning_rate": 0.0001, "loss": 1.2895, "loss/crossentropy": 2.7941789627075195, "loss/hidden": 1.0390625, "loss/logits": 0.1973191797733307, "loss/reg": 0.00531340204179287, "step": 578 }, { "epoch": 0.072375, "grad_norm": 2.151498794555664, "grad_norm_var": 1.0833220696738444, "learning_rate": 0.0001, "loss": 0.9999, "loss/crossentropy": 2.4545745849609375, "loss/hidden": 0.8125, "loss/logits": 0.1343034952878952, "loss/reg": 0.005311093758791685, "step": 579 }, { "epoch": 0.0725, "grad_norm": 2.758521318435669, "grad_norm_var": 0.4185770203540026, "learning_rate": 0.0001, "loss": 1.1404, "loss/crossentropy": 2.3098721504211426, "loss/hidden": 0.93359375, "loss/logits": 0.1536703109741211, "loss/reg": 0.00530878035351634, "step": 580 }, { "epoch": 0.072625, "grad_norm": 3.102933406829834, "grad_norm_var": 0.40269379192041677, "learning_rate": 0.0001, "loss": 1.4046, "loss/crossentropy": 2.4322452545166016, "loss/hidden": 1.125, "loss/logits": 0.22653597593307495, "loss/reg": 0.005306490696966648, "step": 581 }, { "epoch": 0.07275, "grad_norm": 2.831894636154175, "grad_norm_var": 0.39716603194751554, "learning_rate": 0.0001, "loss": 1.0557, "loss/crossentropy": 2.5340616703033447, "loss/hidden": 0.859375, "loss/logits": 0.14325933158397675, "loss/reg": 0.005304399877786636, "step": 582 }, { "epoch": 0.072875, "grad_norm": 2.5802910327911377, "grad_norm_var": 0.17392503265121587, "learning_rate": 0.0001, "loss": 1.0795, "loss/crossentropy": 2.5839056968688965, "loss/hidden": 0.86328125, "loss/logits": 0.16314582526683807, "loss/reg": 0.005302343517541885, "step": 583 }, { "epoch": 0.073, "grad_norm": 2.650399684906006, "grad_norm_var": 0.17308120367785024, "learning_rate": 0.0001, "loss": 1.1285, "loss/crossentropy": 2.487835645675659, "loss/hidden": 0.91796875, "loss/logits": 0.1575045883655548, "loss/reg": 0.005300293210893869, "step": 584 }, { "epoch": 0.073125, "grad_norm": 2.5146095752716064, "grad_norm_var": 0.16408850139509898, "learning_rate": 0.0001, "loss": 0.8801, "loss/crossentropy": 2.710824728012085, "loss/hidden": 0.71875, "loss/logits": 0.10839369148015976, "loss/reg": 0.005298234056681395, "step": 585 }, { "epoch": 0.07325, "grad_norm": 3.5579047203063965, "grad_norm_var": 0.19767952383566936, "learning_rate": 0.0001, "loss": 1.0659, "loss/crossentropy": 2.5979011058807373, "loss/hidden": 0.86328125, "loss/logits": 0.1496235430240631, "loss/reg": 0.005296017974615097, "step": 586 }, { "epoch": 0.073375, "grad_norm": 3.229036569595337, "grad_norm_var": 0.21129156050842327, "learning_rate": 0.0001, "loss": 1.3736, "loss/crossentropy": 2.0404820442199707, "loss/hidden": 1.109375, "loss/logits": 0.2113049328327179, "loss/reg": 0.005293776281177998, "step": 587 }, { "epoch": 0.0735, "grad_norm": 3.219778537750244, "grad_norm_var": 0.1669016788033178, "learning_rate": 0.0001, "loss": 1.1673, "loss/crossentropy": 2.5946946144104004, "loss/hidden": 0.96875, "loss/logits": 0.14563477039337158, "loss/reg": 0.005291698966175318, "step": 588 }, { "epoch": 0.073625, "grad_norm": 2.622143030166626, "grad_norm_var": 0.15858063234232014, "learning_rate": 0.0001, "loss": 1.2658, "loss/crossentropy": 2.2183616161346436, "loss/hidden": 1.0078125, "loss/logits": 0.20505878329277039, "loss/reg": 0.0052896649576723576, "step": 589 }, { "epoch": 0.07375, "grad_norm": 2.496985673904419, "grad_norm_var": 0.15786373129452994, "learning_rate": 0.0001, "loss": 1.008, "loss/crossentropy": 2.4871459007263184, "loss/hidden": 0.80078125, "loss/logits": 0.15438680350780487, "loss/reg": 0.005287437699735165, "step": 590 }, { "epoch": 0.073875, "grad_norm": 3.0701065063476562, "grad_norm_var": 0.1651866047673136, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.414074420928955, "loss/hidden": 0.9140625, "loss/logits": 0.16581328213214874, "loss/reg": 0.005285393912345171, "step": 591 }, { "epoch": 0.074, "grad_norm": 3.9464497566223145, "grad_norm_var": 0.22799900213871613, "learning_rate": 0.0001, "loss": 1.5728, "loss/crossentropy": 2.3605363368988037, "loss/hidden": 1.265625, "loss/logits": 0.25435870885849, "loss/reg": 0.0052834744565188885, "step": 592 }, { "epoch": 0.074125, "grad_norm": 2.5776402950286865, "grad_norm_var": 0.20419228079197158, "learning_rate": 0.0001, "loss": 1.2113, "loss/crossentropy": 2.3519833087921143, "loss/hidden": 0.9765625, "loss/logits": 0.18191702663898468, "loss/reg": 0.0052813272923231125, "step": 593 }, { "epoch": 0.07425, "grad_norm": 2.2281813621520996, "grad_norm_var": 0.23033113710586123, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.38511323928833, "loss/hidden": 0.875, "loss/logits": 0.14420956373214722, "loss/reg": 0.0052796173840761185, "step": 594 }, { "epoch": 0.074375, "grad_norm": 3.1069376468658447, "grad_norm_var": 0.19889239941204717, "learning_rate": 0.0001, "loss": 1.1811, "loss/crossentropy": 2.413785696029663, "loss/hidden": 0.984375, "loss/logits": 0.14399147033691406, "loss/reg": 0.005277944263070822, "step": 595 }, { "epoch": 0.0745, "grad_norm": 3.3450682163238525, "grad_norm_var": 0.2088716594217845, "learning_rate": 0.0001, "loss": 1.2822, "loss/crossentropy": 2.434509038925171, "loss/hidden": 1.0390625, "loss/logits": 0.1903418004512787, "loss/reg": 0.005276298616081476, "step": 596 }, { "epoch": 0.074625, "grad_norm": 2.371161937713623, "grad_norm_var": 0.22668853941960734, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.396265745162964, "loss/hidden": 0.85546875, "loss/logits": 0.14461319148540497, "loss/reg": 0.005274245049804449, "step": 597 }, { "epoch": 0.07475, "grad_norm": 3.314265251159668, "grad_norm_var": 0.2370575162921483, "learning_rate": 0.0001, "loss": 1.1987, "loss/crossentropy": 2.7262494564056396, "loss/hidden": 0.9765625, "loss/logits": 0.16941672563552856, "loss/reg": 0.0052725388668477535, "step": 598 }, { "epoch": 0.074875, "grad_norm": 3.7527589797973633, "grad_norm_var": 0.2687845607887461, "learning_rate": 0.0001, "loss": 1.2156, "loss/crossentropy": 2.6718039512634277, "loss/hidden": 0.98046875, "loss/logits": 0.18241068720817566, "loss/reg": 0.005270869936794043, "step": 599 }, { "epoch": 0.075, "grad_norm": 2.5073466300964355, "grad_norm_var": 0.2767358438012515, "learning_rate": 0.0001, "loss": 1.0856, "loss/crossentropy": 2.3735952377319336, "loss/hidden": 0.875, "loss/logits": 0.15789943933486938, "loss/reg": 0.005268939305096865, "step": 600 }, { "epoch": 0.075125, "grad_norm": 4.061317443847656, "grad_norm_var": 0.3279536252115766, "learning_rate": 0.0001, "loss": 1.3519, "loss/crossentropy": 2.5530035495758057, "loss/hidden": 1.1015625, "loss/logits": 0.19769783318042755, "loss/reg": 0.005267218686640263, "step": 601 }, { "epoch": 0.07525, "grad_norm": 2.4795703887939453, "grad_norm_var": 0.33305877012988483, "learning_rate": 0.0001, "loss": 1.1642, "loss/crossentropy": 2.4662020206451416, "loss/hidden": 0.93359375, "loss/logits": 0.17794585227966309, "loss/reg": 0.005265380721539259, "step": 602 }, { "epoch": 0.075375, "grad_norm": 3.2844204902648926, "grad_norm_var": 0.33479007900948143, "learning_rate": 0.0001, "loss": 1.0854, "loss/crossentropy": 2.5502281188964844, "loss/hidden": 0.87890625, "loss/logits": 0.15388712286949158, "loss/reg": 0.005263412371277809, "step": 603 }, { "epoch": 0.0755, "grad_norm": 2.4871444702148438, "grad_norm_var": 0.3492133912507728, "learning_rate": 0.0001, "loss": 1.0922, "loss/crossentropy": 2.5286808013916016, "loss/hidden": 0.86328125, "loss/logits": 0.17630262672901154, "loss/reg": 0.005261610262095928, "step": 604 }, { "epoch": 0.075625, "grad_norm": 2.9881107807159424, "grad_norm_var": 0.3402092077326716, "learning_rate": 0.0001, "loss": 1.1525, "loss/crossentropy": 2.522861957550049, "loss/hidden": 0.9375, "loss/logits": 0.16244357824325562, "loss/reg": 0.005259564146399498, "step": 605 }, { "epoch": 0.07575, "grad_norm": 2.3586983680725098, "grad_norm_var": 0.35069927923201, "learning_rate": 0.0001, "loss": 1.1314, "loss/crossentropy": 2.2875781059265137, "loss/hidden": 0.921875, "loss/logits": 0.15690375864505768, "loss/reg": 0.0052574859000742435, "step": 606 }, { "epoch": 0.075875, "grad_norm": 2.6491522789001465, "grad_norm_var": 0.35741571312755316, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.2390084266662598, "loss/hidden": 0.95703125, "loss/logits": 0.1875428408384323, "loss/reg": 0.005255614407360554, "step": 607 }, { "epoch": 0.076, "grad_norm": 2.073080539703369, "grad_norm_var": 0.33189569909254335, "learning_rate": 0.0001, "loss": 0.9664, "loss/crossentropy": 2.3911304473876953, "loss/hidden": 0.7890625, "loss/logits": 0.12475378811359406, "loss/reg": 0.005253734532743692, "step": 608 }, { "epoch": 0.076125, "grad_norm": 2.265080451965332, "grad_norm_var": 0.34931259933064945, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.5041847229003906, "loss/hidden": 0.9375, "loss/logits": 0.16390517354011536, "loss/reg": 0.005251840688288212, "step": 609 }, { "epoch": 0.07625, "grad_norm": 2.0803020000457764, "grad_norm_var": 0.3625360811458743, "learning_rate": 0.0001, "loss": 1.0503, "loss/crossentropy": 2.504490375518799, "loss/hidden": 0.84765625, "loss/logits": 0.15017710626125336, "loss/reg": 0.0052499608136713505, "step": 610 }, { "epoch": 0.076375, "grad_norm": 2.5562875270843506, "grad_norm_var": 0.3604403500297356, "learning_rate": 0.0001, "loss": 1.0444, "loss/crossentropy": 2.462942361831665, "loss/hidden": 0.82421875, "loss/logits": 0.16772450506687164, "loss/reg": 0.005247869063168764, "step": 611 }, { "epoch": 0.0765, "grad_norm": 2.2976746559143066, "grad_norm_var": 0.3509101683634808, "learning_rate": 0.0001, "loss": 1.0751, "loss/crossentropy": 2.423419713973999, "loss/hidden": 0.87890625, "loss/logits": 0.1437685340642929, "loss/reg": 0.005245808511972427, "step": 612 }, { "epoch": 0.076625, "grad_norm": 2.0925452709198, "grad_norm_var": 0.368735612720054, "learning_rate": 0.0001, "loss": 1.0097, "loss/crossentropy": 2.422513961791992, "loss/hidden": 0.80078125, "loss/logits": 0.15644872188568115, "loss/reg": 0.0052436222322285175, "step": 613 }, { "epoch": 0.07675, "grad_norm": 3.361826181411743, "grad_norm_var": 0.3727533997750771, "learning_rate": 0.0001, "loss": 1.0847, "loss/crossentropy": 2.6778650283813477, "loss/hidden": 0.88671875, "loss/logits": 0.14552772045135498, "loss/reg": 0.005241374485194683, "step": 614 }, { "epoch": 0.076875, "grad_norm": 2.418203353881836, "grad_norm_var": 0.29779963975303353, "learning_rate": 0.0001, "loss": 0.9395, "loss/crossentropy": 2.5650947093963623, "loss/hidden": 0.765625, "loss/logits": 0.12145733833312988, "loss/reg": 0.005239336285740137, "step": 615 }, { "epoch": 0.077, "grad_norm": 2.0879790782928467, "grad_norm_var": 0.3152329983661199, "learning_rate": 0.0001, "loss": 1.0952, "loss/crossentropy": 2.393650531768799, "loss/hidden": 0.8828125, "loss/logits": 0.15999376773834229, "loss/reg": 0.005237067583948374, "step": 616 }, { "epoch": 0.077125, "grad_norm": 2.236255168914795, "grad_norm_var": 0.1669205481679067, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.7451670169830322, "loss/hidden": 0.796875, "loss/logits": 0.1478239744901657, "loss/reg": 0.005235039163380861, "step": 617 }, { "epoch": 0.07725, "grad_norm": 3.4291763305664062, "grad_norm_var": 0.2229381174587303, "learning_rate": 0.0001, "loss": 1.4063, "loss/crossentropy": 2.2041330337524414, "loss/hidden": 1.1484375, "loss/logits": 0.2055673450231552, "loss/reg": 0.005233013071119785, "step": 618 }, { "epoch": 0.077375, "grad_norm": 2.1689059734344482, "grad_norm_var": 0.19023093415819758, "learning_rate": 0.0001, "loss": 1.1399, "loss/crossentropy": 2.3984005451202393, "loss/hidden": 0.9296875, "loss/logits": 0.1579177975654602, "loss/reg": 0.0052308449521660805, "step": 619 }, { "epoch": 0.0775, "grad_norm": 2.2414422035217285, "grad_norm_var": 0.1935046668737487, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.448946475982666, "loss/hidden": 0.81640625, "loss/logits": 0.12831541895866394, "loss/reg": 0.005228678695857525, "step": 620 }, { "epoch": 0.077625, "grad_norm": 2.2024965286254883, "grad_norm_var": 0.17639827374390885, "learning_rate": 0.0001, "loss": 1.1341, "loss/crossentropy": 2.475011110305786, "loss/hidden": 0.9140625, "loss/logits": 0.16773179173469543, "loss/reg": 0.005226653069257736, "step": 621 }, { "epoch": 0.07775, "grad_norm": 3.0845813751220703, "grad_norm_var": 0.20461207914334617, "learning_rate": 0.0001, "loss": 1.2647, "loss/crossentropy": 2.6116580963134766, "loss/hidden": 1.0234375, "loss/logits": 0.188987135887146, "loss/reg": 0.005224402993917465, "step": 622 }, { "epoch": 0.077875, "grad_norm": 1.887999415397644, "grad_norm_var": 0.2208956692966997, "learning_rate": 0.0001, "loss": 1.0901, "loss/crossentropy": 2.1740164756774902, "loss/hidden": 0.87890625, "loss/logits": 0.15901124477386475, "loss/reg": 0.005222304258495569, "step": 623 }, { "epoch": 0.078, "grad_norm": 2.436877489089966, "grad_norm_var": 0.2130556319156203, "learning_rate": 0.0001, "loss": 1.1171, "loss/crossentropy": 2.405428647994995, "loss/hidden": 0.9140625, "loss/logits": 0.1507887840270996, "loss/reg": 0.005220047663897276, "step": 624 }, { "epoch": 0.078125, "grad_norm": 2.1241559982299805, "grad_norm_var": 0.21735767872165226, "learning_rate": 0.0001, "loss": 1.0586, "loss/crossentropy": 2.534162759780884, "loss/hidden": 0.84765625, "loss/logits": 0.1587330847978592, "loss/reg": 0.005217918660491705, "step": 625 }, { "epoch": 0.07825, "grad_norm": 3.6194941997528076, "grad_norm_var": 0.2958829671732156, "learning_rate": 0.0001, "loss": 1.1975, "loss/crossentropy": 2.4529123306274414, "loss/hidden": 0.953125, "loss/logits": 0.1922522485256195, "loss/reg": 0.005215668119490147, "step": 626 }, { "epoch": 0.078375, "grad_norm": 2.9666078090667725, "grad_norm_var": 0.3086442760245996, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.2351934909820557, "loss/hidden": 1.1015625, "loss/logits": 0.2085917890071869, "loss/reg": 0.005213598720729351, "step": 627 }, { "epoch": 0.0785, "grad_norm": 3.424123764038086, "grad_norm_var": 0.35140186017642155, "learning_rate": 0.0001, "loss": 1.0209, "loss/crossentropy": 2.593291997909546, "loss/hidden": 0.8359375, "loss/logits": 0.13284316658973694, "loss/reg": 0.005211306270211935, "step": 628 }, { "epoch": 0.078625, "grad_norm": 3.1431374549865723, "grad_norm_var": 0.34770286145400553, "learning_rate": 0.0001, "loss": 1.1602, "loss/crossentropy": 2.5281360149383545, "loss/hidden": 0.9375, "loss/logits": 0.1706121563911438, "loss/reg": 0.005208863411098719, "step": 629 }, { "epoch": 0.07875, "grad_norm": 2.7911853790283203, "grad_norm_var": 0.315955495515612, "learning_rate": 0.0001, "loss": 1.2247, "loss/crossentropy": 1.7649813890457153, "loss/hidden": 0.99609375, "loss/logits": 0.17649176716804504, "loss/reg": 0.005206458270549774, "step": 630 }, { "epoch": 0.078875, "grad_norm": 2.941204309463501, "grad_norm_var": 0.3174858804582356, "learning_rate": 0.0001, "loss": 1.1613, "loss/crossentropy": 2.402109146118164, "loss/hidden": 0.921875, "loss/logits": 0.1873561143875122, "loss/reg": 0.0052040074951946735, "step": 631 }, { "epoch": 0.079, "grad_norm": 2.391481876373291, "grad_norm_var": 0.2995243667524064, "learning_rate": 0.0001, "loss": 1.178, "loss/crossentropy": 2.353907346725464, "loss/hidden": 0.9609375, "loss/logits": 0.165082648396492, "loss/reg": 0.005201911553740501, "step": 632 }, { "epoch": 0.079125, "grad_norm": 2.87488055229187, "grad_norm_var": 0.2861166812267043, "learning_rate": 0.0001, "loss": 1.1462, "loss/crossentropy": 2.653827667236328, "loss/hidden": 0.91796875, "loss/logits": 0.17622330784797668, "loss/reg": 0.00519942119717598, "step": 633 }, { "epoch": 0.07925, "grad_norm": 2.4115476608276367, "grad_norm_var": 0.2563777078404484, "learning_rate": 0.0001, "loss": 1.1225, "loss/crossentropy": 2.4889907836914062, "loss/hidden": 0.8984375, "loss/logits": 0.1720612496137619, "loss/reg": 0.005197320133447647, "step": 634 }, { "epoch": 0.079375, "grad_norm": 2.4616434574127197, "grad_norm_var": 0.24219922325532028, "learning_rate": 0.0001, "loss": 1.1805, "loss/crossentropy": 2.3522822856903076, "loss/hidden": 0.94140625, "loss/logits": 0.1871228963136673, "loss/reg": 0.005195194855332375, "step": 635 }, { "epoch": 0.0795, "grad_norm": 2.386276960372925, "grad_norm_var": 0.23489288483797985, "learning_rate": 0.0001, "loss": 0.9463, "loss/crossentropy": 2.584338426589966, "loss/hidden": 0.765625, "loss/logits": 0.12873858213424683, "loss/reg": 0.005192761775106192, "step": 636 }, { "epoch": 0.079625, "grad_norm": 2.454456090927124, "grad_norm_var": 0.22225700139133117, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.4049668312072754, "loss/hidden": 0.80859375, "loss/logits": 0.15936976671218872, "loss/reg": 0.005190614145249128, "step": 637 }, { "epoch": 0.07975, "grad_norm": 2.1882073879241943, "grad_norm_var": 0.22800243516591642, "learning_rate": 0.0001, "loss": 1.1182, "loss/crossentropy": 2.5147759914398193, "loss/hidden": 0.92578125, "loss/logits": 0.14053833484649658, "loss/reg": 0.005188319832086563, "step": 638 }, { "epoch": 0.079875, "grad_norm": 2.214505434036255, "grad_norm_var": 0.20121127216839246, "learning_rate": 0.0001, "loss": 0.9821, "loss/crossentropy": 2.6784555912017822, "loss/hidden": 0.79296875, "loss/logits": 0.13726986944675446, "loss/reg": 0.0051859593950212, "step": 639 }, { "epoch": 0.08, "grad_norm": 2.8519279956817627, "grad_norm_var": 0.19869721717550323, "learning_rate": 0.0001, "loss": 1.3825, "loss/crossentropy": 2.4976460933685303, "loss/hidden": 1.1171875, "loss/logits": 0.21343019604682922, "loss/reg": 0.005183514207601547, "step": 640 }, { "epoch": 0.080125, "grad_norm": 3.2747607231140137, "grad_norm_var": 0.1926680012221444, "learning_rate": 0.0001, "loss": 1.0585, "loss/crossentropy": 2.265321969985962, "loss/hidden": 0.83203125, "loss/logits": 0.174637109041214, "loss/reg": 0.005180996377021074, "step": 641 }, { "epoch": 0.08025, "grad_norm": 2.433096408843994, "grad_norm_var": 0.14700668719525894, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.4713757038116455, "loss/hidden": 0.83984375, "loss/logits": 0.15003418922424316, "loss/reg": 0.005178460851311684, "step": 642 }, { "epoch": 0.080375, "grad_norm": 3.037181854248047, "grad_norm_var": 0.149821407729875, "learning_rate": 0.0001, "loss": 1.2828, "loss/crossentropy": 2.387122631072998, "loss/hidden": 1.0625, "loss/logits": 0.16852089762687683, "loss/reg": 0.005175705999135971, "step": 643 }, { "epoch": 0.0805, "grad_norm": 4.6729350090026855, "grad_norm_var": 0.3670359647179557, "learning_rate": 0.0001, "loss": 1.4674, "loss/crossentropy": 1.862856149673462, "loss/hidden": 1.21875, "loss/logits": 0.196872740983963, "loss/reg": 0.005172953009605408, "step": 644 }, { "epoch": 0.080625, "grad_norm": 3.1887784004211426, "grad_norm_var": 0.3693575970723629, "learning_rate": 0.0001, "loss": 1.3902, "loss/crossentropy": 2.418116569519043, "loss/hidden": 1.0703125, "loss/logits": 0.26818597316741943, "loss/reg": 0.005170162301510572, "step": 645 }, { "epoch": 0.08075, "grad_norm": 2.502976179122925, "grad_norm_var": 0.37434523124654007, "learning_rate": 0.0001, "loss": 0.9914, "loss/crossentropy": 2.666684865951538, "loss/hidden": 0.796875, "loss/logits": 0.14282414317131042, "loss/reg": 0.005168135743588209, "step": 646 }, { "epoch": 0.080875, "grad_norm": 3.457416534423828, "grad_norm_var": 0.4029304846601008, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.6663055419921875, "loss/hidden": 0.8671875, "loss/logits": 0.18181806802749634, "loss/reg": 0.005165606737136841, "step": 647 }, { "epoch": 0.081, "grad_norm": 3.3614838123321533, "grad_norm_var": 0.40888510034567177, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.758859157562256, "loss/hidden": 0.97265625, "loss/logits": 0.15888546407222748, "loss/reg": 0.005163096822798252, "step": 648 }, { "epoch": 0.081125, "grad_norm": 3.84016752243042, "grad_norm_var": 0.46893935653157826, "learning_rate": 0.0001, "loss": 1.1826, "loss/crossentropy": 2.6556475162506104, "loss/hidden": 0.91796875, "loss/logits": 0.21306458115577698, "loss/reg": 0.00516059435904026, "step": 649 }, { "epoch": 0.08125, "grad_norm": 3.2409677505493164, "grad_norm_var": 0.45558605122396995, "learning_rate": 0.0001, "loss": 1.3848, "loss/crossentropy": 2.2988944053649902, "loss/hidden": 1.078125, "loss/logits": 0.25512105226516724, "loss/reg": 0.005158509127795696, "step": 650 }, { "epoch": 0.081375, "grad_norm": 5.813977241516113, "grad_norm_var": 0.9294389115085245, "learning_rate": 0.0001, "loss": 1.3714, "loss/crossentropy": 2.1522276401519775, "loss/hidden": 1.09375, "loss/logits": 0.22606953978538513, "loss/reg": 0.005156443454325199, "step": 651 }, { "epoch": 0.0815, "grad_norm": 3.5314903259277344, "grad_norm_var": 0.8898375889113737, "learning_rate": 0.0001, "loss": 1.2683, "loss/crossentropy": 2.523782253265381, "loss/hidden": 1.015625, "loss/logits": 0.2011091113090515, "loss/reg": 0.00515406858175993, "step": 652 }, { "epoch": 0.081625, "grad_norm": 2.5944650173187256, "grad_norm_var": 0.8761365904132077, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.3588359355926514, "loss/hidden": 0.921875, "loss/logits": 0.16150620579719543, "loss/reg": 0.005151691380888224, "step": 653 }, { "epoch": 0.08175, "grad_norm": 3.0321786403656006, "grad_norm_var": 0.7997344400314499, "learning_rate": 0.0001, "loss": 1.2751, "loss/crossentropy": 2.35185170173645, "loss/hidden": 1.046875, "loss/logits": 0.17669130861759186, "loss/reg": 0.005149615928530693, "step": 654 }, { "epoch": 0.081875, "grad_norm": 3.2158820629119873, "grad_norm_var": 0.7154026962141908, "learning_rate": 0.0001, "loss": 1.0974, "loss/crossentropy": 2.809157133102417, "loss/hidden": 0.8984375, "loss/logits": 0.14747856557369232, "loss/reg": 0.005147217772901058, "step": 655 }, { "epoch": 0.082, "grad_norm": 2.1148674488067627, "grad_norm_var": 0.8010662785463902, "learning_rate": 0.0001, "loss": 1.056, "loss/crossentropy": 2.5246095657348633, "loss/hidden": 0.84765625, "loss/logits": 0.1568629890680313, "loss/reg": 0.005145091563463211, "step": 656 }, { "epoch": 0.082125, "grad_norm": 2.541887044906616, "grad_norm_var": 0.8402323056924367, "learning_rate": 0.0001, "loss": 1.2087, "loss/crossentropy": 2.3598315715789795, "loss/hidden": 1.0, "loss/logits": 0.15723757445812225, "loss/reg": 0.005142755340784788, "step": 657 }, { "epoch": 0.08225, "grad_norm": 2.292616605758667, "grad_norm_var": 0.8574455385669723, "learning_rate": 0.0001, "loss": 1.0515, "loss/crossentropy": 2.5242886543273926, "loss/hidden": 0.84375, "loss/logits": 0.15634778141975403, "loss/reg": 0.005140629597008228, "step": 658 }, { "epoch": 0.082375, "grad_norm": 3.6106507778167725, "grad_norm_var": 0.8596278513525417, "learning_rate": 0.0001, "loss": 1.3178, "loss/crossentropy": 2.6528077125549316, "loss/hidden": 1.09375, "loss/logits": 0.17268945276737213, "loss/reg": 0.005138530861586332, "step": 659 }, { "epoch": 0.0825, "grad_norm": 2.4270260334014893, "grad_norm_var": 0.7677345681069748, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.5859150886535645, "loss/hidden": 0.875, "loss/logits": 0.14567336440086365, "loss/reg": 0.005136391613632441, "step": 660 }, { "epoch": 0.082625, "grad_norm": 10.746210098266602, "grad_norm_var": 4.3533807562107985, "learning_rate": 0.0001, "loss": 1.3555, "loss/crossentropy": 2.6105966567993164, "loss/hidden": 1.09375, "loss/logits": 0.21044138073921204, "loss/reg": 0.005134167615324259, "step": 661 }, { "epoch": 0.08275, "grad_norm": 2.277845621109009, "grad_norm_var": 4.3908370843377496, "learning_rate": 0.0001, "loss": 1.1244, "loss/crossentropy": 2.624403715133667, "loss/hidden": 0.9140625, "loss/logits": 0.15903490781784058, "loss/reg": 0.005131968762725592, "step": 662 }, { "epoch": 0.082875, "grad_norm": 2.2439072132110596, "grad_norm_var": 4.510992587376572, "learning_rate": 0.0001, "loss": 1.1974, "loss/crossentropy": 2.325004816055298, "loss/hidden": 0.96875, "loss/logits": 0.1773640513420105, "loss/reg": 0.005129888188093901, "step": 663 }, { "epoch": 0.083, "grad_norm": 2.5274457931518555, "grad_norm_var": 4.57602786514919, "learning_rate": 0.0001, "loss": 1.0481, "loss/crossentropy": 2.7819478511810303, "loss/hidden": 0.8515625, "loss/logits": 0.1452445089817047, "loss/reg": 0.005127874203026295, "step": 664 }, { "epoch": 0.083125, "grad_norm": 2.381653308868408, "grad_norm_var": 4.643456939433319, "learning_rate": 0.0001, "loss": 1.0594, "loss/crossentropy": 2.476351737976074, "loss/hidden": 0.859375, "loss/logits": 0.14875781536102295, "loss/reg": 0.005125833675265312, "step": 665 }, { "epoch": 0.08325, "grad_norm": 2.565531015396118, "grad_norm_var": 4.687379253455119, "learning_rate": 0.0001, "loss": 1.092, "loss/crossentropy": 2.164882183074951, "loss/hidden": 0.88671875, "loss/logits": 0.15401628613471985, "loss/reg": 0.005123757291585207, "step": 666 }, { "epoch": 0.083375, "grad_norm": 2.686464309692383, "grad_norm_var": 4.279508443254811, "learning_rate": 0.0001, "loss": 1.1857, "loss/crossentropy": 2.49807071685791, "loss/hidden": 0.9609375, "loss/logits": 0.17354023456573486, "loss/reg": 0.005121580790728331, "step": 667 }, { "epoch": 0.0835, "grad_norm": 2.211970806121826, "grad_norm_var": 4.325501093334068, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.3563051223754883, "loss/hidden": 0.80078125, "loss/logits": 0.14245735108852386, "loss/reg": 0.005119378212839365, "step": 668 }, { "epoch": 0.083625, "grad_norm": 40.70305252075195, "grad_norm_var": 92.56442532718542, "learning_rate": 0.0001, "loss": 1.1762, "loss/crossentropy": 2.488436460494995, "loss/hidden": 0.9453125, "loss/logits": 0.1797066330909729, "loss/reg": 0.0051171439699828625, "step": 669 }, { "epoch": 0.08375, "grad_norm": 2.748046875, "grad_norm_var": 92.66196615048598, "learning_rate": 0.0001, "loss": 1.1097, "loss/crossentropy": 2.310842990875244, "loss/hidden": 0.91796875, "loss/logits": 0.14054188132286072, "loss/reg": 0.0051149362698197365, "step": 670 }, { "epoch": 0.083875, "grad_norm": 3.018019199371338, "grad_norm_var": 92.72350960683757, "learning_rate": 0.0001, "loss": 1.1882, "loss/crossentropy": 2.5257725715637207, "loss/hidden": 0.9921875, "loss/logits": 0.144926518201828, "loss/reg": 0.005112735088914633, "step": 671 }, { "epoch": 0.084, "grad_norm": 3.2833805084228516, "grad_norm_var": 92.29023014918404, "learning_rate": 0.0001, "loss": 1.3506, "loss/crossentropy": 2.737273693084717, "loss/hidden": 1.0703125, "loss/logits": 0.22916561365127563, "loss/reg": 0.005110514350235462, "step": 672 }, { "epoch": 0.084125, "grad_norm": 3.5235512256622314, "grad_norm_var": 91.96110241564834, "learning_rate": 0.0001, "loss": 1.1667, "loss/crossentropy": 2.603550434112549, "loss/hidden": 0.97265625, "loss/logits": 0.14292669296264648, "loss/reg": 0.00510829733684659, "step": 673 }, { "epoch": 0.08425, "grad_norm": 8.889531135559082, "grad_norm_var": 91.7913062331738, "learning_rate": 0.0001, "loss": 1.6779, "loss/crossentropy": 2.7345638275146484, "loss/hidden": 1.25, "loss/logits": 0.37679582834243774, "loss/reg": 0.005105969030410051, "step": 674 }, { "epoch": 0.084375, "grad_norm": 3.833319664001465, "grad_norm_var": 91.72375618009856, "learning_rate": 0.0001, "loss": 1.1576, "loss/crossentropy": 2.670295000076294, "loss/hidden": 0.91015625, "loss/logits": 0.19638602435588837, "loss/reg": 0.005103633739054203, "step": 675 }, { "epoch": 0.0845, "grad_norm": 13.332308769226074, "grad_norm_var": 93.95525708687954, "learning_rate": 0.0001, "loss": 1.2802, "loss/crossentropy": 2.4972240924835205, "loss/hidden": 1.046875, "loss/logits": 0.18228942155838013, "loss/reg": 0.005101518705487251, "step": 676 }, { "epoch": 0.084625, "grad_norm": 3.3041481971740723, "grad_norm_var": 93.38769696489507, "learning_rate": 0.0001, "loss": 1.1323, "loss/crossentropy": 2.318303108215332, "loss/hidden": 0.92578125, "loss/logits": 0.15550082921981812, "loss/reg": 0.005099330097436905, "step": 677 }, { "epoch": 0.08475, "grad_norm": 3.0213379859924316, "grad_norm_var": 93.03138783085473, "learning_rate": 0.0001, "loss": 1.2046, "loss/crossentropy": 2.330695629119873, "loss/hidden": 0.921875, "loss/logits": 0.23171411454677582, "loss/reg": 0.005097060929983854, "step": 678 }, { "epoch": 0.084875, "grad_norm": 2.5998549461364746, "grad_norm_var": 92.84836678832802, "learning_rate": 0.0001, "loss": 1.0649, "loss/crossentropy": 2.699117422103882, "loss/hidden": 0.859375, "loss/logits": 0.15458270907402039, "loss/reg": 0.005094949621707201, "step": 679 }, { "epoch": 0.085, "grad_norm": 2.244635581970215, "grad_norm_var": 92.99521966737969, "learning_rate": 0.0001, "loss": 1.1168, "loss/crossentropy": 2.572654962539673, "loss/hidden": 0.90234375, "loss/logits": 0.1635233759880066, "loss/reg": 0.005092862527817488, "step": 680 }, { "epoch": 0.085125, "grad_norm": 2.6163716316223145, "grad_norm_var": 92.87692169982786, "learning_rate": 0.0001, "loss": 1.1651, "loss/crossentropy": 2.735013961791992, "loss/hidden": 0.93359375, "loss/logits": 0.18059232831001282, "loss/reg": 0.005090588703751564, "step": 681 }, { "epoch": 0.08525, "grad_norm": 2.5760252475738525, "grad_norm_var": 92.8717223043897, "learning_rate": 0.0001, "loss": 0.9637, "loss/crossentropy": 2.4715609550476074, "loss/hidden": 0.7890625, "loss/logits": 0.1237054169178009, "loss/reg": 0.0050884694792330265, "step": 682 }, { "epoch": 0.085375, "grad_norm": 2.3052964210510254, "grad_norm_var": 93.06379073504952, "learning_rate": 0.0001, "loss": 1.1353, "loss/crossentropy": 2.2887136936187744, "loss/hidden": 0.92578125, "loss/logits": 0.15868628025054932, "loss/reg": 0.005086386110633612, "step": 683 }, { "epoch": 0.0855, "grad_norm": 2.239668130874634, "grad_norm_var": 93.04887766727983, "learning_rate": 0.0001, "loss": 1.033, "loss/crossentropy": 2.420260429382324, "loss/hidden": 0.84765625, "loss/logits": 0.13450753688812256, "loss/reg": 0.005084337200969458, "step": 684 }, { "epoch": 0.085625, "grad_norm": 2.7788403034210205, "grad_norm_var": 8.800650863003963, "learning_rate": 0.0001, "loss": 1.2414, "loss/crossentropy": 2.505138397216797, "loss/hidden": 1.03125, "loss/logits": 0.15931686758995056, "loss/reg": 0.005082385148853064, "step": 685 }, { "epoch": 0.08575, "grad_norm": 2.274430513381958, "grad_norm_var": 8.887076805039055, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.24831485748291, "loss/hidden": 0.83984375, "loss/logits": 0.14174425601959229, "loss/reg": 0.00508028594776988, "step": 686 }, { "epoch": 0.085875, "grad_norm": 2.8570923805236816, "grad_norm_var": 8.906869950057784, "learning_rate": 0.0001, "loss": 1.2426, "loss/crossentropy": 2.1708295345306396, "loss/hidden": 1.0078125, "loss/logits": 0.18402233719825745, "loss/reg": 0.005078236572444439, "step": 687 }, { "epoch": 0.086, "grad_norm": 2.6168949604034424, "grad_norm_var": 8.985428302339566, "learning_rate": 0.0001, "loss": 1.1911, "loss/crossentropy": 2.4759016036987305, "loss/hidden": 0.97265625, "loss/logits": 0.1677204668521881, "loss/reg": 0.005076236091554165, "step": 688 }, { "epoch": 0.086125, "grad_norm": 2.738102674484253, "grad_norm_var": 9.054334077972502, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.4160187244415283, "loss/hidden": 0.94140625, "loss/logits": 0.1464519500732422, "loss/reg": 0.005074144806712866, "step": 689 }, { "epoch": 0.08625, "grad_norm": 2.7573044300079346, "grad_norm_var": 7.214004841898908, "learning_rate": 0.0001, "loss": 1.1327, "loss/crossentropy": 2.342536449432373, "loss/hidden": 0.9296875, "loss/logits": 0.15226896107196808, "loss/reg": 0.0050718653947114944, "step": 690 }, { "epoch": 0.086375, "grad_norm": 2.4906835556030273, "grad_norm_var": 7.245694276683736, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.6861307621002197, "loss/hidden": 0.84765625, "loss/logits": 0.14462026953697205, "loss/reg": 0.005069798789918423, "step": 691 }, { "epoch": 0.0865, "grad_norm": 2.3750221729278564, "grad_norm_var": 0.08836772564593408, "learning_rate": 0.0001, "loss": 1.1331, "loss/crossentropy": 2.5460920333862305, "loss/hidden": 0.91796875, "loss/logits": 0.1644265055656433, "loss/reg": 0.0050675952807068825, "step": 692 }, { "epoch": 0.086625, "grad_norm": 2.2382612228393555, "grad_norm_var": 0.06104096205675281, "learning_rate": 0.0001, "loss": 1.1182, "loss/crossentropy": 2.5386240482330322, "loss/hidden": 0.9140625, "loss/logits": 0.15351390838623047, "loss/reg": 0.005065726116299629, "step": 693 }, { "epoch": 0.08675, "grad_norm": 2.582509994506836, "grad_norm_var": 0.04524178053572901, "learning_rate": 0.0001, "loss": 1.1349, "loss/crossentropy": 2.5054309368133545, "loss/hidden": 0.92578125, "loss/logits": 0.15845400094985962, "loss/reg": 0.00506393238902092, "step": 694 }, { "epoch": 0.086875, "grad_norm": 3.3852474689483643, "grad_norm_var": 0.09234654068112012, "learning_rate": 0.0001, "loss": 1.375, "loss/crossentropy": 2.463137626647949, "loss/hidden": 1.09375, "loss/logits": 0.23065921664237976, "loss/reg": 0.005062177777290344, "step": 695 }, { "epoch": 0.087, "grad_norm": 2.7022039890289307, "grad_norm_var": 0.085748197103725, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.2784640789031982, "loss/hidden": 0.953125, "loss/logits": 0.17874625325202942, "loss/reg": 0.005060084629803896, "step": 696 }, { "epoch": 0.087125, "grad_norm": 3.218095064163208, "grad_norm_var": 0.11002230581329756, "learning_rate": 0.0001, "loss": 1.4532, "loss/crossentropy": 2.372589111328125, "loss/hidden": 1.1875, "loss/logits": 0.21512824296951294, "loss/reg": 0.005058267153799534, "step": 697 }, { "epoch": 0.08725, "grad_norm": 2.2941925525665283, "grad_norm_var": 0.11714567363764346, "learning_rate": 0.0001, "loss": 1.2103, "loss/crossentropy": 2.2349698543548584, "loss/hidden": 0.99609375, "loss/logits": 0.16368569433689117, "loss/reg": 0.005056225229054689, "step": 698 }, { "epoch": 0.087375, "grad_norm": 2.4463765621185303, "grad_norm_var": 0.11254763430842919, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.49548077583313, "loss/hidden": 0.84375, "loss/logits": 0.13410091400146484, "loss/reg": 0.005054513458162546, "step": 699 }, { "epoch": 0.0875, "grad_norm": 2.5363550186157227, "grad_norm_var": 0.1028185685472406, "learning_rate": 0.0001, "loss": 0.9756, "loss/crossentropy": 2.900705099105835, "loss/hidden": 0.7890625, "loss/logits": 0.13605856895446777, "loss/reg": 0.005052678752690554, "step": 700 }, { "epoch": 0.087625, "grad_norm": 3.4383292198181152, "grad_norm_var": 0.1419262550473822, "learning_rate": 0.0001, "loss": 1.2789, "loss/crossentropy": 2.239861488342285, "loss/hidden": 1.0390625, "loss/logits": 0.18931907415390015, "loss/reg": 0.005051023792475462, "step": 701 }, { "epoch": 0.08775, "grad_norm": 2.1923646926879883, "grad_norm_var": 0.14683359089866196, "learning_rate": 0.0001, "loss": 1.0723, "loss/crossentropy": 2.5594210624694824, "loss/hidden": 0.8515625, "loss/logits": 0.17024339735507965, "loss/reg": 0.005049179773777723, "step": 702 }, { "epoch": 0.087875, "grad_norm": 2.492584466934204, "grad_norm_var": 0.1464975365420211, "learning_rate": 0.0001, "loss": 1.0832, "loss/crossentropy": 2.549513339996338, "loss/hidden": 0.890625, "loss/logits": 0.1421511173248291, "loss/reg": 0.005047108978033066, "step": 703 }, { "epoch": 0.088, "grad_norm": 3.04917311668396, "grad_norm_var": 0.15589194049567348, "learning_rate": 0.0001, "loss": 0.9286, "loss/crossentropy": 2.3838376998901367, "loss/hidden": 0.7578125, "loss/logits": 0.12035049498081207, "loss/reg": 0.005045315716415644, "step": 704 }, { "epoch": 0.088125, "grad_norm": 3.7284188270568848, "grad_norm_var": 0.22439052206896856, "learning_rate": 0.0001, "loss": 1.2283, "loss/crossentropy": 2.4002139568328857, "loss/hidden": 1.0, "loss/logits": 0.17786875367164612, "loss/reg": 0.005043353885412216, "step": 705 }, { "epoch": 0.08825, "grad_norm": 2.3665406703948975, "grad_norm_var": 0.2333161514143832, "learning_rate": 0.0001, "loss": 1.0864, "loss/crossentropy": 2.2537431716918945, "loss/hidden": 0.88671875, "loss/logits": 0.14927825331687927, "loss/reg": 0.005041591357439756, "step": 706 }, { "epoch": 0.088375, "grad_norm": 2.461461067199707, "grad_norm_var": 0.23426700013735413, "learning_rate": 0.0001, "loss": 0.9665, "loss/crossentropy": 2.267686605453491, "loss/hidden": 0.78125, "loss/logits": 0.1348324567079544, "loss/reg": 0.005039647221565247, "step": 707 }, { "epoch": 0.0885, "grad_norm": 2.219465494155884, "grad_norm_var": 0.24291783945608714, "learning_rate": 0.0001, "loss": 1.0682, "loss/crossentropy": 2.5170199871063232, "loss/hidden": 0.8828125, "loss/logits": 0.13497616350650787, "loss/reg": 0.0050375694409012794, "step": 708 }, { "epoch": 0.088625, "grad_norm": 2.5682785511016846, "grad_norm_var": 0.22899036593855726, "learning_rate": 0.0001, "loss": 1.1712, "loss/crossentropy": 2.3696398735046387, "loss/hidden": 0.95703125, "loss/logits": 0.16378942131996155, "loss/reg": 0.00503552844747901, "step": 709 }, { "epoch": 0.08875, "grad_norm": 2.2680654525756836, "grad_norm_var": 0.2413579176160397, "learning_rate": 0.0001, "loss": 1.1427, "loss/crossentropy": 2.4121482372283936, "loss/hidden": 0.9296875, "loss/logits": 0.16271916031837463, "loss/reg": 0.005033775232732296, "step": 710 }, { "epoch": 0.088875, "grad_norm": 7.707209587097168, "grad_norm_var": 1.7976793028734939, "learning_rate": 0.0001, "loss": 1.4757, "loss/crossentropy": 2.64532470703125, "loss/hidden": 1.1953125, "loss/logits": 0.23005220293998718, "loss/reg": 0.005031922832131386, "step": 711 }, { "epoch": 0.089, "grad_norm": 2.4962596893310547, "grad_norm_var": 1.8079738281494115, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.481624126434326, "loss/hidden": 0.96875, "loss/logits": 0.17317567765712738, "loss/reg": 0.005030201282352209, "step": 712 }, { "epoch": 0.089125, "grad_norm": 2.164900779724121, "grad_norm_var": 1.842137749294079, "learning_rate": 0.0001, "loss": 1.064, "loss/crossentropy": 2.228675365447998, "loss/hidden": 0.85546875, "loss/logits": 0.1582651436328888, "loss/reg": 0.005028109531849623, "step": 713 }, { "epoch": 0.08925, "grad_norm": 2.5871829986572266, "grad_norm_var": 1.8237636675870703, "learning_rate": 0.0001, "loss": 1.1265, "loss/crossentropy": 2.4661970138549805, "loss/hidden": 0.9140625, "loss/logits": 0.16222231090068817, "loss/reg": 0.005026375409215689, "step": 714 }, { "epoch": 0.089375, "grad_norm": 2.6158599853515625, "grad_norm_var": 1.814851924792763, "learning_rate": 0.0001, "loss": 1.2786, "loss/crossentropy": 2.4959585666656494, "loss/hidden": 1.0390625, "loss/logits": 0.18929770588874817, "loss/reg": 0.005024294834583998, "step": 715 }, { "epoch": 0.0895, "grad_norm": 1.9927250146865845, "grad_norm_var": 1.8619121365324653, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.471590518951416, "loss/hidden": 0.828125, "loss/logits": 0.15405681729316711, "loss/reg": 0.005022158846259117, "step": 716 }, { "epoch": 0.089625, "grad_norm": 2.2087814807891846, "grad_norm_var": 1.8676209281098621, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.4889180660247803, "loss/hidden": 0.83984375, "loss/logits": 0.14751726388931274, "loss/reg": 0.005020026583224535, "step": 717 }, { "epoch": 0.08975, "grad_norm": 2.185058116912842, "grad_norm_var": 1.8682356690613582, "learning_rate": 0.0001, "loss": 1.1229, "loss/crossentropy": 2.4008331298828125, "loss/hidden": 0.921875, "loss/logits": 0.15089130401611328, "loss/reg": 0.005017881281673908, "step": 718 }, { "epoch": 0.089875, "grad_norm": 2.22664737701416, "grad_norm_var": 1.8842476127138506, "learning_rate": 0.0001, "loss": 0.9793, "loss/crossentropy": 2.6462786197662354, "loss/hidden": 0.7890625, "loss/logits": 0.14010348916053772, "loss/reg": 0.0050159962847828865, "step": 719 }, { "epoch": 0.09, "grad_norm": 7.475221633911133, "grad_norm_var": 3.2539659864593964, "learning_rate": 0.0001, "loss": 1.2258, "loss/crossentropy": 2.450388193130493, "loss/hidden": 1.015625, "loss/logits": 0.16006067395210266, "loss/reg": 0.005013884510844946, "step": 720 }, { "epoch": 0.090125, "grad_norm": 2.1259288787841797, "grad_norm_var": 3.275813935195105, "learning_rate": 0.0001, "loss": 1.1558, "loss/crossentropy": 2.3211934566497803, "loss/hidden": 0.9375, "loss/logits": 0.1681801825761795, "loss/reg": 0.0050118486396968365, "step": 721 }, { "epoch": 0.09025, "grad_norm": 3.284715414047241, "grad_norm_var": 3.2534822002252284, "learning_rate": 0.0001, "loss": 1.227, "loss/crossentropy": 2.3303604125976562, "loss/hidden": 1.0, "loss/logits": 0.17691665887832642, "loss/reg": 0.005009867250919342, "step": 722 }, { "epoch": 0.090375, "grad_norm": 2.481712818145752, "grad_norm_var": 3.2519544593852943, "learning_rate": 0.0001, "loss": 1.0745, "loss/crossentropy": 2.516172409057617, "loss/hidden": 0.86328125, "loss/logits": 0.1610938012599945, "loss/reg": 0.005008057691156864, "step": 723 }, { "epoch": 0.0905, "grad_norm": 2.6934256553649902, "grad_norm_var": 3.2142672637689955, "learning_rate": 0.0001, "loss": 1.0241, "loss/crossentropy": 2.4445412158966064, "loss/hidden": 0.83203125, "loss/logits": 0.14205417037010193, "loss/reg": 0.0050062634982168674, "step": 724 }, { "epoch": 0.090625, "grad_norm": 2.8393290042877197, "grad_norm_var": 3.2008126847008653, "learning_rate": 0.0001, "loss": 1.4807, "loss/crossentropy": 2.155627489089966, "loss/hidden": 1.203125, "loss/logits": 0.22749567031860352, "loss/reg": 0.005004186183214188, "step": 725 }, { "epoch": 0.09075, "grad_norm": 2.7673983573913574, "grad_norm_var": 3.16203540734179, "learning_rate": 0.0001, "loss": 1.0853, "loss/crossentropy": 2.841604709625244, "loss/hidden": 0.859375, "loss/logits": 0.17592039704322815, "loss/reg": 0.005002181977033615, "step": 726 }, { "epoch": 0.090875, "grad_norm": 2.4957730770111084, "grad_norm_var": 1.6690794582387851, "learning_rate": 0.0001, "loss": 1.0616, "loss/crossentropy": 2.4109010696411133, "loss/hidden": 0.85546875, "loss/logits": 0.15614046156406403, "loss/reg": 0.005000332836061716, "step": 727 }, { "epoch": 0.091, "grad_norm": 2.2517614364624023, "grad_norm_var": 1.6823934112280023, "learning_rate": 0.0001, "loss": 0.9522, "loss/crossentropy": 2.5019116401672363, "loss/hidden": 0.77734375, "loss/logits": 0.12488029897212982, "loss/reg": 0.004998230375349522, "step": 728 }, { "epoch": 0.091125, "grad_norm": 3.1151885986328125, "grad_norm_var": 1.6615595314428224, "learning_rate": 0.0001, "loss": 1.3341, "loss/crossentropy": 2.0731663703918457, "loss/hidden": 1.140625, "loss/logits": 0.1435013860464096, "loss/reg": 0.00499630905687809, "step": 729 }, { "epoch": 0.09125, "grad_norm": 2.27622127532959, "grad_norm_var": 1.6778435468635766, "learning_rate": 0.0001, "loss": 0.9945, "loss/crossentropy": 2.5911362171173096, "loss/hidden": 0.8046875, "loss/logits": 0.13988272845745087, "loss/reg": 0.004994215443730354, "step": 730 }, { "epoch": 0.091375, "grad_norm": 2.634037971496582, "grad_norm_var": 1.6773821814765582, "learning_rate": 0.0001, "loss": 1.0821, "loss/crossentropy": 2.217550754547119, "loss/hidden": 0.8828125, "loss/logits": 0.14941135048866272, "loss/reg": 0.004992038011550903, "step": 731 }, { "epoch": 0.0915, "grad_norm": 3.696157693862915, "grad_norm_var": 1.6717809998289452, "learning_rate": 0.0001, "loss": 1.4666, "loss/crossentropy": 2.527979850769043, "loss/hidden": 1.1796875, "loss/logits": 0.23702046275138855, "loss/reg": 0.004989837761968374, "step": 732 }, { "epoch": 0.091625, "grad_norm": 2.2505931854248047, "grad_norm_var": 1.6679122787177638, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.539890766143799, "loss/hidden": 0.93359375, "loss/logits": 0.17588791251182556, "loss/reg": 0.004987762775272131, "step": 733 }, { "epoch": 0.09175, "grad_norm": 2.468350887298584, "grad_norm_var": 1.6449808034713405, "learning_rate": 0.0001, "loss": 0.9491, "loss/crossentropy": 2.3645944595336914, "loss/hidden": 0.77734375, "loss/logits": 0.12187166512012482, "loss/reg": 0.004985733889043331, "step": 734 }, { "epoch": 0.091875, "grad_norm": 2.4574854373931885, "grad_norm_var": 1.6262736490095788, "learning_rate": 0.0001, "loss": 0.9553, "loss/crossentropy": 2.5622363090515137, "loss/hidden": 0.78515625, "loss/logits": 0.12034176290035248, "loss/reg": 0.004983709193766117, "step": 735 }, { "epoch": 0.092, "grad_norm": 2.2961714267730713, "grad_norm_var": 0.18272698620191838, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.65541934967041, "loss/hidden": 0.8046875, "loss/logits": 0.13119381666183472, "loss/reg": 0.004981704521924257, "step": 736 }, { "epoch": 0.092125, "grad_norm": 1.9556196928024292, "grad_norm_var": 0.19606320022039506, "learning_rate": 0.0001, "loss": 1.1183, "loss/crossentropy": 2.4507250785827637, "loss/hidden": 0.90625, "loss/logits": 0.16227680444717407, "loss/reg": 0.004979623947292566, "step": 737 }, { "epoch": 0.09225, "grad_norm": 4.161533832550049, "grad_norm_var": 0.32150407886374516, "learning_rate": 0.0001, "loss": 1.2986, "loss/crossentropy": 2.07208514213562, "loss/hidden": 1.0703125, "loss/logits": 0.17855030298233032, "loss/reg": 0.0049775131046772, "step": 738 }, { "epoch": 0.092375, "grad_norm": 3.0142910480499268, "grad_norm_var": 0.3253252453994368, "learning_rate": 0.0001, "loss": 0.9659, "loss/crossentropy": 2.569051742553711, "loss/hidden": 0.78515625, "loss/logits": 0.13097813725471497, "loss/reg": 0.004975371062755585, "step": 739 }, { "epoch": 0.0925, "grad_norm": 2.2635934352874756, "grad_norm_var": 0.33787014856401254, "learning_rate": 0.0001, "loss": 1.0329, "loss/crossentropy": 2.4809088706970215, "loss/hidden": 0.83203125, "loss/logits": 0.15113815665245056, "loss/reg": 0.004973322618752718, "step": 740 }, { "epoch": 0.092625, "grad_norm": 2.633608818054199, "grad_norm_var": 0.33625377709689125, "learning_rate": 0.0001, "loss": 1.1759, "loss/crossentropy": 2.336703062057495, "loss/hidden": 0.953125, "loss/logits": 0.17310968041419983, "loss/reg": 0.004971369635313749, "step": 741 }, { "epoch": 0.09275, "grad_norm": 2.431776285171509, "grad_norm_var": 0.3389851198561174, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.4961469173431396, "loss/hidden": 0.8203125, "loss/logits": 0.15009689331054688, "loss/reg": 0.004969437140971422, "step": 742 }, { "epoch": 0.092875, "grad_norm": 2.756232976913452, "grad_norm_var": 0.3378643921182785, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.4552102088928223, "loss/hidden": 0.8515625, "loss/logits": 0.12634404003620148, "loss/reg": 0.0049674008041620255, "step": 743 }, { "epoch": 0.093, "grad_norm": 2.5648443698883057, "grad_norm_var": 0.32668128102186145, "learning_rate": 0.0001, "loss": 1.14, "loss/crossentropy": 2.811657667160034, "loss/hidden": 0.93359375, "loss/logits": 0.15678739547729492, "loss/reg": 0.0049654701724648476, "step": 744 }, { "epoch": 0.093125, "grad_norm": 2.283196210861206, "grad_norm_var": 0.32233157119037936, "learning_rate": 0.0001, "loss": 1.0877, "loss/crossentropy": 2.3052849769592285, "loss/hidden": 0.8828125, "loss/logits": 0.1552238166332245, "loss/reg": 0.004963380750268698, "step": 745 }, { "epoch": 0.09325, "grad_norm": 2.500383138656616, "grad_norm_var": 0.3147792588205774, "learning_rate": 0.0001, "loss": 1.0794, "loss/crossentropy": 2.6310482025146484, "loss/hidden": 0.88671875, "loss/logits": 0.14305052161216736, "loss/reg": 0.0049613784067332745, "step": 746 }, { "epoch": 0.093375, "grad_norm": 2.7470545768737793, "grad_norm_var": 0.3153672801439085, "learning_rate": 0.0001, "loss": 1.1538, "loss/crossentropy": 2.645195722579956, "loss/hidden": 0.921875, "loss/logits": 0.18234500288963318, "loss/reg": 0.004959197249263525, "step": 747 }, { "epoch": 0.0935, "grad_norm": 2.790817975997925, "grad_norm_var": 0.24092132942115865, "learning_rate": 0.0001, "loss": 1.015, "loss/crossentropy": 2.0862972736358643, "loss/hidden": 0.8359375, "loss/logits": 0.12952454388141632, "loss/reg": 0.004957180004566908, "step": 748 }, { "epoch": 0.093625, "grad_norm": 2.898916482925415, "grad_norm_var": 0.23711979067914365, "learning_rate": 0.0001, "loss": 1.0498, "loss/crossentropy": 2.522505760192871, "loss/hidden": 0.8515625, "loss/logits": 0.14867964386940002, "loss/reg": 0.004955058917403221, "step": 749 }, { "epoch": 0.09375, "grad_norm": 3.0828936100006104, "grad_norm_var": 0.24674152232076801, "learning_rate": 0.0001, "loss": 1.3487, "loss/crossentropy": 2.078137159347534, "loss/hidden": 1.0703125, "loss/logits": 0.2288488745689392, "loss/reg": 0.004952888935804367, "step": 750 }, { "epoch": 0.093875, "grad_norm": 4.464056491851807, "grad_norm_var": 0.439550102142455, "learning_rate": 0.0001, "loss": 1.125, "loss/crossentropy": 2.6811301708221436, "loss/hidden": 0.9140625, "loss/logits": 0.16146372258663177, "loss/reg": 0.004950782749801874, "step": 751 }, { "epoch": 0.094, "grad_norm": 2.9327120780944824, "grad_norm_var": 0.4218744680947139, "learning_rate": 0.0001, "loss": 1.1226, "loss/crossentropy": 2.502683639526367, "loss/hidden": 0.91015625, "loss/logits": 0.16296005249023438, "loss/reg": 0.004948711488395929, "step": 752 }, { "epoch": 0.094125, "grad_norm": 2.4569215774536133, "grad_norm_var": 0.3782952433457505, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.5091257095336914, "loss/hidden": 0.90234375, "loss/logits": 0.15988323092460632, "loss/reg": 0.004946760833263397, "step": 753 }, { "epoch": 0.09425, "grad_norm": 5.152282238006592, "grad_norm_var": 0.6097367248532388, "learning_rate": 0.0001, "loss": 1.3929, "loss/crossentropy": 2.2930901050567627, "loss/hidden": 1.1484375, "loss/logits": 0.19504866003990173, "loss/reg": 0.004944849293678999, "step": 754 }, { "epoch": 0.094375, "grad_norm": 2.604393243789673, "grad_norm_var": 0.6159506323654304, "learning_rate": 0.0001, "loss": 1.0457, "loss/crossentropy": 2.59228777885437, "loss/hidden": 0.8359375, "loss/logits": 0.16035211086273193, "loss/reg": 0.004942973144352436, "step": 755 }, { "epoch": 0.0945, "grad_norm": 2.197974443435669, "grad_norm_var": 0.6218773019698792, "learning_rate": 0.0001, "loss": 1.0755, "loss/crossentropy": 2.3389058113098145, "loss/hidden": 0.8828125, "loss/logits": 0.14329570531845093, "loss/reg": 0.004941044840961695, "step": 756 }, { "epoch": 0.094625, "grad_norm": 3.0741820335388184, "grad_norm_var": 0.6180001684099087, "learning_rate": 0.0001, "loss": 1.4422, "loss/crossentropy": 2.2101480960845947, "loss/hidden": 1.1796875, "loss/logits": 0.2130812704563141, "loss/reg": 0.004938756115734577, "step": 757 }, { "epoch": 0.09475, "grad_norm": 2.604829788208008, "grad_norm_var": 0.6082914113291829, "learning_rate": 0.0001, "loss": 1.1533, "loss/crossentropy": 2.431821584701538, "loss/hidden": 0.94140625, "loss/logits": 0.16252049803733826, "loss/reg": 0.0049363370053470135, "step": 758 }, { "epoch": 0.094875, "grad_norm": 2.824411630630493, "grad_norm_var": 0.606870668349819, "learning_rate": 0.0001, "loss": 1.2267, "loss/crossentropy": 2.3011207580566406, "loss/hidden": 0.97265625, "loss/logits": 0.20468328893184662, "loss/reg": 0.004933919291943312, "step": 759 }, { "epoch": 0.095, "grad_norm": 2.5809361934661865, "grad_norm_var": 0.606063171082104, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.5506176948547363, "loss/hidden": 0.8359375, "loss/logits": 0.13629919290542603, "loss/reg": 0.004931787494570017, "step": 760 }, { "epoch": 0.095125, "grad_norm": 4.444363117218018, "grad_norm_var": 0.7059078117077803, "learning_rate": 0.0001, "loss": 1.3498, "loss/crossentropy": 2.8235206604003906, "loss/hidden": 1.109375, "loss/logits": 0.191168874502182, "loss/reg": 0.0049294959753751755, "step": 761 }, { "epoch": 0.09525, "grad_norm": 2.169010639190674, "grad_norm_var": 0.7385929926525419, "learning_rate": 0.0001, "loss": 0.9763, "loss/crossentropy": 2.706693172454834, "loss/hidden": 0.78515625, "loss/logits": 0.1419064998626709, "loss/reg": 0.004927367437630892, "step": 762 }, { "epoch": 0.095375, "grad_norm": 2.4050183296203613, "grad_norm_var": 0.7603640408605048, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.622105836868286, "loss/hidden": 0.84765625, "loss/logits": 0.1461138278245926, "loss/reg": 0.004925237502902746, "step": 763 }, { "epoch": 0.0955, "grad_norm": 2.4372246265411377, "grad_norm_var": 0.7800550132454624, "learning_rate": 0.0001, "loss": 1.058, "loss/crossentropy": 2.4866652488708496, "loss/hidden": 0.86328125, "loss/logits": 0.14548787474632263, "loss/reg": 0.004923122003674507, "step": 764 }, { "epoch": 0.095625, "grad_norm": 2.517997980117798, "grad_norm_var": 0.7953055666315338, "learning_rate": 0.0001, "loss": 1.2213, "loss/crossentropy": 2.3578498363494873, "loss/hidden": 1.015625, "loss/logits": 0.1564468890428543, "loss/reg": 0.004920901730656624, "step": 765 }, { "epoch": 0.09575, "grad_norm": 2.278327226638794, "grad_norm_var": 0.8265305072858796, "learning_rate": 0.0001, "loss": 1.0466, "loss/crossentropy": 2.55232310295105, "loss/hidden": 0.84765625, "loss/logits": 0.14976537227630615, "loss/reg": 0.0049185301177203655, "step": 766 }, { "epoch": 0.095875, "grad_norm": 2.337240219116211, "grad_norm_var": 0.6789092499013821, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.672149658203125, "loss/hidden": 0.94921875, "loss/logits": 0.16301041841506958, "loss/reg": 0.004916144534945488, "step": 767 }, { "epoch": 0.096, "grad_norm": 8.908835411071777, "grad_norm_var": 3.005936619726153, "learning_rate": 0.0001, "loss": 0.9922, "loss/crossentropy": 2.3541464805603027, "loss/hidden": 0.8359375, "loss/logits": 0.10713944584131241, "loss/reg": 0.0049139889888465405, "step": 768 }, { "epoch": 0.096125, "grad_norm": 2.1774449348449707, "grad_norm_var": 3.0380281733161834, "learning_rate": 0.0001, "loss": 0.9611, "loss/crossentropy": 2.5367493629455566, "loss/hidden": 0.80078125, "loss/logits": 0.11117491126060486, "loss/reg": 0.00491185300052166, "step": 769 }, { "epoch": 0.09625, "grad_norm": 4.010209083557129, "grad_norm_var": 2.8176414116633506, "learning_rate": 0.0001, "loss": 1.6116, "loss/crossentropy": 2.4206178188323975, "loss/hidden": 1.2734375, "loss/logits": 0.28903743624687195, "loss/reg": 0.00490949209779501, "step": 770 }, { "epoch": 0.096375, "grad_norm": 2.3479456901550293, "grad_norm_var": 2.838639045972002, "learning_rate": 0.0001, "loss": 1.0775, "loss/crossentropy": 2.3782246112823486, "loss/hidden": 0.87109375, "loss/logits": 0.15734228491783142, "loss/reg": 0.004907363560050726, "step": 771 }, { "epoch": 0.0965, "grad_norm": 2.5986974239349365, "grad_norm_var": 2.801428785253172, "learning_rate": 0.0001, "loss": 1.0758, "loss/crossentropy": 2.4752702713012695, "loss/hidden": 0.890625, "loss/logits": 0.13615593314170837, "loss/reg": 0.0049048978835344315, "step": 772 }, { "epoch": 0.096625, "grad_norm": 2.3621203899383545, "grad_norm_var": 2.8362617972026043, "learning_rate": 0.0001, "loss": 0.9929, "loss/crossentropy": 2.6385865211486816, "loss/hidden": 0.80859375, "loss/logits": 0.13527554273605347, "loss/reg": 0.004902740474790335, "step": 773 }, { "epoch": 0.09675, "grad_norm": 2.2466084957122803, "grad_norm_var": 2.866155351424041, "learning_rate": 0.0001, "loss": 1.1506, "loss/crossentropy": 2.5522897243499756, "loss/hidden": 0.93359375, "loss/logits": 0.1680143177509308, "loss/reg": 0.004900622647255659, "step": 774 }, { "epoch": 0.096875, "grad_norm": 2.6764907836914062, "grad_norm_var": 2.871782767876315, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.5808451175689697, "loss/hidden": 0.90625, "loss/logits": 0.16133888065814972, "loss/reg": 0.004898467101156712, "step": 775 }, { "epoch": 0.097, "grad_norm": 2.1325623989105225, "grad_norm_var": 2.911263182397389, "learning_rate": 0.0001, "loss": 1.1775, "loss/crossentropy": 2.3073110580444336, "loss/hidden": 0.953125, "loss/logits": 0.17539924383163452, "loss/reg": 0.004896300844848156, "step": 776 }, { "epoch": 0.097125, "grad_norm": 2.845750570297241, "grad_norm_var": 2.7637895893424673, "learning_rate": 0.0001, "loss": 0.9791, "loss/crossentropy": 2.452115774154663, "loss/hidden": 0.796875, "loss/logits": 0.13326548039913177, "loss/reg": 0.004894034005701542, "step": 777 }, { "epoch": 0.09725, "grad_norm": 6.139473915100098, "grad_norm_var": 3.360390097314651, "learning_rate": 0.0001, "loss": 1.1467, "loss/crossentropy": 2.44069242477417, "loss/hidden": 0.94140625, "loss/logits": 0.15639880299568176, "loss/reg": 0.0048917257227003574, "step": 778 }, { "epoch": 0.097375, "grad_norm": 2.3391048908233643, "grad_norm_var": 3.3672209294330075, "learning_rate": 0.0001, "loss": 1.137, "loss/crossentropy": 2.5613744258880615, "loss/hidden": 0.92578125, "loss/logits": 0.16234168410301208, "loss/reg": 0.004889402538537979, "step": 779 }, { "epoch": 0.0975, "grad_norm": 2.2655279636383057, "grad_norm_var": 3.3853179937680844, "learning_rate": 0.0001, "loss": 1.1061, "loss/crossentropy": 2.3578813076019287, "loss/hidden": 0.90234375, "loss/logits": 0.15491583943367004, "loss/reg": 0.004887087736278772, "step": 780 }, { "epoch": 0.097625, "grad_norm": 2.2749481201171875, "grad_norm_var": 3.4090543314963564, "learning_rate": 0.0001, "loss": 1.0216, "loss/crossentropy": 2.4622697830200195, "loss/hidden": 0.828125, "loss/logits": 0.14463937282562256, "loss/reg": 0.004884790629148483, "step": 781 }, { "epoch": 0.09775, "grad_norm": 2.891165256500244, "grad_norm_var": 3.363644225109554, "learning_rate": 0.0001, "loss": 1.1747, "loss/crossentropy": 2.5788161754608154, "loss/hidden": 0.953125, "loss/logits": 0.1727641224861145, "loss/reg": 0.0048824455589056015, "step": 782 }, { "epoch": 0.097875, "grad_norm": 2.347449541091919, "grad_norm_var": 3.362531263350426, "learning_rate": 0.0001, "loss": 0.995, "loss/crossentropy": 2.5813019275665283, "loss/hidden": 0.8046875, "loss/logits": 0.14150115847587585, "loss/reg": 0.004880187567323446, "step": 783 }, { "epoch": 0.098, "grad_norm": 3.219748020172119, "grad_norm_var": 1.0248437110061321, "learning_rate": 0.0001, "loss": 1.331, "loss/crossentropy": 2.4393255710601807, "loss/hidden": 1.078125, "loss/logits": 0.20414334535598755, "loss/reg": 0.004877839703112841, "step": 784 }, { "epoch": 0.098125, "grad_norm": 2.7481374740600586, "grad_norm_var": 0.997469803821544, "learning_rate": 0.0001, "loss": 1.1664, "loss/crossentropy": 2.856651544570923, "loss/hidden": 0.93359375, "loss/logits": 0.18401256203651428, "loss/reg": 0.0048755621537566185, "step": 785 }, { "epoch": 0.09825, "grad_norm": 3.8722689151763916, "grad_norm_var": 0.9771433382716601, "learning_rate": 0.0001, "loss": 1.1632, "loss/crossentropy": 2.7695257663726807, "loss/hidden": 0.92578125, "loss/logits": 0.18866363167762756, "loss/reg": 0.004873441066592932, "step": 786 }, { "epoch": 0.098375, "grad_norm": 2.877189874649048, "grad_norm_var": 0.9605094695400339, "learning_rate": 0.0001, "loss": 1.1935, "loss/crossentropy": 2.569603443145752, "loss/hidden": 0.92578125, "loss/logits": 0.21895866096019745, "loss/reg": 0.004871242213994265, "step": 787 }, { "epoch": 0.0985, "grad_norm": 2.430058717727661, "grad_norm_var": 0.9682708910971911, "learning_rate": 0.0001, "loss": 0.9791, "loss/crossentropy": 2.767273187637329, "loss/hidden": 0.80078125, "loss/logits": 0.12963160872459412, "loss/reg": 0.004869125317782164, "step": 788 }, { "epoch": 0.098625, "grad_norm": 2.63755202293396, "grad_norm_var": 0.9549378382865437, "learning_rate": 0.0001, "loss": 0.9765, "loss/crossentropy": 2.550255537033081, "loss/hidden": 0.79296875, "loss/logits": 0.13482339680194855, "loss/reg": 0.004866961855441332, "step": 789 }, { "epoch": 0.09875, "grad_norm": 2.9017562866210938, "grad_norm_var": 0.9271776289312942, "learning_rate": 0.0001, "loss": 1.152, "loss/crossentropy": 2.3682990074157715, "loss/hidden": 0.91796875, "loss/logits": 0.18535348773002625, "loss/reg": 0.004864787682890892, "step": 790 }, { "epoch": 0.098875, "grad_norm": 2.388214349746704, "grad_norm_var": 0.9414410795556288, "learning_rate": 0.0001, "loss": 1.4406, "loss/crossentropy": 2.2244787216186523, "loss/hidden": 1.1640625, "loss/logits": 0.2279416173696518, "loss/reg": 0.0048627713695168495, "step": 791 }, { "epoch": 0.099, "grad_norm": 2.3403820991516113, "grad_norm_var": 0.9230295318881718, "learning_rate": 0.0001, "loss": 1.1661, "loss/crossentropy": 2.3729617595672607, "loss/hidden": 0.95703125, "loss/logits": 0.16043922305107117, "loss/reg": 0.004860777873545885, "step": 792 }, { "epoch": 0.099125, "grad_norm": 2.9416215419769287, "grad_norm_var": 0.9228156704300846, "learning_rate": 0.0001, "loss": 1.0463, "loss/crossentropy": 2.6738815307617188, "loss/hidden": 0.85546875, "loss/logits": 0.14225679636001587, "loss/reg": 0.0048586721532046795, "step": 793 }, { "epoch": 0.09925, "grad_norm": 3.0802805423736572, "grad_norm_var": 0.1918460569244303, "learning_rate": 0.0001, "loss": 1.095, "loss/crossentropy": 2.368016242980957, "loss/hidden": 0.890625, "loss/logits": 0.15585456788539886, "loss/reg": 0.0048565310426056385, "step": 794 }, { "epoch": 0.099375, "grad_norm": 2.2744922637939453, "grad_norm_var": 0.19540746296378658, "learning_rate": 0.0001, "loss": 1.056, "loss/crossentropy": 2.4861905574798584, "loss/hidden": 0.85546875, "loss/logits": 0.15199777483940125, "loss/reg": 0.004854561761021614, "step": 795 }, { "epoch": 0.0995, "grad_norm": 2.4716484546661377, "grad_norm_var": 0.1856228513035127, "learning_rate": 0.0001, "loss": 1.1547, "loss/crossentropy": 2.315619468688965, "loss/hidden": 0.95703125, "loss/logits": 0.1491631120443344, "loss/reg": 0.004852783400565386, "step": 796 }, { "epoch": 0.099625, "grad_norm": 1.8618899583816528, "grad_norm_var": 0.22140635444161577, "learning_rate": 0.0001, "loss": 0.9489, "loss/crossentropy": 2.5243325233459473, "loss/hidden": 0.77734375, "loss/logits": 0.12307024002075195, "loss/reg": 0.004850673023611307, "step": 797 }, { "epoch": 0.09975, "grad_norm": 2.3018720149993896, "grad_norm_var": 0.22850198783917974, "learning_rate": 0.0001, "loss": 1.1395, "loss/crossentropy": 2.437481164932251, "loss/hidden": 0.93359375, "loss/logits": 0.1574450135231018, "loss/reg": 0.0048488411121070385, "step": 798 }, { "epoch": 0.099875, "grad_norm": 2.351703643798828, "grad_norm_var": 0.22832106568478797, "learning_rate": 0.0001, "loss": 1.1459, "loss/crossentropy": 2.7124335765838623, "loss/hidden": 0.91796875, "loss/logits": 0.17945504188537598, "loss/reg": 0.004846641793847084, "step": 799 }, { "epoch": 0.1, "grad_norm": 3.841269016265869, "grad_norm_var": 0.29813113065747426, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.2013328075408936, "loss/hidden": 1.0703125, "loss/logits": 0.2869468331336975, "loss/reg": 0.00484456866979599, "step": 800 }, { "epoch": 0.100125, "grad_norm": 2.1647138595581055, "grad_norm_var": 0.3162455329850776, "learning_rate": 0.0001, "loss": 1.1089, "loss/crossentropy": 2.5788028240203857, "loss/hidden": 0.91015625, "loss/logits": 0.15036620199680328, "loss/reg": 0.004842570051550865, "step": 801 }, { "epoch": 0.10025, "grad_norm": 7.012945652008057, "grad_norm_var": 1.4357519156716128, "learning_rate": 0.0001, "loss": 1.3117, "loss/crossentropy": 2.785094738006592, "loss/hidden": 1.078125, "loss/logits": 0.18520238995552063, "loss/reg": 0.004840615671128035, "step": 802 }, { "epoch": 0.100375, "grad_norm": 2.403449535369873, "grad_norm_var": 1.4491572072812606, "learning_rate": 0.0001, "loss": 1.0264, "loss/crossentropy": 2.464657783508301, "loss/hidden": 0.83984375, "loss/logits": 0.1381913125514984, "loss/reg": 0.004838695749640465, "step": 803 }, { "epoch": 0.1005, "grad_norm": 2.554766893386841, "grad_norm_var": 1.4433503798033878, "learning_rate": 0.0001, "loss": 1.025, "loss/crossentropy": 2.506692409515381, "loss/hidden": 0.84765625, "loss/logits": 0.1289561688899994, "loss/reg": 0.0048366026021540165, "step": 804 }, { "epoch": 0.100625, "grad_norm": 2.103414535522461, "grad_norm_var": 1.4759940006075438, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.4695632457733154, "loss/hidden": 0.86328125, "loss/logits": 0.15149196982383728, "loss/reg": 0.004834519233554602, "step": 805 }, { "epoch": 0.10075, "grad_norm": 2.1140382289886475, "grad_norm_var": 1.5053641044502488, "learning_rate": 0.0001, "loss": 1.0509, "loss/crossentropy": 2.536261796951294, "loss/hidden": 0.84375, "loss/logits": 0.15881776809692383, "loss/reg": 0.004832423757761717, "step": 806 }, { "epoch": 0.100875, "grad_norm": 2.4623467922210693, "grad_norm_var": 1.5020038787198757, "learning_rate": 0.0001, "loss": 1.0163, "loss/crossentropy": 2.6407546997070312, "loss/hidden": 0.828125, "loss/logits": 0.1398705244064331, "loss/reg": 0.004830438643693924, "step": 807 }, { "epoch": 0.101, "grad_norm": 2.196262836456299, "grad_norm_var": 1.5115104848001217, "learning_rate": 0.0001, "loss": 0.9786, "loss/crossentropy": 2.7860398292541504, "loss/hidden": 0.78515625, "loss/logits": 0.1451636254787445, "loss/reg": 0.0048283860087394714, "step": 808 }, { "epoch": 0.101125, "grad_norm": 1.9615752696990967, "grad_norm_var": 1.547617987738648, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.4804906845092773, "loss/hidden": 0.796875, "loss/logits": 0.13634686172008514, "loss/reg": 0.004826539196074009, "step": 809 }, { "epoch": 0.10125, "grad_norm": 2.813462257385254, "grad_norm_var": 1.5384423691934326, "learning_rate": 0.0001, "loss": 1.0387, "loss/crossentropy": 2.4615039825439453, "loss/hidden": 0.84375, "loss/logits": 0.1467183232307434, "loss/reg": 0.004824436269700527, "step": 810 }, { "epoch": 0.101375, "grad_norm": 2.155719518661499, "grad_norm_var": 1.5457555739015585, "learning_rate": 0.0001, "loss": 1.0501, "loss/crossentropy": 2.5954880714416504, "loss/hidden": 0.85546875, "loss/logits": 0.1464114934206009, "loss/reg": 0.004822410177439451, "step": 811 }, { "epoch": 0.1015, "grad_norm": 9.283843040466309, "grad_norm_var": 4.263069385825235, "learning_rate": 0.0001, "loss": 2.7627, "loss/crossentropy": 2.139838218688965, "loss/hidden": 2.375, "loss/logits": 0.3395351767539978, "loss/reg": 0.0048205070197582245, "step": 812 }, { "epoch": 0.101625, "grad_norm": 2.0784647464752197, "grad_norm_var": 4.230278658390638, "learning_rate": 0.0001, "loss": 0.9664, "loss/crossentropy": 2.3511369228363037, "loss/hidden": 0.7890625, "loss/logits": 0.12912708520889282, "loss/reg": 0.0048186322674155235, "step": 813 }, { "epoch": 0.10175, "grad_norm": 2.328005313873291, "grad_norm_var": 4.227496791404921, "learning_rate": 0.0001, "loss": 1.2465, "loss/crossentropy": 2.512450933456421, "loss/hidden": 0.96875, "loss/logits": 0.22958813607692719, "loss/reg": 0.004816535394638777, "step": 814 }, { "epoch": 0.101875, "grad_norm": 2.315840721130371, "grad_norm_var": 4.231222857846664, "learning_rate": 0.0001, "loss": 1.1597, "loss/crossentropy": 2.5316786766052246, "loss/hidden": 0.94140625, "loss/logits": 0.1701403111219406, "loss/reg": 0.004814418964087963, "step": 815 }, { "epoch": 0.102, "grad_norm": 2.384153127670288, "grad_norm_var": 4.222215248181442, "learning_rate": 0.0001, "loss": 1.0444, "loss/crossentropy": 2.540562629699707, "loss/hidden": 0.86328125, "loss/logits": 0.13295237720012665, "loss/reg": 0.004812437575310469, "step": 816 }, { "epoch": 0.102125, "grad_norm": 2.4128434658050537, "grad_norm_var": 4.197740139734587, "learning_rate": 0.0001, "loss": 1.1886, "loss/crossentropy": 2.070441246032715, "loss/hidden": 0.96875, "loss/logits": 0.17178985476493835, "loss/reg": 0.0048103369772434235, "step": 817 }, { "epoch": 0.10225, "grad_norm": 2.2246007919311523, "grad_norm_var": 3.0918953553368502, "learning_rate": 0.0001, "loss": 1.1071, "loss/crossentropy": 2.6368751525878906, "loss/hidden": 0.8984375, "loss/logits": 0.16057901084423065, "loss/reg": 0.004808461759239435, "step": 818 }, { "epoch": 0.102375, "grad_norm": 2.450014591217041, "grad_norm_var": 3.089959662810087, "learning_rate": 0.0001, "loss": 0.9846, "loss/crossentropy": 2.521289587020874, "loss/hidden": 0.8046875, "loss/logits": 0.13186746835708618, "loss/reg": 0.004806382581591606, "step": 819 }, { "epoch": 0.1025, "grad_norm": 2.4022092819213867, "grad_norm_var": 3.095181282590954, "learning_rate": 0.0001, "loss": 1.0461, "loss/crossentropy": 2.563452959060669, "loss/hidden": 0.8515625, "loss/logits": 0.146540105342865, "loss/reg": 0.00480444822460413, "step": 820 }, { "epoch": 0.102625, "grad_norm": 2.3734519481658936, "grad_norm_var": 3.0771633032177723, "learning_rate": 0.0001, "loss": 1.0145, "loss/crossentropy": 2.565798759460449, "loss/hidden": 0.8125, "loss/logits": 0.15395045280456543, "loss/reg": 0.004802408628165722, "step": 821 }, { "epoch": 0.10275, "grad_norm": 2.9813950061798096, "grad_norm_var": 3.0509471234209884, "learning_rate": 0.0001, "loss": 1.1867, "loss/crossentropy": 2.463094711303711, "loss/hidden": 0.96875, "loss/logits": 0.169905886054039, "loss/reg": 0.004800358321517706, "step": 822 }, { "epoch": 0.102875, "grad_norm": 2.231248617172241, "grad_norm_var": 3.06473574306492, "learning_rate": 0.0001, "loss": 0.95, "loss/crossentropy": 2.479421615600586, "loss/hidden": 0.78125, "loss/logits": 0.12073921412229538, "loss/reg": 0.00479841185733676, "step": 823 }, { "epoch": 0.103, "grad_norm": 2.4154672622680664, "grad_norm_var": 3.050471285485306, "learning_rate": 0.0001, "loss": 1.0843, "loss/crossentropy": 2.3831257820129395, "loss/hidden": 0.85546875, "loss/logits": 0.1808249056339264, "loss/reg": 0.004796158988028765, "step": 824 }, { "epoch": 0.103125, "grad_norm": 2.0772101879119873, "grad_norm_var": 3.0383683290584145, "learning_rate": 0.0001, "loss": 1.0639, "loss/crossentropy": 2.4004011154174805, "loss/hidden": 0.859375, "loss/logits": 0.15656441450119019, "loss/reg": 0.004794239532202482, "step": 825 }, { "epoch": 0.10325, "grad_norm": 2.839860200881958, "grad_norm_var": 3.0384311233460473, "learning_rate": 0.0001, "loss": 1.0861, "loss/crossentropy": 2.6619362831115723, "loss/hidden": 0.87890625, "loss/logits": 0.15924152731895447, "loss/reg": 0.004792260471731424, "step": 826 }, { "epoch": 0.103375, "grad_norm": 1.972954511642456, "grad_norm_var": 3.056454118437356, "learning_rate": 0.0001, "loss": 0.9135, "loss/crossentropy": 2.7229561805725098, "loss/hidden": 0.7421875, "loss/logits": 0.1234164908528328, "loss/reg": 0.004790398757904768, "step": 827 }, { "epoch": 0.1035, "grad_norm": 2.682563066482544, "grad_norm_var": 0.07155742185727737, "learning_rate": 0.0001, "loss": 1.1117, "loss/crossentropy": 2.5051097869873047, "loss/hidden": 0.89453125, "loss/logits": 0.16926732659339905, "loss/reg": 0.00478832283988595, "step": 828 }, { "epoch": 0.103625, "grad_norm": 3.4523675441741943, "grad_norm_var": 0.13326196210074007, "learning_rate": 0.0001, "loss": 0.9601, "loss/crossentropy": 2.658190965652466, "loss/hidden": 0.78125, "loss/logits": 0.13097809255123138, "loss/reg": 0.004786360543221235, "step": 829 }, { "epoch": 0.10375, "grad_norm": 3.48335599899292, "grad_norm_var": 0.19458248394843167, "learning_rate": 0.0001, "loss": 1.2642, "loss/crossentropy": 2.538571357727051, "loss/hidden": 1.0234375, "loss/logits": 0.1929442286491394, "loss/reg": 0.00478436890989542, "step": 830 }, { "epoch": 0.103875, "grad_norm": 2.62361216545105, "grad_norm_var": 0.19115134798182468, "learning_rate": 0.0001, "loss": 1.0218, "loss/crossentropy": 2.067270040512085, "loss/hidden": 0.83984375, "loss/logits": 0.13410484790802002, "loss/reg": 0.004782302770763636, "step": 831 }, { "epoch": 0.104, "grad_norm": 2.9096603393554688, "grad_norm_var": 0.1958828676095777, "learning_rate": 0.0001, "loss": 1.0541, "loss/crossentropy": 2.664367198944092, "loss/hidden": 0.859375, "loss/logits": 0.1468919813632965, "loss/reg": 0.004780208226293325, "step": 832 }, { "epoch": 0.104125, "grad_norm": 2.942896604537964, "grad_norm_var": 0.20051234736722562, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 3.0427563190460205, "loss/hidden": 0.89453125, "loss/logits": 0.18868675827980042, "loss/reg": 0.004778183531016111, "step": 833 }, { "epoch": 0.10425, "grad_norm": 2.589113712310791, "grad_norm_var": 0.18916564172237998, "learning_rate": 0.0001, "loss": 1.0925, "loss/crossentropy": 2.8398096561431885, "loss/hidden": 0.8828125, "loss/logits": 0.16195189952850342, "loss/reg": 0.004776162561029196, "step": 834 }, { "epoch": 0.104375, "grad_norm": 3.4404890537261963, "grad_norm_var": 0.22384389332806312, "learning_rate": 0.0001, "loss": 1.3237, "loss/crossentropy": 2.627953290939331, "loss/hidden": 1.0234375, "loss/logits": 0.252508282661438, "loss/reg": 0.004774080123752356, "step": 835 }, { "epoch": 0.1045, "grad_norm": 3.8319575786590576, "grad_norm_var": 0.2922407313041238, "learning_rate": 0.0001, "loss": 1.1363, "loss/crossentropy": 2.598100185394287, "loss/hidden": 0.94140625, "loss/logits": 0.14722198247909546, "loss/reg": 0.004771828651428223, "step": 836 }, { "epoch": 0.104625, "grad_norm": 2.5748939514160156, "grad_norm_var": 0.28324037377024425, "learning_rate": 0.0001, "loss": 1.154, "loss/crossentropy": 2.4594998359680176, "loss/hidden": 0.92578125, "loss/logits": 0.1805158108472824, "loss/reg": 0.004769548308104277, "step": 837 }, { "epoch": 0.10475, "grad_norm": 2.4466588497161865, "grad_norm_var": 0.2892884485845587, "learning_rate": 0.0001, "loss": 1.1832, "loss/crossentropy": 2.3166677951812744, "loss/hidden": 0.94921875, "loss/logits": 0.18628260493278503, "loss/reg": 0.004767347127199173, "step": 838 }, { "epoch": 0.104875, "grad_norm": 2.3794689178466797, "grad_norm_var": 0.2797743363037663, "learning_rate": 0.0001, "loss": 1.0935, "loss/crossentropy": 2.5804922580718994, "loss/hidden": 0.8984375, "loss/logits": 0.14742916822433472, "loss/reg": 0.004765105899423361, "step": 839 }, { "epoch": 0.105, "grad_norm": 3.077510118484497, "grad_norm_var": 0.27398293806763996, "learning_rate": 0.0001, "loss": 1.3542, "loss/crossentropy": 2.3323357105255127, "loss/hidden": 1.1015625, "loss/logits": 0.20499414205551147, "loss/reg": 0.004762987140566111, "step": 840 }, { "epoch": 0.105125, "grad_norm": 2.878331422805786, "grad_norm_var": 0.23338745113117412, "learning_rate": 0.0001, "loss": 1.1036, "loss/crossentropy": 2.450366258621216, "loss/hidden": 0.88671875, "loss/logits": 0.16925078630447388, "loss/reg": 0.004760903771966696, "step": 841 }, { "epoch": 0.10525, "grad_norm": 3.3161306381225586, "grad_norm_var": 0.24483420410498696, "learning_rate": 0.0001, "loss": 1.2537, "loss/crossentropy": 2.5501890182495117, "loss/hidden": 1.0234375, "loss/logits": 0.18264800310134888, "loss/reg": 0.004758887458592653, "step": 842 }, { "epoch": 0.105375, "grad_norm": 3.6941001415252686, "grad_norm_var": 0.21433980549929005, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.984022378921509, "loss/hidden": 0.890625, "loss/logits": 0.16192708909511566, "loss/reg": 0.004756839480251074, "step": 843 }, { "epoch": 0.1055, "grad_norm": 2.4516656398773193, "grad_norm_var": 0.22806633375319052, "learning_rate": 0.0001, "loss": 1.1966, "loss/crossentropy": 2.304518699645996, "loss/hidden": 0.96484375, "loss/logits": 0.18419940769672394, "loss/reg": 0.0047547114081680775, "step": 844 }, { "epoch": 0.105625, "grad_norm": 2.1033735275268555, "grad_norm_var": 0.2614740255031826, "learning_rate": 0.0001, "loss": 1.1366, "loss/crossentropy": 2.54909086227417, "loss/hidden": 0.93359375, "loss/logits": 0.15546000003814697, "loss/reg": 0.004752539098262787, "step": 845 }, { "epoch": 0.10575, "grad_norm": 2.3745031356811523, "grad_norm_var": 0.2552452215100343, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.629120349884033, "loss/hidden": 0.828125, "loss/logits": 0.16704407334327698, "loss/reg": 0.004750436637550592, "step": 846 }, { "epoch": 0.105875, "grad_norm": 2.338932514190674, "grad_norm_var": 0.26898497299798596, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.5220935344696045, "loss/hidden": 0.828125, "loss/logits": 0.12721288204193115, "loss/reg": 0.004748245235532522, "step": 847 }, { "epoch": 0.106, "grad_norm": 2.051365375518799, "grad_norm_var": 0.3064090147954744, "learning_rate": 0.0001, "loss": 1.0144, "loss/crossentropy": 2.5917866230010986, "loss/hidden": 0.828125, "loss/logits": 0.13877776265144348, "loss/reg": 0.0047462498769164085, "step": 848 }, { "epoch": 0.106125, "grad_norm": 2.7592124938964844, "grad_norm_var": 0.3045456563454231, "learning_rate": 0.0001, "loss": 1.0817, "loss/crossentropy": 2.400892972946167, "loss/hidden": 0.88671875, "loss/logits": 0.14757487177848816, "loss/reg": 0.004744186066091061, "step": 849 }, { "epoch": 0.10625, "grad_norm": 2.843409538269043, "grad_norm_var": 0.3024802042353009, "learning_rate": 0.0001, "loss": 1.1031, "loss/crossentropy": 2.4847569465637207, "loss/hidden": 0.90234375, "loss/logits": 0.153322234749794, "loss/reg": 0.004742183722555637, "step": 850 }, { "epoch": 0.106375, "grad_norm": 2.1840505599975586, "grad_norm_var": 0.291355140168911, "learning_rate": 0.0001, "loss": 0.9921, "loss/crossentropy": 2.5778043270111084, "loss/hidden": 0.80859375, "loss/logits": 0.13607874512672424, "loss/reg": 0.004740222357213497, "step": 851 }, { "epoch": 0.1065, "grad_norm": 2.0575978755950928, "grad_norm_var": 0.2218880841874949, "learning_rate": 0.0001, "loss": 0.97, "loss/crossentropy": 2.3618547916412354, "loss/hidden": 0.79296875, "loss/logits": 0.12968632578849792, "loss/reg": 0.004738117568194866, "step": 852 }, { "epoch": 0.106625, "grad_norm": 1.9274516105651855, "grad_norm_var": 0.2498830541667985, "learning_rate": 0.0001, "loss": 0.9944, "loss/crossentropy": 2.616579532623291, "loss/hidden": 0.80859375, "loss/logits": 0.13845369219779968, "loss/reg": 0.004735942464321852, "step": 853 }, { "epoch": 0.10675, "grad_norm": 4.250617980957031, "grad_norm_var": 0.42715921119526734, "learning_rate": 0.0001, "loss": 1.0122, "loss/crossentropy": 2.658142328262329, "loss/hidden": 0.828125, "loss/logits": 0.13677959144115448, "loss/reg": 0.004733935464173555, "step": 854 }, { "epoch": 0.106875, "grad_norm": 2.366472005844116, "grad_norm_var": 0.42766974025784443, "learning_rate": 0.0001, "loss": 0.951, "loss/crossentropy": 2.4926974773406982, "loss/hidden": 0.78125, "loss/logits": 0.12242163717746735, "loss/reg": 0.004731933120638132, "step": 855 }, { "epoch": 0.107, "grad_norm": 2.754833221435547, "grad_norm_var": 0.41652297282437467, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.3678700923919678, "loss/hidden": 0.93359375, "loss/logits": 0.17560826241970062, "loss/reg": 0.004729805048555136, "step": 856 }, { "epoch": 0.107125, "grad_norm": 3.3536951541900635, "grad_norm_var": 0.44530816036993104, "learning_rate": 0.0001, "loss": 1.3138, "loss/crossentropy": 2.4103269577026367, "loss/hidden": 1.0703125, "loss/logits": 0.19625738263130188, "loss/reg": 0.004727587569504976, "step": 857 }, { "epoch": 0.10725, "grad_norm": 2.150568962097168, "grad_norm_var": 0.43084581061497124, "learning_rate": 0.0001, "loss": 1.1975, "loss/crossentropy": 2.288213014602661, "loss/hidden": 0.984375, "loss/logits": 0.16585032641887665, "loss/reg": 0.004725386388599873, "step": 858 }, { "epoch": 0.107375, "grad_norm": 2.526709794998169, "grad_norm_var": 0.3463235885418951, "learning_rate": 0.0001, "loss": 1.0485, "loss/crossentropy": 2.668027639389038, "loss/hidden": 0.8515625, "loss/logits": 0.14968323707580566, "loss/reg": 0.0047230906784534454, "step": 859 }, { "epoch": 0.1075, "grad_norm": 2.573915958404541, "grad_norm_var": 0.3459660758761667, "learning_rate": 0.0001, "loss": 1.285, "loss/crossentropy": 2.4578278064727783, "loss/hidden": 1.046875, "loss/logits": 0.19090218842029572, "loss/reg": 0.004720703698694706, "step": 860 }, { "epoch": 0.107625, "grad_norm": 2.0752198696136475, "grad_norm_var": 0.3476491685761097, "learning_rate": 0.0001, "loss": 1.0997, "loss/crossentropy": 2.5527398586273193, "loss/hidden": 0.8984375, "loss/logits": 0.15407393872737885, "loss/reg": 0.004718627315014601, "step": 861 }, { "epoch": 0.10775, "grad_norm": 2.0546956062316895, "grad_norm_var": 0.3609613231593753, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.371561050415039, "loss/hidden": 0.796875, "loss/logits": 0.13113868236541748, "loss/reg": 0.0047163935378193855, "step": 862 }, { "epoch": 0.107875, "grad_norm": 2.4428114891052246, "grad_norm_var": 0.35917223636502416, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.4627277851104736, "loss/hidden": 0.90625, "loss/logits": 0.15898552536964417, "loss/reg": 0.004714163951575756, "step": 863 }, { "epoch": 0.108, "grad_norm": 9.508520126342773, "grad_norm_var": 3.365516663733607, "learning_rate": 0.0001, "loss": 2.2474, "loss/crossentropy": 2.571873426437378, "loss/hidden": 1.8203125, "loss/logits": 0.3799425959587097, "loss/reg": 0.00471192691475153, "step": 864 }, { "epoch": 0.108125, "grad_norm": 1.8954740762710571, "grad_norm_var": 3.4386495429465245, "learning_rate": 0.0001, "loss": 0.9501, "loss/crossentropy": 2.6599161624908447, "loss/hidden": 0.7734375, "loss/logits": 0.12957873940467834, "loss/reg": 0.00470972154289484, "step": 865 }, { "epoch": 0.10825, "grad_norm": 2.4904675483703613, "grad_norm_var": 3.4507629712816508, "learning_rate": 0.0001, "loss": 1.0515, "loss/crossentropy": 2.4854214191436768, "loss/hidden": 0.8671875, "loss/logits": 0.13725632429122925, "loss/reg": 0.004707681480795145, "step": 866 }, { "epoch": 0.108375, "grad_norm": 2.947456121444702, "grad_norm_var": 3.4129568938410895, "learning_rate": 0.0001, "loss": 1.2279, "loss/crossentropy": 2.4842288494110107, "loss/hidden": 0.9921875, "loss/logits": 0.1886221170425415, "loss/reg": 0.004705703817307949, "step": 867 }, { "epoch": 0.1085, "grad_norm": 2.2447702884674072, "grad_norm_var": 3.3926001028643817, "learning_rate": 0.0001, "loss": 1.0195, "loss/crossentropy": 2.148575782775879, "loss/hidden": 0.84375, "loss/logits": 0.1287107914686203, "loss/reg": 0.0047035738825798035, "step": 868 }, { "epoch": 0.108625, "grad_norm": 2.057748317718506, "grad_norm_var": 3.3755016690991604, "learning_rate": 0.0001, "loss": 1.0202, "loss/crossentropy": 2.483203172683716, "loss/hidden": 0.83203125, "loss/logits": 0.14113682508468628, "loss/reg": 0.004701647907495499, "step": 869 }, { "epoch": 0.10875, "grad_norm": 2.5249640941619873, "grad_norm_var": 3.2694673269882863, "learning_rate": 0.0001, "loss": 1.113, "loss/crossentropy": 2.5260114669799805, "loss/hidden": 0.91015625, "loss/logits": 0.1558540314435959, "loss/reg": 0.004699505399912596, "step": 870 }, { "epoch": 0.108875, "grad_norm": 2.737452507019043, "grad_norm_var": 3.2530130532767125, "learning_rate": 0.0001, "loss": 1.281, "loss/crossentropy": 2.0429792404174805, "loss/hidden": 1.0625, "loss/logits": 0.1715242862701416, "loss/reg": 0.004697592929005623, "step": 871 }, { "epoch": 0.109, "grad_norm": 3.320223569869995, "grad_norm_var": 3.2623347194326366, "learning_rate": 0.0001, "loss": 1.1007, "loss/crossentropy": 2.4485509395599365, "loss/hidden": 0.91015625, "loss/logits": 0.14355334639549255, "loss/reg": 0.004695762414485216, "step": 872 }, { "epoch": 0.109125, "grad_norm": 2.352004051208496, "grad_norm_var": 3.268664190896945, "learning_rate": 0.0001, "loss": 0.9754, "loss/crossentropy": 2.566997528076172, "loss/hidden": 0.81640625, "loss/logits": 0.11203782260417938, "loss/reg": 0.004693967290222645, "step": 873 }, { "epoch": 0.10925, "grad_norm": 2.532027244567871, "grad_norm_var": 3.2412215675029845, "learning_rate": 0.0001, "loss": 1.019, "loss/crossentropy": 2.5623605251312256, "loss/hidden": 0.82421875, "loss/logits": 0.14787867665290833, "loss/reg": 0.0046923235058784485, "step": 874 }, { "epoch": 0.109375, "grad_norm": 2.850015878677368, "grad_norm_var": 3.231974182838817, "learning_rate": 0.0001, "loss": 1.3183, "loss/crossentropy": 2.292745351791382, "loss/hidden": 1.09375, "loss/logits": 0.17765681445598602, "loss/reg": 0.004690241534262896, "step": 875 }, { "epoch": 0.1095, "grad_norm": 2.694929361343384, "grad_norm_var": 3.2274185214577464, "learning_rate": 0.0001, "loss": 1.3804, "loss/crossentropy": 2.5017001628875732, "loss/hidden": 1.1171875, "loss/logits": 0.21630127727985382, "loss/reg": 0.004688601475208998, "step": 876 }, { "epoch": 0.109625, "grad_norm": 3.1318111419677734, "grad_norm_var": 3.1781036409629513, "learning_rate": 0.0001, "loss": 1.0142, "loss/crossentropy": 2.467827796936035, "loss/hidden": 0.80859375, "loss/logits": 0.15873777866363525, "loss/reg": 0.004686909727752209, "step": 877 }, { "epoch": 0.10975, "grad_norm": 2.534363269805908, "grad_norm_var": 3.132884034258479, "learning_rate": 0.0001, "loss": 1.0536, "loss/crossentropy": 2.559304714202881, "loss/hidden": 0.84765625, "loss/logits": 0.15912304818630219, "loss/reg": 0.004684917628765106, "step": 878 }, { "epoch": 0.109875, "grad_norm": 2.3481605052948, "grad_norm_var": 3.1406848036532913, "learning_rate": 0.0001, "loss": 1.1184, "loss/crossentropy": 2.436594247817993, "loss/hidden": 0.91796875, "loss/logits": 0.15359237790107727, "loss/reg": 0.004682839848101139, "step": 879 }, { "epoch": 0.11, "grad_norm": 2.825532913208008, "grad_norm_var": 0.1420546261528272, "learning_rate": 0.0001, "loss": 1.05, "loss/crossentropy": 2.62540602684021, "loss/hidden": 0.859375, "loss/logits": 0.14383457601070404, "loss/reg": 0.004681065212935209, "step": 880 }, { "epoch": 0.110125, "grad_norm": 2.246893882751465, "grad_norm_var": 0.11709161648718099, "learning_rate": 0.0001, "loss": 1.1922, "loss/crossentropy": 2.3017749786376953, "loss/hidden": 0.98046875, "loss/logits": 0.1649210900068283, "loss/reg": 0.004679176490753889, "step": 881 }, { "epoch": 0.11025, "grad_norm": 2.405453681945801, "grad_norm_var": 0.11895408888104815, "learning_rate": 0.0001, "loss": 1.0762, "loss/crossentropy": 2.4893195629119873, "loss/hidden": 0.87109375, "loss/logits": 0.15834550559520721, "loss/reg": 0.004677077289670706, "step": 882 }, { "epoch": 0.110375, "grad_norm": 2.809741258621216, "grad_norm_var": 0.11393595478610692, "learning_rate": 0.0001, "loss": 1.0382, "loss/crossentropy": 2.4846301078796387, "loss/hidden": 0.84375, "loss/logits": 0.1476888507604599, "loss/reg": 0.004674948286265135, "step": 883 }, { "epoch": 0.1105, "grad_norm": 2.79677152633667, "grad_norm_var": 0.10676105158749939, "learning_rate": 0.0001, "loss": 1.0628, "loss/crossentropy": 2.4412240982055664, "loss/hidden": 0.859375, "loss/logits": 0.15672242641448975, "loss/reg": 0.0046728490851819515, "step": 884 }, { "epoch": 0.110625, "grad_norm": 2.2540183067321777, "grad_norm_var": 0.09404914291929553, "learning_rate": 0.0001, "loss": 1.0763, "loss/crossentropy": 2.42918062210083, "loss/hidden": 0.87890625, "loss/logits": 0.15069469809532166, "loss/reg": 0.004670663271099329, "step": 885 }, { "epoch": 0.11075, "grad_norm": 2.4896061420440674, "grad_norm_var": 0.09470624757332567, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.2188639640808105, "loss/hidden": 0.91796875, "loss/logits": 0.13358637690544128, "loss/reg": 0.004668715409934521, "step": 886 }, { "epoch": 0.110875, "grad_norm": 2.1856141090393066, "grad_norm_var": 0.10697799820098434, "learning_rate": 0.0001, "loss": 1.0706, "loss/crossentropy": 2.508702039718628, "loss/hidden": 0.87109375, "loss/logits": 0.15285125374794006, "loss/reg": 0.0046665905974805355, "step": 887 }, { "epoch": 0.111, "grad_norm": 2.0393009185791016, "grad_norm_var": 0.08841005951741536, "learning_rate": 0.0001, "loss": 1.1924, "loss/crossentropy": 2.38267183303833, "loss/hidden": 0.97265625, "loss/logits": 0.17313829064369202, "loss/reg": 0.004664612468332052, "step": 888 }, { "epoch": 0.111125, "grad_norm": 2.3410797119140625, "grad_norm_var": 0.08867826223563284, "learning_rate": 0.0001, "loss": 1.074, "loss/crossentropy": 2.1943254470825195, "loss/hidden": 0.87109375, "loss/logits": 0.1563197374343872, "loss/reg": 0.004662699997425079, "step": 889 }, { "epoch": 0.11125, "grad_norm": 2.1273703575134277, "grad_norm_var": 0.09882102282956354, "learning_rate": 0.0001, "loss": 1.1126, "loss/crossentropy": 2.685147762298584, "loss/hidden": 0.8984375, "loss/logits": 0.16760051250457764, "loss/reg": 0.00466081453487277, "step": 890 }, { "epoch": 0.111375, "grad_norm": 1.990721583366394, "grad_norm_var": 0.1054455812123658, "learning_rate": 0.0001, "loss": 1.0555, "loss/crossentropy": 2.5216498374938965, "loss/hidden": 0.86328125, "loss/logits": 0.14559441804885864, "loss/reg": 0.004659009166061878, "step": 891 }, { "epoch": 0.1115, "grad_norm": 2.070897340774536, "grad_norm_var": 0.10951603310177038, "learning_rate": 0.0001, "loss": 1.1023, "loss/crossentropy": 2.40985369682312, "loss/hidden": 0.890625, "loss/logits": 0.16513003408908844, "loss/reg": 0.0046569365076720715, "step": 892 }, { "epoch": 0.111625, "grad_norm": 4.464876651763916, "grad_norm_var": 0.3484639481637311, "learning_rate": 0.0001, "loss": 0.9826, "loss/crossentropy": 2.7498655319213867, "loss/hidden": 0.80859375, "loss/logits": 0.12744669616222382, "loss/reg": 0.0046548242680728436, "step": 893 }, { "epoch": 0.11175, "grad_norm": 3.390195608139038, "grad_norm_var": 0.39865960381583576, "learning_rate": 0.0001, "loss": 1.221, "loss/crossentropy": 2.4615988731384277, "loss/hidden": 0.9453125, "loss/logits": 0.22916388511657715, "loss/reg": 0.004652821458876133, "step": 894 }, { "epoch": 0.111875, "grad_norm": 5.686069488525391, "grad_norm_var": 1.005565195852795, "learning_rate": 0.0001, "loss": 1.6881, "loss/crossentropy": 2.871785879135132, "loss/hidden": 1.3515625, "loss/logits": 0.2899933457374573, "loss/reg": 0.0046508111990988255, "step": 895 }, { "epoch": 0.112, "grad_norm": 2.610992193222046, "grad_norm_var": 1.006503225573829, "learning_rate": 0.0001, "loss": 1.2867, "loss/crossentropy": 2.433950901031494, "loss/hidden": 1.046875, "loss/logits": 0.19335989654064178, "loss/reg": 0.004648844711482525, "step": 896 }, { "epoch": 0.112125, "grad_norm": 2.4823808670043945, "grad_norm_var": 0.9943498438598022, "learning_rate": 0.0001, "loss": 1.0789, "loss/crossentropy": 2.469367742538452, "loss/hidden": 0.86328125, "loss/logits": 0.16918183863162994, "loss/reg": 0.004646934103220701, "step": 897 }, { "epoch": 0.11225, "grad_norm": 2.520416736602783, "grad_norm_var": 0.9897555293936913, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.4925904273986816, "loss/hidden": 0.828125, "loss/logits": 0.13399645686149597, "loss/reg": 0.004645092878490686, "step": 898 }, { "epoch": 0.112375, "grad_norm": 3.859619140625, "grad_norm_var": 1.064733358455831, "learning_rate": 0.0001, "loss": 1.392, "loss/crossentropy": 2.2787818908691406, "loss/hidden": 1.125, "loss/logits": 0.22061912715435028, "loss/reg": 0.004643063060939312, "step": 899 }, { "epoch": 0.1125, "grad_norm": 2.3090193271636963, "grad_norm_var": 1.081884870890947, "learning_rate": 0.0001, "loss": 1.0228, "loss/crossentropy": 2.5625975131988525, "loss/hidden": 0.83984375, "loss/logits": 0.13652384281158447, "loss/reg": 0.004641035571694374, "step": 900 }, { "epoch": 0.112625, "grad_norm": 3.0584754943847656, "grad_norm_var": 1.063620631316445, "learning_rate": 0.0001, "loss": 1.1209, "loss/crossentropy": 2.5125515460968018, "loss/hidden": 0.91015625, "loss/logits": 0.16438385844230652, "loss/reg": 0.004638944752514362, "step": 901 }, { "epoch": 0.11275, "grad_norm": 2.350011110305786, "grad_norm_var": 1.0715774319545346, "learning_rate": 0.0001, "loss": 1.1854, "loss/crossentropy": 2.548809766769409, "loss/hidden": 0.9765625, "loss/logits": 0.16251316666603088, "loss/reg": 0.004636852536350489, "step": 902 }, { "epoch": 0.112875, "grad_norm": 2.3605165481567383, "grad_norm_var": 1.0581603064239917, "learning_rate": 0.0001, "loss": 1.0876, "loss/crossentropy": 2.338804006576538, "loss/hidden": 0.90234375, "loss/logits": 0.13895326852798462, "loss/reg": 0.004634756129235029, "step": 903 }, { "epoch": 0.113, "grad_norm": 2.982060432434082, "grad_norm_var": 1.0113174770986684, "learning_rate": 0.0001, "loss": 1.122, "loss/crossentropy": 2.604545831680298, "loss/hidden": 0.91796875, "loss/logits": 0.15767651796340942, "loss/reg": 0.004632753320038319, "step": 904 }, { "epoch": 0.113125, "grad_norm": 2.4179961681365967, "grad_norm_var": 1.0058240052270713, "learning_rate": 0.0001, "loss": 1.0464, "loss/crossentropy": 2.3685691356658936, "loss/hidden": 0.85546875, "loss/logits": 0.14459514617919922, "loss/reg": 0.0046308403834700584, "step": 905 }, { "epoch": 0.11325, "grad_norm": 2.9855105876922607, "grad_norm_var": 0.9614321136201335, "learning_rate": 0.0001, "loss": 1.0734, "loss/crossentropy": 2.4018514156341553, "loss/hidden": 0.875, "loss/logits": 0.15208232402801514, "loss/reg": 0.00462888041511178, "step": 906 }, { "epoch": 0.113375, "grad_norm": 3.7471723556518555, "grad_norm_var": 0.9246222750158322, "learning_rate": 0.0001, "loss": 1.5348, "loss/crossentropy": 2.3634986877441406, "loss/hidden": 1.2265625, "loss/logits": 0.26200127601623535, "loss/reg": 0.0046269698068499565, "step": 907 }, { "epoch": 0.1135, "grad_norm": 2.5677998065948486, "grad_norm_var": 0.8731304087305998, "learning_rate": 0.0001, "loss": 1.2106, "loss/crossentropy": 2.3270931243896484, "loss/hidden": 0.984375, "loss/logits": 0.17999057471752167, "loss/reg": 0.004624930210411549, "step": 908 }, { "epoch": 0.113625, "grad_norm": 2.649965286254883, "grad_norm_var": 0.7516360272379186, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.627746343612671, "loss/hidden": 0.9140625, "loss/logits": 0.16723725199699402, "loss/reg": 0.0046230582520365715, "step": 909 }, { "epoch": 0.11375, "grad_norm": 2.4064176082611084, "grad_norm_var": 0.7607639000768924, "learning_rate": 0.0001, "loss": 1.0182, "loss/crossentropy": 2.47015380859375, "loss/hidden": 0.81640625, "loss/logits": 0.15559975802898407, "loss/reg": 0.00462103309109807, "step": 910 }, { "epoch": 0.113875, "grad_norm": 2.1189992427825928, "grad_norm_var": 0.24860211648851874, "learning_rate": 0.0001, "loss": 1.1855, "loss/crossentropy": 2.2618770599365234, "loss/hidden": 0.96484375, "loss/logits": 0.1744484156370163, "loss/reg": 0.004618941340595484, "step": 911 }, { "epoch": 0.114, "grad_norm": 2.220656633377075, "grad_norm_var": 0.26349665304340514, "learning_rate": 0.0001, "loss": 1.2433, "loss/crossentropy": 2.232293128967285, "loss/hidden": 1.0, "loss/logits": 0.19716452062129974, "loss/reg": 0.004616775084286928, "step": 912 }, { "epoch": 0.114125, "grad_norm": 2.525851011276245, "grad_norm_var": 0.2624124723651427, "learning_rate": 0.0001, "loss": 1.2848, "loss/crossentropy": 2.2225029468536377, "loss/hidden": 1.0703125, "loss/logits": 0.16834387183189392, "loss/reg": 0.004614519886672497, "step": 913 }, { "epoch": 0.11425, "grad_norm": 2.6499905586242676, "grad_norm_var": 0.2604882837896163, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.7070581912994385, "loss/hidden": 0.953125, "loss/logits": 0.16290049254894257, "loss/reg": 0.004612345714122057, "step": 914 }, { "epoch": 0.114375, "grad_norm": 2.417423963546753, "grad_norm_var": 0.1676183523849204, "learning_rate": 0.0001, "loss": 1.2021, "loss/crossentropy": 1.8924601078033447, "loss/hidden": 0.97265625, "loss/logits": 0.183339461684227, "loss/reg": 0.004610271658748388, "step": 915 }, { "epoch": 0.1145, "grad_norm": 2.935338258743286, "grad_norm_var": 0.16695985677138575, "learning_rate": 0.0001, "loss": 1.2591, "loss/crossentropy": 2.2739417552948, "loss/hidden": 1.046875, "loss/logits": 0.1661624014377594, "loss/reg": 0.00460821995511651, "step": 916 }, { "epoch": 0.114625, "grad_norm": 2.1026499271392822, "grad_norm_var": 0.17195618728893744, "learning_rate": 0.0001, "loss": 0.9901, "loss/crossentropy": 2.655440330505371, "loss/hidden": 0.796875, "loss/logits": 0.14714528620243073, "loss/reg": 0.004606001079082489, "step": 917 }, { "epoch": 0.11475, "grad_norm": 2.9179234504699707, "grad_norm_var": 0.17394937416602924, "learning_rate": 0.0001, "loss": 1.014, "loss/crossentropy": 2.5501797199249268, "loss/hidden": 0.8125, "loss/logits": 0.15548643469810486, "loss/reg": 0.004603679291903973, "step": 918 }, { "epoch": 0.114875, "grad_norm": 2.9957125186920166, "grad_norm_var": 0.17673345245174207, "learning_rate": 0.0001, "loss": 1.0982, "loss/crossentropy": 2.3883516788482666, "loss/hidden": 0.8828125, "loss/logits": 0.16936160624027252, "loss/reg": 0.004601585678756237, "step": 919 }, { "epoch": 0.115, "grad_norm": 2.67694354057312, "grad_norm_var": 0.16965697193046006, "learning_rate": 0.0001, "loss": 1.1013, "loss/crossentropy": 2.3806746006011963, "loss/hidden": 0.91015625, "loss/logits": 0.14516542851924896, "loss/reg": 0.004599516745656729, "step": 920 }, { "epoch": 0.115125, "grad_norm": 2.1424858570098877, "grad_norm_var": 0.1827775525511394, "learning_rate": 0.0001, "loss": 1.0246, "loss/crossentropy": 2.2575674057006836, "loss/hidden": 0.828125, "loss/logits": 0.15050096809864044, "loss/reg": 0.004597416613250971, "step": 921 }, { "epoch": 0.11525, "grad_norm": 5.457708358764648, "grad_norm_var": 0.682343045667132, "learning_rate": 0.0001, "loss": 1.7819, "loss/crossentropy": 2.941784381866455, "loss/hidden": 1.40625, "loss/logits": 0.3296935558319092, "loss/reg": 0.004595189820975065, "step": 922 }, { "epoch": 0.115375, "grad_norm": 2.692840814590454, "grad_norm_var": 0.6163222739990933, "learning_rate": 0.0001, "loss": 1.3758, "loss/crossentropy": 2.374514102935791, "loss/hidden": 1.1171875, "loss/logits": 0.21266797184944153, "loss/reg": 0.004593092482537031, "step": 923 }, { "epoch": 0.1155, "grad_norm": 3.2622177600860596, "grad_norm_var": 0.6326076754218235, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.4111597537994385, "loss/hidden": 0.89453125, "loss/logits": 0.19610172510147095, "loss/reg": 0.004590968135744333, "step": 924 }, { "epoch": 0.115625, "grad_norm": 3.9593584537506104, "grad_norm_var": 0.7204108733775624, "learning_rate": 0.0001, "loss": 1.4237, "loss/crossentropy": 2.770081043243408, "loss/hidden": 1.1796875, "loss/logits": 0.19811320304870605, "loss/reg": 0.004588917829096317, "step": 925 }, { "epoch": 0.11575, "grad_norm": 2.504918336868286, "grad_norm_var": 0.7152879483588092, "learning_rate": 0.0001, "loss": 1.0618, "loss/crossentropy": 2.4681365489959717, "loss/hidden": 0.86328125, "loss/logits": 0.1526341289281845, "loss/reg": 0.004586971364915371, "step": 926 }, { "epoch": 0.115875, "grad_norm": 3.0209288597106934, "grad_norm_var": 0.6783647636612234, "learning_rate": 0.0001, "loss": 1.138, "loss/crossentropy": 2.3891263008117676, "loss/hidden": 0.91796875, "loss/logits": 0.17415405809879303, "loss/reg": 0.004584896378219128, "step": 927 }, { "epoch": 0.116, "grad_norm": 2.3569037914276123, "grad_norm_var": 0.6670896431721521, "learning_rate": 0.0001, "loss": 1.1342, "loss/crossentropy": 2.373786687850952, "loss/hidden": 0.93359375, "loss/logits": 0.15477266907691956, "loss/reg": 0.004582802765071392, "step": 928 }, { "epoch": 0.116125, "grad_norm": 2.584897041320801, "grad_norm_var": 0.6642540884373107, "learning_rate": 0.0001, "loss": 1.187, "loss/crossentropy": 2.5142221450805664, "loss/hidden": 0.95703125, "loss/logits": 0.18413202464580536, "loss/reg": 0.004580747336149216, "step": 929 }, { "epoch": 0.11625, "grad_norm": 3.1578471660614014, "grad_norm_var": 0.6622672348995062, "learning_rate": 0.0001, "loss": 1.1606, "loss/crossentropy": 2.6126952171325684, "loss/hidden": 0.91796875, "loss/logits": 0.19688570499420166, "loss/reg": 0.004578826949000359, "step": 930 }, { "epoch": 0.116375, "grad_norm": 2.6005330085754395, "grad_norm_var": 0.6513814069878736, "learning_rate": 0.0001, "loss": 1.0783, "loss/crossentropy": 2.405707597732544, "loss/hidden": 0.87109375, "loss/logits": 0.16143286228179932, "loss/reg": 0.004576742183417082, "step": 931 }, { "epoch": 0.1165, "grad_norm": 2.879091501235962, "grad_norm_var": 0.6517684060932252, "learning_rate": 0.0001, "loss": 1.0623, "loss/crossentropy": 2.5898799896240234, "loss/hidden": 0.84765625, "loss/logits": 0.16887424886226654, "loss/reg": 0.004575024824589491, "step": 932 }, { "epoch": 0.116625, "grad_norm": 3.1379029750823975, "grad_norm_var": 0.6008152897241831, "learning_rate": 0.0001, "loss": 1.2402, "loss/crossentropy": 2.3071014881134033, "loss/hidden": 1.0234375, "loss/logits": 0.1710711419582367, "loss/reg": 0.004573314916342497, "step": 933 }, { "epoch": 0.11675, "grad_norm": 4.292084217071533, "grad_norm_var": 0.6998094594428453, "learning_rate": 0.0001, "loss": 1.2793, "loss/crossentropy": 2.428403854370117, "loss/hidden": 1.0078125, "loss/logits": 0.22576534748077393, "loss/reg": 0.004571723286062479, "step": 934 }, { "epoch": 0.116875, "grad_norm": 3.4453883171081543, "grad_norm_var": 0.7057361661794924, "learning_rate": 0.0001, "loss": 1.3151, "loss/crossentropy": 2.603928327560425, "loss/hidden": 1.09375, "loss/logits": 0.1756502389907837, "loss/reg": 0.004570134915411472, "step": 935 }, { "epoch": 0.117, "grad_norm": 2.8401777744293213, "grad_norm_var": 0.6974157138244702, "learning_rate": 0.0001, "loss": 1.1098, "loss/crossentropy": 2.304800033569336, "loss/hidden": 0.91015625, "loss/logits": 0.15392211079597473, "loss/reg": 0.004568077158182859, "step": 936 }, { "epoch": 0.117125, "grad_norm": 2.5680997371673584, "grad_norm_var": 0.6517920111715199, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.341132402420044, "loss/hidden": 0.94140625, "loss/logits": 0.1625438630580902, "loss/reg": 0.004566343035548925, "step": 937 }, { "epoch": 0.11725, "grad_norm": 2.748947858810425, "grad_norm_var": 0.2850544648160998, "learning_rate": 0.0001, "loss": 1.2239, "loss/crossentropy": 2.4194369316101074, "loss/hidden": 0.99609375, "loss/logits": 0.18219077587127686, "loss/reg": 0.004564360249787569, "step": 938 }, { "epoch": 0.117375, "grad_norm": 3.1744375228881836, "grad_norm_var": 0.2796176021162296, "learning_rate": 0.0001, "loss": 1.049, "loss/crossentropy": 2.328961133956909, "loss/hidden": 0.86328125, "loss/logits": 0.14012068510055542, "loss/reg": 0.0045626200735569, "step": 939 }, { "epoch": 0.1175, "grad_norm": 2.8904807567596436, "grad_norm_var": 0.276910977824096, "learning_rate": 0.0001, "loss": 1.0396, "loss/crossentropy": 2.96333646774292, "loss/hidden": 0.84765625, "loss/logits": 0.146368145942688, "loss/reg": 0.004560848698019981, "step": 940 }, { "epoch": 0.117625, "grad_norm": 2.7542107105255127, "grad_norm_var": 0.2151558946350927, "learning_rate": 0.0001, "loss": 1.0154, "loss/crossentropy": 2.326488494873047, "loss/hidden": 0.828125, "loss/logits": 0.14169706404209137, "loss/reg": 0.0045591117814183235, "step": 941 }, { "epoch": 0.11775, "grad_norm": 2.8061575889587402, "grad_norm_var": 0.20356104069779402, "learning_rate": 0.0001, "loss": 1.067, "loss/crossentropy": 2.483823537826538, "loss/hidden": 0.85546875, "loss/logits": 0.16598659753799438, "loss/reg": 0.004557049833238125, "step": 942 }, { "epoch": 0.117875, "grad_norm": 2.4513025283813477, "grad_norm_var": 0.2187293570918861, "learning_rate": 0.0001, "loss": 1.1831, "loss/crossentropy": 2.3770830631256104, "loss/hidden": 0.9609375, "loss/logits": 0.176588237285614, "loss/reg": 0.0045554060488939285, "step": 943 }, { "epoch": 0.118, "grad_norm": 2.757690906524658, "grad_norm_var": 0.19878318945221735, "learning_rate": 0.0001, "loss": 1.2487, "loss/crossentropy": 2.335298538208008, "loss/hidden": 1.0078125, "loss/logits": 0.19532084465026855, "loss/reg": 0.00455334922298789, "step": 944 }, { "epoch": 0.118125, "grad_norm": 3.0741937160491943, "grad_norm_var": 0.19037881818987876, "learning_rate": 0.0001, "loss": 1.1111, "loss/crossentropy": 2.5113272666931152, "loss/hidden": 0.8984375, "loss/logits": 0.16711819171905518, "loss/reg": 0.004551599267870188, "step": 945 }, { "epoch": 0.11825, "grad_norm": 2.156649589538574, "grad_norm_var": 0.22844079123048383, "learning_rate": 0.0001, "loss": 1.0124, "loss/crossentropy": 2.654160976409912, "loss/hidden": 0.8203125, "loss/logits": 0.14654606580734253, "loss/reg": 0.004549470264464617, "step": 946 }, { "epoch": 0.118375, "grad_norm": 3.1699886322021484, "grad_norm_var": 0.22512891612332073, "learning_rate": 0.0001, "loss": 1.2032, "loss/crossentropy": 2.6897716522216797, "loss/hidden": 0.9921875, "loss/logits": 0.16554811596870422, "loss/reg": 0.0045473333448171616, "step": 947 }, { "epoch": 0.1185, "grad_norm": 54.68584442138672, "grad_norm_var": 167.50451750408678, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.5791327953338623, "loss/hidden": 0.85546875, "loss/logits": 0.12866336107254028, "loss/reg": 0.004545523319393396, "step": 948 }, { "epoch": 0.118625, "grad_norm": 3.2524545192718506, "grad_norm_var": 167.45880382689273, "learning_rate": 0.0001, "loss": 1.1858, "loss/crossentropy": 2.6711151599884033, "loss/hidden": 0.95703125, "loss/logits": 0.1833563894033432, "loss/reg": 0.00454343156889081, "step": 949 }, { "epoch": 0.11875, "grad_norm": 5.301136016845703, "grad_norm_var": 167.26685801766013, "learning_rate": 0.0001, "loss": 1.553, "loss/crossentropy": 2.7770638465881348, "loss/hidden": 1.28125, "loss/logits": 0.22637835144996643, "loss/reg": 0.004541344009339809, "step": 950 }, { "epoch": 0.118875, "grad_norm": 2.384737730026245, "grad_norm_var": 167.73447965423813, "learning_rate": 0.0001, "loss": 1.0308, "loss/crossentropy": 2.795858144760132, "loss/hidden": 0.83984375, "loss/logits": 0.14552150666713715, "loss/reg": 0.004539397079497576, "step": 951 }, { "epoch": 0.119, "grad_norm": 3.921651601791382, "grad_norm_var": 167.32475778000344, "learning_rate": 0.0001, "loss": 1.3721, "loss/crossentropy": 2.6767590045928955, "loss/hidden": 1.140625, "loss/logits": 0.1861056089401245, "loss/reg": 0.004537293687462807, "step": 952 }, { "epoch": 0.119125, "grad_norm": 2.567948818206787, "grad_norm_var": 167.32483199379854, "learning_rate": 0.0001, "loss": 1.022, "loss/crossentropy": 2.5112462043762207, "loss/hidden": 0.82421875, "loss/logits": 0.1524544656276703, "loss/reg": 0.004535375162959099, "step": 953 }, { "epoch": 0.11925, "grad_norm": 3.9113171100616455, "grad_norm_var": 166.86572618880632, "learning_rate": 0.0001, "loss": 1.4182, "loss/crossentropy": 2.447330951690674, "loss/hidden": 1.1640625, "loss/logits": 0.20876702666282654, "loss/reg": 0.004533402621746063, "step": 954 }, { "epoch": 0.119375, "grad_norm": 2.407547950744629, "grad_norm_var": 167.22501953627514, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.4667739868164062, "loss/hidden": 0.82421875, "loss/logits": 0.1411362886428833, "loss/reg": 0.004531473852694035, "step": 955 }, { "epoch": 0.1195, "grad_norm": 3.312300205230713, "grad_norm_var": 167.0454581165803, "learning_rate": 0.0001, "loss": 1.1527, "loss/crossentropy": 2.6299078464508057, "loss/hidden": 0.9375, "loss/logits": 0.16988055408000946, "loss/reg": 0.0045296428725123405, "step": 956 }, { "epoch": 0.119625, "grad_norm": 2.991645097732544, "grad_norm_var": 166.93650144942362, "learning_rate": 0.0001, "loss": 1.145, "loss/crossentropy": 2.484005928039551, "loss/hidden": 0.94140625, "loss/logits": 0.158270001411438, "loss/reg": 0.0045278542675077915, "step": 957 }, { "epoch": 0.11975, "grad_norm": 2.3066608905792236, "grad_norm_var": 167.18625092351098, "learning_rate": 0.0001, "loss": 1.0326, "loss/crossentropy": 2.3425681591033936, "loss/hidden": 0.828125, "loss/logits": 0.1592123955488205, "loss/reg": 0.004525760654360056, "step": 958 }, { "epoch": 0.119875, "grad_norm": 2.4849298000335693, "grad_norm_var": 167.16910661257995, "learning_rate": 0.0001, "loss": 1.016, "loss/crossentropy": 2.577565908432007, "loss/hidden": 0.8203125, "loss/logits": 0.15045437216758728, "loss/reg": 0.00452386075630784, "step": 959 }, { "epoch": 0.12, "grad_norm": 3.1377410888671875, "grad_norm_var": 166.99899214100878, "learning_rate": 0.0001, "loss": 1.1344, "loss/crossentropy": 2.570946455001831, "loss/hidden": 0.921875, "loss/logits": 0.16726532578468323, "loss/reg": 0.004521827679127455, "step": 960 }, { "epoch": 0.120125, "grad_norm": 4.738165378570557, "grad_norm_var": 166.45265671613615, "learning_rate": 0.0001, "loss": 1.4551, "loss/crossentropy": 2.71376371383667, "loss/hidden": 1.203125, "loss/logits": 0.20679137110710144, "loss/reg": 0.0045196013525128365, "step": 961 }, { "epoch": 0.12025, "grad_norm": 3.7636489868164062, "grad_norm_var": 165.70042257567124, "learning_rate": 0.0001, "loss": 1.3982, "loss/crossentropy": 2.3803255558013916, "loss/hidden": 1.1015625, "loss/logits": 0.2515062689781189, "loss/reg": 0.004517595283687115, "step": 962 }, { "epoch": 0.120375, "grad_norm": 2.638967752456665, "grad_norm_var": 165.95531506158162, "learning_rate": 0.0001, "loss": 1.2337, "loss/crossentropy": 2.2372846603393555, "loss/hidden": 1.0234375, "loss/logits": 0.16512709856033325, "loss/reg": 0.004515463951975107, "step": 963 }, { "epoch": 0.1205, "grad_norm": 45.81782531738281, "grad_norm_var": 113.88107496030446, "learning_rate": 0.0001, "loss": 1.1605, "loss/crossentropy": 2.742631435394287, "loss/hidden": 0.9453125, "loss/logits": 0.17006908357143402, "loss/reg": 0.004513174295425415, "step": 964 }, { "epoch": 0.120625, "grad_norm": 2.8806581497192383, "grad_norm_var": 114.02262985566777, "learning_rate": 0.0001, "loss": 1.2776, "loss/crossentropy": 2.4858973026275635, "loss/hidden": 1.0234375, "loss/logits": 0.20908400416374207, "loss/reg": 0.004510868340730667, "step": 965 }, { "epoch": 0.12075, "grad_norm": 2.2246646881103516, "grad_norm_var": 114.86410220669458, "learning_rate": 0.0001, "loss": 1.1264, "loss/crossentropy": 2.499250888824463, "loss/hidden": 0.91015625, "loss/logits": 0.17112451791763306, "loss/reg": 0.004508919548243284, "step": 966 }, { "epoch": 0.120875, "grad_norm": 2.6645731925964355, "grad_norm_var": 114.74462216300226, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.5269155502319336, "loss/hidden": 1.03125, "loss/logits": 0.17614489793777466, "loss/reg": 0.004506917670369148, "step": 967 }, { "epoch": 0.121, "grad_norm": 2.5289218425750732, "grad_norm_var": 115.20270599436985, "learning_rate": 0.0001, "loss": 0.9631, "loss/crossentropy": 2.5555944442749023, "loss/hidden": 0.78515625, "loss/logits": 0.13286443054676056, "loss/reg": 0.004505137912929058, "step": 968 }, { "epoch": 0.121125, "grad_norm": 2.0906901359558105, "grad_norm_var": 115.41297732177252, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.4322237968444824, "loss/hidden": 0.8515625, "loss/logits": 0.1318206787109375, "loss/reg": 0.0045034573413431644, "step": 969 }, { "epoch": 0.12125, "grad_norm": 2.5444202423095703, "grad_norm_var": 115.84094031889703, "learning_rate": 0.0001, "loss": 1.1368, "loss/crossentropy": 2.548710584640503, "loss/hidden": 0.9296875, "loss/logits": 0.16206462681293488, "loss/reg": 0.004501515068113804, "step": 970 }, { "epoch": 0.121375, "grad_norm": 2.175011157989502, "grad_norm_var": 115.94123463799326, "learning_rate": 0.0001, "loss": 1.0439, "loss/crossentropy": 2.5955300331115723, "loss/hidden": 0.8515625, "loss/logits": 0.14735567569732666, "loss/reg": 0.004499473143368959, "step": 971 }, { "epoch": 0.1215, "grad_norm": 2.360872507095337, "grad_norm_var": 116.2777207470046, "learning_rate": 0.0001, "loss": 1.078, "loss/crossentropy": 2.328791379928589, "loss/hidden": 0.88671875, "loss/logits": 0.14633293449878693, "loss/reg": 0.00449743214994669, "step": 972 }, { "epoch": 0.121625, "grad_norm": 2.2626869678497314, "grad_norm_var": 116.55077789644868, "learning_rate": 0.0001, "loss": 1.0265, "loss/crossentropy": 2.6862330436706543, "loss/hidden": 0.83984375, "loss/logits": 0.1417045295238495, "loss/reg": 0.004495698027312756, "step": 973 }, { "epoch": 0.12175, "grad_norm": 2.239927291870117, "grad_norm_var": 116.57870277427698, "learning_rate": 0.0001, "loss": 0.9865, "loss/crossentropy": 2.6493136882781982, "loss/hidden": 0.8046875, "loss/logits": 0.13682736456394196, "loss/reg": 0.004493638873100281, "step": 974 }, { "epoch": 0.121875, "grad_norm": 3.526413917541504, "grad_norm_var": 116.24036193195982, "learning_rate": 0.0001, "loss": 1.4212, "loss/crossentropy": 2.3598437309265137, "loss/hidden": 1.1640625, "loss/logits": 0.21225669980049133, "loss/reg": 0.004491583444178104, "step": 975 }, { "epoch": 0.122, "grad_norm": 2.4012386798858643, "grad_norm_var": 116.5037542152016, "learning_rate": 0.0001, "loss": 1.0692, "loss/crossentropy": 2.8136839866638184, "loss/hidden": 0.8828125, "loss/logits": 0.14152291417121887, "loss/reg": 0.004489597398787737, "step": 976 }, { "epoch": 0.122125, "grad_norm": 3.48690128326416, "grad_norm_var": 116.71680821300758, "learning_rate": 0.0001, "loss": 1.1166, "loss/crossentropy": 2.5272679328918457, "loss/hidden": 0.921875, "loss/logits": 0.1498267650604248, "loss/reg": 0.004487714730203152, "step": 977 }, { "epoch": 0.12225, "grad_norm": 2.596402406692505, "grad_norm_var": 117.0489228171561, "learning_rate": 0.0001, "loss": 1.0618, "loss/crossentropy": 2.5853097438812256, "loss/hidden": 0.875, "loss/logits": 0.14198589324951172, "loss/reg": 0.0044856141321361065, "step": 978 }, { "epoch": 0.122375, "grad_norm": 2.4075088500976562, "grad_norm_var": 117.1336997192439, "learning_rate": 0.0001, "loss": 1.0708, "loss/crossentropy": 2.4603147506713867, "loss/hidden": 0.875, "loss/logits": 0.1509513258934021, "loss/reg": 0.004483620636165142, "step": 979 }, { "epoch": 0.1225, "grad_norm": 2.511711597442627, "grad_norm_var": 0.17809257377307758, "learning_rate": 0.0001, "loss": 1.0112, "loss/crossentropy": 2.369588613510132, "loss/hidden": 0.80859375, "loss/logits": 0.15779206156730652, "loss/reg": 0.00448161456733942, "step": 980 }, { "epoch": 0.122625, "grad_norm": 2.4200518131256104, "grad_norm_var": 0.1714391921620101, "learning_rate": 0.0001, "loss": 1.1021, "loss/crossentropy": 2.5299947261810303, "loss/hidden": 0.8828125, "loss/logits": 0.17450904846191406, "loss/reg": 0.004479666240513325, "step": 981 }, { "epoch": 0.12275, "grad_norm": 2.180694580078125, "grad_norm_var": 0.17333618624260225, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.4115102291107178, "loss/hidden": 0.80859375, "loss/logits": 0.13229887187480927, "loss/reg": 0.004477777983993292, "step": 982 }, { "epoch": 0.122875, "grad_norm": 2.063762664794922, "grad_norm_var": 0.1847061967544647, "learning_rate": 0.0001, "loss": 1.0019, "loss/crossentropy": 2.617342948913574, "loss/hidden": 0.8046875, "loss/logits": 0.1524919718503952, "loss/reg": 0.004475918132811785, "step": 983 }, { "epoch": 0.123, "grad_norm": 2.1369118690490723, "grad_norm_var": 0.19213655390988696, "learning_rate": 0.0001, "loss": 0.9803, "loss/crossentropy": 2.2203562259674072, "loss/hidden": 0.796875, "loss/logits": 0.13871444761753082, "loss/reg": 0.004473875742405653, "step": 984 }, { "epoch": 0.123125, "grad_norm": 2.5142624378204346, "grad_norm_var": 0.18233307349070932, "learning_rate": 0.0001, "loss": 1.2147, "loss/crossentropy": 2.369795322418213, "loss/hidden": 0.98046875, "loss/logits": 0.1895258128643036, "loss/reg": 0.00447199959307909, "step": 985 }, { "epoch": 0.12325, "grad_norm": 2.2619707584381104, "grad_norm_var": 0.18524330473807685, "learning_rate": 0.0001, "loss": 1.0356, "loss/crossentropy": 2.594536781311035, "loss/hidden": 0.84765625, "loss/logits": 0.1432015299797058, "loss/reg": 0.0044701374135911465, "step": 986 }, { "epoch": 0.123375, "grad_norm": 2.548429012298584, "grad_norm_var": 0.1791892169034893, "learning_rate": 0.0001, "loss": 1.1041, "loss/crossentropy": 2.4283149242401123, "loss/hidden": 0.88671875, "loss/logits": 0.17271637916564941, "loss/reg": 0.004468323662877083, "step": 987 }, { "epoch": 0.1235, "grad_norm": 1.967695951461792, "grad_norm_var": 0.19588156260189724, "learning_rate": 0.0001, "loss": 1.1141, "loss/crossentropy": 2.6532421112060547, "loss/hidden": 0.9140625, "loss/logits": 0.15532562136650085, "loss/reg": 0.004466407001018524, "step": 988 }, { "epoch": 0.123625, "grad_norm": 1.9731650352478027, "grad_norm_var": 0.2091392377621749, "learning_rate": 0.0001, "loss": 0.9699, "loss/crossentropy": 2.6200947761535645, "loss/hidden": 0.796875, "loss/logits": 0.1283724009990692, "loss/reg": 0.004464692436158657, "step": 989 }, { "epoch": 0.12375, "grad_norm": 2.0132744312286377, "grad_norm_var": 0.21876841065467237, "learning_rate": 0.0001, "loss": 1.0469, "loss/crossentropy": 2.4755969047546387, "loss/hidden": 0.84375, "loss/logits": 0.1585705578327179, "loss/reg": 0.00446262676268816, "step": 990 }, { "epoch": 0.123875, "grad_norm": 2.1972060203552246, "grad_norm_var": 0.13632242813181178, "learning_rate": 0.0001, "loss": 1.0328, "loss/crossentropy": 2.3555867671966553, "loss/hidden": 0.85546875, "loss/logits": 0.13275080919265747, "loss/reg": 0.004460789728909731, "step": 991 }, { "epoch": 0.124, "grad_norm": 3.8369944095611572, "grad_norm_var": 0.2739970385829828, "learning_rate": 0.0001, "loss": 1.7339, "loss/crossentropy": 2.2441718578338623, "loss/hidden": 1.3984375, "loss/logits": 0.2908269166946411, "loss/reg": 0.00445876969024539, "step": 992 }, { "epoch": 0.124125, "grad_norm": 2.786052703857422, "grad_norm_var": 0.20731647630768535, "learning_rate": 0.0001, "loss": 1.1283, "loss/crossentropy": 2.646028995513916, "loss/hidden": 0.9140625, "loss/logits": 0.1697021722793579, "loss/reg": 0.004456843715161085, "step": 993 }, { "epoch": 0.12425, "grad_norm": 2.5664174556732178, "grad_norm_var": 0.20659147596586322, "learning_rate": 0.0001, "loss": 1.1557, "loss/crossentropy": 2.3940696716308594, "loss/hidden": 0.94921875, "loss/logits": 0.16191905736923218, "loss/reg": 0.004455073736608028, "step": 994 }, { "epoch": 0.124375, "grad_norm": 2.2383058071136475, "grad_norm_var": 0.20819184179118794, "learning_rate": 0.0001, "loss": 0.9266, "loss/crossentropy": 2.499830722808838, "loss/hidden": 0.76171875, "loss/logits": 0.12036766111850739, "loss/reg": 0.004453308880329132, "step": 995 }, { "epoch": 0.1245, "grad_norm": 2.340665340423584, "grad_norm_var": 0.207211701006554, "learning_rate": 0.0001, "loss": 1.1757, "loss/crossentropy": 2.1450212001800537, "loss/hidden": 0.96484375, "loss/logits": 0.16630741953849792, "loss/reg": 0.004451683722436428, "step": 996 }, { "epoch": 0.124625, "grad_norm": 2.18617582321167, "grad_norm_var": 0.20931483319411062, "learning_rate": 0.0001, "loss": 1.1681, "loss/crossentropy": 2.145817518234253, "loss/hidden": 0.94140625, "loss/logits": 0.18221250176429749, "loss/reg": 0.004450384993106127, "step": 997 }, { "epoch": 0.12475, "grad_norm": 2.809575319290161, "grad_norm_var": 0.218725690321934, "learning_rate": 0.0001, "loss": 1.0028, "loss/crossentropy": 2.4370713233947754, "loss/hidden": 0.81640625, "loss/logits": 0.14186908304691315, "loss/reg": 0.004448299296200275, "step": 998 }, { "epoch": 0.124875, "grad_norm": 2.1984119415283203, "grad_norm_var": 0.21377643978811706, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.4677696228027344, "loss/hidden": 1.078125, "loss/logits": 0.1787460744380951, "loss/reg": 0.004446576349437237, "step": 999 }, { "epoch": 0.125, "grad_norm": 2.6378896236419678, "grad_norm_var": 0.2111563626515095, "learning_rate": 0.0001, "loss": 1.1774, "loss/crossentropy": 2.496150255203247, "loss/hidden": 0.93359375, "loss/logits": 0.19933247566223145, "loss/reg": 0.004444715566933155, "step": 1000 }, { "epoch": 0.125125, "grad_norm": 2.227482795715332, "grad_norm_var": 0.213544138660752, "learning_rate": 0.0001, "loss": 1.0916, "loss/crossentropy": 2.3874671459198, "loss/hidden": 0.89453125, "loss/logits": 0.15264838933944702, "loss/reg": 0.004442666191607714, "step": 1001 }, { "epoch": 0.12525, "grad_norm": 2.6360232830047607, "grad_norm_var": 0.21419004520447135, "learning_rate": 0.0001, "loss": 1.2063, "loss/crossentropy": 2.181767225265503, "loss/hidden": 1.0234375, "loss/logits": 0.1384468972682953, "loss/reg": 0.004440974909812212, "step": 1002 }, { "epoch": 0.125375, "grad_norm": 2.564113140106201, "grad_norm_var": 0.2144159920830437, "learning_rate": 0.0001, "loss": 1.2358, "loss/crossentropy": 2.3964531421661377, "loss/hidden": 1.0234375, "loss/logits": 0.16797608137130737, "loss/reg": 0.00443902425467968, "step": 1003 }, { "epoch": 0.1255, "grad_norm": 2.2647745609283447, "grad_norm_var": 0.20087855485435188, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.4876301288604736, "loss/hidden": 0.90625, "loss/logits": 0.1588534414768219, "loss/reg": 0.004437169525772333, "step": 1004 }, { "epoch": 0.125625, "grad_norm": 2.9978485107421875, "grad_norm_var": 0.19899346976312698, "learning_rate": 0.0001, "loss": 1.0965, "loss/crossentropy": 2.5442988872528076, "loss/hidden": 0.8984375, "loss/logits": 0.15375682711601257, "loss/reg": 0.00443507032468915, "step": 1005 }, { "epoch": 0.12575, "grad_norm": 3.734666585922241, "grad_norm_var": 0.26529031932980823, "learning_rate": 0.0001, "loss": 1.3977, "loss/crossentropy": 2.6442348957061768, "loss/hidden": 1.109375, "loss/logits": 0.24400165677070618, "loss/reg": 0.0044328500516712666, "step": 1006 }, { "epoch": 0.125875, "grad_norm": 2.0463345050811768, "grad_norm_var": 0.27559841867322327, "learning_rate": 0.0001, "loss": 0.9624, "loss/crossentropy": 2.744716167449951, "loss/hidden": 0.77734375, "loss/logits": 0.14069810509681702, "loss/reg": 0.004430860280990601, "step": 1007 }, { "epoch": 0.126, "grad_norm": 2.5981385707855225, "grad_norm_var": 0.1720635201097591, "learning_rate": 0.0001, "loss": 1.1825, "loss/crossentropy": 2.3077027797698975, "loss/hidden": 0.95703125, "loss/logits": 0.18115082383155823, "loss/reg": 0.004428706131875515, "step": 1008 }, { "epoch": 0.126125, "grad_norm": 2.8853800296783447, "grad_norm_var": 0.17577912545770383, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 3.0455784797668457, "loss/hidden": 0.9375, "loss/logits": 0.19088850915431976, "loss/reg": 0.004426531493663788, "step": 1009 }, { "epoch": 0.12625, "grad_norm": 3.3546810150146484, "grad_norm_var": 0.2154711693487454, "learning_rate": 0.0001, "loss": 1.356, "loss/crossentropy": 2.360203504562378, "loss/hidden": 1.125, "loss/logits": 0.18671754002571106, "loss/reg": 0.004424425307661295, "step": 1010 }, { "epoch": 0.126375, "grad_norm": 3.1136724948883057, "grad_norm_var": 0.2202687348010554, "learning_rate": 0.0001, "loss": 1.5006, "loss/crossentropy": 1.7926069498062134, "loss/hidden": 1.2265625, "loss/logits": 0.22981694340705872, "loss/reg": 0.004422247409820557, "step": 1011 }, { "epoch": 0.1265, "grad_norm": 2.9608895778656006, "grad_norm_var": 0.2177180299988663, "learning_rate": 0.0001, "loss": 0.9969, "loss/crossentropy": 2.716583251953125, "loss/hidden": 0.80859375, "loss/logits": 0.14412102103233337, "loss/reg": 0.004419958218932152, "step": 1012 }, { "epoch": 0.126625, "grad_norm": 2.627195358276367, "grad_norm_var": 0.1996009545068233, "learning_rate": 0.0001, "loss": 1.2014, "loss/crossentropy": 2.4077627658843994, "loss/hidden": 0.96484375, "loss/logits": 0.19235679507255554, "loss/reg": 0.004417847376316786, "step": 1013 }, { "epoch": 0.12675, "grad_norm": 2.6041698455810547, "grad_norm_var": 0.20001931967983994, "learning_rate": 0.0001, "loss": 1.0357, "loss/crossentropy": 2.6691579818725586, "loss/hidden": 0.85546875, "loss/logits": 0.13609513640403748, "loss/reg": 0.0044156271032989025, "step": 1014 }, { "epoch": 0.126875, "grad_norm": 2.6999282836914062, "grad_norm_var": 0.18114680748988857, "learning_rate": 0.0001, "loss": 1.1957, "loss/crossentropy": 2.2058401107788086, "loss/hidden": 0.98828125, "loss/logits": 0.16330450773239136, "loss/reg": 0.004413560498505831, "step": 1015 }, { "epoch": 0.127, "grad_norm": 2.9605417251586914, "grad_norm_var": 0.1829561774470515, "learning_rate": 0.0001, "loss": 1.1698, "loss/crossentropy": 2.5311779975891113, "loss/hidden": 0.9296875, "loss/logits": 0.19603696465492249, "loss/reg": 0.0044115264900028706, "step": 1016 }, { "epoch": 0.127125, "grad_norm": 3.1632213592529297, "grad_norm_var": 0.17033870731749232, "learning_rate": 0.0001, "loss": 1.1371, "loss/crossentropy": 2.628852367401123, "loss/hidden": 0.9453125, "loss/logits": 0.1477031111717224, "loss/reg": 0.0044094715267419815, "step": 1017 }, { "epoch": 0.12725, "grad_norm": 2.1563079357147217, "grad_norm_var": 0.19685525865983494, "learning_rate": 0.0001, "loss": 1.156, "loss/crossentropy": 2.51649808883667, "loss/hidden": 0.93359375, "loss/logits": 0.17829856276512146, "loss/reg": 0.004407336004078388, "step": 1018 }, { "epoch": 0.127375, "grad_norm": 2.838027238845825, "grad_norm_var": 0.19308506502144737, "learning_rate": 0.0001, "loss": 1.1716, "loss/crossentropy": 1.851514458656311, "loss/hidden": 0.984375, "loss/logits": 0.1431439369916916, "loss/reg": 0.004405440296977758, "step": 1019 }, { "epoch": 0.1275, "grad_norm": 3.7514472007751465, "grad_norm_var": 0.2225789179284817, "learning_rate": 0.0001, "loss": 1.2725, "loss/crossentropy": 2.4725522994995117, "loss/hidden": 0.99609375, "loss/logits": 0.23235675692558289, "loss/reg": 0.004403635859489441, "step": 1020 }, { "epoch": 0.127625, "grad_norm": 2.899569034576416, "grad_norm_var": 0.22197611268337217, "learning_rate": 0.0001, "loss": 1.3405, "loss/crossentropy": 2.393155336380005, "loss/hidden": 1.109375, "loss/logits": 0.187089741230011, "loss/reg": 0.004401590209454298, "step": 1021 }, { "epoch": 0.12775, "grad_norm": 3.2884371280670166, "grad_norm_var": 0.18473910601510302, "learning_rate": 0.0001, "loss": 1.6095, "loss/crossentropy": 2.3898444175720215, "loss/hidden": 1.3203125, "loss/logits": 0.24515338242053986, "loss/reg": 0.004399486817419529, "step": 1022 }, { "epoch": 0.127875, "grad_norm": 3.160599708557129, "grad_norm_var": 0.13970793310639895, "learning_rate": 0.0001, "loss": 1.1549, "loss/crossentropy": 2.3978278636932373, "loss/hidden": 0.96484375, "loss/logits": 0.14608745276927948, "loss/reg": 0.004397205542773008, "step": 1023 }, { "epoch": 0.128, "grad_norm": 3.4500718116760254, "grad_norm_var": 0.14607975431924464, "learning_rate": 0.0001, "loss": 1.3923, "loss/crossentropy": 1.8279949426651, "loss/hidden": 1.1640625, "loss/logits": 0.18429754674434662, "loss/reg": 0.00439491355791688, "step": 1024 }, { "epoch": 0.128125, "grad_norm": 3.99407696723938, "grad_norm_var": 0.20675474417581274, "learning_rate": 0.0001, "loss": 1.5049, "loss/crossentropy": 2.702716588973999, "loss/hidden": 1.2265625, "loss/logits": 0.23438766598701477, "loss/reg": 0.004392672795802355, "step": 1025 }, { "epoch": 0.12825, "grad_norm": 2.2928214073181152, "grad_norm_var": 0.23606107387848377, "learning_rate": 0.0001, "loss": 1.2118, "loss/crossentropy": 2.398599624633789, "loss/hidden": 0.9921875, "loss/logits": 0.17572686076164246, "loss/reg": 0.004390507936477661, "step": 1026 }, { "epoch": 0.128375, "grad_norm": 3.0106916427612305, "grad_norm_var": 0.2351295893724907, "learning_rate": 0.0001, "loss": 1.2084, "loss/crossentropy": 2.242279529571533, "loss/hidden": 0.98046875, "loss/logits": 0.18401223421096802, "loss/reg": 0.004388165660202503, "step": 1027 }, { "epoch": 0.1285, "grad_norm": 3.3926002979278564, "grad_norm_var": 0.24503759295084418, "learning_rate": 0.0001, "loss": 1.2829, "loss/crossentropy": 2.689535617828369, "loss/hidden": 1.0625, "loss/logits": 0.1765148937702179, "loss/reg": 0.0043861158192157745, "step": 1028 }, { "epoch": 0.128625, "grad_norm": 4.1095452308654785, "grad_norm_var": 0.3051103506304455, "learning_rate": 0.0001, "loss": 1.4167, "loss/crossentropy": 2.3078904151916504, "loss/hidden": 1.1484375, "loss/logits": 0.224439799785614, "loss/reg": 0.004383730702102184, "step": 1029 }, { "epoch": 0.12875, "grad_norm": 2.599076747894287, "grad_norm_var": 0.30545598256471346, "learning_rate": 0.0001, "loss": 1.1392, "loss/crossentropy": 2.7188191413879395, "loss/hidden": 0.890625, "loss/logits": 0.20472605526447296, "loss/reg": 0.004381467588245869, "step": 1030 }, { "epoch": 0.128875, "grad_norm": 2.078481435775757, "grad_norm_var": 0.36360767736667393, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.741642713546753, "loss/hidden": 0.82421875, "loss/logits": 0.15186432003974915, "loss/reg": 0.00437910296022892, "step": 1031 }, { "epoch": 0.129, "grad_norm": 5.224343776702881, "grad_norm_var": 0.6503873685492408, "learning_rate": 0.0001, "loss": 1.3017, "loss/crossentropy": 2.672182559967041, "loss/hidden": 1.0390625, "loss/logits": 0.21886497735977173, "loss/reg": 0.0043770503252744675, "step": 1032 }, { "epoch": 0.129125, "grad_norm": 3.197111129760742, "grad_norm_var": 0.6502338467882434, "learning_rate": 0.0001, "loss": 1.3376, "loss/crossentropy": 2.442309617996216, "loss/hidden": 1.1015625, "loss/logits": 0.19233301281929016, "loss/reg": 0.004375019110739231, "step": 1033 }, { "epoch": 0.12925, "grad_norm": 2.78690767288208, "grad_norm_var": 0.5860556952241872, "learning_rate": 0.0001, "loss": 1.1761, "loss/crossentropy": 2.430758237838745, "loss/hidden": 0.9609375, "loss/logits": 0.17138496041297913, "loss/reg": 0.00437304237857461, "step": 1034 }, { "epoch": 0.129375, "grad_norm": 2.5011260509490967, "grad_norm_var": 0.6118626954588627, "learning_rate": 0.0001, "loss": 1.0916, "loss/crossentropy": 2.541623115539551, "loss/hidden": 0.89453125, "loss/logits": 0.15334823727607727, "loss/reg": 0.004370801616460085, "step": 1035 }, { "epoch": 0.1295, "grad_norm": 2.1834847927093506, "grad_norm_var": 0.6572482832048148, "learning_rate": 0.0001, "loss": 1.002, "loss/crossentropy": 2.252703905105591, "loss/hidden": 0.81640625, "loss/logits": 0.1418866515159607, "loss/reg": 0.004368768073618412, "step": 1036 }, { "epoch": 0.129625, "grad_norm": 3.4379196166992188, "grad_norm_var": 0.6584227357505256, "learning_rate": 0.0001, "loss": 1.2578, "loss/crossentropy": 2.5080173015594482, "loss/hidden": 1.03125, "loss/logits": 0.18284769356250763, "loss/reg": 0.004366564564406872, "step": 1037 }, { "epoch": 0.12975, "grad_norm": 2.793656587600708, "grad_norm_var": 0.6658574542034102, "learning_rate": 0.0001, "loss": 1.5344, "loss/crossentropy": 2.028933048248291, "loss/hidden": 1.265625, "loss/logits": 0.22508540749549866, "loss/reg": 0.004364544991403818, "step": 1038 }, { "epoch": 0.129875, "grad_norm": 2.5851385593414307, "grad_norm_var": 0.6848422923307773, "learning_rate": 0.0001, "loss": 1.1177, "loss/crossentropy": 2.3001692295074463, "loss/hidden": 0.9140625, "loss/logits": 0.16001108288764954, "loss/reg": 0.0043626632541418076, "step": 1039 }, { "epoch": 0.13, "grad_norm": 3.0864601135253906, "grad_norm_var": 0.6762458829728395, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 2.3810689449310303, "loss/hidden": 0.98828125, "loss/logits": 0.17900380492210388, "loss/reg": 0.0043608080595731735, "step": 1040 }, { "epoch": 0.130125, "grad_norm": 2.6737496852874756, "grad_norm_var": 0.6242103012797673, "learning_rate": 0.0001, "loss": 1.3919, "loss/crossentropy": 2.0557336807250977, "loss/hidden": 1.1328125, "loss/logits": 0.2154931128025055, "loss/reg": 0.004358771722763777, "step": 1041 }, { "epoch": 0.13025, "grad_norm": 2.583439350128174, "grad_norm_var": 0.6022000179942284, "learning_rate": 0.0001, "loss": 1.0657, "loss/crossentropy": 2.530609607696533, "loss/hidden": 0.875, "loss/logits": 0.14716514945030212, "loss/reg": 0.004356934688985348, "step": 1042 }, { "epoch": 0.130375, "grad_norm": 2.3127689361572266, "grad_norm_var": 0.6330661539787814, "learning_rate": 0.0001, "loss": 1.0062, "loss/crossentropy": 2.5741043090820312, "loss/hidden": 0.83203125, "loss/logits": 0.1306590735912323, "loss/reg": 0.004354908596724272, "step": 1043 }, { "epoch": 0.1305, "grad_norm": 2.1028034687042236, "grad_norm_var": 0.6646412556630875, "learning_rate": 0.0001, "loss": 1.005, "loss/crossentropy": 2.2496836185455322, "loss/hidden": 0.8359375, "loss/logits": 0.12550613284111023, "loss/reg": 0.004353053402155638, "step": 1044 }, { "epoch": 0.130625, "grad_norm": 2.9018990993499756, "grad_norm_var": 0.5595824371855532, "learning_rate": 0.0001, "loss": 1.0929, "loss/crossentropy": 2.459836959838867, "loss/hidden": 0.88671875, "loss/logits": 0.16267293691635132, "loss/reg": 0.004351151175796986, "step": 1045 }, { "epoch": 0.13075, "grad_norm": 2.1798477172851562, "grad_norm_var": 0.582665735357125, "learning_rate": 0.0001, "loss": 1.0568, "loss/crossentropy": 2.393702983856201, "loss/hidden": 0.85546875, "loss/logits": 0.15785646438598633, "loss/reg": 0.004349268972873688, "step": 1046 }, { "epoch": 0.130875, "grad_norm": 3.7185163497924805, "grad_norm_var": 0.5953326384247966, "learning_rate": 0.0001, "loss": 1.114, "loss/crossentropy": 2.5875675678253174, "loss/hidden": 0.91796875, "loss/logits": 0.1525106430053711, "loss/reg": 0.004347451031208038, "step": 1047 }, { "epoch": 0.131, "grad_norm": 2.5062150955200195, "grad_norm_var": 0.21175117035612606, "learning_rate": 0.0001, "loss": 1.2681, "loss/crossentropy": 2.432136297225952, "loss/hidden": 1.0390625, "loss/logits": 0.18555600941181183, "loss/reg": 0.004345426335930824, "step": 1048 }, { "epoch": 0.131125, "grad_norm": 2.811901807785034, "grad_norm_var": 0.19661994295048071, "learning_rate": 0.0001, "loss": 1.041, "loss/crossentropy": 2.586029291152954, "loss/hidden": 0.85546875, "loss/logits": 0.14211300015449524, "loss/reg": 0.004343352280557156, "step": 1049 }, { "epoch": 0.13125, "grad_norm": 2.6836400032043457, "grad_norm_var": 0.19606042121245745, "learning_rate": 0.0001, "loss": 1.1189, "loss/crossentropy": 2.2877132892608643, "loss/hidden": 0.92578125, "loss/logits": 0.14965856075286865, "loss/reg": 0.004341335967183113, "step": 1050 }, { "epoch": 0.131375, "grad_norm": 3.004340171813965, "grad_norm_var": 0.19911977640688458, "learning_rate": 0.0001, "loss": 0.9704, "loss/crossentropy": 2.545414686203003, "loss/hidden": 0.79296875, "loss/logits": 0.13407136499881744, "loss/reg": 0.004339275881648064, "step": 1051 }, { "epoch": 0.1315, "grad_norm": 2.3175387382507324, "grad_norm_var": 0.19060218969874068, "learning_rate": 0.0001, "loss": 1.2081, "loss/crossentropy": 2.1056127548217773, "loss/hidden": 1.0, "loss/logits": 0.16477006673812866, "loss/reg": 0.004337204620242119, "step": 1052 }, { "epoch": 0.131625, "grad_norm": 2.9183707237243652, "grad_norm_var": 0.15851891177443728, "learning_rate": 0.0001, "loss": 1.1596, "loss/crossentropy": 2.5549087524414062, "loss/hidden": 0.93359375, "loss/logits": 0.18266820907592773, "loss/reg": 0.004335105884820223, "step": 1053 }, { "epoch": 0.13175, "grad_norm": 2.005140781402588, "grad_norm_var": 0.18740257136220345, "learning_rate": 0.0001, "loss": 1.0446, "loss/crossentropy": 2.5802786350250244, "loss/hidden": 0.84765625, "loss/logits": 0.1536553055047989, "loss/reg": 0.00433309143409133, "step": 1054 }, { "epoch": 0.131875, "grad_norm": 2.5984582901000977, "grad_norm_var": 0.18729938166855695, "learning_rate": 0.0001, "loss": 1.1794, "loss/crossentropy": 2.447845458984375, "loss/hidden": 0.95703125, "loss/logits": 0.17910084128379822, "loss/reg": 0.0043309698812663555, "step": 1055 }, { "epoch": 0.132, "grad_norm": 2.2338852882385254, "grad_norm_var": 0.1831504662831539, "learning_rate": 0.0001, "loss": 1.0552, "loss/crossentropy": 2.322484254837036, "loss/hidden": 0.859375, "loss/logits": 0.15252941846847534, "loss/reg": 0.004328942392021418, "step": 1056 }, { "epoch": 0.132125, "grad_norm": 2.383525848388672, "grad_norm_var": 0.18544613518572697, "learning_rate": 0.0001, "loss": 1.2197, "loss/crossentropy": 2.3245205879211426, "loss/hidden": 1.0078125, "loss/logits": 0.16857783496379852, "loss/reg": 0.004326963797211647, "step": 1057 }, { "epoch": 0.13225, "grad_norm": 3.00066876411438, "grad_norm_var": 0.19657906255275273, "learning_rate": 0.0001, "loss": 1.3914, "loss/crossentropy": 2.3864874839782715, "loss/hidden": 1.109375, "loss/logits": 0.23875172436237335, "loss/reg": 0.004325190093368292, "step": 1058 }, { "epoch": 0.132375, "grad_norm": 3.0184719562530518, "grad_norm_var": 0.20021081345078998, "learning_rate": 0.0001, "loss": 1.1478, "loss/crossentropy": 2.5393733978271484, "loss/hidden": 0.91015625, "loss/logits": 0.1943821907043457, "loss/reg": 0.004323435481637716, "step": 1059 }, { "epoch": 0.1325, "grad_norm": 4.587968826293945, "grad_norm_var": 0.4052032312871603, "learning_rate": 0.0001, "loss": 1.2345, "loss/crossentropy": 2.2922558784484863, "loss/hidden": 1.015625, "loss/logits": 0.17564015090465546, "loss/reg": 0.00432176748290658, "step": 1060 }, { "epoch": 0.132625, "grad_norm": 8.61639404296875, "grad_norm_var": 2.5204572599607027, "learning_rate": 0.0001, "loss": 1.7566, "loss/crossentropy": 2.5616378784179688, "loss/hidden": 1.5, "loss/logits": 0.21341325342655182, "loss/reg": 0.0043197330087423325, "step": 1061 }, { "epoch": 0.13275, "grad_norm": 3.9714202880859375, "grad_norm_var": 2.486558816101914, "learning_rate": 0.0001, "loss": 1.449, "loss/crossentropy": 2.4953112602233887, "loss/hidden": 1.1875, "loss/logits": 0.21827445924282074, "loss/reg": 0.0043179914355278015, "step": 1062 }, { "epoch": 0.132875, "grad_norm": 4.598790168762207, "grad_norm_var": 2.587217087573132, "learning_rate": 0.0001, "loss": 1.46, "loss/crossentropy": 2.4032106399536133, "loss/hidden": 1.1796875, "loss/logits": 0.23717749118804932, "loss/reg": 0.004315928090363741, "step": 1063 }, { "epoch": 0.133, "grad_norm": 2.2761240005493164, "grad_norm_var": 2.6157540828571424, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.2180681228637695, "loss/hidden": 0.91015625, "loss/logits": 0.15992170572280884, "loss/reg": 0.004314035642892122, "step": 1064 }, { "epoch": 0.133125, "grad_norm": 5.624876022338867, "grad_norm_var": 2.921925131142435, "learning_rate": 0.0001, "loss": 1.5429, "loss/crossentropy": 2.552769899368286, "loss/hidden": 1.2734375, "loss/logits": 0.22636112570762634, "loss/reg": 0.004311975557357073, "step": 1065 }, { "epoch": 0.13325, "grad_norm": 2.610703945159912, "grad_norm_var": 2.9300990717021325, "learning_rate": 0.0001, "loss": 1.152, "loss/crossentropy": 2.693028450012207, "loss/hidden": 0.92578125, "loss/logits": 0.18307983875274658, "loss/reg": 0.00430967565625906, "step": 1066 }, { "epoch": 0.133375, "grad_norm": 2.063502311706543, "grad_norm_var": 3.0457713158671065, "learning_rate": 0.0001, "loss": 0.9481, "loss/crossentropy": 2.3977253437042236, "loss/hidden": 0.7890625, "loss/logits": 0.11597828567028046, "loss/reg": 0.004307459108531475, "step": 1067 }, { "epoch": 0.1335, "grad_norm": 2.2253830432891846, "grad_norm_var": 3.059929800360324, "learning_rate": 0.0001, "loss": 1.1051, "loss/crossentropy": 2.43769907951355, "loss/hidden": 0.90234375, "loss/logits": 0.1597452163696289, "loss/reg": 0.0043051764369010925, "step": 1068 }, { "epoch": 0.133625, "grad_norm": 2.5021073818206787, "grad_norm_var": 3.0986482846073766, "learning_rate": 0.0001, "loss": 1.213, "loss/crossentropy": 2.4540703296661377, "loss/hidden": 0.99609375, "loss/logits": 0.1738748550415039, "loss/reg": 0.004303151275962591, "step": 1069 }, { "epoch": 0.13375, "grad_norm": 2.6333425045013428, "grad_norm_var": 3.006911696263074, "learning_rate": 0.0001, "loss": 1.1423, "loss/crossentropy": 2.481794595718384, "loss/hidden": 0.9375, "loss/logits": 0.16177596151828766, "loss/reg": 0.00430120388045907, "step": 1070 }, { "epoch": 0.133875, "grad_norm": 4.2749247550964355, "grad_norm_var": 2.995780076937819, "learning_rate": 0.0001, "loss": 1.232, "loss/crossentropy": 2.376904249191284, "loss/hidden": 1.046875, "loss/logits": 0.1420845091342926, "loss/reg": 0.004299336113035679, "step": 1071 }, { "epoch": 0.134, "grad_norm": 3.749925374984741, "grad_norm_var": 2.8756386517730372, "learning_rate": 0.0001, "loss": 1.6449, "loss/crossentropy": 2.1668522357940674, "loss/hidden": 1.3515625, "loss/logits": 0.2503596842288971, "loss/reg": 0.004297502338886261, "step": 1072 }, { "epoch": 0.134125, "grad_norm": 2.394890069961548, "grad_norm_var": 2.873752523963793, "learning_rate": 0.0001, "loss": 1.0081, "loss/crossentropy": 2.4769225120544434, "loss/hidden": 0.8203125, "loss/logits": 0.14485791325569153, "loss/reg": 0.004295617341995239, "step": 1073 }, { "epoch": 0.13425, "grad_norm": 2.521232843399048, "grad_norm_var": 2.9286262129865914, "learning_rate": 0.0001, "loss": 1.1533, "loss/crossentropy": 2.302316665649414, "loss/hidden": 0.9375, "loss/logits": 0.1728420853614807, "loss/reg": 0.004293751437216997, "step": 1074 }, { "epoch": 0.134375, "grad_norm": 3.0982587337493896, "grad_norm_var": 2.92279106991038, "learning_rate": 0.0001, "loss": 1.0719, "loss/crossentropy": 2.5007832050323486, "loss/hidden": 0.8828125, "loss/logits": 0.14618419110774994, "loss/reg": 0.004291870631277561, "step": 1075 }, { "epoch": 0.1345, "grad_norm": 2.1802334785461426, "grad_norm_var": 2.9709529639568184, "learning_rate": 0.0001, "loss": 1.0165, "loss/crossentropy": 2.736433982849121, "loss/hidden": 0.83984375, "loss/logits": 0.1337248980998993, "loss/reg": 0.00428979704156518, "step": 1076 }, { "epoch": 0.134625, "grad_norm": 2.309565782546997, "grad_norm_var": 1.1199522794543773, "learning_rate": 0.0001, "loss": 1.1145, "loss/crossentropy": 2.277212619781494, "loss/hidden": 0.91796875, "loss/logits": 0.1536218523979187, "loss/reg": 0.004287887830287218, "step": 1077 }, { "epoch": 0.13475, "grad_norm": 2.270759105682373, "grad_norm_var": 1.0951157521634287, "learning_rate": 0.0001, "loss": 0.9921, "loss/crossentropy": 2.6023480892181396, "loss/hidden": 0.8046875, "loss/logits": 0.14457595348358154, "loss/reg": 0.00428583100438118, "step": 1078 }, { "epoch": 0.134875, "grad_norm": 2.0448989868164062, "grad_norm_var": 0.9441842031089095, "learning_rate": 0.0001, "loss": 1.0484, "loss/crossentropy": 2.619910955429077, "loss/hidden": 0.84765625, "loss/logits": 0.15788725018501282, "loss/reg": 0.0042838454246521, "step": 1079 }, { "epoch": 0.135, "grad_norm": 2.8515126705169678, "grad_norm_var": 0.9247776412201837, "learning_rate": 0.0001, "loss": 0.9959, "loss/crossentropy": 2.4871740341186523, "loss/hidden": 0.8203125, "loss/logits": 0.13277903199195862, "loss/reg": 0.004281722474843264, "step": 1080 }, { "epoch": 0.135125, "grad_norm": 2.5293588638305664, "grad_norm_var": 0.3720854176506897, "learning_rate": 0.0001, "loss": 1.0284, "loss/crossentropy": 2.8959267139434814, "loss/hidden": 0.828125, "loss/logits": 0.15750843286514282, "loss/reg": 0.004279691725969315, "step": 1081 }, { "epoch": 0.13525, "grad_norm": 2.639998197555542, "grad_norm_var": 0.37201959594675976, "learning_rate": 0.0001, "loss": 1.1434, "loss/crossentropy": 2.4725499153137207, "loss/hidden": 0.93359375, "loss/logits": 0.16706131398677826, "loss/reg": 0.0042777759954333305, "step": 1082 }, { "epoch": 0.135375, "grad_norm": 2.474238157272339, "grad_norm_var": 0.35082104566976846, "learning_rate": 0.0001, "loss": 0.9969, "loss/crossentropy": 2.4064362049102783, "loss/hidden": 0.81640625, "loss/logits": 0.13769997656345367, "loss/reg": 0.004275754559785128, "step": 1083 }, { "epoch": 0.1355, "grad_norm": 5.083343982696533, "grad_norm_var": 0.6923522790607459, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.635760545730591, "loss/hidden": 0.98828125, "loss/logits": 0.16631773114204407, "loss/reg": 0.0042738220654428005, "step": 1084 }, { "epoch": 0.135625, "grad_norm": 2.9828197956085205, "grad_norm_var": 0.6846627645265992, "learning_rate": 0.0001, "loss": 1.4219, "loss/crossentropy": 2.64700984954834, "loss/hidden": 1.171875, "loss/logits": 0.20732998847961426, "loss/reg": 0.0042719184421002865, "step": 1085 }, { "epoch": 0.13575, "grad_norm": 2.5733866691589355, "grad_norm_var": 0.6868389075343996, "learning_rate": 0.0001, "loss": 1.0162, "loss/crossentropy": 2.6855356693267822, "loss/hidden": 0.83203125, "loss/logits": 0.14148542284965515, "loss/reg": 0.004269769415259361, "step": 1086 }, { "epoch": 0.135875, "grad_norm": 3.7361340522766113, "grad_norm_var": 0.6043207840777595, "learning_rate": 0.0001, "loss": 1.6576, "loss/crossentropy": 2.38763689994812, "loss/hidden": 1.328125, "loss/logits": 0.286837637424469, "loss/reg": 0.004267562180757523, "step": 1087 }, { "epoch": 0.136, "grad_norm": 2.4554789066314697, "grad_norm_var": 0.5520046435601859, "learning_rate": 0.0001, "loss": 1.0165, "loss/crossentropy": 2.697847366333008, "loss/hidden": 0.8203125, "loss/logits": 0.15349683165550232, "loss/reg": 0.004265283700078726, "step": 1088 }, { "epoch": 0.136125, "grad_norm": 2.2833757400512695, "grad_norm_var": 0.5581976166383347, "learning_rate": 0.0001, "loss": 1.1297, "loss/crossentropy": 2.6681339740753174, "loss/hidden": 0.91796875, "loss/logits": 0.16904997825622559, "loss/reg": 0.004263162147253752, "step": 1089 }, { "epoch": 0.13625, "grad_norm": 2.1826672554016113, "grad_norm_var": 0.5757864160069344, "learning_rate": 0.0001, "loss": 1.0232, "loss/crossentropy": 2.3187673091888428, "loss/hidden": 0.84375, "loss/logits": 0.136864572763443, "loss/reg": 0.004261130001395941, "step": 1090 }, { "epoch": 0.136375, "grad_norm": 3.739326238632202, "grad_norm_var": 0.632863410677898, "learning_rate": 0.0001, "loss": 1.1344, "loss/crossentropy": 2.590574264526367, "loss/hidden": 0.9140625, "loss/logits": 0.1777951866388321, "loss/reg": 0.004259143024682999, "step": 1091 }, { "epoch": 0.1365, "grad_norm": 2.1081202030181885, "grad_norm_var": 0.6388693719171433, "learning_rate": 0.0001, "loss": 1.0255, "loss/crossentropy": 2.8422060012817383, "loss/hidden": 0.84375, "loss/logits": 0.13917985558509827, "loss/reg": 0.0042572119273245335, "step": 1092 }, { "epoch": 0.136625, "grad_norm": 1.9327036142349243, "grad_norm_var": 0.6707091951265027, "learning_rate": 0.0001, "loss": 0.9773, "loss/crossentropy": 2.6250529289245605, "loss/hidden": 0.796875, "loss/logits": 0.13788098096847534, "loss/reg": 0.004255138337612152, "step": 1093 }, { "epoch": 0.13675, "grad_norm": 2.4659841060638428, "grad_norm_var": 0.6607986154781931, "learning_rate": 0.0001, "loss": 1.159, "loss/crossentropy": 2.2569518089294434, "loss/hidden": 0.94140625, "loss/logits": 0.1750330626964569, "loss/reg": 0.004253007471561432, "step": 1094 }, { "epoch": 0.136875, "grad_norm": 2.6554629802703857, "grad_norm_var": 0.6262725765926574, "learning_rate": 0.0001, "loss": 1.3871, "loss/crossentropy": 2.323014974594116, "loss/hidden": 1.1015625, "loss/logits": 0.2430596649646759, "loss/reg": 0.004250808618962765, "step": 1095 }, { "epoch": 0.137, "grad_norm": 2.722032070159912, "grad_norm_var": 0.6263166142478738, "learning_rate": 0.0001, "loss": 1.0409, "loss/crossentropy": 2.4675052165985107, "loss/hidden": 0.86328125, "loss/logits": 0.13518026471138, "loss/reg": 0.004248757380992174, "step": 1096 }, { "epoch": 0.137125, "grad_norm": 2.510000228881836, "grad_norm_var": 0.6270005997929298, "learning_rate": 0.0001, "loss": 1.0616, "loss/crossentropy": 2.3804080486297607, "loss/hidden": 0.87109375, "loss/logits": 0.14798954129219055, "loss/reg": 0.004246733151376247, "step": 1097 }, { "epoch": 0.13725, "grad_norm": 3.058847188949585, "grad_norm_var": 0.6299195109389221, "learning_rate": 0.0001, "loss": 1.0296, "loss/crossentropy": 2.7722012996673584, "loss/hidden": 0.83984375, "loss/logits": 0.1473253071308136, "loss/reg": 0.004244515672326088, "step": 1098 }, { "epoch": 0.137375, "grad_norm": 4.170520782470703, "grad_norm_var": 0.7337604064260393, "learning_rate": 0.0001, "loss": 1.1115, "loss/crossentropy": 3.227797508239746, "loss/hidden": 0.8828125, "loss/logits": 0.18627075850963593, "loss/reg": 0.004242491442710161, "step": 1099 }, { "epoch": 0.1375, "grad_norm": 2.20058536529541, "grad_norm_var": 0.4201977001022351, "learning_rate": 0.0001, "loss": 1.0558, "loss/crossentropy": 2.337442636489868, "loss/hidden": 0.8671875, "loss/logits": 0.14616578817367554, "loss/reg": 0.004240325652062893, "step": 1100 }, { "epoch": 0.137625, "grad_norm": 2.378139019012451, "grad_norm_var": 0.42315778530047454, "learning_rate": 0.0001, "loss": 0.9849, "loss/crossentropy": 2.418541431427002, "loss/hidden": 0.82421875, "loss/logits": 0.1183251217007637, "loss/reg": 0.00423810537904501, "step": 1101 }, { "epoch": 0.13775, "grad_norm": 2.4013891220092773, "grad_norm_var": 0.42787131976948645, "learning_rate": 0.0001, "loss": 1.044, "loss/crossentropy": 2.642296552658081, "loss/hidden": 0.8359375, "loss/logits": 0.16568773984909058, "loss/reg": 0.004236077889800072, "step": 1102 }, { "epoch": 0.137875, "grad_norm": 2.395822286605835, "grad_norm_var": 0.35275757091918025, "learning_rate": 0.0001, "loss": 1.0828, "loss/crossentropy": 2.457338571548462, "loss/hidden": 0.8828125, "loss/logits": 0.15768851339817047, "loss/reg": 0.004234058782458305, "step": 1103 }, { "epoch": 0.138, "grad_norm": 2.5931308269500732, "grad_norm_var": 0.35121999529942605, "learning_rate": 0.0001, "loss": 1.2445, "loss/crossentropy": 2.370553731918335, "loss/hidden": 1.015625, "loss/logits": 0.1865496039390564, "loss/reg": 0.004232100211083889, "step": 1104 }, { "epoch": 0.138125, "grad_norm": 2.5963783264160156, "grad_norm_var": 0.3436125305875331, "learning_rate": 0.0001, "loss": 0.9801, "loss/crossentropy": 2.5301551818847656, "loss/hidden": 0.80078125, "loss/logits": 0.1369716078042984, "loss/reg": 0.004230163525789976, "step": 1105 }, { "epoch": 0.13825, "grad_norm": 2.951883316040039, "grad_norm_var": 0.3345145438296379, "learning_rate": 0.0001, "loss": 0.9827, "loss/crossentropy": 3.0806362628936768, "loss/hidden": 0.8046875, "loss/logits": 0.13572098314762115, "loss/reg": 0.0042281243950128555, "step": 1106 }, { "epoch": 0.138375, "grad_norm": 2.111954927444458, "grad_norm_var": 0.2701844296523925, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.3894336223602295, "loss/hidden": 0.984375, "loss/logits": 0.1716623604297638, "loss/reg": 0.004226126708090305, "step": 1107 }, { "epoch": 0.1385, "grad_norm": 3.0929460525512695, "grad_norm_var": 0.2690614225265311, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.3786656856536865, "loss/hidden": 1.0, "loss/logits": 0.15606652200222015, "loss/reg": 0.004224089439958334, "step": 1108 }, { "epoch": 0.138625, "grad_norm": 3.325866460800171, "grad_norm_var": 0.25900974055676873, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.668332099914551, "loss/hidden": 0.8671875, "loss/logits": 0.1874256730079651, "loss/reg": 0.00422210618853569, "step": 1109 }, { "epoch": 0.13875, "grad_norm": 3.774113178253174, "grad_norm_var": 0.32044570279647266, "learning_rate": 0.0001, "loss": 0.9994, "loss/crossentropy": 2.6192150115966797, "loss/hidden": 0.8203125, "loss/logits": 0.13685137033462524, "loss/reg": 0.004220074508339167, "step": 1110 }, { "epoch": 0.138875, "grad_norm": 3.656229019165039, "grad_norm_var": 0.362595306683378, "learning_rate": 0.0001, "loss": 1.1159, "loss/crossentropy": 2.3791658878326416, "loss/hidden": 0.91796875, "loss/logits": 0.1557137668132782, "loss/reg": 0.004218076355755329, "step": 1111 }, { "epoch": 0.139, "grad_norm": 2.500005006790161, "grad_norm_var": 0.37009339748612907, "learning_rate": 0.0001, "loss": 1.1123, "loss/crossentropy": 2.2807750701904297, "loss/hidden": 0.89453125, "loss/logits": 0.17557448148727417, "loss/reg": 0.004216110333800316, "step": 1112 }, { "epoch": 0.139125, "grad_norm": 2.6660091876983643, "grad_norm_var": 0.36438900758074033, "learning_rate": 0.0001, "loss": 1.0914, "loss/crossentropy": 2.6014909744262695, "loss/hidden": 0.89453125, "loss/logits": 0.1547611951828003, "loss/reg": 0.004214086104184389, "step": 1113 }, { "epoch": 0.13925, "grad_norm": 2.3377296924591064, "grad_norm_var": 0.37845468238227015, "learning_rate": 0.0001, "loss": 1.1139, "loss/crossentropy": 2.7464191913604736, "loss/hidden": 0.90234375, "loss/logits": 0.16947275400161743, "loss/reg": 0.004212013445794582, "step": 1114 }, { "epoch": 0.139375, "grad_norm": 2.4301981925964355, "grad_norm_var": 0.2548452172505712, "learning_rate": 0.0001, "loss": 1.072, "loss/crossentropy": 2.5051262378692627, "loss/hidden": 0.8828125, "loss/logits": 0.14710885286331177, "loss/reg": 0.0042099012061953545, "step": 1115 }, { "epoch": 0.1395, "grad_norm": 2.815537452697754, "grad_norm_var": 0.23644342440034408, "learning_rate": 0.0001, "loss": 1.1469, "loss/crossentropy": 2.6059226989746094, "loss/hidden": 0.92578125, "loss/logits": 0.17900194227695465, "loss/reg": 0.0042078145779669285, "step": 1116 }, { "epoch": 0.139625, "grad_norm": 2.9746217727661133, "grad_norm_var": 0.22897005663627562, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.892439842224121, "loss/hidden": 0.91015625, "loss/logits": 0.15726403892040253, "loss/reg": 0.004205791745334864, "step": 1117 }, { "epoch": 0.13975, "grad_norm": 2.1311089992523193, "grad_norm_var": 0.24750381735717988, "learning_rate": 0.0001, "loss": 1.0609, "loss/crossentropy": 2.477149248123169, "loss/hidden": 0.8671875, "loss/logits": 0.15162619948387146, "loss/reg": 0.004203863441944122, "step": 1118 }, { "epoch": 0.139875, "grad_norm": 2.1927154064178467, "grad_norm_var": 0.2602719277895896, "learning_rate": 0.0001, "loss": 1.0056, "loss/crossentropy": 2.3840315341949463, "loss/hidden": 0.8046875, "loss/logits": 0.15886801481246948, "loss/reg": 0.004202014300972223, "step": 1119 }, { "epoch": 0.14, "grad_norm": 2.850994348526001, "grad_norm_var": 0.25871108381456814, "learning_rate": 0.0001, "loss": 1.2744, "loss/crossentropy": 2.935702085494995, "loss/hidden": 1.0390625, "loss/logits": 0.1933366060256958, "loss/reg": 0.004200007766485214, "step": 1120 }, { "epoch": 0.140125, "grad_norm": 2.7692806720733643, "grad_norm_var": 0.2564497076881023, "learning_rate": 0.0001, "loss": 1.1429, "loss/crossentropy": 2.305704355239868, "loss/hidden": 0.89453125, "loss/logits": 0.20640595257282257, "loss/reg": 0.0041981167159974575, "step": 1121 }, { "epoch": 0.14025, "grad_norm": 2.4898860454559326, "grad_norm_var": 0.2595914437365072, "learning_rate": 0.0001, "loss": 0.9756, "loss/crossentropy": 2.6060853004455566, "loss/hidden": 0.7890625, "loss/logits": 0.14455264806747437, "loss/reg": 0.004196107853204012, "step": 1122 }, { "epoch": 0.140375, "grad_norm": 31.69025421142578, "grad_norm_var": 52.39364291838152, "learning_rate": 0.0001, "loss": 1.1367, "loss/crossentropy": 2.816709518432617, "loss/hidden": 0.9453125, "loss/logits": 0.14948531985282898, "loss/reg": 0.004194286651909351, "step": 1123 }, { "epoch": 0.1405, "grad_norm": 2.6551873683929443, "grad_norm_var": 52.493939083618166, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.7032201290130615, "loss/hidden": 0.88671875, "loss/logits": 0.17413891851902008, "loss/reg": 0.004192298278212547, "step": 1124 }, { "epoch": 0.140625, "grad_norm": 2.712380886077881, "grad_norm_var": 52.61994398728479, "learning_rate": 0.0001, "loss": 1.1739, "loss/crossentropy": 2.622096300125122, "loss/hidden": 0.95703125, "loss/logits": 0.17497727274894714, "loss/reg": 0.0041902982629835606, "step": 1125 }, { "epoch": 0.14075, "grad_norm": 2.355632781982422, "grad_norm_var": 52.890626023811876, "learning_rate": 0.0001, "loss": 0.9784, "loss/crossentropy": 2.3552989959716797, "loss/hidden": 0.79296875, "loss/logits": 0.143496572971344, "loss/reg": 0.004188500810414553, "step": 1126 }, { "epoch": 0.140875, "grad_norm": 2.7159671783447266, "grad_norm_var": 53.045613069983446, "learning_rate": 0.0001, "loss": 1.1407, "loss/crossentropy": 2.45786190032959, "loss/hidden": 0.93359375, "loss/logits": 0.16524501144886017, "loss/reg": 0.004186683334410191, "step": 1127 }, { "epoch": 0.141, "grad_norm": 3.830094575881958, "grad_norm_var": 52.820475932071744, "learning_rate": 0.0001, "loss": 1.0811, "loss/crossentropy": 2.1976234912872314, "loss/hidden": 0.88671875, "loss/logits": 0.1525820791721344, "loss/reg": 0.004184682387858629, "step": 1128 }, { "epoch": 0.141125, "grad_norm": 2.6621882915496826, "grad_norm_var": 52.82139900035407, "learning_rate": 0.0001, "loss": 1.0459, "loss/crossentropy": 2.539736270904541, "loss/hidden": 0.81640625, "loss/logits": 0.1876693218946457, "loss/reg": 0.004182685166597366, "step": 1129 }, { "epoch": 0.14125, "grad_norm": 2.478451728820801, "grad_norm_var": 52.78251904082665, "learning_rate": 0.0001, "loss": 1.0815, "loss/crossentropy": 2.4196884632110596, "loss/hidden": 0.88671875, "loss/logits": 0.1529858261346817, "loss/reg": 0.00418076990172267, "step": 1130 }, { "epoch": 0.141375, "grad_norm": 3.299400568008423, "grad_norm_var": 52.59163994639377, "learning_rate": 0.0001, "loss": 1.2759, "loss/crossentropy": 2.5420329570770264, "loss/hidden": 1.0625, "loss/logits": 0.17164292931556702, "loss/reg": 0.004178792238235474, "step": 1131 }, { "epoch": 0.1415, "grad_norm": 2.3637001514434814, "grad_norm_var": 52.70822859008106, "learning_rate": 0.0001, "loss": 1.1613, "loss/crossentropy": 2.6717777252197266, "loss/hidden": 0.9453125, "loss/logits": 0.17417655885219574, "loss/reg": 0.004176879767328501, "step": 1132 }, { "epoch": 0.141625, "grad_norm": 2.500570058822632, "grad_norm_var": 52.819367266798494, "learning_rate": 0.0001, "loss": 0.9633, "loss/crossentropy": 2.6206371784210205, "loss/hidden": 0.78515625, "loss/logits": 0.13640643656253815, "loss/reg": 0.004175043664872646, "step": 1133 }, { "epoch": 0.14175, "grad_norm": 2.323843479156494, "grad_norm_var": 52.76129867971668, "learning_rate": 0.0001, "loss": 1.0942, "loss/crossentropy": 2.5218851566314697, "loss/hidden": 0.8984375, "loss/logits": 0.15399572253227234, "loss/reg": 0.004173224791884422, "step": 1134 }, { "epoch": 0.141875, "grad_norm": 2.0677201747894287, "grad_norm_var": 52.80061443559793, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.580735683441162, "loss/hidden": 0.87109375, "loss/logits": 0.14630158245563507, "loss/reg": 0.0041715288534760475, "step": 1135 }, { "epoch": 0.142, "grad_norm": 2.8460326194763184, "grad_norm_var": 52.80169720296209, "learning_rate": 0.0001, "loss": 1.2675, "loss/crossentropy": 2.315237283706665, "loss/hidden": 1.0625, "loss/logits": 0.16331645846366882, "loss/reg": 0.0041695088148117065, "step": 1136 }, { "epoch": 0.142125, "grad_norm": 2.169602632522583, "grad_norm_var": 52.96135990851253, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.5278215408325195, "loss/hidden": 0.8828125, "loss/logits": 0.1388036012649536, "loss/reg": 0.004167445003986359, "step": 1137 }, { "epoch": 0.14225, "grad_norm": 2.01615047454834, "grad_norm_var": 53.099042280735006, "learning_rate": 0.0001, "loss": 0.9819, "loss/crossentropy": 2.4882190227508545, "loss/hidden": 0.8046875, "loss/logits": 0.13553820550441742, "loss/reg": 0.004165465943515301, "step": 1138 }, { "epoch": 0.142375, "grad_norm": 2.4339070320129395, "grad_norm_var": 0.2098356411642726, "learning_rate": 0.0001, "loss": 0.9859, "loss/crossentropy": 2.644569158554077, "loss/hidden": 0.80859375, "loss/logits": 0.13571619987487793, "loss/reg": 0.00416343891993165, "step": 1139 }, { "epoch": 0.1425, "grad_norm": 2.3045334815979004, "grad_norm_var": 0.2144459690924632, "learning_rate": 0.0001, "loss": 1.061, "loss/crossentropy": 2.491943597793579, "loss/hidden": 0.859375, "loss/logits": 0.16004428267478943, "loss/reg": 0.004161354620009661, "step": 1140 }, { "epoch": 0.142625, "grad_norm": 2.173408031463623, "grad_norm_var": 0.22219091176189235, "learning_rate": 0.0001, "loss": 1.0067, "loss/crossentropy": 2.374779224395752, "loss/hidden": 0.8203125, "loss/logits": 0.14481596648693085, "loss/reg": 0.004159385804086924, "step": 1141 }, { "epoch": 0.14275, "grad_norm": 3.0710461139678955, "grad_norm_var": 0.23718192859111392, "learning_rate": 0.0001, "loss": 1.0164, "loss/crossentropy": 2.6076908111572266, "loss/hidden": 0.83984375, "loss/logits": 0.13495643436908722, "loss/reg": 0.004157309886068106, "step": 1142 }, { "epoch": 0.142875, "grad_norm": 3.763521432876587, "grad_norm_var": 0.3249627427406568, "learning_rate": 0.0001, "loss": 1.1313, "loss/crossentropy": 2.3016669750213623, "loss/hidden": 0.92578125, "loss/logits": 0.16395646333694458, "loss/reg": 0.004155360162258148, "step": 1143 }, { "epoch": 0.143, "grad_norm": 8.867222785949707, "grad_norm_var": 2.70734825211357, "learning_rate": 0.0001, "loss": 1.3971, "loss/crossentropy": 2.549853563308716, "loss/hidden": 1.1171875, "loss/logits": 0.23840667307376862, "loss/reg": 0.0041534146293997765, "step": 1144 }, { "epoch": 0.143125, "grad_norm": 4.079037189483643, "grad_norm_var": 2.7767747967197427, "learning_rate": 0.0001, "loss": 1.2853, "loss/crossentropy": 2.298107862472534, "loss/hidden": 1.0859375, "loss/logits": 0.15788918733596802, "loss/reg": 0.004151403903961182, "step": 1145 }, { "epoch": 0.14325, "grad_norm": 2.5469048023223877, "grad_norm_var": 2.7718749700746383, "learning_rate": 0.0001, "loss": 1.1133, "loss/crossentropy": 2.588602066040039, "loss/hidden": 0.8984375, "loss/logits": 0.17332546412944794, "loss/reg": 0.004149466287344694, "step": 1146 }, { "epoch": 0.143375, "grad_norm": 2.2044925689697266, "grad_norm_var": 2.8106347308786437, "learning_rate": 0.0001, "loss": 0.9728, "loss/crossentropy": 2.4486637115478516, "loss/hidden": 0.79296875, "loss/logits": 0.13837072253227234, "loss/reg": 0.004147485829889774, "step": 1147 }, { "epoch": 0.1435, "grad_norm": 2.0629138946533203, "grad_norm_var": 2.841135428686917, "learning_rate": 0.0001, "loss": 1.0498, "loss/crossentropy": 2.611081123352051, "loss/hidden": 0.8515625, "loss/logits": 0.1567818820476532, "loss/reg": 0.004145504906773567, "step": 1148 }, { "epoch": 0.143625, "grad_norm": 2.686124324798584, "grad_norm_var": 2.8318111276034035, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.3890397548675537, "loss/hidden": 0.9453125, "loss/logits": 0.1597655862569809, "loss/reg": 0.004143500700592995, "step": 1149 }, { "epoch": 0.14375, "grad_norm": 2.1094777584075928, "grad_norm_var": 2.8533239929343903, "learning_rate": 0.0001, "loss": 1.1325, "loss/crossentropy": 2.349745273590088, "loss/hidden": 0.91796875, "loss/logits": 0.17312946915626526, "loss/reg": 0.0041414061561226845, "step": 1150 }, { "epoch": 0.143875, "grad_norm": 2.2789037227630615, "grad_norm_var": 2.830912674059921, "learning_rate": 0.0001, "loss": 1.0119, "loss/crossentropy": 2.482297897338867, "loss/hidden": 0.828125, "loss/logits": 0.14236654341220856, "loss/reg": 0.004139502998441458, "step": 1151 }, { "epoch": 0.144, "grad_norm": 2.3409323692321777, "grad_norm_var": 2.855599485961874, "learning_rate": 0.0001, "loss": 1.0217, "loss/crossentropy": 2.4027068614959717, "loss/hidden": 0.8359375, "loss/logits": 0.1443997174501419, "loss/reg": 0.004137733485549688, "step": 1152 }, { "epoch": 0.144125, "grad_norm": 3.2646842002868652, "grad_norm_var": 2.8174411429914015, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.5013201236724854, "loss/hidden": 0.91015625, "loss/logits": 0.15787330269813538, "loss/reg": 0.004135794471949339, "step": 1153 }, { "epoch": 0.14425, "grad_norm": 3.01409912109375, "grad_norm_var": 2.747083786295129, "learning_rate": 0.0001, "loss": 1.4099, "loss/crossentropy": 2.341641426086426, "loss/hidden": 1.171875, "loss/logits": 0.1966913640499115, "loss/reg": 0.004133842419832945, "step": 1154 }, { "epoch": 0.144375, "grad_norm": 2.1998982429504395, "grad_norm_var": 2.770511502568856, "learning_rate": 0.0001, "loss": 1.027, "loss/crossentropy": 2.67824649810791, "loss/hidden": 0.8359375, "loss/logits": 0.1497696489095688, "loss/reg": 0.004131934605538845, "step": 1155 }, { "epoch": 0.1445, "grad_norm": 2.5142602920532227, "grad_norm_var": 2.7521224578865087, "learning_rate": 0.0001, "loss": 1.1456, "loss/crossentropy": 2.4931130409240723, "loss/hidden": 0.921875, "loss/logits": 0.18238465487957, "loss/reg": 0.004129941575229168, "step": 1156 }, { "epoch": 0.144625, "grad_norm": 2.1878349781036377, "grad_norm_var": 2.750403944498737, "learning_rate": 0.0001, "loss": 1.0802, "loss/crossentropy": 2.387873888015747, "loss/hidden": 0.890625, "loss/logits": 0.1483183354139328, "loss/reg": 0.004127953667193651, "step": 1157 }, { "epoch": 0.14475, "grad_norm": 2.688089609146118, "grad_norm_var": 2.759744220974267, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 2.6154494285583496, "loss/hidden": 0.85546875, "loss/logits": 0.13863371312618256, "loss/reg": 0.004126036539673805, "step": 1158 }, { "epoch": 0.144875, "grad_norm": 3.1651244163513184, "grad_norm_var": 2.7252368192156586, "learning_rate": 0.0001, "loss": 1.2689, "loss/crossentropy": 2.3369271755218506, "loss/hidden": 1.0234375, "loss/logits": 0.20426858961582184, "loss/reg": 0.004124056547880173, "step": 1159 }, { "epoch": 0.145, "grad_norm": 2.437448740005493, "grad_norm_var": 0.290374675783868, "learning_rate": 0.0001, "loss": 1.1068, "loss/crossentropy": 2.446481227874756, "loss/hidden": 0.91015625, "loss/logits": 0.15547212958335876, "loss/reg": 0.0041221086867153645, "step": 1160 }, { "epoch": 0.145125, "grad_norm": 3.550361394882202, "grad_norm_var": 0.20437982896584472, "learning_rate": 0.0001, "loss": 1.6847, "loss/crossentropy": 2.6945204734802246, "loss/hidden": 1.28125, "loss/logits": 0.36227577924728394, "loss/reg": 0.004120130091905594, "step": 1161 }, { "epoch": 0.14525, "grad_norm": 2.150529384613037, "grad_norm_var": 0.21585453142654387, "learning_rate": 0.0001, "loss": 1.0391, "loss/crossentropy": 2.3554482460021973, "loss/hidden": 0.84765625, "loss/logits": 0.15022103488445282, "loss/reg": 0.0041182260029017925, "step": 1162 }, { "epoch": 0.145375, "grad_norm": 4.6839704513549805, "grad_norm_var": 0.48472891056564626, "learning_rate": 0.0001, "loss": 1.5962, "loss/crossentropy": 2.456437587738037, "loss/hidden": 1.2890625, "loss/logits": 0.2660132944583893, "loss/reg": 0.004116271156817675, "step": 1163 }, { "epoch": 0.1455, "grad_norm": 2.3992183208465576, "grad_norm_var": 0.4628530155911977, "learning_rate": 0.0001, "loss": 1.0986, "loss/crossentropy": 2.535423755645752, "loss/hidden": 0.9140625, "loss/logits": 0.14340469241142273, "loss/reg": 0.0041144127026200294, "step": 1164 }, { "epoch": 0.145625, "grad_norm": 2.455538034439087, "grad_norm_var": 0.4675077175097224, "learning_rate": 0.0001, "loss": 1.1422, "loss/crossentropy": 2.5659542083740234, "loss/hidden": 0.9296875, "loss/logits": 0.17135412991046906, "loss/reg": 0.004112581722438335, "step": 1165 }, { "epoch": 0.14575, "grad_norm": 3.7746567726135254, "grad_norm_var": 0.5063635000809102, "learning_rate": 0.0001, "loss": 1.4588, "loss/crossentropy": 2.807483196258545, "loss/hidden": 1.1953125, "loss/logits": 0.22239741683006287, "loss/reg": 0.004110958427190781, "step": 1166 }, { "epoch": 0.145875, "grad_norm": 2.2929201126098633, "grad_norm_var": 0.5053662377320952, "learning_rate": 0.0001, "loss": 1.1351, "loss/crossentropy": 2.5097239017486572, "loss/hidden": 0.921875, "loss/logits": 0.17209036648273468, "loss/reg": 0.004109338391572237, "step": 1167 }, { "epoch": 0.146, "grad_norm": 4.034673690795898, "grad_norm_var": 0.5764809506272021, "learning_rate": 0.0001, "loss": 1.4816, "loss/crossentropy": 2.6598453521728516, "loss/hidden": 1.1953125, "loss/logits": 0.24519138038158417, "loss/reg": 0.0041076927445828915, "step": 1168 }, { "epoch": 0.146125, "grad_norm": 2.2866017818450928, "grad_norm_var": 0.5920811915580522, "learning_rate": 0.0001, "loss": 1.177, "loss/crossentropy": 2.3927576541900635, "loss/hidden": 0.953125, "loss/logits": 0.18283367156982422, "loss/reg": 0.004106137901544571, "step": 1169 }, { "epoch": 0.14625, "grad_norm": 2.899941921234131, "grad_norm_var": 0.5906217092668515, "learning_rate": 0.0001, "loss": 1.1387, "loss/crossentropy": 2.4218740463256836, "loss/hidden": 0.92578125, "loss/logits": 0.17189282178878784, "loss/reg": 0.004104320891201496, "step": 1170 }, { "epoch": 0.146375, "grad_norm": 2.9730427265167236, "grad_norm_var": 0.5601848624373048, "learning_rate": 0.0001, "loss": 1.0993, "loss/crossentropy": 2.669074296951294, "loss/hidden": 0.90234375, "loss/logits": 0.15591827034950256, "loss/reg": 0.004102461040019989, "step": 1171 }, { "epoch": 0.1465, "grad_norm": 3.55232572555542, "grad_norm_var": 0.5733288711493515, "learning_rate": 0.0001, "loss": 1.2912, "loss/crossentropy": 2.384329319000244, "loss/hidden": 1.0234375, "loss/logits": 0.22678744792938232, "loss/reg": 0.004100624471902847, "step": 1172 }, { "epoch": 0.146625, "grad_norm": 2.591209650039673, "grad_norm_var": 0.541389636484242, "learning_rate": 0.0001, "loss": 1.1135, "loss/crossentropy": 2.367767810821533, "loss/hidden": 0.9375, "loss/logits": 0.13497930765151978, "loss/reg": 0.004098633769899607, "step": 1173 }, { "epoch": 0.14675, "grad_norm": 2.9488017559051514, "grad_norm_var": 0.534935103556153, "learning_rate": 0.0001, "loss": 1.4669, "loss/crossentropy": 1.9882688522338867, "loss/hidden": 1.25, "loss/logits": 0.17595870792865753, "loss/reg": 0.004096675664186478, "step": 1174 }, { "epoch": 0.146875, "grad_norm": 2.837010145187378, "grad_norm_var": 0.5349767501482856, "learning_rate": 0.0001, "loss": 1.1548, "loss/crossentropy": 2.8100757598876953, "loss/hidden": 0.97265625, "loss/logits": 0.14116618037223816, "loss/reg": 0.004094698466360569, "step": 1175 }, { "epoch": 0.147, "grad_norm": 2.5989038944244385, "grad_norm_var": 0.5246730089916675, "learning_rate": 0.0001, "loss": 1.2746, "loss/crossentropy": 2.367872953414917, "loss/hidden": 1.0703125, "loss/logits": 0.16331195831298828, "loss/reg": 0.004092712886631489, "step": 1176 }, { "epoch": 0.147125, "grad_norm": 3.459014892578125, "grad_norm_var": 0.5185139879820743, "learning_rate": 0.0001, "loss": 1.1223, "loss/crossentropy": 2.533998727798462, "loss/hidden": 0.92578125, "loss/logits": 0.15561142563819885, "loss/reg": 0.004090711008757353, "step": 1177 }, { "epoch": 0.14725, "grad_norm": 5.084310054779053, "grad_norm_var": 0.7256747423481064, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.582167863845825, "loss/hidden": 1.0625, "loss/logits": 0.1748521625995636, "loss/reg": 0.004088713321834803, "step": 1178 }, { "epoch": 0.147375, "grad_norm": 2.778517961502075, "grad_norm_var": 0.5703725263929961, "learning_rate": 0.0001, "loss": 1.0471, "loss/crossentropy": 2.336113929748535, "loss/hidden": 0.8515625, "loss/logits": 0.15465718507766724, "loss/reg": 0.004086779430508614, "step": 1179 }, { "epoch": 0.1475, "grad_norm": 2.8686232566833496, "grad_norm_var": 0.5427611216294432, "learning_rate": 0.0001, "loss": 1.1435, "loss/crossentropy": 2.776197910308838, "loss/hidden": 0.9453125, "loss/logits": 0.1573391556739807, "loss/reg": 0.0040848455391824245, "step": 1180 }, { "epoch": 0.147625, "grad_norm": 2.511221170425415, "grad_norm_var": 0.5382462121749844, "learning_rate": 0.0001, "loss": 1.0821, "loss/crossentropy": 2.7198173999786377, "loss/hidden": 0.87890625, "loss/logits": 0.16234460473060608, "loss/reg": 0.004082926083356142, "step": 1181 }, { "epoch": 0.14775, "grad_norm": 2.1064679622650146, "grad_norm_var": 0.5606094401847085, "learning_rate": 0.0001, "loss": 0.98, "loss/crossentropy": 2.6126627922058105, "loss/hidden": 0.8046875, "loss/logits": 0.1345212161540985, "loss/reg": 0.004080874379724264, "step": 1182 }, { "epoch": 0.147875, "grad_norm": 4.242970943450928, "grad_norm_var": 0.6172993082607185, "learning_rate": 0.0001, "loss": 1.2843, "loss/crossentropy": 2.121168375015259, "loss/hidden": 1.0703125, "loss/logits": 0.1731598824262619, "loss/reg": 0.0040789819322526455, "step": 1183 }, { "epoch": 0.148, "grad_norm": 2.6352462768554688, "grad_norm_var": 0.5673230040929977, "learning_rate": 0.0001, "loss": 1.0252, "loss/crossentropy": 2.692413091659546, "loss/hidden": 0.84375, "loss/logits": 0.14067476987838745, "loss/reg": 0.004077126272022724, "step": 1184 }, { "epoch": 0.148125, "grad_norm": 2.905735731124878, "grad_norm_var": 0.5304583396362827, "learning_rate": 0.0001, "loss": 1.2004, "loss/crossentropy": 2.290217399597168, "loss/hidden": 0.98046875, "loss/logits": 0.17918136715888977, "loss/reg": 0.004075322765856981, "step": 1185 }, { "epoch": 0.14825, "grad_norm": 2.0793650150299072, "grad_norm_var": 0.5902824998400082, "learning_rate": 0.0001, "loss": 0.949, "loss/crossentropy": 2.605687379837036, "loss/hidden": 0.77734375, "loss/logits": 0.1309652477502823, "loss/reg": 0.00407352764159441, "step": 1186 }, { "epoch": 0.148375, "grad_norm": 2.730095624923706, "grad_norm_var": 0.5951944585982081, "learning_rate": 0.0001, "loss": 1.018, "loss/crossentropy": 2.399637222290039, "loss/hidden": 0.84375, "loss/logits": 0.13348934054374695, "loss/reg": 0.004071622621268034, "step": 1187 }, { "epoch": 0.1485, "grad_norm": 2.3118958473205566, "grad_norm_var": 0.5992861461620653, "learning_rate": 0.0001, "loss": 0.969, "loss/crossentropy": 2.583070993423462, "loss/hidden": 0.7890625, "loss/logits": 0.13926547765731812, "loss/reg": 0.004069886170327663, "step": 1188 }, { "epoch": 0.148625, "grad_norm": 2.281296491622925, "grad_norm_var": 0.6187961724202968, "learning_rate": 0.0001, "loss": 1.1247, "loss/crossentropy": 2.4077699184417725, "loss/hidden": 0.91796875, "loss/logits": 0.16607880592346191, "loss/reg": 0.00406790804117918, "step": 1189 }, { "epoch": 0.14875, "grad_norm": 3.76084041595459, "grad_norm_var": 0.6654318302540614, "learning_rate": 0.0001, "loss": 1.3819, "loss/crossentropy": 2.7298569679260254, "loss/hidden": 1.0546875, "loss/logits": 0.28656214475631714, "loss/reg": 0.004065926186740398, "step": 1190 }, { "epoch": 0.148875, "grad_norm": 3.076002597808838, "grad_norm_var": 0.6654180683387788, "learning_rate": 0.0001, "loss": 1.605, "loss/crossentropy": 1.9846049547195435, "loss/hidden": 1.3125, "loss/logits": 0.2518633008003235, "loss/reg": 0.004063920117914677, "step": 1191 }, { "epoch": 0.149, "grad_norm": 2.316555976867676, "grad_norm_var": 0.6841604530042008, "learning_rate": 0.0001, "loss": 1.0979, "loss/crossentropy": 2.3328921794891357, "loss/hidden": 0.89453125, "loss/logits": 0.16276977956295013, "loss/reg": 0.004061955027282238, "step": 1192 }, { "epoch": 0.149125, "grad_norm": 2.814150810241699, "grad_norm_var": 0.6661064219784556, "learning_rate": 0.0001, "loss": 1.203, "loss/crossentropy": 2.3084676265716553, "loss/hidden": 0.984375, "loss/logits": 0.17800584435462952, "loss/reg": 0.004059869330376387, "step": 1193 }, { "epoch": 0.14925, "grad_norm": 4.5728559494018555, "grad_norm_var": 0.5339391843004021, "learning_rate": 0.0001, "loss": 1.235, "loss/crossentropy": 2.786238431930542, "loss/hidden": 1.0078125, "loss/logits": 0.18664950132369995, "loss/reg": 0.004058040212839842, "step": 1194 }, { "epoch": 0.149375, "grad_norm": 2.267996311187744, "grad_norm_var": 0.556761488955063, "learning_rate": 0.0001, "loss": 1.1708, "loss/crossentropy": 2.1988203525543213, "loss/hidden": 0.96484375, "loss/logits": 0.1653607189655304, "loss/reg": 0.004056154750287533, "step": 1195 }, { "epoch": 0.1495, "grad_norm": 2.1123440265655518, "grad_norm_var": 0.5898830056876873, "learning_rate": 0.0001, "loss": 0.9765, "loss/crossentropy": 2.301168441772461, "loss/hidden": 0.80859375, "loss/logits": 0.12740209698677063, "loss/reg": 0.004054322373121977, "step": 1196 }, { "epoch": 0.149625, "grad_norm": 3.071465253829956, "grad_norm_var": 0.5882785049222033, "learning_rate": 0.0001, "loss": 1.2782, "loss/crossentropy": 2.6485557556152344, "loss/hidden": 1.0234375, "loss/logits": 0.21422387659549713, "loss/reg": 0.004052514210343361, "step": 1197 }, { "epoch": 0.14975, "grad_norm": 2.4168970584869385, "grad_norm_var": 0.5643403352790185, "learning_rate": 0.0001, "loss": 1.0625, "loss/crossentropy": 2.5202739238739014, "loss/hidden": 0.87109375, "loss/logits": 0.15086334943771362, "loss/reg": 0.004050557967275381, "step": 1198 }, { "epoch": 0.149875, "grad_norm": 2.1462841033935547, "grad_norm_var": 0.44960492320894496, "learning_rate": 0.0001, "loss": 1.0988, "loss/crossentropy": 2.435528039932251, "loss/hidden": 0.90234375, "loss/logits": 0.1560034155845642, "loss/reg": 0.004048600792884827, "step": 1199 }, { "epoch": 0.15, "grad_norm": 2.1528284549713135, "grad_norm_var": 0.46951760615473387, "learning_rate": 0.0001, "loss": 1.3208, "loss/crossentropy": 2.4528579711914062, "loss/hidden": 1.078125, "loss/logits": 0.2022087574005127, "loss/reg": 0.0040466394275426865, "step": 1200 }, { "epoch": 0.150125, "grad_norm": 2.827105760574341, "grad_norm_var": 0.46762692410470286, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.634087562561035, "loss/hidden": 0.88671875, "loss/logits": 0.17087715864181519, "loss/reg": 0.004044875968247652, "step": 1201 }, { "epoch": 0.15025, "grad_norm": 2.268160104751587, "grad_norm_var": 0.45464383775398576, "learning_rate": 0.0001, "loss": 1.0992, "loss/crossentropy": 2.574035167694092, "loss/hidden": 0.89453125, "loss/logits": 0.16423508524894714, "loss/reg": 0.004043125547468662, "step": 1202 }, { "epoch": 0.150375, "grad_norm": 2.10304594039917, "grad_norm_var": 0.4763194687664772, "learning_rate": 0.0001, "loss": 1.0253, "loss/crossentropy": 2.686739444732666, "loss/hidden": 0.83984375, "loss/logits": 0.14499551057815552, "loss/reg": 0.004041461274027824, "step": 1203 }, { "epoch": 0.1505, "grad_norm": 2.8614940643310547, "grad_norm_var": 0.46996517485336897, "learning_rate": 0.0001, "loss": 1.3262, "loss/crossentropy": 2.327242136001587, "loss/hidden": 1.09375, "loss/logits": 0.1920863389968872, "loss/reg": 0.004039805382490158, "step": 1204 }, { "epoch": 0.150625, "grad_norm": 2.0004308223724365, "grad_norm_var": 0.4902227797061412, "learning_rate": 0.0001, "loss": 0.9759, "loss/crossentropy": 2.55458664894104, "loss/hidden": 0.80078125, "loss/logits": 0.13473068177700043, "loss/reg": 0.00403786962851882, "step": 1205 }, { "epoch": 0.15075, "grad_norm": 2.0800395011901855, "grad_norm_var": 0.42300499990140544, "learning_rate": 0.0001, "loss": 1.0823, "loss/crossentropy": 2.2755930423736572, "loss/hidden": 0.89453125, "loss/logits": 0.14743617177009583, "loss/reg": 0.00403629383072257, "step": 1206 }, { "epoch": 0.150875, "grad_norm": 2.8714046478271484, "grad_norm_var": 0.41176251270089, "learning_rate": 0.0001, "loss": 1.0406, "loss/crossentropy": 2.389319896697998, "loss/hidden": 0.83984375, "loss/logits": 0.16044044494628906, "loss/reg": 0.004034355282783508, "step": 1207 }, { "epoch": 0.151, "grad_norm": 2.136133909225464, "grad_norm_var": 0.41953769445076433, "learning_rate": 0.0001, "loss": 1.0709, "loss/crossentropy": 2.745492458343506, "loss/hidden": 0.8828125, "loss/logits": 0.14779764413833618, "loss/reg": 0.004032687284052372, "step": 1208 }, { "epoch": 0.151125, "grad_norm": 2.6204607486724854, "grad_norm_var": 0.4149034970549467, "learning_rate": 0.0001, "loss": 1.0942, "loss/crossentropy": 2.473696708679199, "loss/hidden": 0.91015625, "loss/logits": 0.14370602369308472, "loss/reg": 0.004030975513160229, "step": 1209 }, { "epoch": 0.15125, "grad_norm": 2.2523136138916016, "grad_norm_var": 0.11994939680660437, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.7228078842163086, "loss/hidden": 0.9296875, "loss/logits": 0.16212627291679382, "loss/reg": 0.004029178526252508, "step": 1210 }, { "epoch": 0.151375, "grad_norm": 3.1284432411193848, "grad_norm_var": 0.15259538885302745, "learning_rate": 0.0001, "loss": 1.0887, "loss/crossentropy": 2.81272029876709, "loss/hidden": 0.88671875, "loss/logits": 0.16168195009231567, "loss/reg": 0.004027185495942831, "step": 1211 }, { "epoch": 0.1515, "grad_norm": 2.160048246383667, "grad_norm_var": 0.15065002461184704, "learning_rate": 0.0001, "loss": 0.9783, "loss/crossentropy": 2.311624526977539, "loss/hidden": 0.8046875, "loss/logits": 0.13339203596115112, "loss/reg": 0.0040252963081002235, "step": 1212 }, { "epoch": 0.151625, "grad_norm": 2.9693639278411865, "grad_norm_var": 0.14275322843417157, "learning_rate": 0.0001, "loss": 0.9642, "loss/crossentropy": 2.664057970046997, "loss/hidden": 0.7890625, "loss/logits": 0.13490962982177734, "loss/reg": 0.0040232837200164795, "step": 1213 }, { "epoch": 0.15175, "grad_norm": 3.021174669265747, "grad_norm_var": 0.16394313365960494, "learning_rate": 0.0001, "loss": 1.2676, "loss/crossentropy": 2.6081109046936035, "loss/hidden": 1.046875, "loss/logits": 0.18048575520515442, "loss/reg": 0.004021205008029938, "step": 1214 }, { "epoch": 0.151875, "grad_norm": 2.3486392498016357, "grad_norm_var": 0.15763551716868943, "learning_rate": 0.0001, "loss": 0.9701, "loss/crossentropy": 2.4917783737182617, "loss/hidden": 0.79296875, "loss/logits": 0.13695400953292847, "loss/reg": 0.004019314423203468, "step": 1215 }, { "epoch": 0.152, "grad_norm": 2.184790849685669, "grad_norm_var": 0.15627282346626145, "learning_rate": 0.0001, "loss": 1.0597, "loss/crossentropy": 2.4108786582946777, "loss/hidden": 0.86328125, "loss/logits": 0.15620392560958862, "loss/reg": 0.004017516039311886, "step": 1216 }, { "epoch": 0.152125, "grad_norm": 2.46875262260437, "grad_norm_var": 0.1481710731830276, "learning_rate": 0.0001, "loss": 1.067, "loss/crossentropy": 2.67449688911438, "loss/hidden": 0.86328125, "loss/logits": 0.1635635495185852, "loss/reg": 0.004015681799501181, "step": 1217 }, { "epoch": 0.15225, "grad_norm": 2.1573519706726074, "grad_norm_var": 0.15187870918378293, "learning_rate": 0.0001, "loss": 1.1704, "loss/crossentropy": 2.501345634460449, "loss/hidden": 0.94140625, "loss/logits": 0.18887701630592346, "loss/reg": 0.0040138899348676205, "step": 1218 }, { "epoch": 0.152375, "grad_norm": 2.381070137023926, "grad_norm_var": 0.14346854325687347, "learning_rate": 0.0001, "loss": 1.0696, "loss/crossentropy": 2.485503673553467, "loss/hidden": 0.859375, "loss/logits": 0.1700785756111145, "loss/reg": 0.004012054763734341, "step": 1219 }, { "epoch": 0.1525, "grad_norm": 2.212719678878784, "grad_norm_var": 0.13656890921584572, "learning_rate": 0.0001, "loss": 1.0381, "loss/crossentropy": 2.6198275089263916, "loss/hidden": 0.84765625, "loss/logits": 0.15031346678733826, "loss/reg": 0.004010040778666735, "step": 1220 }, { "epoch": 0.152625, "grad_norm": 2.1536829471588135, "grad_norm_var": 0.12911465723150664, "learning_rate": 0.0001, "loss": 0.9917, "loss/crossentropy": 2.542593240737915, "loss/hidden": 0.8203125, "loss/logits": 0.13133659958839417, "loss/reg": 0.0040080067701637745, "step": 1221 }, { "epoch": 0.15275, "grad_norm": 2.5055301189422607, "grad_norm_var": 0.11963125742360768, "learning_rate": 0.0001, "loss": 1.1078, "loss/crossentropy": 2.522934675216675, "loss/hidden": 0.8671875, "loss/logits": 0.20054848492145538, "loss/reg": 0.004005954600870609, "step": 1222 }, { "epoch": 0.152875, "grad_norm": 2.0575757026672363, "grad_norm_var": 0.1178213242465496, "learning_rate": 0.0001, "loss": 1.0593, "loss/crossentropy": 2.6986868381500244, "loss/hidden": 0.875, "loss/logits": 0.14429835975170135, "loss/reg": 0.004004053305834532, "step": 1223 }, { "epoch": 0.153, "grad_norm": 6.1870503425598145, "grad_norm_var": 0.9888346629412268, "learning_rate": 0.0001, "loss": 1.1983, "loss/crossentropy": 2.4771060943603516, "loss/hidden": 0.99609375, "loss/logits": 0.1621606945991516, "loss/reg": 0.004002041183412075, "step": 1224 }, { "epoch": 0.153125, "grad_norm": 2.094531774520874, "grad_norm_var": 1.009986051026931, "learning_rate": 0.0001, "loss": 1.0328, "loss/crossentropy": 2.4067788124084473, "loss/hidden": 0.8515625, "loss/logits": 0.141241192817688, "loss/reg": 0.004000107757747173, "step": 1225 }, { "epoch": 0.15325, "grad_norm": 2.305340528488159, "grad_norm_var": 1.007401731577304, "learning_rate": 0.0001, "loss": 1.1055, "loss/crossentropy": 2.624967336654663, "loss/hidden": 0.91796875, "loss/logits": 0.1475597620010376, "loss/reg": 0.003998105879873037, "step": 1226 }, { "epoch": 0.153375, "grad_norm": 9.206901550292969, "grad_norm_var": 3.7076283352537036, "learning_rate": 0.0001, "loss": 1.7336, "loss/crossentropy": 2.340308666229248, "loss/hidden": 1.265625, "loss/logits": 0.4280090034008026, "loss/reg": 0.003996132407337427, "step": 1227 }, { "epoch": 0.1535, "grad_norm": 7.1393914222717285, "grad_norm_var": 4.682389594648043, "learning_rate": 0.0001, "loss": 2.1514, "loss/crossentropy": 2.1572506427764893, "loss/hidden": 1.7578125, "loss/logits": 0.3536328077316284, "loss/reg": 0.0039941999129951, "step": 1228 }, { "epoch": 0.153625, "grad_norm": 2.1037001609802246, "grad_norm_var": 4.771672156590602, "learning_rate": 0.0001, "loss": 0.9941, "loss/crossentropy": 2.7013680934906006, "loss/hidden": 0.80859375, "loss/logits": 0.14557519555091858, "loss/reg": 0.003992319572716951, "step": 1229 }, { "epoch": 0.15375, "grad_norm": 2.157334804534912, "grad_norm_var": 4.84846901790952, "learning_rate": 0.0001, "loss": 1.0878, "loss/crossentropy": 2.556549310684204, "loss/hidden": 0.88671875, "loss/logits": 0.16115835309028625, "loss/reg": 0.003990530967712402, "step": 1230 }, { "epoch": 0.153875, "grad_norm": 2.0853333473205566, "grad_norm_var": 4.883710165437185, "learning_rate": 0.0001, "loss": 1.0472, "loss/crossentropy": 2.114297866821289, "loss/hidden": 0.86328125, "loss/logits": 0.14400479197502136, "loss/reg": 0.003988809883594513, "step": 1231 }, { "epoch": 0.154, "grad_norm": 9.08906364440918, "grad_norm_var": 6.916882811324148, "learning_rate": 0.0001, "loss": 1.2453, "loss/crossentropy": 2.644493818283081, "loss/hidden": 1.046875, "loss/logits": 0.15859541296958923, "loss/reg": 0.003986929077655077, "step": 1232 }, { "epoch": 0.154125, "grad_norm": 2.883406639099121, "grad_norm_var": 6.862648195671316, "learning_rate": 0.0001, "loss": 1.4104, "loss/crossentropy": 2.49361252784729, "loss/hidden": 1.140625, "loss/logits": 0.22993981838226318, "loss/reg": 0.003985217306762934, "step": 1233 }, { "epoch": 0.15425, "grad_norm": 2.5608956813812256, "grad_norm_var": 6.7914369374581725, "learning_rate": 0.0001, "loss": 1.0612, "loss/crossentropy": 2.4652509689331055, "loss/hidden": 0.8671875, "loss/logits": 0.15421175956726074, "loss/reg": 0.003983261063694954, "step": 1234 }, { "epoch": 0.154375, "grad_norm": 2.1633126735687256, "grad_norm_var": 6.832556056171203, "learning_rate": 0.0001, "loss": 1.0324, "loss/crossentropy": 2.364274024963379, "loss/hidden": 0.8515625, "loss/logits": 0.14098191261291504, "loss/reg": 0.003981346730142832, "step": 1235 }, { "epoch": 0.1545, "grad_norm": 2.7880406379699707, "grad_norm_var": 6.740565356111725, "learning_rate": 0.0001, "loss": 1.2871, "loss/crossentropy": 2.7725203037261963, "loss/hidden": 1.0390625, "loss/logits": 0.20828330516815186, "loss/reg": 0.003979409113526344, "step": 1236 }, { "epoch": 0.154625, "grad_norm": 2.0032100677490234, "grad_norm_var": 6.773356796491393, "learning_rate": 0.0001, "loss": 1.0215, "loss/crossentropy": 2.426025390625, "loss/hidden": 0.84375, "loss/logits": 0.13792634010314941, "loss/reg": 0.003977475222200155, "step": 1237 }, { "epoch": 0.15475, "grad_norm": 2.803041696548462, "grad_norm_var": 6.731182546058613, "learning_rate": 0.0001, "loss": 1.1383, "loss/crossentropy": 2.7533788681030273, "loss/hidden": 0.93359375, "loss/logits": 0.1649492084980011, "loss/reg": 0.0039755916222929955, "step": 1238 }, { "epoch": 0.154875, "grad_norm": 2.6718337535858154, "grad_norm_var": 6.618056769993946, "learning_rate": 0.0001, "loss": 1.1889, "loss/crossentropy": 2.4473633766174316, "loss/hidden": 0.96484375, "loss/logits": 0.1843622773885727, "loss/reg": 0.003973691258579493, "step": 1239 }, { "epoch": 0.155, "grad_norm": 2.8771820068359375, "grad_norm_var": 6.233935399852195, "learning_rate": 0.0001, "loss": 1.1242, "loss/crossentropy": 2.7280538082122803, "loss/hidden": 0.9140625, "loss/logits": 0.17046083509922028, "loss/reg": 0.0039716921746730804, "step": 1240 }, { "epoch": 0.155125, "grad_norm": 2.2324299812316895, "grad_norm_var": 6.208210747435883, "learning_rate": 0.0001, "loss": 1.001, "loss/crossentropy": 2.7301042079925537, "loss/hidden": 0.82421875, "loss/logits": 0.1370982974767685, "loss/reg": 0.003969752229750156, "step": 1241 }, { "epoch": 0.15525, "grad_norm": 2.414759874343872, "grad_norm_var": 6.1905538159398015, "learning_rate": 0.0001, "loss": 1.0086, "loss/crossentropy": 2.161578893661499, "loss/hidden": 0.83203125, "loss/logits": 0.13684576749801636, "loss/reg": 0.00396784907206893, "step": 1242 }, { "epoch": 0.155375, "grad_norm": 2.278144359588623, "grad_norm_var": 3.986925647036762, "learning_rate": 0.0001, "loss": 1.0588, "loss/crossentropy": 2.1866931915283203, "loss/hidden": 0.8828125, "loss/logits": 0.13629356026649475, "loss/reg": 0.0039659528993070126, "step": 1243 }, { "epoch": 0.1555, "grad_norm": 2.3172078132629395, "grad_norm_var": 2.8692718796241876, "learning_rate": 0.0001, "loss": 1.0269, "loss/crossentropy": 2.7066776752471924, "loss/hidden": 0.84375, "loss/logits": 0.14347760379314423, "loss/reg": 0.003964039962738752, "step": 1244 }, { "epoch": 0.155625, "grad_norm": 2.5149173736572266, "grad_norm_var": 2.8395080960927275, "learning_rate": 0.0001, "loss": 1.147, "loss/crossentropy": 2.6849911212921143, "loss/hidden": 0.93359375, "loss/logits": 0.17379310727119446, "loss/reg": 0.00396218616515398, "step": 1245 }, { "epoch": 0.15575, "grad_norm": 3.1449217796325684, "grad_norm_var": 2.807281033079679, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.340134620666504, "loss/hidden": 1.09375, "loss/logits": 0.20130982995033264, "loss/reg": 0.003960458096116781, "step": 1246 }, { "epoch": 0.155875, "grad_norm": 2.4941136837005615, "grad_norm_var": 2.771865274736687, "learning_rate": 0.0001, "loss": 1.0381, "loss/crossentropy": 2.569666862487793, "loss/hidden": 0.84375, "loss/logits": 0.15474390983581543, "loss/reg": 0.003958826884627342, "step": 1247 }, { "epoch": 0.156, "grad_norm": 2.905941963195801, "grad_norm_var": 0.10203846777971345, "learning_rate": 0.0001, "loss": 1.0627, "loss/crossentropy": 2.7658021450042725, "loss/hidden": 0.87109375, "loss/logits": 0.1519913673400879, "loss/reg": 0.003957261331379414, "step": 1248 }, { "epoch": 0.156125, "grad_norm": 2.955136299133301, "grad_norm_var": 0.10539728005771849, "learning_rate": 0.0001, "loss": 1.1496, "loss/crossentropy": 2.783196449279785, "loss/hidden": 0.94140625, "loss/logits": 0.16862890124320984, "loss/reg": 0.0039556450210511684, "step": 1249 }, { "epoch": 0.15625, "grad_norm": 2.683770179748535, "grad_norm_var": 0.10618654391323404, "learning_rate": 0.0001, "loss": 1.2244, "loss/crossentropy": 2.306131362915039, "loss/hidden": 1.0, "loss/logits": 0.18487581610679626, "loss/reg": 0.0039537097327411175, "step": 1250 }, { "epoch": 0.156375, "grad_norm": 2.1440393924713135, "grad_norm_var": 0.10727540575258346, "learning_rate": 0.0001, "loss": 1.0298, "loss/crossentropy": 2.5097758769989014, "loss/hidden": 0.8515625, "loss/logits": 0.13874852657318115, "loss/reg": 0.003951748367398977, "step": 1251 }, { "epoch": 0.1565, "grad_norm": 5.823795795440674, "grad_norm_var": 0.7687695668695557, "learning_rate": 0.0001, "loss": 1.229, "loss/crossentropy": 2.7339322566986084, "loss/hidden": 1.046875, "loss/logits": 0.14258000254631042, "loss/reg": 0.003950015641748905, "step": 1252 }, { "epoch": 0.156625, "grad_norm": 3.07033109664917, "grad_norm_var": 0.7313342744887732, "learning_rate": 0.0001, "loss": 1.2339, "loss/crossentropy": 2.9465441703796387, "loss/hidden": 1.03125, "loss/logits": 0.16319361329078674, "loss/reg": 0.003948097582906485, "step": 1253 }, { "epoch": 0.15675, "grad_norm": 2.2648935317993164, "grad_norm_var": 0.7516000874171217, "learning_rate": 0.0001, "loss": 1.0369, "loss/crossentropy": 2.462608814239502, "loss/hidden": 0.83984375, "loss/logits": 0.157545804977417, "loss/reg": 0.003946339711546898, "step": 1254 }, { "epoch": 0.156875, "grad_norm": 8.53079605102539, "grad_norm_var": 2.7972635310947167, "learning_rate": 0.0001, "loss": 1.3134, "loss/crossentropy": 2.634783983230591, "loss/hidden": 1.1328125, "loss/logits": 0.1411134898662567, "loss/reg": 0.003944624215364456, "step": 1255 }, { "epoch": 0.157, "grad_norm": 2.3669466972351074, "grad_norm_var": 2.833168083556588, "learning_rate": 0.0001, "loss": 1.1473, "loss/crossentropy": 2.3000428676605225, "loss/hidden": 0.95703125, "loss/logits": 0.15085983276367188, "loss/reg": 0.003942654933780432, "step": 1256 }, { "epoch": 0.157125, "grad_norm": 3.7946250438690186, "grad_norm_var": 2.797930128567604, "learning_rate": 0.0001, "loss": 1.257, "loss/crossentropy": 2.2643165588378906, "loss/hidden": 1.0546875, "loss/logits": 0.16289997100830078, "loss/reg": 0.003940712660551071, "step": 1257 }, { "epoch": 0.15725, "grad_norm": 2.7023470401763916, "grad_norm_var": 2.7717805963911966, "learning_rate": 0.0001, "loss": 1.0415, "loss/crossentropy": 2.6158485412597656, "loss/hidden": 0.8515625, "loss/logits": 0.15053331851959229, "loss/reg": 0.003938745241612196, "step": 1258 }, { "epoch": 0.157375, "grad_norm": 2.1059186458587646, "grad_norm_var": 2.7959400050235406, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.2793667316436768, "loss/hidden": 0.828125, "loss/logits": 0.12795627117156982, "loss/reg": 0.003936750814318657, "step": 1259 }, { "epoch": 0.1575, "grad_norm": 3.712144136428833, "grad_norm_var": 2.74615990110932, "learning_rate": 0.0001, "loss": 1.5484, "loss/crossentropy": 2.348719835281372, "loss/hidden": 1.265625, "loss/logits": 0.2434225231409073, "loss/reg": 0.003934717271476984, "step": 1260 }, { "epoch": 0.157625, "grad_norm": 2.565690755844116, "grad_norm_var": 2.7408307436849646, "learning_rate": 0.0001, "loss": 0.9844, "loss/crossentropy": 2.635305404663086, "loss/hidden": 0.80078125, "loss/logits": 0.14425452053546906, "loss/reg": 0.003932674881070852, "step": 1261 }, { "epoch": 0.15775, "grad_norm": 2.1676135063171387, "grad_norm_var": 2.824524782775095, "learning_rate": 0.0001, "loss": 1.0319, "loss/crossentropy": 2.582833766937256, "loss/hidden": 0.84375, "loss/logits": 0.1488630175590515, "loss/reg": 0.003930607810616493, "step": 1262 }, { "epoch": 0.157875, "grad_norm": 2.2690494060516357, "grad_norm_var": 2.8509140700262754, "learning_rate": 0.0001, "loss": 0.984, "loss/crossentropy": 2.694347620010376, "loss/hidden": 0.8125, "loss/logits": 0.13225148618221283, "loss/reg": 0.003928603138774633, "step": 1263 }, { "epoch": 0.158, "grad_norm": 3.369201183319092, "grad_norm_var": 2.842832034310379, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.450927972793579, "loss/hidden": 0.94140625, "loss/logits": 0.1599656641483307, "loss/reg": 0.003926686476916075, "step": 1264 }, { "epoch": 0.158125, "grad_norm": 1.9869253635406494, "grad_norm_var": 2.9437333300576247, "learning_rate": 0.0001, "loss": 1.0556, "loss/crossentropy": 2.3994815349578857, "loss/hidden": 0.87890625, "loss/logits": 0.1374390572309494, "loss/reg": 0.003924795426428318, "step": 1265 }, { "epoch": 0.15825, "grad_norm": 2.7239654064178467, "grad_norm_var": 2.9409477001102284, "learning_rate": 0.0001, "loss": 1.1347, "loss/crossentropy": 2.549830913543701, "loss/hidden": 0.9453125, "loss/logits": 0.15013578534126282, "loss/reg": 0.003923265729099512, "step": 1266 }, { "epoch": 0.158375, "grad_norm": 3.253939151763916, "grad_norm_var": 2.857988200257295, "learning_rate": 0.0001, "loss": 1.5404, "loss/crossentropy": 2.297355890274048, "loss/hidden": 1.234375, "loss/logits": 0.2668222188949585, "loss/reg": 0.003921460825949907, "step": 1267 }, { "epoch": 0.1585, "grad_norm": 4.972027778625488, "grad_norm_var": 2.6160556342714547, "learning_rate": 0.0001, "loss": 1.1048, "loss/crossentropy": 2.5779943466186523, "loss/hidden": 0.8828125, "loss/logits": 0.18274636566638947, "loss/reg": 0.003919865936040878, "step": 1268 }, { "epoch": 0.158625, "grad_norm": 3.8399877548217773, "grad_norm_var": 2.635561990200084, "learning_rate": 0.0001, "loss": 1.1315, "loss/crossentropy": 2.8448750972747803, "loss/hidden": 0.9296875, "loss/logits": 0.16261914372444153, "loss/reg": 0.003917965106666088, "step": 1269 }, { "epoch": 0.15875, "grad_norm": 4.097829341888428, "grad_norm_var": 2.595225849251786, "learning_rate": 0.0001, "loss": 1.2206, "loss/crossentropy": 2.9463918209075928, "loss/hidden": 0.98828125, "loss/logits": 0.19318252801895142, "loss/reg": 0.0039165741764009, "step": 1270 }, { "epoch": 0.158875, "grad_norm": 4.908977508544922, "grad_norm_var": 0.9391465897119967, "learning_rate": 0.0001, "loss": 1.1711, "loss/crossentropy": 2.5089914798736572, "loss/hidden": 0.96875, "loss/logits": 0.16315871477127075, "loss/reg": 0.003915099892765284, "step": 1271 }, { "epoch": 0.159, "grad_norm": 2.4336774349212646, "grad_norm_var": 0.9322146223506894, "learning_rate": 0.0001, "loss": 1.0449, "loss/crossentropy": 2.3010151386260986, "loss/hidden": 0.85546875, "loss/logits": 0.15030014514923096, "loss/reg": 0.0039135850965976715, "step": 1272 }, { "epoch": 0.159125, "grad_norm": 2.9904537200927734, "grad_norm_var": 0.9068912920584419, "learning_rate": 0.0001, "loss": 1.1666, "loss/crossentropy": 2.428278923034668, "loss/hidden": 0.96484375, "loss/logits": 0.16259220242500305, "loss/reg": 0.0039116572588682175, "step": 1273 }, { "epoch": 0.15925, "grad_norm": 1.83692467212677, "grad_norm_var": 1.0031901798578553, "learning_rate": 0.0001, "loss": 0.9189, "loss/crossentropy": 2.4613776206970215, "loss/hidden": 0.76171875, "loss/logits": 0.11810196936130524, "loss/reg": 0.003909708932042122, "step": 1274 }, { "epoch": 0.159375, "grad_norm": 2.4019277095794678, "grad_norm_var": 0.9703342604960014, "learning_rate": 0.0001, "loss": 1.1094, "loss/crossentropy": 2.298656702041626, "loss/hidden": 0.9296875, "loss/logits": 0.14063113927841187, "loss/reg": 0.003907748498022556, "step": 1275 }, { "epoch": 0.1595, "grad_norm": 2.206735372543335, "grad_norm_var": 0.9882309911375784, "learning_rate": 0.0001, "loss": 1.0258, "loss/crossentropy": 2.4418487548828125, "loss/hidden": 0.84375, "loss/logits": 0.1430271565914154, "loss/reg": 0.003905918914824724, "step": 1276 }, { "epoch": 0.159625, "grad_norm": 2.794393539428711, "grad_norm_var": 0.9782088480890478, "learning_rate": 0.0001, "loss": 1.1423, "loss/crossentropy": 2.5516860485076904, "loss/hidden": 0.92578125, "loss/logits": 0.1774648129940033, "loss/reg": 0.003904127748683095, "step": 1277 }, { "epoch": 0.15975, "grad_norm": 2.9287729263305664, "grad_norm_var": 0.928333134335493, "learning_rate": 0.0001, "loss": 1.2932, "loss/crossentropy": 2.533024311065674, "loss/hidden": 1.0625, "loss/logits": 0.19170062243938446, "loss/reg": 0.0039022008422762156, "step": 1278 }, { "epoch": 0.159875, "grad_norm": 2.364227056503296, "grad_norm_var": 0.918818410696285, "learning_rate": 0.0001, "loss": 1.0528, "loss/crossentropy": 2.4888012409210205, "loss/hidden": 0.859375, "loss/logits": 0.15442782640457153, "loss/reg": 0.0039004147984087467, "step": 1279 }, { "epoch": 0.16, "grad_norm": 2.9671173095703125, "grad_norm_var": 0.9128487251694849, "learning_rate": 0.0001, "loss": 1.4056, "loss/crossentropy": 2.2953407764434814, "loss/hidden": 1.15625, "loss/logits": 0.21034780144691467, "loss/reg": 0.0038986552972346544, "step": 1280 }, { "epoch": 0.160125, "grad_norm": 2.1895735263824463, "grad_norm_var": 0.8868469140494813, "learning_rate": 0.0001, "loss": 1.1023, "loss/crossentropy": 2.418267011642456, "loss/hidden": 0.91015625, "loss/logits": 0.15315604209899902, "loss/reg": 0.0038967591244727373, "step": 1281 }, { "epoch": 0.16025, "grad_norm": 2.0041797161102295, "grad_norm_var": 0.9511806175749209, "learning_rate": 0.0001, "loss": 1.0711, "loss/crossentropy": 2.6270132064819336, "loss/hidden": 0.87890625, "loss/logits": 0.15329018235206604, "loss/reg": 0.0038950201123952866, "step": 1282 }, { "epoch": 0.160375, "grad_norm": 2.7158069610595703, "grad_norm_var": 0.9519147622693618, "learning_rate": 0.0001, "loss": 1.0333, "loss/crossentropy": 2.49900221824646, "loss/hidden": 0.84765625, "loss/logits": 0.14673107862472534, "loss/reg": 0.0038934045005589724, "step": 1283 }, { "epoch": 0.1605, "grad_norm": 2.5366370677948, "grad_norm_var": 0.6752056332094691, "learning_rate": 0.0001, "loss": 1.1833, "loss/crossentropy": 2.5442113876342773, "loss/hidden": 0.98046875, "loss/logits": 0.1639258712530136, "loss/reg": 0.003891737898811698, "step": 1284 }, { "epoch": 0.160625, "grad_norm": 3.805074453353882, "grad_norm_var": 0.6705619509398928, "learning_rate": 0.0001, "loss": 1.2758, "loss/crossentropy": 2.0399181842803955, "loss/hidden": 1.078125, "loss/logits": 0.15876121819019318, "loss/reg": 0.003889812156558037, "step": 1285 }, { "epoch": 0.16075, "grad_norm": 2.670008420944214, "grad_norm_var": 0.5554521676123362, "learning_rate": 0.0001, "loss": 1.2431, "loss/crossentropy": 2.538208484649658, "loss/hidden": 1.0234375, "loss/logits": 0.18076473474502563, "loss/reg": 0.003887931350618601, "step": 1286 }, { "epoch": 0.160875, "grad_norm": 2.1082379817962646, "grad_norm_var": 0.23374974294705825, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.4281837940216064, "loss/hidden": 0.87109375, "loss/logits": 0.16639548540115356, "loss/reg": 0.003886270336806774, "step": 1287 }, { "epoch": 0.161, "grad_norm": 2.232421398162842, "grad_norm_var": 0.23966051398124927, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.5111913681030273, "loss/hidden": 0.8203125, "loss/logits": 0.1336727738380432, "loss/reg": 0.003884353907778859, "step": 1288 }, { "epoch": 0.161125, "grad_norm": 1.8333826065063477, "grad_norm_var": 0.25492677200505887, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.4361870288848877, "loss/hidden": 0.859375, "loss/logits": 0.13674689829349518, "loss/reg": 0.0038827096577733755, "step": 1289 }, { "epoch": 0.16125, "grad_norm": 2.227004051208496, "grad_norm_var": 0.23126510746358592, "learning_rate": 0.0001, "loss": 1.0776, "loss/crossentropy": 2.1035265922546387, "loss/hidden": 0.88671875, "loss/logits": 0.1520470380783081, "loss/reg": 0.0038811014965176582, "step": 1290 }, { "epoch": 0.161375, "grad_norm": 2.4726340770721436, "grad_norm_var": 0.23066153493828262, "learning_rate": 0.0001, "loss": 1.3176, "loss/crossentropy": 2.5521106719970703, "loss/hidden": 1.078125, "loss/logits": 0.20070654153823853, "loss/reg": 0.0038791669066995382, "step": 1291 }, { "epoch": 0.1615, "grad_norm": 2.505643606185913, "grad_norm_var": 0.22441776850007666, "learning_rate": 0.0001, "loss": 0.9609, "loss/crossentropy": 2.562110662460327, "loss/hidden": 0.7890625, "loss/logits": 0.1330765336751938, "loss/reg": 0.003877209033817053, "step": 1292 }, { "epoch": 0.161625, "grad_norm": 2.2074098587036133, "grad_norm_var": 0.22464862758212098, "learning_rate": 0.0001, "loss": 1.0223, "loss/crossentropy": 2.4465317726135254, "loss/hidden": 0.8359375, "loss/logits": 0.1476120948791504, "loss/reg": 0.0038752437103539705, "step": 1293 }, { "epoch": 0.16175, "grad_norm": 4.475348949432373, "grad_norm_var": 0.4655478968178611, "learning_rate": 0.0001, "loss": 1.2869, "loss/crossentropy": 2.6663005352020264, "loss/hidden": 1.0546875, "loss/logits": 0.1934768706560135, "loss/reg": 0.0038730644155293703, "step": 1294 }, { "epoch": 0.161875, "grad_norm": 3.0794217586517334, "grad_norm_var": 0.4767340552867606, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.463402271270752, "loss/hidden": 0.96484375, "loss/logits": 0.15831388533115387, "loss/reg": 0.0038711209781467915, "step": 1295 }, { "epoch": 0.162, "grad_norm": 2.185600757598877, "grad_norm_var": 0.47945242338888056, "learning_rate": 0.0001, "loss": 1.2293, "loss/crossentropy": 2.240428924560547, "loss/hidden": 1.0078125, "loss/logits": 0.18282610177993774, "loss/reg": 0.003869203384965658, "step": 1296 }, { "epoch": 0.162125, "grad_norm": 2.6001839637756348, "grad_norm_var": 0.46872306833601746, "learning_rate": 0.0001, "loss": 1.1285, "loss/crossentropy": 2.6634023189544678, "loss/hidden": 0.9296875, "loss/logits": 0.16017360985279083, "loss/reg": 0.0038670580834150314, "step": 1297 }, { "epoch": 0.16225, "grad_norm": 2.9891393184661865, "grad_norm_var": 0.4506250664032753, "learning_rate": 0.0001, "loss": 1.2209, "loss/crossentropy": 2.2928833961486816, "loss/hidden": 0.98046875, "loss/logits": 0.2018088847398758, "loss/reg": 0.0038651188369840384, "step": 1298 }, { "epoch": 0.162375, "grad_norm": 2.5674245357513428, "grad_norm_var": 0.45100085978748794, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.604619026184082, "loss/hidden": 0.87109375, "loss/logits": 0.18927830457687378, "loss/reg": 0.003863039892166853, "step": 1299 }, { "epoch": 0.1625, "grad_norm": 5.368757247924805, "grad_norm_var": 0.9072441308021212, "learning_rate": 0.0001, "loss": 1.5845, "loss/crossentropy": 1.9008941650390625, "loss/hidden": 1.234375, "loss/logits": 0.31154388189315796, "loss/reg": 0.0038609288167208433, "step": 1300 }, { "epoch": 0.162625, "grad_norm": 2.402621030807495, "grad_norm_var": 0.8483983819636706, "learning_rate": 0.0001, "loss": 1.224, "loss/crossentropy": 2.582455635070801, "loss/hidden": 1.0078125, "loss/logits": 0.17763689160346985, "loss/reg": 0.0038588044699281454, "step": 1301 }, { "epoch": 0.16275, "grad_norm": 2.1669483184814453, "grad_norm_var": 0.8692672249500539, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.2148444652557373, "loss/hidden": 0.98828125, "loss/logits": 0.1663748174905777, "loss/reg": 0.0038566740695387125, "step": 1302 }, { "epoch": 0.162875, "grad_norm": 2.7048940658569336, "grad_norm_var": 0.8433353029278644, "learning_rate": 0.0001, "loss": 1.0917, "loss/crossentropy": 2.576660394668579, "loss/hidden": 0.89453125, "loss/logits": 0.15860411524772644, "loss/reg": 0.0038547737058252096, "step": 1303 }, { "epoch": 0.163, "grad_norm": 7.6692304611206055, "grad_norm_var": 2.314715920521659, "learning_rate": 0.0001, "loss": 1.7499, "loss/crossentropy": 2.1765997409820557, "loss/hidden": 1.5078125, "loss/logits": 0.2035428285598755, "loss/reg": 0.0038528875447809696, "step": 1304 }, { "epoch": 0.163125, "grad_norm": 2.3525166511535645, "grad_norm_var": 2.2445116172134316, "learning_rate": 0.0001, "loss": 1.0827, "loss/crossentropy": 2.7200188636779785, "loss/hidden": 0.875, "loss/logits": 0.16918785870075226, "loss/reg": 0.003850834909826517, "step": 1305 }, { "epoch": 0.16325, "grad_norm": 2.1109790802001953, "grad_norm_var": 2.259220587304002, "learning_rate": 0.0001, "loss": 0.9954, "loss/crossentropy": 2.4676554203033447, "loss/hidden": 0.81640625, "loss/logits": 0.14051809906959534, "loss/reg": 0.003848861902952194, "step": 1306 }, { "epoch": 0.163375, "grad_norm": 2.2503230571746826, "grad_norm_var": 2.281384886865043, "learning_rate": 0.0001, "loss": 1.0349, "loss/crossentropy": 2.4450364112854004, "loss/hidden": 0.8515625, "loss/logits": 0.14489130675792694, "loss/reg": 0.0038469466380774975, "step": 1307 }, { "epoch": 0.1635, "grad_norm": 3.661198139190674, "grad_norm_var": 2.272915770254127, "learning_rate": 0.0001, "loss": 1.2995, "loss/crossentropy": 3.0266950130462646, "loss/hidden": 1.0390625, "loss/logits": 0.22195252776145935, "loss/reg": 0.0038448853883892298, "step": 1308 }, { "epoch": 0.163625, "grad_norm": 2.6725175380706787, "grad_norm_var": 2.226462629702375, "learning_rate": 0.0001, "loss": 1.1629, "loss/crossentropy": 2.4850494861602783, "loss/hidden": 0.9609375, "loss/logits": 0.16352644562721252, "loss/reg": 0.003842818085104227, "step": 1309 }, { "epoch": 0.16375, "grad_norm": 2.4281601905822754, "grad_norm_var": 2.1412558591766695, "learning_rate": 0.0001, "loss": 1.0183, "loss/crossentropy": 2.629995107650757, "loss/hidden": 0.83984375, "loss/logits": 0.14006099104881287, "loss/reg": 0.0038410108536481857, "step": 1310 }, { "epoch": 0.163875, "grad_norm": 2.779705762863159, "grad_norm_var": 2.146718277972097, "learning_rate": 0.0001, "loss": 1.0696, "loss/crossentropy": 2.845191240310669, "loss/hidden": 0.87890625, "loss/logits": 0.1523093283176422, "loss/reg": 0.0038392541464418173, "step": 1311 }, { "epoch": 0.164, "grad_norm": 2.1593542098999023, "grad_norm_var": 2.1498104356164496, "learning_rate": 0.0001, "loss": 1.1257, "loss/crossentropy": 2.5214879512786865, "loss/hidden": 0.93359375, "loss/logits": 0.15370512008666992, "loss/reg": 0.0038373004645109177, "step": 1312 }, { "epoch": 0.164125, "grad_norm": 2.261361837387085, "grad_norm_var": 2.177543523879501, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.370466947555542, "loss/hidden": 0.9921875, "loss/logits": 0.16246996819972992, "loss/reg": 0.0038352562114596367, "step": 1313 }, { "epoch": 0.16425, "grad_norm": 4.186817169189453, "grad_norm_var": 2.260020426671607, "learning_rate": 0.0001, "loss": 1.3862, "loss/crossentropy": 2.2702040672302246, "loss/hidden": 1.1328125, "loss/logits": 0.21508020162582397, "loss/reg": 0.0038333996199071407, "step": 1314 }, { "epoch": 0.164375, "grad_norm": 3.48109769821167, "grad_norm_var": 2.2462280124966996, "learning_rate": 0.0001, "loss": 1.0443, "loss/crossentropy": 2.6975324153900146, "loss/hidden": 0.85546875, "loss/logits": 0.15051524341106415, "loss/reg": 0.0038314287085086107, "step": 1315 }, { "epoch": 0.1645, "grad_norm": 2.3781416416168213, "grad_norm_var": 1.9268796990894694, "learning_rate": 0.0001, "loss": 1.069, "loss/crossentropy": 2.565978527069092, "loss/hidden": 0.8671875, "loss/logits": 0.1635463535785675, "loss/reg": 0.003829606808722019, "step": 1316 }, { "epoch": 0.164625, "grad_norm": 2.532841682434082, "grad_norm_var": 1.9179299858722438, "learning_rate": 0.0001, "loss": 1.1731, "loss/crossentropy": 2.379420757293701, "loss/hidden": 0.9453125, "loss/logits": 0.18955053389072418, "loss/reg": 0.003827982349321246, "step": 1317 }, { "epoch": 0.16475, "grad_norm": 2.824406862258911, "grad_norm_var": 1.8730366601404502, "learning_rate": 0.0001, "loss": 1.2196, "loss/crossentropy": 2.745607852935791, "loss/hidden": 0.98828125, "loss/logits": 0.19303223490715027, "loss/reg": 0.003826139261946082, "step": 1318 }, { "epoch": 0.164875, "grad_norm": 2.927591562271118, "grad_norm_var": 1.866532019300668, "learning_rate": 0.0001, "loss": 1.0242, "loss/crossentropy": 2.745173215866089, "loss/hidden": 0.8515625, "loss/logits": 0.13440603017807007, "loss/reg": 0.003824233775958419, "step": 1319 }, { "epoch": 0.165, "grad_norm": 2.249950408935547, "grad_norm_var": 0.3587598970037938, "learning_rate": 0.0001, "loss": 1.1167, "loss/crossentropy": 2.182314395904541, "loss/hidden": 0.9140625, "loss/logits": 0.16442248225212097, "loss/reg": 0.0038224998861551285, "step": 1320 }, { "epoch": 0.165125, "grad_norm": 2.136213779449463, "grad_norm_var": 0.3718083111594938, "learning_rate": 0.0001, "loss": 1.2208, "loss/crossentropy": 2.3340518474578857, "loss/hidden": 0.99609375, "loss/logits": 0.18647333979606628, "loss/reg": 0.0038206097669899464, "step": 1321 }, { "epoch": 0.16525, "grad_norm": 2.3353726863861084, "grad_norm_var": 0.3576302941917286, "learning_rate": 0.0001, "loss": 1.0998, "loss/crossentropy": 2.558197498321533, "loss/hidden": 0.890625, "loss/logits": 0.17098355293273926, "loss/reg": 0.003818872617557645, "step": 1322 }, { "epoch": 0.165375, "grad_norm": 2.0035364627838135, "grad_norm_var": 0.376367123736596, "learning_rate": 0.0001, "loss": 0.9672, "loss/crossentropy": 2.0580060482025146, "loss/hidden": 0.8046875, "loss/logits": 0.12435194849967957, "loss/reg": 0.003817170625552535, "step": 1323 }, { "epoch": 0.1655, "grad_norm": 2.704374074935913, "grad_norm_var": 0.3095112579833424, "learning_rate": 0.0001, "loss": 1.2064, "loss/crossentropy": 2.056586742401123, "loss/hidden": 1.03125, "loss/logits": 0.1370464414358139, "loss/reg": 0.003815267700701952, "step": 1324 }, { "epoch": 0.165625, "grad_norm": 2.861746072769165, "grad_norm_var": 0.3128512221250444, "learning_rate": 0.0001, "loss": 1.2048, "loss/crossentropy": 2.649263381958008, "loss/hidden": 0.9921875, "loss/logits": 0.1744484305381775, "loss/reg": 0.0038135608192533255, "step": 1325 }, { "epoch": 0.16575, "grad_norm": 4.303040027618408, "grad_norm_var": 0.4794263231115619, "learning_rate": 0.0001, "loss": 1.396, "loss/crossentropy": 2.5463757514953613, "loss/hidden": 1.0859375, "loss/logits": 0.27196431159973145, "loss/reg": 0.003811680944636464, "step": 1326 }, { "epoch": 0.165875, "grad_norm": 2.6042354106903076, "grad_norm_var": 0.48083927966075424, "learning_rate": 0.0001, "loss": 1.3305, "loss/crossentropy": 2.652782917022705, "loss/hidden": 1.0625, "loss/logits": 0.2298596352338791, "loss/reg": 0.003809748450294137, "step": 1327 }, { "epoch": 0.166, "grad_norm": 2.099306344985962, "grad_norm_var": 0.4857685954885028, "learning_rate": 0.0001, "loss": 1.1101, "loss/crossentropy": 2.54836368560791, "loss/hidden": 0.89453125, "loss/logits": 0.17748317122459412, "loss/reg": 0.0038078485522419214, "step": 1328 }, { "epoch": 0.166125, "grad_norm": 5.760173320770264, "grad_norm_var": 1.026126259782783, "learning_rate": 0.0001, "loss": 1.4286, "loss/crossentropy": 2.3031833171844482, "loss/hidden": 1.0703125, "loss/logits": 0.3202149271965027, "loss/reg": 0.003806003602221608, "step": 1329 }, { "epoch": 0.16625, "grad_norm": 2.2108118534088135, "grad_norm_var": 0.9474122587329703, "learning_rate": 0.0001, "loss": 1.0457, "loss/crossentropy": 2.5074656009674072, "loss/hidden": 0.859375, "loss/logits": 0.14826124906539917, "loss/reg": 0.003804128849878907, "step": 1330 }, { "epoch": 0.166375, "grad_norm": 3.3911850452423096, "grad_norm_var": 0.9402114702613211, "learning_rate": 0.0001, "loss": 1.2478, "loss/crossentropy": 2.4859514236450195, "loss/hidden": 1.015625, "loss/logits": 0.19414550065994263, "loss/reg": 0.0038022748194634914, "step": 1331 }, { "epoch": 0.1665, "grad_norm": 2.3483588695526123, "grad_norm_var": 0.9420719086390626, "learning_rate": 0.0001, "loss": 1.1072, "loss/crossentropy": 2.7186105251312256, "loss/hidden": 0.90234375, "loss/logits": 0.16681891679763794, "loss/reg": 0.003800415899604559, "step": 1332 }, { "epoch": 0.166625, "grad_norm": 2.478461503982544, "grad_norm_var": 0.9444172935081412, "learning_rate": 0.0001, "loss": 1.2747, "loss/crossentropy": 2.414044141769409, "loss/hidden": 1.046875, "loss/logits": 0.18985730409622192, "loss/reg": 0.003798494813963771, "step": 1333 }, { "epoch": 0.16675, "grad_norm": 3.826606273651123, "grad_norm_var": 1.0067895170922097, "learning_rate": 0.0001, "loss": 1.5191, "loss/crossentropy": 2.52571964263916, "loss/hidden": 1.25, "loss/logits": 0.2311021387577057, "loss/reg": 0.0037966351956129074, "step": 1334 }, { "epoch": 0.166875, "grad_norm": 2.2956812381744385, "grad_norm_var": 1.0285842417783648, "learning_rate": 0.0001, "loss": 0.9857, "loss/crossentropy": 2.333362579345703, "loss/hidden": 0.8046875, "loss/logits": 0.14301563799381256, "loss/reg": 0.003794773481786251, "step": 1335 }, { "epoch": 0.167, "grad_norm": 2.215144634246826, "grad_norm_var": 1.0314472749301025, "learning_rate": 0.0001, "loss": 1.0197, "loss/crossentropy": 2.6435439586639404, "loss/hidden": 0.83203125, "loss/logits": 0.14977750182151794, "loss/reg": 0.003792962059378624, "step": 1336 }, { "epoch": 0.167125, "grad_norm": 2.289198875427246, "grad_norm_var": 1.0183830630566924, "learning_rate": 0.0001, "loss": 1.0596, "loss/crossentropy": 2.4188473224639893, "loss/hidden": 0.875, "loss/logits": 0.14672745764255524, "loss/reg": 0.0037910572718828917, "step": 1337 }, { "epoch": 0.16725, "grad_norm": 2.585996627807617, "grad_norm_var": 1.0048460491356954, "learning_rate": 0.0001, "loss": 1.1657, "loss/crossentropy": 2.459104537963867, "loss/hidden": 0.95703125, "loss/logits": 0.17073991894721985, "loss/reg": 0.0037892020773142576, "step": 1338 }, { "epoch": 0.167375, "grad_norm": 2.2020533084869385, "grad_norm_var": 0.9842790473450236, "learning_rate": 0.0001, "loss": 1.0632, "loss/crossentropy": 2.602936029434204, "loss/hidden": 0.875, "loss/logits": 0.1503719538450241, "loss/reg": 0.00378743140026927, "step": 1339 }, { "epoch": 0.1675, "grad_norm": 2.0642025470733643, "grad_norm_var": 1.0253976633091109, "learning_rate": 0.0001, "loss": 1.0186, "loss/crossentropy": 2.5779831409454346, "loss/hidden": 0.8359375, "loss/logits": 0.14476892352104187, "loss/reg": 0.0037853880785405636, "step": 1340 }, { "epoch": 0.167625, "grad_norm": 2.0386452674865723, "grad_norm_var": 1.0660144013342174, "learning_rate": 0.0001, "loss": 1.0255, "loss/crossentropy": 2.599996328353882, "loss/hidden": 0.8359375, "loss/logits": 0.15170088410377502, "loss/reg": 0.0037834926042705774, "step": 1341 }, { "epoch": 0.16775, "grad_norm": 2.469545841217041, "grad_norm_var": 0.9073509513912974, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 2.4470956325531006, "loss/hidden": 0.96875, "loss/logits": 0.1660883128643036, "loss/reg": 0.003781634848564863, "step": 1342 }, { "epoch": 0.167875, "grad_norm": 2.972076892852783, "grad_norm_var": 0.9120929514276962, "learning_rate": 0.0001, "loss": 1.0577, "loss/crossentropy": 2.2371230125427246, "loss/hidden": 0.87890625, "loss/logits": 0.14097647368907928, "loss/reg": 0.0037799072451889515, "step": 1343 }, { "epoch": 0.168, "grad_norm": 2.47365403175354, "grad_norm_var": 0.8907210075164879, "learning_rate": 0.0001, "loss": 1.1224, "loss/crossentropy": 2.574859857559204, "loss/hidden": 0.91015625, "loss/logits": 0.1744510978460312, "loss/reg": 0.003778197569772601, "step": 1344 }, { "epoch": 0.168125, "grad_norm": 3.3522775173187256, "grad_norm_var": 0.27908018822892944, "learning_rate": 0.0001, "loss": 1.1439, "loss/crossentropy": 2.3602190017700195, "loss/hidden": 0.94921875, "loss/logits": 0.15688063204288483, "loss/reg": 0.0037766145542263985, "step": 1345 }, { "epoch": 0.16825, "grad_norm": 2.2945148944854736, "grad_norm_var": 0.27544389245511314, "learning_rate": 0.0001, "loss": 1.0967, "loss/crossentropy": 2.0383851528167725, "loss/hidden": 0.94140625, "loss/logits": 0.11755125969648361, "loss/reg": 0.0037747540045529604, "step": 1346 }, { "epoch": 0.168375, "grad_norm": 2.3891069889068604, "grad_norm_var": 0.2299681545095474, "learning_rate": 0.0001, "loss": 1.1306, "loss/crossentropy": 2.3858978748321533, "loss/hidden": 0.91796875, "loss/logits": 0.174921452999115, "loss/reg": 0.003772968426346779, "step": 1347 }, { "epoch": 0.1685, "grad_norm": 2.070483684539795, "grad_norm_var": 0.24109670204344696, "learning_rate": 0.0001, "loss": 0.9534, "loss/crossentropy": 2.616994619369507, "loss/hidden": 0.7890625, "loss/logits": 0.12660518288612366, "loss/reg": 0.003771234769374132, "step": 1348 }, { "epoch": 0.168625, "grad_norm": 2.5373406410217285, "grad_norm_var": 0.24113562481536305, "learning_rate": 0.0001, "loss": 1.149, "loss/crossentropy": 2.5550339221954346, "loss/hidden": 0.9296875, "loss/logits": 0.18165861070156097, "loss/reg": 0.0037693637423217297, "step": 1349 }, { "epoch": 0.16875, "grad_norm": 2.4782979488372803, "grad_norm_var": 0.11712655452237944, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.424433469772339, "loss/hidden": 0.90625, "loss/logits": 0.16192512214183807, "loss/reg": 0.003767443122342229, "step": 1350 }, { "epoch": 0.168875, "grad_norm": 2.141442060470581, "grad_norm_var": 0.12118062200624896, "learning_rate": 0.0001, "loss": 1.0005, "loss/crossentropy": 2.4995741844177246, "loss/hidden": 0.828125, "loss/logits": 0.13474415242671967, "loss/reg": 0.0037655706983059645, "step": 1351 }, { "epoch": 0.169, "grad_norm": 3.0165703296661377, "grad_norm_var": 0.1404083277914895, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.753091812133789, "loss/hidden": 0.81640625, "loss/logits": 0.1566263735294342, "loss/reg": 0.0037636584602296352, "step": 1352 }, { "epoch": 0.169125, "grad_norm": 2.388427257537842, "grad_norm_var": 0.1387512034039254, "learning_rate": 0.0001, "loss": 0.9903, "loss/crossentropy": 2.3668482303619385, "loss/hidden": 0.8125, "loss/logits": 0.14017178118228912, "loss/reg": 0.0037617513444274664, "step": 1353 }, { "epoch": 0.16925, "grad_norm": 1.86200749874115, "grad_norm_var": 0.1600401535940278, "learning_rate": 0.0001, "loss": 0.9282, "loss/crossentropy": 2.6024041175842285, "loss/hidden": 0.765625, "loss/logits": 0.12501290440559387, "loss/reg": 0.003759781364351511, "step": 1354 }, { "epoch": 0.169375, "grad_norm": 2.6516194343566895, "grad_norm_var": 0.15949300228247545, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.66141939163208, "loss/hidden": 0.87109375, "loss/logits": 0.1546727865934372, "loss/reg": 0.0037579077761620283, "step": 1355 }, { "epoch": 0.1695, "grad_norm": 2.536879777908325, "grad_norm_var": 0.1491417929813607, "learning_rate": 0.0001, "loss": 1.0189, "loss/crossentropy": 2.509375810623169, "loss/hidden": 0.828125, "loss/logits": 0.1532239019870758, "loss/reg": 0.0037560255732387304, "step": 1356 }, { "epoch": 0.169625, "grad_norm": 2.869354724884033, "grad_norm_var": 0.14343589299983103, "learning_rate": 0.0001, "loss": 1.0562, "loss/crossentropy": 2.676845073699951, "loss/hidden": 0.84765625, "loss/logits": 0.1710355579853058, "loss/reg": 0.0037540122866630554, "step": 1357 }, { "epoch": 0.16975, "grad_norm": 3.0909178256988525, "grad_norm_var": 0.16243653600033944, "learning_rate": 0.0001, "loss": 0.9284, "loss/crossentropy": 2.4438745975494385, "loss/hidden": 0.76953125, "loss/logits": 0.12132885307073593, "loss/reg": 0.003752180142328143, "step": 1358 }, { "epoch": 0.169875, "grad_norm": 2.1628170013427734, "grad_norm_var": 0.16001678424908092, "learning_rate": 0.0001, "loss": 0.9578, "loss/crossentropy": 2.4705512523651123, "loss/hidden": 0.7890625, "loss/logits": 0.13126134872436523, "loss/reg": 0.0037503293715417385, "step": 1359 }, { "epoch": 0.17, "grad_norm": 2.031930923461914, "grad_norm_var": 0.17492556648024848, "learning_rate": 0.0001, "loss": 0.9667, "loss/crossentropy": 2.6681065559387207, "loss/hidden": 0.796875, "loss/logits": 0.1323142796754837, "loss/reg": 0.003748575458303094, "step": 1360 }, { "epoch": 0.170125, "grad_norm": 2.473466634750366, "grad_norm_var": 0.12240658206718862, "learning_rate": 0.0001, "loss": 1.1075, "loss/crossentropy": 2.744309902191162, "loss/hidden": 0.8984375, "loss/logits": 0.17157718539237976, "loss/reg": 0.003746669040992856, "step": 1361 }, { "epoch": 0.17025, "grad_norm": 2.9182069301605225, "grad_norm_var": 0.1348531412058311, "learning_rate": 0.0001, "loss": 0.9539, "loss/crossentropy": 2.7478702068328857, "loss/hidden": 0.77734375, "loss/logits": 0.13914981484413147, "loss/reg": 0.003744750050827861, "step": 1362 }, { "epoch": 0.170375, "grad_norm": 2.5078988075256348, "grad_norm_var": 0.13435597843808764, "learning_rate": 0.0001, "loss": 0.99, "loss/crossentropy": 2.5372326374053955, "loss/hidden": 0.8046875, "loss/logits": 0.14791148900985718, "loss/reg": 0.003742997534573078, "step": 1363 }, { "epoch": 0.1705, "grad_norm": 2.880740165710449, "grad_norm_var": 0.13075709652999348, "learning_rate": 0.0001, "loss": 1.2749, "loss/crossentropy": 2.247453451156616, "loss/hidden": 1.0546875, "loss/logits": 0.18284665048122406, "loss/reg": 0.003741198917850852, "step": 1364 }, { "epoch": 0.170625, "grad_norm": 2.802058219909668, "grad_norm_var": 0.13524607605756855, "learning_rate": 0.0001, "loss": 1.0738, "loss/crossentropy": 2.5267081260681152, "loss/hidden": 0.890625, "loss/logits": 0.1457740068435669, "loss/reg": 0.0037392526865005493, "step": 1365 }, { "epoch": 0.17075, "grad_norm": 2.8533761501312256, "grad_norm_var": 0.14041346014174008, "learning_rate": 0.0001, "loss": 1.1666, "loss/crossentropy": 2.620199680328369, "loss/hidden": 0.94921875, "loss/logits": 0.1800282597541809, "loss/reg": 0.0037372722290456295, "step": 1366 }, { "epoch": 0.170875, "grad_norm": 3.108058214187622, "grad_norm_var": 0.14303122083475486, "learning_rate": 0.0001, "loss": 1.223, "loss/crossentropy": 2.40995717048645, "loss/hidden": 1.015625, "loss/logits": 0.17001160979270935, "loss/reg": 0.0037354743108153343, "step": 1367 }, { "epoch": 0.171, "grad_norm": 2.5328550338745117, "grad_norm_var": 0.13302262467848933, "learning_rate": 0.0001, "loss": 1.1243, "loss/crossentropy": 2.48756742477417, "loss/hidden": 0.91015625, "loss/logits": 0.17680081725120544, "loss/reg": 0.0037336875684559345, "step": 1368 }, { "epoch": 0.171125, "grad_norm": 2.2643351554870605, "grad_norm_var": 0.13755867625505444, "learning_rate": 0.0001, "loss": 0.9683, "loss/crossentropy": 2.642735719680786, "loss/hidden": 0.80078125, "loss/logits": 0.13022208213806152, "loss/reg": 0.003731830744072795, "step": 1369 }, { "epoch": 0.17125, "grad_norm": 1.9663736820220947, "grad_norm_var": 0.12801642728851045, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.4266912937164307, "loss/hidden": 0.90625, "loss/logits": 0.13490843772888184, "loss/reg": 0.0037299375981092453, "step": 1370 }, { "epoch": 0.171375, "grad_norm": 2.1911253929138184, "grad_norm_var": 0.1382957404551554, "learning_rate": 0.0001, "loss": 1.1049, "loss/crossentropy": 2.4400012493133545, "loss/hidden": 0.90625, "loss/logits": 0.1613638699054718, "loss/reg": 0.003728190902620554, "step": 1371 }, { "epoch": 0.1715, "grad_norm": 2.3214714527130127, "grad_norm_var": 0.14227339992063978, "learning_rate": 0.0001, "loss": 0.9918, "loss/crossentropy": 2.5530331134796143, "loss/hidden": 0.81640625, "loss/logits": 0.13810178637504578, "loss/reg": 0.003726301481947303, "step": 1372 }, { "epoch": 0.171625, "grad_norm": 2.2973015308380127, "grad_norm_var": 0.13920199708697206, "learning_rate": 0.0001, "loss": 0.997, "loss/crossentropy": 2.51094913482666, "loss/hidden": 0.82421875, "loss/logits": 0.13557234406471252, "loss/reg": 0.0037244099657982588, "step": 1373 }, { "epoch": 0.17175, "grad_norm": 2.0010879039764404, "grad_norm_var": 0.1312278234613122, "learning_rate": 0.0001, "loss": 1.0181, "loss/crossentropy": 2.556290864944458, "loss/hidden": 0.8359375, "loss/logits": 0.1449136734008789, "loss/reg": 0.00372238177806139, "step": 1374 }, { "epoch": 0.171875, "grad_norm": 2.9626214504241943, "grad_norm_var": 0.13982906840783826, "learning_rate": 0.0001, "loss": 1.1252, "loss/crossentropy": 2.6870641708374023, "loss/hidden": 0.9453125, "loss/logits": 0.14271126687526703, "loss/reg": 0.003720562905073166, "step": 1375 }, { "epoch": 0.172, "grad_norm": 2.2013375759124756, "grad_norm_var": 0.1308908021708324, "learning_rate": 0.0001, "loss": 0.9615, "loss/crossentropy": 2.4653432369232178, "loss/hidden": 0.79296875, "loss/logits": 0.13133826851844788, "loss/reg": 0.0037186951376497746, "step": 1376 }, { "epoch": 0.172125, "grad_norm": 1.8874136209487915, "grad_norm_var": 0.15580902298580662, "learning_rate": 0.0001, "loss": 0.9726, "loss/crossentropy": 2.7080636024475098, "loss/hidden": 0.80078125, "loss/logits": 0.13469372689723969, "loss/reg": 0.0037167875561863184, "step": 1377 }, { "epoch": 0.17225, "grad_norm": 2.2416839599609375, "grad_norm_var": 0.14497829998406733, "learning_rate": 0.0001, "loss": 1.0436, "loss/crossentropy": 2.523420572280884, "loss/hidden": 0.8515625, "loss/logits": 0.15487292408943176, "loss/reg": 0.003714931197464466, "step": 1378 }, { "epoch": 0.172375, "grad_norm": 2.8195207118988037, "grad_norm_var": 0.15392134715338787, "learning_rate": 0.0001, "loss": 1.1432, "loss/crossentropy": 2.4862663745880127, "loss/hidden": 0.99609375, "loss/logits": 0.10999385267496109, "loss/reg": 0.003713154001161456, "step": 1379 }, { "epoch": 0.1725, "grad_norm": 2.118751287460327, "grad_norm_var": 0.14728210095098948, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.33610200881958, "loss/hidden": 0.875, "loss/logits": 0.14247506856918335, "loss/reg": 0.0037111735437065363, "step": 1380 }, { "epoch": 0.172625, "grad_norm": 2.783078193664551, "grad_norm_var": 0.14631392823386963, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.6499156951904297, "loss/hidden": 0.96484375, "loss/logits": 0.17454375326633453, "loss/reg": 0.0037092994898557663, "step": 1381 }, { "epoch": 0.17275, "grad_norm": 2.4130048751831055, "grad_norm_var": 0.13236574600067233, "learning_rate": 0.0001, "loss": 0.9868, "loss/crossentropy": 2.5483062267303467, "loss/hidden": 0.80859375, "loss/logits": 0.14112114906311035, "loss/reg": 0.003707532538101077, "step": 1382 }, { "epoch": 0.172875, "grad_norm": 2.2584409713745117, "grad_norm_var": 0.09521777507376417, "learning_rate": 0.0001, "loss": 1.1614, "loss/crossentropy": 2.5460643768310547, "loss/hidden": 0.9609375, "loss/logits": 0.16338081657886505, "loss/reg": 0.0037057846784591675, "step": 1383 }, { "epoch": 0.173, "grad_norm": 3.7609903812408447, "grad_norm_var": 0.2229059105024603, "learning_rate": 0.0001, "loss": 1.0447, "loss/crossentropy": 2.775035858154297, "loss/hidden": 0.85546875, "loss/logits": 0.15221986174583435, "loss/reg": 0.0037039562594145536, "step": 1384 }, { "epoch": 0.173125, "grad_norm": 2.734567642211914, "grad_norm_var": 0.22787300757804296, "learning_rate": 0.0001, "loss": 1.0205, "loss/crossentropy": 2.524040937423706, "loss/hidden": 0.83984375, "loss/logits": 0.14358995854854584, "loss/reg": 0.003702066373080015, "step": 1385 }, { "epoch": 0.17325, "grad_norm": 25.204544067382812, "grad_norm_var": 32.52689382508577, "learning_rate": 0.0001, "loss": 1.2611, "loss/crossentropy": 2.7667076587677, "loss/hidden": 1.078125, "loss/logits": 0.1459766924381256, "loss/reg": 0.0037002949975430965, "step": 1386 }, { "epoch": 0.173375, "grad_norm": 6.416390419006348, "grad_norm_var": 32.687121260827944, "learning_rate": 0.0001, "loss": 1.2171, "loss/crossentropy": 2.594097137451172, "loss/hidden": 1.0546875, "loss/logits": 0.1254766285419464, "loss/reg": 0.0036984088364988565, "step": 1387 }, { "epoch": 0.1735, "grad_norm": 1.9167765378952026, "grad_norm_var": 32.796098433775775, "learning_rate": 0.0001, "loss": 0.9744, "loss/crossentropy": 2.2913591861724854, "loss/hidden": 0.8125, "loss/logits": 0.12497787177562714, "loss/reg": 0.003696783911436796, "step": 1388 }, { "epoch": 0.173625, "grad_norm": 2.2160396575927734, "grad_norm_var": 32.816325970432494, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.8192343711853027, "loss/hidden": 0.8046875, "loss/logits": 0.13359323143959045, "loss/reg": 0.0036951396614313126, "step": 1389 }, { "epoch": 0.17375, "grad_norm": 2.288888931274414, "grad_norm_var": 32.74015382821924, "learning_rate": 0.0001, "loss": 1.1827, "loss/crossentropy": 2.637406826019287, "loss/hidden": 0.97265625, "loss/logits": 0.17314687371253967, "loss/reg": 0.0036930718924850225, "step": 1390 }, { "epoch": 0.173875, "grad_norm": 2.411794662475586, "grad_norm_var": 32.8455146358098, "learning_rate": 0.0001, "loss": 1.0526, "loss/crossentropy": 2.189383029937744, "loss/hidden": 0.86328125, "loss/logits": 0.1524190902709961, "loss/reg": 0.0036910499911755323, "step": 1391 }, { "epoch": 0.174, "grad_norm": 4.155782222747803, "grad_norm_var": 32.58828549446247, "learning_rate": 0.0001, "loss": 1.0565, "loss/crossentropy": 2.463545560836792, "loss/hidden": 0.87890625, "loss/logits": 0.14067277312278748, "loss/reg": 0.0036890122573822737, "step": 1392 }, { "epoch": 0.174125, "grad_norm": 5.347140312194824, "grad_norm_var": 32.25727325951478, "learning_rate": 0.0001, "loss": 1.3215, "loss/crossentropy": 2.153430223464966, "loss/hidden": 1.1171875, "loss/logits": 0.1673976629972458, "loss/reg": 0.0036869607865810394, "step": 1393 }, { "epoch": 0.17425, "grad_norm": 2.7766270637512207, "grad_norm_var": 32.11815070371227, "learning_rate": 0.0001, "loss": 1.0493, "loss/crossentropy": 2.4740800857543945, "loss/hidden": 0.88671875, "loss/logits": 0.1257045567035675, "loss/reg": 0.003685306990519166, "step": 1394 }, { "epoch": 0.174375, "grad_norm": 4.125362396240234, "grad_norm_var": 31.936244846904135, "learning_rate": 0.0001, "loss": 1.3623, "loss/crossentropy": 2.216123342514038, "loss/hidden": 1.1328125, "loss/logits": 0.1926833689212799, "loss/reg": 0.003683644812554121, "step": 1395 }, { "epoch": 0.1745, "grad_norm": 2.5013086795806885, "grad_norm_var": 31.82097080900541, "learning_rate": 0.0001, "loss": 1.1619, "loss/crossentropy": 2.2907092571258545, "loss/hidden": 0.98046875, "loss/logits": 0.14458391070365906, "loss/reg": 0.00368195166811347, "step": 1396 }, { "epoch": 0.174625, "grad_norm": 2.9556281566619873, "grad_norm_var": 31.78144628269959, "learning_rate": 0.0001, "loss": 1.3064, "loss/crossentropy": 2.600534439086914, "loss/hidden": 0.98828125, "loss/logits": 0.2812826335430145, "loss/reg": 0.0036801020614802837, "step": 1397 }, { "epoch": 0.17475, "grad_norm": 2.312662363052368, "grad_norm_var": 31.811237788762753, "learning_rate": 0.0001, "loss": 1.2056, "loss/crossentropy": 2.346238613128662, "loss/hidden": 1.0078125, "loss/logits": 0.16105079650878906, "loss/reg": 0.0036781977396458387, "step": 1398 }, { "epoch": 0.174875, "grad_norm": 2.161569356918335, "grad_norm_var": 31.841893155076757, "learning_rate": 0.0001, "loss": 1.0891, "loss/crossentropy": 2.6312341690063477, "loss/hidden": 0.8984375, "loss/logits": 0.15386250615119934, "loss/reg": 0.0036762990057468414, "step": 1399 }, { "epoch": 0.175, "grad_norm": 2.7927346229553223, "grad_norm_var": 32.00627187711323, "learning_rate": 0.0001, "loss": 1.0835, "loss/crossentropy": 2.3059043884277344, "loss/hidden": 0.90234375, "loss/logits": 0.1444191038608551, "loss/reg": 0.0036745734978467226, "step": 1400 }, { "epoch": 0.175125, "grad_norm": 3.7161078453063965, "grad_norm_var": 31.832840403479917, "learning_rate": 0.0001, "loss": 1.2519, "loss/crossentropy": 2.26311993598938, "loss/hidden": 1.015625, "loss/logits": 0.19954730570316315, "loss/reg": 0.0036726652178913355, "step": 1401 }, { "epoch": 0.17525, "grad_norm": 2.494056224822998, "grad_norm_var": 1.6194340047826798, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.2084007263183594, "loss/hidden": 0.97265625, "loss/logits": 0.16313332319259644, "loss/reg": 0.00367086473852396, "step": 1402 }, { "epoch": 0.175375, "grad_norm": 8.896705627441406, "grad_norm_var": 3.0802516385388357, "learning_rate": 0.0001, "loss": 1.8271, "loss/crossentropy": 2.3072290420532227, "loss/hidden": 1.546875, "loss/logits": 0.24357590079307556, "loss/reg": 0.0036691350396722555, "step": 1403 }, { "epoch": 0.1755, "grad_norm": 2.695955753326416, "grad_norm_var": 2.972744932112194, "learning_rate": 0.0001, "loss": 1.2248, "loss/crossentropy": 2.350301504135132, "loss/hidden": 1.015625, "loss/logits": 0.1725081503391266, "loss/reg": 0.003667246550321579, "step": 1404 }, { "epoch": 0.175625, "grad_norm": 2.44710373878479, "grad_norm_var": 2.940667944838992, "learning_rate": 0.0001, "loss": 1.1112, "loss/crossentropy": 2.5243077278137207, "loss/hidden": 0.921875, "loss/logits": 0.15269093215465546, "loss/reg": 0.0036653466522693634, "step": 1405 }, { "epoch": 0.17575, "grad_norm": 2.1895623207092285, "grad_norm_var": 2.9557342642141196, "learning_rate": 0.0001, "loss": 1.035, "loss/crossentropy": 2.4991495609283447, "loss/hidden": 0.85546875, "loss/logits": 0.14287039637565613, "loss/reg": 0.003663522657006979, "step": 1406 }, { "epoch": 0.175875, "grad_norm": 6.32980489730835, "grad_norm_var": 3.412629436693123, "learning_rate": 0.0001, "loss": 1.4889, "loss/crossentropy": 2.193056583404541, "loss/hidden": 1.265625, "loss/logits": 0.18663693964481354, "loss/reg": 0.0036618507001549006, "step": 1407 }, { "epoch": 0.176, "grad_norm": 2.4646599292755127, "grad_norm_var": 3.4702546151327094, "learning_rate": 0.0001, "loss": 1.1915, "loss/crossentropy": 2.23488450050354, "loss/hidden": 0.984375, "loss/logits": 0.17051902413368225, "loss/reg": 0.003659995039924979, "step": 1408 }, { "epoch": 0.176125, "grad_norm": 2.211646318435669, "grad_norm_var": 3.317894410006067, "learning_rate": 0.0001, "loss": 0.9964, "loss/crossentropy": 2.5030038356781006, "loss/hidden": 0.8125, "loss/logits": 0.1473085880279541, "loss/reg": 0.0036581193562597036, "step": 1409 }, { "epoch": 0.17625, "grad_norm": 2.623199224472046, "grad_norm_var": 3.330419454675635, "learning_rate": 0.0001, "loss": 1.3115, "loss/crossentropy": 2.550668716430664, "loss/hidden": 1.0859375, "loss/logits": 0.18897610902786255, "loss/reg": 0.0036564578767865896, "step": 1410 }, { "epoch": 0.176375, "grad_norm": 2.587066173553467, "grad_norm_var": 3.3105432674443476, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.560816764831543, "loss/hidden": 0.8359375, "loss/logits": 0.14094150066375732, "loss/reg": 0.003654823638498783, "step": 1411 }, { "epoch": 0.1765, "grad_norm": 3.003955602645874, "grad_norm_var": 3.278755120231675, "learning_rate": 0.0001, "loss": 1.1132, "loss/crossentropy": 2.602020740509033, "loss/hidden": 0.9296875, "loss/logits": 0.14699101448059082, "loss/reg": 0.0036529425997287035, "step": 1412 }, { "epoch": 0.176625, "grad_norm": 2.4333064556121826, "grad_norm_var": 3.315795478379337, "learning_rate": 0.0001, "loss": 0.9971, "loss/crossentropy": 2.402600049972534, "loss/hidden": 0.82421875, "loss/logits": 0.13639256358146667, "loss/reg": 0.003651064820587635, "step": 1413 }, { "epoch": 0.17675, "grad_norm": 3.2620654106140137, "grad_norm_var": 3.2585387544687285, "learning_rate": 0.0001, "loss": 1.2037, "loss/crossentropy": 2.5123348236083984, "loss/hidden": 0.98828125, "loss/logits": 0.17892900109291077, "loss/reg": 0.0036491919308900833, "step": 1414 }, { "epoch": 0.176875, "grad_norm": 2.7178690433502197, "grad_norm_var": 3.195713317595645, "learning_rate": 0.0001, "loss": 1.1421, "loss/crossentropy": 2.5939574241638184, "loss/hidden": 0.94140625, "loss/logits": 0.16417454183101654, "loss/reg": 0.003647380042821169, "step": 1415 }, { "epoch": 0.177, "grad_norm": 2.7069778442382812, "grad_norm_var": 3.2020201720099597, "learning_rate": 0.0001, "loss": 1.0438, "loss/crossentropy": 2.4623637199401855, "loss/hidden": 0.859375, "loss/logits": 0.14793148636817932, "loss/reg": 0.0036454948130995035, "step": 1416 }, { "epoch": 0.177125, "grad_norm": 3.3774526119232178, "grad_norm_var": 3.1903428630054806, "learning_rate": 0.0001, "loss": 1.2305, "loss/crossentropy": 2.493734836578369, "loss/hidden": 1.03125, "loss/logits": 0.1627763956785202, "loss/reg": 0.003643598174676299, "step": 1417 }, { "epoch": 0.17725, "grad_norm": 3.0849242210388184, "grad_norm_var": 3.1504347640183825, "learning_rate": 0.0001, "loss": 1.1358, "loss/crossentropy": 2.631613254547119, "loss/hidden": 0.92578125, "loss/logits": 0.1735854148864746, "loss/reg": 0.0036417231895029545, "step": 1418 }, { "epoch": 0.177375, "grad_norm": 2.294229030609131, "grad_norm_var": 0.9608081109988983, "learning_rate": 0.0001, "loss": 1.0362, "loss/crossentropy": 2.3616018295288086, "loss/hidden": 0.8515625, "loss/logits": 0.1482659876346588, "loss/reg": 0.003640011651441455, "step": 1419 }, { "epoch": 0.1775, "grad_norm": 2.5457763671875, "grad_norm_var": 0.9663407595303662, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.293968439102173, "loss/hidden": 0.92578125, "loss/logits": 0.1531478762626648, "loss/reg": 0.003638186492025852, "step": 1420 }, { "epoch": 0.177625, "grad_norm": 2.2807705402374268, "grad_norm_var": 0.9779472660718359, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.3743157386779785, "loss/hidden": 0.859375, "loss/logits": 0.14730778336524963, "loss/reg": 0.003636348759755492, "step": 1421 }, { "epoch": 0.17775, "grad_norm": 3.111150026321411, "grad_norm_var": 0.9459346801334104, "learning_rate": 0.0001, "loss": 1.445, "loss/crossentropy": 2.40647292137146, "loss/hidden": 1.1875, "loss/logits": 0.22118544578552246, "loss/reg": 0.0036345720291137695, "step": 1422 }, { "epoch": 0.177875, "grad_norm": 2.957676410675049, "grad_norm_var": 0.13237886720594336, "learning_rate": 0.0001, "loss": 1.454, "loss/crossentropy": 2.2662835121154785, "loss/hidden": 1.1953125, "loss/logits": 0.22237327694892883, "loss/reg": 0.0036327948328107595, "step": 1423 }, { "epoch": 0.178, "grad_norm": 2.8840086460113525, "grad_norm_var": 0.12859406693209482, "learning_rate": 0.0001, "loss": 1.1926, "loss/crossentropy": 2.29878830909729, "loss/hidden": 0.98828125, "loss/logits": 0.16803184151649475, "loss/reg": 0.003631110303103924, "step": 1424 }, { "epoch": 0.178125, "grad_norm": 2.56199049949646, "grad_norm_var": 0.11087788727616968, "learning_rate": 0.0001, "loss": 1.1011, "loss/crossentropy": 2.510556221008301, "loss/hidden": 0.88671875, "loss/logits": 0.1780451089143753, "loss/reg": 0.003629653248935938, "step": 1425 }, { "epoch": 0.17825, "grad_norm": 2.547821044921875, "grad_norm_var": 0.11277902977970579, "learning_rate": 0.0001, "loss": 1.1145, "loss/crossentropy": 2.4963526725769043, "loss/hidden": 0.9140625, "loss/logits": 0.1641579121351242, "loss/reg": 0.003628302598372102, "step": 1426 }, { "epoch": 0.178375, "grad_norm": 4.021576404571533, "grad_norm_var": 0.20596057757328007, "learning_rate": 0.0001, "loss": 1.0427, "loss/crossentropy": 2.6380093097686768, "loss/hidden": 0.875, "loss/logits": 0.13147366046905518, "loss/reg": 0.00362660875543952, "step": 1427 }, { "epoch": 0.1785, "grad_norm": 2.6613025665283203, "grad_norm_var": 0.2068119512618426, "learning_rate": 0.0001, "loss": 1.1507, "loss/crossentropy": 2.449720859527588, "loss/hidden": 0.94140625, "loss/logits": 0.17299975454807281, "loss/reg": 0.003625056240707636, "step": 1428 }, { "epoch": 0.178625, "grad_norm": 2.810811996459961, "grad_norm_var": 0.19522032187841584, "learning_rate": 0.0001, "loss": 1.2604, "loss/crossentropy": 2.5181782245635986, "loss/hidden": 1.0390625, "loss/logits": 0.18506762385368347, "loss/reg": 0.0036235651932656765, "step": 1429 }, { "epoch": 0.17875, "grad_norm": 2.3159515857696533, "grad_norm_var": 0.20096961733446103, "learning_rate": 0.0001, "loss": 1.0049, "loss/crossentropy": 2.4013493061065674, "loss/hidden": 0.83203125, "loss/logits": 0.13662859797477722, "loss/reg": 0.0036217791493982077, "step": 1430 }, { "epoch": 0.178875, "grad_norm": 2.760279893875122, "grad_norm_var": 0.20058922636977528, "learning_rate": 0.0001, "loss": 1.1095, "loss/crossentropy": 2.494328737258911, "loss/hidden": 0.90234375, "loss/logits": 0.17097754776477814, "loss/reg": 0.0036199174355715513, "step": 1431 }, { "epoch": 0.179, "grad_norm": 2.1179399490356445, "grad_norm_var": 0.23018267869760295, "learning_rate": 0.0001, "loss": 0.9199, "loss/crossentropy": 2.328307628631592, "loss/hidden": 0.76171875, "loss/logits": 0.12199117988348007, "loss/reg": 0.0036181595642119646, "step": 1432 }, { "epoch": 0.179125, "grad_norm": 1.972982406616211, "grad_norm_var": 0.23987289746597373, "learning_rate": 0.0001, "loss": 1.1017, "loss/crossentropy": 2.3198962211608887, "loss/hidden": 0.91015625, "loss/logits": 0.15536776185035706, "loss/reg": 0.0036163018085062504, "step": 1433 }, { "epoch": 0.17925, "grad_norm": 2.297431707382202, "grad_norm_var": 0.23643810387164474, "learning_rate": 0.0001, "loss": 1.0198, "loss/crossentropy": 2.3877716064453125, "loss/hidden": 0.84765625, "loss/logits": 0.13596263527870178, "loss/reg": 0.003614293411374092, "step": 1434 }, { "epoch": 0.179375, "grad_norm": 4.3237714767456055, "grad_norm_var": 0.4019732306137899, "learning_rate": 0.0001, "loss": 1.8528, "loss/crossentropy": 2.4731862545013428, "loss/hidden": 1.515625, "loss/logits": 0.3010145425796509, "loss/reg": 0.0036123625468462706, "step": 1435 }, { "epoch": 0.1795, "grad_norm": 2.4513933658599854, "grad_norm_var": 0.4052347077082838, "learning_rate": 0.0001, "loss": 1.0992, "loss/crossentropy": 2.420868396759033, "loss/hidden": 0.91796875, "loss/logits": 0.14515507221221924, "loss/reg": 0.003610546700656414, "step": 1436 }, { "epoch": 0.179625, "grad_norm": 3.088550329208374, "grad_norm_var": 0.39496121989805694, "learning_rate": 0.0001, "loss": 0.9986, "loss/crossentropy": 2.5977416038513184, "loss/hidden": 0.8046875, "loss/logits": 0.1577831655740738, "loss/reg": 0.0036085534375160933, "step": 1437 }, { "epoch": 0.17975, "grad_norm": 2.4659948348999023, "grad_norm_var": 0.3946649959456747, "learning_rate": 0.0001, "loss": 1.12, "loss/crossentropy": 2.3049263954162598, "loss/hidden": 0.9296875, "loss/logits": 0.1542476862668991, "loss/reg": 0.0036067250184714794, "step": 1438 }, { "epoch": 0.179875, "grad_norm": 3.8259365558624268, "grad_norm_var": 0.46409173226906736, "learning_rate": 0.0001, "loss": 1.0567, "loss/crossentropy": 2.495133876800537, "loss/hidden": 0.87890625, "loss/logits": 0.14177045226097107, "loss/reg": 0.0036046463064849377, "step": 1439 }, { "epoch": 0.18, "grad_norm": 4.203658103942871, "grad_norm_var": 0.5843312188094536, "learning_rate": 0.0001, "loss": 1.1694, "loss/crossentropy": 2.5536646842956543, "loss/hidden": 0.98828125, "loss/logits": 0.14505554735660553, "loss/reg": 0.0036028120666742325, "step": 1440 }, { "epoch": 0.180125, "grad_norm": 2.6005194187164307, "grad_norm_var": 0.5826787847955602, "learning_rate": 0.0001, "loss": 1.0808, "loss/crossentropy": 2.723029613494873, "loss/hidden": 0.87890625, "loss/logits": 0.16591691970825195, "loss/reg": 0.003600981319323182, "step": 1441 }, { "epoch": 0.18025, "grad_norm": 2.6143887042999268, "grad_norm_var": 0.5797933388848217, "learning_rate": 0.0001, "loss": 1.0984, "loss/crossentropy": 2.677304744720459, "loss/hidden": 0.890625, "loss/logits": 0.17178688943386078, "loss/reg": 0.003598999697715044, "step": 1442 }, { "epoch": 0.180375, "grad_norm": 3.0147054195404053, "grad_norm_var": 0.4936957943628516, "learning_rate": 0.0001, "loss": 1.1792, "loss/crossentropy": 2.567859649658203, "loss/hidden": 0.95703125, "loss/logits": 0.18622992932796478, "loss/reg": 0.00359702087007463, "step": 1443 }, { "epoch": 0.1805, "grad_norm": 2.5004806518554688, "grad_norm_var": 0.4992588141750974, "learning_rate": 0.0001, "loss": 1.0141, "loss/crossentropy": 2.335355043411255, "loss/hidden": 0.84765625, "loss/logits": 0.1304875910282135, "loss/reg": 0.0035950199235230684, "step": 1444 }, { "epoch": 0.180625, "grad_norm": 2.9756336212158203, "grad_norm_var": 0.5004185509481144, "learning_rate": 0.0001, "loss": 1.1931, "loss/crossentropy": 2.262080430984497, "loss/hidden": 1.0, "loss/logits": 0.15718932449817657, "loss/reg": 0.00359291210770607, "step": 1445 }, { "epoch": 0.18075, "grad_norm": 2.8728082180023193, "grad_norm_var": 0.48047395147950145, "learning_rate": 0.0001, "loss": 1.1377, "loss/crossentropy": 2.4912827014923096, "loss/hidden": 0.91796875, "loss/logits": 0.18385069072246552, "loss/reg": 0.0035907707642763853, "step": 1446 }, { "epoch": 0.180875, "grad_norm": 2.440415143966675, "grad_norm_var": 0.4919916999810859, "learning_rate": 0.0001, "loss": 1.1763, "loss/crossentropy": 2.539290189743042, "loss/hidden": 0.95703125, "loss/logits": 0.18333688378334045, "loss/reg": 0.003588638501241803, "step": 1447 }, { "epoch": 0.181, "grad_norm": 2.3080894947052, "grad_norm_var": 0.4754273782914159, "learning_rate": 0.0001, "loss": 1.0376, "loss/crossentropy": 2.552766799926758, "loss/hidden": 0.8515625, "loss/logits": 0.1501522660255432, "loss/reg": 0.0035868044942617416, "step": 1448 }, { "epoch": 0.181125, "grad_norm": 2.343794822692871, "grad_norm_var": 0.43955761846480074, "learning_rate": 0.0001, "loss": 1.3478, "loss/crossentropy": 2.1275408267974854, "loss/hidden": 1.109375, "loss/logits": 0.2026042938232422, "loss/reg": 0.0035850289277732372, "step": 1449 }, { "epoch": 0.18125, "grad_norm": 2.139036178588867, "grad_norm_var": 0.45375597061421685, "learning_rate": 0.0001, "loss": 1.0913, "loss/crossentropy": 2.6152234077453613, "loss/hidden": 0.89453125, "loss/logits": 0.16098400950431824, "loss/reg": 0.003583215642720461, "step": 1450 }, { "epoch": 0.181375, "grad_norm": 6.257389545440674, "grad_norm_var": 1.0582259715841054, "learning_rate": 0.0001, "loss": 1.7871, "loss/crossentropy": 2.396705150604248, "loss/hidden": 1.4140625, "loss/logits": 0.3372613787651062, "loss/reg": 0.003581451950594783, "step": 1451 }, { "epoch": 0.1815, "grad_norm": 2.660068988800049, "grad_norm_var": 1.0455046997651758, "learning_rate": 0.0001, "loss": 1.1302, "loss/crossentropy": 2.4636411666870117, "loss/hidden": 0.9375, "loss/logits": 0.15687166154384613, "loss/reg": 0.0035797141026705503, "step": 1452 }, { "epoch": 0.181625, "grad_norm": 2.0276317596435547, "grad_norm_var": 1.1060792073261607, "learning_rate": 0.0001, "loss": 1.0576, "loss/crossentropy": 2.645092010498047, "loss/hidden": 0.8671875, "loss/logits": 0.15465402603149414, "loss/reg": 0.003578024450689554, "step": 1453 }, { "epoch": 0.18175, "grad_norm": 2.706411123275757, "grad_norm_var": 1.0940753984711251, "learning_rate": 0.0001, "loss": 1.2783, "loss/crossentropy": 2.465183973312378, "loss/hidden": 1.0703125, "loss/logits": 0.1722460836172104, "loss/reg": 0.003576185554265976, "step": 1454 }, { "epoch": 0.181875, "grad_norm": 2.883852243423462, "grad_norm_var": 1.0418023995859433, "learning_rate": 0.0001, "loss": 1.1553, "loss/crossentropy": 2.62016224861145, "loss/hidden": 0.9609375, "loss/logits": 0.15863552689552307, "loss/reg": 0.0035745068453252316, "step": 1455 }, { "epoch": 0.182, "grad_norm": 2.712892532348633, "grad_norm_var": 0.9234243773258602, "learning_rate": 0.0001, "loss": 1.2436, "loss/crossentropy": 2.7399840354919434, "loss/hidden": 1.0078125, "loss/logits": 0.20005175471305847, "loss/reg": 0.0035726907663047314, "step": 1456 }, { "epoch": 0.182125, "grad_norm": 2.2389307022094727, "grad_norm_var": 0.9419911218676927, "learning_rate": 0.0001, "loss": 1.0805, "loss/crossentropy": 2.4630889892578125, "loss/hidden": 0.88671875, "loss/logits": 0.15808376669883728, "loss/reg": 0.003570869332179427, "step": 1457 }, { "epoch": 0.18225, "grad_norm": 2.272373676300049, "grad_norm_var": 0.9574713564477899, "learning_rate": 0.0001, "loss": 1.0423, "loss/crossentropy": 2.420292377471924, "loss/hidden": 0.8515625, "loss/logits": 0.1550384759902954, "loss/reg": 0.003569073276594281, "step": 1458 }, { "epoch": 0.182375, "grad_norm": 1.9881032705307007, "grad_norm_var": 0.9901407757083646, "learning_rate": 0.0001, "loss": 1.0698, "loss/crossentropy": 2.412853240966797, "loss/hidden": 0.890625, "loss/logits": 0.14347431063652039, "loss/reg": 0.0035672772210091352, "step": 1459 }, { "epoch": 0.1825, "grad_norm": 9.163015365600586, "grad_norm_var": 3.5801338990358595, "learning_rate": 0.0001, "loss": 1.2578, "loss/crossentropy": 2.492182731628418, "loss/hidden": 1.0390625, "loss/logits": 0.18306787312030792, "loss/reg": 0.003565459046512842, "step": 1460 }, { "epoch": 0.182625, "grad_norm": 2.5656776428222656, "grad_norm_var": 3.598769741394437, "learning_rate": 0.0001, "loss": 1.001, "loss/crossentropy": 2.5562655925750732, "loss/hidden": 0.82421875, "loss/logits": 0.14114579558372498, "loss/reg": 0.003563658567145467, "step": 1461 }, { "epoch": 0.18275, "grad_norm": 2.286931276321411, "grad_norm_var": 3.6378752514728876, "learning_rate": 0.0001, "loss": 0.9807, "loss/crossentropy": 2.698147773742676, "loss/hidden": 0.8125, "loss/logits": 0.13260522484779358, "loss/reg": 0.0035618396941572428, "step": 1462 }, { "epoch": 0.182875, "grad_norm": 2.371244430541992, "grad_norm_var": 3.643908523891269, "learning_rate": 0.0001, "loss": 1.1124, "loss/crossentropy": 2.432371139526367, "loss/hidden": 0.91015625, "loss/logits": 0.16662147641181946, "loss/reg": 0.0035599328111857176, "step": 1463 }, { "epoch": 0.183, "grad_norm": 2.6184229850769043, "grad_norm_var": 3.6189046702026477, "learning_rate": 0.0001, "loss": 1.0458, "loss/crossentropy": 2.947530508041382, "loss/hidden": 0.8671875, "loss/logits": 0.143006831407547, "loss/reg": 0.0035581255797296762, "step": 1464 }, { "epoch": 0.183125, "grad_norm": 2.143239736557007, "grad_norm_var": 3.641031281987516, "learning_rate": 0.0001, "loss": 1.0678, "loss/crossentropy": 2.6318516731262207, "loss/hidden": 0.88671875, "loss/logits": 0.14551308751106262, "loss/reg": 0.00355625175870955, "step": 1465 }, { "epoch": 0.18325, "grad_norm": 2.7335619926452637, "grad_norm_var": 3.589745013057074, "learning_rate": 0.0001, "loss": 1.2078, "loss/crossentropy": 2.375401020050049, "loss/hidden": 0.9765625, "loss/logits": 0.19566710293293, "loss/reg": 0.0035545255523175, "step": 1466 }, { "epoch": 0.183375, "grad_norm": 2.3009843826293945, "grad_norm_var": 2.903458838671109, "learning_rate": 0.0001, "loss": 1.1591, "loss/crossentropy": 2.5083110332489014, "loss/hidden": 0.94921875, "loss/logits": 0.1743055284023285, "loss/reg": 0.003552833804860711, "step": 1467 }, { "epoch": 0.1835, "grad_norm": 1.9567177295684814, "grad_norm_var": 2.952619415111201, "learning_rate": 0.0001, "loss": 0.9881, "loss/crossentropy": 2.5743629932403564, "loss/hidden": 0.80859375, "loss/logits": 0.14398899674415588, "loss/reg": 0.003551185131072998, "step": 1468 }, { "epoch": 0.183625, "grad_norm": 2.2741682529449463, "grad_norm_var": 2.9306800113679072, "learning_rate": 0.0001, "loss": 1.0048, "loss/crossentropy": 2.4656572341918945, "loss/hidden": 0.83203125, "loss/logits": 0.1372774988412857, "loss/reg": 0.003549505490809679, "step": 1469 }, { "epoch": 0.18375, "grad_norm": 2.9343631267547607, "grad_norm_var": 2.9302919053315675, "learning_rate": 0.0001, "loss": 1.1254, "loss/crossentropy": 2.4442336559295654, "loss/hidden": 0.8984375, "loss/logits": 0.19146251678466797, "loss/reg": 0.003547689877450466, "step": 1470 }, { "epoch": 0.183875, "grad_norm": 2.1364428997039795, "grad_norm_var": 2.9608635231208154, "learning_rate": 0.0001, "loss": 1.0281, "loss/crossentropy": 2.4912197589874268, "loss/hidden": 0.84765625, "loss/logits": 0.14495311677455902, "loss/reg": 0.003545962506905198, "step": 1471 }, { "epoch": 0.184, "grad_norm": 2.244561195373535, "grad_norm_var": 2.979609556239146, "learning_rate": 0.0001, "loss": 0.9575, "loss/crossentropy": 2.461921215057373, "loss/hidden": 0.80078125, "loss/logits": 0.12127329409122467, "loss/reg": 0.0035440947394818068, "step": 1472 }, { "epoch": 0.184125, "grad_norm": 2.6610682010650635, "grad_norm_var": 2.96117686540241, "learning_rate": 0.0001, "loss": 1.0065, "loss/crossentropy": 2.4418022632598877, "loss/hidden": 0.828125, "loss/logits": 0.14295433461666107, "loss/reg": 0.003542231861501932, "step": 1473 }, { "epoch": 0.18425, "grad_norm": 2.17217755317688, "grad_norm_var": 2.9687286207062233, "learning_rate": 0.0001, "loss": 1.0648, "loss/crossentropy": 2.4521894454956055, "loss/hidden": 0.8828125, "loss/logits": 0.146602600812912, "loss/reg": 0.003540375269949436, "step": 1474 }, { "epoch": 0.184375, "grad_norm": 2.5094003677368164, "grad_norm_var": 2.930364197494137, "learning_rate": 0.0001, "loss": 1.0535, "loss/crossentropy": 2.2975356578826904, "loss/hidden": 0.890625, "loss/logits": 0.12753836810588837, "loss/reg": 0.003538495395332575, "step": 1475 }, { "epoch": 0.1845, "grad_norm": 2.8615875244140625, "grad_norm_var": 0.08025149529701801, "learning_rate": 0.0001, "loss": 1.1913, "loss/crossentropy": 2.4363348484039307, "loss/hidden": 0.98046875, "loss/logits": 0.1754908561706543, "loss/reg": 0.0035365556832402945, "step": 1476 }, { "epoch": 0.184625, "grad_norm": 2.3295183181762695, "grad_norm_var": 0.07924959319393471, "learning_rate": 0.0001, "loss": 1.085, "loss/crossentropy": 2.641104221343994, "loss/hidden": 0.89453125, "loss/logits": 0.15514951944351196, "loss/reg": 0.0035345894284546375, "step": 1477 }, { "epoch": 0.18475, "grad_norm": 3.0680177211761475, "grad_norm_var": 0.10473031746968976, "learning_rate": 0.0001, "loss": 1.0339, "loss/crossentropy": 2.1963143348693848, "loss/hidden": 0.84765625, "loss/logits": 0.15095466375350952, "loss/reg": 0.003532707691192627, "step": 1478 }, { "epoch": 0.184875, "grad_norm": 2.1066739559173584, "grad_norm_var": 0.11213794701280312, "learning_rate": 0.0001, "loss": 0.9353, "loss/crossentropy": 2.988297700881958, "loss/hidden": 0.78515625, "loss/logits": 0.11482476443052292, "loss/reg": 0.0035308676306158304, "step": 1479 }, { "epoch": 0.185, "grad_norm": 2.4566586017608643, "grad_norm_var": 0.10993979963402485, "learning_rate": 0.0001, "loss": 1.3199, "loss/crossentropy": 2.324228048324585, "loss/hidden": 1.09375, "loss/logits": 0.19089123606681824, "loss/reg": 0.0035288881044834852, "step": 1480 }, { "epoch": 0.185125, "grad_norm": 2.360886573791504, "grad_norm_var": 0.10456219156340367, "learning_rate": 0.0001, "loss": 1.11, "loss/crossentropy": 2.5515263080596924, "loss/hidden": 0.90625, "loss/logits": 0.16846542060375214, "loss/reg": 0.003527080873027444, "step": 1481 }, { "epoch": 0.18525, "grad_norm": 2.6080853939056396, "grad_norm_var": 0.10070469690842856, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.456618070602417, "loss/hidden": 0.9296875, "loss/logits": 0.18056106567382812, "loss/reg": 0.003525231732055545, "step": 1482 }, { "epoch": 0.185375, "grad_norm": 2.5533339977264404, "grad_norm_var": 0.10013072862828926, "learning_rate": 0.0001, "loss": 1.0745, "loss/crossentropy": 2.6200571060180664, "loss/hidden": 0.89453125, "loss/logits": 0.14470210671424866, "loss/reg": 0.003523309249430895, "step": 1483 }, { "epoch": 0.1855, "grad_norm": 2.5463483333587646, "grad_norm_var": 0.08291376946414909, "learning_rate": 0.0001, "loss": 1.1705, "loss/crossentropy": 2.4352331161499023, "loss/hidden": 0.95703125, "loss/logits": 0.1782476007938385, "loss/reg": 0.0035214636009186506, "step": 1484 }, { "epoch": 0.185625, "grad_norm": 2.1944315433502197, "grad_norm_var": 0.08559466734096356, "learning_rate": 0.0001, "loss": 0.9894, "loss/crossentropy": 2.350627899169922, "loss/hidden": 0.82421875, "loss/logits": 0.13000428676605225, "loss/reg": 0.0035196379758417606, "step": 1485 }, { "epoch": 0.18575, "grad_norm": 2.352766990661621, "grad_norm_var": 0.0718094639254095, "learning_rate": 0.0001, "loss": 1.1058, "loss/crossentropy": 2.625410556793213, "loss/hidden": 0.921875, "loss/logits": 0.1486971080303192, "loss/reg": 0.003517881967127323, "step": 1486 }, { "epoch": 0.185875, "grad_norm": 2.4021310806274414, "grad_norm_var": 0.06519778826045103, "learning_rate": 0.0001, "loss": 0.9818, "loss/crossentropy": 2.3567581176757812, "loss/hidden": 0.8203125, "loss/logits": 0.1262809932231903, "loss/reg": 0.0035160251427441835, "step": 1487 }, { "epoch": 0.186, "grad_norm": 2.510427951812744, "grad_norm_var": 0.06182866367774575, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.475712537765503, "loss/hidden": 0.921875, "loss/logits": 0.1811387538909912, "loss/reg": 0.003514372045174241, "step": 1488 }, { "epoch": 0.186125, "grad_norm": 2.0962274074554443, "grad_norm_var": 0.0681959672911449, "learning_rate": 0.0001, "loss": 0.9282, "loss/crossentropy": 2.600541830062866, "loss/hidden": 0.76953125, "loss/logits": 0.12352926284074783, "loss/reg": 0.0035127766896039248, "step": 1489 }, { "epoch": 0.18625, "grad_norm": 2.7332022190093994, "grad_norm_var": 0.06741919371529713, "learning_rate": 0.0001, "loss": 1.0116, "loss/crossentropy": 2.5983481407165527, "loss/hidden": 0.8359375, "loss/logits": 0.14054536819458008, "loss/reg": 0.003511229529976845, "step": 1490 }, { "epoch": 0.186375, "grad_norm": 2.262906789779663, "grad_norm_var": 0.07027029030217118, "learning_rate": 0.0001, "loss": 1.2372, "loss/crossentropy": 2.5684587955474854, "loss/hidden": 1.015625, "loss/logits": 0.1864907145500183, "loss/reg": 0.0035094181075692177, "step": 1491 }, { "epoch": 0.1865, "grad_norm": 2.142493963241577, "grad_norm_var": 0.06458349300590362, "learning_rate": 0.0001, "loss": 0.9329, "loss/crossentropy": 2.6954498291015625, "loss/hidden": 0.765625, "loss/logits": 0.13216395676136017, "loss/reg": 0.003507613204419613, "step": 1492 }, { "epoch": 0.186625, "grad_norm": 2.312235116958618, "grad_norm_var": 0.06481126280718001, "learning_rate": 0.0001, "loss": 1.0631, "loss/crossentropy": 2.7299952507019043, "loss/hidden": 0.8828125, "loss/logits": 0.1452367901802063, "loss/reg": 0.003506068605929613, "step": 1493 }, { "epoch": 0.18675, "grad_norm": 2.66845703125, "grad_norm_var": 0.040222462022599596, "learning_rate": 0.0001, "loss": 1.1768, "loss/crossentropy": 2.5572774410247803, "loss/hidden": 0.96875, "loss/logits": 0.17304158210754395, "loss/reg": 0.0035042495001107454, "step": 1494 }, { "epoch": 0.186875, "grad_norm": 2.0365381240844727, "grad_norm_var": 0.04321872460463207, "learning_rate": 0.0001, "loss": 1.0286, "loss/crossentropy": 2.845195770263672, "loss/hidden": 0.8515625, "loss/logits": 0.14202150702476501, "loss/reg": 0.0035027535632252693, "step": 1495 }, { "epoch": 0.187, "grad_norm": 2.2023096084594727, "grad_norm_var": 0.04499537551175739, "learning_rate": 0.0001, "loss": 1.1091, "loss/crossentropy": 2.4882686138153076, "loss/hidden": 0.91796875, "loss/logits": 0.15609663724899292, "loss/reg": 0.0035012420266866684, "step": 1496 }, { "epoch": 0.187125, "grad_norm": 2.208552122116089, "grad_norm_var": 0.04671054126144914, "learning_rate": 0.0001, "loss": 1.2829, "loss/crossentropy": 2.6602842807769775, "loss/hidden": 1.0546875, "loss/logits": 0.19319944083690643, "loss/reg": 0.0034994245506823063, "step": 1497 }, { "epoch": 0.18725, "grad_norm": 2.562239170074463, "grad_norm_var": 0.04535231939183457, "learning_rate": 0.0001, "loss": 1.0075, "loss/crossentropy": 2.238691806793213, "loss/hidden": 0.8359375, "loss/logits": 0.136610209941864, "loss/reg": 0.0034975947346538305, "step": 1498 }, { "epoch": 0.187375, "grad_norm": 2.401848316192627, "grad_norm_var": 0.04291264261425264, "learning_rate": 0.0001, "loss": 1.0662, "loss/crossentropy": 2.685908079147339, "loss/hidden": 0.890625, "loss/logits": 0.1406560093164444, "loss/reg": 0.0034957744646817446, "step": 1499 }, { "epoch": 0.1875, "grad_norm": 2.12306809425354, "grad_norm_var": 0.04314595548621488, "learning_rate": 0.0001, "loss": 1.1621, "loss/crossentropy": 2.2005772590637207, "loss/hidden": 0.9609375, "loss/logits": 0.1661786586046219, "loss/reg": 0.0034941888879984617, "step": 1500 }, { "epoch": 0.187625, "grad_norm": 3.0501010417938232, "grad_norm_var": 0.07394000618437152, "learning_rate": 0.0001, "loss": 1.3451, "loss/crossentropy": 2.623453140258789, "loss/hidden": 1.09375, "loss/logits": 0.21640917658805847, "loss/reg": 0.003492384683340788, "step": 1501 }, { "epoch": 0.18775, "grad_norm": 3.257550001144409, "grad_norm_var": 0.12192848616994235, "learning_rate": 0.0001, "loss": 1.3142, "loss/crossentropy": 2.670974016189575, "loss/hidden": 1.0859375, "loss/logits": 0.19331884384155273, "loss/reg": 0.0034905769862234592, "step": 1502 }, { "epoch": 0.187875, "grad_norm": 1.9728327989578247, "grad_norm_var": 0.13536526430902043, "learning_rate": 0.0001, "loss": 0.9776, "loss/crossentropy": 2.603712320327759, "loss/hidden": 0.8046875, "loss/logits": 0.1380743682384491, "loss/reg": 0.0034887471701949835, "step": 1503 }, { "epoch": 0.188, "grad_norm": 41.34659194946289, "grad_norm_var": 94.9270262878801, "learning_rate": 0.0001, "loss": 1.3036, "loss/crossentropy": 2.142944812774658, "loss/hidden": 1.0859375, "loss/logits": 0.1828157901763916, "loss/reg": 0.003486843081191182, "step": 1504 }, { "epoch": 0.188125, "grad_norm": 2.199233293533325, "grad_norm_var": 94.89006007533027, "learning_rate": 0.0001, "loss": 1.027, "loss/crossentropy": 2.5525684356689453, "loss/hidden": 0.8515625, "loss/logits": 0.14055398106575012, "loss/reg": 0.0034850805532187223, "step": 1505 }, { "epoch": 0.18825, "grad_norm": 2.0132434368133545, "grad_norm_var": 95.12493831851324, "learning_rate": 0.0001, "loss": 0.9913, "loss/crossentropy": 2.3946876525878906, "loss/hidden": 0.82421875, "loss/logits": 0.13224273920059204, "loss/reg": 0.0034832614473998547, "step": 1506 }, { "epoch": 0.188375, "grad_norm": 2.1540908813476562, "grad_norm_var": 95.16245243204516, "learning_rate": 0.0001, "loss": 1.0228, "loss/crossentropy": 2.4561750888824463, "loss/hidden": 0.83203125, "loss/logits": 0.15593823790550232, "loss/reg": 0.003481344785541296, "step": 1507 }, { "epoch": 0.1885, "grad_norm": 2.6965999603271484, "grad_norm_var": 94.98598958949965, "learning_rate": 0.0001, "loss": 1.2656, "loss/crossentropy": 2.38210391998291, "loss/hidden": 1.0390625, "loss/logits": 0.19175776839256287, "loss/reg": 0.003479481441900134, "step": 1508 }, { "epoch": 0.188625, "grad_norm": 3.4180994033813477, "grad_norm_var": 94.69186888365496, "learning_rate": 0.0001, "loss": 1.2663, "loss/crossentropy": 2.4697988033294678, "loss/hidden": 1.046875, "loss/logits": 0.18465159833431244, "loss/reg": 0.0034777566324919462, "step": 1509 }, { "epoch": 0.18875, "grad_norm": 2.273015022277832, "grad_norm_var": 94.81900961164247, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.623250722885132, "loss/hidden": 0.87890625, "loss/logits": 0.16774481534957886, "loss/reg": 0.003475895617157221, "step": 1510 }, { "epoch": 0.188875, "grad_norm": 2.247028112411499, "grad_norm_var": 94.74226385976161, "learning_rate": 0.0001, "loss": 1.117, "loss/crossentropy": 2.674126148223877, "loss/hidden": 0.921875, "loss/logits": 0.1603851318359375, "loss/reg": 0.0034741731360554695, "step": 1511 }, { "epoch": 0.189, "grad_norm": 4.356043338775635, "grad_norm_var": 94.26240397096606, "learning_rate": 0.0001, "loss": 1.2795, "loss/crossentropy": 2.6789023876190186, "loss/hidden": 1.0546875, "loss/logits": 0.19006717205047607, "loss/reg": 0.0034723973367363214, "step": 1512 }, { "epoch": 0.189125, "grad_norm": 2.381601572036743, "grad_norm_var": 94.19946382080788, "learning_rate": 0.0001, "loss": 1.245, "loss/crossentropy": 2.090708017349243, "loss/hidden": 1.046875, "loss/logits": 0.1634030044078827, "loss/reg": 0.0034707069862633944, "step": 1513 }, { "epoch": 0.18925, "grad_norm": 4.63723087310791, "grad_norm_var": 93.78628244843514, "learning_rate": 0.0001, "loss": 1.0803, "loss/crossentropy": 2.9273617267608643, "loss/hidden": 0.90625, "loss/logits": 0.13935977220535278, "loss/reg": 0.0034691800829023123, "step": 1514 }, { "epoch": 0.189375, "grad_norm": 3.1558918952941895, "grad_norm_var": 93.54471655609011, "learning_rate": 0.0001, "loss": 1.1021, "loss/crossentropy": 2.6062216758728027, "loss/hidden": 0.8984375, "loss/logits": 0.16897618770599365, "loss/reg": 0.003467726521193981, "step": 1515 }, { "epoch": 0.1895, "grad_norm": 2.6901845932006836, "grad_norm_var": 93.331765452413, "learning_rate": 0.0001, "loss": 1.0927, "loss/crossentropy": 2.563309907913208, "loss/hidden": 0.9140625, "loss/logits": 0.14401212334632874, "loss/reg": 0.003465942805632949, "step": 1516 }, { "epoch": 0.189625, "grad_norm": 2.6145591735839844, "grad_norm_var": 93.47082774818871, "learning_rate": 0.0001, "loss": 1.1674, "loss/crossentropy": 2.80975604057312, "loss/hidden": 0.96484375, "loss/logits": 0.1679481714963913, "loss/reg": 0.0034644228871911764, "step": 1517 }, { "epoch": 0.18975, "grad_norm": 2.1897029876708984, "grad_norm_var": 93.82056409785089, "learning_rate": 0.0001, "loss": 1.1465, "loss/crossentropy": 2.4143311977386475, "loss/hidden": 0.94140625, "loss/logits": 0.17049774527549744, "loss/reg": 0.003462952096015215, "step": 1518 }, { "epoch": 0.189875, "grad_norm": 2.565324068069458, "grad_norm_var": 93.59177882800311, "learning_rate": 0.0001, "loss": 1.3113, "loss/crossentropy": 2.0998597145080566, "loss/hidden": 1.0703125, "loss/logits": 0.20640692114830017, "loss/reg": 0.0034615371841937304, "step": 1519 }, { "epoch": 0.19, "grad_norm": 2.4154701232910156, "grad_norm_var": 0.6036209187642118, "learning_rate": 0.0001, "loss": 0.9326, "loss/crossentropy": 2.6523633003234863, "loss/hidden": 0.76171875, "loss/logits": 0.1362442970275879, "loss/reg": 0.0034597725607454777, "step": 1520 }, { "epoch": 0.190125, "grad_norm": 2.2444441318511963, "grad_norm_var": 0.6004258293545394, "learning_rate": 0.0001, "loss": 1.128, "loss/crossentropy": 2.0900661945343018, "loss/hidden": 0.95703125, "loss/logits": 0.136434406042099, "loss/reg": 0.0034582833759486675, "step": 1521 }, { "epoch": 0.19025, "grad_norm": 2.411386728286743, "grad_norm_var": 0.5710476325004736, "learning_rate": 0.0001, "loss": 1.2059, "loss/crossentropy": 2.381988286972046, "loss/hidden": 0.99609375, "loss/logits": 0.17528721690177917, "loss/reg": 0.0034567993134260178, "step": 1522 }, { "epoch": 0.190375, "grad_norm": 2.5889010429382324, "grad_norm_var": 0.5466832532559577, "learning_rate": 0.0001, "loss": 1.0782, "loss/crossentropy": 2.6577584743499756, "loss/hidden": 0.8984375, "loss/logits": 0.14523936808109283, "loss/reg": 0.003455315949395299, "step": 1523 }, { "epoch": 0.1905, "grad_norm": 2.219857692718506, "grad_norm_var": 0.56780075329749, "learning_rate": 0.0001, "loss": 1.0392, "loss/crossentropy": 2.7978203296661377, "loss/hidden": 0.8359375, "loss/logits": 0.16872358322143555, "loss/reg": 0.0034539303742349148, "step": 1524 }, { "epoch": 0.190625, "grad_norm": 2.3635013103485107, "grad_norm_var": 0.5469604537174282, "learning_rate": 0.0001, "loss": 1.0257, "loss/crossentropy": 2.6388540267944336, "loss/hidden": 0.8359375, "loss/logits": 0.15525703132152557, "loss/reg": 0.0034521608613431454, "step": 1525 }, { "epoch": 0.19075, "grad_norm": 2.082015037536621, "grad_norm_var": 0.560359742807311, "learning_rate": 0.0001, "loss": 1.1153, "loss/crossentropy": 2.446864128112793, "loss/hidden": 0.921875, "loss/logits": 0.15890395641326904, "loss/reg": 0.003450631396844983, "step": 1526 }, { "epoch": 0.190875, "grad_norm": 3.5639352798461914, "grad_norm_var": 0.5896182471249951, "learning_rate": 0.0001, "loss": 1.193, "loss/crossentropy": 2.174250602722168, "loss/hidden": 1.0234375, "loss/logits": 0.13503766059875488, "loss/reg": 0.0034488984383642673, "step": 1527 }, { "epoch": 0.191, "grad_norm": 3.0047361850738525, "grad_norm_var": 0.4197832623445635, "learning_rate": 0.0001, "loss": 1.2854, "loss/crossentropy": 2.883190631866455, "loss/hidden": 1.0625, "loss/logits": 0.1883969008922577, "loss/reg": 0.003447153139859438, "step": 1528 }, { "epoch": 0.191125, "grad_norm": 2.433673858642578, "grad_norm_var": 0.4177730223981217, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.390415668487549, "loss/hidden": 0.9765625, "loss/logits": 0.17427203059196472, "loss/reg": 0.003445402719080448, "step": 1529 }, { "epoch": 0.19125, "grad_norm": 3.7508089542388916, "grad_norm_var": 0.23777977315326002, "learning_rate": 0.0001, "loss": 1.2249, "loss/crossentropy": 2.995112895965576, "loss/hidden": 1.015625, "loss/logits": 0.17482982575893402, "loss/reg": 0.003443735418841243, "step": 1530 }, { "epoch": 0.191375, "grad_norm": 2.349748373031616, "grad_norm_var": 0.22331083482360797, "learning_rate": 0.0001, "loss": 1.1455, "loss/crossentropy": 2.5757083892822266, "loss/hidden": 0.9453125, "loss/logits": 0.16579627990722656, "loss/reg": 0.00344208930619061, "step": 1531 }, { "epoch": 0.1915, "grad_norm": 2.0953431129455566, "grad_norm_var": 0.23771892232560557, "learning_rate": 0.0001, "loss": 1.0868, "loss/crossentropy": 2.493220567703247, "loss/hidden": 0.8828125, "loss/logits": 0.169611394405365, "loss/reg": 0.003440374741330743, "step": 1532 }, { "epoch": 0.191625, "grad_norm": 2.2157857418060303, "grad_norm_var": 0.24453549562240368, "learning_rate": 0.0001, "loss": 1.071, "loss/crossentropy": 2.383723735809326, "loss/hidden": 0.8984375, "loss/logits": 0.1382179707288742, "loss/reg": 0.0034387765917927027, "step": 1533 }, { "epoch": 0.19175, "grad_norm": 2.3576011657714844, "grad_norm_var": 0.23865884883084618, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.542468547821045, "loss/hidden": 0.828125, "loss/logits": 0.14409103989601135, "loss/reg": 0.0034371281508356333, "step": 1534 }, { "epoch": 0.191875, "grad_norm": 3.1461169719696045, "grad_norm_var": 0.2615933880776604, "learning_rate": 0.0001, "loss": 1.2554, "loss/crossentropy": 2.378319025039673, "loss/hidden": 1.046875, "loss/logits": 0.17414087057113647, "loss/reg": 0.0034355788957327604, "step": 1535 }, { "epoch": 0.192, "grad_norm": 2.0135512351989746, "grad_norm_var": 0.28038375054829506, "learning_rate": 0.0001, "loss": 1.083, "loss/crossentropy": 1.9969367980957031, "loss/hidden": 0.90625, "loss/logits": 0.1424512416124344, "loss/reg": 0.0034339565318077803, "step": 1536 }, { "epoch": 0.192125, "grad_norm": 2.7598655223846436, "grad_norm_var": 0.27581093075343593, "learning_rate": 0.0001, "loss": 1.2095, "loss/crossentropy": 2.7056527137756348, "loss/hidden": 1.0, "loss/logits": 0.1751493215560913, "loss/reg": 0.0034322130959481, "step": 1537 }, { "epoch": 0.19225, "grad_norm": 2.116117000579834, "grad_norm_var": 0.28808717203202694, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.7393789291381836, "loss/hidden": 0.87890625, "loss/logits": 0.15042325854301453, "loss/reg": 0.003430649871006608, "step": 1538 }, { "epoch": 0.192375, "grad_norm": 3.571504592895508, "grad_norm_var": 0.35138636847546134, "learning_rate": 0.0001, "loss": 1.3128, "loss/crossentropy": 2.3102715015411377, "loss/hidden": 1.0859375, "loss/logits": 0.19261470437049866, "loss/reg": 0.0034290915355086327, "step": 1539 }, { "epoch": 0.1925, "grad_norm": 2.954730987548828, "grad_norm_var": 0.34517124347645045, "learning_rate": 0.0001, "loss": 1.2081, "loss/crossentropy": 2.3605659008026123, "loss/hidden": 1.0078125, "loss/logits": 0.16598157584667206, "loss/reg": 0.003427294548600912, "step": 1540 }, { "epoch": 0.192625, "grad_norm": 2.188232660293579, "grad_norm_var": 0.3543400274390722, "learning_rate": 0.0001, "loss": 1.0359, "loss/crossentropy": 2.4097092151641846, "loss/hidden": 0.8515625, "loss/logits": 0.15011939406394958, "loss/reg": 0.0034254533238708973, "step": 1541 }, { "epoch": 0.19275, "grad_norm": 1.8244503736495972, "grad_norm_var": 0.37842932295744913, "learning_rate": 0.0001, "loss": 1.1172, "loss/crossentropy": 2.4335646629333496, "loss/hidden": 0.92578125, "loss/logits": 0.15719163417816162, "loss/reg": 0.003423537826165557, "step": 1542 }, { "epoch": 0.192875, "grad_norm": 2.0547943115234375, "grad_norm_var": 0.33619594757236554, "learning_rate": 0.0001, "loss": 1.1579, "loss/crossentropy": 2.5234384536743164, "loss/hidden": 0.9453125, "loss/logits": 0.17836084961891174, "loss/reg": 0.0034218020737171173, "step": 1543 }, { "epoch": 0.193, "grad_norm": 2.1028945446014404, "grad_norm_var": 0.33262686711846395, "learning_rate": 0.0001, "loss": 1.024, "loss/crossentropy": 2.5168957710266113, "loss/hidden": 0.86328125, "loss/logits": 0.1265672892332077, "loss/reg": 0.003420063992962241, "step": 1544 }, { "epoch": 0.193125, "grad_norm": 3.1589317321777344, "grad_norm_var": 0.3594795180238894, "learning_rate": 0.0001, "loss": 1.3999, "loss/crossentropy": 2.19030499458313, "loss/hidden": 1.171875, "loss/logits": 0.19384470582008362, "loss/reg": 0.0034181931987404823, "step": 1545 }, { "epoch": 0.19325, "grad_norm": 2.3448944091796875, "grad_norm_var": 0.2562841379896558, "learning_rate": 0.0001, "loss": 1.1032, "loss/crossentropy": 2.6743974685668945, "loss/hidden": 0.91015625, "loss/logits": 0.15885460376739502, "loss/reg": 0.003416434396058321, "step": 1546 }, { "epoch": 0.193375, "grad_norm": 2.1164956092834473, "grad_norm_var": 0.2629084863422197, "learning_rate": 0.0001, "loss": 1.1537, "loss/crossentropy": 2.233337879180908, "loss/hidden": 0.96875, "loss/logits": 0.15075814723968506, "loss/reg": 0.0034145053941756487, "step": 1547 }, { "epoch": 0.1935, "grad_norm": 2.2009761333465576, "grad_norm_var": 0.2587680482498602, "learning_rate": 0.0001, "loss": 1.066, "loss/crossentropy": 2.364064931869507, "loss/hidden": 0.87890625, "loss/logits": 0.15300363302230835, "loss/reg": 0.003412702353671193, "step": 1548 }, { "epoch": 0.193625, "grad_norm": 2.375746250152588, "grad_norm_var": 0.2554693062414391, "learning_rate": 0.0001, "loss": 1.1073, "loss/crossentropy": 2.6182949542999268, "loss/hidden": 0.92578125, "loss/logits": 0.14743448793888092, "loss/reg": 0.0034108073450624943, "step": 1549 }, { "epoch": 0.19375, "grad_norm": 2.5327534675598145, "grad_norm_var": 0.25510200809180733, "learning_rate": 0.0001, "loss": 1.0888, "loss/crossentropy": 2.710033416748047, "loss/hidden": 0.88671875, "loss/logits": 0.16803640127182007, "loss/reg": 0.0034089069813489914, "step": 1550 }, { "epoch": 0.193875, "grad_norm": 2.195998430252075, "grad_norm_var": 0.22541138413578582, "learning_rate": 0.0001, "loss": 1.1853, "loss/crossentropy": 2.199061393737793, "loss/hidden": 0.9765625, "loss/logits": 0.17465950548648834, "loss/reg": 0.003407144919037819, "step": 1551 }, { "epoch": 0.194, "grad_norm": 2.297307014465332, "grad_norm_var": 0.2155580849353057, "learning_rate": 0.0001, "loss": 0.9825, "loss/crossentropy": 2.3787636756896973, "loss/hidden": 0.82421875, "loss/logits": 0.12426453083753586, "loss/reg": 0.0034053786657750607, "step": 1552 }, { "epoch": 0.194125, "grad_norm": 42.85431671142578, "grad_norm_var": 102.47997721663579, "learning_rate": 0.0001, "loss": 1.0865, "loss/crossentropy": 2.3751256465911865, "loss/hidden": 0.8984375, "loss/logits": 0.15405802428722382, "loss/reg": 0.0034037018194794655, "step": 1553 }, { "epoch": 0.19425, "grad_norm": 2.448805809020996, "grad_norm_var": 102.36204705695503, "learning_rate": 0.0001, "loss": 1.1293, "loss/crossentropy": 2.632035255432129, "loss/hidden": 0.91796875, "loss/logits": 0.17729425430297852, "loss/reg": 0.0034019986633211374, "step": 1554 }, { "epoch": 0.194375, "grad_norm": 2.3281965255737305, "grad_norm_var": 102.68741632356571, "learning_rate": 0.0001, "loss": 1.1374, "loss/crossentropy": 2.444570302963257, "loss/hidden": 0.9375, "loss/logits": 0.16586540639400482, "loss/reg": 0.003400270827114582, "step": 1555 }, { "epoch": 0.1945, "grad_norm": 2.293851852416992, "grad_norm_var": 102.8838099010742, "learning_rate": 0.0001, "loss": 1.0756, "loss/crossentropy": 2.545522928237915, "loss/hidden": 0.875, "loss/logits": 0.16658270359039307, "loss/reg": 0.0033986270427703857, "step": 1556 }, { "epoch": 0.194625, "grad_norm": 4.360113143920898, "grad_norm_var": 102.41291327849744, "learning_rate": 0.0001, "loss": 1.2058, "loss/crossentropy": 2.3416459560394287, "loss/hidden": 1.0234375, "loss/logits": 0.14838361740112305, "loss/reg": 0.003396830288693309, "step": 1557 }, { "epoch": 0.19475, "grad_norm": 2.38606858253479, "grad_norm_var": 102.19721826513539, "learning_rate": 0.0001, "loss": 1.0499, "loss/crossentropy": 2.7640974521636963, "loss/hidden": 0.875, "loss/logits": 0.1409429907798767, "loss/reg": 0.0033951113000512123, "step": 1558 }, { "epoch": 0.194875, "grad_norm": 2.7758493423461914, "grad_norm_var": 101.94624591139775, "learning_rate": 0.0001, "loss": 1.0704, "loss/crossentropy": 2.497638702392578, "loss/hidden": 0.890625, "loss/logits": 0.14583545923233032, "loss/reg": 0.0033934745006263256, "step": 1559 }, { "epoch": 0.195, "grad_norm": 3.479710340499878, "grad_norm_var": 101.52401358472737, "learning_rate": 0.0001, "loss": 1.1734, "loss/crossentropy": 2.457615375518799, "loss/hidden": 0.9609375, "loss/logits": 0.1785276234149933, "loss/reg": 0.0033916765823960304, "step": 1560 }, { "epoch": 0.195125, "grad_norm": 2.017589807510376, "grad_norm_var": 101.90605089709193, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.240902900695801, "loss/hidden": 0.90625, "loss/logits": 0.17671313881874084, "loss/reg": 0.0033898656256496906, "step": 1561 }, { "epoch": 0.19525, "grad_norm": 3.5527615547180176, "grad_norm_var": 101.55947999989262, "learning_rate": 0.0001, "loss": 1.3431, "loss/crossentropy": 1.965632677078247, "loss/hidden": 1.1484375, "loss/logits": 0.16078650951385498, "loss/reg": 0.0033880271948873997, "step": 1562 }, { "epoch": 0.195375, "grad_norm": 2.345229387283325, "grad_norm_var": 101.47058431829664, "learning_rate": 0.0001, "loss": 1.0961, "loss/crossentropy": 2.583942174911499, "loss/hidden": 0.8984375, "loss/logits": 0.1638316810131073, "loss/reg": 0.003386161755770445, "step": 1563 }, { "epoch": 0.1955, "grad_norm": 3.6912682056427, "grad_norm_var": 101.02284512008372, "learning_rate": 0.0001, "loss": 1.1561, "loss/crossentropy": 2.344486713409424, "loss/hidden": 0.9140625, "loss/logits": 0.2081700563430786, "loss/reg": 0.0033844145946204662, "step": 1564 }, { "epoch": 0.195625, "grad_norm": 2.0806732177734375, "grad_norm_var": 101.14121040687311, "learning_rate": 0.0001, "loss": 1.1902, "loss/crossentropy": 2.3066744804382324, "loss/hidden": 0.9765625, "loss/logits": 0.17980894446372986, "loss/reg": 0.0033826008439064026, "step": 1565 }, { "epoch": 0.19575, "grad_norm": 2.6736996173858643, "grad_norm_var": 101.09180955446247, "learning_rate": 0.0001, "loss": 1.1123, "loss/crossentropy": 2.6491916179656982, "loss/hidden": 0.83984375, "loss/logits": 0.23861975967884064, "loss/reg": 0.0033807456493377686, "step": 1566 }, { "epoch": 0.195875, "grad_norm": 2.320620059967041, "grad_norm_var": 101.0422612381744, "learning_rate": 0.0001, "loss": 0.9997, "loss/crossentropy": 2.432460069656372, "loss/hidden": 0.83984375, "loss/logits": 0.12610265612602234, "loss/reg": 0.003379035508260131, "step": 1567 }, { "epoch": 0.196, "grad_norm": 2.170931816101074, "grad_norm_var": 101.09291343176476, "learning_rate": 0.0001, "loss": 1.0037, "loss/crossentropy": 2.6432549953460693, "loss/hidden": 0.83203125, "loss/logits": 0.13785216212272644, "loss/reg": 0.0033771616872400045, "step": 1568 }, { "epoch": 0.196125, "grad_norm": 1.9352695941925049, "grad_norm_var": 0.5014398496234489, "learning_rate": 0.0001, "loss": 1.0546, "loss/crossentropy": 2.368238925933838, "loss/hidden": 0.8671875, "loss/logits": 0.15368467569351196, "loss/reg": 0.0033752431627362967, "step": 1569 }, { "epoch": 0.19625, "grad_norm": 1.7521263360977173, "grad_norm_var": 0.5531383546467253, "learning_rate": 0.0001, "loss": 0.964, "loss/crossentropy": 2.388716220855713, "loss/hidden": 0.796875, "loss/logits": 0.13340537250041962, "loss/reg": 0.0033735185861587524, "step": 1570 }, { "epoch": 0.196375, "grad_norm": 2.942868709564209, "grad_norm_var": 0.5515874670900174, "learning_rate": 0.0001, "loss": 1.246, "loss/crossentropy": 2.4898412227630615, "loss/hidden": 1.03125, "loss/logits": 0.18107619881629944, "loss/reg": 0.0033718394115567207, "step": 1571 }, { "epoch": 0.1965, "grad_norm": 2.828531503677368, "grad_norm_var": 0.5423780354131977, "learning_rate": 0.0001, "loss": 1.1128, "loss/crossentropy": 2.526690721511841, "loss/hidden": 0.9140625, "loss/logits": 0.1650082767009735, "loss/reg": 0.0033701006323099136, "step": 1572 }, { "epoch": 0.196625, "grad_norm": 1.8868728876113892, "grad_norm_var": 0.3795729319831989, "learning_rate": 0.0001, "loss": 0.9528, "loss/crossentropy": 2.4025442600250244, "loss/hidden": 0.78125, "loss/logits": 0.13787029683589935, "loss/reg": 0.0033683953806757927, "step": 1573 }, { "epoch": 0.19675, "grad_norm": 2.290252208709717, "grad_norm_var": 0.38227303455985767, "learning_rate": 0.0001, "loss": 1.0692, "loss/crossentropy": 2.349837064743042, "loss/hidden": 0.87109375, "loss/logits": 0.16445474326610565, "loss/reg": 0.0033667993266135454, "step": 1574 }, { "epoch": 0.196875, "grad_norm": 2.190880537033081, "grad_norm_var": 0.385772762292572, "learning_rate": 0.0001, "loss": 0.9696, "loss/crossentropy": 2.2366368770599365, "loss/hidden": 0.796875, "loss/logits": 0.13903136551380157, "loss/reg": 0.0033651133999228477, "step": 1575 }, { "epoch": 0.197, "grad_norm": 3.2513270378112793, "grad_norm_var": 0.35950258294762044, "learning_rate": 0.0001, "loss": 1.2407, "loss/crossentropy": 2.3932385444641113, "loss/hidden": 1.03125, "loss/logits": 0.1757686287164688, "loss/reg": 0.003363401163369417, "step": 1576 }, { "epoch": 0.197125, "grad_norm": 2.902137279510498, "grad_norm_var": 0.35201813546933614, "learning_rate": 0.0001, "loss": 1.3339, "loss/crossentropy": 2.5385305881500244, "loss/hidden": 1.1171875, "loss/logits": 0.18306049704551697, "loss/reg": 0.0033617597073316574, "step": 1577 }, { "epoch": 0.19725, "grad_norm": 2.4924919605255127, "grad_norm_var": 0.2806556923123916, "learning_rate": 0.0001, "loss": 1.101, "loss/crossentropy": 2.913592576980591, "loss/hidden": 0.9140625, "loss/logits": 0.15331600606441498, "loss/reg": 0.003360015107318759, "step": 1578 }, { "epoch": 0.197375, "grad_norm": 2.6192233562469482, "grad_norm_var": 0.2802525663669532, "learning_rate": 0.0001, "loss": 1.2151, "loss/crossentropy": 2.543704032897949, "loss/hidden": 1.0234375, "loss/logits": 0.1581249237060547, "loss/reg": 0.003358310554176569, "step": 1579 }, { "epoch": 0.1975, "grad_norm": 2.3134047985076904, "grad_norm_var": 0.1803902922500375, "learning_rate": 0.0001, "loss": 1.217, "loss/crossentropy": 2.3356986045837402, "loss/hidden": 1.0234375, "loss/logits": 0.15994945168495178, "loss/reg": 0.003356639062985778, "step": 1580 }, { "epoch": 0.197625, "grad_norm": 2.918966054916382, "grad_norm_var": 0.18686370719447395, "learning_rate": 0.0001, "loss": 1.4755, "loss/crossentropy": 2.0418221950531006, "loss/hidden": 1.2109375, "loss/logits": 0.2309650182723999, "loss/reg": 0.00335493846796453, "step": 1581 }, { "epoch": 0.19775, "grad_norm": 2.1771440505981445, "grad_norm_var": 0.1886619692371113, "learning_rate": 0.0001, "loss": 0.9935, "loss/crossentropy": 2.502734661102295, "loss/hidden": 0.82421875, "loss/logits": 0.1357189416885376, "loss/reg": 0.0033531710505485535, "step": 1582 }, { "epoch": 0.197875, "grad_norm": 2.2884721755981445, "grad_norm_var": 0.18922569213149815, "learning_rate": 0.0001, "loss": 0.9984, "loss/crossentropy": 2.4832651615142822, "loss/hidden": 0.8125, "loss/logits": 0.15235686302185059, "loss/reg": 0.00335147837176919, "step": 1583 }, { "epoch": 0.198, "grad_norm": 2.547328472137451, "grad_norm_var": 0.18482493667708866, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.575803279876709, "loss/hidden": 0.92578125, "loss/logits": 0.15261293947696686, "loss/reg": 0.003349804785102606, "step": 1584 }, { "epoch": 0.198125, "grad_norm": 3.402043581008911, "grad_norm_var": 0.21694510449547597, "learning_rate": 0.0001, "loss": 1.3013, "loss/crossentropy": 2.0987796783447266, "loss/hidden": 1.09375, "loss/logits": 0.17406561970710754, "loss/reg": 0.0033480448182672262, "step": 1585 }, { "epoch": 0.19825, "grad_norm": 2.545072317123413, "grad_norm_var": 0.17185981683361665, "learning_rate": 0.0001, "loss": 1.1464, "loss/crossentropy": 2.5816264152526855, "loss/hidden": 0.9375, "loss/logits": 0.17544244229793549, "loss/reg": 0.003346419893205166, "step": 1586 }, { "epoch": 0.198375, "grad_norm": 3.0395045280456543, "grad_norm_var": 0.17686366063397868, "learning_rate": 0.0001, "loss": 1.248, "loss/crossentropy": 2.302675247192383, "loss/hidden": 1.015625, "loss/logits": 0.19897626340389252, "loss/reg": 0.0033445856533944607, "step": 1587 }, { "epoch": 0.1985, "grad_norm": 2.4645659923553467, "grad_norm_var": 0.17433679981741357, "learning_rate": 0.0001, "loss": 1.1567, "loss/crossentropy": 2.4756343364715576, "loss/hidden": 0.95703125, "loss/logits": 0.16626408696174622, "loss/reg": 0.0033427351154386997, "step": 1588 }, { "epoch": 0.198625, "grad_norm": 2.810246706008911, "grad_norm_var": 0.14190777744166377, "learning_rate": 0.0001, "loss": 1.1902, "loss/crossentropy": 2.5803637504577637, "loss/hidden": 0.9765625, "loss/logits": 0.18018130958080292, "loss/reg": 0.0033410657197237015, "step": 1589 }, { "epoch": 0.19875, "grad_norm": 2.4339115619659424, "grad_norm_var": 0.13648274466220206, "learning_rate": 0.0001, "loss": 1.028, "loss/crossentropy": 2.399829149246216, "loss/hidden": 0.828125, "loss/logits": 0.16653020679950714, "loss/reg": 0.0033393700141459703, "step": 1590 }, { "epoch": 0.198875, "grad_norm": 3.239158868789673, "grad_norm_var": 0.141020529033392, "learning_rate": 0.0001, "loss": 1.1675, "loss/crossentropy": 2.2257063388824463, "loss/hidden": 0.9609375, "loss/logits": 0.17314554750919342, "loss/reg": 0.0033375280909240246, "step": 1591 }, { "epoch": 0.199, "grad_norm": 2.437912940979004, "grad_norm_var": 0.12423960548648268, "learning_rate": 0.0001, "loss": 0.922, "loss/crossentropy": 2.4248645305633545, "loss/hidden": 0.765625, "loss/logits": 0.12298044562339783, "loss/reg": 0.00333569198846817, "step": 1592 }, { "epoch": 0.199125, "grad_norm": 2.8941097259521484, "grad_norm_var": 0.12398925250324358, "learning_rate": 0.0001, "loss": 1.1451, "loss/crossentropy": 2.6563851833343506, "loss/hidden": 0.93359375, "loss/logits": 0.17815393209457397, "loss/reg": 0.0033339851070195436, "step": 1593 }, { "epoch": 0.19925, "grad_norm": 2.803880214691162, "grad_norm_var": 0.12292982191307994, "learning_rate": 0.0001, "loss": 1.1357, "loss/crossentropy": 2.2759344577789307, "loss/hidden": 0.9609375, "loss/logits": 0.141413152217865, "loss/reg": 0.003332150634378195, "step": 1594 }, { "epoch": 0.199375, "grad_norm": 2.237532138824463, "grad_norm_var": 0.13530315628679926, "learning_rate": 0.0001, "loss": 1.2666, "loss/crossentropy": 2.077180862426758, "loss/hidden": 1.0625, "loss/logits": 0.1708061397075653, "loss/reg": 0.003330171573907137, "step": 1595 }, { "epoch": 0.1995, "grad_norm": 1.8717583417892456, "grad_norm_var": 0.16787872576412224, "learning_rate": 0.0001, "loss": 0.933, "loss/crossentropy": 2.5325214862823486, "loss/hidden": 0.7734375, "loss/logits": 0.12631843984127045, "loss/reg": 0.0033281673677265644, "step": 1596 }, { "epoch": 0.199625, "grad_norm": 2.3740711212158203, "grad_norm_var": 0.16558500323165828, "learning_rate": 0.0001, "loss": 0.9823, "loss/crossentropy": 2.456437349319458, "loss/hidden": 0.81640625, "loss/logits": 0.13264372944831848, "loss/reg": 0.003326438134536147, "step": 1597 }, { "epoch": 0.19975, "grad_norm": 2.2797210216522217, "grad_norm_var": 0.16048771364269968, "learning_rate": 0.0001, "loss": 1.1277, "loss/crossentropy": 2.478566884994507, "loss/hidden": 0.92578125, "loss/logits": 0.16867585480213165, "loss/reg": 0.0033244146034121513, "step": 1598 }, { "epoch": 0.199875, "grad_norm": 2.273390293121338, "grad_norm_var": 0.1611370953897993, "learning_rate": 0.0001, "loss": 1.0944, "loss/crossentropy": 2.3444995880126953, "loss/hidden": 0.91015625, "loss/logits": 0.15098696947097778, "loss/reg": 0.0033223910722881556, "step": 1599 }, { "epoch": 0.2, "grad_norm": 2.5064868927001953, "grad_norm_var": 0.1615466221150351, "learning_rate": 0.0001, "loss": 1.1207, "loss/crossentropy": 2.6761395931243896, "loss/hidden": 0.91796875, "loss/logits": 0.16954849660396576, "loss/reg": 0.003320206655189395, "step": 1600 }, { "epoch": 0.200125, "grad_norm": 2.1371653079986572, "grad_norm_var": 0.12641732646446915, "learning_rate": 0.0001, "loss": 1.1086, "loss/crossentropy": 2.167865037918091, "loss/hidden": 0.921875, "loss/logits": 0.15358451008796692, "loss/reg": 0.0033184492494910955, "step": 1601 }, { "epoch": 0.20025, "grad_norm": 2.7362449169158936, "grad_norm_var": 0.12929521265355667, "learning_rate": 0.0001, "loss": 1.1727, "loss/crossentropy": 2.7397940158843994, "loss/hidden": 0.9609375, "loss/logits": 0.17860586941242218, "loss/reg": 0.0033167051151394844, "step": 1602 }, { "epoch": 0.200375, "grad_norm": 2.124201536178589, "grad_norm_var": 0.11993136224217782, "learning_rate": 0.0001, "loss": 1.0476, "loss/crossentropy": 2.3766019344329834, "loss/hidden": 0.875, "loss/logits": 0.13947069644927979, "loss/reg": 0.003314658999443054, "step": 1603 }, { "epoch": 0.2005, "grad_norm": 2.1093101501464844, "grad_norm_var": 0.12838562389595007, "learning_rate": 0.0001, "loss": 1.0742, "loss/crossentropy": 2.491436004638672, "loss/hidden": 0.87109375, "loss/logits": 0.16998031735420227, "loss/reg": 0.0033127006608992815, "step": 1604 }, { "epoch": 0.200625, "grad_norm": 2.2037789821624756, "grad_norm_var": 0.12259215079400192, "learning_rate": 0.0001, "loss": 1.1443, "loss/crossentropy": 2.429650068283081, "loss/hidden": 0.93359375, "loss/logits": 0.1775752156972885, "loss/reg": 0.0033109041396528482, "step": 1605 }, { "epoch": 0.20075, "grad_norm": 2.0681684017181396, "grad_norm_var": 0.13009940320814947, "learning_rate": 0.0001, "loss": 0.9251, "loss/crossentropy": 2.5921154022216797, "loss/hidden": 0.7734375, "loss/logits": 0.11855532228946686, "loss/reg": 0.0033087998162955046, "step": 1606 }, { "epoch": 0.200875, "grad_norm": 2.6388769149780273, "grad_norm_var": 0.08494051001129363, "learning_rate": 0.0001, "loss": 1.0706, "loss/crossentropy": 2.5055747032165527, "loss/hidden": 0.89453125, "loss/logits": 0.14303961396217346, "loss/reg": 0.0033070738427340984, "step": 1607 }, { "epoch": 0.201, "grad_norm": 2.181697368621826, "grad_norm_var": 0.08624639517303852, "learning_rate": 0.0001, "loss": 1.0201, "loss/crossentropy": 2.4525694847106934, "loss/hidden": 0.84375, "loss/logits": 0.14332106709480286, "loss/reg": 0.003305203514173627, "step": 1608 }, { "epoch": 0.201125, "grad_norm": 2.6697564125061035, "grad_norm_var": 0.07281751738566834, "learning_rate": 0.0001, "loss": 1.0102, "loss/crossentropy": 2.7797605991363525, "loss/hidden": 0.828125, "loss/logits": 0.14901109039783478, "loss/reg": 0.003303457982838154, "step": 1609 }, { "epoch": 0.20125, "grad_norm": 2.582929849624634, "grad_norm_var": 0.06179040816687683, "learning_rate": 0.0001, "loss": 1.0996, "loss/crossentropy": 2.530728578567505, "loss/hidden": 0.91015625, "loss/logits": 0.1564064621925354, "loss/reg": 0.0033016535453498363, "step": 1610 }, { "epoch": 0.201375, "grad_norm": 1.8796063661575317, "grad_norm_var": 0.07336041461658145, "learning_rate": 0.0001, "loss": 1.0591, "loss/crossentropy": 2.5042145252227783, "loss/hidden": 0.87890625, "loss/logits": 0.14722013473510742, "loss/reg": 0.0032997329253703356, "step": 1611 }, { "epoch": 0.2015, "grad_norm": 3.313866138458252, "grad_norm_var": 0.1229542381526608, "learning_rate": 0.0001, "loss": 1.2483, "loss/crossentropy": 2.173633575439453, "loss/hidden": 1.0390625, "loss/logits": 0.17622298002243042, "loss/reg": 0.00329802418127656, "step": 1612 }, { "epoch": 0.201625, "grad_norm": 2.6430091857910156, "grad_norm_var": 0.12726375044356592, "learning_rate": 0.0001, "loss": 1.0964, "loss/crossentropy": 2.1356022357940674, "loss/hidden": 0.91015625, "loss/logits": 0.15332993865013123, "loss/reg": 0.0032957610674202442, "step": 1613 }, { "epoch": 0.20175, "grad_norm": 2.283130645751953, "grad_norm_var": 0.12721126777018643, "learning_rate": 0.0001, "loss": 1.1047, "loss/crossentropy": 2.6446938514709473, "loss/hidden": 0.9140625, "loss/logits": 0.15773829817771912, "loss/reg": 0.003294040448963642, "step": 1614 }, { "epoch": 0.201875, "grad_norm": 2.009594678878784, "grad_norm_var": 0.1359073820378919, "learning_rate": 0.0001, "loss": 1.3146, "loss/crossentropy": 2.353555679321289, "loss/hidden": 1.0859375, "loss/logits": 0.1957610696554184, "loss/reg": 0.0032923046965152025, "step": 1615 }, { "epoch": 0.202, "grad_norm": 2.2532074451446533, "grad_norm_var": 0.1356617628627058, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.421433210372925, "loss/hidden": 0.890625, "loss/logits": 0.139800027012825, "loss/reg": 0.0032904213294386864, "step": 1616 }, { "epoch": 0.202125, "grad_norm": 2.2480552196502686, "grad_norm_var": 0.1330667309785141, "learning_rate": 0.0001, "loss": 1.1926, "loss/crossentropy": 2.4565775394439697, "loss/hidden": 0.99609375, "loss/logits": 0.16361382603645325, "loss/reg": 0.0032883703242987394, "step": 1617 }, { "epoch": 0.20225, "grad_norm": 4.123042106628418, "grad_norm_var": 0.3206941892301216, "learning_rate": 0.0001, "loss": 1.4135, "loss/crossentropy": 2.3067095279693604, "loss/hidden": 1.140625, "loss/logits": 0.2400142401456833, "loss/reg": 0.0032866299152374268, "step": 1618 }, { "epoch": 0.202375, "grad_norm": 2.0010764598846436, "grad_norm_var": 0.3271258788637292, "learning_rate": 0.0001, "loss": 1.0215, "loss/crossentropy": 2.4679181575775146, "loss/hidden": 0.83984375, "loss/logits": 0.14884260296821594, "loss/reg": 0.0032845879904925823, "step": 1619 }, { "epoch": 0.2025, "grad_norm": 2.1155405044555664, "grad_norm_var": 0.3268448163523752, "learning_rate": 0.0001, "loss": 1.1275, "loss/crossentropy": 2.7230353355407715, "loss/hidden": 0.9375, "loss/logits": 0.15715520083904266, "loss/reg": 0.0032828382682055235, "step": 1620 }, { "epoch": 0.202625, "grad_norm": 2.0337374210357666, "grad_norm_var": 0.3342560560773141, "learning_rate": 0.0001, "loss": 1.0079, "loss/crossentropy": 2.726454734802246, "loss/hidden": 0.8359375, "loss/logits": 0.13911572098731995, "loss/reg": 0.003280794247984886, "step": 1621 }, { "epoch": 0.20275, "grad_norm": 2.493293285369873, "grad_norm_var": 0.32445634627695763, "learning_rate": 0.0001, "loss": 1.3205, "loss/crossentropy": 2.4674136638641357, "loss/hidden": 1.1015625, "loss/logits": 0.1861811727285385, "loss/reg": 0.003278720658272505, "step": 1622 }, { "epoch": 0.202875, "grad_norm": 2.0618650913238525, "grad_norm_var": 0.3320343293044615, "learning_rate": 0.0001, "loss": 1.0074, "loss/crossentropy": 2.5865259170532227, "loss/hidden": 0.83984375, "loss/logits": 0.13479462265968323, "loss/reg": 0.0032766172662377357, "step": 1623 }, { "epoch": 0.203, "grad_norm": 4.24573278427124, "grad_norm_var": 0.5297347853177716, "learning_rate": 0.0001, "loss": 1.0286, "loss/crossentropy": 2.7966721057891846, "loss/hidden": 0.84765625, "loss/logits": 0.1481596827507019, "loss/reg": 0.0032749150414019823, "step": 1624 }, { "epoch": 0.203125, "grad_norm": 4.453259468078613, "grad_norm_var": 0.7546780963902354, "learning_rate": 0.0001, "loss": 1.4379, "loss/crossentropy": 2.324061155319214, "loss/hidden": 1.1875, "loss/logits": 0.21765878796577454, "loss/reg": 0.003273224225267768, "step": 1625 }, { "epoch": 0.20325, "grad_norm": 2.1763815879821777, "grad_norm_var": 0.7697989170952411, "learning_rate": 0.0001, "loss": 1.2117, "loss/crossentropy": 2.2315030097961426, "loss/hidden": 1.0234375, "loss/logits": 0.1554989069700241, "loss/reg": 0.0032715124543756247, "step": 1626 }, { "epoch": 0.203375, "grad_norm": 2.147721529006958, "grad_norm_var": 0.7468977871556014, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.5481529235839844, "loss/hidden": 0.953125, "loss/logits": 0.18407508730888367, "loss/reg": 0.0032695841509848833, "step": 1627 }, { "epoch": 0.2035, "grad_norm": 2.1995062828063965, "grad_norm_var": 0.727752660020807, "learning_rate": 0.0001, "loss": 1.0413, "loss/crossentropy": 2.62490177154541, "loss/hidden": 0.86328125, "loss/logits": 0.14532649517059326, "loss/reg": 0.0032678483985364437, "step": 1628 }, { "epoch": 0.203625, "grad_norm": 2.6025071144104004, "grad_norm_var": 0.7275851745925003, "learning_rate": 0.0001, "loss": 1.3071, "loss/crossentropy": 2.261453628540039, "loss/hidden": 1.0703125, "loss/logits": 0.2041451334953308, "loss/reg": 0.0032659387215971947, "step": 1629 }, { "epoch": 0.20375, "grad_norm": 2.3895230293273926, "grad_norm_var": 0.7239327077368195, "learning_rate": 0.0001, "loss": 1.1771, "loss/crossentropy": 2.437429904937744, "loss/hidden": 0.95703125, "loss/logits": 0.1874021738767624, "loss/reg": 0.003264203667640686, "step": 1630 }, { "epoch": 0.203875, "grad_norm": 2.5536670684814453, "grad_norm_var": 0.6998122275898206, "learning_rate": 0.0001, "loss": 1.2855, "loss/crossentropy": 2.308814287185669, "loss/hidden": 1.078125, "loss/logits": 0.17473715543746948, "loss/reg": 0.003262232756242156, "step": 1631 }, { "epoch": 0.204, "grad_norm": 1.893472671508789, "grad_norm_var": 0.7260273238761611, "learning_rate": 0.0001, "loss": 1.0432, "loss/crossentropy": 2.4681177139282227, "loss/hidden": 0.859375, "loss/logits": 0.15119843184947968, "loss/reg": 0.003260491183027625, "step": 1632 }, { "epoch": 0.204125, "grad_norm": 2.6406593322753906, "grad_norm_var": 0.7167848758234858, "learning_rate": 0.0001, "loss": 0.9315, "loss/crossentropy": 2.692917585372925, "loss/hidden": 0.76953125, "loss/logits": 0.12942397594451904, "loss/reg": 0.003258763812482357, "step": 1633 }, { "epoch": 0.20425, "grad_norm": 2.6017487049102783, "grad_norm_var": 0.5592297482073356, "learning_rate": 0.0001, "loss": 1.4767, "loss/crossentropy": 1.8698807954788208, "loss/hidden": 1.2109375, "loss/logits": 0.2331976294517517, "loss/reg": 0.0032570629846304655, "step": 1634 }, { "epoch": 0.204375, "grad_norm": 2.134735107421875, "grad_norm_var": 0.5507758063156113, "learning_rate": 0.0001, "loss": 0.9424, "loss/crossentropy": 2.657256841659546, "loss/hidden": 0.78515625, "loss/logits": 0.1246921718120575, "loss/reg": 0.003255224786698818, "step": 1635 }, { "epoch": 0.2045, "grad_norm": 2.775542736053467, "grad_norm_var": 0.5400799961918049, "learning_rate": 0.0001, "loss": 1.1952, "loss/crossentropy": 2.3927488327026367, "loss/hidden": 0.98046875, "loss/logits": 0.18224084377288818, "loss/reg": 0.003253570292145014, "step": 1636 }, { "epoch": 0.204625, "grad_norm": 3.4010424613952637, "grad_norm_var": 0.5559319990050954, "learning_rate": 0.0001, "loss": 0.9753, "loss/crossentropy": 2.674126386642456, "loss/hidden": 0.80859375, "loss/logits": 0.1341724693775177, "loss/reg": 0.0032518133521080017, "step": 1637 }, { "epoch": 0.20475, "grad_norm": 2.3625357151031494, "grad_norm_var": 0.5601365603978583, "learning_rate": 0.0001, "loss": 1.1191, "loss/crossentropy": 2.00201416015625, "loss/hidden": 0.93359375, "loss/logits": 0.15301145613193512, "loss/reg": 0.003249979577958584, "step": 1638 }, { "epoch": 0.204875, "grad_norm": 2.233456611633301, "grad_norm_var": 0.5481778857364833, "learning_rate": 0.0001, "loss": 1.0923, "loss/crossentropy": 2.260629892349243, "loss/hidden": 0.9140625, "loss/logits": 0.14572405815124512, "loss/reg": 0.003248338820412755, "step": 1639 }, { "epoch": 0.205, "grad_norm": 2.4094839096069336, "grad_norm_var": 0.37452435323996436, "learning_rate": 0.0001, "loss": 1.0426, "loss/crossentropy": 2.3188180923461914, "loss/hidden": 0.8515625, "loss/logits": 0.15855032205581665, "loss/reg": 0.0032465672120451927, "step": 1640 }, { "epoch": 0.205125, "grad_norm": 2.5767383575439453, "grad_norm_var": 0.12114709294457929, "learning_rate": 0.0001, "loss": 1.107, "loss/crossentropy": 2.5933563709259033, "loss/hidden": 0.92578125, "loss/logits": 0.14877736568450928, "loss/reg": 0.003244933672249317, "step": 1641 }, { "epoch": 0.20525, "grad_norm": 2.55334734916687, "grad_norm_var": 0.11659405774919757, "learning_rate": 0.0001, "loss": 1.4642, "loss/crossentropy": 2.26149582862854, "loss/hidden": 1.1953125, "loss/logits": 0.23644839227199554, "loss/reg": 0.0032434100285172462, "step": 1642 }, { "epoch": 0.205375, "grad_norm": 2.3793838024139404, "grad_norm_var": 0.11007918089816542, "learning_rate": 0.0001, "loss": 1.0786, "loss/crossentropy": 2.1200153827667236, "loss/hidden": 0.89453125, "loss/logits": 0.15168824791908264, "loss/reg": 0.0032420321367681026, "step": 1643 }, { "epoch": 0.2055, "grad_norm": 2.159534215927124, "grad_norm_var": 0.11168307348258182, "learning_rate": 0.0001, "loss": 1.2839, "loss/crossentropy": 2.5448434352874756, "loss/hidden": 1.078125, "loss/logits": 0.17339983582496643, "loss/reg": 0.003240725724026561, "step": 1644 }, { "epoch": 0.205625, "grad_norm": 2.512781858444214, "grad_norm_var": 0.1107112022419983, "learning_rate": 0.0001, "loss": 1.0038, "loss/crossentropy": 2.5887203216552734, "loss/hidden": 0.83984375, "loss/logits": 0.1315690279006958, "loss/reg": 0.0032390966080129147, "step": 1645 }, { "epoch": 0.20575, "grad_norm": 2.471038818359375, "grad_norm_var": 0.11021265436342276, "learning_rate": 0.0001, "loss": 1.0674, "loss/crossentropy": 2.4827725887298584, "loss/hidden": 0.8984375, "loss/logits": 0.1366003006696701, "loss/reg": 0.003237416036427021, "step": 1646 }, { "epoch": 0.205875, "grad_norm": 2.5491740703582764, "grad_norm_var": 0.11016900462870065, "learning_rate": 0.0001, "loss": 0.9898, "loss/crossentropy": 2.808387279510498, "loss/hidden": 0.8203125, "loss/logits": 0.1371297538280487, "loss/reg": 0.0032356702722609043, "step": 1647 }, { "epoch": 0.206, "grad_norm": 2.476713180541992, "grad_norm_var": 0.08594114936163895, "learning_rate": 0.0001, "loss": 1.1398, "loss/crossentropy": 2.4765665531158447, "loss/hidden": 0.9375, "loss/logits": 0.1700000762939453, "loss/reg": 0.003234060015529394, "step": 1648 }, { "epoch": 0.206125, "grad_norm": 2.1351890563964844, "grad_norm_var": 0.09343219350858452, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.2499215602874756, "loss/hidden": 0.98828125, "loss/logits": 0.176192969083786, "loss/reg": 0.0032324332278221846, "step": 1649 }, { "epoch": 0.20625, "grad_norm": 2.1201257705688477, "grad_norm_var": 0.10032196484461338, "learning_rate": 0.0001, "loss": 1.0841, "loss/crossentropy": 2.6280465126037598, "loss/hidden": 0.890625, "loss/logits": 0.16118907928466797, "loss/reg": 0.003230888629332185, "step": 1650 }, { "epoch": 0.206375, "grad_norm": 2.829747200012207, "grad_norm_var": 0.10100266775164073, "learning_rate": 0.0001, "loss": 1.0978, "loss/crossentropy": 2.610457420349121, "loss/hidden": 0.90234375, "loss/logits": 0.16314734518527985, "loss/reg": 0.003229183377698064, "step": 1651 }, { "epoch": 0.2065, "grad_norm": 2.5310165882110596, "grad_norm_var": 0.09564570596240832, "learning_rate": 0.0001, "loss": 0.9424, "loss/crossentropy": 2.548105478286743, "loss/hidden": 0.7890625, "loss/logits": 0.1211063414812088, "loss/reg": 0.0032274452969431877, "step": 1652 }, { "epoch": 0.206625, "grad_norm": 2.257979393005371, "grad_norm_var": 0.037136142432717394, "learning_rate": 0.0001, "loss": 1.1971, "loss/crossentropy": 2.516902446746826, "loss/hidden": 0.97265625, "loss/logits": 0.19221451878547668, "loss/reg": 0.003225695574656129, "step": 1653 }, { "epoch": 0.20675, "grad_norm": 2.3023855686187744, "grad_norm_var": 0.03774205518613461, "learning_rate": 0.0001, "loss": 0.9739, "loss/crossentropy": 2.71262788772583, "loss/hidden": 0.80859375, "loss/logits": 0.1330820769071579, "loss/reg": 0.003224144922569394, "step": 1654 }, { "epoch": 0.206875, "grad_norm": 2.521340847015381, "grad_norm_var": 0.0362938578599632, "learning_rate": 0.0001, "loss": 1.098, "loss/crossentropy": 2.643906593322754, "loss/hidden": 0.90625, "loss/logits": 0.15947577357292175, "loss/reg": 0.003222482278943062, "step": 1655 }, { "epoch": 0.207, "grad_norm": 5.88134765625, "grad_norm_var": 0.7828817213139186, "learning_rate": 0.0001, "loss": 1.4211, "loss/crossentropy": 2.313748836517334, "loss/hidden": 1.2109375, "loss/logits": 0.17793220281600952, "loss/reg": 0.0032208384945988655, "step": 1656 }, { "epoch": 0.207125, "grad_norm": 2.5916645526885986, "grad_norm_var": 0.7827675255288795, "learning_rate": 0.0001, "loss": 1.2521, "loss/crossentropy": 2.4316515922546387, "loss/hidden": 1.03125, "loss/logits": 0.18862737715244293, "loss/reg": 0.0032191697973757982, "step": 1657 }, { "epoch": 0.20725, "grad_norm": 2.2206692695617676, "grad_norm_var": 0.7936192015383082, "learning_rate": 0.0001, "loss": 1.0245, "loss/crossentropy": 2.6382174491882324, "loss/hidden": 0.84765625, "loss/logits": 0.14466163516044617, "loss/reg": 0.00321741821244359, "step": 1658 }, { "epoch": 0.207375, "grad_norm": 2.8307604789733887, "grad_norm_var": 0.7917962945035991, "learning_rate": 0.0001, "loss": 1.4057, "loss/crossentropy": 2.4240641593933105, "loss/hidden": 1.15625, "loss/logits": 0.21726641058921814, "loss/reg": 0.0032156051602214575, "step": 1659 }, { "epoch": 0.2075, "grad_norm": 2.049522638320923, "grad_norm_var": 0.7997391376511624, "learning_rate": 0.0001, "loss": 0.9971, "loss/crossentropy": 2.6553359031677246, "loss/hidden": 0.83203125, "loss/logits": 0.1329321563243866, "loss/reg": 0.003213758347555995, "step": 1660 }, { "epoch": 0.207625, "grad_norm": 3.276784658432007, "grad_norm_var": 0.8229971260041339, "learning_rate": 0.0001, "loss": 1.44, "loss/crossentropy": 2.5629796981811523, "loss/hidden": 1.171875, "loss/logits": 0.23601150512695312, "loss/reg": 0.0032118717208504677, "step": 1661 }, { "epoch": 0.20775, "grad_norm": 3.7711448669433594, "grad_norm_var": 0.8906238399602217, "learning_rate": 0.0001, "loss": 1.4247, "loss/crossentropy": 2.474085569381714, "loss/hidden": 1.1875, "loss/logits": 0.20507219433784485, "loss/reg": 0.0032101524993777275, "step": 1662 }, { "epoch": 0.207875, "grad_norm": 2.741032361984253, "grad_norm_var": 0.887234593717244, "learning_rate": 0.0001, "loss": 1.1324, "loss/crossentropy": 2.6519734859466553, "loss/hidden": 0.9296875, "loss/logits": 0.17065690457820892, "loss/reg": 0.0032084728591144085, "step": 1663 }, { "epoch": 0.208, "grad_norm": 3.385438919067383, "grad_norm_var": 0.9016638698725956, "learning_rate": 0.0001, "loss": 1.4632, "loss/crossentropy": 2.8452768325805664, "loss/hidden": 1.21875, "loss/logits": 0.2123820185661316, "loss/reg": 0.0032068106811493635, "step": 1664 }, { "epoch": 0.208125, "grad_norm": 2.4284839630126953, "grad_norm_var": 0.8794628798393888, "learning_rate": 0.0001, "loss": 1.1069, "loss/crossentropy": 2.3913896083831787, "loss/hidden": 0.91796875, "loss/logits": 0.15689219534397125, "loss/reg": 0.0032050481531769037, "step": 1665 }, { "epoch": 0.20825, "grad_norm": 1.9970413446426392, "grad_norm_var": 0.8925309231944419, "learning_rate": 0.0001, "loss": 1.0111, "loss/crossentropy": 2.1553092002868652, "loss/hidden": 0.8359375, "loss/logits": 0.14312410354614258, "loss/reg": 0.003203297033905983, "step": 1666 }, { "epoch": 0.208375, "grad_norm": 2.629917621612549, "grad_norm_var": 0.8955935228773692, "learning_rate": 0.0001, "loss": 1.0293, "loss/crossentropy": 2.4769906997680664, "loss/hidden": 0.828125, "loss/logits": 0.16919545829296112, "loss/reg": 0.003201601095497608, "step": 1667 }, { "epoch": 0.2085, "grad_norm": 2.4040727615356445, "grad_norm_var": 0.9018056713863368, "learning_rate": 0.0001, "loss": 0.9916, "loss/crossentropy": 2.4735305309295654, "loss/hidden": 0.8046875, "loss/logits": 0.15492868423461914, "loss/reg": 0.003199809929355979, "step": 1668 }, { "epoch": 0.208625, "grad_norm": 2.666597843170166, "grad_norm_var": 0.8810435015232821, "learning_rate": 0.0001, "loss": 0.9514, "loss/crossentropy": 2.219966173171997, "loss/hidden": 0.78515625, "loss/logits": 0.13430990278720856, "loss/reg": 0.003198012476786971, "step": 1669 }, { "epoch": 0.20875, "grad_norm": 2.6586005687713623, "grad_norm_var": 0.8626734234562614, "learning_rate": 0.0001, "loss": 1.162, "loss/crossentropy": 2.411811351776123, "loss/hidden": 0.9765625, "loss/logits": 0.15343676507472992, "loss/reg": 0.0031961523927748203, "step": 1670 }, { "epoch": 0.208875, "grad_norm": 3.573573350906372, "grad_norm_var": 0.8817782564271193, "learning_rate": 0.0001, "loss": 1.0668, "loss/crossentropy": 2.475907802581787, "loss/hidden": 0.88671875, "loss/logits": 0.14815643429756165, "loss/reg": 0.003194718388840556, "step": 1671 }, { "epoch": 0.209, "grad_norm": 2.6452560424804688, "grad_norm_var": 0.26896437314466315, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.2734243869781494, "loss/hidden": 0.8828125, "loss/logits": 0.16163568198680878, "loss/reg": 0.0031934131402522326, "step": 1672 }, { "epoch": 0.209125, "grad_norm": 2.2780601978302, "grad_norm_var": 0.28139345731230725, "learning_rate": 0.0001, "loss": 1.1114, "loss/crossentropy": 2.6828384399414062, "loss/hidden": 0.90625, "loss/logits": 0.17327217757701874, "loss/reg": 0.003191707655787468, "step": 1673 }, { "epoch": 0.20925, "grad_norm": 3.0598275661468506, "grad_norm_var": 0.2692776803865986, "learning_rate": 0.0001, "loss": 1.1438, "loss/crossentropy": 2.626523017883301, "loss/hidden": 0.94921875, "loss/logits": 0.16267018020153046, "loss/reg": 0.0031899947207421064, "step": 1674 }, { "epoch": 0.209375, "grad_norm": 2.3135793209075928, "grad_norm_var": 0.28213310678472675, "learning_rate": 0.0001, "loss": 1.2395, "loss/crossentropy": 2.501697063446045, "loss/hidden": 1.015625, "loss/logits": 0.19198425114154816, "loss/reg": 0.003188441740348935, "step": 1675 }, { "epoch": 0.2095, "grad_norm": 3.061948537826538, "grad_norm_var": 0.25265989074379286, "learning_rate": 0.0001, "loss": 1.0785, "loss/crossentropy": 2.630337715148926, "loss/hidden": 0.8984375, "loss/logits": 0.14819283783435822, "loss/reg": 0.003186658024787903, "step": 1676 }, { "epoch": 0.209625, "grad_norm": 2.0030603408813477, "grad_norm_var": 0.27405567589367946, "learning_rate": 0.0001, "loss": 1.0526, "loss/crossentropy": 2.529303789138794, "loss/hidden": 0.87109375, "loss/logits": 0.1496969759464264, "loss/reg": 0.0031848950311541557, "step": 1677 }, { "epoch": 0.20975, "grad_norm": 5.00962495803833, "grad_norm_var": 0.5424888351687083, "learning_rate": 0.0001, "loss": 1.1409, "loss/crossentropy": 2.745102882385254, "loss/hidden": 0.95703125, "loss/logits": 0.15204139053821564, "loss/reg": 0.0031832093372941017, "step": 1678 }, { "epoch": 0.209875, "grad_norm": 2.7985355854034424, "grad_norm_var": 0.5422164981145214, "learning_rate": 0.0001, "loss": 1.2753, "loss/crossentropy": 2.2465641498565674, "loss/hidden": 1.0625, "loss/logits": 0.1809433400630951, "loss/reg": 0.0031814700923860073, "step": 1679 }, { "epoch": 0.21, "grad_norm": 4.798881530761719, "grad_norm_var": 0.7760732092314867, "learning_rate": 0.0001, "loss": 1.466, "loss/crossentropy": 2.329308271408081, "loss/hidden": 1.2421875, "loss/logits": 0.1920267790555954, "loss/reg": 0.003179659601300955, "step": 1680 }, { "epoch": 0.210125, "grad_norm": 2.5762155055999756, "grad_norm_var": 0.7682393360080743, "learning_rate": 0.0001, "loss": 1.0703, "loss/crossentropy": 2.5117287635803223, "loss/hidden": 0.8828125, "loss/logits": 0.1556781381368637, "loss/reg": 0.003177785314619541, "step": 1681 }, { "epoch": 0.21025, "grad_norm": 2.2130141258239746, "grad_norm_var": 0.7450180582948017, "learning_rate": 0.0001, "loss": 1.0062, "loss/crossentropy": 2.4735710620880127, "loss/hidden": 0.83203125, "loss/logits": 0.14245635271072388, "loss/reg": 0.0031759634148329496, "step": 1682 }, { "epoch": 0.210375, "grad_norm": 2.1119225025177, "grad_norm_var": 0.7816966335511644, "learning_rate": 0.0001, "loss": 1.1029, "loss/crossentropy": 2.5301568508148193, "loss/hidden": 0.90234375, "loss/logits": 0.1688534915447235, "loss/reg": 0.0031742649152874947, "step": 1683 }, { "epoch": 0.2105, "grad_norm": 2.5934348106384277, "grad_norm_var": 0.7717750228974445, "learning_rate": 0.0001, "loss": 1.3001, "loss/crossentropy": 2.7114899158477783, "loss/hidden": 1.078125, "loss/logits": 0.19023996591567993, "loss/reg": 0.0031723512802273035, "step": 1684 }, { "epoch": 0.210625, "grad_norm": 2.822317361831665, "grad_norm_var": 0.7684936610933231, "learning_rate": 0.0001, "loss": 1.1469, "loss/crossentropy": 2.4820547103881836, "loss/hidden": 0.95703125, "loss/logits": 0.15812493860721588, "loss/reg": 0.0031706641893833876, "step": 1685 }, { "epoch": 0.21075, "grad_norm": 2.204843044281006, "grad_norm_var": 0.7964126984830915, "learning_rate": 0.0001, "loss": 1.0662, "loss/crossentropy": 2.5764718055725098, "loss/hidden": 0.8828125, "loss/logits": 0.15168514847755432, "loss/reg": 0.0031687715090811253, "step": 1686 }, { "epoch": 0.210875, "grad_norm": 1.9805725812911987, "grad_norm_var": 0.8074897214563511, "learning_rate": 0.0001, "loss": 0.9752, "loss/crossentropy": 2.5879499912261963, "loss/hidden": 0.81640625, "loss/logits": 0.12714409828186035, "loss/reg": 0.0031668762676417828, "step": 1687 }, { "epoch": 0.211, "grad_norm": 2.5287587642669678, "grad_norm_var": 0.8104222753256015, "learning_rate": 0.0001, "loss": 1.0874, "loss/crossentropy": 2.712538957595825, "loss/hidden": 0.90234375, "loss/logits": 0.1533837765455246, "loss/reg": 0.0031649123411625624, "step": 1688 }, { "epoch": 0.211125, "grad_norm": 3.2568001747131348, "grad_norm_var": 0.8058133582529289, "learning_rate": 0.0001, "loss": 1.0633, "loss/crossentropy": 2.5451202392578125, "loss/hidden": 0.90625, "loss/logits": 0.12538479268550873, "loss/reg": 0.0031630489975214005, "step": 1689 }, { "epoch": 0.21125, "grad_norm": 2.1492838859558105, "grad_norm_var": 0.8301337770059201, "learning_rate": 0.0001, "loss": 1.0673, "loss/crossentropy": 2.457829236984253, "loss/hidden": 0.89453125, "loss/logits": 0.14114046096801758, "loss/reg": 0.0031612419988960028, "step": 1690 }, { "epoch": 0.211375, "grad_norm": 2.0851340293884277, "grad_norm_var": 0.8474934557513625, "learning_rate": 0.0001, "loss": 1.0295, "loss/crossentropy": 2.593254327774048, "loss/hidden": 0.8515625, "loss/logits": 0.14630341529846191, "loss/reg": 0.0031592664308845997, "step": 1691 }, { "epoch": 0.2115, "grad_norm": 2.692077159881592, "grad_norm_var": 0.8412586771616655, "learning_rate": 0.0001, "loss": 1.1157, "loss/crossentropy": 2.491499900817871, "loss/hidden": 0.9375, "loss/logits": 0.14665257930755615, "loss/reg": 0.003157339058816433, "step": 1692 }, { "epoch": 0.211625, "grad_norm": 1.9899176359176636, "grad_norm_var": 0.8425591567104391, "learning_rate": 0.0001, "loss": 1.043, "loss/crossentropy": 2.550938367843628, "loss/hidden": 0.8671875, "loss/logits": 0.14427784085273743, "loss/reg": 0.0031556852627545595, "step": 1693 }, { "epoch": 0.21175, "grad_norm": 1.861344575881958, "grad_norm_var": 0.508564313907548, "learning_rate": 0.0001, "loss": 0.9407, "loss/crossentropy": 2.7122299671173096, "loss/hidden": 0.78515625, "loss/logits": 0.12400847673416138, "loss/reg": 0.0031540419440716505, "step": 1694 }, { "epoch": 0.211875, "grad_norm": 2.5342886447906494, "grad_norm_var": 0.5038702664044002, "learning_rate": 0.0001, "loss": 1.2289, "loss/crossentropy": 2.5080294609069824, "loss/hidden": 0.9921875, "loss/logits": 0.20523084700107574, "loss/reg": 0.00315248966217041, "step": 1695 }, { "epoch": 0.212, "grad_norm": 2.043546438217163, "grad_norm_var": 0.14296074842546982, "learning_rate": 0.0001, "loss": 0.9838, "loss/crossentropy": 2.683417320251465, "loss/hidden": 0.80078125, "loss/logits": 0.15149196982383728, "loss/reg": 0.003150953445583582, "step": 1696 }, { "epoch": 0.212125, "grad_norm": 2.110501289367676, "grad_norm_var": 0.14263816283126, "learning_rate": 0.0001, "loss": 1.1321, "loss/crossentropy": 2.24113392829895, "loss/hidden": 0.9453125, "loss/logits": 0.155296191573143, "loss/reg": 0.003149296622723341, "step": 1697 }, { "epoch": 0.21225, "grad_norm": 2.7359836101531982, "grad_norm_var": 0.1520199744222225, "learning_rate": 0.0001, "loss": 1.1932, "loss/crossentropy": 2.5510122776031494, "loss/hidden": 0.9765625, "loss/logits": 0.1851990818977356, "loss/reg": 0.003147589974105358, "step": 1698 }, { "epoch": 0.212375, "grad_norm": 2.346010208129883, "grad_norm_var": 0.1478174979612748, "learning_rate": 0.0001, "loss": 1.0448, "loss/crossentropy": 2.4974029064178467, "loss/hidden": 0.87890625, "loss/logits": 0.13440260291099548, "loss/reg": 0.0031458197627216578, "step": 1699 }, { "epoch": 0.2125, "grad_norm": 2.1124165058135986, "grad_norm_var": 0.14800787911656718, "learning_rate": 0.0001, "loss": 1.1699, "loss/crossentropy": 2.265637159347534, "loss/hidden": 0.9765625, "loss/logits": 0.16191065311431885, "loss/reg": 0.003143977839499712, "step": 1700 }, { "epoch": 0.212625, "grad_norm": 2.4635298252105713, "grad_norm_var": 0.13302139739859153, "learning_rate": 0.0001, "loss": 1.2526, "loss/crossentropy": 2.3409652709960938, "loss/hidden": 1.03125, "loss/logits": 0.18988975882530212, "loss/reg": 0.0031423657201230526, "step": 1701 }, { "epoch": 0.21275, "grad_norm": 2.1505675315856934, "grad_norm_var": 0.1340275686171452, "learning_rate": 0.0001, "loss": 1.1472, "loss/crossentropy": 2.48573899269104, "loss/hidden": 0.95703125, "loss/logits": 0.15877564251422882, "loss/reg": 0.0031405584886670113, "step": 1702 }, { "epoch": 0.212875, "grad_norm": 2.3113481998443604, "grad_norm_var": 0.12611443887347676, "learning_rate": 0.0001, "loss": 1.0576, "loss/crossentropy": 2.165038824081421, "loss/hidden": 0.88671875, "loss/logits": 0.139505535364151, "loss/reg": 0.0031387642957270145, "step": 1703 }, { "epoch": 0.213, "grad_norm": 2.0424857139587402, "grad_norm_var": 0.12837729482331822, "learning_rate": 0.0001, "loss": 1.189, "loss/crossentropy": 2.2224714756011963, "loss/hidden": 0.9609375, "loss/logits": 0.19673088192939758, "loss/reg": 0.0031370187643915415, "step": 1704 }, { "epoch": 0.213125, "grad_norm": 2.016124725341797, "grad_norm_var": 0.06718613229377196, "learning_rate": 0.0001, "loss": 0.9392, "loss/crossentropy": 2.3173725605010986, "loss/hidden": 0.78125, "loss/logits": 0.1265917271375656, "loss/reg": 0.0031352161895483732, "step": 1705 }, { "epoch": 0.21325, "grad_norm": 2.1489810943603516, "grad_norm_var": 0.06718930728756754, "learning_rate": 0.0001, "loss": 0.9914, "loss/crossentropy": 2.3617665767669678, "loss/hidden": 0.828125, "loss/logits": 0.13194003701210022, "loss/reg": 0.0031336136162281036, "step": 1706 }, { "epoch": 0.213375, "grad_norm": 2.2248895168304443, "grad_norm_var": 0.06575221726070399, "learning_rate": 0.0001, "loss": 0.9928, "loss/crossentropy": 2.3003880977630615, "loss/hidden": 0.83984375, "loss/logits": 0.12159569561481476, "loss/reg": 0.0031318794935941696, "step": 1707 }, { "epoch": 0.2135, "grad_norm": 2.7656562328338623, "grad_norm_var": 0.07056003633158045, "learning_rate": 0.0001, "loss": 1.0895, "loss/crossentropy": 2.4966132640838623, "loss/hidden": 0.90625, "loss/logits": 0.1519893854856491, "loss/reg": 0.0031300997361540794, "step": 1708 }, { "epoch": 0.213625, "grad_norm": 2.4991726875305176, "grad_norm_var": 0.06971341387025494, "learning_rate": 0.0001, "loss": 1.0979, "loss/crossentropy": 2.0994441509246826, "loss/hidden": 0.90234375, "loss/logits": 0.16422367095947266, "loss/reg": 0.003128266194835305, "step": 1709 }, { "epoch": 0.21375, "grad_norm": 2.4266979694366455, "grad_norm_var": 0.05866460350893187, "learning_rate": 0.0001, "loss": 1.0559, "loss/crossentropy": 2.5089967250823975, "loss/hidden": 0.87890625, "loss/logits": 0.14569693803787231, "loss/reg": 0.003126643830910325, "step": 1710 }, { "epoch": 0.213875, "grad_norm": 2.5570080280303955, "grad_norm_var": 0.05938155406816629, "learning_rate": 0.0001, "loss": 1.1512, "loss/crossentropy": 2.390232801437378, "loss/hidden": 0.98046875, "loss/logits": 0.13949596881866455, "loss/reg": 0.0031250508036464453, "step": 1711 }, { "epoch": 0.214, "grad_norm": 2.4763524532318115, "grad_norm_var": 0.05573108256271908, "learning_rate": 0.0001, "loss": 1.2109, "loss/crossentropy": 2.4115543365478516, "loss/hidden": 1.0, "loss/logits": 0.17969435453414917, "loss/reg": 0.003123391419649124, "step": 1712 }, { "epoch": 0.214125, "grad_norm": 1.9605647325515747, "grad_norm_var": 0.061658860743410496, "learning_rate": 0.0001, "loss": 1.004, "loss/crossentropy": 2.1503031253814697, "loss/hidden": 0.83203125, "loss/logits": 0.14071419835090637, "loss/reg": 0.0031219625379890203, "step": 1713 }, { "epoch": 0.21425, "grad_norm": 2.206909656524658, "grad_norm_var": 0.05032832725278471, "learning_rate": 0.0001, "loss": 0.9881, "loss/crossentropy": 2.740894079208374, "loss/hidden": 0.81640625, "loss/logits": 0.14050860702991486, "loss/reg": 0.0031206535641103983, "step": 1714 }, { "epoch": 0.214375, "grad_norm": 2.084909677505493, "grad_norm_var": 0.05278877705570242, "learning_rate": 0.0001, "loss": 1.1267, "loss/crossentropy": 2.4331443309783936, "loss/hidden": 0.9375, "loss/logits": 0.1579878181219101, "loss/reg": 0.003117976011708379, "step": 1715 }, { "epoch": 0.2145, "grad_norm": 2.2570641040802, "grad_norm_var": 0.050903424022511426, "learning_rate": 0.0001, "loss": 0.9784, "loss/crossentropy": 2.688154935836792, "loss/hidden": 0.80078125, "loss/logits": 0.14644555747509003, "loss/reg": 0.0031148470006883144, "step": 1716 }, { "epoch": 0.214625, "grad_norm": 2.1008963584899902, "grad_norm_var": 0.05058773933843851, "learning_rate": 0.0001, "loss": 1.0819, "loss/crossentropy": 2.5256693363189697, "loss/hidden": 0.90234375, "loss/logits": 0.14845484495162964, "loss/reg": 0.003113400423899293, "step": 1717 }, { "epoch": 0.21475, "grad_norm": 2.4614713191986084, "grad_norm_var": 0.05191226779637403, "learning_rate": 0.0001, "loss": 1.0564, "loss/crossentropy": 2.5726735591888428, "loss/hidden": 0.87109375, "loss/logits": 0.15416929125785828, "loss/reg": 0.0031117405742406845, "step": 1718 }, { "epoch": 0.214875, "grad_norm": 2.3064160346984863, "grad_norm_var": 0.05189566088851283, "learning_rate": 0.0001, "loss": 1.1013, "loss/crossentropy": 2.7065393924713135, "loss/hidden": 0.90234375, "loss/logits": 0.16784769296646118, "loss/reg": 0.0031088702380657196, "step": 1719 }, { "epoch": 0.215, "grad_norm": 9.079465866088867, "grad_norm_var": 2.9207271705017384, "learning_rate": 0.0001, "loss": 1.3854, "loss/crossentropy": 2.3284268379211426, "loss/hidden": 1.1328125, "loss/logits": 0.22156238555908203, "loss/reg": 0.003107408294454217, "step": 1720 }, { "epoch": 0.215125, "grad_norm": 2.1745572090148926, "grad_norm_var": 2.9073576589134493, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.095036745071411, "loss/hidden": 0.93359375, "loss/logits": 0.15225940942764282, "loss/reg": 0.003105347976088524, "step": 1721 }, { "epoch": 0.21525, "grad_norm": 2.243680238723755, "grad_norm_var": 2.9005416312984256, "learning_rate": 0.0001, "loss": 1.0107, "loss/crossentropy": 2.2994542121887207, "loss/hidden": 0.84375, "loss/logits": 0.1359337866306305, "loss/reg": 0.003103848546743393, "step": 1722 }, { "epoch": 0.215375, "grad_norm": 2.6446003913879395, "grad_norm_var": 2.882775101197618, "learning_rate": 0.0001, "loss": 1.2141, "loss/crossentropy": 2.498286008834839, "loss/hidden": 1.0234375, "loss/logits": 0.1595914363861084, "loss/reg": 0.003102482995018363, "step": 1723 }, { "epoch": 0.2155, "grad_norm": 4.21211576461792, "grad_norm_var": 3.0136016192372757, "learning_rate": 0.0001, "loss": 1.0643, "loss/crossentropy": 2.2280969619750977, "loss/hidden": 0.8828125, "loss/logits": 0.15044564008712769, "loss/reg": 0.0031010708771646023, "step": 1724 }, { "epoch": 0.215625, "grad_norm": 2.093863010406494, "grad_norm_var": 3.0431383662912457, "learning_rate": 0.0001, "loss": 0.9815, "loss/crossentropy": 2.27380108833313, "loss/hidden": 0.82421875, "loss/logits": 0.12631164491176605, "loss/reg": 0.0030996648129075766, "step": 1725 }, { "epoch": 0.21575, "grad_norm": 2.16963267326355, "grad_norm_var": 3.061105934508265, "learning_rate": 0.0001, "loss": 1.1956, "loss/crossentropy": 2.180711269378662, "loss/hidden": 0.9921875, "loss/logits": 0.17242172360420227, "loss/reg": 0.00309771322645247, "step": 1726 }, { "epoch": 0.215875, "grad_norm": 3.120957136154175, "grad_norm_var": 3.0616334113432386, "learning_rate": 0.0001, "loss": 1.1386, "loss/crossentropy": 2.4755892753601074, "loss/hidden": 0.92578125, "loss/logits": 0.18183037638664246, "loss/reg": 0.0030961879529058933, "step": 1727 }, { "epoch": 0.216, "grad_norm": 2.810375213623047, "grad_norm_var": 3.051983920589513, "learning_rate": 0.0001, "loss": 1.0356, "loss/crossentropy": 2.844090461730957, "loss/hidden": 0.85546875, "loss/logits": 0.14921194314956665, "loss/reg": 0.0030942922458052635, "step": 1728 }, { "epoch": 0.216125, "grad_norm": 2.351074695587158, "grad_norm_var": 3.01413823672746, "learning_rate": 0.0001, "loss": 1.1406, "loss/crossentropy": 2.617952823638916, "loss/hidden": 0.9375, "loss/logits": 0.17222052812576294, "loss/reg": 0.00309238163754344, "step": 1729 }, { "epoch": 0.21625, "grad_norm": 3.7283058166503906, "grad_norm_var": 3.0192480530971872, "learning_rate": 0.0001, "loss": 1.3238, "loss/crossentropy": 2.559141159057617, "loss/hidden": 0.99609375, "loss/logits": 0.2968454360961914, "loss/reg": 0.0030908475164324045, "step": 1730 }, { "epoch": 0.216375, "grad_norm": 2.9617199897766113, "grad_norm_var": 2.961489976152211, "learning_rate": 0.0001, "loss": 1.3122, "loss/crossentropy": 2.598041534423828, "loss/hidden": 1.078125, "loss/logits": 0.2031644731760025, "loss/reg": 0.00308916624635458, "step": 1731 }, { "epoch": 0.2165, "grad_norm": 2.236485481262207, "grad_norm_var": 2.9636777426758716, "learning_rate": 0.0001, "loss": 1.0227, "loss/crossentropy": 2.625614881515503, "loss/hidden": 0.83984375, "loss/logits": 0.15201476216316223, "loss/reg": 0.003087525488808751, "step": 1732 }, { "epoch": 0.216625, "grad_norm": 2.3135018348693848, "grad_norm_var": 2.9397831294271772, "learning_rate": 0.0001, "loss": 0.9559, "loss/crossentropy": 2.3203794956207275, "loss/hidden": 0.7890625, "loss/logits": 0.13600721955299377, "loss/reg": 0.003086032345890999, "step": 1733 }, { "epoch": 0.21675, "grad_norm": 2.0412943363189697, "grad_norm_var": 2.984167856020982, "learning_rate": 0.0001, "loss": 1.0881, "loss/crossentropy": 2.715367317199707, "loss/hidden": 0.89453125, "loss/logits": 0.1627357304096222, "loss/reg": 0.0030844947323203087, "step": 1734 }, { "epoch": 0.216875, "grad_norm": 3.1161649227142334, "grad_norm_var": 2.946971551780882, "learning_rate": 0.0001, "loss": 1.0572, "loss/crossentropy": 2.4200503826141357, "loss/hidden": 0.8828125, "loss/logits": 0.14354635775089264, "loss/reg": 0.0030825661960989237, "step": 1735 }, { "epoch": 0.217, "grad_norm": 3.2668862342834473, "grad_norm_var": 0.4098138660932987, "learning_rate": 0.0001, "loss": 1.1414, "loss/crossentropy": 2.4813458919525146, "loss/hidden": 0.953125, "loss/logits": 0.15744319558143616, "loss/reg": 0.003080642083659768, "step": 1736 }, { "epoch": 0.217125, "grad_norm": 1.9718143939971924, "grad_norm_var": 0.42706875074818434, "learning_rate": 0.0001, "loss": 1.0276, "loss/crossentropy": 2.2978994846343994, "loss/hidden": 0.8671875, "loss/logits": 0.12966391444206238, "loss/reg": 0.0030790595337748528, "step": 1737 }, { "epoch": 0.21725, "grad_norm": 2.1405398845672607, "grad_norm_var": 0.4340798374863008, "learning_rate": 0.0001, "loss": 1.1405, "loss/crossentropy": 2.091478109359741, "loss/hidden": 0.94140625, "loss/logits": 0.16836108267307281, "loss/reg": 0.0030774776823818684, "step": 1738 }, { "epoch": 0.217375, "grad_norm": 3.8788137435913086, "grad_norm_var": 0.5203809166356061, "learning_rate": 0.0001, "loss": 1.751, "loss/crossentropy": 2.3941147327423096, "loss/hidden": 1.4453125, "loss/logits": 0.274971067905426, "loss/reg": 0.003075655549764633, "step": 1739 }, { "epoch": 0.2175, "grad_norm": 1.9825366735458374, "grad_norm_var": 0.4040997474989869, "learning_rate": 0.0001, "loss": 1.1001, "loss/crossentropy": 2.233893632888794, "loss/hidden": 0.91796875, "loss/logits": 0.15135906636714935, "loss/reg": 0.0030739654321223497, "step": 1740 }, { "epoch": 0.217625, "grad_norm": 2.0987682342529297, "grad_norm_var": 0.4037463519266102, "learning_rate": 0.0001, "loss": 1.0675, "loss/crossentropy": 2.4455137252807617, "loss/hidden": 0.875, "loss/logits": 0.1617324948310852, "loss/reg": 0.0030723894014954567, "step": 1741 }, { "epoch": 0.21775, "grad_norm": 3.920135498046875, "grad_norm_var": 0.48622454106490515, "learning_rate": 0.0001, "loss": 1.1559, "loss/crossentropy": 2.6298489570617676, "loss/hidden": 0.94140625, "loss/logits": 0.18376988172531128, "loss/reg": 0.0030707602854818106, "step": 1742 }, { "epoch": 0.217875, "grad_norm": 2.690436363220215, "grad_norm_var": 0.4762973265463903, "learning_rate": 0.0001, "loss": 1.2467, "loss/crossentropy": 2.1424267292022705, "loss/hidden": 1.0625, "loss/logits": 0.15353702008724213, "loss/reg": 0.0030691707506775856, "step": 1743 }, { "epoch": 0.218, "grad_norm": 2.604951858520508, "grad_norm_var": 0.47644030986631136, "learning_rate": 0.0001, "loss": 1.1014, "loss/crossentropy": 2.54174542427063, "loss/hidden": 0.91015625, "loss/logits": 0.1606130450963974, "loss/reg": 0.00306738936342299, "step": 1744 }, { "epoch": 0.218125, "grad_norm": 3.3228073120117188, "grad_norm_var": 0.48941099514094233, "learning_rate": 0.0001, "loss": 1.0578, "loss/crossentropy": 2.3931949138641357, "loss/hidden": 0.890625, "loss/logits": 0.13647425174713135, "loss/reg": 0.003065774915739894, "step": 1745 }, { "epoch": 0.21825, "grad_norm": 2.641117811203003, "grad_norm_var": 0.42396390393689143, "learning_rate": 0.0001, "loss": 0.971, "loss/crossentropy": 2.4205129146575928, "loss/hidden": 0.83203125, "loss/logits": 0.10831993818283081, "loss/reg": 0.003064037999138236, "step": 1746 }, { "epoch": 0.218375, "grad_norm": 2.7706568241119385, "grad_norm_var": 0.4195589879953497, "learning_rate": 0.0001, "loss": 1.1104, "loss/crossentropy": 2.719468355178833, "loss/hidden": 0.9140625, "loss/logits": 0.16571125388145447, "loss/reg": 0.0030624454375356436, "step": 1747 }, { "epoch": 0.2185, "grad_norm": 2.2494888305664062, "grad_norm_var": 0.41878793071204773, "learning_rate": 0.0001, "loss": 1.0321, "loss/crossentropy": 2.7627296447753906, "loss/hidden": 0.84765625, "loss/logits": 0.15384793281555176, "loss/reg": 0.003060864983126521, "step": 1748 }, { "epoch": 0.218625, "grad_norm": 2.4056599140167236, "grad_norm_var": 0.41471554214321593, "learning_rate": 0.0001, "loss": 1.0339, "loss/crossentropy": 2.546539068222046, "loss/hidden": 0.8671875, "loss/logits": 0.13611117005348206, "loss/reg": 0.0030592146795243025, "step": 1749 }, { "epoch": 0.21875, "grad_norm": 2.493166923522949, "grad_norm_var": 0.38815929501957475, "learning_rate": 0.0001, "loss": 1.0489, "loss/crossentropy": 2.243028402328491, "loss/hidden": 0.87109375, "loss/logits": 0.1472695916891098, "loss/reg": 0.0030575725249946117, "step": 1750 }, { "epoch": 0.218875, "grad_norm": 2.213844060897827, "grad_norm_var": 0.3916385925474808, "learning_rate": 0.0001, "loss": 1.0008, "loss/crossentropy": 2.419025182723999, "loss/hidden": 0.83984375, "loss/logits": 0.13044574856758118, "loss/reg": 0.003055924316868186, "step": 1751 }, { "epoch": 0.219, "grad_norm": 2.769224166870117, "grad_norm_var": 0.3672278962107095, "learning_rate": 0.0001, "loss": 1.0385, "loss/crossentropy": 2.7889208793640137, "loss/hidden": 0.8671875, "loss/logits": 0.1408083736896515, "loss/reg": 0.0030543410684913397, "step": 1752 }, { "epoch": 0.219125, "grad_norm": 2.4764599800109863, "grad_norm_var": 0.33854682568550765, "learning_rate": 0.0001, "loss": 1.0605, "loss/crossentropy": 2.422456741333008, "loss/hidden": 0.89453125, "loss/logits": 0.1354484111070633, "loss/reg": 0.0030527592170983553, "step": 1753 }, { "epoch": 0.21925, "grad_norm": 2.322908639907837, "grad_norm_var": 0.32784450880299965, "learning_rate": 0.0001, "loss": 1.1523, "loss/crossentropy": 2.441586494445801, "loss/hidden": 0.94921875, "loss/logits": 0.17252781987190247, "loss/reg": 0.003051069099456072, "step": 1754 }, { "epoch": 0.219375, "grad_norm": 2.6239187717437744, "grad_norm_var": 0.22527430071220297, "learning_rate": 0.0001, "loss": 1.1402, "loss/crossentropy": 2.3880774974823, "loss/hidden": 0.94140625, "loss/logits": 0.16831141710281372, "loss/reg": 0.0030493696685880423, "step": 1755 }, { "epoch": 0.2195, "grad_norm": 17.505414962768555, "grad_norm_var": 14.009084703934427, "learning_rate": 0.0001, "loss": 1.3282, "loss/crossentropy": 2.6417012214660645, "loss/hidden": 1.1328125, "loss/logits": 0.16491027176380157, "loss/reg": 0.003047748701646924, "step": 1756 }, { "epoch": 0.219625, "grad_norm": 4.20535135269165, "grad_norm_var": 13.873398017294948, "learning_rate": 0.0001, "loss": 1.131, "loss/crossentropy": 2.521103858947754, "loss/hidden": 0.96484375, "loss/logits": 0.13569971919059753, "loss/reg": 0.0030460914131253958, "step": 1757 }, { "epoch": 0.21975, "grad_norm": 2.9682602882385254, "grad_norm_var": 13.902211592229294, "learning_rate": 0.0001, "loss": 1.1961, "loss/crossentropy": 2.4939701557159424, "loss/hidden": 1.0, "loss/logits": 0.165610671043396, "loss/reg": 0.0030445046722888947, "step": 1758 }, { "epoch": 0.219875, "grad_norm": 2.7488410472869873, "grad_norm_var": 13.895018738483493, "learning_rate": 0.0001, "loss": 1.0071, "loss/crossentropy": 2.5324008464813232, "loss/hidden": 0.84765625, "loss/logits": 0.12899169325828552, "loss/reg": 0.003042889991775155, "step": 1759 }, { "epoch": 0.22, "grad_norm": 2.1646311283111572, "grad_norm_var": 13.968204624958085, "learning_rate": 0.0001, "loss": 1.1448, "loss/crossentropy": 2.5288894176483154, "loss/hidden": 0.94921875, "loss/logits": 0.16518330574035645, "loss/reg": 0.0030412436462938786, "step": 1760 }, { "epoch": 0.220125, "grad_norm": 1.9247658252716064, "grad_norm_var": 14.145314883597047, "learning_rate": 0.0001, "loss": 1.0406, "loss/crossentropy": 2.5389163494110107, "loss/hidden": 0.8671875, "loss/logits": 0.1429884135723114, "loss/reg": 0.003039830131456256, "step": 1761 }, { "epoch": 0.22025, "grad_norm": 2.1075799465179443, "grad_norm_var": 14.226356437632456, "learning_rate": 0.0001, "loss": 1.1968, "loss/crossentropy": 2.422471046447754, "loss/hidden": 1.0078125, "loss/logits": 0.15863800048828125, "loss/reg": 0.0030385232530534267, "step": 1762 }, { "epoch": 0.220375, "grad_norm": 2.6114046573638916, "grad_norm_var": 14.243361987467381, "learning_rate": 0.0001, "loss": 0.9892, "loss/crossentropy": 2.574613571166992, "loss/hidden": 0.83203125, "loss/logits": 0.1268356591463089, "loss/reg": 0.0030372380279004574, "step": 1763 }, { "epoch": 0.2205, "grad_norm": 2.0650744438171387, "grad_norm_var": 14.275914518581828, "learning_rate": 0.0001, "loss": 1.1595, "loss/crossentropy": 2.4691414833068848, "loss/hidden": 0.98046875, "loss/logits": 0.1486453115940094, "loss/reg": 0.003035652916878462, "step": 1764 }, { "epoch": 0.220625, "grad_norm": 2.160597085952759, "grad_norm_var": 14.314622026235172, "learning_rate": 0.0001, "loss": 1.1022, "loss/crossentropy": 2.609285593032837, "loss/hidden": 0.921875, "loss/logits": 0.15001603960990906, "loss/reg": 0.0030343951657414436, "step": 1765 }, { "epoch": 0.22075, "grad_norm": 3.069577217102051, "grad_norm_var": 14.261074973549244, "learning_rate": 0.0001, "loss": 1.1575, "loss/crossentropy": 2.647573947906494, "loss/hidden": 0.96875, "loss/logits": 0.1584310233592987, "loss/reg": 0.003032844513654709, "step": 1766 }, { "epoch": 0.220875, "grad_norm": 2.568370819091797, "grad_norm_var": 14.208317261947524, "learning_rate": 0.0001, "loss": 1.0488, "loss/crossentropy": 2.6948702335357666, "loss/hidden": 0.87109375, "loss/logits": 0.1474071443080902, "loss/reg": 0.0030314817558974028, "step": 1767 }, { "epoch": 0.221, "grad_norm": 2.68941068649292, "grad_norm_var": 14.21668663304105, "learning_rate": 0.0001, "loss": 1.0456, "loss/crossentropy": 2.5210416316986084, "loss/hidden": 0.87109375, "loss/logits": 0.14420348405838013, "loss/reg": 0.0030302261002361774, "step": 1768 }, { "epoch": 0.221125, "grad_norm": 1.9921388626098633, "grad_norm_var": 14.298301261709696, "learning_rate": 0.0001, "loss": 1.0236, "loss/crossentropy": 2.522998332977295, "loss/hidden": 0.84765625, "loss/logits": 0.14565014839172363, "loss/reg": 0.00302865426056087, "step": 1769 }, { "epoch": 0.22125, "grad_norm": 1.8499289751052856, "grad_norm_var": 14.38544404016637, "learning_rate": 0.0001, "loss": 1.0027, "loss/crossentropy": 2.466783046722412, "loss/hidden": 0.828125, "loss/logits": 0.14433151483535767, "loss/reg": 0.0030273436568677425, "step": 1770 }, { "epoch": 0.221375, "grad_norm": 2.4636669158935547, "grad_norm_var": 14.404773691988826, "learning_rate": 0.0001, "loss": 1.0178, "loss/crossentropy": 2.4166789054870605, "loss/hidden": 0.84765625, "loss/logits": 0.13983449339866638, "loss/reg": 0.0030260428320616484, "step": 1771 }, { "epoch": 0.2215, "grad_norm": 2.67075514793396, "grad_norm_var": 0.3450175902124835, "learning_rate": 0.0001, "loss": 1.0199, "loss/crossentropy": 2.695432662963867, "loss/hidden": 0.84765625, "loss/logits": 0.14194995164871216, "loss/reg": 0.0030244754161685705, "step": 1772 }, { "epoch": 0.221625, "grad_norm": 2.4573214054107666, "grad_norm_var": 0.14231832979352052, "learning_rate": 0.0001, "loss": 0.9742, "loss/crossentropy": 2.5035762786865234, "loss/hidden": 0.8203125, "loss/logits": 0.12364549934864044, "loss/reg": 0.003022938035428524, "step": 1773 }, { "epoch": 0.22175, "grad_norm": 2.343374729156494, "grad_norm_var": 0.11996201542798221, "learning_rate": 0.0001, "loss": 1.0924, "loss/crossentropy": 2.43853497505188, "loss/hidden": 0.9140625, "loss/logits": 0.14812292158603668, "loss/reg": 0.003021500539034605, "step": 1774 }, { "epoch": 0.221875, "grad_norm": 3.566265344619751, "grad_norm_var": 0.2032350727933528, "learning_rate": 0.0001, "loss": 1.1878, "loss/crossentropy": 2.46608567237854, "loss/hidden": 0.97265625, "loss/logits": 0.18495416641235352, "loss/reg": 0.00301993521861732, "step": 1775 }, { "epoch": 0.222, "grad_norm": 2.747910976409912, "grad_norm_var": 0.20471190685867377, "learning_rate": 0.0001, "loss": 1.0738, "loss/crossentropy": 2.6310603618621826, "loss/hidden": 0.87890625, "loss/logits": 0.16469183564186096, "loss/reg": 0.003018364543095231, "step": 1776 }, { "epoch": 0.222125, "grad_norm": 2.1164255142211914, "grad_norm_var": 0.1934448052628894, "learning_rate": 0.0001, "loss": 0.9762, "loss/crossentropy": 2.8867831230163574, "loss/hidden": 0.8046875, "loss/logits": 0.1413034349679947, "loss/reg": 0.0030167822260409594, "step": 1777 }, { "epoch": 0.22225, "grad_norm": 3.035059690475464, "grad_norm_var": 0.20270085598930473, "learning_rate": 0.0001, "loss": 1.1703, "loss/crossentropy": 2.1823151111602783, "loss/hidden": 0.9765625, "loss/logits": 0.16358500719070435, "loss/reg": 0.0030153028201311827, "step": 1778 }, { "epoch": 0.222375, "grad_norm": 2.5045015811920166, "grad_norm_var": 0.20219002055305796, "learning_rate": 0.0001, "loss": 1.2038, "loss/crossentropy": 2.401313543319702, "loss/hidden": 1.0078125, "loss/logits": 0.16582559049129486, "loss/reg": 0.0030142655596137047, "step": 1779 }, { "epoch": 0.2225, "grad_norm": 2.473170042037964, "grad_norm_var": 0.1879118733867992, "learning_rate": 0.0001, "loss": 1.0329, "loss/crossentropy": 2.824697494506836, "loss/hidden": 0.859375, "loss/logits": 0.14334872364997864, "loss/reg": 0.003013218054547906, "step": 1780 }, { "epoch": 0.222625, "grad_norm": 2.2666685581207275, "grad_norm_var": 0.18318870026567513, "learning_rate": 0.0001, "loss": 0.91, "loss/crossentropy": 2.4266369342803955, "loss/hidden": 0.7734375, "loss/logits": 0.10641002655029297, "loss/reg": 0.0030120951123535633, "step": 1781 }, { "epoch": 0.22275, "grad_norm": 2.411001205444336, "grad_norm_var": 0.16475203538871402, "learning_rate": 0.0001, "loss": 0.9902, "loss/crossentropy": 2.7164502143859863, "loss/hidden": 0.82421875, "loss/logits": 0.13585732877254486, "loss/reg": 0.0030110396910458803, "step": 1782 }, { "epoch": 0.222875, "grad_norm": 2.555788040161133, "grad_norm_var": 0.16466357931168235, "learning_rate": 0.0001, "loss": 0.9362, "loss/crossentropy": 2.4590888023376465, "loss/hidden": 0.78515625, "loss/logits": 0.12098520994186401, "loss/reg": 0.0030094829853624105, "step": 1783 }, { "epoch": 0.223, "grad_norm": 2.0648751258850098, "grad_norm_var": 0.17401513224721246, "learning_rate": 0.0001, "loss": 0.9556, "loss/crossentropy": 2.3822426795959473, "loss/hidden": 0.78515625, "loss/logits": 0.1403384804725647, "loss/reg": 0.003007945604622364, "step": 1784 }, { "epoch": 0.223125, "grad_norm": 4.551196098327637, "grad_norm_var": 0.4202881155883109, "learning_rate": 0.0001, "loss": 1.8349, "loss/crossentropy": 1.9226468801498413, "loss/hidden": 1.4609375, "loss/logits": 0.3438549041748047, "loss/reg": 0.003006393788382411, "step": 1785 }, { "epoch": 0.22325, "grad_norm": 3.032957077026367, "grad_norm_var": 0.38473481866021925, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.3850667476654053, "loss/hidden": 0.90625, "loss/logits": 0.15473738312721252, "loss/reg": 0.0030050212517380714, "step": 1786 }, { "epoch": 0.223375, "grad_norm": 2.6902732849121094, "grad_norm_var": 0.3806885371660407, "learning_rate": 0.0001, "loss": 1.1889, "loss/crossentropy": 2.324967861175537, "loss/hidden": 0.984375, "loss/logits": 0.17453113198280334, "loss/reg": 0.0030037900432944298, "step": 1787 }, { "epoch": 0.2235, "grad_norm": 2.168304443359375, "grad_norm_var": 0.3996302660743254, "learning_rate": 0.0001, "loss": 1.0764, "loss/crossentropy": 2.2956700325012207, "loss/hidden": 0.8984375, "loss/logits": 0.14791284501552582, "loss/reg": 0.003002553479745984, "step": 1788 }, { "epoch": 0.223625, "grad_norm": 2.74092435836792, "grad_norm_var": 0.3959885005070151, "learning_rate": 0.0001, "loss": 0.9535, "loss/crossentropy": 2.631945848464966, "loss/hidden": 0.79296875, "loss/logits": 0.1305209994316101, "loss/reg": 0.0030011215712875128, "step": 1789 }, { "epoch": 0.22375, "grad_norm": 2.1927857398986816, "grad_norm_var": 0.4046525348789257, "learning_rate": 0.0001, "loss": 1.0108, "loss/crossentropy": 2.404738664627075, "loss/hidden": 0.83984375, "loss/logits": 0.14092186093330383, "loss/reg": 0.002999563468620181, "step": 1790 }, { "epoch": 0.223875, "grad_norm": 2.6189329624176025, "grad_norm_var": 0.35067712323398886, "learning_rate": 0.0001, "loss": 0.9651, "loss/crossentropy": 2.389113664627075, "loss/hidden": 0.78515625, "loss/logits": 0.15001046657562256, "loss/reg": 0.002998023759573698, "step": 1791 }, { "epoch": 0.224, "grad_norm": 2.298084020614624, "grad_norm_var": 0.35659197751071947, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.5539371967315674, "loss/hidden": 0.84765625, "loss/logits": 0.14568671584129333, "loss/reg": 0.002996444469317794, "step": 1792 }, { "epoch": 0.224125, "grad_norm": 2.6242432594299316, "grad_norm_var": 0.3394552173241588, "learning_rate": 0.0001, "loss": 1.1973, "loss/crossentropy": 2.5641584396362305, "loss/hidden": 0.984375, "loss/logits": 0.18293890357017517, "loss/reg": 0.002994769951328635, "step": 1793 }, { "epoch": 0.22425, "grad_norm": 3.7077572345733643, "grad_norm_var": 0.4032349111532985, "learning_rate": 0.0001, "loss": 1.26, "loss/crossentropy": 1.6599891185760498, "loss/hidden": 1.09375, "loss/logits": 0.13635557889938354, "loss/reg": 0.0029932132456451654, "step": 1794 }, { "epoch": 0.224375, "grad_norm": 3.1581003665924072, "grad_norm_var": 0.41452339637513186, "learning_rate": 0.0001, "loss": 0.9995, "loss/crossentropy": 2.8043997287750244, "loss/hidden": 0.83203125, "loss/logits": 0.1375463604927063, "loss/reg": 0.0029914977494627237, "step": 1795 }, { "epoch": 0.2245, "grad_norm": 3.2209360599517822, "grad_norm_var": 0.42464256487447377, "learning_rate": 0.0001, "loss": 1.1752, "loss/crossentropy": 2.321751832962036, "loss/hidden": 0.9921875, "loss/logits": 0.15309128165245056, "loss/reg": 0.0029897540807724, "step": 1796 }, { "epoch": 0.224625, "grad_norm": 3.6839895248413086, "grad_norm_var": 0.45527767818373166, "learning_rate": 0.0001, "loss": 1.2765, "loss/crossentropy": 2.8542239665985107, "loss/hidden": 1.0234375, "loss/logits": 0.2231515347957611, "loss/reg": 0.002988190157338977, "step": 1797 }, { "epoch": 0.22475, "grad_norm": 3.53092622756958, "grad_norm_var": 0.4669931032592082, "learning_rate": 0.0001, "loss": 1.2964, "loss/crossentropy": 2.4456238746643066, "loss/hidden": 1.0703125, "loss/logits": 0.19626130163669586, "loss/reg": 0.002986533334478736, "step": 1798 }, { "epoch": 0.224875, "grad_norm": 3.053359031677246, "grad_norm_var": 0.4578059410900018, "learning_rate": 0.0001, "loss": 0.9795, "loss/crossentropy": 2.8894784450531006, "loss/hidden": 0.8203125, "loss/logits": 0.12932872772216797, "loss/reg": 0.0029848606791347265, "step": 1799 }, { "epoch": 0.225, "grad_norm": 3.2195730209350586, "grad_norm_var": 0.4035408308703057, "learning_rate": 0.0001, "loss": 1.4028, "loss/crossentropy": 2.3305225372314453, "loss/hidden": 1.1875, "loss/logits": 0.18548178672790527, "loss/reg": 0.002983321435749531, "step": 1800 }, { "epoch": 0.225125, "grad_norm": 2.86997652053833, "grad_norm_var": 0.23937467026572654, "learning_rate": 0.0001, "loss": 1.1938, "loss/crossentropy": 2.579697847366333, "loss/hidden": 0.98828125, "loss/logits": 0.17568713426589966, "loss/reg": 0.0029817591421306133, "step": 1801 }, { "epoch": 0.22525, "grad_norm": 3.553030252456665, "grad_norm_var": 0.26371729729337307, "learning_rate": 0.0001, "loss": 1.1028, "loss/crossentropy": 2.8150346279144287, "loss/hidden": 0.93359375, "loss/logits": 0.13943785429000854, "loss/reg": 0.002980235731229186, "step": 1802 }, { "epoch": 0.225375, "grad_norm": 3.208162307739258, "grad_norm_var": 0.26197953760215476, "learning_rate": 0.0001, "loss": 1.1901, "loss/crossentropy": 2.499155044555664, "loss/hidden": 0.94921875, "loss/logits": 0.21110975742340088, "loss/reg": 0.002978700678795576, "step": 1803 }, { "epoch": 0.2255, "grad_norm": 4.348999500274658, "grad_norm_var": 0.3201132095155003, "learning_rate": 0.0001, "loss": 1.1629, "loss/crossentropy": 2.4431777000427246, "loss/hidden": 0.984375, "loss/logits": 0.14872056245803833, "loss/reg": 0.002977173076942563, "step": 1804 }, { "epoch": 0.225625, "grad_norm": 2.438546895980835, "grad_norm_var": 0.34138753432729513, "learning_rate": 0.0001, "loss": 1.0855, "loss/crossentropy": 2.518237352371216, "loss/hidden": 0.89453125, "loss/logits": 0.16119888424873352, "loss/reg": 0.002975636161863804, "step": 1805 }, { "epoch": 0.22575, "grad_norm": 3.8823599815368652, "grad_norm_var": 0.31363593562404785, "learning_rate": 0.0001, "loss": 1.1725, "loss/crossentropy": 2.419356107711792, "loss/hidden": 0.96875, "loss/logits": 0.17398422956466675, "loss/reg": 0.002974169095978141, "step": 1806 }, { "epoch": 0.225875, "grad_norm": 5.898488521575928, "grad_norm_var": 0.725838270489255, "learning_rate": 0.0001, "loss": 1.3984, "loss/crossentropy": 2.538686990737915, "loss/hidden": 1.140625, "loss/logits": 0.22802415490150452, "loss/reg": 0.002972749760374427, "step": 1807 }, { "epoch": 0.226, "grad_norm": 2.616062879562378, "grad_norm_var": 0.6846537892399804, "learning_rate": 0.0001, "loss": 0.9967, "loss/crossentropy": 2.91412091255188, "loss/hidden": 0.828125, "loss/logits": 0.13887512683868408, "loss/reg": 0.0029713378753513098, "step": 1808 }, { "epoch": 0.226125, "grad_norm": 2.6786835193634033, "grad_norm_var": 0.678929251874992, "learning_rate": 0.0001, "loss": 1.3028, "loss/crossentropy": 2.530329465866089, "loss/hidden": 1.0859375, "loss/logits": 0.18713583052158356, "loss/reg": 0.0029699981678277254, "step": 1809 }, { "epoch": 0.22625, "grad_norm": 2.3305532932281494, "grad_norm_var": 0.7486371828354244, "learning_rate": 0.0001, "loss": 1.1417, "loss/crossentropy": 2.5374956130981445, "loss/hidden": 0.9453125, "loss/logits": 0.16669398546218872, "loss/reg": 0.0029686312191188335, "step": 1810 }, { "epoch": 0.226375, "grad_norm": 2.9841620922088623, "grad_norm_var": 0.7551115699539401, "learning_rate": 0.0001, "loss": 1.1804, "loss/crossentropy": 2.4051711559295654, "loss/hidden": 0.97265625, "loss/logits": 0.17810006439685822, "loss/reg": 0.002967282198369503, "step": 1811 }, { "epoch": 0.2265, "grad_norm": 1.9030566215515137, "grad_norm_var": 0.885438078387665, "learning_rate": 0.0001, "loss": 1.0451, "loss/crossentropy": 2.449751615524292, "loss/hidden": 0.87109375, "loss/logits": 0.14433653652668, "loss/reg": 0.0029659466817975044, "step": 1812 }, { "epoch": 0.226625, "grad_norm": 2.075324773788452, "grad_norm_var": 0.9567700729341877, "learning_rate": 0.0001, "loss": 1.0281, "loss/crossentropy": 2.4204511642456055, "loss/hidden": 0.87109375, "loss/logits": 0.12739630043506622, "loss/reg": 0.0029646598268300295, "step": 1813 }, { "epoch": 0.22675, "grad_norm": 2.0552451610565186, "grad_norm_var": 1.0202742097321296, "learning_rate": 0.0001, "loss": 0.9888, "loss/crossentropy": 2.424468755722046, "loss/hidden": 0.81640625, "loss/logits": 0.14275333285331726, "loss/reg": 0.002963270992040634, "step": 1814 }, { "epoch": 0.226875, "grad_norm": 2.0605251789093018, "grad_norm_var": 1.0840480132956123, "learning_rate": 0.0001, "loss": 1.0656, "loss/crossentropy": 2.625704050064087, "loss/hidden": 0.89453125, "loss/logits": 0.1414736658334732, "loss/reg": 0.0029617997352033854, "step": 1815 }, { "epoch": 0.227, "grad_norm": 2.0984017848968506, "grad_norm_var": 1.1309350809822944, "learning_rate": 0.0001, "loss": 1.1418, "loss/crossentropy": 2.2266194820404053, "loss/hidden": 0.9453125, "loss/logits": 0.16688337922096252, "loss/reg": 0.002960240002721548, "step": 1816 }, { "epoch": 0.227125, "grad_norm": 2.257150411605835, "grad_norm_var": 1.1599327396837886, "learning_rate": 0.0001, "loss": 1.2117, "loss/crossentropy": 2.2862818241119385, "loss/hidden": 1.0078125, "loss/logits": 0.1742808222770691, "loss/reg": 0.0029587687458842993, "step": 1817 }, { "epoch": 0.22725, "grad_norm": 2.0277411937713623, "grad_norm_var": 1.1723884671928733, "learning_rate": 0.0001, "loss": 1.0665, "loss/crossentropy": 2.4400901794433594, "loss/hidden": 0.89453125, "loss/logits": 0.14237791299819946, "loss/reg": 0.0029572048224508762, "step": 1818 }, { "epoch": 0.227375, "grad_norm": 2.6541121006011963, "grad_norm_var": 1.161714891934859, "learning_rate": 0.0001, "loss": 1.0231, "loss/crossentropy": 2.5243773460388184, "loss/hidden": 0.85546875, "loss/logits": 0.13802778720855713, "loss/reg": 0.0029556897934526205, "step": 1819 }, { "epoch": 0.2275, "grad_norm": 2.3991587162017822, "grad_norm_var": 0.9886539748965065, "learning_rate": 0.0001, "loss": 1.3316, "loss/crossentropy": 2.3047027587890625, "loss/hidden": 1.1328125, "loss/logits": 0.16926254332065582, "loss/reg": 0.0029541929252445698, "step": 1820 }, { "epoch": 0.227625, "grad_norm": 2.27185320854187, "grad_norm_var": 0.9950342111305434, "learning_rate": 0.0001, "loss": 1.0846, "loss/crossentropy": 2.3836841583251953, "loss/hidden": 0.90625, "loss/logits": 0.14881691336631775, "loss/reg": 0.002952732378616929, "step": 1821 }, { "epoch": 0.22775, "grad_norm": 2.8672397136688232, "grad_norm_var": 0.8908872852448818, "learning_rate": 0.0001, "loss": 1.2423, "loss/crossentropy": 2.2023377418518066, "loss/hidden": 0.9921875, "loss/logits": 0.22060072422027588, "loss/reg": 0.0029511942993849516, "step": 1822 }, { "epoch": 0.227875, "grad_norm": 2.2803525924682617, "grad_norm_var": 0.10508732681826037, "learning_rate": 0.0001, "loss": 1.0304, "loss/crossentropy": 2.372196912765503, "loss/hidden": 0.8515625, "loss/logits": 0.14933274686336517, "loss/reg": 0.00294972350820899, "step": 1823 }, { "epoch": 0.228, "grad_norm": 2.6830813884735107, "grad_norm_var": 0.10776807926507198, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.475736618041992, "loss/hidden": 0.9296875, "loss/logits": 0.17176774144172668, "loss/reg": 0.002948229666799307, "step": 1824 }, { "epoch": 0.228125, "grad_norm": 2.7077255249023438, "grad_norm_var": 0.1090870968752443, "learning_rate": 0.0001, "loss": 1.1178, "loss/crossentropy": 2.6619277000427246, "loss/hidden": 0.92578125, "loss/logits": 0.16253264248371124, "loss/reg": 0.002946724882349372, "step": 1825 }, { "epoch": 0.22825, "grad_norm": 2.7542712688446045, "grad_norm_var": 0.11901288025463994, "learning_rate": 0.0001, "loss": 1.023, "loss/crossentropy": 2.3113021850585938, "loss/hidden": 0.84375, "loss/logits": 0.14976277947425842, "loss/reg": 0.00294527318328619, "step": 1826 }, { "epoch": 0.228375, "grad_norm": 2.1749322414398193, "grad_norm_var": 0.0947496886136868, "learning_rate": 0.0001, "loss": 1.076, "loss/crossentropy": 2.3755033016204834, "loss/hidden": 0.890625, "loss/logits": 0.1559135913848877, "loss/reg": 0.0029437614139169455, "step": 1827 }, { "epoch": 0.2285, "grad_norm": 2.2916691303253174, "grad_norm_var": 0.08209817483413247, "learning_rate": 0.0001, "loss": 1.2318, "loss/crossentropy": 2.409095287322998, "loss/hidden": 1.03125, "loss/logits": 0.17112311720848083, "loss/reg": 0.0029421907383948565, "step": 1828 }, { "epoch": 0.228625, "grad_norm": 3.1720941066741943, "grad_norm_var": 0.11657495418614777, "learning_rate": 0.0001, "loss": 1.0408, "loss/crossentropy": 2.571678400039673, "loss/hidden": 0.85546875, "loss/logits": 0.15594975650310516, "loss/reg": 0.0029407108668237925, "step": 1829 }, { "epoch": 0.22875, "grad_norm": 3.709214448928833, "grad_norm_var": 0.20662170797658444, "learning_rate": 0.0001, "loss": 1.4423, "loss/crossentropy": 2.3802380561828613, "loss/hidden": 1.2265625, "loss/logits": 0.18629974126815796, "loss/reg": 0.0029392328578978777, "step": 1830 }, { "epoch": 0.228875, "grad_norm": 2.151437520980835, "grad_norm_var": 0.20150086001236314, "learning_rate": 0.0001, "loss": 1.1299, "loss/crossentropy": 2.409843921661377, "loss/hidden": 0.9296875, "loss/logits": 0.17083843052387238, "loss/reg": 0.0029376852326095104, "step": 1831 }, { "epoch": 0.229, "grad_norm": 2.108267307281494, "grad_norm_var": 0.20093753742009024, "learning_rate": 0.0001, "loss": 1.0452, "loss/crossentropy": 2.7673611640930176, "loss/hidden": 0.86328125, "loss/logits": 0.15255650877952576, "loss/reg": 0.0029360908083617687, "step": 1832 }, { "epoch": 0.229125, "grad_norm": 2.295905828475952, "grad_norm_var": 0.19961170535207368, "learning_rate": 0.0001, "loss": 1.1137, "loss/crossentropy": 2.4432260990142822, "loss/hidden": 0.93359375, "loss/logits": 0.15079358220100403, "loss/reg": 0.002934559714049101, "step": 1833 }, { "epoch": 0.22925, "grad_norm": 2.0841267108917236, "grad_norm_var": 0.19600194880262786, "learning_rate": 0.0001, "loss": 1.1015, "loss/crossentropy": 2.406099319458008, "loss/hidden": 0.91015625, "loss/logits": 0.1619967818260193, "loss/reg": 0.002933042123913765, "step": 1834 }, { "epoch": 0.229375, "grad_norm": 2.2038230895996094, "grad_norm_var": 0.20169366112067166, "learning_rate": 0.0001, "loss": 1.1717, "loss/crossentropy": 2.36592435836792, "loss/hidden": 0.9765625, "loss/logits": 0.1658266931772232, "loss/reg": 0.0029314891435205936, "step": 1835 }, { "epoch": 0.2295, "grad_norm": 4.329677581787109, "grad_norm_var": 0.40617225913744975, "learning_rate": 0.0001, "loss": 1.0861, "loss/crossentropy": 2.428100824356079, "loss/hidden": 0.90625, "loss/logits": 0.15050062537193298, "loss/reg": 0.002929947804659605, "step": 1836 }, { "epoch": 0.229625, "grad_norm": 2.962228536605835, "grad_norm_var": 0.4029608323643475, "learning_rate": 0.0001, "loss": 1.2032, "loss/crossentropy": 2.661451816558838, "loss/hidden": 1.0078125, "loss/logits": 0.166073739528656, "loss/reg": 0.0029283969197422266, "step": 1837 }, { "epoch": 0.22975, "grad_norm": 2.1242473125457764, "grad_norm_var": 0.418270528733823, "learning_rate": 0.0001, "loss": 1.3265, "loss/crossentropy": 2.3514444828033447, "loss/hidden": 1.09375, "loss/logits": 0.20345856249332428, "loss/reg": 0.002926844172179699, "step": 1838 }, { "epoch": 0.229875, "grad_norm": 2.6637930870056152, "grad_norm_var": 0.4097338351488264, "learning_rate": 0.0001, "loss": 1.0729, "loss/crossentropy": 2.8255255222320557, "loss/hidden": 0.890625, "loss/logits": 0.15303359925746918, "loss/reg": 0.0029252341482788324, "step": 1839 }, { "epoch": 0.23, "grad_norm": 2.8248424530029297, "grad_norm_var": 0.4115956483187119, "learning_rate": 0.0001, "loss": 1.2172, "loss/crossentropy": 2.730475902557373, "loss/hidden": 0.9921875, "loss/logits": 0.19581902027130127, "loss/reg": 0.0029237025883048773, "step": 1840 }, { "epoch": 0.230125, "grad_norm": 2.1636877059936523, "grad_norm_var": 0.42662438202454495, "learning_rate": 0.0001, "loss": 1.2799, "loss/crossentropy": 2.4928698539733887, "loss/hidden": 1.078125, "loss/logits": 0.1725054681301117, "loss/reg": 0.002922156360000372, "step": 1841 }, { "epoch": 0.23025, "grad_norm": 2.155371904373169, "grad_norm_var": 0.4387901405468398, "learning_rate": 0.0001, "loss": 1.0823, "loss/crossentropy": 2.8904709815979004, "loss/hidden": 0.89453125, "loss/logits": 0.15856406092643738, "loss/reg": 0.002920587779954076, "step": 1842 }, { "epoch": 0.230375, "grad_norm": 2.688028573989868, "grad_norm_var": 0.4269539462286275, "learning_rate": 0.0001, "loss": 0.9496, "loss/crossentropy": 2.479196548461914, "loss/hidden": 0.80859375, "loss/logits": 0.1118479073047638, "loss/reg": 0.0029189754277467728, "step": 1843 }, { "epoch": 0.2305, "grad_norm": 3.4530677795410156, "grad_norm_var": 0.46033235618827817, "learning_rate": 0.0001, "loss": 1.3334, "loss/crossentropy": 2.3263473510742188, "loss/hidden": 1.109375, "loss/logits": 0.19483482837677002, "loss/reg": 0.0029173565562814474, "step": 1844 }, { "epoch": 0.230625, "grad_norm": 2.624260425567627, "grad_norm_var": 0.44410306117912624, "learning_rate": 0.0001, "loss": 1.1256, "loss/crossentropy": 2.4004266262054443, "loss/hidden": 0.921875, "loss/logits": 0.1746046245098114, "loss/reg": 0.00291584269143641, "step": 1845 }, { "epoch": 0.23075, "grad_norm": 2.484194755554199, "grad_norm_var": 0.36633673651388654, "learning_rate": 0.0001, "loss": 1.1565, "loss/crossentropy": 2.500600814819336, "loss/hidden": 0.96484375, "loss/logits": 0.1624952256679535, "loss/reg": 0.0029144061263650656, "step": 1846 }, { "epoch": 0.230875, "grad_norm": 2.1600430011749268, "grad_norm_var": 0.36584698292128315, "learning_rate": 0.0001, "loss": 1.0968, "loss/crossentropy": 2.7809672355651855, "loss/hidden": 0.9140625, "loss/logits": 0.1535949558019638, "loss/reg": 0.00291292741894722, "step": 1847 }, { "epoch": 0.231, "grad_norm": 1.9736144542694092, "grad_norm_var": 0.37550067856469693, "learning_rate": 0.0001, "loss": 1.0361, "loss/crossentropy": 2.4563684463500977, "loss/hidden": 0.86328125, "loss/logits": 0.14372900128364563, "loss/reg": 0.002911404939368367, "step": 1848 }, { "epoch": 0.231125, "grad_norm": 2.2191903591156006, "grad_norm_var": 0.3787174770815568, "learning_rate": 0.0001, "loss": 1.0114, "loss/crossentropy": 2.5433239936828613, "loss/hidden": 0.83984375, "loss/logits": 0.14246006309986115, "loss/reg": 0.002909915754571557, "step": 1849 }, { "epoch": 0.23125, "grad_norm": 2.5526938438415527, "grad_norm_var": 0.3621070968588713, "learning_rate": 0.0001, "loss": 0.9695, "loss/crossentropy": 2.5453128814697266, "loss/hidden": 0.8125, "loss/logits": 0.1279653012752533, "loss/reg": 0.0029083597473800182, "step": 1850 }, { "epoch": 0.231375, "grad_norm": 1.9157322645187378, "grad_norm_var": 0.38247098077205705, "learning_rate": 0.0001, "loss": 0.9632, "loss/crossentropy": 2.626375913619995, "loss/hidden": 0.796875, "loss/logits": 0.13728055357933044, "loss/reg": 0.0029068856965750456, "step": 1851 }, { "epoch": 0.2315, "grad_norm": 2.353100538253784, "grad_norm_var": 0.16577489550661723, "learning_rate": 0.0001, "loss": 1.1287, "loss/crossentropy": 2.479382038116455, "loss/hidden": 0.9296875, "loss/logits": 0.16994883120059967, "loss/reg": 0.0029053473845124245, "step": 1852 }, { "epoch": 0.231625, "grad_norm": 2.6295785903930664, "grad_norm_var": 0.1502992299825194, "learning_rate": 0.0001, "loss": 1.2396, "loss/crossentropy": 2.2818477153778076, "loss/hidden": 1.0390625, "loss/logits": 0.17154169082641602, "loss/reg": 0.002903790445998311, "step": 1853 }, { "epoch": 0.23175, "grad_norm": 2.1487951278686523, "grad_norm_var": 0.14931457999495443, "learning_rate": 0.0001, "loss": 1.0425, "loss/crossentropy": 2.6548938751220703, "loss/hidden": 0.86328125, "loss/logits": 0.15023818612098694, "loss/reg": 0.0029021373484283686, "step": 1854 }, { "epoch": 0.231875, "grad_norm": 2.2083189487457275, "grad_norm_var": 0.14857580667151063, "learning_rate": 0.0001, "loss": 1.0317, "loss/crossentropy": 2.3384876251220703, "loss/hidden": 0.8671875, "loss/logits": 0.13553163409233093, "loss/reg": 0.0029005296528339386, "step": 1855 }, { "epoch": 0.232, "grad_norm": 1.848423957824707, "grad_norm_var": 0.1541103110008151, "learning_rate": 0.0001, "loss": 0.9426, "loss/crossentropy": 2.4241435527801514, "loss/hidden": 0.8046875, "loss/logits": 0.1089288517832756, "loss/reg": 0.002899044193327427, "step": 1856 }, { "epoch": 0.232125, "grad_norm": 2.036503553390503, "grad_norm_var": 0.15825755313067677, "learning_rate": 0.0001, "loss": 0.9461, "loss/crossentropy": 2.3014883995056152, "loss/hidden": 0.78515625, "loss/logits": 0.1319369375705719, "loss/reg": 0.0028974406886845827, "step": 1857 }, { "epoch": 0.23225, "grad_norm": 2.712230682373047, "grad_norm_var": 0.16387938230163232, "learning_rate": 0.0001, "loss": 1.2105, "loss/crossentropy": 2.365384101867676, "loss/hidden": 1.0078125, "loss/logits": 0.17370465397834778, "loss/reg": 0.0028959375340491533, "step": 1858 }, { "epoch": 0.232375, "grad_norm": 2.5608208179473877, "grad_norm_var": 0.15958970126699837, "learning_rate": 0.0001, "loss": 1.1594, "loss/crossentropy": 2.623105049133301, "loss/hidden": 0.94140625, "loss/logits": 0.1890484094619751, "loss/reg": 0.002894355682656169, "step": 1859 }, { "epoch": 0.2325, "grad_norm": 1.9694890975952148, "grad_norm_var": 0.08242289833427534, "learning_rate": 0.0001, "loss": 0.8934, "loss/crossentropy": 2.1551883220672607, "loss/hidden": 0.74609375, "loss/logits": 0.11839769035577774, "loss/reg": 0.0028928006067872047, "step": 1860 }, { "epoch": 0.232625, "grad_norm": 2.146641492843628, "grad_norm_var": 0.0744266244705272, "learning_rate": 0.0001, "loss": 1.0166, "loss/crossentropy": 2.694187641143799, "loss/hidden": 0.85546875, "loss/logits": 0.1322515606880188, "loss/reg": 0.002891267416998744, "step": 1861 }, { "epoch": 0.23275, "grad_norm": 2.412816286087036, "grad_norm_var": 0.07246823357878984, "learning_rate": 0.0001, "loss": 1.182, "loss/crossentropy": 2.708153009414673, "loss/hidden": 0.96875, "loss/logits": 0.1843324601650238, "loss/reg": 0.0028897603042423725, "step": 1862 }, { "epoch": 0.232875, "grad_norm": 3.3550024032592773, "grad_norm_var": 0.14889475511776779, "learning_rate": 0.0001, "loss": 1.0168, "loss/crossentropy": 2.240648031234741, "loss/hidden": 0.8671875, "loss/logits": 0.12075857073068619, "loss/reg": 0.0028883127961307764, "step": 1863 }, { "epoch": 0.233, "grad_norm": 2.539043664932251, "grad_norm_var": 0.14312548265110311, "learning_rate": 0.0001, "loss": 0.9847, "loss/crossentropy": 2.3216147422790527, "loss/hidden": 0.828125, "loss/logits": 0.12774166464805603, "loss/reg": 0.0028867912478744984, "step": 1864 }, { "epoch": 0.233125, "grad_norm": 2.358919382095337, "grad_norm_var": 0.14189893172667017, "learning_rate": 0.0001, "loss": 1.0164, "loss/crossentropy": 2.5843939781188965, "loss/hidden": 0.83984375, "loss/logits": 0.14774055778980255, "loss/reg": 0.0028853206895291805, "step": 1865 }, { "epoch": 0.23325, "grad_norm": 2.560542106628418, "grad_norm_var": 0.14210520060771656, "learning_rate": 0.0001, "loss": 1.3539, "loss/crossentropy": 2.2003164291381836, "loss/hidden": 1.1015625, "loss/logits": 0.22345313429832458, "loss/reg": 0.002883851993829012, "step": 1866 }, { "epoch": 0.233375, "grad_norm": 2.2669901847839355, "grad_norm_var": 0.12902140426953868, "learning_rate": 0.0001, "loss": 1.0907, "loss/crossentropy": 2.550511360168457, "loss/hidden": 0.89453125, "loss/logits": 0.16732323169708252, "loss/reg": 0.0028824072796851397, "step": 1867 }, { "epoch": 0.2335, "grad_norm": 1.9912503957748413, "grad_norm_var": 0.1385847546259417, "learning_rate": 0.0001, "loss": 1.1119, "loss/crossentropy": 2.4620280265808105, "loss/hidden": 0.92578125, "loss/logits": 0.15735240280628204, "loss/reg": 0.0028808878269046545, "step": 1868 }, { "epoch": 0.233625, "grad_norm": 2.5993523597717285, "grad_norm_var": 0.13755172432265106, "learning_rate": 0.0001, "loss": 1.4922, "loss/crossentropy": 2.358572244644165, "loss/hidden": 1.2578125, "loss/logits": 0.20562607049942017, "loss/reg": 0.0028794193640351295, "step": 1869 }, { "epoch": 0.23375, "grad_norm": 3.30731463432312, "grad_norm_var": 0.18924561660283715, "learning_rate": 0.0001, "loss": 1.376, "loss/crossentropy": 2.7302675247192383, "loss/hidden": 1.140625, "loss/logits": 0.20656052231788635, "loss/reg": 0.0028778668493032455, "step": 1870 }, { "epoch": 0.233875, "grad_norm": 2.8096041679382324, "grad_norm_var": 0.19410140740735352, "learning_rate": 0.0001, "loss": 1.2133, "loss/crossentropy": 2.322666645050049, "loss/hidden": 1.0078125, "loss/logits": 0.17673031985759735, "loss/reg": 0.0028763674199581146, "step": 1871 }, { "epoch": 0.234, "grad_norm": 2.766726493835449, "grad_norm_var": 0.17104518125896953, "learning_rate": 0.0001, "loss": 1.194, "loss/crossentropy": 2.258401393890381, "loss/hidden": 0.99609375, "loss/logits": 0.169195756316185, "loss/reg": 0.0028748363256454468, "step": 1872 }, { "epoch": 0.234125, "grad_norm": 2.1711134910583496, "grad_norm_var": 0.16341771516509496, "learning_rate": 0.0001, "loss": 0.9274, "loss/crossentropy": 2.3150506019592285, "loss/hidden": 0.77734375, "loss/logits": 0.12128670513629913, "loss/reg": 0.002873300574719906, "step": 1873 }, { "epoch": 0.23425, "grad_norm": 2.1112637519836426, "grad_norm_var": 0.17162801880261083, "learning_rate": 0.0001, "loss": 0.9732, "loss/crossentropy": 2.506507396697998, "loss/hidden": 0.8046875, "loss/logits": 0.13975682854652405, "loss/reg": 0.0028718682006001472, "step": 1874 }, { "epoch": 0.234375, "grad_norm": 2.183361053466797, "grad_norm_var": 0.17724180763689687, "learning_rate": 0.0001, "loss": 1.1296, "loss/crossentropy": 2.4888036251068115, "loss/hidden": 0.93359375, "loss/logits": 0.16730833053588867, "loss/reg": 0.002870464464649558, "step": 1875 }, { "epoch": 0.2345, "grad_norm": 2.735436201095581, "grad_norm_var": 0.16260582148087882, "learning_rate": 0.0001, "loss": 1.1365, "loss/crossentropy": 2.1528542041778564, "loss/hidden": 0.96484375, "loss/logits": 0.14295265078544617, "loss/reg": 0.002869043732061982, "step": 1876 }, { "epoch": 0.234625, "grad_norm": 3.999509572982788, "grad_norm_var": 0.28500931963959975, "learning_rate": 0.0001, "loss": 1.3381, "loss/crossentropy": 3.042414903640747, "loss/hidden": 1.0703125, "loss/logits": 0.23910346627235413, "loss/reg": 0.0028676455840468407, "step": 1877 }, { "epoch": 0.23475, "grad_norm": 2.6461031436920166, "grad_norm_var": 0.2814837056327926, "learning_rate": 0.0001, "loss": 0.9627, "loss/crossentropy": 2.62857985496521, "loss/hidden": 0.7890625, "loss/logits": 0.14501667022705078, "loss/reg": 0.002866254420951009, "step": 1878 }, { "epoch": 0.234875, "grad_norm": 2.356269359588623, "grad_norm_var": 0.2499569691597026, "learning_rate": 0.0001, "loss": 1.1219, "loss/crossentropy": 2.535068988800049, "loss/hidden": 0.9453125, "loss/logits": 0.14790667593479156, "loss/reg": 0.002864871872588992, "step": 1879 }, { "epoch": 0.235, "grad_norm": 2.321554660797119, "grad_norm_var": 0.25432354819466757, "learning_rate": 0.0001, "loss": 1.0411, "loss/crossentropy": 2.5561270713806152, "loss/hidden": 0.86328125, "loss/logits": 0.14915838837623596, "loss/reg": 0.0028635053895413876, "step": 1880 }, { "epoch": 0.235125, "grad_norm": 2.6328799724578857, "grad_norm_var": 0.2511549738430517, "learning_rate": 0.0001, "loss": 1.2299, "loss/crossentropy": 2.174438238143921, "loss/hidden": 1.015625, "loss/logits": 0.18564572930335999, "loss/reg": 0.0028620159719139338, "step": 1881 }, { "epoch": 0.23525, "grad_norm": 2.7941927909851074, "grad_norm_var": 0.25361177630329473, "learning_rate": 0.0001, "loss": 1.0417, "loss/crossentropy": 2.698789596557617, "loss/hidden": 0.87109375, "loss/logits": 0.14195041358470917, "loss/reg": 0.0028606669511646032, "step": 1882 }, { "epoch": 0.235375, "grad_norm": 2.9317786693573, "grad_norm_var": 0.251201000396558, "learning_rate": 0.0001, "loss": 1.222, "loss/crossentropy": 2.6760761737823486, "loss/hidden": 1.015625, "loss/logits": 0.17773698270320892, "loss/reg": 0.0028592266608029604, "step": 1883 }, { "epoch": 0.2355, "grad_norm": 2.170546293258667, "grad_norm_var": 0.23752522799550563, "learning_rate": 0.0001, "loss": 1.0143, "loss/crossentropy": 2.390109062194824, "loss/hidden": 0.83984375, "loss/logits": 0.14584662020206451, "loss/reg": 0.0028577372431755066, "step": 1884 }, { "epoch": 0.235625, "grad_norm": 3.2109017372131348, "grad_norm_var": 0.25607174442198255, "learning_rate": 0.0001, "loss": 1.0177, "loss/crossentropy": 2.848381519317627, "loss/hidden": 0.83203125, "loss/logits": 0.1571502387523651, "loss/reg": 0.0028563509695231915, "step": 1885 }, { "epoch": 0.23575, "grad_norm": 3.3898189067840576, "grad_norm_var": 0.2632133556348774, "learning_rate": 0.0001, "loss": 0.9715, "loss/crossentropy": 2.551968574523926, "loss/hidden": 0.8046875, "loss/logits": 0.13831113278865814, "loss/reg": 0.0028548440895974636, "step": 1886 }, { "epoch": 0.235875, "grad_norm": 2.2735543251037598, "grad_norm_var": 0.27347767108519155, "learning_rate": 0.0001, "loss": 1.0096, "loss/crossentropy": 2.356642484664917, "loss/hidden": 0.84765625, "loss/logits": 0.1334502100944519, "loss/reg": 0.0028533469885587692, "step": 1887 }, { "epoch": 0.236, "grad_norm": 2.8403327465057373, "grad_norm_var": 0.274780906820475, "learning_rate": 0.0001, "loss": 1.1044, "loss/crossentropy": 2.49477481842041, "loss/hidden": 0.89453125, "loss/logits": 0.18135184049606323, "loss/reg": 0.002851872704923153, "step": 1888 }, { "epoch": 0.236125, "grad_norm": 2.4770002365112305, "grad_norm_var": 0.26015786291882914, "learning_rate": 0.0001, "loss": 1.1539, "loss/crossentropy": 2.3961706161499023, "loss/hidden": 0.9765625, "loss/logits": 0.14887994527816772, "loss/reg": 0.002850309479981661, "step": 1889 }, { "epoch": 0.23625, "grad_norm": 97.88726806640625, "grad_norm_var": 566.1572677979839, "learning_rate": 0.0001, "loss": 1.2242, "loss/crossentropy": 2.2282721996307373, "loss/hidden": 1.0234375, "loss/logits": 0.17226368188858032, "loss/reg": 0.002848886651918292, "step": 1890 }, { "epoch": 0.236375, "grad_norm": 2.3098387718200684, "grad_norm_var": 566.048741327807, "learning_rate": 0.0001, "loss": 1.0645, "loss/crossentropy": 2.4957542419433594, "loss/hidden": 0.88671875, "loss/logits": 0.14935660362243652, "loss/reg": 0.0028473958373069763, "step": 1891 }, { "epoch": 0.2365, "grad_norm": 3.0709023475646973, "grad_norm_var": 565.7896104746242, "learning_rate": 0.0001, "loss": 1.0863, "loss/crossentropy": 2.701103925704956, "loss/hidden": 0.90234375, "loss/logits": 0.15548059344291687, "loss/reg": 0.0028457811567932367, "step": 1892 }, { "epoch": 0.236625, "grad_norm": 4.502190113067627, "grad_norm_var": 565.4898863883287, "learning_rate": 0.0001, "loss": 1.4863, "loss/crossentropy": 2.418233871459961, "loss/hidden": 1.265625, "loss/logits": 0.1922321766614914, "loss/reg": 0.002844167174771428, "step": 1893 }, { "epoch": 0.23675, "grad_norm": 3.704207420349121, "grad_norm_var": 564.7003492594727, "learning_rate": 0.0001, "loss": 1.2931, "loss/crossentropy": 2.34171462059021, "loss/hidden": 1.078125, "loss/logits": 0.18653494119644165, "loss/reg": 0.0028426784556359053, "step": 1894 }, { "epoch": 0.236875, "grad_norm": 4.64210319519043, "grad_norm_var": 563.0616126406595, "learning_rate": 0.0001, "loss": 1.3692, "loss/crossentropy": 2.5464377403259277, "loss/hidden": 1.140625, "loss/logits": 0.2001732736825943, "loss/reg": 0.0028411895036697388, "step": 1895 }, { "epoch": 0.237, "grad_norm": 2.472402572631836, "grad_norm_var": 562.9297680002472, "learning_rate": 0.0001, "loss": 1.1043, "loss/crossentropy": 2.488330602645874, "loss/hidden": 0.9140625, "loss/logits": 0.1618151217699051, "loss/reg": 0.002839608583599329, "step": 1896 }, { "epoch": 0.237125, "grad_norm": 3.018164873123169, "grad_norm_var": 562.6141740686131, "learning_rate": 0.0001, "loss": 1.0354, "loss/crossentropy": 2.5816335678100586, "loss/hidden": 0.8671875, "loss/logits": 0.13983488082885742, "loss/reg": 0.0028381715528666973, "step": 1897 }, { "epoch": 0.23725, "grad_norm": 3.750331401824951, "grad_norm_var": 561.8825919502568, "learning_rate": 0.0001, "loss": 1.1906, "loss/crossentropy": 2.758951187133789, "loss/hidden": 0.98828125, "loss/logits": 0.17395856976509094, "loss/reg": 0.002836685860529542, "step": 1898 }, { "epoch": 0.237375, "grad_norm": 2.309964895248413, "grad_norm_var": 562.4132399812779, "learning_rate": 0.0001, "loss": 1.1765, "loss/crossentropy": 2.7613322734832764, "loss/hidden": 0.96875, "loss/logits": 0.17939506471157074, "loss/reg": 0.0028351792134344578, "step": 1899 }, { "epoch": 0.2375, "grad_norm": 3.629230499267578, "grad_norm_var": 561.2175971903463, "learning_rate": 0.0001, "loss": 1.1221, "loss/crossentropy": 2.351802110671997, "loss/hidden": 0.94921875, "loss/logits": 0.1445481777191162, "loss/reg": 0.002833602949976921, "step": 1900 }, { "epoch": 0.237625, "grad_norm": 2.0922610759735107, "grad_norm_var": 562.1731362143731, "learning_rate": 0.0001, "loss": 0.9687, "loss/crossentropy": 2.6160526275634766, "loss/hidden": 0.80859375, "loss/logits": 0.13180279731750488, "loss/reg": 0.0028320997953414917, "step": 1901 }, { "epoch": 0.23775, "grad_norm": 2.7156622409820557, "grad_norm_var": 562.7079033711701, "learning_rate": 0.0001, "loss": 1.1645, "loss/crossentropy": 2.891047239303589, "loss/hidden": 0.9453125, "loss/logits": 0.19086427986621857, "loss/reg": 0.0028305284213274717, "step": 1902 }, { "epoch": 0.237875, "grad_norm": 2.3164191246032715, "grad_norm_var": 562.6696833086194, "learning_rate": 0.0001, "loss": 1.1205, "loss/crossentropy": 2.3813233375549316, "loss/hidden": 0.9453125, "loss/logits": 0.1469373106956482, "loss/reg": 0.0028289342299103737, "step": 1903 }, { "epoch": 0.238, "grad_norm": 2.695955991744995, "grad_norm_var": 562.7892462486658, "learning_rate": 0.0001, "loss": 0.9631, "loss/crossentropy": 2.7349748611450195, "loss/hidden": 0.80078125, "loss/logits": 0.13401402533054352, "loss/reg": 0.0028272622730582952, "step": 1904 }, { "epoch": 0.238125, "grad_norm": 2.295044422149658, "grad_norm_var": 562.9489527602544, "learning_rate": 0.0001, "loss": 1.0814, "loss/crossentropy": 2.499460458755493, "loss/hidden": 0.91796875, "loss/logits": 0.1352241486310959, "loss/reg": 0.0028255698271095753, "step": 1905 }, { "epoch": 0.23825, "grad_norm": 2.244823455810547, "grad_norm_var": 0.6781732251602759, "learning_rate": 0.0001, "loss": 1.0582, "loss/crossentropy": 2.587311029434204, "loss/hidden": 0.8828125, "loss/logits": 0.14716576039791107, "loss/reg": 0.0028240818064659834, "step": 1906 }, { "epoch": 0.238375, "grad_norm": 2.270704507827759, "grad_norm_var": 0.6817949672684023, "learning_rate": 0.0001, "loss": 1.1391, "loss/crossentropy": 2.707805633544922, "loss/hidden": 0.93359375, "loss/logits": 0.1773141622543335, "loss/reg": 0.0028225905261933804, "step": 1907 }, { "epoch": 0.2385, "grad_norm": 3.1557998657226562, "grad_norm_var": 0.6832387916335013, "learning_rate": 0.0001, "loss": 1.1169, "loss/crossentropy": 2.439981460571289, "loss/hidden": 0.93359375, "loss/logits": 0.15508322417736053, "loss/reg": 0.0028210440650582314, "step": 1908 }, { "epoch": 0.238625, "grad_norm": 2.238996982574463, "grad_norm_var": 0.54658289647939, "learning_rate": 0.0001, "loss": 1.0766, "loss/crossentropy": 2.543613910675049, "loss/hidden": 0.890625, "loss/logits": 0.1577475666999817, "loss/reg": 0.0028194894548505545, "step": 1909 }, { "epoch": 0.23875, "grad_norm": 2.3347530364990234, "grad_norm_var": 0.5072756946952609, "learning_rate": 0.0001, "loss": 1.1956, "loss/crossentropy": 2.391685962677002, "loss/hidden": 1.0, "loss/logits": 0.16745620965957642, "loss/reg": 0.002817926462739706, "step": 1910 }, { "epoch": 0.238875, "grad_norm": 3.038079261779785, "grad_norm_var": 0.26585868434679016, "learning_rate": 0.0001, "loss": 1.1833, "loss/crossentropy": 2.6581690311431885, "loss/hidden": 0.9609375, "loss/logits": 0.19423067569732666, "loss/reg": 0.00281645474024117, "step": 1911 }, { "epoch": 0.239, "grad_norm": 2.801513671875, "grad_norm_var": 0.26434526750178977, "learning_rate": 0.0001, "loss": 1.2958, "loss/crossentropy": 2.2399282455444336, "loss/hidden": 1.0859375, "loss/logits": 0.18169432878494263, "loss/reg": 0.002814988372847438, "step": 1912 }, { "epoch": 0.239125, "grad_norm": 2.835973024368286, "grad_norm_var": 0.2582471639147646, "learning_rate": 0.0001, "loss": 1.2229, "loss/crossentropy": 2.8033790588378906, "loss/hidden": 1.0, "loss/logits": 0.19473232328891754, "loss/reg": 0.0028135550674051046, "step": 1913 }, { "epoch": 0.23925, "grad_norm": 2.284269094467163, "grad_norm_var": 0.18147043790236214, "learning_rate": 0.0001, "loss": 1.0563, "loss/crossentropy": 2.4895052909851074, "loss/hidden": 0.875, "loss/logits": 0.15318460762500763, "loss/reg": 0.0028121541254222393, "step": 1914 }, { "epoch": 0.239375, "grad_norm": 2.1640424728393555, "grad_norm_var": 0.18803017488825446, "learning_rate": 0.0001, "loss": 1.0567, "loss/crossentropy": 2.5001635551452637, "loss/hidden": 0.8828125, "loss/logits": 0.14573311805725098, "loss/reg": 0.0028106593526899815, "step": 1915 }, { "epoch": 0.2395, "grad_norm": 1.9871270656585693, "grad_norm_var": 0.12455762918461702, "learning_rate": 0.0001, "loss": 1.0899, "loss/crossentropy": 2.402726888656616, "loss/hidden": 0.9140625, "loss/logits": 0.14770221710205078, "loss/reg": 0.0028093019500374794, "step": 1916 }, { "epoch": 0.239625, "grad_norm": 1.9124521017074585, "grad_norm_var": 0.13556166178302545, "learning_rate": 0.0001, "loss": 1.0272, "loss/crossentropy": 2.475602388381958, "loss/hidden": 0.8515625, "loss/logits": 0.14752693474292755, "loss/reg": 0.002807790180668235, "step": 1917 }, { "epoch": 0.23975, "grad_norm": 2.6607353687286377, "grad_norm_var": 0.1338465573837538, "learning_rate": 0.0001, "loss": 1.0073, "loss/crossentropy": 2.5274460315704346, "loss/hidden": 0.8359375, "loss/logits": 0.14331699907779694, "loss/reg": 0.0028062344063073397, "step": 1918 }, { "epoch": 0.239875, "grad_norm": 2.725597381591797, "grad_norm_var": 0.13689784558561602, "learning_rate": 0.0001, "loss": 1.3288, "loss/crossentropy": 2.164072275161743, "loss/hidden": 1.109375, "loss/logits": 0.19133153557777405, "loss/reg": 0.002804698422551155, "step": 1919 }, { "epoch": 0.24, "grad_norm": 2.5143990516662598, "grad_norm_var": 0.13367861240944112, "learning_rate": 0.0001, "loss": 1.1068, "loss/crossentropy": 2.6517221927642822, "loss/hidden": 0.91015625, "loss/logits": 0.16858091950416565, "loss/reg": 0.002803155919536948, "step": 1920 }, { "epoch": 0.240125, "grad_norm": 4.250274658203125, "grad_norm_var": 0.32790836135060264, "learning_rate": 0.0001, "loss": 1.2321, "loss/crossentropy": 2.756089925765991, "loss/hidden": 0.9375, "loss/logits": 0.2666040062904358, "loss/reg": 0.00280156172811985, "step": 1921 }, { "epoch": 0.24025, "grad_norm": 3.579580307006836, "grad_norm_var": 0.3780541826973238, "learning_rate": 0.0001, "loss": 1.1615, "loss/crossentropy": 2.4776012897491455, "loss/hidden": 0.95703125, "loss/logits": 0.17651261389255524, "loss/reg": 0.002799983136355877, "step": 1922 }, { "epoch": 0.240375, "grad_norm": 2.552597761154175, "grad_norm_var": 0.3679322737687584, "learning_rate": 0.0001, "loss": 1.1309, "loss/crossentropy": 2.535444498062134, "loss/hidden": 0.9453125, "loss/logits": 0.1575796902179718, "loss/reg": 0.0027983970940113068, "step": 1923 }, { "epoch": 0.2405, "grad_norm": 2.1624505519866943, "grad_norm_var": 0.3678785758486583, "learning_rate": 0.0001, "loss": 1.2067, "loss/crossentropy": 2.214770555496216, "loss/hidden": 1.0234375, "loss/logits": 0.15530873835086823, "loss/reg": 0.0027969072107225657, "step": 1924 }, { "epoch": 0.240625, "grad_norm": 2.1517112255096436, "grad_norm_var": 0.3728782554598296, "learning_rate": 0.0001, "loss": 1.0171, "loss/crossentropy": 2.520151138305664, "loss/hidden": 0.84765625, "loss/logits": 0.14149996638298035, "loss/reg": 0.002795466920360923, "step": 1925 }, { "epoch": 0.24075, "grad_norm": 2.09903883934021, "grad_norm_var": 0.3853855727658185, "learning_rate": 0.0001, "loss": 1.0477, "loss/crossentropy": 2.3022382259368896, "loss/hidden": 0.875, "loss/logits": 0.14477625489234924, "loss/reg": 0.0027940254658460617, "step": 1926 }, { "epoch": 0.240875, "grad_norm": 2.2137794494628906, "grad_norm_var": 0.3805278519877115, "learning_rate": 0.0001, "loss": 1.0049, "loss/crossentropy": 2.397075653076172, "loss/hidden": 0.84375, "loss/logits": 0.13318374752998352, "loss/reg": 0.00279267062433064, "step": 1927 }, { "epoch": 0.241, "grad_norm": 2.1640615463256836, "grad_norm_var": 0.3850549000224494, "learning_rate": 0.0001, "loss": 1.2428, "loss/crossentropy": 2.292894124984741, "loss/hidden": 1.0390625, "loss/logits": 0.17587056756019592, "loss/reg": 0.002791155595332384, "step": 1928 }, { "epoch": 0.241125, "grad_norm": 2.0617549419403076, "grad_norm_var": 0.38950121594236903, "learning_rate": 0.0001, "loss": 1.091, "loss/crossentropy": 2.3114614486694336, "loss/hidden": 0.9140625, "loss/logits": 0.14906063675880432, "loss/reg": 0.0027897644322365522, "step": 1929 }, { "epoch": 0.24125, "grad_norm": 2.2500698566436768, "grad_norm_var": 0.3904109329361792, "learning_rate": 0.0001, "loss": 1.0731, "loss/crossentropy": 2.530860662460327, "loss/hidden": 0.8984375, "loss/logits": 0.14675912261009216, "loss/reg": 0.0027884161099791527, "step": 1930 }, { "epoch": 0.241375, "grad_norm": 3.118149518966675, "grad_norm_var": 0.40894295029893557, "learning_rate": 0.0001, "loss": 1.0974, "loss/crossentropy": 2.5993683338165283, "loss/hidden": 0.90625, "loss/logits": 0.16328555345535278, "loss/reg": 0.0027870717458426952, "step": 1931 }, { "epoch": 0.2415, "grad_norm": 2.231571912765503, "grad_norm_var": 0.39513912896007114, "learning_rate": 0.0001, "loss": 1.0225, "loss/crossentropy": 2.673400402069092, "loss/hidden": 0.85546875, "loss/logits": 0.13916508853435516, "loss/reg": 0.0027857308741658926, "step": 1932 }, { "epoch": 0.241625, "grad_norm": 2.2554473876953125, "grad_norm_var": 0.3737690186064941, "learning_rate": 0.0001, "loss": 1.1379, "loss/crossentropy": 2.538700819015503, "loss/hidden": 0.9453125, "loss/logits": 0.16477283835411072, "loss/reg": 0.002784265670925379, "step": 1933 }, { "epoch": 0.24175, "grad_norm": 2.140296459197998, "grad_norm_var": 0.3838427455168045, "learning_rate": 0.0001, "loss": 1.0912, "loss/crossentropy": 2.588484287261963, "loss/hidden": 0.8984375, "loss/logits": 0.1649562418460846, "loss/reg": 0.002782786963507533, "step": 1934 }, { "epoch": 0.241875, "grad_norm": 2.2381234169006348, "grad_norm_var": 0.38594407304694867, "learning_rate": 0.0001, "loss": 1.0233, "loss/crossentropy": 2.5911705493927, "loss/hidden": 0.859375, "loss/logits": 0.13616088032722473, "loss/reg": 0.0027812945190817118, "step": 1935 }, { "epoch": 0.242, "grad_norm": 1.9112108945846558, "grad_norm_var": 0.40744186602944354, "learning_rate": 0.0001, "loss": 0.9393, "loss/crossentropy": 2.5094900131225586, "loss/hidden": 0.78515625, "loss/logits": 0.12632881104946136, "loss/reg": 0.002779774833470583, "step": 1936 }, { "epoch": 0.242125, "grad_norm": 5.132593631744385, "grad_norm_var": 0.6665618029327437, "learning_rate": 0.0001, "loss": 1.2427, "loss/crossentropy": 2.5614383220672607, "loss/hidden": 1.0703125, "loss/logits": 0.144596129655838, "loss/reg": 0.002778239781036973, "step": 1937 }, { "epoch": 0.24225, "grad_norm": 3.1231844425201416, "grad_norm_var": 0.6148830410155909, "learning_rate": 0.0001, "loss": 0.8716, "loss/crossentropy": 2.501601457595825, "loss/hidden": 0.734375, "loss/logits": 0.1094457358121872, "loss/reg": 0.002776721026748419, "step": 1938 }, { "epoch": 0.242375, "grad_norm": 2.5960471630096436, "grad_norm_var": 0.6153759718928247, "learning_rate": 0.0001, "loss": 1.1199, "loss/crossentropy": 2.5568716526031494, "loss/hidden": 0.91796875, "loss/logits": 0.17422887682914734, "loss/reg": 0.0027751729357987642, "step": 1939 }, { "epoch": 0.2425, "grad_norm": 2.3119328022003174, "grad_norm_var": 0.6102323306014952, "learning_rate": 0.0001, "loss": 0.9679, "loss/crossentropy": 2.3882250785827637, "loss/hidden": 0.8046875, "loss/logits": 0.13546867668628693, "loss/reg": 0.0027736674528568983, "step": 1940 }, { "epoch": 0.242625, "grad_norm": 4.160861015319824, "grad_norm_var": 0.7692402881847016, "learning_rate": 0.0001, "loss": 1.2181, "loss/crossentropy": 2.2808122634887695, "loss/hidden": 0.984375, "loss/logits": 0.2060256004333496, "loss/reg": 0.0027721913065761328, "step": 1941 }, { "epoch": 0.24275, "grad_norm": 2.8490424156188965, "grad_norm_var": 0.7517497358643694, "learning_rate": 0.0001, "loss": 1.2525, "loss/crossentropy": 2.345285415649414, "loss/hidden": 1.03125, "loss/logits": 0.19358152151107788, "loss/reg": 0.0027707451954483986, "step": 1942 }, { "epoch": 0.242875, "grad_norm": 7.469876289367676, "grad_norm_var": 2.157014120724763, "learning_rate": 0.0001, "loss": 1.4803, "loss/crossentropy": 2.0654871463775635, "loss/hidden": 1.25, "loss/logits": 0.20259954035282135, "loss/reg": 0.002769321436062455, "step": 1943 }, { "epoch": 0.243, "grad_norm": 2.437960624694824, "grad_norm_var": 2.131142079716708, "learning_rate": 0.0001, "loss": 1.1002, "loss/crossentropy": 2.231019973754883, "loss/hidden": 0.921875, "loss/logits": 0.1506144404411316, "loss/reg": 0.002767904195934534, "step": 1944 }, { "epoch": 0.243125, "grad_norm": 3.1553590297698975, "grad_norm_var": 2.066455279052258, "learning_rate": 0.0001, "loss": 1.1744, "loss/crossentropy": 2.465205669403076, "loss/hidden": 0.95703125, "loss/logits": 0.18965642154216766, "loss/reg": 0.0027665095403790474, "step": 1945 }, { "epoch": 0.24325, "grad_norm": 2.4227399826049805, "grad_norm_var": 2.0490651192590494, "learning_rate": 0.0001, "loss": 1.2199, "loss/crossentropy": 2.8017942905426025, "loss/hidden": 0.9921875, "loss/logits": 0.20010419189929962, "loss/reg": 0.0027651283890008926, "step": 1946 }, { "epoch": 0.243375, "grad_norm": 1.9351319074630737, "grad_norm_var": 2.1332233829394576, "learning_rate": 0.0001, "loss": 1.1051, "loss/crossentropy": 2.385049819946289, "loss/hidden": 0.921875, "loss/logits": 0.15559425950050354, "loss/reg": 0.002763670403510332, "step": 1947 }, { "epoch": 0.2435, "grad_norm": 2.439415454864502, "grad_norm_var": 2.1139850344569364, "learning_rate": 0.0001, "loss": 1.0697, "loss/crossentropy": 2.5390353202819824, "loss/hidden": 0.859375, "loss/logits": 0.18273773789405823, "loss/reg": 0.002762230345979333, "step": 1948 }, { "epoch": 0.243625, "grad_norm": 3.9348716735839844, "grad_norm_var": 2.1154351813563985, "learning_rate": 0.0001, "loss": 1.5285, "loss/crossentropy": 2.544994354248047, "loss/hidden": 1.2265625, "loss/logits": 0.27432870864868164, "loss/reg": 0.0027608247473835945, "step": 1949 }, { "epoch": 0.24375, "grad_norm": 2.6699066162109375, "grad_norm_var": 2.0622895626261593, "learning_rate": 0.0001, "loss": 1.1582, "loss/crossentropy": 2.3712096214294434, "loss/hidden": 0.9609375, "loss/logits": 0.16963256895542145, "loss/reg": 0.002759524155408144, "step": 1950 }, { "epoch": 0.243875, "grad_norm": 2.510438919067383, "grad_norm_var": 2.032934141151713, "learning_rate": 0.0001, "loss": 1.1192, "loss/crossentropy": 2.6832380294799805, "loss/hidden": 0.9140625, "loss/logits": 0.17755383253097534, "loss/reg": 0.002758244751021266, "step": 1951 }, { "epoch": 0.244, "grad_norm": 2.703141689300537, "grad_norm_var": 1.9369671914291076, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.1617352962493896, "loss/hidden": 1.0625, "loss/logits": 0.1934407502412796, "loss/reg": 0.002756967907771468, "step": 1952 }, { "epoch": 0.244125, "grad_norm": 2.3727617263793945, "grad_norm_var": 1.7168647286460454, "learning_rate": 0.0001, "loss": 1.1945, "loss/crossentropy": 2.636251211166382, "loss/hidden": 0.9765625, "loss/logits": 0.19042374193668365, "loss/reg": 0.0027555141132324934, "step": 1953 }, { "epoch": 0.24425, "grad_norm": 2.1022789478302, "grad_norm_var": 1.774533228862542, "learning_rate": 0.0001, "loss": 1.0134, "loss/crossentropy": 2.517247200012207, "loss/hidden": 0.85546875, "loss/logits": 0.13039694726467133, "loss/reg": 0.0027542109601199627, "step": 1954 }, { "epoch": 0.244375, "grad_norm": 2.1997644901275635, "grad_norm_var": 1.8059291585278145, "learning_rate": 0.0001, "loss": 1.0473, "loss/crossentropy": 2.655820608139038, "loss/hidden": 0.87109375, "loss/logits": 0.1486739218235016, "loss/reg": 0.002752919914200902, "step": 1955 }, { "epoch": 0.2445, "grad_norm": 2.6467058658599854, "grad_norm_var": 1.7831262007346442, "learning_rate": 0.0001, "loss": 1.3302, "loss/crossentropy": 2.114410400390625, "loss/hidden": 1.125, "loss/logits": 0.17764800786972046, "loss/reg": 0.0027516759000718594, "step": 1956 }, { "epoch": 0.244625, "grad_norm": 2.7396280765533447, "grad_norm_var": 1.68951109645124, "learning_rate": 0.0001, "loss": 1.1134, "loss/crossentropy": 2.5355465412139893, "loss/hidden": 0.921875, "loss/logits": 0.16403117775917053, "loss/reg": 0.002750229090452194, "step": 1957 }, { "epoch": 0.24475, "grad_norm": 2.410583019256592, "grad_norm_var": 1.7051962159964043, "learning_rate": 0.0001, "loss": 1.1862, "loss/crossentropy": 2.0772080421447754, "loss/hidden": 1.015625, "loss/logits": 0.1430494487285614, "loss/reg": 0.0027487878687679768, "step": 1958 }, { "epoch": 0.244875, "grad_norm": 2.3453474044799805, "grad_norm_var": 0.21338224169162548, "learning_rate": 0.0001, "loss": 0.9842, "loss/crossentropy": 2.786540985107422, "loss/hidden": 0.80859375, "loss/logits": 0.14812374114990234, "loss/reg": 0.002747328719124198, "step": 1959 }, { "epoch": 0.245, "grad_norm": 3.010045051574707, "grad_norm_var": 0.22421355318186417, "learning_rate": 0.0001, "loss": 1.2039, "loss/crossentropy": 2.6426658630371094, "loss/hidden": 0.953125, "loss/logits": 0.22334754467010498, "loss/reg": 0.0027458607219159603, "step": 1960 }, { "epoch": 0.245125, "grad_norm": 2.431356191635132, "grad_norm_var": 0.20335259794886623, "learning_rate": 0.0001, "loss": 1.0896, "loss/crossentropy": 2.223994255065918, "loss/hidden": 0.921875, "loss/logits": 0.14024955034255981, "loss/reg": 0.0027444439474493265, "step": 1961 }, { "epoch": 0.24525, "grad_norm": 2.264490842819214, "grad_norm_var": 0.20770068539455805, "learning_rate": 0.0001, "loss": 0.9975, "loss/crossentropy": 2.5207021236419678, "loss/hidden": 0.83203125, "loss/logits": 0.13799090683460236, "loss/reg": 0.0027429983019828796, "step": 1962 }, { "epoch": 0.245375, "grad_norm": 2.420308828353882, "grad_norm_var": 0.18297715933091632, "learning_rate": 0.0001, "loss": 1.2007, "loss/crossentropy": 2.4860877990722656, "loss/hidden": 0.9921875, "loss/logits": 0.1810786873102188, "loss/reg": 0.00274151680059731, "step": 1963 }, { "epoch": 0.2455, "grad_norm": 2.414461851119995, "grad_norm_var": 0.18346740397452094, "learning_rate": 0.0001, "loss": 1.071, "loss/crossentropy": 2.4090845584869385, "loss/hidden": 0.90234375, "loss/logits": 0.14120522141456604, "loss/reg": 0.0027401153929531574, "step": 1964 }, { "epoch": 0.245625, "grad_norm": 2.4071054458618164, "grad_norm_var": 0.05203356240904213, "learning_rate": 0.0001, "loss": 1.0379, "loss/crossentropy": 2.6016945838928223, "loss/hidden": 0.86328125, "loss/logits": 0.14726971089839935, "loss/reg": 0.0027385957073420286, "step": 1965 }, { "epoch": 0.24575, "grad_norm": 2.428882360458374, "grad_norm_var": 0.04949778844412904, "learning_rate": 0.0001, "loss": 0.9793, "loss/crossentropy": 2.6819698810577393, "loss/hidden": 0.8203125, "loss/logits": 0.13164296746253967, "loss/reg": 0.002737129107117653, "step": 1966 }, { "epoch": 0.245875, "grad_norm": 2.2721593379974365, "grad_norm_var": 0.05153780887834784, "learning_rate": 0.0001, "loss": 1.0456, "loss/crossentropy": 2.2785394191741943, "loss/hidden": 0.86328125, "loss/logits": 0.1549597829580307, "loss/reg": 0.002735583111643791, "step": 1967 }, { "epoch": 0.246, "grad_norm": 2.5156705379486084, "grad_norm_var": 0.04735843285122859, "learning_rate": 0.0001, "loss": 1.1212, "loss/crossentropy": 2.4128031730651855, "loss/hidden": 0.9375, "loss/logits": 0.15634964406490326, "loss/reg": 0.002734163776040077, "step": 1968 }, { "epoch": 0.246125, "grad_norm": 2.315765380859375, "grad_norm_var": 0.04804468545032871, "learning_rate": 0.0001, "loss": 1.1106, "loss/crossentropy": 2.5273008346557617, "loss/hidden": 0.93359375, "loss/logits": 0.1496410369873047, "loss/reg": 0.0027327670250087976, "step": 1969 }, { "epoch": 0.24625, "grad_norm": 3.655541181564331, "grad_norm_var": 0.13038539827467327, "learning_rate": 0.0001, "loss": 1.1545, "loss/crossentropy": 2.816701889038086, "loss/hidden": 0.9453125, "loss/logits": 0.18190214037895203, "loss/reg": 0.002731376327574253, "step": 1970 }, { "epoch": 0.246375, "grad_norm": 2.0330469608306885, "grad_norm_var": 0.13946034117999087, "learning_rate": 0.0001, "loss": 0.9543, "loss/crossentropy": 2.5322234630584717, "loss/hidden": 0.8046875, "loss/logits": 0.12230876088142395, "loss/reg": 0.0027299553621560335, "step": 1971 }, { "epoch": 0.2465, "grad_norm": 2.3114256858825684, "grad_norm_var": 0.1407970077955942, "learning_rate": 0.0001, "loss": 1.0766, "loss/crossentropy": 2.403012752532959, "loss/hidden": 0.90625, "loss/logits": 0.14305217564105988, "loss/reg": 0.0027285972610116005, "step": 1972 }, { "epoch": 0.246625, "grad_norm": 2.872380256652832, "grad_norm_var": 0.14616669234115964, "learning_rate": 0.0001, "loss": 1.0601, "loss/crossentropy": 2.620347261428833, "loss/hidden": 0.875, "loss/logits": 0.15787017345428467, "loss/reg": 0.0027271404396742582, "step": 1973 }, { "epoch": 0.24675, "grad_norm": 2.204484462738037, "grad_norm_var": 0.15146511044817218, "learning_rate": 0.0001, "loss": 1.0066, "loss/crossentropy": 2.5610415935516357, "loss/hidden": 0.828125, "loss/logits": 0.15118342638015747, "loss/reg": 0.002725655445829034, "step": 1974 }, { "epoch": 0.246875, "grad_norm": 2.2441866397857666, "grad_norm_var": 0.15410845728410152, "learning_rate": 0.0001, "loss": 0.9559, "loss/crossentropy": 2.513578414916992, "loss/hidden": 0.8046875, "loss/logits": 0.12393586337566376, "loss/reg": 0.002724139718338847, "step": 1975 }, { "epoch": 0.247, "grad_norm": 2.8840277194976807, "grad_norm_var": 0.14632239260073235, "learning_rate": 0.0001, "loss": 1.0875, "loss/crossentropy": 2.491835832595825, "loss/hidden": 0.90625, "loss/logits": 0.15406697988510132, "loss/reg": 0.0027226670645177364, "step": 1976 }, { "epoch": 0.247125, "grad_norm": 2.744213342666626, "grad_norm_var": 0.15042299567527168, "learning_rate": 0.0001, "loss": 1.2954, "loss/crossentropy": 2.233274459838867, "loss/hidden": 1.0859375, "loss/logits": 0.18220359086990356, "loss/reg": 0.0027212114073336124, "step": 1977 }, { "epoch": 0.24725, "grad_norm": 2.3532700538635254, "grad_norm_var": 0.1481365956517531, "learning_rate": 0.0001, "loss": 1.1782, "loss/crossentropy": 2.70269775390625, "loss/hidden": 0.96484375, "loss/logits": 0.18613766133785248, "loss/reg": 0.002719811163842678, "step": 1978 }, { "epoch": 0.247375, "grad_norm": 2.016599416732788, "grad_norm_var": 0.16287134788210172, "learning_rate": 0.0001, "loss": 1.0368, "loss/crossentropy": 2.7286431789398193, "loss/hidden": 0.859375, "loss/logits": 0.1502460092306137, "loss/reg": 0.0027184404898434877, "step": 1979 }, { "epoch": 0.2475, "grad_norm": 2.468029499053955, "grad_norm_var": 0.16258562087950257, "learning_rate": 0.0001, "loss": 1.0856, "loss/crossentropy": 2.4531679153442383, "loss/hidden": 0.9140625, "loss/logits": 0.14435534179210663, "loss/reg": 0.0027170274406671524, "step": 1980 }, { "epoch": 0.247625, "grad_norm": 2.8880743980407715, "grad_norm_var": 0.1721816167867452, "learning_rate": 0.0001, "loss": 1.0137, "loss/crossentropy": 2.8085103034973145, "loss/hidden": 0.8203125, "loss/logits": 0.16619496047496796, "loss/reg": 0.0027155885472893715, "step": 1981 }, { "epoch": 0.24775, "grad_norm": 2.2269508838653564, "grad_norm_var": 0.1769945282356974, "learning_rate": 0.0001, "loss": 1.0995, "loss/crossentropy": 2.7113091945648193, "loss/hidden": 0.92578125, "loss/logits": 0.1465437412261963, "loss/reg": 0.0027142076287418604, "step": 1982 }, { "epoch": 0.247875, "grad_norm": 2.3174233436584473, "grad_norm_var": 0.17574531851225003, "learning_rate": 0.0001, "loss": 1.4501, "loss/crossentropy": 1.9189180135726929, "loss/hidden": 1.1953125, "loss/logits": 0.22766916453838348, "loss/reg": 0.0027127759531140327, "step": 1983 }, { "epoch": 0.248, "grad_norm": 1.7954802513122559, "grad_norm_var": 0.20696429693966606, "learning_rate": 0.0001, "loss": 1.0248, "loss/crossentropy": 2.484433174133301, "loss/hidden": 0.859375, "loss/logits": 0.1383214145898819, "loss/reg": 0.0027115046977996826, "step": 1984 }, { "epoch": 0.248125, "grad_norm": 1.98469078540802, "grad_norm_var": 0.22010164823287107, "learning_rate": 0.0001, "loss": 1.0636, "loss/crossentropy": 2.355754852294922, "loss/hidden": 0.875, "loss/logits": 0.16146710515022278, "loss/reg": 0.0027100895531475544, "step": 1985 }, { "epoch": 0.24825, "grad_norm": 3.0588693618774414, "grad_norm_var": 0.14544907650537522, "learning_rate": 0.0001, "loss": 1.2953, "loss/crossentropy": 2.55202317237854, "loss/hidden": 1.0625, "loss/logits": 0.20571765303611755, "loss/reg": 0.002708751941099763, "step": 1986 }, { "epoch": 0.248375, "grad_norm": 1.9647020101547241, "grad_norm_var": 0.14908673013686075, "learning_rate": 0.0001, "loss": 1.0645, "loss/crossentropy": 2.5482850074768066, "loss/hidden": 0.8828125, "loss/logits": 0.15463218092918396, "loss/reg": 0.0027072790544480085, "step": 1987 }, { "epoch": 0.2485, "grad_norm": 2.367072582244873, "grad_norm_var": 0.1486533124992943, "learning_rate": 0.0001, "loss": 1.087, "loss/crossentropy": 2.4881367683410645, "loss/hidden": 0.90625, "loss/logits": 0.15367110073566437, "loss/reg": 0.0027056580875068903, "step": 1988 }, { "epoch": 0.248625, "grad_norm": 2.4453704357147217, "grad_norm_var": 0.1331206329775275, "learning_rate": 0.0001, "loss": 1.0873, "loss/crossentropy": 2.556847333908081, "loss/hidden": 0.88671875, "loss/logits": 0.17354083061218262, "loss/reg": 0.0027040427085012197, "step": 1989 }, { "epoch": 0.24875, "grad_norm": 9.09829044342041, "grad_norm_var": 2.9487722333659785, "learning_rate": 0.0001, "loss": 1.2835, "loss/crossentropy": 2.4852135181427, "loss/hidden": 1.09375, "loss/logits": 0.16272324323654175, "loss/reg": 0.0027026128955185413, "step": 1990 }, { "epoch": 0.248875, "grad_norm": 2.424456834793091, "grad_norm_var": 2.9373577672795688, "learning_rate": 0.0001, "loss": 1.0357, "loss/crossentropy": 2.4856038093566895, "loss/hidden": 0.8515625, "loss/logits": 0.15710175037384033, "loss/reg": 0.0027011926285922527, "step": 1991 }, { "epoch": 0.249, "grad_norm": 2.624159336090088, "grad_norm_var": 2.939181373576417, "learning_rate": 0.0001, "loss": 1.3347, "loss/crossentropy": 2.166581869125366, "loss/hidden": 1.109375, "loss/logits": 0.19837325811386108, "loss/reg": 0.0026996470987796783, "step": 1992 }, { "epoch": 0.249125, "grad_norm": 2.957742691040039, "grad_norm_var": 2.9404825335519735, "learning_rate": 0.0001, "loss": 1.3991, "loss/crossentropy": 2.6836252212524414, "loss/hidden": 1.140625, "loss/logits": 0.23152483999729156, "loss/reg": 0.002698224736377597, "step": 1993 }, { "epoch": 0.24925, "grad_norm": 2.1770012378692627, "grad_norm_var": 2.9532045555307374, "learning_rate": 0.0001, "loss": 1.099, "loss/crossentropy": 2.48860239982605, "loss/hidden": 0.9296875, "loss/logits": 0.14239296317100525, "loss/reg": 0.002696766285225749, "step": 1994 }, { "epoch": 0.249375, "grad_norm": 2.590374708175659, "grad_norm_var": 2.9137765910812217, "learning_rate": 0.0001, "loss": 1.1914, "loss/crossentropy": 2.4991402626037598, "loss/hidden": 0.98828125, "loss/logits": 0.17614206671714783, "loss/reg": 0.0026953339111059904, "step": 1995 }, { "epoch": 0.2495, "grad_norm": 2.9202098846435547, "grad_norm_var": 2.904322765602717, "learning_rate": 0.0001, "loss": 1.1382, "loss/crossentropy": 2.286001682281494, "loss/hidden": 0.9453125, "loss/logits": 0.1659601628780365, "loss/reg": 0.0026938265655189753, "step": 1996 }, { "epoch": 0.249625, "grad_norm": 2.133800745010376, "grad_norm_var": 2.93756568739632, "learning_rate": 0.0001, "loss": 1.1125, "loss/crossentropy": 2.492587089538574, "loss/hidden": 0.91015625, "loss/logits": 0.17543631792068481, "loss/reg": 0.002692408859729767, "step": 1997 }, { "epoch": 0.24975, "grad_norm": 2.7813234329223633, "grad_norm_var": 2.9130920460482055, "learning_rate": 0.0001, "loss": 1.0991, "loss/crossentropy": 2.636305332183838, "loss/hidden": 0.921875, "loss/logits": 0.15036045014858246, "loss/reg": 0.0026909259613603354, "step": 1998 }, { "epoch": 0.249875, "grad_norm": 2.2112627029418945, "grad_norm_var": 2.9213711600102763, "learning_rate": 0.0001, "loss": 0.885, "loss/crossentropy": 2.672691822052002, "loss/hidden": 0.73828125, "loss/logits": 0.11980107426643372, "loss/reg": 0.002689523156732321, "step": 1999 }, { "epoch": 0.25, "grad_norm": 2.095968246459961, "grad_norm_var": 2.8849283178664864, "learning_rate": 0.0001, "loss": 0.9991, "loss/crossentropy": 2.556314706802368, "loss/hidden": 0.83203125, "loss/logits": 0.14016053080558777, "loss/reg": 0.002688055392354727, "step": 2000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.28811723128832e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }